diff options
Diffstat (limited to 'extractor/features')
23 files changed, 593 insertions, 0 deletions
diff --git a/extractor/features/count_source_target.cc b/extractor/features/count_source_target.cc
new file mode 100644
index 00000000..9441b451
--- /dev/null
+++ b/extractor/features/count_source_target.cc
@@ -0,0 +1,14 @@
+#include "count_source_target.h"
+
+#include <cmath>
+
+// Scores a phrase pair as log10(1 + pair_count).
+double CountSourceTarget::Score(const FeatureContext& context) const {
+  // Add in floating point so that 1 + pair_count cannot overflow int.
+  return log10(1.0 + context.pair_count);
+}
+
+// Returns the feature's name.
+string CountSourceTarget::GetName() const {
+  return "CountEF";
+}
diff --git a/extractor/features/count_source_target.h b/extractor/features/count_source_target.h
new file mode 100644
index 00000000..a2481944
--- /dev/null
+++ b/extractor/features/count_source_target.h
@@ -0,0 +1,16 @@
+#ifndef _COUNT_SOURCE_TARGET_H_
+#define _COUNT_SOURCE_TARGET_H_
+
+#include "feature.h"
+
+// Feature whose score is log10(1 + pair_count).
+class CountSourceTarget : public Feature {
+ public:
+  // Returns log10(1 + context.pair_count).
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "CountEF".
+  string GetName() const override;
+};
+
+#endif
diff --git a/extractor/features/count_source_target_test.cc b/extractor/features/count_source_target_test.cc
new file mode 100644
index 00000000..22633bb6
--- /dev/null
+++ b/extractor/features/count_source_target_test.cc
@@ -0,0 +1,35 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "count_source_target.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture that builds a fresh CountSourceTarget for every test.
+class CountSourceTargetTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<CountSourceTarget>();
+  }
+
+  shared_ptr<CountSourceTarget> feature;
+};
+
+TEST_F(CountSourceTargetTest, TestGetName) {
+  EXPECT_EQ("CountEF", feature->GetName());
+}
+
+TEST_F(CountSourceTargetTest, TestScore) {
+  Phrase phrase;
+  // pair_count = 9, so the expected score is log10(1 + 9) = 1.
+  FeatureContext context(phrase, phrase, 0.5, 9, 13);
+  // Compare with a ULP tolerance rather than bit-for-bit.
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
diff --git a/extractor/features/feature.cc b/extractor/features/feature.cc
new file mode 100644
index 00000000..876f5f8f
--- /dev/null
+++ b/extractor/features/feature.cc
@@ -0,0 +1,7 @@
+#include "feature.h"
+
+// Score that features in this directory fall back to when the value they
+// would take the logarithm of is not positive.
+const double Feature::MAX_SCORE = 99.0;
+
+Feature::~Feature() {}
diff --git a/extractor/features/feature.h b/extractor/features/feature.h
new file mode 100644
index 00000000..aca58401
--- /dev/null
+++ b/extractor/features/feature.h
@@ -0,0 +1,43 @@
+#ifndef _FEATURE_H_
+#define _FEATURE_H_
+
+#include <string>
+
+//TODO(pauldb): include headers nicely.
+#include "../phrase.h"
+
+// NOTE(review): a using-directive in a header leaks into every includer. It is
+// kept because the feature headers spell string/shared_ptr unqualified.
+using namespace std;
+
+// Bundle of values that a Feature reads when scoring one phrase pair.
+struct FeatureContext {
+  FeatureContext(const Phrase& source_phrase, const Phrase& target_phrase,
+                 double source_phrase_count, int pair_count, int num_samples) :
+    source_phrase(source_phrase), target_phrase(target_phrase),
+    source_phrase_count(source_phrase_count), pair_count(pair_count),
+    num_samples(num_samples) {}
+
+  Phrase source_phrase;
+  Phrase target_phrase;
+  double source_phrase_count;
+  int pair_count;
+  int num_samples;
+};
+
+// Abstract base class of the scoring features.
+class Feature {
+ public:
+  // Computes this feature's value for |context|.
+  virtual double Score(const FeatureContext& context) const = 0;
+
+  // Returns the feature's name.
+  virtual string GetName() const = 0;
+
+  virtual ~Feature();
+
+  // Fallback score; defined as 99.0 in feature.cc.
+  static const double MAX_SCORE;
+};
+
+#endif
diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc
new file mode 100644
index 00000000..98d4e5fe
--- /dev/null
+++ b/extractor/features/is_source_singleton.cc
@@ -0,0 +1,13 @@
+#include "is_source_singleton.h"
+
+#include <cmath>
+
+// Returns 1 when source_phrase_count is exactly 1 and 0 otherwise.
+double IsSourceSingleton::Score(const FeatureContext& context) const {
+  return context.source_phrase_count == 1 ? 1.0 : 0.0;
+}
+
+// Returns the feature's name.
+string IsSourceSingleton::GetName() const {
+  return "IsSingletonF";
+}
diff --git a/extractor/features/is_source_singleton.h b/extractor/features/is_source_singleton.h
new file mode 100644
index 00000000..7cc72828
--- /dev/null
+++ b/extractor/features/is_source_singleton.h
@@ -0,0 +1,16 @@
+#ifndef _IS_SOURCE_SINGLETON_H_
+#define _IS_SOURCE_SINGLETON_H_
+
+#include "feature.h"
+
+// Indicator feature for source_phrase_count == 1.
+class IsSourceSingleton : public Feature {
+ public:
+  // Returns 1 if context.source_phrase_count == 1, otherwise 0.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "IsSingletonF".
+  string GetName() const override;
+};
+
+#endif
diff --git a/extractor/features/is_source_singleton_test.cc b/extractor/features/is_source_singleton_test.cc
new file mode 100644
index 00000000..8c71e593
--- /dev/null
+++ b/extractor/features/is_source_singleton_test.cc
@@ -0,0 +1,38 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "is_source_singleton.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture that builds a fresh IsSourceSingleton for every test.
+class IsSourceSingletonTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<IsSourceSingleton>();
+  }
+
+  shared_ptr<IsSourceSingleton> feature;
+};
+
+TEST_F(IsSourceSingletonTest, TestGetName) {
+  EXPECT_EQ("IsSingletonF", feature->GetName());
+}
+
+TEST_F(IsSourceSingletonTest, TestScore) {
+  Phrase phrase;
+  // source_phrase_count = 0.5 is not a singleton.
+  FeatureContext context(phrase, phrase, 0.5, 3, 31);
+  EXPECT_DOUBLE_EQ(0.0, feature->Score(context));
+
+  // source_phrase_count = 1 is a singleton.
+  context = FeatureContext(phrase, phrase, 1, 3, 25);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
diff --git a/extractor/features/is_source_target_singleton.cc b/extractor/features/is_source_target_singleton.cc
new file mode 100644
index 00000000..31d36532
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.cc
@@ -0,0 +1,13 @@
+#include "is_source_target_singleton.h"
+
+#include <cmath>
+
+// Returns 1 when pair_count is exactly 1 and 0 otherwise.
+double IsSourceTargetSingleton::Score(const FeatureContext& context) const {
+  return context.pair_count == 1 ? 1.0 : 0.0;
+}
+
+// Returns the feature's name.
+string IsSourceTargetSingleton::GetName() const {
+  return "IsSingletonFE";
+}
diff --git a/extractor/features/is_source_target_singleton.h b/extractor/features/is_source_target_singleton.h
new file mode 100644
index 00000000..58913b74
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.h
@@ -0,0 +1,16 @@
+#ifndef _IS_SOURCE_TARGET_SINGLETON_H_
+#define _IS_SOURCE_TARGET_SINGLETON_H_
+
+#include "feature.h"
+
+// Indicator feature for pair_count == 1.
+class IsSourceTargetSingleton : public Feature {
+ public:
+  // Returns 1 if context.pair_count == 1, otherwise 0.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "IsSingletonFE".
+  string GetName() const override;
+};
+
+#endif
diff --git a/extractor/features/is_source_target_singleton_test.cc b/extractor/features/is_source_target_singleton_test.cc
new file mode 100644
index 00000000..a51f77c9
--- /dev/null
+++ b/extractor/features/is_source_target_singleton_test.cc
@@ -0,0 +1,38 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "is_source_target_singleton.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture that builds a fresh IsSourceTargetSingleton for every test.
+class IsSourceTargetSingletonTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<IsSourceTargetSingleton>();
+  }
+
+  shared_ptr<IsSourceTargetSingleton> feature;
+};
+
+TEST_F(IsSourceTargetSingletonTest, TestGetName) {
+  EXPECT_EQ("IsSingletonFE", feature->GetName());
+}
+
+TEST_F(IsSourceTargetSingletonTest, TestScore) {
+  Phrase phrase;
+  // pair_count = 3 is not a singleton.
+  FeatureContext context(phrase, phrase, 0.5, 3, 7);
+  EXPECT_DOUBLE_EQ(0.0, feature->Score(context));
+
+  // pair_count = 1 is a singleton.
+  context = FeatureContext(phrase, phrase, 2.3, 1, 28);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
diff --git a/extractor/features/max_lex_source_given_target.cc b/extractor/features/max_lex_source_given_target.cc
new file mode 100644
index 00000000..21f5c76a
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.cc
@@ -0,0 +1,40 @@
+#include "max_lex_source_given_target.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "../data_array.h"
+#include "../translation_table.h"
+
+MaxLexSourceGivenTarget::MaxLexSourceGivenTarget(
+    shared_ptr<TranslationTable> table) :
+    table(table) {}
+
+// For every source word, takes the largest GetSourceGivenTargetScore over the
+// target words plus DataArray::NULL_WORD_STR, and sums -log10 of those maxima.
+// A source word whose maximum is not positive contributes MAX_SCORE instead.
+double MaxLexSourceGivenTarget::Score(const FeatureContext& context) const {
+  const vector<string>& source_words = context.source_phrase.GetWords();
+  vector<string> target_words = context.target_phrase.GetWords();
+  target_words.push_back(DataArray::NULL_WORD_STR);
+
+  double score = 0;
+  // Iterate by const reference: the loops only read the words, so copying a
+  // string per iteration was wasted work.
+  for (const string& source_word: source_words) {
+    double max_score = 0;
+    for (const string& target_word: target_words) {
+      max_score = max(max_score,
+          table->GetSourceGivenTargetScore(source_word, target_word));
+    }
+    score += max_score > 0 ? -log10(max_score) : MAX_SCORE;
+  }
+  return score;
+}
+
+// Returns the feature's name.
+string MaxLexSourceGivenTarget::GetName() const {
+  return "MaxLexFgivenE";
+}
diff --git a/extractor/features/max_lex_source_given_target.h b/extractor/features/max_lex_source_given_target.h
new file mode 100644
index 00000000..e87c1c8e
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.h
@@ -0,0 +1,29 @@
+#ifndef _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+#define _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+class TranslationTable;
+
+// Lexical feature built from TranslationTable::GetSourceGivenTargetScore.
+class MaxLexSourceGivenTarget : public Feature {
+ public:
+  // Keeps a shared reference to |table|, which Score() queries.
+  MaxLexSourceGivenTarget(shared_ptr<TranslationTable> table);
+
+  // Sums, over the source words, -log10 of the best table score against any
+  // target word or the NULL word; MAX_SCORE when that best score is <= 0.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "MaxLexFgivenE".
+  string GetName() const override;
+
+ private:
+  shared_ptr<TranslationTable> table;
+};
+
+#endif
diff --git a/extractor/features/max_lex_source_given_target_test.cc b/extractor/features/max_lex_source_given_target_test.cc
new file mode 100644
index 00000000..5fd41f8b
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target_test.cc
@@ -0,0 +1,81 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "../mocks/mock_translation_table.h"
+#include "../mocks/mock_vocabulary.h"
+#include "../data_array.h"
+#include "../phrase_builder.h"
+#include "max_lex_source_given_target.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture wiring a mock vocabulary (GetTerminalValue 0-2 -> f1..f3, 3-5 ->
+// e1..e3) and a mock translation table into the feature under test.
+class MaxLexSourceGivenTargetTest : public Test {
+ protected:
+  void SetUp() override {
+    vector<string> source_words = {"f1", "f2", "f3"};
+    vector<string> target_words = {"e1", "e2", "e3"};
+
+    vocabulary = make_shared<MockVocabulary>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i))
+          .WillRepeatedly(Return(source_words[i]));
+    }
+    for (size_t i = 0; i < target_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i + source_words.size()))
+          .WillRepeatedly(Return(target_words[i]));
+    }
+
+    phrase_builder = make_shared<PhraseBuilder>(vocabulary);
+
+    table = make_shared<MockTranslationTable>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      for (size_t j = 0; j < target_words.size(); ++j) {
+        // Subtract as ints: i - j on size_t wraps around whenever i < j.
+        int value = static_cast<int>(i) - static_cast<int>(j);
+        EXPECT_CALL(*table, GetSourceGivenTargetScore(
+            source_words[i], target_words[j])).WillRepeatedly(Return(value));
+      }
+    }
+
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      int value = static_cast<int>(i) * 3;
+      EXPECT_CALL(*table, GetSourceGivenTargetScore(
+          source_words[i], DataArray::NULL_WORD_STR))
+          .WillRepeatedly(Return(value));
+    }
+
+    feature = make_shared<MaxLexSourceGivenTarget>(table);
+  }
+
+  shared_ptr<MockVocabulary> vocabulary;
+  shared_ptr<PhraseBuilder> phrase_builder;
+  shared_ptr<MockTranslationTable> table;
+  shared_ptr<MaxLexSourceGivenTarget> feature;
+};
+
+TEST_F(MaxLexSourceGivenTargetTest, TestGetName) {
+  EXPECT_EQ("MaxLexFgivenE", feature->GetName());
+}
+
+TEST_F(MaxLexSourceGivenTargetTest, TestScore) {
+  vector<int> source_symbols = {0, 1, 2};
+  Phrase source_phrase = phrase_builder->Build(source_symbols);
+  vector<int> target_symbols = {3, 4, 5};
+  Phrase target_phrase = phrase_builder->Build(target_symbols);
+  FeatureContext context(source_phrase, target_phrase, 0.3, 7, 11);
+  // Best scores per source word are 0, 3 and 6, giving
+  // MAX_SCORE - log10(3) - log10(6). The feature adds these term by term, so
+  // compare with a ULP tolerance instead of requiring identical rounding.
+  EXPECT_DOUBLE_EQ(99 - log10(18), feature->Score(context));
+}
+
+} // namespace
diff --git a/extractor/features/max_lex_target_given_source.cc b/extractor/features/max_lex_target_given_source.cc
new file mode 100644
index 00000000..f2bc2474
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.cc
@@ -0,0 +1,40 @@
+#include "max_lex_target_given_source.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "../data_array.h"
+#include "../translation_table.h"
+
+MaxLexTargetGivenSource::MaxLexTargetGivenSource(
+    shared_ptr<TranslationTable> table) :
+    table(table) {}
+
+// For every target word, takes the largest GetTargetGivenSourceScore over the
+// source words plus DataArray::NULL_WORD_STR, and sums -log10 of those maxima.
+// A target word whose maximum is not positive contributes MAX_SCORE instead.
+double MaxLexTargetGivenSource::Score(const FeatureContext& context) const {
+  vector<string> source_words = context.source_phrase.GetWords();
+  source_words.push_back(DataArray::NULL_WORD_STR);
+  const vector<string>& target_words = context.target_phrase.GetWords();
+
+  double score = 0;
+  // Iterate by const reference: the loops only read the words, so copying a
+  // string per iteration was wasted work.
+  for (const string& target_word: target_words) {
+    double max_score = 0;
+    for (const string& source_word: source_words) {
+      max_score = max(max_score,
+          table->GetTargetGivenSourceScore(source_word, target_word));
+    }
+    score += max_score > 0 ? -log10(max_score) : MAX_SCORE;
+  }
+  return score;
+}
+
+// Returns the feature's name.
+string MaxLexTargetGivenSource::GetName() const {
+  return "MaxLexEgivenF";
+}
diff --git a/extractor/features/max_lex_target_given_source.h b/extractor/features/max_lex_target_given_source.h
new file mode 100644
index 00000000..9585ff04
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.h
@@ -0,0 +1,29 @@
+#ifndef _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+#define _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+class TranslationTable;
+
+// Lexical feature built from TranslationTable::GetTargetGivenSourceScore.
+class MaxLexTargetGivenSource : public Feature {
+ public:
+  // Keeps a shared reference to |table|, which Score() queries.
+  MaxLexTargetGivenSource(shared_ptr<TranslationTable> table);
+
+  // Sums, over the target words, -log10 of the best table score against any
+  // source word or the NULL word; MAX_SCORE when that best score is <= 0.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "MaxLexEgivenF".
+  string GetName() const override;
+
+ private:
+  shared_ptr<TranslationTable> table;
+};
+
+#endif
diff --git a/extractor/features/max_lex_target_given_source_test.cc b/extractor/features/max_lex_target_given_source_test.cc
new file mode 100644
index 00000000..c8701bf7
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source_test.cc
@@ -0,0 +1,81 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "../mocks/mock_translation_table.h"
+#include "../mocks/mock_vocabulary.h"
+#include "../data_array.h"
+#include "../phrase_builder.h"
+#include "max_lex_target_given_source.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture wiring a mock vocabulary (GetTerminalValue 0-2 -> f1..f3, 3-5 ->
+// e1..e3) and a mock translation table into the feature under test.
+class MaxLexTargetGivenSourceTest : public Test {
+ protected:
+  void SetUp() override {
+    vector<string> source_words = {"f1", "f2", "f3"};
+    vector<string> target_words = {"e1", "e2", "e3"};
+
+    vocabulary = make_shared<MockVocabulary>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i))
+          .WillRepeatedly(Return(source_words[i]));
+    }
+    for (size_t i = 0; i < target_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i + source_words.size()))
+          .WillRepeatedly(Return(target_words[i]));
+    }
+
+    phrase_builder = make_shared<PhraseBuilder>(vocabulary);
+
+    table = make_shared<MockTranslationTable>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      for (size_t j = 0; j < target_words.size(); ++j) {
+        // Subtract as ints: i - j on size_t wraps around whenever i < j.
+        int value = static_cast<int>(i) - static_cast<int>(j);
+        EXPECT_CALL(*table, GetTargetGivenSourceScore(
+            source_words[i], target_words[j])).WillRepeatedly(Return(value));
+      }
+    }
+
+    for (size_t i = 0; i < target_words.size(); ++i) {
+      int value = static_cast<int>(i) * 3;
+      EXPECT_CALL(*table, GetTargetGivenSourceScore(
+          DataArray::NULL_WORD_STR, target_words[i]))
+          .WillRepeatedly(Return(value));
+    }
+
+    feature = make_shared<MaxLexTargetGivenSource>(table);
+  }
+
+  shared_ptr<MockVocabulary> vocabulary;
+  shared_ptr<PhraseBuilder> phrase_builder;
+  shared_ptr<MockTranslationTable> table;
+  shared_ptr<MaxLexTargetGivenSource> feature;
+};
+
+TEST_F(MaxLexTargetGivenSourceTest, TestGetName) {
+  EXPECT_EQ("MaxLexEgivenF", feature->GetName());
+}
+
+TEST_F(MaxLexTargetGivenSourceTest, TestScore) {
+  vector<int> source_symbols = {0, 1, 2};
+  Phrase source_phrase = phrase_builder->Build(source_symbols);
+  vector<int> target_symbols = {3, 4, 5};
+  Phrase target_phrase = phrase_builder->Build(target_symbols);
+  FeatureContext context(source_phrase, target_phrase, 0.3, 7, 19);
+  // Best scores per target word are 2, 3 and 6, giving
+  // -(log10(2) + log10(3) + log10(6)). The feature adds these term by term,
+  // so compare with a ULP tolerance instead of requiring identical rounding.
+  EXPECT_DOUBLE_EQ(-log10(36), feature->Score(context));
+}
+
+} // namespace
diff --git a/extractor/features/sample_source_count.cc b/extractor/features/sample_source_count.cc
new file mode 100644
index 00000000..88b645b1
--- /dev/null
+++ b/extractor/features/sample_source_count.cc
@@ -0,0 +1,11 @@
+#include "sample_source_count.h"
+
+#include <cmath>
+
+double SampleSourceCount::Score(const FeatureContext& context) const {
+  return log10(1.0 + context.num_samples);  // add in double: no int overflow
+}
+
+string SampleSourceCount::GetName() const {
+  return "SampleCountF";
+}
diff --git a/extractor/features/sample_source_count.h b/extractor/features/sample_source_count.h
new file mode 100644
index 00000000..62d236c8
--- /dev/null
+++ b/extractor/features/sample_source_count.h
@@ -0,0 +1,16 @@
+#ifndef _SAMPLE_SOURCE_COUNT_H_
+#define _SAMPLE_SOURCE_COUNT_H_
+
+#include "feature.h"
+
+// Feature whose score is log10(1 + num_samples).
+class SampleSourceCount : public Feature {
+ public:
+  // Returns log10(1 + context.num_samples).
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "SampleCountF".
+  string GetName() const override;
+};
+
+#endif
diff --git a/extractor/features/sample_source_count_test.cc b/extractor/features/sample_source_count_test.cc
new file mode 100644
index 00000000..7d226104
--- /dev/null
+++ b/extractor/features/sample_source_count_test.cc
@@ -0,0 +1,38 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+
+#include "sample_source_count.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture that builds a fresh SampleSourceCount for every test.
+class SampleSourceCountTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<SampleSourceCount>();
+  }
+
+  shared_ptr<SampleSourceCount> feature;
+};
+
+TEST_F(SampleSourceCountTest, TestGetName) {
+  EXPECT_EQ("SampleCountF", feature->GetName());
+}
+
+TEST_F(SampleSourceCountTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0, 3, 1);
+  EXPECT_DOUBLE_EQ(log10(2), feature->Score(context));
+
+  // num_samples = 9, so the expected score is log10(1 + 9) = 1.
+  context = FeatureContext(phrase, phrase, 3.2, 3, 9);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
diff --git a/extractor/features/target_given_source_coherent.cc b/extractor/features/target_given_source_coherent.cc
new file mode 100644
index 00000000..274b3364
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.cc
@@ -0,0 +1,20 @@
+#include "target_given_source_coherent.h"
+
+#include <cmath>
+
+// Returns -log10(pair_count / num_samples). Falls back to MAX_SCORE when
+// num_samples is 0 or when the ratio is not positive.
+double TargetGivenSourceCoherent::Score(const FeatureContext& context) const {
+  // Dividing by zero would give inf or NaN; inf passes the "prob > 0" check
+  // below and would turn into a score of -inf.
+  if (context.num_samples == 0) {
+    return MAX_SCORE;
+  }
+  double prob = static_cast<double>(context.pair_count) / context.num_samples;
+  return prob > 0 ? -log10(prob) : MAX_SCORE;
+}
+
+// Returns the feature's name.
+string TargetGivenSourceCoherent::GetName() const {
+  return "EgivenFCoherent";
+}
diff --git a/extractor/features/target_given_source_coherent.h b/extractor/features/target_given_source_coherent.h
new file mode 100644
index 00000000..09c8edb1
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.h
@@ -0,0 +1,16 @@
+#ifndef _TARGET_GIVEN_SOURCE_COHERENT_H_
+#define _TARGET_GIVEN_SOURCE_COHERENT_H_
+
+#include "feature.h"
+
+// Feature whose score is -log10(pair_count / num_samples).
+class TargetGivenSourceCoherent : public Feature {
+ public:
+  // Returns -log10 of the ratio, or MAX_SCORE when it is not positive.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns "EgivenFCoherent".
+  string GetName() const override;
+};
+
+#endif
diff --git a/extractor/features/target_given_source_coherent_test.cc b/extractor/features/target_given_source_coherent_test.cc
new file mode 100644
index 00000000..c54c06c2
--- /dev/null
+++ b/extractor/features/target_given_source_coherent_test.cc
@@ -0,0 +1,42 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "target_given_source_coherent.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace {
+
+// Fixture that builds a fresh TargetGivenSourceCoherent for every test.
+class TargetGivenSourceCoherentTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<TargetGivenSourceCoherent>();
+  }
+
+  shared_ptr<TargetGivenSourceCoherent> feature;
+};
+
+TEST_F(TargetGivenSourceCoherentTest, TestGetName) {
+  EXPECT_EQ("EgivenFCoherent", feature->GetName());
+}
+
+TEST_F(TargetGivenSourceCoherentTest, TestScore) {
+  Phrase phrase;
+  // 2 / 20 = 0.1, and -log10(0.1) = 1.
+  FeatureContext context(phrase, phrase, 0.3, 2, 20);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+
+  // A zero ratio falls back to MAX_SCORE.
+  context = FeatureContext(phrase, phrase, 1.9, 0, 1);
+  EXPECT_DOUBLE_EQ(99.0, feature->Score(context));
+
+  // So does an empty sample (num_samples == 0).
+  context = FeatureContext(phrase, phrase, 1.9, 3, 0);
+  EXPECT_DOUBLE_EQ(99.0, feature->Score(context));
+}
+
+} // namespace