diff options
Diffstat (limited to 'extractor/features')
23 files changed, 747 insertions, 0 deletions
diff --git a/extractor/features/count_source_target.cc b/extractor/features/count_source_target.cc
new file mode 100644
index 00000000..db0385e0
--- /dev/null
+++ b/extractor/features/count_source_target.cc
@@ -0,0 +1,20 @@
+#include "count_source_target.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Returns log10(1 + pair_count). The +1 keeps a zero count finite (score 0);
+// adding 1.0 does the sum in floating point so it cannot overflow an int.
+double CountSourceTarget::Score(const FeatureContext& context) const {
+  return log10(1.0 + context.pair_count);
+}
+
+// Returns the feature name, "CountEF".
+string CountSourceTarget::GetName() const {
+  return "CountEF";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/count_source_target.h b/extractor/features/count_source_target.h
new file mode 100644
index 00000000..8747fa60
--- /dev/null
+++ b/extractor/features/count_source_target.h
@@ -0,0 +1,23 @@
+#ifndef _COUNT_SOURCE_TARGET_H_
+#define _COUNT_SOURCE_TARGET_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Feature for the number of times a phrase pair was found in the bitext,
+ * scored as log10(1 + pair_count).
+ */
+class CountSourceTarget : public Feature {
+ public:
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/count_source_target_test.cc b/extractor/features/count_source_target_test.cc
new file mode 100644
index 00000000..1fd0c2aa
--- /dev/null
+++ b/extractor/features/count_source_target_test.cc
@@ -0,0 +1,37 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "count_source_target.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class CountSourceTargetTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<CountSourceTarget>();
+  }
+
+  shared_ptr<CountSourceTarget> feature;
+};
+
+TEST_F(CountSourceTargetTest, TestGetName) {
+  EXPECT_EQ("CountEF", feature->GetName());
+}
+
+TEST_F(CountSourceTargetTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0.5, 9, 13);
+  // pair_count is 9, so the expected score is log10(1 + 9) = 1.
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/feature.cc b/extractor/features/feature.cc
new file mode 100644
index 00000000..939bcc59
--- /dev/null
+++ b/extractor/features/feature.cc
@@ -0,0 +1,12 @@
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+// Substituted for -log10(p) by features when p is not positive.
+const double Feature::MAX_SCORE = 99.0;
+
+Feature::~Feature() {}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/feature.h b/extractor/features/feature.h
new file mode 100644
index 00000000..36ea504a
--- /dev/null
+++ b/extractor/features/feature.h
@@ -0,0 +1,57 @@
+#ifndef _FEATURE_H_
+#define _FEATURE_H_
+
+#include <string>
+
+#include "phrase.h"
+
+// NOTE(review): a using-directive in a header leaks into every includer. It is
+// kept because the feature headers declare GetName() with unqualified `string`.
+using namespace std;
+
+namespace extractor {
+namespace features {
+
+/**
+ * Structure providing context for computing feature scores.
+ */
+struct FeatureContext {
+  FeatureContext(const Phrase& source_phrase, const Phrase& target_phrase,
+                 double source_phrase_count, int pair_count, int num_samples) :
+    source_phrase(source_phrase), target_phrase(target_phrase),
+    source_phrase_count(source_phrase_count), pair_count(pair_count),
+    num_samples(num_samples) {}
+
+  Phrase source_phrase;
+  Phrase target_phrase;
+  // Count of the source phrase; IsSourceSingleton tests it against 1.
+  double source_phrase_count;
+  // Count of the (source, target) phrase pair.
+  int pair_count;
+  // Number of sampled source phrase occurrences; used as the denominator in
+  // TargetGivenSourceCoherent.
+  int num_samples;
+};
+
+/**
+ * Base class for features. Each feature maps a FeatureContext to a score and
+ * has a name.
+ */
+class Feature {
+ public:
+  // Computes the feature value for the given phrase pair context.
+  virtual double Score(const FeatureContext& context) const = 0;
+
+  // Returns the name of the feature.
+  virtual string GetName() const = 0;
+
+  virtual ~Feature();
+
+  // Score used instead of -log10(p) when p is not positive (defined as 99.0).
+  static const double MAX_SCORE;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc
new file mode 100644
index 00000000..1abb486f
--- /dev/null
+++ b/extractor/features/is_source_singleton.cc
@@ -0,0 +1,19 @@
+#include "is_source_singleton.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Returns 1 when source_phrase_count is 1 and 0 otherwise. The count is a
+// double, so it is compared with a tolerance of 1e-6 instead of ==.
+double IsSourceSingleton::Score(const FeatureContext& context) const {
+  return fabs(context.source_phrase_count - 1) < 1e-6;
+}
+
+string IsSourceSingleton::GetName() const {
+  return "IsSingletonF";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/is_source_singleton.h b/extractor/features/is_source_singleton.h
new file mode 100644
index 00000000..b8352d0e
--- /dev/null
+++ b/extractor/features/is_source_singleton.h
@@ -0,0 +1,22 @@
+#ifndef _IS_SOURCE_SINGLETON_H_
+#define _IS_SOURCE_SINGLETON_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Boolean feature checking if the source phrase occurs only once in the data.
+ */
+class IsSourceSingleton : public Feature {
+ public:
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/is_source_singleton_test.cc b/extractor/features/is_source_singleton_test.cc
new file mode 100644
index 00000000..f4266671
--- /dev/null
+++ b/extractor/features/is_source_singleton_test.cc
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "is_source_singleton.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class IsSourceSingletonTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<IsSourceSingleton>();
+  }
+
+  shared_ptr<IsSourceSingleton> feature;
+};
+
+TEST_F(IsSourceSingletonTest, TestGetName) {
+  EXPECT_EQ("IsSingletonF", feature->GetName());
+}
+
+TEST_F(IsSourceSingletonTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0.5, 3, 31);
+  EXPECT_EQ(0, feature->Score(context));
+
+  context = FeatureContext(phrase, phrase, 1, 3, 25);
+  EXPECT_EQ(1, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/is_source_target_singleton.cc b/extractor/features/is_source_target_singleton.cc
new file mode 100644
index 00000000..03b3c62c
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.cc
@@ -0,0 +1,18 @@
+#include "is_source_target_singleton.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Returns 1 when pair_count is exactly 1 and 0 otherwise.
+double IsSourceTargetSingleton::Score(const FeatureContext& context) const {
+  return context.pair_count == 1;
+}
+
+string IsSourceTargetSingleton::GetName() const {
+  return "IsSingletonFE";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/is_source_target_singleton.h b/extractor/features/is_source_target_singleton.h
new file mode
100644
index 00000000..dacfebba
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.h
@@ -0,0 +1,22 @@
+#ifndef _IS_SOURCE_TARGET_SINGLETON_H_
+#define _IS_SOURCE_TARGET_SINGLETON_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Boolean feature checking if the phrase pair occurs only once in the data.
+ */
+class IsSourceTargetSingleton : public Feature {
+ public:
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/is_source_target_singleton_test.cc b/extractor/features/is_source_target_singleton_test.cc
new file mode 100644
index 00000000..929635b0
--- /dev/null
+++ b/extractor/features/is_source_target_singleton_test.cc
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "is_source_target_singleton.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class IsSourceTargetSingletonTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<IsSourceTargetSingleton>();
+  }
+
+  shared_ptr<IsSourceTargetSingleton> feature;
+};
+
+TEST_F(IsSourceTargetSingletonTest, TestGetName) {
+  EXPECT_EQ("IsSingletonFE", feature->GetName());
+}
+
+TEST_F(IsSourceTargetSingletonTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0.5, 3, 7);
+  EXPECT_EQ(0, feature->Score(context));
+
+  context = FeatureContext(phrase, phrase, 2.3, 1, 28);
+  EXPECT_EQ(1, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_source_given_target.cc b/extractor/features/max_lex_source_given_target.cc
new file mode 100644
index 00000000..65d0ec68
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.cc
@@ -0,0 +1,44 @@
+#include "max_lex_source_given_target.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "data_array.h"
+#include "translation_table.h"
+
+namespace extractor {
+namespace features {
+
+MaxLexSourceGivenTarget::MaxLexSourceGivenTarget(
+    shared_ptr<TranslationTable> table) :
+    table(table) {}
+
+// For every source word, takes the best table score against any target word
+// (or the NULL word) and adds -log10 of it to the total. A source word whose
+// best score is not positive contributes MAX_SCORE instead.
+double MaxLexSourceGivenTarget::Score(const FeatureContext& context) const {
+  const vector<string>& source_words = context.source_phrase.GetWords();
+  vector<string> target_words = context.target_phrase.GetWords();
+  target_words.push_back(DataArray::NULL_WORD_STR);
+
+  double score = 0;
+  // Iterate by const reference: copying each word per iteration is wasteful.
+  for (const string& source_word : source_words) {
+    double max_score = 0;
+    for (const string& target_word : target_words) {
+      max_score = max(max_score,
+          table->GetSourceGivenTargetScore(source_word, target_word));
+    }
+    score += max_score > 0 ? -log10(max_score) : MAX_SCORE;
+  }
+  return score;
+}
+
+string MaxLexSourceGivenTarget::GetName() const {
+  return "MaxLexFgivenE";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_source_given_target.h b/extractor/features/max_lex_source_given_target.h
new file mode 100644
index 00000000..461b0ebf
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.h
@@ -0,0 +1,35 @@
+#ifndef _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+#define _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+namespace extractor {
+
+class TranslationTable;
+
+namespace features {
+
+/**
+ * Feature summing, over the source words f, -log10 of the best table score of
+ * f given any target word e (or the NULL word).
+ */
+class MaxLexSourceGivenTarget : public Feature {
+ public:
+  MaxLexSourceGivenTarget(shared_ptr<TranslationTable> table);
+
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+
+ private:
+  shared_ptr<TranslationTable> table;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/max_lex_source_given_target_test.cc b/extractor/features/max_lex_source_given_target_test.cc
new file mode 100644
index 00000000..7f6aae41
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target_test.cc
@@ -0,0 +1,84 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "data_array.h"
+#include "mocks/mock_translation_table.h"
+#include "mocks/mock_vocabulary.h"
+#include "phrase_builder.h"
+#include "max_lex_source_given_target.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class MaxLexSourceGivenTargetTest : public Test {
+ protected:
+  void SetUp() override {
+    vector<string> source_words = {"f1", "f2", "f3"};
+    vector<string> target_words = {"e1", "e2", "e3"};
+
+    // GetTerminalValue(0..2) yields the source words, (3..5) the target words.
+    vocabulary = make_shared<MockVocabulary>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i))
+          .WillRepeatedly(Return(source_words[i]));
+    }
+    for (size_t i = 0; i < target_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i + source_words.size()))
+          .WillRepeatedly(Return(target_words[i]));
+    }
+
+    phrase_builder = make_shared<PhraseBuilder>(vocabulary);
+
+    // Score for (f_i, e_j) is i - j; score for (f_i, NULL) is 3 * i.
+    table = make_shared<MockTranslationTable>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      for (size_t j = 0; j < target_words.size(); ++j) {
+        // Subtract as ints: size_t subtraction would wrap around for i < j.
+        int value = static_cast<int>(i) - static_cast<int>(j);
+        EXPECT_CALL(*table, GetSourceGivenTargetScore(
+            source_words[i], target_words[j])).WillRepeatedly(Return(value));
+      }
+    }
+
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      int value = static_cast<int>(i) * 3;
+      EXPECT_CALL(*table, GetSourceGivenTargetScore(
+          source_words[i], DataArray::NULL_WORD_STR))
+          .WillRepeatedly(Return(value));
+    }
+
+    feature = make_shared<MaxLexSourceGivenTarget>(table);
+  }
+
+  shared_ptr<MockVocabulary> vocabulary;
+  shared_ptr<PhraseBuilder> phrase_builder;
+  shared_ptr<MockTranslationTable> table;
+  shared_ptr<MaxLexSourceGivenTarget> feature;
+};
+
+TEST_F(MaxLexSourceGivenTargetTest, TestGetName) {
+  EXPECT_EQ("MaxLexFgivenE", feature->GetName());
+}
+
+TEST_F(MaxLexSourceGivenTargetTest, TestScore) {
+  vector<int> source_symbols = {0, 1, 2};
+  Phrase source_phrase = phrase_builder->Build(source_symbols);
+  vector<int> target_symbols = {3, 4, 5};
+  Phrase target_phrase = phrase_builder->Build(target_symbols);
+  FeatureContext context(source_phrase, target_phrase, 0.3, 7, 11);
+  // Best scores per source word are 0, 3 and 6, giving 99 - log10(3) -
+  // log10(6). The sum is accumulated in floating point, so allow rounding.
+  EXPECT_DOUBLE_EQ(99 - log10(18), feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_target_given_source.cc b/extractor/features/max_lex_target_given_source.cc
new file mode 100644
index 00000000..33783054
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.cc
@@ -0,0 +1,44 @@
+#include "max_lex_target_given_source.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "data_array.h"
+#include "translation_table.h"
+
+namespace extractor {
+namespace features {
+
+MaxLexTargetGivenSource::MaxLexTargetGivenSource(
+    shared_ptr<TranslationTable> table) :
+    table(table) {}
+
+// For every target word, takes the best table score against any source word
+// (or the NULL word) and adds -log10 of it to the total. A target word whose
+// best score is not positive contributes MAX_SCORE instead.
+double MaxLexTargetGivenSource::Score(const FeatureContext& context) const {
+  vector<string> source_words = context.source_phrase.GetWords();
+  source_words.push_back(DataArray::NULL_WORD_STR);
+  const vector<string>& target_words = context.target_phrase.GetWords();
+
+  double score = 0;
+  // Iterate by const reference: copying each word per iteration is wasteful.
+  for (const string& target_word : target_words) {
+    double max_score = 0;
+    for (const string& source_word : source_words) {
+      max_score = max(max_score,
+          table->GetTargetGivenSourceScore(source_word, target_word));
+    }
+    score += max_score > 0 ? -log10(max_score) : MAX_SCORE;
+  }
+  return score;
+}
+
+string MaxLexTargetGivenSource::GetName() const {
+  return "MaxLexEgivenF";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_target_given_source.h b/extractor/features/max_lex_target_given_source.h
new file mode 100644
index 00000000..c3c87327
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.h
@@ -0,0 +1,35 @@
+#ifndef _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+#define _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+namespace extractor {
+
+class TranslationTable;
+
+namespace features {
+
+/**
+ * Feature summing, over the target words e, -log10 of the best table score of
+ * e given any source word f (or the NULL word).
+ */
+class MaxLexTargetGivenSource : public Feature {
+ public:
+  MaxLexTargetGivenSource(shared_ptr<TranslationTable> table);
+
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+
+ private:
+  shared_ptr<TranslationTable> table;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/max_lex_target_given_source_test.cc b/extractor/features/max_lex_target_given_source_test.cc
new file mode 100644
index 00000000..6d0efd9c
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source_test.cc
@@ -0,0 +1,84 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "data_array.h"
+#include "mocks/mock_translation_table.h"
+#include "mocks/mock_vocabulary.h"
+#include "phrase_builder.h"
+#include "max_lex_target_given_source.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class MaxLexTargetGivenSourceTest : public Test {
+ protected:
+  void SetUp() override {
+    vector<string> source_words = {"f1", "f2", "f3"};
+    vector<string> target_words = {"e1", "e2", "e3"};
+
+    // GetTerminalValue(0..2) yields the source words, (3..5) the target words.
+    vocabulary = make_shared<MockVocabulary>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i))
+          .WillRepeatedly(Return(source_words[i]));
+    }
+    for (size_t i = 0; i < target_words.size(); ++i) {
+      EXPECT_CALL(*vocabulary, GetTerminalValue(i + source_words.size()))
+          .WillRepeatedly(Return(target_words[i]));
+    }
+
+    phrase_builder = make_shared<PhraseBuilder>(vocabulary);
+
+    // Score for (f_i, e_j) is i - j; score for (NULL, e_j) is 3 * j.
+    table = make_shared<MockTranslationTable>();
+    for (size_t i = 0; i < source_words.size(); ++i) {
+      for (size_t j = 0; j < target_words.size(); ++j) {
+        // Subtract as ints: size_t subtraction would wrap around for i < j.
+        int value = static_cast<int>(i) - static_cast<int>(j);
+        EXPECT_CALL(*table, GetTargetGivenSourceScore(
+            source_words[i], target_words[j])).WillRepeatedly(Return(value));
+      }
+    }
+
+    for (size_t i = 0; i < target_words.size(); ++i) {
+      int value = static_cast<int>(i) * 3;
+      EXPECT_CALL(*table, GetTargetGivenSourceScore(
+          DataArray::NULL_WORD_STR, target_words[i]))
+          .WillRepeatedly(Return(value));
+    }
+
+    feature = make_shared<MaxLexTargetGivenSource>(table);
+  }
+
+  shared_ptr<MockVocabulary> vocabulary;
+  shared_ptr<PhraseBuilder> phrase_builder;
+  shared_ptr<MockTranslationTable> table;
+  shared_ptr<MaxLexTargetGivenSource> feature;
+};
+
+TEST_F(MaxLexTargetGivenSourceTest, TestGetName) {
+  EXPECT_EQ("MaxLexEgivenF", feature->GetName());
+}
+
+TEST_F(MaxLexTargetGivenSourceTest, TestScore) {
+  vector<int> source_symbols = {0, 1, 2};
+  Phrase source_phrase = phrase_builder->Build(source_symbols);
+  vector<int> target_symbols = {3, 4, 5};
+  Phrase target_phrase = phrase_builder->Build(target_symbols);
+  FeatureContext context(source_phrase, target_phrase, 0.3, 7, 19);
+  // Best scores per target word are 2, 3 and 6, giving -log10(2) - log10(3) -
+  // log10(6). The sum is accumulated in floating point, so allow rounding.
+  EXPECT_DOUBLE_EQ(-log10(36), feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/sample_source_count.cc b/extractor/features/sample_source_count.cc
new file mode 100644
index 00000000..b110fc51
--- /dev/null
+++ b/extractor/features/sample_source_count.cc
@@ -0,0 +1,18 @@
+#include "sample_source_count.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Returns log10(1 + num_samples), with the sum done in floating point.
+double SampleSourceCount::Score(const FeatureContext& context) const {
+  return log10(1.0 + context.num_samples);
+}
+
+string SampleSourceCount::GetName() const {
+  return "SampleCountF";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/sample_source_count.h b/extractor/features/sample_source_count.h
new file mode 100644
index 00000000..ee6e59a0
--- /dev/null
+++ b/extractor/features/sample_source_count.h
@@ -0,0 +1,23 @@
+#ifndef _SAMPLE_SOURCE_COUNT_H_
+#define _SAMPLE_SOURCE_COUNT_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Feature scoring the number of times the source phrase occurs in the sampled
+ * set, as log10(1 + num_samples).
+ */
+class SampleSourceCount : public Feature {
+ public:
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/sample_source_count_test.cc b/extractor/features/sample_source_count_test.cc
new file mode 100644
index 00000000..63856b9d
--- /dev/null
+++ b/extractor/features/sample_source_count_test.cc
@@ -0,0 +1,40 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+
+#include "sample_source_count.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class SampleSourceCountTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<SampleSourceCount>();
+  }
+
+  shared_ptr<SampleSourceCount> feature;
+};
+
+TEST_F(SampleSourceCountTest, TestGetName) {
+  EXPECT_EQ("SampleCountF", feature->GetName());
+}
+
+TEST_F(SampleSourceCountTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0, 3, 1);
+  EXPECT_DOUBLE_EQ(log10(2), feature->Score(context));
+
+  context = FeatureContext(phrase, phrase, 3.2, 3, 9);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff
--git a/extractor/features/target_given_source_coherent.cc b/extractor/features/target_given_source_coherent.cc
new file mode 100644
index 00000000..c4551d88
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.cc
@@ -0,0 +1,25 @@
+#include "target_given_source_coherent.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Returns -log10(pair_count / num_samples), or MAX_SCORE when that ratio is not
+// a positive number.
+double TargetGivenSourceCoherent::Score(const FeatureContext& context) const {
+  // Without this guard a zero sample count with a positive pair count yields
+  // +inf, and the score would be -inf.
+  if (context.num_samples == 0) {
+    return MAX_SCORE;
+  }
+  double prob = static_cast<double>(context.pair_count) / context.num_samples;
+  return prob > 0 ? -log10(prob) : MAX_SCORE;
+}
+
+string TargetGivenSourceCoherent::GetName() const {
+  return "EgivenFCoherent";
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/target_given_source_coherent.h b/extractor/features/target_given_source_coherent.h
new file mode 100644
index 00000000..e66d70a5
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.h
@@ -0,0 +1,23 @@
+#ifndef _TARGET_GIVEN_SOURCE_COHERENT_H_
+#define _TARGET_GIVEN_SOURCE_COHERENT_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Feature computing -log10 of the ratio of the phrase pair count over all
+ * source phrase occurrences (sampled).
+ */
+class TargetGivenSourceCoherent : public Feature {
+ public:
+  double Score(const FeatureContext& context) const override;
+
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/target_given_source_coherent_test.cc b/extractor/features/target_given_source_coherent_test.cc
new file mode 100644
index 00000000..454105e1
--- /dev/null
+++ b/extractor/features/target_given_source_coherent_test.cc
@@ -0,0 +1,43 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "target_given_source_coherent.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class TargetGivenSourceCoherentTest : public Test {
+ protected:
+  void SetUp() override {
+    feature = make_shared<TargetGivenSourceCoherent>();
+  }
+
+  shared_ptr<TargetGivenSourceCoherent> feature;
+};
+
+TEST_F(TargetGivenSourceCoherentTest, TestGetName) {
+  EXPECT_EQ("EgivenFCoherent", feature->GetName());
+}
+
+TEST_F(TargetGivenSourceCoherentTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0.3, 2, 20);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+
+  context = FeatureContext(phrase, phrase, 1.9, 0, 1);
+  EXPECT_DOUBLE_EQ(99.0, feature->Score(context));
+
+  // No samples at all: the ratio is undefined, so MAX_SCORE is expected.
+  context = FeatureContext(phrase, phrase, 1.9, 3, 0);
+  EXPECT_DOUBLE_EQ(99.0, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor