summaryrefslogtreecommitdiff
path: root/extractor/features
diff options
context:
space:
mode:
authorChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
committerChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
commit6d347f1ce078dede3da0e1498f75e357351c6543 (patch)
tree8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/features
parentd11b76def6899790161c47a73018146311356d8b (diff)
parent5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff)
merge paul's extractor code
Diffstat (limited to 'extractor/features')
-rw-r--r--extractor/features/count_source_target.cc17
-rw-r--r--extractor/features/count_source_target.h22
-rw-r--r--extractor/features/count_source_target_test.cc36
-rw-r--r--extractor/features/feature.cc11
-rw-r--r--extractor/features/feature.h47
-rw-r--r--extractor/features/is_source_singleton.cc17
-rw-r--r--extractor/features/is_source_singleton.h22
-rw-r--r--extractor/features/is_source_singleton_test.cc39
-rw-r--r--extractor/features/is_source_target_singleton.cc17
-rw-r--r--extractor/features/is_source_target_singleton.h22
-rw-r--r--extractor/features/is_source_target_singleton_test.cc39
-rw-r--r--extractor/features/max_lex_source_given_target.cc37
-rw-r--r--extractor/features/max_lex_source_given_target.h34
-rw-r--r--extractor/features/max_lex_source_given_target_test.cc78
-rw-r--r--extractor/features/max_lex_target_given_source.cc37
-rw-r--r--extractor/features/max_lex_target_given_source.h34
-rw-r--r--extractor/features/max_lex_target_given_source_test.cc78
-rw-r--r--extractor/features/sample_source_count.cc17
-rw-r--r--extractor/features/sample_source_count.h23
-rw-r--r--extractor/features/sample_source_count_test.cc40
-rw-r--r--extractor/features/target_given_source_coherent.cc18
-rw-r--r--extractor/features/target_given_source_coherent.h23
-rw-r--r--extractor/features/target_given_source_coherent_test.cc39
23 files changed, 747 insertions, 0 deletions
diff --git a/extractor/features/count_source_target.cc b/extractor/features/count_source_target.cc
new file mode 100644
index 00000000..db0385e0
--- /dev/null
+++ b/extractor/features/count_source_target.cc
@@ -0,0 +1,17 @@
+#include "count_source_target.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Scores the phrase pair by its frequency, damped as log10(1 + pair_count).
+double CountSourceTarget::Score(const FeatureContext& context) const {
+  // Adding 1.0 (not 1) performs the sum in double, so it cannot overflow int.
+  return log10(1.0 + context.pair_count);
+}
+
+// Returns the identifier string of this feature.
+string CountSourceTarget::GetName() const {
+  return string("CountEF");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/count_source_target.h b/extractor/features/count_source_target.h
new file mode 100644
index 00000000..8747fa60
--- /dev/null
+++ b/extractor/features/count_source_target.h
@@ -0,0 +1,22 @@
+#ifndef _COUNT_SOURCE_TARGET_H_
+#define _COUNT_SOURCE_TARGET_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Feature for the number of times the phrase pair was found in the bitext,
+ * scored as log10(1 + pair_count).
+ */
+class CountSourceTarget : public Feature {
+ public:
+  // Returns log10(1 + context.pair_count).
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "CountEF".
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/count_source_target_test.cc b/extractor/features/count_source_target_test.cc
new file mode 100644
index 00000000..1fd0c2aa
--- /dev/null
+++ b/extractor/features/count_source_target_test.cc
@@ -0,0 +1,36 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "count_source_target.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+// Fixture creating a fresh CountSourceTarget feature before every test.
+class CountSourceTargetTest : public Test {
+ protected:
+  // `override` makes the compiler verify this really replaces Test::SetUp.
+  void SetUp() override {
+    feature = make_shared<CountSourceTarget>();
+  }
+
+  shared_ptr<CountSourceTarget> feature;
+};
+
+// The feature must identify itself as "CountEF".
+TEST_F(CountSourceTargetTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("CountEF", name);
+}
+
+// With pair_count = 9 the expected score is log10(1 + 9) = 1.
+TEST_F(CountSourceTargetTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0.5, 9, 13);
+  // The score comes out of log10, so compare with a floating-point tolerance.
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/feature.cc b/extractor/features/feature.cc
new file mode 100644
index 00000000..939bcc59
--- /dev/null
+++ b/extractor/features/feature.cc
@@ -0,0 +1,11 @@
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+// Penalty score: the features in this directory return it in place of
+// -log10(p) when p is not positive.
+const double Feature::MAX_SCORE = 99.0;
+
+// Defined out of line; the base class itself holds no per-object state.
+Feature::~Feature() = default;
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/feature.h b/extractor/features/feature.h
new file mode 100644
index 00000000..36ea504a
--- /dev/null
+++ b/extractor/features/feature.h
@@ -0,0 +1,47 @@
+#ifndef _FEATURE_H_
+#define _FEATURE_H_
+
+#include <string>
+
+#include "phrase.h"
+
+using namespace std;
+
+namespace extractor {
+namespace features {
+
+/**
+ * Structure providing context for computing feature scores: the phrase pair
+ * being scored together with the counts the features read.
+ */
+struct FeatureContext {
+ FeatureContext(const Phrase& source_phrase, const Phrase& target_phrase,
+ double source_phrase_count, int pair_count, int num_samples) :
+ source_phrase(source_phrase), target_phrase(target_phrase),
+ source_phrase_count(source_phrase_count), pair_count(pair_count),
+ num_samples(num_samples) {}
+
+ Phrase source_phrase;  // source side of the pair (stored by value)
+ Phrase target_phrase;  // target side of the pair (stored by value)
+ double source_phrase_count;  // compared against 1 by IsSourceSingleton
+ int pair_count;  // occurrences of the phrase pair
+ int num_samples;  // sampled occurrences of the source phrase
+};
+
+/**
+ * Base class for features. A feature maps a FeatureContext to a numeric score
+ * and exposes a name.
+ */
+class Feature {
+ public:
+ virtual double Score(const FeatureContext& context) const = 0;  // score for one context
+
+ virtual string GetName() const = 0;  // feature identifier string
+
+ virtual ~Feature();  // virtual, so deleting through a Feature pointer is safe
+
+ static const double MAX_SCORE;  // 99.0 (feature.cc); penalty for non-positive probabilities
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc
new file mode 100644
index 00000000..1abb486f
--- /dev/null
+++ b/extractor/features/is_source_singleton.cc
@@ -0,0 +1,17 @@
+#include "is_source_singleton.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Indicator feature: 1 when the source phrase count equals 1, 0 otherwise.
+double IsSourceSingleton::Score(const FeatureContext& context) const {
+  // The count is a double, so equality is tested within a 1e-6 tolerance.
+  const bool is_singleton = fabs(context.source_phrase_count - 1) < 1e-6;
+  return is_singleton ? 1.0 : 0.0;
+}
+
+// Returns the identifier string of this feature.
+string IsSourceSingleton::GetName() const {
+  return string("IsSingletonF");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/is_source_singleton.h b/extractor/features/is_source_singleton.h
new file mode 100644
index 00000000..b8352d0e
--- /dev/null
+++ b/extractor/features/is_source_singleton.h
@@ -0,0 +1,22 @@
+#ifndef _IS_SOURCE_SINGLETON_H_
+#define _IS_SOURCE_SINGLETON_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Boolean feature checking if the source phrase occurs only once in the data.
+ */
+class IsSourceSingleton : public Feature {
+ public:
+  // Returns 1 if context.source_phrase_count is within 1e-6 of 1, else 0.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "IsSingletonF".
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/is_source_singleton_test.cc b/extractor/features/is_source_singleton_test.cc
new file mode 100644
index 00000000..f4266671
--- /dev/null
+++ b/extractor/features/is_source_singleton_test.cc
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "is_source_singleton.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+// Fixture creating a fresh IsSourceSingleton feature before every test.
+class IsSourceSingletonTest : public Test {
+ protected:
+  // `override` makes the compiler verify this really replaces Test::SetUp.
+  void SetUp() override {
+    feature = make_shared<IsSourceSingleton>();
+  }
+
+  shared_ptr<IsSourceSingleton> feature;
+};
+
+// The feature must identify itself as "IsSingletonF".
+TEST_F(IsSourceSingletonTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("IsSingletonF", name);
+}
+
+// A source count of 0.5 is not a singleton; a count of exactly 1 is.
+TEST_F(IsSourceSingletonTest, TestScore) {
+  Phrase phrase;
+
+  FeatureContext non_singleton(phrase, phrase, 0.5, 3, 31);
+  EXPECT_EQ(0, feature->Score(non_singleton));
+
+  FeatureContext singleton(phrase, phrase, 1, 3, 25);
+  EXPECT_EQ(1, feature->Score(singleton));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/is_source_target_singleton.cc b/extractor/features/is_source_target_singleton.cc
new file mode 100644
index 00000000..03b3c62c
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.cc
@@ -0,0 +1,17 @@
+#include "is_source_target_singleton.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Indicator feature: 1 when the phrase pair was counted exactly once, else 0.
+double IsSourceTargetSingleton::Score(const FeatureContext& context) const {
+  const bool is_singleton = (context.pair_count == 1);
+  return is_singleton ? 1.0 : 0.0;
+}
+
+// Returns the identifier string of this feature.
+string IsSourceTargetSingleton::GetName() const {
+  return string("IsSingletonFE");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/is_source_target_singleton.h b/extractor/features/is_source_target_singleton.h
new file mode 100644
index 00000000..dacfebba
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.h
@@ -0,0 +1,22 @@
+#ifndef _IS_SOURCE_TARGET_SINGLETON_H_
+#define _IS_SOURCE_TARGET_SINGLETON_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Boolean feature checking if the phrase pair occurs only once in the data.
+ */
+class IsSourceTargetSingleton : public Feature {
+ public:
+  // Returns 1 if context.pair_count == 1, else 0.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "IsSingletonFE".
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/is_source_target_singleton_test.cc b/extractor/features/is_source_target_singleton_test.cc
new file mode 100644
index 00000000..929635b0
--- /dev/null
+++ b/extractor/features/is_source_target_singleton_test.cc
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "is_source_target_singleton.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+// Fixture creating a fresh IsSourceTargetSingleton feature before every test.
+class IsSourceTargetSingletonTest : public Test {
+ protected:
+  // `override` makes the compiler verify this really replaces Test::SetUp.
+  void SetUp() override {
+    feature = make_shared<IsSourceTargetSingleton>();
+  }
+
+  shared_ptr<IsSourceTargetSingleton> feature;
+};
+
+// The feature must identify itself as "IsSingletonFE".
+TEST_F(IsSourceTargetSingletonTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("IsSingletonFE", name);
+}
+
+// A pair count of 3 is not a singleton; a pair count of 1 is.
+TEST_F(IsSourceTargetSingletonTest, TestScore) {
+  Phrase phrase;
+
+  FeatureContext non_singleton(phrase, phrase, 0.5, 3, 7);
+  EXPECT_EQ(0, feature->Score(non_singleton));
+
+  FeatureContext singleton(phrase, phrase, 2.3, 1, 28);
+  EXPECT_EQ(1, feature->Score(singleton));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_source_given_target.cc b/extractor/features/max_lex_source_given_target.cc
new file mode 100644
index 00000000..65d0ec68
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.cc
@@ -0,0 +1,37 @@
+#include "max_lex_source_given_target.h"
+
+#include <cmath>
+
+#include "data_array.h"
+#include "translation_table.h"
+
+namespace extractor {
+namespace features {
+
+MaxLexSourceGivenTarget::MaxLexSourceGivenTarget(
+ shared_ptr<TranslationTable> table) :
+ table(table) {}
+
+// Sums, over the source words f, -log10 of the best p(f | e) found among the
+// target words e and the NULL word.
+double MaxLexSourceGivenTarget::Score(const FeatureContext& context) const {
+  // Only the target side is modified (NULL is appended), so only it is copied.
+  const vector<string>& source_words = context.source_phrase.GetWords();
+  vector<string> target_words = context.target_phrase.GetWords();
+  target_words.push_back(DataArray::NULL_WORD_STR);
+
+  double score = 0;
+  // Iterate by const reference: copying a string per step is wasted work.
+  for (const string& source_word : source_words) {
+    double best = 0;
+    for (const string& target_word : target_words) {
+      best = max(best,
+                 table->GetSourceGivenTargetScore(source_word, target_word));
+    }
+    // No positive probability for this word: charge the fixed penalty.
+    score += best > 0 ? -log10(best) : MAX_SCORE;
+  }
+  return score;
+}
+
+// Returns the identifier string of this feature.
+string MaxLexSourceGivenTarget::GetName() const {
+  return string("MaxLexFgivenE");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_source_given_target.h b/extractor/features/max_lex_source_given_target.h
new file mode 100644
index 00000000..461b0ebf
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.h
@@ -0,0 +1,34 @@
+#ifndef _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+#define _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+namespace extractor {
+
+class TranslationTable;
+
+namespace features {
+
+/**
+ * Feature summing, over the source words f, -log10 of max p(f | e) taken over
+ * the target words e and the NULL word. Source words whose maximum is not
+ * positive contribute Feature::MAX_SCORE instead.
+ */
+class MaxLexSourceGivenTarget : public Feature {
+ public:
+  // `table` supplies the p(f | e) scores; it is kept as a shared_ptr member.
+  MaxLexSourceGivenTarget(shared_ptr<TranslationTable> table);
+
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "MaxLexFgivenE".
+  string GetName() const override;
+
+ private:
+  shared_ptr<TranslationTable> table;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/max_lex_source_given_target_test.cc b/extractor/features/max_lex_source_given_target_test.cc
new file mode 100644
index 00000000..7f6aae41
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target_test.cc
@@ -0,0 +1,78 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+
+#include "data_array.h"
+#include "mocks/mock_translation_table.h"
+#include "mocks/mock_vocabulary.h"
+#include "phrase_builder.h"
+#include "max_lex_source_given_target.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class MaxLexSourceGivenTargetTest : public Test {  // fixture: mock vocabulary + mock translation table
+ protected:
+ virtual void SetUp() {
+ vector<string> source_words = {"f1", "f2", "f3"};  // served for symbol ids 0..2
+ vector<string> target_words = {"e1", "e2", "e3"};  // served for symbol ids 3..5
+
+ vocabulary = make_shared<MockVocabulary>();
+ for (size_t i = 0; i < source_words.size(); ++i) {
+ EXPECT_CALL(*vocabulary, GetTerminalValue(i))
+ .WillRepeatedly(Return(source_words[i]));
+ }
+ for (size_t i = 0; i < target_words.size(); ++i) {
+ EXPECT_CALL(*vocabulary, GetTerminalValue(i + source_words.size()))
+ .WillRepeatedly(Return(target_words[i]));
+ }
+
+ phrase_builder = make_shared<PhraseBuilder>(vocabulary);
+
+ table = make_shared<MockTranslationTable>();
+ for (size_t i = 0; i < source_words.size(); ++i) {
+ for (size_t j = 0; j < target_words.size(); ++j) {
+ int value = i - j;  // mocked score for (f_i, e_j); zero or negative when i <= j
+ EXPECT_CALL(*table, GetSourceGivenTargetScore(
+ source_words[i], target_words[j])).WillRepeatedly(Return(value));
+ }
+ }
+
+ for (size_t i = 0; i < source_words.size(); ++i) {
+ int value = i * 3;  // mocked score for (f_i, NULL word): 0, 3, 6
+ EXPECT_CALL(*table, GetSourceGivenTargetScore(
+ source_words[i], DataArray::NULL_WORD_STR))
+ .WillRepeatedly(Return(value));
+ }
+
+ feature = make_shared<MaxLexSourceGivenTarget>(table);
+ }
+
+ shared_ptr<MockVocabulary> vocabulary;
+ shared_ptr<PhraseBuilder> phrase_builder;
+ shared_ptr<MockTranslationTable> table;
+ shared_ptr<MaxLexSourceGivenTarget> feature;  // feature under test, built on the mock table
+};
+
+// The feature must identify itself as "MaxLexFgivenE".
+TEST_F(MaxLexSourceGivenTargetTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("MaxLexFgivenE", name);
+}
+
+// Expected: f1 has no positive score (MAX_SCORE = 99); f2 and f3 peak on the
+// NULL word with 3 and 6, giving 99 - log10(3) - log10(6) = 99 - log10(18).
+TEST_F(MaxLexSourceGivenTargetTest, TestScore) {
+  vector<int> source_symbols = {0, 1, 2};
+  Phrase source_phrase = phrase_builder->Build(source_symbols);
+  vector<int> target_symbols = {3, 4, 5};
+  Phrase target_phrase = phrase_builder->Build(target_symbols);
+  FeatureContext context(source_phrase, target_phrase, 0.3, 7, 11);
+  // The score is a sum of log10 terms, so an exact == would hinge on rounding.
+  EXPECT_DOUBLE_EQ(99 - log10(18), feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_target_given_source.cc b/extractor/features/max_lex_target_given_source.cc
new file mode 100644
index 00000000..33783054
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.cc
@@ -0,0 +1,37 @@
+#include "max_lex_target_given_source.h"
+
+#include <cmath>
+
+#include "data_array.h"
+#include "translation_table.h"
+
+namespace extractor {
+namespace features {
+
+MaxLexTargetGivenSource::MaxLexTargetGivenSource(
+ shared_ptr<TranslationTable> table) :
+ table(table) {}
+
+// Sums, over the target words e, -log10 of the best p(e | f) found among the
+// source words f and the NULL word.
+double MaxLexTargetGivenSource::Score(const FeatureContext& context) const {
+  // Only the source side is modified (NULL is appended), so only it is copied.
+  vector<string> source_words = context.source_phrase.GetWords();
+  source_words.push_back(DataArray::NULL_WORD_STR);
+  const vector<string>& target_words = context.target_phrase.GetWords();
+
+  double score = 0;
+  // Iterate by const reference: copying a string per step is wasted work.
+  for (const string& target_word : target_words) {
+    double best = 0;
+    for (const string& source_word : source_words) {
+      best = max(best,
+                 table->GetTargetGivenSourceScore(source_word, target_word));
+    }
+    // No positive probability for this word: charge the fixed penalty.
+    score += best > 0 ? -log10(best) : MAX_SCORE;
+  }
+  return score;
+}
+
+// Returns the identifier string of this feature.
+string MaxLexTargetGivenSource::GetName() const {
+  return string("MaxLexEgivenF");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/max_lex_target_given_source.h b/extractor/features/max_lex_target_given_source.h
new file mode 100644
index 00000000..c3c87327
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.h
@@ -0,0 +1,34 @@
+#ifndef _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+#define _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+namespace extractor {
+
+class TranslationTable;
+
+namespace features {
+
+/**
+ * Feature summing, over the target words e, -log10 of max p(e | f) taken over
+ * the source words f and the NULL word. Target words whose maximum is not
+ * positive contribute Feature::MAX_SCORE instead.
+ */
+class MaxLexTargetGivenSource : public Feature {
+ public:
+  // `table` supplies the p(e | f) scores; it is kept as a shared_ptr member.
+  MaxLexTargetGivenSource(shared_ptr<TranslationTable> table);
+
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "MaxLexEgivenF".
+  string GetName() const override;
+
+ private:
+  shared_ptr<TranslationTable> table;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/max_lex_target_given_source_test.cc b/extractor/features/max_lex_target_given_source_test.cc
new file mode 100644
index 00000000..6d0efd9c
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source_test.cc
@@ -0,0 +1,78 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+
+#include "data_array.h"
+#include "mocks/mock_translation_table.h"
+#include "mocks/mock_vocabulary.h"
+#include "phrase_builder.h"
+#include "max_lex_target_given_source.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+class MaxLexTargetGivenSourceTest : public Test {  // fixture: mock vocabulary + mock translation table
+ protected:
+ virtual void SetUp() {
+ vector<string> source_words = {"f1", "f2", "f3"};  // served for symbol ids 0..2
+ vector<string> target_words = {"e1", "e2", "e3"};  // served for symbol ids 3..5
+
+ vocabulary = make_shared<MockVocabulary>();
+ for (size_t i = 0; i < source_words.size(); ++i) {
+ EXPECT_CALL(*vocabulary, GetTerminalValue(i))
+ .WillRepeatedly(Return(source_words[i]));
+ }
+ for (size_t i = 0; i < target_words.size(); ++i) {
+ EXPECT_CALL(*vocabulary, GetTerminalValue(i + source_words.size()))
+ .WillRepeatedly(Return(target_words[i]));
+ }
+
+ phrase_builder = make_shared<PhraseBuilder>(vocabulary);
+
+ table = make_shared<MockTranslationTable>();
+ for (size_t i = 0; i < source_words.size(); ++i) {
+ for (size_t j = 0; j < target_words.size(); ++j) {
+ int value = i - j;  // mocked score for (f_i, e_j); zero or negative when i <= j
+ EXPECT_CALL(*table, GetTargetGivenSourceScore(
+ source_words[i], target_words[j])).WillRepeatedly(Return(value));
+ }
+ }
+
+ for (size_t i = 0; i < target_words.size(); ++i) {
+ int value = i * 3;  // mocked score for (NULL word, e_i): 0, 3, 6
+ EXPECT_CALL(*table, GetTargetGivenSourceScore(
+ DataArray::NULL_WORD_STR, target_words[i]))
+ .WillRepeatedly(Return(value));
+ }
+
+ feature = make_shared<MaxLexTargetGivenSource>(table);
+ }
+
+ shared_ptr<MockVocabulary> vocabulary;
+ shared_ptr<PhraseBuilder> phrase_builder;
+ shared_ptr<MockTranslationTable> table;
+ shared_ptr<MaxLexTargetGivenSource> feature;  // feature under test, built on the mock table
+};
+
+// The feature must identify itself as "MaxLexEgivenF".
+TEST_F(MaxLexTargetGivenSourceTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("MaxLexEgivenF", name);
+}
+
+// Expected: e1 peaks at 2 (from f3), e2 and e3 peak on the NULL word with 3
+// and 6, giving -(log10(2) + log10(3) + log10(6)) = -log10(36).
+TEST_F(MaxLexTargetGivenSourceTest, TestScore) {
+  vector<int> source_symbols = {0, 1, 2};
+  Phrase source_phrase = phrase_builder->Build(source_symbols);
+  vector<int> target_symbols = {3, 4, 5};
+  Phrase target_phrase = phrase_builder->Build(target_symbols);
+  FeatureContext context(source_phrase, target_phrase, 0.3, 7, 19);
+  // The score is a sum of log10 terms, so an exact == would hinge on rounding.
+  EXPECT_DOUBLE_EQ(-log10(36), feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/sample_source_count.cc b/extractor/features/sample_source_count.cc
new file mode 100644
index 00000000..b110fc51
--- /dev/null
+++ b/extractor/features/sample_source_count.cc
@@ -0,0 +1,17 @@
+#include "sample_source_count.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Scores the sample size, damped as log10(1 + num_samples).
+double SampleSourceCount::Score(const FeatureContext& context) const {
+  // Adding 1.0 (not 1) performs the sum in double, so it cannot overflow int.
+  return log10(1.0 + context.num_samples);
+}
+
+// Returns the identifier string of this feature.
+string SampleSourceCount::GetName() const {
+  return string("SampleCountF");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/sample_source_count.h b/extractor/features/sample_source_count.h
new file mode 100644
index 00000000..ee6e59a0
--- /dev/null
+++ b/extractor/features/sample_source_count.h
@@ -0,0 +1,23 @@
+#ifndef _SAMPLE_SOURCE_COUNT_H_
+#define _SAMPLE_SOURCE_COUNT_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Feature scoring the number of times the source phrase occurs in the sampled
+ * set, as log10(1 + num_samples).
+ */
+class SampleSourceCount : public Feature {
+ public:
+  // Returns log10(1 + context.num_samples).
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "SampleCountF".
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/sample_source_count_test.cc b/extractor/features/sample_source_count_test.cc
new file mode 100644
index 00000000..63856b9d
--- /dev/null
+++ b/extractor/features/sample_source_count_test.cc
@@ -0,0 +1,40 @@
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <string>
+
+#include "sample_source_count.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+// Fixture creating a fresh SampleSourceCount feature before every test.
+class SampleSourceCountTest : public Test {
+ protected:
+  // `override` makes the compiler verify this really replaces Test::SetUp.
+  void SetUp() override {
+    feature = make_shared<SampleSourceCount>();
+  }
+
+  shared_ptr<SampleSourceCount> feature;
+};
+
+// The feature must identify itself as "SampleCountF".
+TEST_F(SampleSourceCountTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("SampleCountF", name);
+}
+
+// num_samples = 1 gives log10(2); num_samples = 9 gives log10(10) = 1.
+TEST_F(SampleSourceCountTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0, 3, 1);
+  // Scores come out of log10, so compare with a floating-point tolerance.
+  EXPECT_DOUBLE_EQ(log10(2), feature->Score(context));
+
+  context = FeatureContext(phrase, phrase, 3.2, 3, 9);
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/target_given_source_coherent.cc b/extractor/features/target_given_source_coherent.cc
new file mode 100644
index 00000000..c4551d88
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.cc
@@ -0,0 +1,18 @@
+#include "target_given_source_coherent.h"
+
+#include <cmath>
+
+namespace extractor {
+namespace features {
+
+// Returns -log10(pair_count / num_samples), or MAX_SCORE when that ratio is
+// zero or cannot be formed.
+double TargetGivenSourceCoherent::Score(const FeatureContext& context) const {
+  // Guard first: dividing by num_samples == 0 would yield inf and a score of
+  // -inf instead of the intended penalty.
+  if (context.pair_count <= 0 || context.num_samples <= 0) {
+    return MAX_SCORE;
+  }
+  return -log10(static_cast<double>(context.pair_count) / context.num_samples);
+}
+
+// Returns the identifier string of this feature.
+string TargetGivenSourceCoherent::GetName() const {
+  return string("EgivenFCoherent");
+}
+
+} // namespace features
+} // namespace extractor
diff --git a/extractor/features/target_given_source_coherent.h b/extractor/features/target_given_source_coherent.h
new file mode 100644
index 00000000..e66d70a5
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.h
@@ -0,0 +1,23 @@
+#ifndef _TARGET_GIVEN_SOURCE_COHERENT_H_
+#define _TARGET_GIVEN_SOURCE_COHERENT_H_
+
+#include "feature.h"
+
+namespace extractor {
+namespace features {
+
+/**
+ * Feature computing -log10 of the ratio of the phrase pair count over all
+ * source phrase occurrences (sampled).
+ */
+class TargetGivenSourceCoherent : public Feature {
+ public:
+  // Returns -log10(pair_count / num_samples); Feature::MAX_SCORE when the
+  // ratio is not positive.
+  double Score(const FeatureContext& context) const override;
+
+  // Returns the feature name, "EgivenFCoherent".
+  string GetName() const override;
+};
+
+} // namespace features
+} // namespace extractor
+
+#endif
diff --git a/extractor/features/target_given_source_coherent_test.cc b/extractor/features/target_given_source_coherent_test.cc
new file mode 100644
index 00000000..454105e1
--- /dev/null
+++ b/extractor/features/target_given_source_coherent_test.cc
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "target_given_source_coherent.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace features {
+namespace {
+
+// Fixture creating a fresh TargetGivenSourceCoherent feature before every test.
+class TargetGivenSourceCoherentTest : public Test {
+ protected:
+  // `override` makes the compiler verify this really replaces Test::SetUp.
+  void SetUp() override {
+    feature = make_shared<TargetGivenSourceCoherent>();
+  }
+
+  shared_ptr<TargetGivenSourceCoherent> feature;
+};
+
+// The feature must identify itself as "EgivenFCoherent".
+TEST_F(TargetGivenSourceCoherentTest, TestGetName) {
+  const string name = feature->GetName();
+  EXPECT_EQ("EgivenFCoherent", name);
+}
+
+// 2 of 20 samples gives -log10(0.1) = 1; a zero pair count gives MAX_SCORE.
+TEST_F(TargetGivenSourceCoherentTest, TestScore) {
+  Phrase phrase;
+  FeatureContext context(phrase, phrase, 0.3, 2, 20);
+  // 0.1 is not exactly representable, so -log10(0.1) need not be exactly 1.0.
+  EXPECT_DOUBLE_EQ(1.0, feature->Score(context));
+
+  context = FeatureContext(phrase, phrase, 1.9, 0, 1);
+  EXPECT_DOUBLE_EQ(99.0, feature->Score(context));
+}
+
+} // namespace
+} // namespace features
+} // namespace extractor