summaryrefslogtreecommitdiff
path: root/extractor/features
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-02-01 16:11:10 +0000
committerPaul Baltescu <pauldb89@gmail.com>2013-02-01 16:11:10 +0000
commit0a53f7eca74c165b5ce1c238f1999ddf1febea55 (patch)
tree5a5231767bc2f92203711ab4aee75336b8bc2175 /extractor/features
parent5530575ae0ad939e17f08d6bd49978acea388ab7 (diff)
Second working commit.
Diffstat (limited to 'extractor/features')
-rw-r--r--extractor/features/count_source_target.cc11
-rw-r--r--extractor/features/count_source_target.h13
-rw-r--r--extractor/features/feature.cc3
-rw-r--r--extractor/features/feature.h32
-rw-r--r--extractor/features/is_source_singleton.cc11
-rw-r--r--extractor/features/is_source_singleton.h13
-rw-r--r--extractor/features/is_source_target_singleton.cc11
-rw-r--r--extractor/features/is_source_target_singleton.h13
-rw-r--r--extractor/features/max_lex_source_given_target.cc30
-rw-r--r--extractor/features/max_lex_source_given_target.h24
-rw-r--r--extractor/features/max_lex_target_given_source.cc30
-rw-r--r--extractor/features/max_lex_target_given_source.h24
-rw-r--r--extractor/features/sample_source_count.cc11
-rw-r--r--extractor/features/sample_source_count.h13
-rw-r--r--extractor/features/target_given_source_coherent.cc12
-rw-r--r--extractor/features/target_given_source_coherent.h13
16 files changed, 264 insertions, 0 deletions
diff --git a/extractor/features/count_source_target.cc b/extractor/features/count_source_target.cc
new file mode 100644
index 00000000..9441b451
--- /dev/null
+++ b/extractor/features/count_source_target.cc
@@ -0,0 +1,11 @@
+#include "count_source_target.h"
+
+#include <cmath>
+
+double CountSourceTarget::Score(const FeatureContext& context) const {
+ return log10(1 + context.pair_count);
+}
+
+string CountSourceTarget::GetName() const {
+ return "CountEF";
+}
diff --git a/extractor/features/count_source_target.h b/extractor/features/count_source_target.h
new file mode 100644
index 00000000..a2481944
--- /dev/null
+++ b/extractor/features/count_source_target.h
@@ -0,0 +1,13 @@
+#ifndef _COUNT_SOURCE_TARGET_H_
+#define _COUNT_SOURCE_TARGET_H_
+
+#include "feature.h"
+
+class CountSourceTarget : public Feature {
+ public:
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+};
+
+#endif
diff --git a/extractor/features/feature.cc b/extractor/features/feature.cc
new file mode 100644
index 00000000..7381c35a
--- /dev/null
+++ b/extractor/features/feature.cc
@@ -0,0 +1,3 @@
+#include "feature.h"
+
+const double Feature::MAX_SCORE = 99.0;
diff --git a/extractor/features/feature.h b/extractor/features/feature.h
new file mode 100644
index 00000000..ad22d3e7
--- /dev/null
+++ b/extractor/features/feature.h
@@ -0,0 +1,32 @@
+#ifndef _FEATURE_H_
+#define _FEATURE_H_
+
+#include <string>
+
+//TODO(pauldb): include headers nicely.
+#include "../phrase.h"
+
+using namespace std;
+
+struct FeatureContext {
+ FeatureContext(const Phrase& source_phrase, const Phrase& target_phrase,
+ double sample_source_count, int pair_count) :
+ source_phrase(source_phrase), target_phrase(target_phrase),
+ sample_source_count(sample_source_count), pair_count(pair_count) {}
+
+ Phrase source_phrase;
+ Phrase target_phrase;
+ double sample_source_count;
+ int pair_count;
+};
+
+class Feature {
+ public:
+ virtual double Score(const FeatureContext& context) const = 0;
+
+ virtual string GetName() const = 0;
+
+ static const double MAX_SCORE;
+};
+
+#endif
diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc
new file mode 100644
index 00000000..754df3bf
--- /dev/null
+++ b/extractor/features/is_source_singleton.cc
@@ -0,0 +1,11 @@
+#include "is_source_singleton.h"
+
+#include <cmath>
+
+double IsSourceSingleton::Score(const FeatureContext& context) const {
+ return context.sample_source_count == 1;
+}
+
+string IsSourceSingleton::GetName() const {
+ return "IsSingletonF";
+}
diff --git a/extractor/features/is_source_singleton.h b/extractor/features/is_source_singleton.h
new file mode 100644
index 00000000..7cc72828
--- /dev/null
+++ b/extractor/features/is_source_singleton.h
@@ -0,0 +1,13 @@
+#ifndef _IS_SOURCE_SINGLETON_H_
+#define _IS_SOURCE_SINGLETON_H_
+
+#include "feature.h"
+
+class IsSourceSingleton : public Feature {
+ public:
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+};
+
+#endif
diff --git a/extractor/features/is_source_target_singleton.cc b/extractor/features/is_source_target_singleton.cc
new file mode 100644
index 00000000..ec816509
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.cc
@@ -0,0 +1,11 @@
+#include "is_source_target_singleton.h"
+
+#include <cmath>
+
+double IsSourceTargetSingleton::Score(const FeatureContext& context) const {
+ return context.pair_count == 1;
+}
+
+string IsSourceTargetSingleton::GetName() const {
+ return "IsSingletonEF";
+}
diff --git a/extractor/features/is_source_target_singleton.h b/extractor/features/is_source_target_singleton.h
new file mode 100644
index 00000000..58913b74
--- /dev/null
+++ b/extractor/features/is_source_target_singleton.h
@@ -0,0 +1,13 @@
+#ifndef _IS_SOURCE_TARGET_SINGLETON_H_
+#define _IS_SOURCE_TARGET_SINGLETON_H_
+
+#include "feature.h"
+
+class IsSourceTargetSingleton : public Feature {
+ public:
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+};
+
+#endif
diff --git a/extractor/features/max_lex_source_given_target.cc b/extractor/features/max_lex_source_given_target.cc
new file mode 100644
index 00000000..c4792d49
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.cc
@@ -0,0 +1,30 @@
+#include "max_lex_source_given_target.h"
+
+#include <cmath>
+
+#include "../translation_table.h"
+
+MaxLexSourceGivenTarget::MaxLexSourceGivenTarget(
+ shared_ptr<TranslationTable> table) :
+ table(table) {}
+
+double MaxLexSourceGivenTarget::Score(const FeatureContext& context) const {
+ vector<string> source_words = context.source_phrase.GetWords();
+ // TODO(pauldb): Add NULL to target_words, after fixing translation table.
+ vector<string> target_words = context.target_phrase.GetWords();
+
+ double score = 0;
+ for (string source_word: source_words) {
+ double max_score = 0;
+ for (string target_word: target_words) {
+ max_score = max(max_score,
+ table->GetSourceGivenTargetScore(source_word, target_word));
+ }
+ score += max_score > 0 ? -log10(max_score) : MAX_SCORE;
+ }
+ return score;
+}
+
+string MaxLexSourceGivenTarget::GetName() const {
+ return "MaxLexFGivenE";
+}
diff --git a/extractor/features/max_lex_source_given_target.h b/extractor/features/max_lex_source_given_target.h
new file mode 100644
index 00000000..e87c1c8e
--- /dev/null
+++ b/extractor/features/max_lex_source_given_target.h
@@ -0,0 +1,24 @@
+#ifndef _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+#define _MAX_LEX_SOURCE_GIVEN_TARGET_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+class TranslationTable;
+
+class MaxLexSourceGivenTarget : public Feature {
+ public:
+ MaxLexSourceGivenTarget(shared_ptr<TranslationTable> table);
+
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+
+ private:
+ shared_ptr<TranslationTable> table;
+};
+
+#endif
diff --git a/extractor/features/max_lex_target_given_source.cc b/extractor/features/max_lex_target_given_source.cc
new file mode 100644
index 00000000..d82182fe
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.cc
@@ -0,0 +1,30 @@
+#include "max_lex_target_given_source.h"
+
+#include <cmath>
+
+#include "../translation_table.h"
+
+MaxLexTargetGivenSource::MaxLexTargetGivenSource(
+ shared_ptr<TranslationTable> table) :
+ table(table) {}
+
+double MaxLexTargetGivenSource::Score(const FeatureContext& context) const {
+ // TODO(pauldb): Add NULL to source_words, after fixing translation table.
+ vector<string> source_words = context.source_phrase.GetWords();
+ vector<string> target_words = context.target_phrase.GetWords();
+
+ double score = 0;
+ for (string target_word: target_words) {
+ double max_score = 0;
+ for (string source_word: source_words) {
+ max_score = max(max_score,
+ table->GetTargetGivenSourceScore(source_word, target_word));
+ }
+ score += max_score > 0 ? -log10(max_score) : MAX_SCORE;
+ }
+ return score;
+}
+
+string MaxLexTargetGivenSource::GetName() const {
+ return "MaxLexEGivenF";
+}
diff --git a/extractor/features/max_lex_target_given_source.h b/extractor/features/max_lex_target_given_source.h
new file mode 100644
index 00000000..9585ff04
--- /dev/null
+++ b/extractor/features/max_lex_target_given_source.h
@@ -0,0 +1,24 @@
+#ifndef _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+#define _MAX_LEX_TARGET_GIVEN_SOURCE_H_
+
+#include <memory>
+
+#include "feature.h"
+
+using namespace std;
+
+class TranslationTable;
+
+class MaxLexTargetGivenSource : public Feature {
+ public:
+ MaxLexTargetGivenSource(shared_ptr<TranslationTable> table);
+
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+
+ private:
+ shared_ptr<TranslationTable> table;
+};
+
+#endif
diff --git a/extractor/features/sample_source_count.cc b/extractor/features/sample_source_count.cc
new file mode 100644
index 00000000..c8124cfb
--- /dev/null
+++ b/extractor/features/sample_source_count.cc
@@ -0,0 +1,11 @@
+#include "sample_source_count.h"
+
+#include <cmath>
+
+double SampleSourceCount::Score(const FeatureContext& context) const {
+ return log10(1 + context.sample_source_count);
+}
+
+string SampleSourceCount::GetName() const {
+ return "SampleCountF";
+}
diff --git a/extractor/features/sample_source_count.h b/extractor/features/sample_source_count.h
new file mode 100644
index 00000000..62d236c8
--- /dev/null
+++ b/extractor/features/sample_source_count.h
@@ -0,0 +1,13 @@
+#ifndef _SAMPLE_SOURCE_COUNT_H_
+#define _SAMPLE_SOURCE_COUNT_H_
+
+#include "feature.h"
+
+class SampleSourceCount : public Feature {
+ public:
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+};
+
+#endif
diff --git a/extractor/features/target_given_source_coherent.cc b/extractor/features/target_given_source_coherent.cc
new file mode 100644
index 00000000..748413c3
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.cc
@@ -0,0 +1,12 @@
+#include "target_given_source_coherent.h"
+
+#include <cmath>
+
+double TargetGivenSourceCoherent::Score(const FeatureContext& context) const {
+ double prob = context.pair_count / context.sample_source_count;
+ return prob > 0 ? -log10(prob) : MAX_SCORE;
+}
+
+string TargetGivenSourceCoherent::GetName() const {
+ return "EGivenFCoherent";
+}
diff --git a/extractor/features/target_given_source_coherent.h b/extractor/features/target_given_source_coherent.h
new file mode 100644
index 00000000..09c8edb1
--- /dev/null
+++ b/extractor/features/target_given_source_coherent.h
@@ -0,0 +1,13 @@
+#ifndef _TARGET_GIVEN_SOURCE_COHERENT_H_
+#define _TARGET_GIVEN_SOURCE_COHERENT_H_
+
+#include "feature.h"
+
+class TargetGivenSourceCoherent : public Feature {
+ public:
+ double Score(const FeatureContext& context) const;
+
+ string GetName() const;
+};
+
+#endif