diff options
author | Paul Baltescu <pauldb89@gmail.com> | 2013-02-01 16:11:10 +0000 |
---|---|---|
committer | Paul Baltescu <pauldb89@gmail.com> | 2013-02-01 16:11:10 +0000 |
commit | 0a53f7eca74c165b5ce1c238f1999ddf1febea55 (patch) | |
tree | 5a5231767bc2f92203711ab4aee75336b8bc2175 /extractor/features | |
parent | 5530575ae0ad939e17f08d6bd49978acea388ab7 (diff) |
Second working commit.
Diffstat (limited to 'extractor/features')
-rw-r--r-- | extractor/features/count_source_target.cc | 11 | ||||
-rw-r--r-- | extractor/features/count_source_target.h | 13 | ||||
-rw-r--r-- | extractor/features/feature.cc | 3 | ||||
-rw-r--r-- | extractor/features/feature.h | 32 | ||||
-rw-r--r-- | extractor/features/is_source_singleton.cc | 11 | ||||
-rw-r--r-- | extractor/features/is_source_singleton.h | 13 | ||||
-rw-r--r-- | extractor/features/is_source_target_singleton.cc | 11 | ||||
-rw-r--r-- | extractor/features/is_source_target_singleton.h | 13 | ||||
-rw-r--r-- | extractor/features/max_lex_source_given_target.cc | 30 | ||||
-rw-r--r-- | extractor/features/max_lex_source_given_target.h | 24 | ||||
-rw-r--r-- | extractor/features/max_lex_target_given_source.cc | 30 | ||||
-rw-r--r-- | extractor/features/max_lex_target_given_source.h | 24 | ||||
-rw-r--r-- | extractor/features/sample_source_count.cc | 11 | ||||
-rw-r--r-- | extractor/features/sample_source_count.h | 13 | ||||
-rw-r--r-- | extractor/features/target_given_source_coherent.cc | 12 | ||||
-rw-r--r-- | extractor/features/target_given_source_coherent.h | 13 |
16 files changed, 264 insertions, 0 deletions
diff --git a/extractor/features/count_source_target.cc b/extractor/features/count_source_target.cc new file mode 100644 index 00000000..9441b451 --- /dev/null +++ b/extractor/features/count_source_target.cc @@ -0,0 +1,11 @@ +#include "count_source_target.h" + +#include <cmath> + +double CountSourceTarget::Score(const FeatureContext& context) const { + return log10(1 + context.pair_count); +} + +string CountSourceTarget::GetName() const { + return "CountEF"; +} diff --git a/extractor/features/count_source_target.h b/extractor/features/count_source_target.h new file mode 100644 index 00000000..a2481944 --- /dev/null +++ b/extractor/features/count_source_target.h @@ -0,0 +1,13 @@ +#ifndef _COUNT_SOURCE_TARGET_H_ +#define _COUNT_SOURCE_TARGET_H_ + +#include "feature.h" + +class CountSourceTarget : public Feature { + public: + double Score(const FeatureContext& context) const; + + string GetName() const; +}; + +#endif diff --git a/extractor/features/feature.cc b/extractor/features/feature.cc new file mode 100644 index 00000000..7381c35a --- /dev/null +++ b/extractor/features/feature.cc @@ -0,0 +1,3 @@ +#include "feature.h" + +const double Feature::MAX_SCORE = 99.0; diff --git a/extractor/features/feature.h b/extractor/features/feature.h new file mode 100644 index 00000000..ad22d3e7 --- /dev/null +++ b/extractor/features/feature.h @@ -0,0 +1,32 @@ +#ifndef _FEATURE_H_ +#define _FEATURE_H_ + +#include <string> + +//TODO(pauldb): include headers nicely. +#include "../phrase.h" + +using namespace std; + +struct FeatureContext { + FeatureContext(const Phrase& source_phrase, const Phrase& target_phrase, + double sample_source_count, int pair_count) : + source_phrase(source_phrase), target_phrase(target_phrase), + sample_source_count(sample_source_count), pair_count(pair_count) {} + + Phrase source_phrase; + Phrase target_phrase; + double sample_source_count; + int pair_count; +}; + +class Feature { + public: + virtual double Score(const FeatureContext& context) const = 0; + + virtual string GetName() const = 0; + + static const double MAX_SCORE; +}; + +#endif diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc new file mode 100644 index 00000000..754df3bf --- /dev/null +++ b/extractor/features/is_source_singleton.cc @@ -0,0 +1,11 @@ +#include "is_source_singleton.h" + +#include <cmath> + +double IsSourceSingleton::Score(const FeatureContext& context) const { + return context.sample_source_count == 1; +} + +string IsSourceSingleton::GetName() const { + return "IsSingletonF"; +} diff --git a/extractor/features/is_source_singleton.h b/extractor/features/is_source_singleton.h new file mode 100644 index 00000000..7cc72828 --- /dev/null +++ b/extractor/features/is_source_singleton.h @@ -0,0 +1,13 @@ +#ifndef _IS_SOURCE_SINGLETON_H_ +#define _IS_SOURCE_SINGLETON_H_ + +#include "feature.h" + +class IsSourceSingleton : public Feature { + public: + double Score(const FeatureContext& context) const; + + string GetName() const; +}; + +#endif diff --git a/extractor/features/is_source_target_singleton.cc b/extractor/features/is_source_target_singleton.cc new file mode 100644 index 00000000..ec816509 --- /dev/null +++ b/extractor/features/is_source_target_singleton.cc @@ -0,0 +1,11 @@ +#include "is_source_target_singleton.h" + +#include <cmath> + +double IsSourceTargetSingleton::Score(const FeatureContext& context) const { + return context.pair_count == 1; +} + +string IsSourceTargetSingleton::GetName() const { + return "IsSingletonEF"; +} diff --git a/extractor/features/is_source_target_singleton.h b/extractor/features/is_source_target_singleton.h new file mode 100644 index 00000000..58913b74 --- /dev/null +++ b/extractor/features/is_source_target_singleton.h @@ -0,0 +1,13 @@ +#ifndef _IS_SOURCE_TARGET_SINGLETON_H_ +#define _IS_SOURCE_TARGET_SINGLETON_H_ + +#include "feature.h" + +class IsSourceTargetSingleton : public Feature { + public: + double Score(const FeatureContext& context) const; + + string GetName() const; +}; + +#endif diff --git a/extractor/features/max_lex_source_given_target.cc b/extractor/features/max_lex_source_given_target.cc new file mode 100644 index 00000000..c4792d49 --- /dev/null +++ b/extractor/features/max_lex_source_given_target.cc @@ -0,0 +1,30 @@ +#include "max_lex_source_given_target.h" + +#include <cmath> + +#include "../translation_table.h" + +MaxLexSourceGivenTarget::MaxLexSourceGivenTarget( + shared_ptr<TranslationTable> table) : + table(table) {} + +double MaxLexSourceGivenTarget::Score(const FeatureContext& context) const { + vector<string> source_words = context.source_phrase.GetWords(); + // TODO(pauldb): Add NULL to target_words, after fixing translation table. + vector<string> target_words = context.target_phrase.GetWords(); + + double score = 0; + for (string source_word: source_words) { + double max_score = 0; + for (string target_word: target_words) { + max_score = max(max_score, + table->GetSourceGivenTargetScore(source_word, target_word)); + } + score += max_score > 0 ? -log10(max_score) : MAX_SCORE; + } + return score; +} + +string MaxLexSourceGivenTarget::GetName() const { + return "MaxLexFGivenE"; +} diff --git a/extractor/features/max_lex_source_given_target.h b/extractor/features/max_lex_source_given_target.h new file mode 100644 index 00000000..e87c1c8e --- /dev/null +++ b/extractor/features/max_lex_source_given_target.h @@ -0,0 +1,24 @@ +#ifndef _MAX_LEX_SOURCE_GIVEN_TARGET_H_ +#define _MAX_LEX_SOURCE_GIVEN_TARGET_H_ + +#include <memory> + +#include "feature.h" + +using namespace std; + +class TranslationTable; + +class MaxLexSourceGivenTarget : public Feature { + public: + MaxLexSourceGivenTarget(shared_ptr<TranslationTable> table); + + double Score(const FeatureContext& context) const; + + string GetName() const; + + private: + shared_ptr<TranslationTable> table; +}; + +#endif diff --git a/extractor/features/max_lex_target_given_source.cc b/extractor/features/max_lex_target_given_source.cc new file mode 100644 index 00000000..d82182fe --- /dev/null +++ b/extractor/features/max_lex_target_given_source.cc @@ -0,0 +1,30 @@ +#include "max_lex_target_given_source.h" + +#include <cmath> + +#include "../translation_table.h" + +MaxLexTargetGivenSource::MaxLexTargetGivenSource( + shared_ptr<TranslationTable> table) : + table(table) {} + +double MaxLexTargetGivenSource::Score(const FeatureContext& context) const { + // TODO(pauldb): Add NULL to source_words, after fixing translation table. + vector<string> source_words = context.source_phrase.GetWords(); + vector<string> target_words = context.target_phrase.GetWords(); + + double score = 0; + for (string target_word: target_words) { + double max_score = 0; + for (string source_word: source_words) { + max_score = max(max_score, + table->GetTargetGivenSourceScore(source_word, target_word)); + } + score += max_score > 0 ? -log10(max_score) : MAX_SCORE; + } + return score; +} + +string MaxLexTargetGivenSource::GetName() const { + return "MaxLexEGivenF"; +} diff --git a/extractor/features/max_lex_target_given_source.h b/extractor/features/max_lex_target_given_source.h new file mode 100644 index 00000000..9585ff04 --- /dev/null +++ b/extractor/features/max_lex_target_given_source.h @@ -0,0 +1,24 @@ +#ifndef _MAX_LEX_TARGET_GIVEN_SOURCE_H_ +#define _MAX_LEX_TARGET_GIVEN_SOURCE_H_ + +#include <memory> + +#include "feature.h" + +using namespace std; + +class TranslationTable; + +class MaxLexTargetGivenSource : public Feature { + public: + MaxLexTargetGivenSource(shared_ptr<TranslationTable> table); + + double Score(const FeatureContext& context) const; + + string GetName() const; + + private: + shared_ptr<TranslationTable> table; +}; + +#endif diff --git a/extractor/features/sample_source_count.cc b/extractor/features/sample_source_count.cc new file mode 100644 index 00000000..c8124cfb --- /dev/null +++ b/extractor/features/sample_source_count.cc @@ -0,0 +1,11 @@ +#include "sample_source_count.h" + +#include <cmath> + +double SampleSourceCount::Score(const FeatureContext& context) const { + return log10(1 + context.sample_source_count); +} + +string SampleSourceCount::GetName() const { + return "SampleCountF"; +} diff --git a/extractor/features/sample_source_count.h b/extractor/features/sample_source_count.h new file mode 100644 index 00000000..62d236c8 --- /dev/null +++ b/extractor/features/sample_source_count.h @@ -0,0 +1,13 @@ +#ifndef _SAMPLE_SOURCE_COUNT_H_ +#define _SAMPLE_SOURCE_COUNT_H_ + +#include "feature.h" + +class SampleSourceCount : public Feature { + public: + double Score(const FeatureContext& context) const; + + string GetName() const; +}; + +#endif diff --git a/extractor/features/target_given_source_coherent.cc b/extractor/features/target_given_source_coherent.cc new file mode 100644 index 00000000..748413c3 --- /dev/null +++ b/extractor/features/target_given_source_coherent.cc @@ -0,0 +1,12 @@ +#include "target_given_source_coherent.h" + +#include <cmath> + +double TargetGivenSourceCoherent::Score(const FeatureContext& context) const { + double prob = context.pair_count / context.sample_source_count; + return prob > 0 ? -log10(prob) : MAX_SCORE; +} + +string TargetGivenSourceCoherent::GetName() const { + return "EGivenFCoherent"; +} diff --git a/extractor/features/target_given_source_coherent.h b/extractor/features/target_given_source_coherent.h new file mode 100644 index 00000000..09c8edb1 --- /dev/null +++ b/extractor/features/target_given_source_coherent.h @@ -0,0 +1,13 @@ +#ifndef _TARGET_GIVEN_SOURCE_COHERENT_H_ +#define _TARGET_GIVEN_SOURCE_COHERENT_H_ + +#include "feature.h" + +class TargetGivenSourceCoherent : public Feature { + public: + double Score(const FeatureContext& context) const; + + string GetName() const; +}; + +#endif |