summaryrefslogtreecommitdiff
path: root/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'extractor')
-rw-r--r--extractor/data_array.h2
-rw-r--r--extractor/features/is_source_singleton.cc2
-rw-r--r--extractor/run_extractor.cc17
-rw-r--r--extractor/target_phrase_extractor_test.cc28
4 files changed, 35 insertions, 14 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h
index 42e12135..a26bbecf 100644
--- a/extractor/data_array.h
+++ b/extractor/data_array.h
@@ -17,7 +17,7 @@ enum Side {
TARGET
};
-// TODO: This class has features for both the source and target data arrays.
+// Note: This class has features for both the source and target data arrays.
// Maybe we can save some memory by having more specific implementations (e.g.
// sentence_id is only needed for the source data array).
class DataArray {
diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc
index ab54e51a..1abb486f 100644
--- a/extractor/features/is_source_singleton.cc
+++ b/extractor/features/is_source_singleton.cc
@@ -6,7 +6,7 @@ namespace extractor {
namespace features {
double IsSourceSingleton::Score(const FeatureContext& context) const {
- return context.source_phrase_count == 1;
+ return fabs(context.source_phrase_count - 1) < 1e-6;
}
string IsSourceSingleton::GetName() const {
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 0f91236d..ae3a875e 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -60,7 +60,6 @@ int main(int argc, char** argv) {
"Minimum number of occurrences for a phrase to be considered frequent")
("max_samples", po::value<int>()->default_value(300),
"Maximum number of samples")
- // TODO(pauldb): Check if this works when set to false.
("tight_phrases", po::value<bool>()->default_value(true),
"False if phrases may be loose (better, but slower)");
@@ -144,17 +143,15 @@ int main(int argc, char** argv) {
<< GetDuration(preprocess_start_time, preprocess_stop_time)
<< " seconds" << endl;
- cerr << "creating grammar extractor" << endl;
-
Clock::time_point extraction_start_time = Clock::now();
vector<shared_ptr<Feature> > features = {
-// make_shared<TargetGivenSourceCoherent>(),
-// make_shared<SampleSourceCount>(),
-// make_shared<CountSourceTarget>(),
-// make_shared<MaxLexSourceGivenTarget>(table),
-// make_shared<MaxLexTargetGivenSource>(table),
-// make_shared<IsSourceSingleton>(),
-// make_shared<IsSourceTargetSingleton>()
+ make_shared<TargetGivenSourceCoherent>(),
+ make_shared<SampleSourceCount>(),
+ make_shared<CountSourceTarget>(),
+ make_shared<MaxLexSourceGivenTarget>(table),
+ make_shared<MaxLexTargetGivenSource>(table),
+ make_shared<IsSourceSingleton>(),
+ make_shared<IsSourceTargetSingleton>()
};
shared_ptr<Scorer> scorer = make_shared<Scorer>(features);
diff --git a/extractor/target_phrase_extractor_test.cc b/extractor/target_phrase_extractor_test.cc
index a686d20b..80927dee 100644
--- a/extractor/target_phrase_extractor_test.cc
+++ b/extractor/target_phrase_extractor_test.cc
@@ -111,8 +111,32 @@ TEST_F(TargetPhraseExtractorTest, TestExtractPhrasesTightPhrasesFalse) {
target_gaps, target_low, 1, 5, source_indexes, 0);
EXPECT_EQ(10, results.size());
- // TODO(pauldb): Finish unit test once it's clear what these alignments
- // should look like.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 4; j <= 6; ++j) {
+ for (int k = 4; k <= j; ++k) {
+ vector<string> expected_words;
+ for (int l = i; l < 2; ++l) {
+ expected_words.push_back(target_words[l]);
+ }
+ for (int l = k; l < j; ++l) {
+ expected_words.push_back(target_words[l]);
+ }
+
+ PhraseAlignment expected_alignment;
+ expected_alignment.push_back(make_pair(1, 1 - i));
+
+ bool found_expected_pair = false;
+ for (auto result: results) {
+ if (result.first.GetWords() == expected_words &&
+ result.second == expected_alignment) {
+ found_expected_pair = true;
+ }
+ }
+
+ EXPECT_TRUE(found_expected_pair);
+ }
+ }
+ }
}
} // namespace