diff options
Diffstat (limited to 'extractor')
-rw-r--r-- | extractor/data_array.h | 2 | ||||
-rw-r--r-- | extractor/features/is_source_singleton.cc | 2 | ||||
-rw-r--r-- | extractor/run_extractor.cc | 17 | ||||
-rw-r--r-- | extractor/target_phrase_extractor_test.cc | 28 |
4 files changed, 35 insertions, 14 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h index 42e12135..a26bbecf 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -17,7 +17,7 @@ enum Side { TARGET }; -// TODO: This class has features for both the source and target data arrays. +// Note: This class has features for both the source and target data arrays. // Maybe we can save some memory by having more specific implementations (e.g. // sentence_id is only needed for the source data array). class DataArray { diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc index ab54e51a..1abb486f 100644 --- a/extractor/features/is_source_singleton.cc +++ b/extractor/features/is_source_singleton.cc @@ -6,7 +6,7 @@ namespace extractor { namespace features { double IsSourceSingleton::Score(const FeatureContext& context) const { - return context.source_phrase_count == 1; + return fabs(context.source_phrase_count - 1) < 1e-6; } string IsSourceSingleton::GetName() const { diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 0f91236d..ae3a875e 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -60,7 +60,6 @@ int main(int argc, char** argv) { "Minimum number of occurences for a pharse to be considered frequent") ("max_samples", po::value<int>()->default_value(300), "Maximum number of samples") - // TODO(pauldb): Check if this works when set to false. ("tight_phrases", po::value<bool>()->default_value(true), "False if phrases may be loose (better, but slower)"); @@ -144,17 +143,15 @@ int main(int argc, char** argv) { << GetDuration(preprocess_start_time, preprocess_stop_time) << " seconds" << endl; - cerr << "creating grammar extractor" << endl; - Clock::time_point extraction_start_time = Clock::now(); vector<shared_ptr<Feature> > features = { -// make_shared<TargetGivenSourceCoherent>(), -// make_shared<SampleSourceCount>(), -// make_shared<CountSourceTarget>(), -// make_shared<MaxLexSourceGivenTarget>(table), -// make_shared<MaxLexTargetGivenSource>(table), -// make_shared<IsSourceSingleton>(), -// make_shared<IsSourceTargetSingleton>() + make_shared<TargetGivenSourceCoherent>(), + make_shared<SampleSourceCount>(), + make_shared<CountSourceTarget>(), + make_shared<MaxLexSourceGivenTarget>(table), + make_shared<MaxLexTargetGivenSource>(table), + make_shared<IsSourceSingleton>(), + make_shared<IsSourceTargetSingleton>() }; shared_ptr<Scorer> scorer = make_shared<Scorer>(features); diff --git a/extractor/target_phrase_extractor_test.cc b/extractor/target_phrase_extractor_test.cc index a686d20b..80927dee 100644 --- a/extractor/target_phrase_extractor_test.cc +++ b/extractor/target_phrase_extractor_test.cc @@ -111,8 +111,32 @@ TEST_F(TargetPhraseExtractorTest, TestExtractPhrasesTightPhrasesFalse) { target_gaps, target_low, 1, 5, source_indexes, 0); EXPECT_EQ(10, results.size()); - // TODO(pauldb): Finish unit test once it's clear how these alignments should - // look like. + for (int i = 0; i < 2; ++i) { + for (int j = 4; j <= 6; ++j) { + for (int k = 4; k <= j; ++k) { + vector<string> expected_words; + for (int l = i; l < 2; ++l) { + expected_words.push_back(target_words[l]); + } + for (int l = k; l < j; ++l) { + expected_words.push_back(target_words[l]); + } + + PhraseAlignment expected_alignment; + expected_alignment.push_back(make_pair(1, 1 - i)); + + bool found_expected_pair = false; + for (auto result: results) { + if (result.first.GetWords() == expected_words && + result.second == expected_alignment) { + found_expected_pair = true; + } + } + + EXPECT_TRUE(found_expected_pair); + } + } + } } } // namespace |