From b34c347cd7f4f8965e4d943543a31f9a4e886f54 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Thu, 7 Mar 2013 14:38:23 +0000 Subject: Added unit test for loose phrases. --- extractor/data_array.h | 2 +- extractor/features/is_source_singleton.cc | 2 +- extractor/run_extractor.cc | 17 +++++++---------- extractor/target_phrase_extractor_test.cc | 28 ++++++++++++++++++++++++++-- 4 files changed, 35 insertions(+), 14 deletions(-) (limited to 'extractor') diff --git a/extractor/data_array.h b/extractor/data_array.h index 42e12135..a26bbecf 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -17,7 +17,7 @@ enum Side { TARGET }; -// TODO: This class has features for both the source and target data arrays. +// Note: This class has features for both the source and target data arrays. // Maybe we can save some memory by having more specific implementations (e.g. // sentence_id is only needed for the source data array). class DataArray { diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc index ab54e51a..1abb486f 100644 --- a/extractor/features/is_source_singleton.cc +++ b/extractor/features/is_source_singleton.cc @@ -6,7 +6,7 @@ namespace extractor { namespace features { double IsSourceSingleton::Score(const FeatureContext& context) const { - return context.source_phrase_count == 1; + return fabs(context.source_phrase_count - 1) < 1e-6; } string IsSourceSingleton::GetName() const { diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 0f91236d..ae3a875e 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -60,7 +60,6 @@ int main(int argc, char** argv) { "Minimum number of occurences for a pharse to be considered frequent") ("max_samples", po::value()->default_value(300), "Maximum number of samples") - // TODO(pauldb): Check if this works when set to false. ("tight_phrases", po::value()->default_value(true), "False if phrases may be loose (better, but slower)"); @@ -144,17 +143,15 @@ int main(int argc, char** argv) { << GetDuration(preprocess_start_time, preprocess_stop_time) << " seconds" << endl; - cerr << "creating grammar extractor" << endl; - Clock::time_point extraction_start_time = Clock::now(); vector > features = { -// make_shared(), -// make_shared(), -// make_shared(), -// make_shared(table), -// make_shared(table), -// make_shared(), -// make_shared() + make_shared(), + make_shared(), + make_shared(), + make_shared(table), + make_shared(table), + make_shared(), + make_shared() }; shared_ptr scorer = make_shared(features); diff --git a/extractor/target_phrase_extractor_test.cc b/extractor/target_phrase_extractor_test.cc index a686d20b..80927dee 100644 --- a/extractor/target_phrase_extractor_test.cc +++ b/extractor/target_phrase_extractor_test.cc @@ -111,8 +111,32 @@ TEST_F(TargetPhraseExtractorTest, TestExtractPhrasesTightPhrasesFalse) { target_gaps, target_low, 1, 5, source_indexes, 0); EXPECT_EQ(10, results.size()); - // TODO(pauldb): Finish unit test once it's clear how these alignments should - // look like. + for (int i = 0; i < 2; ++i) { + for (int j = 4; j <= 6; ++j) { + for (int k = 4; k <= j; ++k) { + vector expected_words; + for (int l = i; l < 2; ++l) { + expected_words.push_back(target_words[l]); + } + for (int l = k; l < j; ++l) { + expected_words.push_back(target_words[l]); + } + + PhraseAlignment expected_alignment; + expected_alignment.push_back(make_pair(1, 1 - i)); + + bool found_expected_pair = false; + for (auto result: results) { + if (result.first.GetWords() == expected_words && + result.second == expected_alignment) { + found_expected_pair = true; + } + } + + EXPECT_TRUE(found_expected_pair); + } + } + } } } // namespace -- cgit v1.2.3