Added unit test for loose phrases.

author: Paul Baltescu <pauldb89@gmail.com> 2013-03-07 14:38:23 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-03-07 14:39:05 +0000
commit: d7271db305bd1aeaf9c3d9ac1043546fec22a402 (patch)
tree: a2f1021577dd1846e5fa583d4859ee6ecb702299 /extractor
parent: 6b5039cd1c167be93ecdbb2fa1721e91fe8f689a (diff)
4 files changed, 35 insertions, 14 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h
index 42e12135..a26bbecf 100644
--- a/extractor/data_array.h
+++ b/extractor/data_array.h
@@ -17,7 +17,7 @@ enum Side {
   TARGET
 };
 
-// TODO: This class has features for both the source and target data arrays.
+// Note: This class has features for both the source and target data arrays.
 // Maybe we can save some memory by having more specific implementations (e.g.
 // sentence_id is only needed for the source data array).
 class DataArray {
diff --git a/extractor/features/is_source_singleton.cc b/extractor/features/is_source_singleton.cc
index ab54e51a..1abb486f 100644
--- a/extractor/features/is_source_singleton.cc
+++ b/extractor/features/is_source_singleton.cc
@@ -6,7 +6,7 @@ namespace extractor {
 namespace features {
 
 double IsSourceSingleton::Score(const FeatureContext& context) const {
-  return context.source_phrase_count == 1;
+  return fabs(context.source_phrase_count - 1) < 1e-6;
 }
 
 string IsSourceSingleton::GetName() const {
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 0f91236d..ae3a875e 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -60,7 +60,6 @@ int main(int argc, char** argv) {
         "Minimum number of occurences for a pharse to be considered frequent")
     ("max_samples", po::value<int>()->default_value(300),
         "Maximum number of samples")
-    // TODO(pauldb): Check if this works when set to false.
     ("tight_phrases", po::value<bool>()->default_value(true),
         "False if phrases may be loose (better, but slower)");
 
@@ -144,17 +143,15 @@ int main(int argc, char** argv) {
        << GetDuration(preprocess_start_time, preprocess_stop_time)
        << " seconds" << endl;
 
-  cerr << "creating grammar extractor" << endl;
-
   Clock::time_point extraction_start_time = Clock::now();
   vector<shared_ptr<Feature> > features = {
-//      make_shared<TargetGivenSourceCoherent>(),
-//      make_shared<SampleSourceCount>(),
-//      make_shared<CountSourceTarget>(),
-//      make_shared<MaxLexSourceGivenTarget>(table),
-//      make_shared<MaxLexTargetGivenSource>(table),
-//      make_shared<IsSourceSingleton>(),
-//      make_shared<IsSourceTargetSingleton>()
+      make_shared<TargetGivenSourceCoherent>(),
+      make_shared<SampleSourceCount>(),
+      make_shared<CountSourceTarget>(),
+      make_shared<MaxLexSourceGivenTarget>(table),
+      make_shared<MaxLexTargetGivenSource>(table),
+      make_shared<IsSourceSingleton>(),
+      make_shared<IsSourceTargetSingleton>()
   };
   shared_ptr<Scorer> scorer = make_shared<Scorer>(features);
 
diff --git a/extractor/target_phrase_extractor_test.cc b/extractor/target_phrase_extractor_test.cc
index a686d20b..80927dee 100644
--- a/extractor/target_phrase_extractor_test.cc
+++ b/extractor/target_phrase_extractor_test.cc
@@ -111,8 +111,32 @@ TEST_F(TargetPhraseExtractorTest, TestExtractPhrasesTightPhrasesFalse) {
       target_gaps, target_low, 1, 5, source_indexes, 0);
   EXPECT_EQ(10, results.size());
 
-  // TODO(pauldb): Finish unit test once it's clear how these alignments should
-  // look like.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 4; j <= 6; ++j) {
+      for (int k = 4; k <= j; ++k) {
+        vector<string> expected_words;
+        for (int l = i; l < 2; ++l) {
+          expected_words.push_back(target_words[l]);
+        }
+        for (int l = k; l < j; ++l) {
+          expected_words.push_back(target_words[l]);
+        }
+
+        PhraseAlignment expected_alignment;
+        expected_alignment.push_back(make_pair(1, 1 - i));
+
+        bool found_expected_pair = false;
+        for (auto result: results) {
+          if (result.first.GetWords() == expected_words &&
+              result.second == expected_alignment) {
+            found_expected_pair = true;
+          }
+        }
+
+        EXPECT_TRUE(found_expected_pair);
+      }
+    }
+  }
 }
 
 } // namespace
author	Paul Baltescu <pauldb89@gmail.com>	2013-03-07 14:38:23 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-03-07 14:39:05 +0000
commit	d7271db305bd1aeaf9c3d9ac1043546fec22a402 (patch)
tree	a2f1021577dd1846e5fa583d4859ee6ecb702299 /extractor
parent	6b5039cd1c167be93ecdbb2fa1721e91fe8f689a (diff)