Merge branch 'master' of https://github.com/redpony/cdec

author: Paul Baltescu <pauldb89@gmail.com> 2013-11-23 17:33:47 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-11-23 17:33:47 +0000
commit: 072c4bb1edde483b87b93bc6f4eec36fc8a21008 (patch)
tree: 6ceaa6ae1e08df9e523282740b14f4857236297c /extractor/run_extractor.cc
parent: 7e90b8ea10904f9b83f4e77e14c7396a3e6f7d5d (diff)
parent: 9e80389b9763aa4f7f626ec71b561ccf6948d3ad (diff)
1 files changed, 11 insertions, 2 deletions
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 8a9ca89d..6eb55073 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -75,7 +75,9 @@ int main(int argc, char** argv) {
     ("max_samples", po::value<int>()->default_value(300),
         "Maximum number of samples")
     ("tight_phrases", po::value<bool>()->default_value(true),
-        "False if phrases may be loose (better, but slower)");
+        "False if phrases may be loose (better, but slower)")
+    ("leave_one_out", po::value<bool>()->zero_tokens(),
+        "do leave-one-out estimation of grammars (e.g. for extracting grammars for the training set)");
 
   po::variables_map vm;
   po::store(po::parse_command_line(argc, argv, desc), vm);
@@ -96,6 +98,11 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  bool leave_one_out = false;
+  if (vm.count("leave_one_out")) {
+    leave_one_out = true;
+  }
+
   int num_threads = vm["threads"].as<int>();
   cerr << "Grammar extraction will use " << num_threads << " threads." << endl;
 
@@ -223,7 +230,9 @@ int main(int argc, char** argv) {
     }
     suffixes[i] = suffix;
 
-    Grammar grammar = extractor.GetGrammar(sentences[i]);
+    unordered_set<int> blacklisted_sentence_ids;
+    if (leave_one_out) blacklisted_sentence_ids.insert(i);
+    Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array);
     ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
     output << grammar;
   }
author	Paul Baltescu <pauldb89@gmail.com>	2013-11-23 17:33:47 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-11-23 17:33:47 +0000
commit	072c4bb1edde483b87b93bc6f4eec36fc8a21008 (patch)
tree	6ceaa6ae1e08df9e523282740b14f4857236297c /extractor/run_extractor.cc
parent	7e90b8ea10904f9b83f4e77e14c7396a3e6f7d5d (diff)
parent	9e80389b9763aa4f7f626ec71b561ccf6948d3ad (diff)