From 252fb164c208ec8f3005f8a652eb3b48c0644e3d Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Fri, 1 Feb 2013 16:11:10 +0000 Subject: Second working commit. --- extractor/run_extractor.cc | 70 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 9 deletions(-) (limited to 'extractor/run_extractor.cc') diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 4f841864..37a9cba0 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -1,16 +1,31 @@ +#include #include #include +#include +#include #include #include #include "alignment.h" #include "data_array.h" +#include "features/count_source_target.h" +#include "features/feature.h" +#include "features/is_source_singleton.h" +#include "features/is_source_target_singleton.h" +#include "features/max_lex_source_given_target.h" +#include "features/max_lex_target_given_source.h" +#include "features/sample_source_count.h" +#include "features/target_given_source_coherent.h" +#include "grammar.h" #include "grammar_extractor.h" #include "precomputation.h" +#include "rule.h" +#include "scorer.h" #include "suffix_array.h" #include "translation_table.h" +namespace fs = boost::filesystem; namespace po = boost::program_options; using namespace std; @@ -23,21 +38,26 @@ int main(int argc, char** argv) { ("target,e", po::value(), "Target language corpus") ("bitext,b", po::value(), "Parallel text (source ||| target)") ("alignment,a", po::value()->required(), "Bitext word alignment") + ("grammars,g", po::value()->required(), "Grammars output path") ("frequent", po::value()->default_value(100), "Number of precomputed frequent patterns") ("super_frequent", po::value()->default_value(10), "Number of precomputed super frequent patterns") - ("max_rule_span,s", po::value()->default_value(15), + ("max_rule_span", po::value()->default_value(15), "Maximum rule span") ("max_rule_symbols,l", po::value()->default_value(5), "Maximum number of symbols (terminals + nontermals) in a rule") - ("min_gap_size,g", po::value()->default_value(1), "Minimum gap size") - ("max_phrase_len,p", po::value()->default_value(4), + ("min_gap_size", po::value()->default_value(1), "Minimum gap size") + ("max_phrase_len", po::value()->default_value(4), "Maximum frequent phrase length") ("max_nonterminals", po::value()->default_value(2), "Maximum number of nonterminals in a rule") ("min_frequency", po::value()->default_value(1000), "Minimum number of occurences for a pharse to be considered frequent") + ("max_samples", po::value()->default_value(300), + "Maximum number of samples") + ("tight_phrases", po::value()->default_value(true), + "False if phrases may be loose (better, but slower)") ("baeza_yates", po::value()->default_value(true), "Use double binary search"); @@ -74,9 +94,10 @@ int main(int argc, char** argv) { make_shared(source_data_array); - Alignment alignment(vm["alignment"].as()); + shared_ptr alignment = + make_shared(vm["alignment"].as()); - Precomputation precomputation( + shared_ptr precomputation = make_shared( source_suffix_array, vm["frequent"].as(), vm["super_frequent"].as(), @@ -86,7 +107,19 @@ int main(int argc, char** argv) { vm["max_phrase_len"].as(), vm["min_frequency"].as()); - TranslationTable table(source_data_array, target_data_array, alignment); + shared_ptr table = make_shared( + source_data_array, target_data_array, alignment); + + vector > features = { + make_shared(), + make_shared(), + make_shared(), + make_shared(table), + make_shared(table), + make_shared(), + make_shared() + }; + shared_ptr scorer = make_shared(features); // TODO(pauldb): Add parallelization. GrammarExtractor extractor( @@ -94,15 +127,34 @@ int main(int argc, char** argv) { target_data_array, alignment, precomputation, + scorer, vm["min_gap_size"].as(), vm["max_rule_span"].as(), vm["max_nonterminals"].as(), vm["max_rule_symbols"].as(), - vm["baeza_yates"].as()); + vm["max_samples"].as(), + vm["baeza_yates"].as(), + vm["tight_phrases"].as()); - string sentence; + int grammar_id = 0; + fs::path grammar_path = vm["grammars"].as(); + string sentence, delimiter = "|||"; while (getline(cin, sentence)) { - extractor.GetGrammar(sentence); + string suffix = ""; + int position = sentence.find(delimiter); + if (position != sentence.npos) { + suffix = sentence.substr(position); + sentence = sentence.substr(0, position); + } + + Grammar grammar = extractor.GetGrammar(sentence); + fs::path grammar_file = grammar_path / to_string(grammar_id); + ofstream output(grammar_file.c_str()); + output << grammar; + + cout << " " << sentence << " " << suffix << endl; + ++grammar_id; } return 0; -- cgit v1.2.3