summary refs log tree commit diff
path: root/extractor/run_extractor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/run_extractor.cc')
-rw-r--r-- extractor/run_extractor.cc 70
1 files changed, 61 insertions, 9 deletions
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 4f841864..37a9cba0 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -1,16 +1,31 @@
+#include <fstream>
#include <iostream>
#include <string>
+#include <vector>
+#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
#include "alignment.h"
#include "data_array.h"
+#include "features/count_source_target.h"
+#include "features/feature.h"
+#include "features/is_source_singleton.h"
+#include "features/is_source_target_singleton.h"
+#include "features/max_lex_source_given_target.h"
+#include "features/max_lex_target_given_source.h"
+#include "features/sample_source_count.h"
+#include "features/target_given_source_coherent.h"
+#include "grammar.h"
#include "grammar_extractor.h"
#include "precomputation.h"
+#include "rule.h"
+#include "scorer.h"
#include "suffix_array.h"
#include "translation_table.h"
+namespace fs = boost::filesystem;
namespace po = boost::program_options;
using namespace std;
@@ -23,21 +38,26 @@ int main(int argc, char** argv) {
("target,e", po::value<string>(), "Target language corpus")
("bitext,b", po::value<string>(), "Parallel text (source ||| target)")
("alignment,a", po::value<string>()->required(), "Bitext word alignment")
+ ("grammars,g", po::value<string>()->required(), "Grammars output path")
("frequent", po::value<int>()->default_value(100),
"Number of precomputed frequent patterns")
("super_frequent", po::value<int>()->default_value(10),
"Number of precomputed super frequent patterns")
- ("max_rule_span,s", po::value<int>()->default_value(15),
+ ("max_rule_span", po::value<int>()->default_value(15),
"Maximum rule span")
("max_rule_symbols,l", po::value<int>()->default_value(5),
"Maximum number of symbols (terminals + nontermals) in a rule")
- ("min_gap_size,g", po::value<int>()->default_value(1), "Minimum gap size")
- ("max_phrase_len,p", po::value<int>()->default_value(4),
+ ("min_gap_size", po::value<int>()->default_value(1), "Minimum gap size")
+ ("max_phrase_len", po::value<int>()->default_value(4),
"Maximum frequent phrase length")
("max_nonterminals", po::value<int>()->default_value(2),
"Maximum number of nonterminals in a rule")
("min_frequency", po::value<int>()->default_value(1000),
"Minimum number of occurences for a pharse to be considered frequent")
+ ("max_samples", po::value<int>()->default_value(300),
+ "Maximum number of samples")
+ ("tight_phrases", po::value<bool>()->default_value(true),
+ "False if phrases may be loose (better, but slower)")
("baeza_yates", po::value<bool>()->default_value(true),
"Use double binary search");
@@ -74,9 +94,10 @@ int main(int argc, char** argv) {
make_shared<SuffixArray>(source_data_array);
- Alignment alignment(vm["alignment"].as<string>());
+ shared_ptr<Alignment> alignment =
+ make_shared<Alignment>(vm["alignment"].as<string>());
- Precomputation precomputation(
+ shared_ptr<Precomputation> precomputation = make_shared<Precomputation>(
source_suffix_array,
vm["frequent"].as<int>(),
vm["super_frequent"].as<int>(),
@@ -86,7 +107,19 @@ int main(int argc, char** argv) {
vm["max_phrase_len"].as<int>(),
vm["min_frequency"].as<int>());
- TranslationTable table(source_data_array, target_data_array, alignment);
+ shared_ptr<TranslationTable> table = make_shared<TranslationTable>(
+ source_data_array, target_data_array, alignment);
+
+ vector<shared_ptr<Feature> > features = {
+ make_shared<TargetGivenSourceCoherent>(),
+ make_shared<SampleSourceCount>(),
+ make_shared<CountSourceTarget>(),
+ make_shared<MaxLexTargetGivenSource>(table),
+ make_shared<MaxLexSourceGivenTarget>(table),
+ make_shared<IsSourceSingleton>(),
+ make_shared<IsSourceTargetSingleton>()
+ };
+ shared_ptr<Scorer> scorer = make_shared<Scorer>(features);
// TODO(pauldb): Add parallelization.
GrammarExtractor extractor(
@@ -94,15 +127,34 @@ int main(int argc, char** argv) {
target_data_array,
alignment,
precomputation,
+ scorer,
vm["min_gap_size"].as<int>(),
vm["max_rule_span"].as<int>(),
vm["max_nonterminals"].as<int>(),
vm["max_rule_symbols"].as<int>(),
- vm["baeza_yates"].as<bool>());
+ vm["max_samples"].as<int>(),
+ vm["baeza_yates"].as<bool>(),
+ vm["tight_phrases"].as<bool>());
- string sentence;
+ int grammar_id = 0;
+ fs::path grammar_path = vm["grammars"].as<string>();
+ string sentence, delimiter = "|||";
while (getline(cin, sentence)) {
- extractor.GetGrammar(sentence);
+ string suffix = "";
+ int position = sentence.find(delimiter);
+ if (position != sentence.npos) {
+ suffix = sentence.substr(position);
+ sentence = sentence.substr(0, position);
+ }
+
+ Grammar grammar = extractor.GetGrammar(sentence);
+ fs::path grammar_file = grammar_path / to_string(grammar_id);
+ ofstream output(grammar_file.c_str());
+ output << grammar;
+
+ cout << "<seg grammar=\"" << grammar_file << "\" id=\"" << grammar_id
+ << "\"> " << sentence << " </seg> " << suffix << endl;
+ ++grammar_id;
}
return 0;