diff options
Diffstat (limited to 'extractor/run_extractor.cc')
| -rw-r--r-- | extractor/run_extractor.cc | 28 | 
1 file changed, 15 insertions, 13 deletions
| diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 6f59f0b6..f1aa5e35 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -5,10 +5,10 @@  #include <string>  #include <vector> -#include <omp.h>  #include <boost/filesystem.hpp>  #include <boost/program_options.hpp>  #include <boost/program_options/variables_map.hpp> +#include <omp.h>  #include "alignment.h"  #include "data_array.h" @@ -28,6 +28,7 @@  #include "suffix_array.h"  #include "time_util.h"  #include "translation_table.h" +#include "vocabulary.h"  namespace fs = boost::filesystem;  namespace po = boost::program_options; @@ -77,7 +78,8 @@ int main(int argc, char** argv) {      ("tight_phrases", po::value<bool>()->default_value(true),          "False if phrases may be loose (better, but slower)")      ("leave_one_out", po::value<bool>()->zero_tokens(), -        "do leave-one-out estimation (e.g. for extracting grammars for the training set)"); +        "do leave-one-out estimation of grammars " +        "(e.g. for extracting grammars for the training set");    po::variables_map vm;    po::store(po::parse_command_line(argc, argv, desc), vm); @@ -98,11 +100,6 @@ int main(int argc, char** argv) {      return 1;    } -  bool leave_one_out = false; -  if (vm.count("leave_one_out")) { -    leave_one_out = true; -  } -    int num_threads = vm["threads"].as<int>();    cerr << "Grammar extraction will use " << num_threads << " threads." << endl; @@ -142,11 +139,14 @@ int main(int argc, char** argv) {    cerr << "Reading alignment took "         << GetDuration(start_time, stop_time) << " seconds" << endl; +  shared_ptr<Vocabulary> vocabulary = make_shared<Vocabulary>(); +    // Constructs an index storing the occurrences in the source data for each    // frequent collocation.    start_time = Clock::now();    cerr << "Precomputing collocations..." 
<< endl;    shared_ptr<Precomputation> precomputation = make_shared<Precomputation>( +      vocabulary,        source_suffix_array,        vm["frequent"].as<int>(),        vm["super_frequent"].as<int>(), @@ -174,8 +174,8 @@ int main(int argc, char** argv) {         << GetDuration(preprocess_start_time, preprocess_stop_time)         << " seconds" << endl; -  // Features used to score each grammar rule.    Clock::time_point extraction_start_time = Clock::now(); +  // Features used to score each grammar rule.    vector<shared_ptr<Feature>> features = {        make_shared<TargetGivenSourceCoherent>(),        make_shared<SampleSourceCount>(), @@ -194,6 +194,7 @@ int main(int argc, char** argv) {        alignment,        precomputation,        scorer, +      vocabulary,        vm["min_gap_size"].as<int>(),        vm["max_rule_span"].as<int>(),        vm["max_nonterminals"].as<int>(), @@ -201,9 +202,6 @@ int main(int argc, char** argv) {        vm["max_samples"].as<int>(),        vm["tight_phrases"].as<bool>()); -  // Releases extra memory used by the initial precomputation. -  precomputation.reset(); -    // Creates the grammars directory if it doesn't exist.    fs::path grammar_path = vm["grammars"].as<string>();    if (!fs::is_directory(grammar_path)) { @@ -219,6 +217,7 @@ int main(int argc, char** argv) {    }    // Extracts the grammar for each sentence and saves it to a file. 
+  bool leave_one_out = vm.count("leave_one_out");    vector<string> suffixes(sentences.size());    #pragma omp parallel for schedule(dynamic) num_threads(num_threads)    for (size_t i = 0; i < sentences.size(); ++i) { @@ -231,8 +230,11 @@ int main(int argc, char** argv) {      suffixes[i] = suffix;      unordered_set<int> blacklisted_sentence_ids; -    if (leave_one_out) blacklisted_sentence_ids.insert(i); -    Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array); +    if (leave_one_out) { +      blacklisted_sentence_ids.insert(i); +    } +    Grammar grammar = extractor.GetGrammar( +        sentences[i], blacklisted_sentence_ids);      ofstream output(GetGrammarFilePath(grammar_path, i).c_str());      output << grammar;    } | 
