summary refs log tree commit diff
path: root/extractor/run_extractor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/run_extractor.cc')
-rw-r--r--  extractor/run_extractor.cc  28
1 file changed, 15 insertions, 13 deletions
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 6f59f0b6..f1aa5e35 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -5,10 +5,10 @@
#include <string>
#include <vector>
-#include <omp.h>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include <omp.h>
#include "alignment.h"
#include "data_array.h"
@@ -28,6 +28,7 @@
#include "suffix_array.h"
#include "time_util.h"
#include "translation_table.h"
+#include "vocabulary.h"
namespace fs = boost::filesystem;
namespace po = boost::program_options;
@@ -77,7 +78,8 @@ int main(int argc, char** argv) {
("tight_phrases", po::value<bool>()->default_value(true),
"False if phrases may be loose (better, but slower)")
("leave_one_out", po::value<bool>()->zero_tokens(),
- "do leave-one-out estimation (e.g. for extracting grammars for the training set)");
+ "do leave-one-out estimation of grammars "
+      "(e.g. for extracting grammars for the training set)");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
@@ -98,11 +100,6 @@ int main(int argc, char** argv) {
return 1;
}
- bool leave_one_out = false;
- if (vm.count("leave_one_out")) {
- leave_one_out = true;
- }
-
int num_threads = vm["threads"].as<int>();
cerr << "Grammar extraction will use " << num_threads << " threads." << endl;
@@ -142,11 +139,14 @@ int main(int argc, char** argv) {
cerr << "Reading alignment took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
+ shared_ptr<Vocabulary> vocabulary = make_shared<Vocabulary>();
+
// Constructs an index storing the occurrences in the source data for each
// frequent collocation.
start_time = Clock::now();
cerr << "Precomputing collocations..." << endl;
shared_ptr<Precomputation> precomputation = make_shared<Precomputation>(
+ vocabulary,
source_suffix_array,
vm["frequent"].as<int>(),
vm["super_frequent"].as<int>(),
@@ -174,8 +174,8 @@ int main(int argc, char** argv) {
<< GetDuration(preprocess_start_time, preprocess_stop_time)
<< " seconds" << endl;
- // Features used to score each grammar rule.
Clock::time_point extraction_start_time = Clock::now();
+ // Features used to score each grammar rule.
vector<shared_ptr<Feature>> features = {
make_shared<TargetGivenSourceCoherent>(),
make_shared<SampleSourceCount>(),
@@ -194,6 +194,7 @@ int main(int argc, char** argv) {
alignment,
precomputation,
scorer,
+ vocabulary,
vm["min_gap_size"].as<int>(),
vm["max_rule_span"].as<int>(),
vm["max_nonterminals"].as<int>(),
@@ -201,9 +202,6 @@ int main(int argc, char** argv) {
vm["max_samples"].as<int>(),
vm["tight_phrases"].as<bool>());
- // Releases extra memory used by the initial precomputation.
- precomputation.reset();
-
// Creates the grammars directory if it doesn't exist.
fs::path grammar_path = vm["grammars"].as<string>();
if (!fs::is_directory(grammar_path)) {
@@ -219,6 +217,7 @@ int main(int argc, char** argv) {
}
// Extracts the grammar for each sentence and saves it to a file.
+ bool leave_one_out = vm.count("leave_one_out");
vector<string> suffixes(sentences.size());
#pragma omp parallel for schedule(dynamic) num_threads(num_threads)
for (size_t i = 0; i < sentences.size(); ++i) {
@@ -231,8 +230,11 @@ int main(int argc, char** argv) {
suffixes[i] = suffix;
unordered_set<int> blacklisted_sentence_ids;
- if (leave_one_out) blacklisted_sentence_ids.insert(i);
- Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array);
+ if (leave_one_out) {
+ blacklisted_sentence_ids.insert(i);
+ }
+ Grammar grammar = extractor.GetGrammar(
+ sentences[i], blacklisted_sentence_ids);
ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
output << grammar;
}