From deb555e5ab40a62738269050b43b412335d4b66a Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 12 Apr 2016 10:07:48 +0200
Subject: extractor: gzip compressed grammars

---
 extractor/run_extractor.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'extractor/run_extractor.cc')

diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 00564a36..81d0d8be 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -24,6 +24,7 @@
 #include "features/max_lex_target_given_source.h"
 #include "features/sample_source_count.h"
 #include "features/target_given_source_coherent.h"
+#include "filelib.h"
 #include "grammar.h"
 #include "grammar_extractor.h"
 #include "precomputation.h"
@@ -41,8 +42,8 @@ using namespace extractor;
 using namespace features;
 
 // Returns the file path in which a given grammar should be written.
-fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) {
-  string file_name = "grammar." + to_string(file_number);
+fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) {
+  string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : "");
   return grammar_path / file_name;
 }
 
@@ -61,6 +62,7 @@ int main(int argc, char** argv) {
     ("bitext,b", po::value(), "Parallel text (source ||| target)")
     ("alignment,a", po::value()->required(), "Bitext word alignment")
     ("grammars,g", po::value()->required(), "Grammars output path")
+    ("gzip,z", "Gzip grammars")
     ("threads,t", po::value()->default_value(1), threads_option.c_str())
     ("frequent", po::value()->default_value(100),
         "Number of precomputed frequent patterns")
@@ -205,6 +207,7 @@ int main(int argc, char** argv) {
       vm["max_rule_symbols"].as(),
       vm["max_samples"].as(),
       vm["tight_phrases"].as());
+  const bool use_zip = vm.count("gzip");
 
   // Creates the grammars directory if it doesn't exist.
   fs::path grammar_path = vm["grammars"].as();
@@ -239,12 +242,12 @@ int main(int argc, char** argv) {
     }
     Grammar grammar = extractor.GetGrammar(
         sentences[i], blacklisted_sentence_ids);
-    ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
-    output << grammar;
+    WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str());
+    *wf.stream() << grammar;
   }
 
   for (size_t i = 0; i < sentences.size(); ++i) {
-    cout << " " << sentences[i] << " " << suffixes[i] << endl;
   }
 
-- 
cgit v1.2.3