diff options
Diffstat (limited to 'extractor/run_extractor.cc')
-rw-r--r-- | extractor/run_extractor.cc | 27 |
1 files changed, 21 insertions, 6 deletions
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index dba4578c..d5ff23b2 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -35,6 +35,7 @@ using namespace std; using namespace extractor; using namespace features; +// Returns the file path in which a given grammar should be written. fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) { string file_name = "grammar." + to_string(file_number); return grammar_path / file_name; @@ -45,6 +46,7 @@ int main(int argc, char** argv) { #pragma omp parallel num_threads_default = omp_get_num_threads(); + // Sets up the command line arguments map. po::options_description desc("Command line options"); desc.add_options() ("help,h", "Show available options") @@ -69,7 +71,7 @@ int main(int argc, char** argv) { ("max_nonterminals", po::value<int>()->default_value(2), "Maximum number of nonterminals in a rule") ("min_frequency", po::value<int>()->default_value(1000), - "Minimum number of occurences for a pharse to be considered frequent") + "Minimum number of occurrences for a pharse to be considered frequent") ("max_samples", po::value<int>()->default_value(300), "Maximum number of samples") ("tight_phrases", po::value<bool>()->default_value(true), @@ -78,8 +80,8 @@ int main(int argc, char** argv) { po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); - // Check for help argument before notify, so we don't need to pass in the - // required parameters. + // Checks for the help option before calling notify, so the we don't get an + // exception for missing required arguments. if (vm.count("help")) { cout << desc << endl; return 0; @@ -94,6 +96,7 @@ int main(int argc, char** argv) { return 1; } + // Reads the parallel corpus. Clock::time_point preprocess_start_time = Clock::now(); cerr << "Reading source and target data..." << endl; Clock::time_point start_time = Clock::now(); @@ -111,6 +114,7 @@ int main(int argc, char** argv) { cerr << "Reading data took " << GetDuration(start_time, stop_time) << " seconds" << endl; + // Constructs the suffix array for the source data. cerr << "Creating source suffix array..." << endl; start_time = Clock::now(); shared_ptr<SuffixArray> source_suffix_array = @@ -119,6 +123,7 @@ int main(int argc, char** argv) { cerr << "Creating suffix array took " << GetDuration(start_time, stop_time) << " seconds" << endl; + // Reads the alignment. cerr << "Reading alignment..." << endl; start_time = Clock::now(); shared_ptr<Alignment> alignment = @@ -127,6 +132,8 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; + // Constructs an index storing the occurrences in the source data for each + // frequent collocation. cerr << "Precomputing collocations..." << endl; start_time = Clock::now(); shared_ptr<Precomputation> precomputation = make_shared<Precomputation>( @@ -142,6 +149,8 @@ int main(int argc, char** argv) { cerr << "Precomputing collocations took " << GetDuration(start_time, stop_time) << " seconds" << endl; + // Constructs a table storing p(e | f) and p(f | e) for every pair of source + // and target words. cerr << "Precomputing conditional probabilities..." << endl; start_time = Clock::now(); shared_ptr<TranslationTable> table = make_shared<TranslationTable>( @@ -155,6 +164,7 @@ int main(int argc, char** argv) { << GetDuration(preprocess_start_time, preprocess_stop_time) << " seconds" << endl; + // Features used to score each grammar rule. Clock::time_point extraction_start_time = Clock::now(); vector<shared_ptr<Feature> > features = { make_shared<TargetGivenSourceCoherent>(), @@ -167,6 +177,7 @@ int main(int argc, char** argv) { }; shared_ptr<Scorer> scorer = make_shared<Scorer>(features); + // Sets up the grammar extractor. GrammarExtractor extractor( source_suffix_array, target_data_array, @@ -180,26 +191,30 @@ int main(int argc, char** argv) { vm["max_samples"].as<int>(), vm["tight_phrases"].as<bool>()); - // Release extra memory used by the initial precomputation. + // Releases extra memory used by the initial precomputation. precomputation.reset(); + // Creates the grammars directory if it doesn't exist. fs::path grammar_path = vm["grammars"].as<string>(); if (!fs::is_directory(grammar_path)) { fs::create_directory(grammar_path); } + // Reads all sentences for which we extract grammar rules (the paralellization + // is simplified if we read all sentences upfront). string sentence; vector<string> sentences; while (getline(cin, sentence)) { sentences.push_back(sentence); } + // Extracts the grammar for each sentence and saves it to a file. vector<string> suffixes(sentences.size()); #pragma omp parallel for schedule(dynamic) \ num_threads(vm["threads"].as<int>()) for (size_t i = 0; i < sentences.size(); ++i) { - string delimiter = "|||", suffix; - int position = sentences[i].find(delimiter); + string suffix; + int position = sentences[i].find("|||"); if (position != sentences[i].npos) { suffix = sentences[i].substr(position); sentences[i] = sentences[i].substr(0, position); |