diff options
Diffstat (limited to 'extractor/run_extractor.cc')
| -rw-r--r-- | extractor/run_extractor.cc | 27 | 
1 files changed, 21 insertions, 6 deletions
| diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index dba4578c..d5ff23b2 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -35,6 +35,7 @@ using namespace std;  using namespace extractor;  using namespace features; +// Returns the file path in which a given grammar should be written.  fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) {    string file_name = "grammar." + to_string(file_number);    return grammar_path / file_name; @@ -45,6 +46,7 @@ int main(int argc, char** argv) {    #pragma omp parallel    num_threads_default = omp_get_num_threads(); +  // Sets up the command line arguments map.    po::options_description desc("Command line options");    desc.add_options()      ("help,h", "Show available options") @@ -69,7 +71,7 @@ int main(int argc, char** argv) {      ("max_nonterminals", po::value<int>()->default_value(2),          "Maximum number of nonterminals in a rule")      ("min_frequency", po::value<int>()->default_value(1000), -        "Minimum number of occurences for a pharse to be considered frequent") +        "Minimum number of occurrences for a pharse to be considered frequent")      ("max_samples", po::value<int>()->default_value(300),          "Maximum number of samples")      ("tight_phrases", po::value<bool>()->default_value(true), @@ -78,8 +80,8 @@ int main(int argc, char** argv) {    po::variables_map vm;    po::store(po::parse_command_line(argc, argv, desc), vm); -  // Check for help argument before notify, so we don't need to pass in the -  // required parameters. +  // Checks for the help option before calling notify, so that we don't get an +  // exception for missing required arguments.    if (vm.count("help")) {      cout << desc << endl;      return 0; @@ -94,6 +96,7 @@ int main(int argc, char** argv) {      return 1;    } +  // Reads the parallel corpus.    Clock::time_point preprocess_start_time = Clock::now();    cerr << "Reading source and target data..." 
<< endl;    Clock::time_point start_time = Clock::now(); @@ -111,6 +114,7 @@ int main(int argc, char** argv) {    cerr << "Reading data took " << GetDuration(start_time, stop_time)         << " seconds" << endl; +  // Constructs the suffix array for the source data.    cerr << "Creating source suffix array..." << endl;    start_time = Clock::now();    shared_ptr<SuffixArray> source_suffix_array = @@ -119,6 +123,7 @@ int main(int argc, char** argv) {    cerr << "Creating suffix array took "         << GetDuration(start_time, stop_time) << " seconds" << endl; +  // Reads the alignment.    cerr << "Reading alignment..." << endl;    start_time = Clock::now();    shared_ptr<Alignment> alignment = @@ -127,6 +132,8 @@ int main(int argc, char** argv) {    cerr << "Reading alignment took "         << GetDuration(start_time, stop_time) << " seconds" << endl; +  // Constructs an index storing the occurrences in the source data for each +  // frequent collocation.    cerr << "Precomputing collocations..." << endl;    start_time = Clock::now();    shared_ptr<Precomputation> precomputation = make_shared<Precomputation>( @@ -142,6 +149,8 @@ int main(int argc, char** argv) {    cerr << "Precomputing collocations took "         << GetDuration(start_time, stop_time) << " seconds" << endl; +  // Constructs a table storing p(e | f) and p(f | e) for every pair of source +  // and target words.    cerr << "Precomputing conditional probabilities..." << endl;    start_time = Clock::now();    shared_ptr<TranslationTable> table = make_shared<TranslationTable>( @@ -155,6 +164,7 @@ int main(int argc, char** argv) {         << GetDuration(preprocess_start_time, preprocess_stop_time)         << " seconds" << endl; +  // Features used to score each grammar rule.    
Clock::time_point extraction_start_time = Clock::now();    vector<shared_ptr<Feature> > features = {        make_shared<TargetGivenSourceCoherent>(), @@ -167,6 +177,7 @@ int main(int argc, char** argv) {    };    shared_ptr<Scorer> scorer = make_shared<Scorer>(features); +  // Sets up the grammar extractor.    GrammarExtractor extractor(        source_suffix_array,        target_data_array, @@ -180,26 +191,30 @@ int main(int argc, char** argv) {        vm["max_samples"].as<int>(),        vm["tight_phrases"].as<bool>()); -  // Release extra memory used by the initial precomputation. +  // Releases extra memory used by the initial precomputation.    precomputation.reset(); +  // Creates the grammars directory if it doesn't exist.    fs::path grammar_path = vm["grammars"].as<string>();    if (!fs::is_directory(grammar_path)) {      fs::create_directory(grammar_path);    } +  // Reads all sentences for which we extract grammar rules (the parallelization +  // is simplified if we read all sentences upfront).    string sentence;    vector<string> sentences;    while (getline(cin, sentence)) {      sentences.push_back(sentence);    } +  // Extracts the grammar for each sentence and saves it to a file.    vector<string> suffixes(sentences.size());    #pragma omp parallel for schedule(dynamic) \        num_threads(vm["threads"].as<int>())    for (size_t i = 0; i < sentences.size(); ++i) { -    string delimiter = "|||", suffix; -    int position = sentences[i].find(delimiter); +    string suffix; +    int position = sentences[i].find("|||");      if (position != sentences[i].npos) {        suffix = sentences[i].substr(position);        sentences[i] = sentences[i].substr(0, position); | 
