From 0187447a643c3ea262b13b3052cb1531990eafe6 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Tue, 19 Feb 2013 21:23:48 +0000
Subject: Timing every part of the extractor.

---
 extractor/run_extractor.cc | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

(limited to 'extractor/run_extractor.cc')

diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index ed30e6fe..38f10a5f 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -1,3 +1,4 @@
+#include <chrono>
 #include <fstream>
 #include <iostream>
 #include <string>
@@ -23,6 +24,7 @@
 #include "rule.h"
 #include "scorer.h"
 #include "suffix_array.h"
+#include "time_util.h"
 #include "translation_table.h"
 
 namespace fs = boost::filesystem;
@@ -56,6 +58,9 @@ int main(int argc, char** argv) {
         "Minimum number of occurences for a pharse to be considered frequent")
     ("max_samples", po::value<int>()->default_value(300),
         "Maximum number of samples")
+    ("fast_intersect", po::value<bool>()->default_value(false),
+        "Enable fast intersect")
+    // TODO(pauldb): Check if this works when set to false.
     ("tight_phrases", po::value<bool>()->default_value(true),
         "False if phrases may be loose (better, but slower)")
     ("baeza_yates", po::value<bool>()->default_value(true),
@@ -80,6 +85,9 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  Clock::time_point preprocess_start_time = Clock::now();
+  cerr << "Reading source and target data..." << endl;
+  Clock::time_point start_time = Clock::now();
   shared_ptr<DataArray> source_data_array, target_data_array;
   if (vm.count("bitext")) {
     source_data_array = make_shared<DataArray>(
@@ -90,13 +98,28 @@ int main(int argc, char** argv) {
     source_data_array = make_shared<DataArray>(vm["source"].as<string>());
     target_data_array = make_shared<DataArray>(vm["target"].as<string>());
   }
+  Clock::time_point stop_time = Clock::now();
+  cerr << "Reading data took " << GetDuration(start_time, stop_time)
+       << " seconds" << endl;
+
+  cerr << "Creating source suffix array..." << endl;
+  start_time = Clock::now();
   shared_ptr<SuffixArray> source_suffix_array =
       make_shared<SuffixArray>(source_data_array);
+  stop_time = Clock::now();
+  cerr << "Creating suffix array took "
+       << GetDuration(start_time, stop_time) << " seconds" << endl;
 
-
+  cerr << "Reading alignment..." << endl;
+  start_time = Clock::now();
   shared_ptr<Alignment> alignment = make_shared<Alignment>(
       vm["alignment"].as<string>());
+  stop_time = Clock::now();
+  cerr << "Reading alignment took "
+       << GetDuration(start_time, stop_time) << " seconds" << endl;
 
+  cerr << "Precomputating collocations..." << endl;
+  start_time = Clock::now();
   shared_ptr<Precomputation> precomputation = make_shared<Precomputation>(
       source_suffix_array,
       vm["frequent"].as<int>(),
@@ -106,10 +129,24 @@ int main(int argc, char** argv) {
       vm["min_gap_size"].as<int>(),
       vm["max_phrase_len"].as<int>(),
       vm["min_frequency"].as<int>());
+  stop_time = Clock::now();
+  cerr << "Precomputing collocations took "
+       << GetDuration(start_time, stop_time) << " seconds" << endl;
 
+  cerr << "Precomputing conditional probabilities..." << endl;
+  start_time = Clock::now();
   shared_ptr<TranslationTable> table = make_shared<TranslationTable>(
       source_data_array, target_data_array, alignment);
+  stop_time = Clock::now();
+  cerr << "Precomputing conditional probabilities took "
+       << GetDuration(start_time, stop_time) << " seconds" << endl;
+
+  Clock::time_point preprocess_stop_time = Clock::now();
+  cerr << "Overall preprocessing step took "
+       << GetDuration(preprocess_start_time, preprocess_stop_time)
+       << " seconds" << endl;
 
+  Clock::time_point extraction_start_time = Clock::now();
   vector<shared_ptr<Feature> > features = {
       make_shared<TargetGivenSourceCoherent>(),
       make_shared<SampleSourceCount>(),
@@ -133,6 +170,7 @@ int main(int argc, char** argv) {
       vm["max_nonterminals"].as<int>(),
       vm["max_rule_symbols"].as<int>(),
       vm["max_samples"].as<int>(),
+      vm["fast_intersect"].as<bool>(),
       vm["baeza_yates"].as<bool>(),
       vm["tight_phrases"].as<bool>());
 
@@ -161,6 +199,10 @@ int main(int argc, char** argv) {
          << "\"> " << sentence << " " << suffix << endl;
     ++grammar_id;
   }
+  Clock::time_point extraction_stop_time = Clock::now();
+  cerr << "Overall extraction step took "
+       << GetDuration(extraction_start_time, extraction_stop_time)
+       << " seconds" << endl;
 
   return 0;
 }
-- 
cgit v1.2.3