summaryrefslogtreecommitdiff
path: root/extractor/run_extractor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/run_extractor.cc')
-rw-r--r--extractor/run_extractor.cc27
1 files changed, 21 insertions, 6 deletions
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index dba4578c..d5ff23b2 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -35,6 +35,7 @@ using namespace std;
using namespace extractor;
using namespace features;
+// Returns the file path in which a given grammar should be written.
fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) {
string file_name = "grammar." + to_string(file_number);
return grammar_path / file_name;
@@ -45,6 +46,7 @@ int main(int argc, char** argv) {
#pragma omp parallel
num_threads_default = omp_get_num_threads();
+ // Sets up the command line arguments map.
po::options_description desc("Command line options");
desc.add_options()
("help,h", "Show available options")
@@ -69,7 +71,7 @@ int main(int argc, char** argv) {
("max_nonterminals", po::value<int>()->default_value(2),
"Maximum number of nonterminals in a rule")
("min_frequency", po::value<int>()->default_value(1000),
- "Minimum number of occurences for a pharse to be considered frequent")
+ "Minimum number of occurrences for a pharse to be considered frequent")
("max_samples", po::value<int>()->default_value(300),
"Maximum number of samples")
("tight_phrases", po::value<bool>()->default_value(true),
@@ -78,8 +80,8 @@ int main(int argc, char** argv) {
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
- // Check for help argument before notify, so we don't need to pass in the
- // required parameters.
+ // Checks for the help option before calling notify, so that we don't get an
+ // exception for missing required arguments.
if (vm.count("help")) {
cout << desc << endl;
return 0;
@@ -94,6 +96,7 @@ int main(int argc, char** argv) {
return 1;
}
+ // Reads the parallel corpus.
Clock::time_point preprocess_start_time = Clock::now();
cerr << "Reading source and target data..." << endl;
Clock::time_point start_time = Clock::now();
@@ -111,6 +114,7 @@ int main(int argc, char** argv) {
cerr << "Reading data took " << GetDuration(start_time, stop_time)
<< " seconds" << endl;
+ // Constructs the suffix array for the source data.
cerr << "Creating source suffix array..." << endl;
start_time = Clock::now();
shared_ptr<SuffixArray> source_suffix_array =
@@ -119,6 +123,7 @@ int main(int argc, char** argv) {
cerr << "Creating suffix array took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
+ // Reads the alignment.
cerr << "Reading alignment..." << endl;
start_time = Clock::now();
shared_ptr<Alignment> alignment =
@@ -127,6 +132,8 @@ int main(int argc, char** argv) {
cerr << "Reading alignment took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
+ // Constructs an index storing the occurrences in the source data for each
+ // frequent collocation.
cerr << "Precomputing collocations..." << endl;
start_time = Clock::now();
shared_ptr<Precomputation> precomputation = make_shared<Precomputation>(
@@ -142,6 +149,8 @@ int main(int argc, char** argv) {
cerr << "Precomputing collocations took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
+ // Constructs a table storing p(e | f) and p(f | e) for every pair of source
+ // and target words.
cerr << "Precomputing conditional probabilities..." << endl;
start_time = Clock::now();
shared_ptr<TranslationTable> table = make_shared<TranslationTable>(
@@ -155,6 +164,7 @@ int main(int argc, char** argv) {
<< GetDuration(preprocess_start_time, preprocess_stop_time)
<< " seconds" << endl;
+ // Features used to score each grammar rule.
Clock::time_point extraction_start_time = Clock::now();
vector<shared_ptr<Feature> > features = {
make_shared<TargetGivenSourceCoherent>(),
@@ -167,6 +177,7 @@ int main(int argc, char** argv) {
};
shared_ptr<Scorer> scorer = make_shared<Scorer>(features);
+ // Sets up the grammar extractor.
GrammarExtractor extractor(
source_suffix_array,
target_data_array,
@@ -180,26 +191,30 @@ int main(int argc, char** argv) {
vm["max_samples"].as<int>(),
vm["tight_phrases"].as<bool>());
- // Release extra memory used by the initial precomputation.
+ // Releases extra memory used by the initial precomputation.
precomputation.reset();
+ // Creates the grammars directory if it doesn't exist.
fs::path grammar_path = vm["grammars"].as<string>();
if (!fs::is_directory(grammar_path)) {
fs::create_directory(grammar_path);
}
+ // Reads all sentences for which we extract grammar rules (the parallelization
+ // is simplified if we read all sentences upfront).
string sentence;
vector<string> sentences;
while (getline(cin, sentence)) {
sentences.push_back(sentence);
}
+ // Extracts the grammar for each sentence and saves it to a file.
vector<string> suffixes(sentences.size());
#pragma omp parallel for schedule(dynamic) \
num_threads(vm["threads"].as<int>())
for (size_t i = 0; i < sentences.size(); ++i) {
- string delimiter = "|||", suffix;
- int position = sentences[i].find(delimiter);
+ string suffix;
+ int position = sentences[i].find("|||");
if (position != sentences[i].npos) {
suffix = sentences[i].substr(position);
sentences[i] = sentences[i].substr(0, position);