summaryrefslogtreecommitdiff
path: root/extractor/compile.cc
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-06-04 23:17:57 +0100
committerPaul Baltescu <pauldb89@gmail.com>2013-06-04 23:17:57 +0100
commita3243017d6b8c46cc3e41f4243311dc3dbc80ab4 (patch)
tree8b4ac016293196f10b48bed86fe9db0bdc371ac8 /extractor/compile.cc
parent4a254c100a7565633fb79d57bffafdcb4e8f03db (diff)
Serialize data structures.
Diffstat (limited to 'extractor/compile.cc')
-rw-r--r--extractor/compile.cc82
1 files changed, 76 insertions, 6 deletions
diff --git a/extractor/compile.cc b/extractor/compile.cc
index a9ae2cef..65fdd509 100644
--- a/extractor/compile.cc
+++ b/extractor/compile.cc
@@ -1,6 +1,8 @@
+#include <fstream>
#include <iostream>
#include <string>
+#include <boost/archive/binary_oarchive.hpp>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
@@ -9,8 +11,10 @@
#include "data_array.h"
#include "precomputation.h"
#include "suffix_array.h"
+#include "time_util.h"
#include "translation_table.h"
+namespace ar = boost::archive;
namespace fs = boost::filesystem;
namespace po = boost::program_options;
using namespace std;
@@ -58,11 +62,14 @@ int main(int argc, char** argv) {
return 1;
}
- fs::path output_dir(vm["output"].as<string>().c_str());
+ fs::path output_dir(vm["output"].as<string>());
if (!fs::exists(output_dir)) {
fs::create_directory(output_dir);
}
+ // Reading source and target data.
+ Clock::time_point start_time = Clock::now();
+ cerr << "Reading source and target data..." << endl;
shared_ptr<DataArray> source_data_array, target_data_array;
if (vm.count("bitext")) {
source_data_array = make_shared<DataArray>(
@@ -73,15 +80,53 @@ int main(int argc, char** argv) {
source_data_array = make_shared<DataArray>(vm["source"].as<string>());
target_data_array = make_shared<DataArray>(vm["target"].as<string>());
}
+
+ Clock::time_point start_write = Clock::now();
+ ofstream target_fstream((output_dir / fs::path("target.bin")).string());
+ ar::binary_oarchive target_stream(target_fstream);
+ target_stream << *target_data_array;
+ Clock::time_point stop_write = Clock::now();
+ double write_duration = GetDuration(start_write, stop_write);
+
+ Clock::time_point stop_time = Clock::now();
+ cerr << "Reading data took " << GetDuration(start_time, stop_time)
+ << " seconds" << endl;
+
+ // Constructing and compiling the suffix array.
+ start_time = Clock::now();
+ cerr << "Constructing source suffix array..." << endl;
shared_ptr<SuffixArray> source_suffix_array =
make_shared<SuffixArray>(source_data_array);
- source_suffix_array->WriteBinary(output_dir / fs::path("f.bin"));
- target_data_array->WriteBinary(output_dir / fs::path("e.bin"));
+ start_write = Clock::now();
+ ofstream source_fstream((output_dir / fs::path("source.bin")).string());
+ ar::binary_oarchive output_stream(source_fstream);
+ output_stream << *source_suffix_array;
+ stop_write = Clock::now();
+ write_duration += GetDuration(start_write, stop_write);
+
+ cerr << "Constructing suffix array took "
+ << GetDuration(start_time, stop_time) << " seconds" << endl;
+
+ // Reading alignment.
+ start_time = Clock::now();
+ cerr << "Reading alignment..." << endl;
shared_ptr<Alignment> alignment =
make_shared<Alignment>(vm["alignment"].as<string>());
- alignment->WriteBinary(output_dir / fs::path("a.bin"));
+ start_write = Clock::now();
+ ofstream alignment_fstream((output_dir / fs::path("alignment.bin")).string());
+ ar::binary_oarchive alignment_stream(alignment_fstream);
+ alignment_stream << *alignment;
+ stop_write = Clock::now();
+ write_duration += GetDuration(start_write, stop_write);
+
+ stop_time = Clock::now();
+ cerr << "Reading alignment took "
+ << GetDuration(start_time, stop_time) << " seconds" << endl;
+
+ start_time = Clock::now();
+ cerr << "Precomputing collocations..." << endl;
Precomputation precomputation(
source_suffix_array,
vm["frequent"].as<int>(),
@@ -91,10 +136,35 @@ int main(int argc, char** argv) {
vm["min_gap_size"].as<int>(),
vm["max_phrase_len"].as<int>(),
vm["min_frequency"].as<int>());
- precomputation.WriteBinary(output_dir / fs::path("precompute.bin"));
+ start_write = Clock::now();
+ ofstream precomp_fstream((output_dir / fs::path("precomp.bin")).string());
+ ar::binary_oarchive precomp_stream(precomp_fstream);
+ precomp_stream << precomputation;
+ stop_write = Clock::now();
+ write_duration += GetDuration(start_write, stop_write);
+
+ stop_time = Clock::now();
+ cerr << "Precomputing collocations took "
+ << GetDuration(start_time, stop_time) << " seconds" << endl;
+
+ start_time = Clock::now();
+ cerr << "Precomputing conditional probabilities..." << endl;
TranslationTable table(source_data_array, target_data_array, alignment);
- table.WriteBinary(output_dir / fs::path("lex.bin"));
+
+ start_write = Clock::now();
+ ofstream table_fstream((output_dir / fs::path("bilex.bin")).string());
+ ar::binary_oarchive table_stream(table_fstream);
+ table_stream << table;
+ stop_write = Clock::now();
+ write_duration += GetDuration(start_write, stop_write);
+
+ stop_time = Clock::now();
+ cerr << "Precomputing conditional probabilities took "
+ << GetDuration(start_time, stop_time) << " seconds" << endl;
+
+ cerr << "Total time spent writing: " << write_duration
+ << " seconds" << endl;
return 0;
}