From 3973a7e4a8302b4a02fee7d2950bb469b37e2452 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Sun, 24 Nov 2013 13:19:28 +0000
Subject: Reduce memory overhead for constructing the intersector.

---
 extractor/compile.cc | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'extractor/compile.cc')
diff --git a/extractor/compile.cc b/extractor/compile.cc
index 65fdd509..0d62757e 100644
--- a/extractor/compile.cc
+++ b/extractor/compile.cc
@@ -13,6 +13,7 @@
 #include "suffix_array.h"
 #include "time_util.h"
 #include "translation_table.h"
+#include "vocabulary.h"
 
 namespace ar = boost::archive;
 namespace fs = boost::filesystem;
@@ -125,9 +126,12 @@ int main(int argc, char** argv) {
   cerr << "Reading alignment took "
        << GetDuration(start_time, stop_time) << " seconds" << endl;
 
+  shared_ptr<Vocabulary> vocabulary;
+
   start_time = Clock::now();
   cerr << "Precomputing collocations..." << endl;
   Precomputation precomputation(
+      vocabulary,
       source_suffix_array,
       vm["frequent"].as<int>(),
       vm["super_frequent"].as<int>(),
-- 
cgit v1.2.3


From e633526bc2ba1f73e88989f495d70c0d2ec84a97 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Tue, 26 Nov 2013 01:14:28 +0000
Subject: Serialize vocabulary.

---
 extractor/Makefile.am        | 12 +++++++++---
 extractor/compile.cc         |  4 ++++
 extractor/vocabulary.cc      |  4 ++++
 extractor/vocabulary.h       | 23 +++++++++++++++++++++-
 extractor/vocabulary_test.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 84 insertions(+), 4 deletions(-)
 create mode 100644 extractor/vocabulary_test.cc

(limited to 'extractor/compile.cc')

diff --git a/extractor/Makefile.am b/extractor/Makefile.am
index 65a3d436..64a5a2b5 100644
--- a/extractor/Makefile.am
+++ b/extractor/Makefile.am
@@ -24,7 +24,8 @@ EXTRA_PROGRAMS = alignment_test \
     scorer_test \
     suffix_array_test \
     target_phrase_extractor_test \
-    translation_table_test
+    translation_table_test \
+    vocabulary_test
 
 if HAVE_GTEST
   RUNNABLE_TESTS = alignment_test \
@@ -48,12 +49,14 @@ if HAVE_GTEST
     scorer_test \
     suffix_array_test \
     target_phrase_extractor_test \
-    translation_table_test
+    translation_table_test \
+    vocabulary_test
 endif
 
 noinst_PROGRAMS = $(RUNNABLE_TESTS)
 
-TESTS = $(RUNNABLE_TESTS)
+# Run the full suite; vocabulary_test is already part of RUNNABLE_TESTS.
+TESTS = $(RUNNABLE_TESTS)
 
 alignment_test_SOURCES = alignment_test.cc
 alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
@@ -99,6 +102,8 @@ target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc
 target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
 translation_table_test_SOURCES = translation_table_test.cc
 translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
+vocabulary_test_SOURCES = vocabulary_test.cc
+vocabulary_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
 
 noinst_LIBRARIES = libextractor.a libcompile.a
 
@@ -115,6 +120,7 @@ libcompile_a_SOURCES = \
   suffix_array.cc \
   time_util.cc \
   translation_table.cc \
+  vocabulary.cc \
   alignment.h \
   data_array.h \
   fast_intersector.h \
diff --git a/extractor/compile.cc b/extractor/compile.cc
index 0d62757e..9e8044ad 100644
--- a/extractor/compile.cc
+++ b/extractor/compile.cc
@@ -145,6 +145,10 @@ int main(int argc, char** argv) {
   ofstream precomp_fstream((output_dir / fs::path("precomp.bin")).string());
   ar::binary_oarchive precomp_stream(precomp_fstream);
   precomp_stream << precomputation;
+
+  ofstream vocab_fstream((output_dir / fs::path("vocab.bin")).string());
+  ar::binary_oarchive vocab_stream(vocab_fstream);
+  vocab_stream << *vocabulary;
   stop_write = Clock::now();
   write_duration += GetDuration(start_write, stop_write);
 
diff --git a/extractor/vocabulary.cc b/extractor/vocabulary.cc
index aef674a5..c9c2d6f4 100644
--- a/extractor/vocabulary.cc
+++ b/extractor/vocabulary.cc
@@ -35,4 +35,8 @@ string Vocabulary::GetTerminalValue(int symbol) {
   return word;
 }
 
+bool Vocabulary::operator==(const Vocabulary& rhs) const {
+  return dictionary == rhs.dictionary && words == rhs.words;
+}
+
 } // namespace extractor
diff --git a/extractor/vocabulary.h b/extractor/vocabulary.h
index c8fd9411..db092e99 100644
--- a/extractor/vocabulary.h
+++ b/extractor/vocabulary.h
@@ -5,6 +5,10 @@
 #include <unordered_map>
 #include <vector>
 
+#include <boost/serialization/serialization.hpp>
+#include <boost/serialization/string.hpp>
+#include <boost/serialization/vector.hpp>
+
 using namespace std;
 
 namespace extractor {
@@ -14,7 +18,7 @@ namespace extractor {
  *
  * This strucure contains words located in the frequent collocations and words
  * encountered during the grammar extraction time. This dictionary is
- * considerably smaller than the dictionaries in the data arrays (and so is the
+ * considerably smaller than the dictionaries in the data arrays (and so is the
  * query time). Note that this is the single data structure that changes state
  * and needs to have thread safe read/write operations.
  *
@@ -38,7 +42,24 @@ class Vocabulary {
   // Returns the word corresponding to the given word id.
   virtual string GetTerminalValue(int symbol);
 
+  bool operator==(const Vocabulary& vocabulary) const;
+
  private:
+  friend class boost::serialization::access;
+
+  template<class Archive> void save(Archive& ar, unsigned int) const {
+    ar << words;  // Only the word list is stored; load() rebuilds the index.
+  }
+
+  template<class Archive> void load(Archive& ar, unsigned int) {
+    ar >> words;
+    dictionary.clear();  // Drop stale ids if this object was already in use.
+    for (size_t i = 0; i < words.size(); ++i)
+      dictionary[words[i]] = static_cast<int>(i);  // id == position in words
+  }
+
+  BOOST_SERIALIZATION_SPLIT_MEMBER();
+
   unordered_map<string, int> dictionary;
   vector<string> words;
 };
diff --git a/extractor/vocabulary_test.cc b/extractor/vocabulary_test.cc
new file mode 100644
index 00000000..cf5e3e36
--- /dev/null
+++ b/extractor/vocabulary_test.cc
@@ -0,0 +1,45 @@
+#include <gtest/gtest.h>
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <boost/archive/text_iarchive.hpp>
+#include <boost/archive/text_oarchive.hpp>
+
+#include "vocabulary.h"
+
+using namespace std;
+using namespace ::testing;
+namespace ar = boost::archive;
+
+namespace extractor {
+namespace {
+
+TEST(VocabularyTest, TestIndexes) {
+  Vocabulary vocab;  // Ids are expected in first-seen order, starting at 0.
+  EXPECT_EQ(0, vocab.GetTerminalIndex("zero"));
+  EXPECT_EQ("zero", vocab.GetTerminalValue(0));
+
+  EXPECT_EQ(1, vocab.GetTerminalIndex("one"));
+  EXPECT_EQ("one", vocab.GetTerminalValue(1));
+}
+
+TEST(VocabularyTest, TestSerialization) {
+  Vocabulary original;
+  EXPECT_EQ(0, original.GetTerminalIndex("zero"));
+  EXPECT_EQ("zero", original.GetTerminalValue(0));
+
+  stringstream buffer(ios_base::in | ios_base::out);  // In-memory round trip.
+  ar::text_oarchive writer(buffer, ar::no_header);
+  writer << original;
+
+  Vocabulary restored;
+  ar::text_iarchive reader(buffer, ar::no_header);
+  reader >> restored;
+
+  EXPECT_EQ(original, restored);
+}
+
+} // namespace
+} // namespace extractor
-- 
cgit v1.2.3


From 304103565d3b79cc9c98c1ee0356a8824fc982c2 Mon Sep 17 00:00:00 2001
From: Paul Baltescu <pauldb89@gmail.com>
Date: Tue, 26 Nov 2013 16:03:16 +0000
Subject: Write config file after compiling data structures.

---
 extractor/compile.cc | 30 +++++++++++++++++++++++-------
 extractor/extract.cc |  4 ++--
 2 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'extractor/compile.cc')

diff --git a/extractor/compile.cc b/extractor/compile.cc
index 9e8044ad..3ee668ce 100644
--- a/extractor/compile.cc
+++ b/extractor/compile.cc
@@ -30,6 +30,8 @@ int main(int argc, char** argv) {
     ("bitext,b", po::value<string>(), "Parallel text (source ||| target)")
     ("alignment,a", po::value<string>()->required(), "Bitext word alignment")
     ("output,o", po::value<string>()->required(), "Output path")
+    ("config,c", po::value<string>()->required(),
+        "Path where the config file will be generated")
     ("frequent", po::value<int>()->default_value(100),
         "Number of precomputed frequent patterns")
     ("super_frequent", po::value<int>()->default_value(10),
@@ -82,8 +84,12 @@ int main(int argc, char** argv) {
     target_data_array = make_shared<DataArray>(vm["target"].as<string>());
   }
 
+  ofstream config_stream(vm["config"].as<string>());
+
   Clock::time_point start_write = Clock::now();
-  ofstream target_fstream((output_dir / fs::path("target.bin")).string());
+  string target_path = (output_dir / fs::path("target.bin")).string();
+  config_stream << "target = " << target_path << endl;
+  ofstream target_fstream(target_path);
   ar::binary_oarchive target_stream(target_fstream);
   target_stream << *target_data_array;
   Clock::time_point stop_write = Clock::now();
@@ -100,7 +106,9 @@ int main(int argc, char** argv) {
       make_shared<SuffixArray>(source_data_array);
 
   start_write = Clock::now();
-  ofstream source_fstream((output_dir / fs::path("source.bin")).string());
+  string source_path = (output_dir / fs::path("source.bin")).string();
+  config_stream << "source = " << source_path << endl;
+  ofstream source_fstream(source_path);
   ar::binary_oarchive output_stream(source_fstream);
   output_stream << *source_suffix_array;
   stop_write = Clock::now();
@@ -116,7 +124,9 @@ int main(int argc, char** argv) {
       make_shared<Alignment>(vm["alignment"].as<string>());
 
   start_write = Clock::now();
-  ofstream alignment_fstream((output_dir / fs::path("alignment.bin")).string());
+  string alignment_path = (output_dir / fs::path("alignment.bin")).string();
+  config_stream << "alignment = " << alignment_path << endl;
+  ofstream alignment_fstream(alignment_path);
   ar::binary_oarchive alignment_stream(alignment_fstream);
   alignment_stream << *alignment;
   stop_write = Clock::now();
@@ -126,7 +136,7 @@ int main(int argc, char** argv) {
   cerr << "Reading alignment took "
        << GetDuration(start_time, stop_time) << " seconds" << endl;
 
-  shared_ptr<Vocabulary> vocabulary;
+  shared_ptr<Vocabulary> vocabulary = make_shared<Vocabulary>();
 
   start_time = Clock::now();
   cerr << "Precomputing collocations..." << endl;
@@ -142,11 +152,15 @@ int main(int argc, char** argv) {
       vm["min_frequency"].as<int>());
 
   start_write = Clock::now();
-  ofstream precomp_fstream((output_dir / fs::path("precomp.bin")).string());
+  string precomputation_path = (output_dir / fs::path("precomp.bin")).string();
+  config_stream << "precomputation = " << precomputation_path << endl;
+  ofstream precomp_fstream(precomputation_path);
   ar::binary_oarchive precomp_stream(precomp_fstream);
   precomp_stream << precomputation;
 
-  ofstream vocab_fstream((output_dir / fs::path("vocab.bin")).string());
+  string vocabulary_path = (output_dir / fs::path("vocab.bin")).string();
+  config_stream << "vocabulary = " << vocabulary_path << endl;
+  ofstream vocab_fstream(vocabulary_path);
   ar::binary_oarchive vocab_stream(vocab_fstream);
   vocab_stream << *vocabulary;
   stop_write = Clock::now();
@@ -161,7 +175,9 @@ int main(int argc, char** argv) {
   TranslationTable table(source_data_array, target_data_array, alignment);
 
   start_write = Clock::now();
-  ofstream table_fstream((output_dir / fs::path("bilex.bin")).string());
+  string table_path = (output_dir / fs::path("bilex.bin")).string();
+  config_stream << "ttable = " << table_path << endl;
+  ofstream table_fstream(table_path);
   ar::binary_oarchive table_stream(table_fstream);
   table_stream << table;
   stop_write = Clock::now();
diff --git a/extractor/extract.cc b/extractor/extract.cc
index 2d5831fa..387cbe9b 100644
--- a/extractor/extract.cc
+++ b/extractor/extract.cc
@@ -72,7 +72,7 @@ int main(int argc, char** argv) {
   po::options_description cmdline_options("Command line options");
   cmdline_options.add_options()
     ("help", "Show available options")
-    ("config", po::value<string>()->required(), "Path to config file");
+    ("config,c", po::value<string>()->required(), "Path to config file");
   cmdline_options.add(general_options);
 
   po::options_description config_options("Config file options");
@@ -236,7 +236,7 @@ int main(int argc, char** argv) {
     Grammar grammar = extractor.GetGrammar(
         sentences[i], blacklisted_sentence_ids);
     ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
-    // output << grammar;
+    output << grammar;
   }
 
   for (size_t i = 0; i < sentences.size(); ++i) {
-- 
cgit v1.2.3