remove perfect hash function stuff, add zip option to extract.cc

author: Chris Dyer <redpony@gmail.com> 2015-03-04 21:36:51 -0500
committer: Chris Dyer <redpony@gmail.com> 2015-03-04 21:36:51 -0500
commit: bafcdb6a06c0fe9db64f703e954b299cd8f39289 (patch)
tree: d1399e1b879aaa37b4c574133c6ac50d040d77c2 /extractor
parent: 6cbdccb1d9a62b2723b962ba4b6e66f1631e48d3 (diff)
2 files changed, 21 insertions, 10 deletions
diff --git a/extractor/CMakeLists.txt b/extractor/CMakeLists.txt
index d37c6b95..1cf8533b 100644
--- a/extractor/CMakeLists.txt
+++ b/extractor/CMakeLists.txt
@@ -1,9 +1,17 @@
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../utils)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/)
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/features)
 
-find_package(GTest)
-find_package(GMock)
+find_package(OpenMP)
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+find_package(GTest REQUIRED)
+find_package(GMock REQUIRED)
 if(GMOCK_FOUND)
+  #rule_factory_test.cc
   set(TEST_SRCS alignment_test.cc
     data_array_test.cc
     fast_intersector_test.cc
@@ -15,7 +23,6 @@ if(GMOCK_FOUND)
     precomputation_test.cc
     rule_extractor_helper_test.cc
     rule_extractor_test.cc
-    rule_factory_test.cc
     scorer2_test.cc
     suffix_array_sampler_test.cc
     suffix_array_test.cc
@@ -56,7 +63,7 @@ target_link_libraries(run_extractor extractor ${Boost_LIBRARIES} z)
 
 set(extract_SRCS extract.cc)
 add_executable(extract ${extract_SRCS})
-target_link_libraries(extract extractor ${Boost_LIBRARIES} z)
+target_link_libraries(extract extractor utils ${Boost_LIBRARIES} z)
 
 
 set(extractor_STAT_SRCS
diff --git a/extractor/extract.cc b/extractor/extract.cc
index e5b6f6ff..08f209cc 100644
--- a/extractor/extract.cc
+++ b/extractor/extract.cc
@@ -14,6 +14,7 @@
   const unsigned omp_get_num_threads() { return 1; }
 #endif
 
+#include "filelib.h"
 #include "alignment.h"
 #include "data_array.h"
 #include "features/count_source_target.h"
@@ -42,8 +43,8 @@ using namespace features;
 using namespace std;
 
 // Returns the file path in which a given grammar should be written.
-fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) {
-  string file_name = "grammar." + to_string(file_number);
+fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) {
+  string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : "");
   return grammar_path / file_name;
 }
 
@@ -58,6 +59,7 @@ int main(int argc, char** argv) {
     ("threads,t", po::value<int>()->required()->default_value(1),
      threads_option.c_str())
     ("grammars,g", po::value<string>()->required(), "Grammars output path")
+    ("gzip,z", "Gzip grammars")
     ("max_rule_span", po::value<int>()->default_value(15),
         "Maximum rule span")
     ("max_rule_symbols", po::value<int>()->default_value(5),
@@ -205,12 +207,14 @@ int main(int argc, char** argv) {
       vm["max_rule_symbols"].as<int>(),
       vm["max_samples"].as<int>(),
       vm["tight_phrases"].as<bool>());
+  const bool use_zip = vm.count("gzip");
 
   // Creates the grammars directory if it doesn't exist.
   fs::path grammar_path = vm["grammars"].as<string>();
   if (!fs::is_directory(grammar_path)) {
     fs::create_directory(grammar_path);
   }
+  grammar_path = fs::canonical(grammar_path);
 
   // Reads all sentences for which we extract grammar rules (the paralellization
   // is simplified if we read all sentences upfront).
@@ -239,12 +243,12 @@ int main(int argc, char** argv) {
     }
     Grammar grammar = extractor.GetGrammar(
         sentences[i], blacklisted_sentence_ids);
-    ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
-    output << grammar;
+    WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str());
+    *wf.stream() << grammar;
   }
 
   for (size_t i = 0; i < sentences.size(); ++i) {
-    cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i) << " id=\""
+    cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i, use_zip) << " id=\""
          << i << "\"> " << sentences[i] << " </seg> " << suffixes[i] << endl;
   }
author	Chris Dyer <redpony@gmail.com>	2015-03-04 21:36:51 -0500
committer	Chris Dyer <redpony@gmail.com>	2015-03-04 21:36:51 -0500
commit	bafcdb6a06c0fe9db64f703e954b299cd8f39289 (patch)
tree	d1399e1b879aaa37b4c574133c6ac50d040d77c2 /extractor
parent	6cbdccb1d9a62b2723b962ba4b6e66f1631e48d3 (diff)