diff options
Diffstat (limited to 'extractor')
-rw-r--r-- | extractor/CMakeLists.txt | 17 | ||||
-rw-r--r-- | extractor/extract.cc | 14 |
2 files changed, 21 insertions, 10 deletions
diff --git a/extractor/CMakeLists.txt b/extractor/CMakeLists.txt index d37c6b95..1cf8533b 100644 --- a/extractor/CMakeLists.txt +++ b/extractor/CMakeLists.txt @@ -1,9 +1,17 @@ -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../utils) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/features) -find_package(GTest) -find_package(GMock) +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +find_package(GTest REQUIRED) +find_package(GMock REQUIRED) if(GMOCK_FOUND) + #rule_factory_test.cc set(TEST_SRCS alignment_test.cc data_array_test.cc fast_intersector_test.cc @@ -15,7 +23,6 @@ if(GMOCK_FOUND) precomputation_test.cc rule_extractor_helper_test.cc rule_extractor_test.cc - rule_factory_test.cc scorer2_test.cc suffix_array_sampler_test.cc suffix_array_test.cc @@ -56,7 +63,7 @@ target_link_libraries(run_extractor extractor ${Boost_LIBRARIES} z) set(extract_SRCS extract.cc) add_executable(extract ${extract_SRCS}) -target_link_libraries(extract extractor ${Boost_LIBRARIES} z) +target_link_libraries(extract extractor utils ${Boost_LIBRARIES} z) set(extractor_STAT_SRCS diff --git a/extractor/extract.cc b/extractor/extract.cc index e5b6f6ff..08f209cc 100644 --- a/extractor/extract.cc +++ b/extractor/extract.cc @@ -14,6 +14,7 @@ const unsigned omp_get_num_threads() { return 1; } #endif +#include "filelib.h" #include "alignment.h" #include "data_array.h" #include "features/count_source_target.h" @@ -42,8 +43,8 @@ using namespace features; using namespace std; // Returns the file path in which a given grammar should be written. -fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) { - string file_name = "grammar." + to_string(file_number); +fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) { + string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : ""); return grammar_path / file_name; } @@ -58,6 +59,7 @@ int main(int argc, char** argv) { ("threads,t", po::value<int>()->required()->default_value(1), threads_option.c_str()) ("grammars,g", po::value<string>()->required(), "Grammars output path") + ("gzip,z", "Gzip grammars") ("max_rule_span", po::value<int>()->default_value(15), "Maximum rule span") ("max_rule_symbols", po::value<int>()->default_value(5), @@ -205,12 +207,14 @@ int main(int argc, char** argv) { vm["max_rule_symbols"].as<int>(), vm["max_samples"].as<int>(), vm["tight_phrases"].as<bool>()); + const bool use_zip = vm.count("gzip"); // Creates the grammars directory if it doesn't exist. fs::path grammar_path = vm["grammars"].as<string>(); if (!fs::is_directory(grammar_path)) { fs::create_directory(grammar_path); } + grammar_path = fs::canonical(grammar_path); // Reads all sentences for which we extract grammar rules (the paralellization // is simplified if we read all sentences upfront). @@ -239,12 +243,12 @@ int main(int argc, char** argv) { } Grammar grammar = extractor.GetGrammar( sentences[i], blacklisted_sentence_ids); - ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); - output << grammar; + WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str()); + *wf.stream() << grammar; } for (size_t i = 0; i < sentences.size(); ++i) { - cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i) << " id=\"" + cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i, use_zip) << " id=\"" << i << "\"> " << sentences[i] << " </seg> " << suffixes[i] << endl; } |