diff options
Diffstat (limited to 'extractor')
-rw-r--r-- | extractor/CMakeLists.txt | 141 | ||||
-rw-r--r-- | extractor/Makefile.am | 192 | ||||
-rw-r--r-- | extractor/extract.cc | 14 | ||||
-rw-r--r-- | extractor/scorer2_test.cc (renamed from extractor/scorer_test.cc) | 0 |
4 files changed, 150 insertions, 197 deletions
diff --git a/extractor/CMakeLists.txt b/extractor/CMakeLists.txt new file mode 100644 index 00000000..93a524cc --- /dev/null +++ b/extractor/CMakeLists.txt @@ -0,0 +1,141 @@ +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../utils) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/features) + +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +find_package(GTest) +find_package(GMock) +if(GTEST_FOUND) + if(GMOCK_FOUND) + #rule_factory_test.cc + set(TEST_SRCS alignment_test.cc + data_array_test.cc + fast_intersector_test.cc + grammar_extractor_test.cc + matchings_finder_test.cc + matchings_sampler_test.cc + phrase_location_sampler_test.cc + phrase_test.cc + precomputation_test.cc + rule_extractor_helper_test.cc + rule_extractor_test.cc + scorer2_test.cc + suffix_array_sampler_test.cc + suffix_array_test.cc + target_phrase_extractor_test.cc + translation_table_test.cc + vocabulary_test.cc) + INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) + INCLUDE_DIRECTORIES(${GMOCK_INCLUDE_DIRS}) + foreach(testSrc ${TEST_SRCS}) + #Extract the filename without an extension (NAME_WE) + get_filename_component(testName ${testSrc} NAME_WE) + + #Add compile target + add_executable(${testName} ${testSrc}) + + #link to Boost libraries AND your targets and dependencies + target_link_libraries(${testName} extractor ${GMOCK_BOTH_LIBRARIES} ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} ${ZLIB_LIBRARIES}) + + #I like to move testing binaries into a testBin directory + set_target_properties(${testName} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + #Finally add it to test execution - + #Notice the WORKING_DIRECTORY and COMMAND + add_test(NAME ${testName} COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/${testName} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endforeach(testSrc) + endif(GMOCK_FOUND) +endif(GTEST_FOUND) + +set(sacompile_SRCS sacompile.cc) +add_executable(sacompile ${sacompile_SRCS}) +target_link_libraries(sacompile extractor ${Boost_LIBRARIES} z) + +set(run_extractor_SRCS run_extractor.cc) +add_executable(run_extractor ${run_extractor_SRCS}) +target_link_libraries(run_extractor extractor ${Boost_LIBRARIES} z) + + +set(extract_SRCS extract.cc) +add_executable(extract ${extract_SRCS}) +target_link_libraries(extract extractor utils ${Boost_LIBRARIES} z) + + +set(extractor_STAT_SRCS + alignment.cc + backoff_sampler.cc + data_array.cc + fast_intersector.cc + features/count_source_target.cc + features/feature.cc + features/is_source_singleton.cc + features/is_source_target_singleton.cc + features/max_lex_source_given_target.cc + features/max_lex_target_given_source.cc + features/sample_source_count.cc + features/target_given_source_coherent.cc + features/count_source_target.h + features/feature.h + features/is_source_singleton.h + features/is_source_target_singleton.h + features/max_lex_source_given_target.h + features/max_lex_target_given_source.h + features/sample_source_count.h + features/target_given_source_coherent.h + grammar.cc + grammar_extractor.cc + matchings_finder.cc + matchings_sampler.cc + matchings_trie.cc + phrase.cc + phrase_builder.cc + phrase_location.cc + phrase_location_sampler.cc + precomputation.cc + rule.cc + rule_extractor.cc + rule_extractor_helper.cc + rule_factory.cc + scorer.cc + suffix_array.cc + suffix_array_sampler.cc + target_phrase_extractor.cc + time_util.cc + translation_table.cc + vocabulary.cc + alignment.h + backoff_sampler.h + data_array.h + fast_intersector.h + grammar.h + grammar_extractor.h + matchings_finder.h + matchings_sampler.h + matchings_trie.h + phrase.h + phrase_builder.h + phrase_location.h + phrase_location_sampler.h + precomputation.h + rule.h + rule_extractor.h + rule_extractor_helper.h + rule_factory.h + sampler.h + scorer.h + suffix_array.h + suffix_array_sampler.h + target_phrase_extractor.h + time_util.h + translation_table.h + vocabulary.h) + +add_library(extractor STATIC ${extractor_STAT_SRCS}) + diff --git a/extractor/Makefile.am b/extractor/Makefile.am deleted file mode 100644 index a406d9dc..00000000 --- a/extractor/Makefile.am +++ /dev/null @@ -1,192 +0,0 @@ - -bin_PROGRAMS = sacompile run_extractor extract - -EXTRA_PROGRAMS = alignment_test \ - data_array_test \ - fast_intersector_test \ - feature_count_source_target_test \ - feature_is_source_singleton_test \ - feature_is_source_target_singleton_test \ - feature_max_lex_source_given_target_test \ - feature_max_lex_target_given_source_test \ - feature_sample_source_count_test \ - feature_target_given_source_coherent_test \ - grammar_extractor_test \ - matchings_finder_test \ - matchings_sampler_test \ - phrase_location_sampler_test \ - phrase_test \ - precomputation_test \ - rule_extractor_helper_test \ - rule_extractor_test \ - rule_factory_test \ - scorer_test \ - suffix_array_sampler_test \ - suffix_array_test \ - target_phrase_extractor_test \ - translation_table_test \ - vocabulary_test - -if HAVE_GTEST - RUNNABLE_TESTS = alignment_test \ - data_array_test \ - fast_intersector_test \ - feature_count_source_target_test \ - feature_is_source_singleton_test \ - feature_is_source_target_singleton_test \ - feature_max_lex_source_given_target_test \ - feature_max_lex_target_given_source_test \ - feature_sample_source_count_test \ - feature_target_given_source_coherent_test \ - grammar_extractor_test \ - matchings_finder_test \ - matchings_sampler_test \ - phrase_location_sampler_test \ - phrase_test \ - precomputation_test \ - rule_extractor_helper_test \ - rule_extractor_test \ - rule_factory_test \ - scorer_test \ - suffix_array_sampler_test \ - suffix_array_test \ - target_phrase_extractor_test \ - translation_table_test \ - vocabulary_test -endif - -noinst_PROGRAMS = $(RUNNABLE_TESTS) - -TESTS = $(RUNNABLE_TESTS) - -alignment_test_SOURCES = alignment_test.cc -alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -data_array_test_SOURCES = data_array_test.cc -data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -fast_intersector_test_SOURCES = fast_intersector_test.cc -fast_intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -feature_count_source_target_test_SOURCES = features/count_source_target_test.cc -feature_count_source_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_is_source_singleton_test_SOURCES = features/is_source_singleton_test.cc -feature_is_source_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_is_source_target_singleton_test_SOURCES = features/is_source_target_singleton_test.cc -feature_is_source_target_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_max_lex_source_given_target_test_SOURCES = features/max_lex_source_given_target_test.cc -feature_max_lex_source_given_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -feature_max_lex_target_given_source_test_SOURCES = features/max_lex_target_given_source_test.cc -feature_max_lex_target_given_source_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -feature_sample_source_count_test_SOURCES = features/sample_source_count_test.cc -feature_sample_source_count_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_target_given_source_coherent_test_SOURCES = features/target_given_source_coherent_test.cc -feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -grammar_extractor_test_SOURCES = grammar_extractor_test.cc -grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -matchings_finder_test_SOURCES = matchings_finder_test.cc -matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -matchings_sampler_test_SOURCES = matchings_sampler_test.cc -matchings_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -phrase_location_sampler_test_SOURCES = phrase_location_sampler_test.cc -phrase_location_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -phrase_test_SOURCES = phrase_test.cc -phrase_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -precomputation_test_SOURCES = precomputation_test.cc -precomputation_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -rule_extractor_helper_test_SOURCES = rule_extractor_helper_test.cc -rule_extractor_helper_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -rule_extractor_test_SOURCES = rule_extractor_test.cc -rule_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -rule_factory_test_SOURCES = rule_factory_test.cc -rule_factory_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -scorer_test_SOURCES = scorer_test.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -suffix_array_sampler_test_SOURCES = suffix_array_sampler_test.cc -suffix_array_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -suffix_array_test_SOURCES = suffix_array_test.cc -suffix_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc -target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -translation_table_test_SOURCES = translation_table_test.cc -translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -vocabulary_test_SOURCES = vocabulary_test.cc -vocabulary_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a - -noinst_LIBRARIES = libextractor.a - -sacompile_SOURCES = sacompile.cc -sacompile_LDADD = libextractor.a -run_extractor_SOURCES = run_extractor.cc -run_extractor_LDADD = libextractor.a -extract_SOURCES = extract.cc -extract_LDADD = libextractor.a - -libextractor_a_SOURCES = \ - alignment.cc \ - backoff_sampler.cc \ - data_array.cc \ - fast_intersector.cc \ - features/count_source_target.cc \ - features/feature.cc \ - features/is_source_singleton.cc \ - features/is_source_target_singleton.cc \ - features/max_lex_source_given_target.cc \ - features/max_lex_target_given_source.cc \ - features/sample_source_count.cc \ - features/target_given_source_coherent.cc \ - features/count_source_target.h \ - features/feature.h \ - features/is_source_singleton.h \ - features/is_source_target_singleton.h \ - features/max_lex_source_given_target.h \ - features/max_lex_target_given_source.h \ - features/sample_source_count.h \ - features/target_given_source_coherent.h \ - grammar.cc \ - grammar_extractor.cc \ - matchings_finder.cc \ - matchings_sampler.cc \ - matchings_trie.cc \ - phrase.cc \ - phrase_builder.cc \ - phrase_location.cc \ - phrase_location_sampler.cc \ - precomputation.cc \ - rule.cc \ - rule_extractor.cc \ - rule_extractor_helper.cc \ - rule_factory.cc \ - scorer.cc \ - suffix_array.cc \ - suffix_array_sampler.cc \ - target_phrase_extractor.cc \ - time_util.cc \ - translation_table.cc \ - vocabulary.cc \ - alignment.h \ - backoff_sampler.h \ - data_array.h \ - fast_intersector.h \ - grammar.h \ - grammar_extractor.h \ - matchings_finder.h \ - matchings_sampler.h \ - matchings_trie.h \ - phrase.h \ - phrase_builder.h \ - phrase_location.h \ - phrase_location_sampler.h \ - precomputation.h \ - rule.h \ - rule_extractor.h \ - rule_extractor_helper.h \ - rule_factory.h \ - sampler.h \ - scorer.h \ - suffix_array.h \ - suffix_array_sampler.h \ - target_phrase_extractor.h \ - time_util.h \ - translation_table.h \ - vocabulary.h - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(OPENMP_CXXFLAGS) $(GTEST_CPPFLAGS) $(GMOCK_CPPFLAGS) -AM_LDFLAGS = $(OPENMP_CXXFLAGS) diff --git a/extractor/extract.cc b/extractor/extract.cc index e5b6f6ff..08f209cc 100644 --- a/extractor/extract.cc +++ b/extractor/extract.cc @@ -14,6 +14,7 @@ const unsigned omp_get_num_threads() { return 1; } #endif +#include "filelib.h" #include "alignment.h" #include "data_array.h" #include "features/count_source_target.h" @@ -42,8 +43,8 @@ using namespace features; using namespace std; // Returns the file path in which a given grammar should be written. -fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) { - string file_name = "grammar." + to_string(file_number); +fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) { + string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : ""); return grammar_path / file_name; } @@ -58,6 +59,7 @@ int main(int argc, char** argv) { ("threads,t", po::value<int>()->required()->default_value(1), threads_option.c_str()) ("grammars,g", po::value<string>()->required(), "Grammars output path") + ("gzip,z", "Gzip grammars") ("max_rule_span", po::value<int>()->default_value(15), "Maximum rule span") ("max_rule_symbols", po::value<int>()->default_value(5), @@ -205,12 +207,14 @@ int main(int argc, char** argv) { vm["max_rule_symbols"].as<int>(), vm["max_samples"].as<int>(), vm["tight_phrases"].as<bool>()); + const bool use_zip = vm.count("gzip"); // Creates the grammars directory if it doesn't exist. fs::path grammar_path = vm["grammars"].as<string>(); if (!fs::is_directory(grammar_path)) { fs::create_directory(grammar_path); } + grammar_path = fs::canonical(grammar_path); // Reads all sentences for which we extract grammar rules (the paralellization // is simplified if we read all sentences upfront). @@ -239,12 +243,12 @@ int main(int argc, char** argv) { } Grammar grammar = extractor.GetGrammar( sentences[i], blacklisted_sentence_ids); - ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); - output << grammar; + WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str()); + *wf.stream() << grammar; } for (size_t i = 0; i < sentences.size(); ++i) { - cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i) << " id=\"" + cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i, use_zip) << " id=\"" << i << "\"> " << sentences[i] << " </seg> " << suffixes[i] << endl; } diff --git a/extractor/scorer_test.cc b/extractor/scorer2_test.cc index bf77f7ef..bf77f7ef 100644 --- a/extractor/scorer_test.cc +++ b/extractor/scorer2_test.cc |