diff options
| author | Chris Dyer <redpony@gmail.com> | 2015-04-02 00:50:04 -0400 | 
|---|---|---|
| committer | Chris Dyer <redpony@gmail.com> | 2015-04-02 00:50:04 -0400 | 
| commit | 5ee02ce1602f2fce6d5af5db93c2278fe6c9ede5 (patch) | |
| tree | 7ebad8dd99e38d190c579f425c3eb959363e96e5 /extractor | |
| parent | e7d77de8a9b9929b22fc6562f88f3668900f9662 (diff) | |
| parent | 737ed7a7f932b1a7e40d2755bcdee6bc0aa2de63 (diff) | |
Merge pull request #70 from redpony/cmake
Cmake
Diffstat (limited to 'extractor')
| -rw-r--r-- | extractor/CMakeLists.txt | 141 | ||||
| -rw-r--r-- | extractor/Makefile.am | 192 | ||||
| -rw-r--r-- | extractor/extract.cc | 14 | ||||
| -rw-r--r-- | extractor/scorer2_test.cc (renamed from extractor/scorer_test.cc) | 0 | 
4 files changed, 150 insertions, 197 deletions
| diff --git a/extractor/CMakeLists.txt b/extractor/CMakeLists.txt new file mode 100644 index 00000000..93a524cc --- /dev/null +++ b/extractor/CMakeLists.txt @@ -0,0 +1,141 @@ +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../utils) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/features) + +find_package(OpenMP) +if (OPENMP_FOUND) +    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +find_package(GTest) +find_package(GMock) +if(GTEST_FOUND) + if(GMOCK_FOUND) +  #rule_factory_test.cc +  set(TEST_SRCS alignment_test.cc +    data_array_test.cc +    fast_intersector_test.cc +    grammar_extractor_test.cc +    matchings_finder_test.cc +    matchings_sampler_test.cc +    phrase_location_sampler_test.cc +    phrase_test.cc +    precomputation_test.cc +    rule_extractor_helper_test.cc +    rule_extractor_test.cc +    scorer2_test.cc +    suffix_array_sampler_test.cc +    suffix_array_test.cc +    target_phrase_extractor_test.cc +    translation_table_test.cc +    vocabulary_test.cc) +  INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) +  INCLUDE_DIRECTORIES(${GMOCK_INCLUDE_DIRS}) +  foreach(testSrc ${TEST_SRCS}) +    #Extract the filename without an extension (NAME_WE) +    get_filename_component(testName ${testSrc} NAME_WE) + +    #Add compile target +    add_executable(${testName} ${testSrc}) + +    #link to Boost libraries AND your targets and dependencies +    target_link_libraries(${testName} extractor ${GMOCK_BOTH_LIBRARIES} ${GTEST_BOTH_LIBRARIES} ${Boost_LIBRARIES} ${ZLIB_LIBRARIES}) + +    #I like to move testing binaries into a testBin directory +    set_target_properties(${testName} PROPERTIES  +      RUNTIME_OUTPUT_DIRECTORY  ${CMAKE_CURRENT_SOURCE_DIR}) + +    #Finally add it to test execution -  +    #Notice the WORKING_DIRECTORY and COMMAND +    add_test(NAME ${testName} COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/${testName}  +       WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +  endforeach(testSrc) + endif(GMOCK_FOUND) +endif(GTEST_FOUND) + +set(sacompile_SRCS sacompile.cc) +add_executable(sacompile ${sacompile_SRCS}) +target_link_libraries(sacompile extractor ${Boost_LIBRARIES} z) + +set(run_extractor_SRCS run_extractor.cc) +add_executable(run_extractor ${run_extractor_SRCS}) +target_link_libraries(run_extractor extractor ${Boost_LIBRARIES} z) + + +set(extract_SRCS extract.cc) +add_executable(extract ${extract_SRCS}) +target_link_libraries(extract extractor utils ${Boost_LIBRARIES} z) + + +set(extractor_STAT_SRCS +    alignment.cc +    backoff_sampler.cc +    data_array.cc +    fast_intersector.cc +    features/count_source_target.cc +    features/feature.cc +    features/is_source_singleton.cc +    features/is_source_target_singleton.cc +    features/max_lex_source_given_target.cc +    features/max_lex_target_given_source.cc +    features/sample_source_count.cc +    features/target_given_source_coherent.cc +    features/count_source_target.h +    features/feature.h +    features/is_source_singleton.h +    features/is_source_target_singleton.h +    features/max_lex_source_given_target.h +    features/max_lex_target_given_source.h +    features/sample_source_count.h +    features/target_given_source_coherent.h +    grammar.cc +    grammar_extractor.cc +    matchings_finder.cc +    matchings_sampler.cc +    matchings_trie.cc +    phrase.cc +    phrase_builder.cc +    phrase_location.cc +    phrase_location_sampler.cc +    precomputation.cc +    rule.cc +    rule_extractor.cc +    rule_extractor_helper.cc +    rule_factory.cc +    scorer.cc +    suffix_array.cc +    suffix_array_sampler.cc +    target_phrase_extractor.cc +    time_util.cc +    translation_table.cc +    vocabulary.cc +    alignment.h +    backoff_sampler.h +    data_array.h +    fast_intersector.h +    grammar.h +    grammar_extractor.h +    matchings_finder.h +    matchings_sampler.h +    matchings_trie.h +    phrase.h +    phrase_builder.h +    phrase_location.h +    phrase_location_sampler.h +    precomputation.h +    rule.h +    rule_extractor.h +    rule_extractor_helper.h +    rule_factory.h +    sampler.h +    scorer.h +    suffix_array.h +    suffix_array_sampler.h +    target_phrase_extractor.h +    time_util.h +    translation_table.h +    vocabulary.h) + +add_library(extractor STATIC ${extractor_STAT_SRCS}) + diff --git a/extractor/Makefile.am b/extractor/Makefile.am deleted file mode 100644 index a406d9dc..00000000 --- a/extractor/Makefile.am +++ /dev/null @@ -1,192 +0,0 @@ - -bin_PROGRAMS = sacompile run_extractor extract - -EXTRA_PROGRAMS = alignment_test \ -    data_array_test \ -    fast_intersector_test \ -    feature_count_source_target_test \ -    feature_is_source_singleton_test \ -    feature_is_source_target_singleton_test \ -    feature_max_lex_source_given_target_test \ -    feature_max_lex_target_given_source_test \ -    feature_sample_source_count_test \ -    feature_target_given_source_coherent_test \ -    grammar_extractor_test \ -    matchings_finder_test \ -    matchings_sampler_test \ -    phrase_location_sampler_test \ -    phrase_test \ -    precomputation_test \ -    rule_extractor_helper_test \ -    rule_extractor_test \ -    rule_factory_test \ -    scorer_test \ -    suffix_array_sampler_test \ -    suffix_array_test \ -    target_phrase_extractor_test \ -    translation_table_test \ -    vocabulary_test - -if HAVE_GTEST -  RUNNABLE_TESTS = alignment_test \ -    data_array_test \ -    fast_intersector_test \ -    feature_count_source_target_test \ -    feature_is_source_singleton_test \ -    feature_is_source_target_singleton_test \ -    feature_max_lex_source_given_target_test \ -    feature_max_lex_target_given_source_test \ -    feature_sample_source_count_test \ -    feature_target_given_source_coherent_test \ -    grammar_extractor_test \ -    matchings_finder_test \ -    matchings_sampler_test \ -    phrase_location_sampler_test \ -    phrase_test \ -    precomputation_test \ -    rule_extractor_helper_test \ -    rule_extractor_test \ -    rule_factory_test \ -    scorer_test \ -    suffix_array_sampler_test \ -    suffix_array_test \ -    target_phrase_extractor_test \ -    translation_table_test \ -    vocabulary_test -endif - -noinst_PROGRAMS = $(RUNNABLE_TESTS) - -TESTS = $(RUNNABLE_TESTS) - -alignment_test_SOURCES = alignment_test.cc -alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -data_array_test_SOURCES = data_array_test.cc -data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -fast_intersector_test_SOURCES = fast_intersector_test.cc -fast_intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -feature_count_source_target_test_SOURCES = features/count_source_target_test.cc -feature_count_source_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_is_source_singleton_test_SOURCES = features/is_source_singleton_test.cc -feature_is_source_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_is_source_target_singleton_test_SOURCES = features/is_source_target_singleton_test.cc -feature_is_source_target_singleton_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_max_lex_source_given_target_test_SOURCES = features/max_lex_source_given_target_test.cc -feature_max_lex_source_given_target_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -feature_max_lex_target_given_source_test_SOURCES = features/max_lex_target_given_source_test.cc -feature_max_lex_target_given_source_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -feature_sample_source_count_test_SOURCES = features/sample_source_count_test.cc -feature_sample_source_count_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -feature_target_given_source_coherent_test_SOURCES = features/target_given_source_coherent_test.cc -feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -grammar_extractor_test_SOURCES = grammar_extractor_test.cc -grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -matchings_finder_test_SOURCES = matchings_finder_test.cc -matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -matchings_sampler_test_SOURCES = matchings_sampler_test.cc -matchings_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -phrase_location_sampler_test_SOURCES = phrase_location_sampler_test.cc -phrase_location_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -phrase_test_SOURCES = phrase_test.cc -phrase_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -precomputation_test_SOURCES = precomputation_test.cc -precomputation_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -rule_extractor_helper_test_SOURCES = rule_extractor_helper_test.cc -rule_extractor_helper_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -rule_extractor_test_SOURCES = rule_extractor_test.cc -rule_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -rule_factory_test_SOURCES = rule_factory_test.cc -rule_factory_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -scorer_test_SOURCES = scorer_test.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -suffix_array_sampler_test_SOURCES = suffix_array_sampler_test.cc -suffix_array_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -suffix_array_test_SOURCES = suffix_array_test.cc -suffix_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc -target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -translation_table_test_SOURCES = translation_table_test.cc -translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -vocabulary_test_SOURCES = vocabulary_test.cc -vocabulary_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a - -noinst_LIBRARIES = libextractor.a - -sacompile_SOURCES = sacompile.cc -sacompile_LDADD = libextractor.a -run_extractor_SOURCES = run_extractor.cc -run_extractor_LDADD = libextractor.a -extract_SOURCES = extract.cc -extract_LDADD = libextractor.a - -libextractor_a_SOURCES = \ -  alignment.cc \ -  backoff_sampler.cc \ -  data_array.cc \ -  fast_intersector.cc \ -  features/count_source_target.cc \ -  features/feature.cc \ -  features/is_source_singleton.cc \ -  features/is_source_target_singleton.cc \ -  features/max_lex_source_given_target.cc \ -  features/max_lex_target_given_source.cc \ -  features/sample_source_count.cc \ -  features/target_given_source_coherent.cc \ -  features/count_source_target.h \ -  features/feature.h \ -  features/is_source_singleton.h \ -  features/is_source_target_singleton.h \ -  features/max_lex_source_given_target.h \ -  features/max_lex_target_given_source.h \ -  features/sample_source_count.h \ -  features/target_given_source_coherent.h \ -  grammar.cc \ -  grammar_extractor.cc \ -  matchings_finder.cc \ -  matchings_sampler.cc \ -  matchings_trie.cc \ -  phrase.cc \ -  phrase_builder.cc \ -  phrase_location.cc \ -  phrase_location_sampler.cc \ -  precomputation.cc \ -  rule.cc \ -  rule_extractor.cc \ -  rule_extractor_helper.cc \ -  rule_factory.cc \ -  scorer.cc \ -  suffix_array.cc \ -  suffix_array_sampler.cc \ -  target_phrase_extractor.cc \ -  time_util.cc \ -  translation_table.cc \ -  vocabulary.cc \ -  alignment.h \ -  backoff_sampler.h \ -  data_array.h \ -  fast_intersector.h \ -  grammar.h \ -  grammar_extractor.h \ -  matchings_finder.h \ -  matchings_sampler.h \ -  matchings_trie.h \ -  phrase.h \ -  phrase_builder.h \ -  phrase_location.h \ -  phrase_location_sampler.h \ -  precomputation.h \ -  rule.h \ -  rule_extractor.h \ -  rule_extractor_helper.h \ -  rule_factory.h \ -  sampler.h \ -  scorer.h \ -  suffix_array.h \ -  suffix_array_sampler.h \ -  target_phrase_extractor.h \ -  time_util.h \ -  translation_table.h \ -  vocabulary.h - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(OPENMP_CXXFLAGS) $(GTEST_CPPFLAGS) $(GMOCK_CPPFLAGS) -AM_LDFLAGS = $(OPENMP_CXXFLAGS) diff --git a/extractor/extract.cc b/extractor/extract.cc index e5b6f6ff..08f209cc 100644 --- a/extractor/extract.cc +++ b/extractor/extract.cc @@ -14,6 +14,7 @@    const unsigned omp_get_num_threads() { return 1; }  #endif +#include "filelib.h"  #include "alignment.h"  #include "data_array.h"  #include "features/count_source_target.h" @@ -42,8 +43,8 @@ using namespace features;  using namespace std;  // Returns the file path in which a given grammar should be written. -fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) { -  string file_name = "grammar." + to_string(file_number); +fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) { +  string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : "");    return grammar_path / file_name;  } @@ -58,6 +59,7 @@ int main(int argc, char** argv) {      ("threads,t", po::value<int>()->required()->default_value(1),       threads_option.c_str())      ("grammars,g", po::value<string>()->required(), "Grammars output path") +    ("gzip,z", "Gzip grammars")      ("max_rule_span", po::value<int>()->default_value(15),          "Maximum rule span")      ("max_rule_symbols", po::value<int>()->default_value(5), @@ -205,12 +207,14 @@ int main(int argc, char** argv) {        vm["max_rule_symbols"].as<int>(),        vm["max_samples"].as<int>(),        vm["tight_phrases"].as<bool>()); +  const bool use_zip = vm.count("gzip");    // Creates the grammars directory if it doesn't exist.    fs::path grammar_path = vm["grammars"].as<string>();    if (!fs::is_directory(grammar_path)) {      fs::create_directory(grammar_path);    } +  grammar_path = fs::canonical(grammar_path);    // Reads all sentences for which we extract grammar rules (the paralellization    // is simplified if we read all sentences upfront). @@ -239,12 +243,12 @@ int main(int argc, char** argv) {      }      Grammar grammar = extractor.GetGrammar(          sentences[i], blacklisted_sentence_ids); -    ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); -    output << grammar; +    WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str()); +    *wf.stream() << grammar;    }    for (size_t i = 0; i < sentences.size(); ++i) { -    cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i) << " id=\"" +    cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i, use_zip) << " id=\""           << i << "\"> " << sentences[i] << " </seg> " << suffixes[i] << endl;    } diff --git a/extractor/scorer_test.cc b/extractor/scorer2_test.cc index bf77f7ef..bf77f7ef 100644 --- a/extractor/scorer_test.cc +++ b/extractor/scorer2_test.cc | 
