diff options
author | Chris Dyer <redpony@gmail.com> | 2015-03-04 21:36:51 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2015-03-04 21:36:51 -0500 |
commit | bafcdb6a06c0fe9db64f703e954b299cd8f39289 (patch) | |
tree | d1399e1b879aaa37b4c574133c6ac50d040d77c2 | |
parent | 6cbdccb1d9a62b2723b962ba4b6e66f1631e48d3 (diff) |
remove perfect hash function stuff, add zip option to extract.cc
-rw-r--r-- | extractor/CMakeLists.txt | 17 | ||||
-rw-r--r-- | extractor/extract.cc | 14 | ||||
-rwxr-xr-x | training/mira/mira.py | 59 | ||||
-rw-r--r-- | utils/CMakeLists.txt | 1 | ||||
-rw-r--r-- | utils/perfect_hash.cc | 37 | ||||
-rw-r--r-- | utils/perfect_hash.h | 27 | ||||
-rw-r--r-- | utils/phmt.cc | 42 |
7 files changed, 49 insertions, 148 deletions
diff --git a/extractor/CMakeLists.txt b/extractor/CMakeLists.txt index d37c6b95..1cf8533b 100644 --- a/extractor/CMakeLists.txt +++ b/extractor/CMakeLists.txt @@ -1,9 +1,17 @@ -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../utils) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/features) -find_package(GTest) -find_package(GMock) +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +find_package(GTest REQUIRED) +find_package(GMock REQUIRED) if(GMOCK_FOUND) + #rule_factory_test.cc set(TEST_SRCS alignment_test.cc data_array_test.cc fast_intersector_test.cc @@ -15,7 +23,6 @@ if(GMOCK_FOUND) precomputation_test.cc rule_extractor_helper_test.cc rule_extractor_test.cc - rule_factory_test.cc scorer2_test.cc suffix_array_sampler_test.cc suffix_array_test.cc @@ -56,7 +63,7 @@ target_link_libraries(run_extractor extractor ${Boost_LIBRARIES} z) set(extract_SRCS extract.cc) add_executable(extract ${extract_SRCS}) -target_link_libraries(extract extractor ${Boost_LIBRARIES} z) +target_link_libraries(extract extractor utils ${Boost_LIBRARIES} z) set(extractor_STAT_SRCS diff --git a/extractor/extract.cc b/extractor/extract.cc index e5b6f6ff..08f209cc 100644 --- a/extractor/extract.cc +++ b/extractor/extract.cc @@ -14,6 +14,7 @@ const unsigned omp_get_num_threads() { return 1; } #endif +#include "filelib.h" #include "alignment.h" #include "data_array.h" #include "features/count_source_target.h" @@ -42,8 +43,8 @@ using namespace features; using namespace std; // Returns the file path in which a given grammar should be written. -fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) { - string file_name = "grammar." + to_string(file_number); +fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) { + string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : ""); return grammar_path / file_name; } @@ -58,6 +59,7 @@ int main(int argc, char** argv) { ("threads,t", po::value<int>()->required()->default_value(1), threads_option.c_str()) ("grammars,g", po::value<string>()->required(), "Grammars output path") + ("gzip,z", "Gzip grammars") ("max_rule_span", po::value<int>()->default_value(15), "Maximum rule span") ("max_rule_symbols", po::value<int>()->default_value(5), @@ -205,12 +207,14 @@ int main(int argc, char** argv) { vm["max_rule_symbols"].as<int>(), vm["max_samples"].as<int>(), vm["tight_phrases"].as<bool>()); + const bool use_zip = vm.count("gzip"); // Creates the grammars directory if it doesn't exist. fs::path grammar_path = vm["grammars"].as<string>(); if (!fs::is_directory(grammar_path)) { fs::create_directory(grammar_path); } + grammar_path = fs::canonical(grammar_path); // Reads all sentences for which we extract grammar rules (the paralellization // is simplified if we read all sentences upfront). @@ -239,12 +243,12 @@ int main(int argc, char** argv) { } Grammar grammar = extractor.GetGrammar( sentences[i], blacklisted_sentence_ids); - ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); - output << grammar; + WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str()); + *wf.stream() << grammar; } for (size_t i = 0; i < sentences.size(); ++i) { - cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i) << " id=\"" + cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i, use_zip) << " id=\"" << i << "\"> " << sentences[i] << " </seg> " << suffixes[i] << endl; } diff --git a/training/mira/mira.py b/training/mira/mira.py index ec9c2d64..ccecb10e 100755 --- a/training/mira/mira.py +++ b/training/mira/mira.py @@ -5,12 +5,6 @@ import argparse import logging import random, time import gzip, itertools -try: - import cdec.score -except ImportError: - sys.stderr.write('Could not import pycdec, see cdec/python/README.md for details\n') - sys.exit(1) -have_mpl = True try: import matplotlib matplotlib.use('Agg') @@ -19,26 +13,33 @@ except ImportError: have_mpl = False #mira run script -#requires pycdec to be built, since it is used for scoring hypothesis -#translations. #matplotlib must be installed for graphing to work #email option requires mail +script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) +fast_score_binary = script_dir+'/../../mteval/fast_score' +dlog = None + #scoring function using pycdec scoring def fast_score(hyps, refs, metric): - scorer = cdec.score.Scorer(metric) - logging.info('loaded {0} references for scoring with {1}'.format( - len(refs), metric)) - if metric=='BLEU': - logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n') - metric = 'IBM_BLEU' - elif metric=='COMBI': - logging.warning('COMBI metric is no longer supported, switching to ' - 'COMB:TER=-0.5;BLEU=0.5') - metric = 'COMB:TER=-0.5;BLEU=0.5' - stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs)) - logging.info('Score={} ({})'.format(stats.score, stats.detail)) - return stats.score + #scorer = cdec.score.Scorer(metric) + #logging.info('loaded {0} references for scoring with {1}'.format( + # len(refs), metric)) + #if metric=='BLEU': + # logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n') + # metric = 'IBM_BLEU' + #elif metric=='COMBI': + # logging.warning('COMBI metric is no longer supported, switching to ' + # 'COMB:TER=-0.5;BLEU=0.5') + # metric = 'COMB:TER=-0.5;BLEU=0.5' + #stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs)) + #logging.info('Score={} ({})'.format(stats.score, stats.detail)) + #return stats.score + cmd = ('{0} -r{1} -i {2} -m {3}').format(fast_score_binary, refs, hyps, metric) + proc = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) + o = proc.stdout.readline().strip() + print 'res: ', o + return float(o) #create new parallel input file in output directory in sgml format def enseg(devfile, newfile, gprefix): @@ -81,7 +82,6 @@ def enseg(devfile, newfile, gprefix): def main(): #set logging to write all info messages to stderr logging.basicConfig(level=logging.INFO) - script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) if not have_mpl: logging.warning('Failed to import matplotlib, graphs will not be generated.') @@ -373,7 +373,8 @@ def optimize(args, script_dir, dev_size): cmd = parallel_cmd + ' ' + decoder_cmd logging.info('OPTIMIZATION COMMAND: {}'.format(cmd)) - + + global dlog dlog = open(decoderlog,'w') runf = open(runfile,'w') retries = 0 @@ -420,7 +421,7 @@ def optimize(args, script_dir, dev_size): bests = [] fears = [] for line in run: - hope, best, fear = line.split(' ||| ') + hope, best, fear = line.strip().split(' ||| ') hopes.append(hope) bests.append(best) fears.append(fear) @@ -436,14 +437,10 @@ def optimize(args, script_dir, dev_size): gzip_file(runfile) gzip_file(decoderlog) - ref_file = open(refs) - references = [line.split(' ||| ') for line in - ref_file.read().strip().split('\n')] - ref_file.close() #get score for best hypothesis translations, hope and fear translations - dec_score = fast_score(bests, references, args.metric) - dec_score_h = fast_score(hopes, references, args.metric) - dec_score_f = fast_score(fears, references, args.metric) + dec_score = fast_score(runfile+'.B', refs, args.metric) + dec_score_h = fast_score(runfile+'.H', refs, args.metric) + dec_score_f = fast_score(runfile+'.F', refs, args.metric) hope_best_fear['hope'].append(dec_score) hope_best_fear['best'].append(dec_score_h) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 17436263..59fb644d 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -70,7 +70,6 @@ set(utils_STAT_SRCS named_enum.h null_deleter.h null_traits.h - perfect_hash.h prob.h sampler.h semiring.h diff --git a/utils/perfect_hash.cc b/utils/perfect_hash.cc deleted file mode 100644 index 706e2741..00000000 --- a/utils/perfect_hash.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "config.h" - -#ifdef HAVE_CMPH - -#include "perfect_hash.h" - -#include <cstdio> -#include <iostream> - -using namespace std; - -PerfectHashFunction::~PerfectHashFunction() { - cmph_destroy(mphf_); -} - -PerfectHashFunction::PerfectHashFunction(const string& fname) { - FILE* f = fopen(fname.c_str(), "r"); - if (!f) { - cerr << "Failed to open file " << fname << " for reading: cannot load hash function.\n"; - abort(); - } - mphf_ = cmph_load(f); - if (!mphf_) { - cerr << "cmph_load failed on " << fname << "!\n"; - abort(); - } -} - -size_t PerfectHashFunction::operator()(const string& key) const { - return cmph_search(mphf_, &key[0], key.size()); -} - -size_t PerfectHashFunction::number_of_keys() const { - return cmph_size(mphf_); -} - -#endif diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h deleted file mode 100644 index 8c12c9f0..00000000 --- a/utils/perfect_hash.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef PERFECT_HASH_MAP_H_ -#define PERFECT_HASH_MAP_H_ - -#include <vector> -#include <boost/utility.hpp> - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#ifdef HAVE_CMPH -#include "cmph.h" -#endif - -class PerfectHashFunction : boost::noncopyable { - public: - explicit PerfectHashFunction(const std::string& fname); - ~PerfectHashFunction(); - size_t operator()(const std::string& key) const; - size_t number_of_keys() const; - private: -#ifdef HAVE_CMPH - cmph_t *mphf_; -#endif -}; - -#endif diff --git a/utils/phmt.cc b/utils/phmt.cc deleted file mode 100644 index b17febf6..00000000 --- a/utils/phmt.cc +++ /dev/null @@ -1,42 +0,0 @@ -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#ifndef HAVE_CMPH -int main() { - return 0; -} -#else - -#include <iostream> -#include "weights.h" -#include "fdict.h" - -using namespace std; - -int main(int argc, char** argv) { - if (argc != 2) { cerr << "Usage: " << argv[0] << " file.mphf\n"; return 1; } - FD::EnableHash(argv[1]); - cerr << "Number of keys: " << FD::NumFeats() << endl; - cerr << "LexFE = " << FD::Convert("LexFE") << endl; - cerr << "LexEF = " << FD::Convert("LexEF") << endl; - { - vector<weight_t> v(FD::NumFeats()); - v[FD::Convert("LexFE")] = 1.0; - v[FD::Convert("LexEF")] = 0.5; - cerr << "Writing...\n"; - Weights::WriteToFile("weights.bin", v); - cerr << "Done.\n"; - } - { - vector<weight_t> v(FD::NumFeats()); - cerr << "Reading...\n"; - Weights::InitFromFile("weights.bin", &v); - cerr << "Done.\n"; - assert(v[FD::Convert("LexFE")] == 1.0); - assert(v[FD::Convert("LexEF")] == 0.5); - } -} - -#endif - |