remove perfect hash function stuff, add zip option to extract.cc

author: Chris Dyer <redpony@gmail.com> 2015-03-04 21:36:51 -0500
committer: Chris Dyer <redpony@gmail.com> 2015-03-04 21:36:51 -0500
commit: 06f1b6aedbd96d652d5337cf1b93b51e4dd9a620 (patch)
tree: a736ff77bbe61810c71b421e9ff0e0ecd7638446
parent: 95183b5760d7f168ae093ae8f9b29740628a278f (diff)
7 files changed, 49 insertions, 148 deletions
diff --git a/extractor/CMakeLists.txt b/extractor/CMakeLists.txt
index d37c6b95..1cf8533b 100644
--- a/extractor/CMakeLists.txt
+++ b/extractor/CMakeLists.txt
@@ -1,9 +1,17 @@
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../utils)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/)
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/features)
 
-find_package(GTest)
-find_package(GMock)
+find_package(OpenMP)
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+find_package(GTest REQUIRED)
+find_package(GMock REQUIRED)
 if(GMOCK_FOUND)
+  #rule_factory_test.cc
   set(TEST_SRCS alignment_test.cc
     data_array_test.cc
     fast_intersector_test.cc
@@ -15,7 +23,6 @@ if(GMOCK_FOUND)
     precomputation_test.cc
     rule_extractor_helper_test.cc
     rule_extractor_test.cc
-    rule_factory_test.cc
     scorer2_test.cc
     suffix_array_sampler_test.cc
     suffix_array_test.cc
@@ -56,7 +63,7 @@ target_link_libraries(run_extractor extractor ${Boost_LIBRARIES} z)
 
 set(extract_SRCS extract.cc)
 add_executable(extract ${extract_SRCS})
-target_link_libraries(extract extractor ${Boost_LIBRARIES} z)
+target_link_libraries(extract extractor utils ${Boost_LIBRARIES} z)
 
 
 set(extractor_STAT_SRCS
diff --git a/extractor/extract.cc b/extractor/extract.cc
index e5b6f6ff..08f209cc 100644
--- a/extractor/extract.cc
+++ b/extractor/extract.cc
@@ -14,6 +14,7 @@
   const unsigned omp_get_num_threads() { return 1; }
 #endif
 
+#include "filelib.h"
 #include "alignment.h"
 #include "data_array.h"
 #include "features/count_source_target.h"
@@ -42,8 +43,8 @@ using namespace features;
 using namespace std;
 
 // Returns the file path in which a given grammar should be written.
-fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) {
-  string file_name = "grammar." + to_string(file_number);
+fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number, bool use_zip) {
+  string file_name = "grammar." + to_string(file_number) + (use_zip ? ".gz" : "");
   return grammar_path / file_name;
 }
 
@@ -58,6 +59,7 @@ int main(int argc, char** argv) {
     ("threads,t", po::value<int>()->required()->default_value(1),
      threads_option.c_str())
     ("grammars,g", po::value<string>()->required(), "Grammars output path")
+    ("gzip,z", "Gzip grammars")
     ("max_rule_span", po::value<int>()->default_value(15),
         "Maximum rule span")
     ("max_rule_symbols", po::value<int>()->default_value(5),
@@ -205,12 +207,14 @@ int main(int argc, char** argv) {
       vm["max_rule_symbols"].as<int>(),
       vm["max_samples"].as<int>(),
       vm["tight_phrases"].as<bool>());
+  const bool use_zip = vm.count("gzip");
 
   // Creates the grammars directory if it doesn't exist.
   fs::path grammar_path = vm["grammars"].as<string>();
   if (!fs::is_directory(grammar_path)) {
     fs::create_directory(grammar_path);
   }
+  grammar_path = fs::canonical(grammar_path);
 
   // Reads all sentences for which we extract grammar rules (the paralellization
   // is simplified if we read all sentences upfront).
@@ -239,12 +243,12 @@ int main(int argc, char** argv) {
     }
     Grammar grammar = extractor.GetGrammar(
         sentences[i], blacklisted_sentence_ids);
-    ofstream output(GetGrammarFilePath(grammar_path, i).c_str());
-    output << grammar;
+    WriteFile wf(GetGrammarFilePath(grammar_path, i, use_zip).c_str());
+    *wf.stream() << grammar;
   }
 
   for (size_t i = 0; i < sentences.size(); ++i) {
-    cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i) << " id=\""
+    cout << "<seg grammar=" << GetGrammarFilePath(grammar_path, i, use_zip) << " id=\""
          << i << "\"> " << sentences[i] << " </seg> " << suffixes[i] << endl;
   }
 
diff --git a/training/mira/mira.py b/training/mira/mira.py
index ec9c2d64..ccecb10e 100755
--- a/training/mira/mira.py
+++ b/training/mira/mira.py
@@ -5,12 +5,6 @@ import argparse
 import logging
 import random, time
 import gzip, itertools
-try:
-  import cdec.score
-except ImportError:
-  sys.stderr.write('Could not import pycdec, see cdec/python/README.md for details\n')
-  sys.exit(1)
-have_mpl = True
 try: 
   import matplotlib
   matplotlib.use('Agg')
@@ -19,26 +13,33 @@ except ImportError:
   have_mpl = False
 
 #mira run script
-#requires pycdec to be built, since it is used for scoring hypothesis
-#translations.
 #matplotlib must be installed for graphing to work
 #email option requires mail
 
+script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
+fast_score_binary = script_dir+'/../../mteval/fast_score'
+dlog = None
+
 #scoring function using pycdec scoring
 def fast_score(hyps, refs, metric):
-  scorer = cdec.score.Scorer(metric)
-  logging.info('loaded {0} references for scoring with {1}'.format(
-                len(refs), metric))
-  if metric=='BLEU':
-    logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n')
-    metric = 'IBM_BLEU'
-  elif metric=='COMBI':
-    logging.warning('COMBI metric is no longer supported, switching to '
-                    'COMB:TER=-0.5;BLEU=0.5')
-    metric = 'COMB:TER=-0.5;BLEU=0.5'
-  stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs))
-  logging.info('Score={} ({})'.format(stats.score, stats.detail))
-  return stats.score
+  #scorer = cdec.score.Scorer(metric)
+  #logging.info('loaded {0} references for scoring with {1}'.format(
+  #              len(refs), metric))
+  #if metric=='BLEU':
+  #  logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n')
+  #  metric = 'IBM_BLEU'
+  #elif metric=='COMBI':
+  #  logging.warning('COMBI metric is no longer supported, switching to '
+  #                  'COMB:TER=-0.5;BLEU=0.5')
+  #  metric = 'COMB:TER=-0.5;BLEU=0.5'
+  #stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs))
+  #logging.info('Score={} ({})'.format(stats.score, stats.detail))
+  #return stats.score
+  cmd = ('{0} -r{1} -i {2} -m {3}').format(fast_score_binary, refs, hyps, metric)
+  proc = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE)
+  o = proc.stdout.readline().strip()
+  print 'res: ', o
+  return float(o)
 
 #create new parallel input file in output directory in sgml format
 def enseg(devfile, newfile, gprefix):
@@ -81,7 +82,6 @@ def enseg(devfile, newfile, gprefix):
 def main():
   #set logging to write all info messages to stderr
   logging.basicConfig(level=logging.INFO)
-  script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
   if not have_mpl:
     logging.warning('Failed to import matplotlib, graphs will not be generated.')
 
@@ -373,7 +373,8 @@ def optimize(args, script_dir, dev_size):
     
     cmd = parallel_cmd + ' ' + decoder_cmd
     logging.info('OPTIMIZATION COMMAND: {}'.format(cmd))
-   
+  
+    global dlog 
     dlog = open(decoderlog,'w')
     runf = open(runfile,'w')
     retries = 0
@@ -420,7 +421,7 @@ def optimize(args, script_dir, dev_size):
     bests = []
     fears = []
     for line in run:
-      hope, best, fear = line.split(' ||| ')
+      hope, best, fear = line.strip().split(' ||| ')
       hopes.append(hope)
       bests.append(best)
       fears.append(fear)
@@ -436,14 +437,10 @@ def optimize(args, script_dir, dev_size):
     gzip_file(runfile)
     gzip_file(decoderlog)
 
-    ref_file = open(refs)
-    references = [line.split(' ||| ') for line in 
-                  ref_file.read().strip().split('\n')]
-    ref_file.close()
     #get score for best hypothesis translations, hope and fear translations
-    dec_score = fast_score(bests, references, args.metric)
-    dec_score_h = fast_score(hopes, references, args.metric)
-    dec_score_f = fast_score(fears, references, args.metric)
+    dec_score = fast_score(runfile+'.B', refs, args.metric)
+    dec_score_h = fast_score(runfile+'.H', refs, args.metric)
+    dec_score_f = fast_score(runfile+'.F', refs, args.metric)
     
     hope_best_fear['hope'].append(dec_score)
     hope_best_fear['best'].append(dec_score_h)
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
index 17436263..59fb644d 100644
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -70,7 +70,6 @@ set(utils_STAT_SRCS
     named_enum.h
     null_deleter.h
     null_traits.h
-    perfect_hash.h
     prob.h
     sampler.h
     semiring.h
diff --git a/utils/perfect_hash.cc b/utils/perfect_hash.cc
deleted file mode 100644
index 706e2741..00000000
--- a/utils/perfect_hash.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "config.h"
-
-#ifdef HAVE_CMPH
-
-#include "perfect_hash.h"
-
-#include <cstdio>
-#include <iostream>
-
-using namespace std;
-
-PerfectHashFunction::~PerfectHashFunction() {
-  cmph_destroy(mphf_);
-}
-
-PerfectHashFunction::PerfectHashFunction(const string& fname) {
-  FILE* f = fopen(fname.c_str(), "r");
-  if (!f) {
-    cerr << "Failed to open file " << fname << " for reading: cannot load hash function.\n";
-    abort();
-  }
-  mphf_ = cmph_load(f);
-  if (!mphf_) {
-    cerr << "cmph_load failed on " << fname << "!\n";
-    abort();
-  }
-}
-
-size_t PerfectHashFunction::operator()(const string& key) const {
-  return cmph_search(mphf_, &key[0], key.size());
-}
-
-size_t PerfectHashFunction::number_of_keys() const {
-  return cmph_size(mphf_);
-}
-
-#endif
diff --git a/utils/perfect_hash.h b/utils/perfect_hash.h
deleted file mode 100644
index 8c12c9f0..00000000
--- a/utils/perfect_hash.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef PERFECT_HASH_MAP_H_
-#define PERFECT_HASH_MAP_H_
-
-#include <vector>
-#include <boost/utility.hpp>
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#ifdef HAVE_CMPH
-#include "cmph.h"
-#endif
-
-class PerfectHashFunction : boost::noncopyable {
- public:
-  explicit PerfectHashFunction(const std::string& fname);
-  ~PerfectHashFunction();
-  size_t operator()(const std::string& key) const;
-  size_t number_of_keys() const;
- private:
-#ifdef HAVE_CMPH
-  cmph_t *mphf_;
-#endif
-};
-
-#endif
diff --git a/utils/phmt.cc b/utils/phmt.cc
deleted file mode 100644
index b17febf6..00000000
--- a/utils/phmt.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#ifndef HAVE_CMPH
-int main() {
-  return 0;
-}
-#else
-
-#include <iostream>
-#include "weights.h"
-#include "fdict.h"
-
-using namespace std;
-
-int main(int argc, char** argv) {
-  if (argc != 2) { cerr << "Usage: " << argv[0] << " file.mphf\n"; return 1; }
-  FD::EnableHash(argv[1]);
-  cerr << "Number of keys: " << FD::NumFeats() << endl;
-  cerr << "LexFE = " << FD::Convert("LexFE") << endl;
-  cerr << "LexEF = " << FD::Convert("LexEF") << endl;
-  {
-    vector<weight_t> v(FD::NumFeats());
-    v[FD::Convert("LexFE")] = 1.0;
-    v[FD::Convert("LexEF")] = 0.5;
-    cerr << "Writing...\n";
-    Weights::WriteToFile("weights.bin", v);
-    cerr << "Done.\n";
-  }
-  {
-    vector<weight_t> v(FD::NumFeats());
-    cerr << "Reading...\n";
-    Weights::InitFromFile("weights.bin", &v);
-    cerr << "Done.\n";
-    assert(v[FD::Convert("LexFE")] == 1.0);
-    assert(v[FD::Convert("LexEF")] == 0.5);
-  }
-}
-
-#endif
-
author	Chris Dyer <redpony@gmail.com>	2015-03-04 21:36:51 -0500
committer	Chris Dyer <redpony@gmail.com>	2015-03-04 21:36:51 -0500
commit	06f1b6aedbd96d652d5337cf1b93b51e4dd9a620 (patch)
tree	a736ff77bbe61810c71b421e9ff0e0ecd7638446
parent	95183b5760d7f168ae093ae8f9b29740628a278f (diff)