From 3b004be48979da652cc64e7a01e685190eb79498 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 6 Jul 2011 20:41:52 -0400
Subject: tool to compute feature expectations in translation charts

---
 training/Makefile.am             |   4 +
 training/feature_expectations.cc | 232 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 236 insertions(+)
 create mode 100644 training/feature_expectations.cc

(limited to 'training')
diff --git a/training/Makefile.am b/training/Makefile.am
index 0d9085e4..e075e417 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -14,6 +14,7 @@ bin_PROGRAMS = \
   mpi_batch_optimize \
   mpi_em_optimize \
   compute_cllh \
+  feature_expectations \
   augment_grammar
 
 noinst_PROGRAMS = \
@@ -28,6 +29,9 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval
 mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
 mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
+feature_expectations_SOURCES = feature_expectations.cc
+feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc
 mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc
new file mode 100644
index 00000000..f1a85495
--- /dev/null
+++ b/training/feature_expectations.cc
@@ -0,0 +1,232 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <tr1/memory>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "online_optimizer.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+struct FComp {  // comparator over indices into w_: orders them by descending absolute value of w_[index]
+  const vector<double>& w_;  // held by reference, not copied; the vector must outlive this comparator
+  FComp(const vector<double>& w) : w_(w) {}
+  bool operator()(int a, int b) const {
+    return fabs(w_[a]) > fabs(w_[b]);  // strict '>' so equal magnitudes compare as equivalent
+  }
+};
+
+void ShowFeatures(const vector<double>& w) {  // writes one "<FD::Convert(index)> <value>" line to stdout per nonzero entry of w
+  vector<int> fnums(w.size());
+  for (int i = 0; i < w.size(); ++i)
+    fnums[i] = i;  // start from the identity ordering of the indices of w
+  sort(fnums.begin(), fnums.end(), FComp(w));  // largest absolute value first
+  for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) {
+    if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl;  // zero entries are skipped; FD::Convert presumably yields the feature name - confirm
+  }
+}
+
+void ReadConfig(const string& ini, vector<string>* out) {  // appends every line read from the file named by `ini` to *out
+  ReadFile rf(ini);
+  istream& in = *rf.stream();
+  while(in) {
+    string line;
+    getline(in, line);
+    if (!in) continue;  // the read failed: drop `line`; the while condition then ends the loop
+    out->push_back(line);
+  }
+}
+
+void StoreConfig(const vector<string>& cfg, istringstream* o) {  // loads *o with the entries of cfg, one per line
+  ostringstream os;
+  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }  // every entry, including the last, is followed by a newline
+  o->str(os.str());  // replaces whatever text *o held before
+}
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses options into *conf; returns false after printing usage to stderr
+  po::options_description opts("Configuration options");  // this group is accepted both on the command line and in the --config file
+  opts.add_options()
+        ("input,i",po::value<string>(),"Corpus of source language sentences")
+        ("weights,w",po::value<string>(),"Input feature weights file")
+        ("decoder_config,c",po::value<string>(), "cdec.ini file");
+  po::options_description clo("Command line options");  // this group is accepted on the command line only
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): the stream is not checked for a failed open
+    po::store(po::parse_config_file(config, dconfig_options), *conf);  // stored after the command-line values
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) {  // "weights" is not required here
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {  // keeps lines whose 0-based index i has i % size == rank
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  int id = 0;  // 0-based index of the line about to be read; counts skipped lines too
+  while(in) {
+    getline(in, line);
+    if (!in) break;  // stop at the first failed read
+    if (id % size == rank) {
+      c->push_back(line);
+      order->push_back(id);  // index of the line in the file, stored parallel to *c
+    }
+    ++id;
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {  // sums per-sentence normalized feature expectations into acc_exp
+  void Reset() {  // clears the running totals only; `state` and cur_model_exp are left as they are
+    acc_exp.clear();
+    total_complete = 0;
+  } 
+
+  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
+    cur_model_exp.clear();
+    state = 1;  // 1 = a sentence has started and no translation forest has been seen yet
+  }
+
+  // compute model expectations, denominator of objective
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 1);  // must follow NotifyDecodingStart for the same sentence
+    state = 2;  // 2 = translation forest received for the current sentence
+    const prob_t z = InsideOutside<prob_t,
+                                   EdgeProb,
+                                   SparseVector<prob_t>,
+                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+    cur_model_exp /= z;  // scale by the value InsideOutside returned (presumably the forest's total weight - confirm)
+    acc_exp += cur_model_exp;
+  }
+
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    cerr << "IGNORING ALIGNMENT FOREST!\n";  // only logs; the forest does not contribute to acc_exp
+  }
+
+  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+    if (state == 2) {  // count the sentence only if a translation forest was received for it
+      ++total_complete;
+    }
+  }
+
+  void GetExpectations(SparseVector<double>* g) const {  // overwrites *g with a copy of acc_exp stored as doubles
+    g->clear();
+    for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it)
+      g->set_value(it->first, it->second);
+  }
+
+  int total_complete;  // sentences that reached state 2; NOTE(review): no initializer, so indeterminate until Reset() runs
+  SparseVector<prob_t> cur_model_exp;  // expectations for the sentence currently being decoded
+  SparseVector<prob_t> acc_exp;  // sum of cur_model_exp over sentences since the last Reset()
+  int state;  // NOTE(review): no initializer, so indeterminate until NotifyDecodingStart() runs
+};
+
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {  // declares std::plus over SparseVector<double> commutative for Boost.MPI
+  template<>
+  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > 
+    : mpl::true_ { };
+} } // end namespace boost::mpi
+#endif
+
+int main(int argc, char** argv) {  // decodes this rank's share of the input and prints the summed expectations through ShowFeatures
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size(); 
+  const int rank = world.rank();
+#else
+  const int size = 1;  // build without HAVE_MPI: one process takes every input line
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;
+
+  // load initial weights
+  Weights weights;
+  if (conf.count("weights"))
+    weights.InitFromFile(conf["weights"].as<string>());
+
+  vector<string> corpus;
+  vector<int> ids;
+  ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids);
+  assert(corpus.size() > 0);  // NOTE(review): fires on any rank that received no lines, e.g. more ranks than input lines
+
+  vector<string> cdec_ini;
+  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
+  istringstream ini;
+  StoreConfig(cdec_ini, &ini);
+  Decoder decoder(&ini);
+  if (decoder.GetConf()["input"].as<string>() != "-") {  // sentences are passed to Decode() below, not read by the decoder
+    cerr << "cdec.ini must not set an input file\n";
+    return 1;
+  }
+
+  SparseVector<double> x;
+  weights.InitSparseVector(&x);
+  TrainingObserver observer;
+
+  weights.InitFromVector(x);  // NOTE(review): writes back the vector just read from `weights`; looks redundant - confirm
+  vector<double> lambdas;
+  weights.InitVector(&lambdas);
+  decoder.SetWeights(lambdas);
+  observer.Reset();
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    int id = ids[i];  // index of this sentence in the full input file
+    decoder.SetId(id);
+    decoder.Decode(corpus[i], &observer);  // the observer callbacks accumulate the expectations
+  }
+  SparseVector<double> local_exps, exps;
+  observer.GetExpectations(&local_exps);
+#ifdef HAVE_MPI
+  reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0);  // sum across ranks; the result is delivered to rank 0
+#else
+  exps.swap(local_exps);
+#endif
+
+  weights.InitFromVector(exps);  // reuses `weights` and `lambdas` to turn the sparse expectations into a dense vector
+  weights.InitVector(&lambdas);
+  ShowFeatures(lambdas);  // NOTE(review): not guarded by rank, so every rank executes this call
+
+  return 0;
+}
-- 
cgit v1.2.3


From 251da4347ea356f799e6c227ac8cf541c0cef2f2 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 13 Sep 2011 17:36:23 +0100
Subject: get rid of bad Weights class so it no longer keeps a copy of a vector
 inside it

---
 decoder/decoder.cc                    |  64 ++++++++---------
 decoder/decoder.h                     |   9 ++-
 mira/kbest_mira.cc                    |  62 ++++-------------
 pro-train/mr_pro_map.cc               |   8 +--
 pro-train/mr_pro_reduce.cc            |  16 ++---
 training/Makefile.am                  |   8 ---
 training/augment_grammar.cc           |   4 +-
 training/collapse_weights.cc          |   6 +-
 training/compute_cllh.cc              |  23 +++---
 training/grammar_convert.cc           |   8 +--
 training/mpi_batch_optimize.cc        | 127 ++++++++--------------------------
 training/mpi_online_optimize.cc       |  69 +++++++-----------
 training/mr_optimize_reduce.cc        |  19 ++---
 utils/fdict.h                         |   2 +
 utils/phmt.cc                         |   8 +--
 utils/weights.cc                      |  75 ++++++++++++--------
 utils/weights.h                       |  22 +++---
 vest/mr_vest_generate_mapper_input.cc |   6 +-
 18 files changed, 201 insertions(+), 335 deletions(-)

(limited to 'training')

diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 25eb2de4..4d4b6245 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -159,8 +159,7 @@ struct RescoringPass {
   shared_ptr<ModelSet> models;
   shared_ptr<IntersectionConfiguration> inter_conf;
   vector<const FeatureFunction*> ffs;
-  shared_ptr<Weights> w;      // null == use previous weights
-  vector<double> weight_vector;
+  shared_ptr<vector<weight_t> > weight_vector;
   int fid_summary;            // 0 == no summary feature
   double density_prune;       // 0 == don't density prune
   double beam_prune;          // 0 == don't beam prune
@@ -169,7 +168,7 @@ struct RescoringPass {
 ostream& operator<<(ostream& os, const RescoringPass& rp) {
   os << "[num_fn=" << rp.ffs.size();
   if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; }
-  if (rp.w) os << " new_weights";
+  //if (rp.weight_vector.size() > 0) os << " new_weights";
   if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary);
   if (rp.density_prune) os << " density_prune=" << rp.density_prune;
   if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune;
@@ -181,13 +180,8 @@ struct DecoderImpl {
   DecoderImpl(po::variables_map& conf, int argc, char** argv, istream* cfg);
   ~DecoderImpl();
   bool Decode(const string& input, DecoderObserver*);
-  void SetWeights(const vector<double>& weights) {
-    init_weights = weights;
-    for (int i = 0; i < rescoring_passes.size(); ++i) {
-      if (rescoring_passes[i].models)
-        rescoring_passes[i].models->SetWeights(weights);
-      rescoring_passes[i].weight_vector = weights;
-    }
+  vector<weight_t>& CurrentWeightVector() {
+    return *rescoring_passes.back().weight_vector;
   }
   void SetId(int next_sent_id) { sent_id = next_sent_id - 1; }
 
@@ -300,8 +294,7 @@ struct DecoderImpl {
   OracleBleu oracle;
   string formalism;
   shared_ptr<Translator> translator;
-  Weights w_init_weights;      // used with initial parse
-  vector<double> init_weights; // weights used with initial parse
+  shared_ptr<vector<weight_t> > init_weights; // weights used with initial parse
   vector<shared_ptr<FeatureFunction> > pffs;
 #ifdef FSA_RESCORING
   CFGOptions cfg_options;
@@ -557,13 +550,18 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
     exit(1);
   }
 
-  // load initial feature weights (and possibly freeze feature set)
-  if (conf.count("weights")) {
-    w_init_weights.InitFromFile(str("weights",conf));
-    w_init_weights.InitVector(&init_weights);
-    init_weights.resize(FD::NumFeats());
+  // load perfect hash function for features
+  if (conf.count("cmph_perfect_feature_hash")) {
+    cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as<string>() << " ...\n";
+    FD::EnableHash(conf["cmph_perfect_feature_hash"].as<string>());
+    cerr << "  " << FD::NumFeats() << " features in map\n";
   }
 
+  // load initial feature weights (and possibly freeze feature set)
+  init_weights.reset(new vector<weight_t>);
+  if (conf.count("weights"))
+    Weights::InitFromFile(str("weights",conf), init_weights.get());
+
   // cube pruning pop-limit: we may want to configure this on a per-pass basis
   pop_limit = conf["cubepruning_pop_limit"].as<int>();
 
@@ -582,9 +580,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
       RescoringPass& rp = rescoring_passes.back();
       // only configure new weights if pass > 0, otherwise we reuse the initial chart weights
       if (nth_pass_condition && conf.count(ws)) {
-        rp.w.reset(new Weights);
-        rp.w->InitFromFile(str(ws.c_str(), conf));
-        rp.w->InitVector(&rp.weight_vector);
+        rp.weight_vector.reset(new vector<weight_t>());
+        Weights::InitFromFile(str(ws.c_str(), conf), rp.weight_vector.get());
       }
       bool has_stateful = false;
       if (conf.count(ff)) {
@@ -624,11 +621,15 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   }
 
   // set up weight vectors since later phases may reuse weights from earlier phases
-  const vector<double>* prev = &init_weights;
+  shared_ptr<vector<weight_t> > prev_weights = init_weights;
   for (int pass = 0; pass < rescoring_passes.size(); ++pass) {
     RescoringPass& rp = rescoring_passes[pass];
-    if (!rp.w) { rp.weight_vector = *prev; } else { prev = &rp.weight_vector; }
-    rp.models.reset(new ModelSet(rp.weight_vector, rp.ffs));
+    if (!rp.weight_vector) {
+      rp.weight_vector = prev_weights;
+    } else {
+      prev_weights = rp.weight_vector;
+    }
+    rp.models.reset(new ModelSet(*rp.weight_vector, rp.ffs));
     string ps = "Pass1 "; ps[4] += pass;
     if (!SILENT) show_models(conf,*rp.models,ps.c_str());
   }
@@ -650,12 +651,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
     FD::Freeze(); // this means we can't see the feature names of not-weighted features
   }
 
-  if (conf.count("cmph_perfect_feature_hash")) {
-    cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as<string>() << " ...\n";
-    FD::EnableHash(conf["cmph_perfect_feature_hash"].as<string>());
-    cerr << "  " << FD::NumFeats() << " features in map\n";
-  }
-
   // set up translation back end
   if (formalism == "scfg")
     translator.reset(new SCFGTranslator(conf));
@@ -685,7 +680,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   }
   if (!fsa_ffs.empty()) {
     cerr<<"FSA: ";
-    show_all_features(fsa_ffs,init_weights,cerr,cerr,true,true);
+    show_all_features(fsa_ffs,*init_weights,cerr,cerr,true,true);
   }
 #endif
 
@@ -733,7 +728,8 @@ bool Decoder::Decode(const string& input, DecoderObserver* o) {
   if (del) delete o;
   return res;
 }
-void Decoder::SetWeights(const vector<double>& weights) { pimpl_->SetWeights(weights); }
+vector<weight_t>& Decoder::CurrentWeightVector() { return pimpl_->CurrentWeightVector(); }
+const vector<weight_t>& Decoder::CurrentWeightVector() const { return pimpl_->CurrentWeightVector(); }
 void Decoder::SetSupplementalGrammar(const std::string& grammar_string) {
   assert(pimpl_->translator->GetDecoderType() == "SCFG");
   static_cast<SCFGTranslator&>(*pimpl_->translator).SetSupplementalGrammar(grammar_string);
@@ -774,7 +770,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   translator->ProcessMarkupHints(smeta.sgml_);
   Timer t("Translation");
   const bool translation_successful =
-    translator->Translate(to_translate, &smeta, init_weights, &forest);
+    translator->Translate(to_translate, &smeta, *init_weights, &forest);
   translator->SentenceComplete();
 
   if (!translation_successful) {
@@ -812,7 +808,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
 
   for (int pass = 0; pass < rescoring_passes.size(); ++pass) {
     const RescoringPass& rp = rescoring_passes[pass];
-    const vector<double>& cur_weights = rp.weight_vector;
+    const vector<weight_t>& cur_weights = *rp.weight_vector;
     if (!SILENT) cerr << endl << "  RESCORING PASS #" << (pass+1) << " " << rp << endl;
 #ifdef FSA_RESCORING
     cfg_options.maybe_output_source(forest);
@@ -933,7 +929,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
 #endif
   }
 
-  const vector<double>& last_weights = (rescoring_passes.empty() ? init_weights : rescoring_passes.back().weight_vector);
+  const vector<double>& last_weights = (rescoring_passes.empty() ? *init_weights : *rescoring_passes.back().weight_vector);
 
   // Oracle Rescoring
   if(get_oracle_forest) {
diff --git a/decoder/decoder.h b/decoder/decoder.h
index 5491369f..9d009ffa 100644
--- a/decoder/decoder.h
+++ b/decoder/decoder.h
@@ -7,6 +7,8 @@
 #include <boost/shared_ptr.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "weights.h"  // weight_t
+
 #undef CP_TIME
 //#define CP_TIME
 #ifdef CP_TIME
@@ -39,7 +41,12 @@ struct Decoder {
   Decoder(int argc, char** argv);
   Decoder(std::istream* config_file);
   bool Decode(const std::string& input, DecoderObserver* observer = NULL);
-  void SetWeights(const std::vector<double>& weights);
+
+  // access this to either *read* or *write* to the decoder's last
+  // weight vector (i.e., the weights of the finest pass)
+  std::vector<weight_t>& CurrentWeightVector();
+  const std::vector<weight_t>& CurrentWeightVector() const;
+
   void SetId(int id);
   ~Decoder();
   const boost::program_options::variables_map& GetConf() const { return conf; }
diff --git a/mira/kbest_mira.cc b/mira/kbest_mira.cc
index 6918a9a1..459a5e6f 100644
--- a/mira/kbest_mira.cc
+++ b/mira/kbest_mira.cc
@@ -32,21 +32,6 @@ namespace po = boost::program_options;
 bool invert_score;
 boost::shared_ptr<MT19937> rng;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
 void RandomPermutation(int len, vector<int>* p_ids) {
   vector<int>& ids = *p_ids;
   ids.resize(len);
@@ -58,21 +43,6 @@ void RandomPermutation(int len, vector<int>* p_ids) {
   }  
 }
 
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  --mid;
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
@@ -209,14 +179,16 @@ int main(int argc, char** argv) {
     cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
     return 1;
   }
-  // load initial weights
-  Weights weights;
-  weights.InitFromFile(conf["input_weights"].as<string>());
-  SparseVector<double> lambdas;
-  weights.InitSparseVector(&lambdas);
 
   ReadFile ini_rf(conf["decoder_config"].as<string>());
   Decoder decoder(ini_rf.stream());
+
+  // load initial weights
+  vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+  SparseVector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights);
+  Weights::InitSparseVector(dense_weights, &lambdas);
+
   const double max_step_size = conf["max_step_size"].as<double>();
   const double mt_metric_scale = conf["mt_metric_scale"].as<double>();
 
@@ -230,7 +202,6 @@ int main(int argc, char** argv) {
   double tot_loss = 0;
   int dots = 0;
   int cur_pass = 0;
-  vector<double> dense_weights;
   SparseVector<double> tot;
   tot += lambdas;          // initial weights
   normalizer++;            // count for initial weights
@@ -240,27 +211,22 @@ int main(int argc, char** argv) {
   vector<int> order;
   RandomPermutation(corpus.size(), &order);
   while (lcount <= max_iteration) {
-    dense_weights.clear();
-    weights.InitFromVector(lambdas);
-    weights.InitVector(&dense_weights);
-    decoder.SetWeights(dense_weights);
+    lambdas.init_vector(&dense_weights);
     if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; }
     if (corpus.size() == cur_sent) {
       cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n";
-      ShowLargestFeatures(dense_weights);
+      Weights::ShowLargestFeatures(dense_weights);
       cur_sent = 0;
       tot_loss = 0;
       dots = 0;
       ostringstream os;
       os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz";
-      weights.WriteToFile(os.str(), true, &msg);
       SparseVector<double> x = tot;
       x /= normalizer;
       ostringstream sa;
       sa << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz";
-      Weights ww;
-      ww.InitFromVector(x);
-      ww.WriteToFile(sa.str(), true, &msga);
+      x.init_vector(&dense_weights);
+      Weights::WriteToFile(os.str(), dense_weights, true, &msg);
       ++cur_pass;
       RandomPermutation(corpus.size(), &order);
     }
@@ -294,11 +260,11 @@ int main(int argc, char** argv) {
     ++cur_sent;
   }
   cerr << endl;
-  weights.WriteToFile("weights.mira-final.gz", true, &msg);
+  Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg);
   tot /= normalizer;
-  weights.InitFromVector(tot);
+  tot.init_vector(dense_weights);
   msg = "# MIRA tuned weights (averaged vector)";
-  weights.WriteToFile("weights.mira-final-avg.gz", true, &msg);
+  Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg);
   cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n";
   return 0;
 }
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 4324e8de..bc59285b 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -301,12 +301,8 @@ int main(int argc, char** argv) {
   const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
   const unsigned xi = conf["best_pairs"].as<unsigned>();
   string weightsf = conf["weights"].as<string>();
-  vector<double> weights;
-  {
-    Weights w;
-    w.InitFromFile(weightsf);
-    w.InitVector(&weights);
-  }
+  vector<weight_t> weights;
+  Weights::InitFromFile(weightsf, &weights);
   string kbest_repo = conf["kbest_repository"].as<string>();
   MkDirP(kbest_repo);
   while(in) {
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 9b422f33..9caaa1d1 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -194,7 +194,7 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
   string line;
   vector<pair<bool, SparseVector<double> > > training, testing;
-  SparseVector<double> old_weights;
+  SparseVector<weight_t> old_weights;
   const bool tune_regularizer = conf.count("tune_regularizer");
   if (tune_regularizer && !conf.count("testset")) {
     cerr << "--tune_regularizer requires --testset to be set\n";
@@ -210,9 +210,9 @@ int main(int argc, char** argv) {
   const double psi = conf["interpolation"].as<double>();
   if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
   if (conf.count("weights")) {
-    Weights w;
-    w.InitFromFile(conf["weights"].as<string>());
-    w.InitSparseVector(&old_weights);
+    vector<weight_t> dt;
+    Weights::InitFromFile(conf["weights"].as<string>(), &dt);
+    Weights::InitSparseVector(dt, &old_weights);
   }
   ReadCorpus(&cin, &training);
   if (conf.count("testset")) {
@@ -220,8 +220,8 @@ int main(int argc, char** argv) {
     ReadCorpus(rf.stream(), &testing);
   }
   cerr << "Number of features: " << FD::NumFeats() << endl;
-  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
-  for (SparseVector<double>::const_iterator it = old_weights.begin();
+  vector<weight_t> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<weight_t>::const_iterator it = old_weights.begin();
        it != old_weights.end(); ++it)
     x[it->first] = it->second;
   double tppl = 0.0;
@@ -257,7 +257,6 @@ int main(int argc, char** argv) {
     sigsq = sp[best_i].first;
     tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as<unsigned>(), &x);
   }
-  Weights w;
   if (conf.count("weights")) {
     for (int i = 1; i < x.size(); ++i)
       x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
@@ -271,7 +270,6 @@ int main(int argc, char** argv) {
       cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
     }
   }
-  w.InitFromVector(x);
-  w.WriteToFile("-");
+  Weights::WriteToFile("-", x);
   return 0;
 }
diff --git a/training/Makefile.am b/training/Makefile.am
index e075e417..6e2c06f5 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -12,9 +12,7 @@ bin_PROGRAMS = \
   cllh_filter_grammar \
   mpi_online_optimize \
   mpi_batch_optimize \
-  mpi_em_optimize \
   compute_cllh \
-  feature_expectations \
   augment_grammar
 
 noinst_PROGRAMS = \
@@ -29,12 +27,6 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval
 mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
 mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-feature_expectations_SOURCES = feature_expectations.cc
-feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc
-mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
 compute_cllh_SOURCES = compute_cllh.cc
 compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc
index df8d4ee8..e89a92d5 100644
--- a/training/augment_grammar.cc
+++ b/training/augment_grammar.cc
@@ -134,9 +134,7 @@ int main(int argc, char** argv) {
   } else { ngram = NULL; }
   extra_feature = conf.count("extra_lex_feature") > 0;
   if (conf.count("collapse_weights")) {
-    Weights w;
-    w.InitFromFile(conf["collapse_weights"].as<string>());
-    w.InitVector(&col_weights);
+    Weights::InitFromFile(conf["collapse_weights"].as<string>(), &col_weights);
   }
   clear_features = conf.count("clear_features_after_collapse") > 0;
   gather_rules = false;
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
index 4fb742fb..dc480f6c 100644
--- a/training/collapse_weights.cc
+++ b/training/collapse_weights.cc
@@ -59,10 +59,8 @@ int main(int argc, char** argv) {
   InitCommandLine(argc, argv, &conf);
   const string wfile = conf["weights"].as<string>();
   const string gfile = conf["grammar"].as<string>();
-  Weights wm;
-  wm.InitFromFile(wfile);
-  vector<double> w;
-  wm.InitVector(&w);
+  vector<weight_t> w;
+  Weights::InitFromFile(wfile, &w);
   MarginalMap e_tots;
   MarginalMap f_tots;
   prob_t tot;
diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc
index 332f6d0c..b496d196 100644
--- a/training/compute_cllh.cc
+++ b/training/compute_cllh.cc
@@ -148,15 +148,6 @@ int main(int argc, char** argv) {
   if (!InitCommandLine(argc, argv, &conf))
     return false;
 
-  // load initial weights
-  Weights weights;
-  if (conf.count("weights"))
-    weights.InitFromFile(conf["weights"].as<string>());
-
-  // freeze feature set
-  //const bool freeze_feature_set = conf.count("freeze_feature_set");
-  //if (freeze_feature_set) FD::Freeze();
-
   // load cdec.ini and set up decoder
   ReadFile ini_rf(conf["decoder_config"].as<string>());
   Decoder decoder(ini_rf.stream());
@@ -165,17 +156,22 @@ int main(int argc, char** argv) {
     abort();
   }
 
+  // load weights
+  vector<weight_t>& weights = decoder.CurrentWeightVector();
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+
+  // freeze feature set
+  //const bool freeze_feature_set = conf.count("freeze_feature_set");
+  //if (freeze_feature_set) FD::Freeze();
+
   vector<string> corpus; vector<int> ids;
   ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
   assert(corpus.size() > 0);
   assert(corpus.size() == ids.size());
 
-  vector<double> wv;
-  weights.InitVector(&wv);
-  decoder.SetWeights(wv);
   TrainingObserver observer;
   double objective = 0;
-  bool converged = false;
 
   observer.Reset();
   if (rank == 0)
@@ -197,3 +193,4 @@ int main(int argc, char** argv) {
 
   return 0;
 }
+
diff --git a/training/grammar_convert.cc b/training/grammar_convert.cc
index 8d292f8a..bf8abb26 100644
--- a/training/grammar_convert.cc
+++ b/training/grammar_convert.cc
@@ -251,12 +251,10 @@ int main(int argc, char **argv) {
   const bool is_split_input = (conf["format"].as<string>() == "split");
   const bool is_json_input = is_split_input || (conf["format"].as<string>() == "json");
   const bool collapse_weights = conf.count("collapse_weights");
-  Weights wts;
   vector<double> w;
-  if (conf.count("weights")) {
-    wts.InitFromFile(conf["weights"].as<string>());
-    wts.InitVector(&w);
-  }
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &w);
+
   if (collapse_weights && !w.size()) {
     cerr << "--collapse_weights requires a weights file to be specified!\n";
     exit(1);
diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
index 39a8af7d..cc5953f6 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/mpi_batch_optimize.cc
@@ -31,42 +31,12 @@ using namespace std;
 using boost::shared_ptr;
 namespace po = boost::program_options;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("input_weights,w",po::value<string>(),"Input feature weights file")
         ("training_data,t",po::value<string>(),"Training data")
         ("decoder_config,d",po::value<string>(),"Decoder configuration file")
-        ("sharded_input,s",po::value<string>(), "Corpus and grammar files are 'sharded' so each processor loads its own input and grammar file. Argument is the directory containing the shards.")
         ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
         ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
 	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
@@ -88,14 +58,10 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
   po::notify(*conf);
 
-  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data") | conf->count("sharded_input")) || !conf->count("decoder_config")) {
+  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
     cerr << dcmdline_options << endl;
     return false;
   }
-  if (conf->count("training_data") && conf->count("sharded_input")) {
-    cerr << "Cannot specify both --training_data and --sharded_input\n";
-    return false;
-  }
   return true;
 }
 
@@ -236,42 +202,9 @@ int main(int argc, char** argv) {
   po::variables_map conf;
   if (!InitCommandLine(argc, argv, &conf)) return 1;
 
-  string shard_dir;
-  if (conf.count("sharded_input")) {
-    shard_dir = conf["sharded_input"].as<string>();
-    if (!DirectoryExists(shard_dir)) {
-      if (rank == 0) cerr << "Can't find shard directory: " << shard_dir << endl;
-      return 1;
-    }
-    if (rank == 0)
-      cerr << "Shard directory: " << shard_dir << endl;
-  }
-
-  // load initial weights
-  Weights weights;
-  if (rank == 0) { cerr << "Loading weights...\n"; }
-  weights.InitFromFile(conf["input_weights"].as<string>());
-  if (rank == 0) { cerr << "Done loading weights.\n"; }
-
-  // freeze feature set (should be optional?)
-  const bool freeze_feature_set = true;
-  if (freeze_feature_set) FD::Freeze();
-
   // load cdec.ini and set up decoder
   vector<string> cdec_ini;
   ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
-  if (shard_dir.size()) {
-    if (rank == 0) {
-      for (int i = 0; i < cdec_ini.size(); ++i) {
-        if (cdec_ini[i].find("grammar=") == 0) {
-          cerr << "!!! using sharded input and " << conf["decoder_config"].as<string>() << " contains a grammar specification:\n" << cdec_ini[i] << "\n  VERIFY THAT THIS IS CORRECT!\n";
-        }
-      }
-    }
-    ostringstream g;
-    g << "grammar=" << shard_dir << "/grammar." << rank << "_of_" << size << ".gz";
-    cdec_ini.push_back(g.str());
-  }
   istringstream ini;
   StoreConfig(cdec_ini, &ini);
   if (rank == 0) cerr << "Loading grammar...\n";
@@ -282,22 +215,28 @@ int main(int argc, char** argv) {
   }
   if (rank == 0) cerr << "Done loading grammar!\n";
 
+  // load initial weights
+  if (rank == 0) { cerr << "Loading weights...\n"; }
+  vector<weight_t>& lambdas = decoder->CurrentWeightVector();
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
+  if (rank == 0) { cerr << "Done loading weights.\n"; }
+
+  // freeze feature set (should be optional?)
+  const bool freeze_feature_set = true;
+  if (freeze_feature_set) FD::Freeze();
+
   const int num_feats = FD::NumFeats();
   if (rank == 0) cerr << "Number of features: " << num_feats << endl;
+  lambdas.resize(num_feats);
+
   const bool gaussian_prior = conf.count("gaussian_prior");
-  vector<double> means(num_feats, 0);
+  vector<weight_t> means(num_feats, 0);
   if (conf.count("means")) {
     if (!gaussian_prior) {
       cerr << "Don't use --means without --gaussian_prior!\n";
       exit(1);
     }
-    Weights wm; 
-    wm.InitFromFile(conf["means"].as<string>());
-    if (num_feats != FD::NumFeats()) {
-      cerr << "[ERROR] Means file had unexpected features!\n";
-      exit(1);
-    }
-    wm.InitVector(&means);
+    Weights::InitFromFile(conf["means"].as<string>(), &means);
   }
   shared_ptr<BatchOptimizer> o;
   if (rank == 0) {
@@ -309,26 +248,13 @@ int main(int argc, char** argv) {
     cerr << "Optimizer: " << o->Name() << endl;
   }
   double objective = 0;
-  vector<double> lambdas(num_feats, 0.0);
-  weights.InitVector(&lambdas);
-  if (lambdas.size() != num_feats) {
-    cerr << "Initial weights file did not have all features specified!\n  feats="
-         << num_feats << "\n  weights file=" << lambdas.size() << endl;
-    lambdas.resize(num_feats, 0.0);
-  }
   vector<double> gradient(num_feats, 0.0);
-  vector<double> rcv_grad(num_feats, 0.0);
+  vector<double> rcv_grad;
+  rcv_grad.clear();
   bool converged = false;
 
   vector<string> corpus;
-  if (shard_dir.size()) {
-    ostringstream os; os << shard_dir << "/corpus." << rank << "_of_" << size;
-    ReadTrainingCorpus(os.str(), 0, 1, &corpus);
-    cerr << os.str() << " has " << corpus.size() << " training examples. " << endl;
-    if (corpus.size() > 500) { corpus.resize(500); cerr << "  TRUNCATING\n"; }
-  } else {
-    ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
-  }
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
   assert(corpus.size() > 0);
 
   TrainingObserver observer;
@@ -341,19 +267,20 @@ int main(int argc, char** argv) {
     if (rank == 0) {
       cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
     }
-    decoder->SetWeights(lambdas);
     for (int i = 0; i < corpus.size(); ++i)
       decoder->Decode(corpus[i], &observer);
     cerr << "  process " << rank << '/' << size << " done\n";
     fill(gradient.begin(), gradient.end(), 0);
-    fill(rcv_grad.begin(), rcv_grad.end(), 0);
     observer.SetLocalGradientAndObjective(&gradient, &objective);
 
     double to = 0;
 #ifdef HAVE_MPI
+    rcv_grad.resize(num_feats, 0.0);
     mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);
-    mpi::reduce(world, objective, to, plus<double>(), 0);
     swap(gradient, rcv_grad);
+    rcv_grad.clear();
+
+    mpi::reduce(world, objective, to, plus<double>(), 0);
     objective = to;
 #endif
 
@@ -378,7 +305,7 @@ int main(int argc, char** argv) {
       for (int i = 0; i < gradient.size(); ++i)
         gnorm += gradient[i] * gradient[i];
       cerr << "  GNORM=" << sqrt(gnorm) << endl;
-      vector<double> old = lambdas;
+      vector<weight_t> old = lambdas;
       int c = 0;
       while (old == lambdas) {
         ++c;
@@ -387,9 +314,8 @@ int main(int argc, char** argv) {
         assert(c < 5);
       }
       old.clear();
-      SanityCheck(lambdas);
-      ShowLargestFeatures(lambdas);
-      weights.InitFromVector(lambdas);
+      Weights::SanityCheck(lambdas);
+      Weights::ShowLargestFeatures(lambdas);
 
       converged = o->HasConverged();
       if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
@@ -399,7 +325,7 @@ int main(int argc, char** argv) {
       ostringstream vv;
       vv << "Objective = " << objective << "  (eval count=" << o->EvaluationCount() << ")";
       const string svv = vv.str();
-      weights.WriteToFile(fname, true, &svv);
+      Weights::WriteToFile(fname, lambdas, true, &svv);
     }  // rank == 0
     int cint = converged;
 #ifdef HAVE_MPI
@@ -411,3 +337,4 @@ int main(int argc, char** argv) {
   }
   return 0;
 }
+
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 32033c19..2ef4a2e7 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -31,35 +31,6 @@ namespace mpi = boost::mpi;
 using namespace std;
 namespace po = boost::program_options;
 
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
 bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
@@ -250,10 +221,25 @@ int main(int argc, char** argv) {
   if (!InitCommandLine(argc, argv, &conf))
     return 1;
 
+  vector<pair<string, int> > agenda;
+  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
+    return 1;
+  if (rank == 0)
+    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
+
+  assert(agenda.size() > 0);
+
+  if (1) {  // hack to load the feature hash functions -- TODO this should not be in cdec.ini
+    const string& cur_config = agenda[0].first;
+    const unsigned max_iteration = agenda[0].second;
+    ReadFile ini_rf(cur_config);
+    Decoder decoder(ini_rf.stream());
+  }
+
   // load initial weights
-  Weights weights;
+  vector<weight_t> init_weights;
   if (conf.count("input_weights"))
-    weights.InitFromFile(conf["input_weights"].as<string>());
+    Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
 
   vector<int> frozen_fids;
   if (conf.count("frozen_features")) {
@@ -310,19 +296,12 @@ int main(int argc, char** argv) {
     rng.reset(new MT19937);
 
   SparseVector<double> x;
-  weights.InitSparseVector(&x);
+  Weights::InitSparseVector(init_weights, &x);
   TrainingObserver observer;
 
   int write_weights_every_ith = 100; // TODO configure
   int titer = -1;
 
-  vector<pair<string, int> > agenda;
-  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
-    return 1;
-  if (rank == 0)
-    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
-
-  vector<double> lambdas;
   for (int ai = 0; ai < agenda.size(); ++ai) {
     const string& cur_config = agenda[ai].first;
     const unsigned max_iteration = agenda[ai].second;
@@ -331,6 +310,8 @@ int main(int argc, char** argv) {
     // load cdec.ini and set up decoder
     ReadFile ini_rf(cur_config);
     Decoder decoder(ini_rf.stream());
+    vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+    if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); }
 
     if (rank == 0)
       o->ResetEpoch(); // resets the learning rate-- TODO is this good?
@@ -341,15 +322,13 @@ int main(int argc, char** argv) {
 #ifdef HAVE_MPI
       mpi::timer timer;
 #endif
-      weights.InitFromVector(x);
-      weights.InitVector(&lambdas);
+      x.init_vector(&lambdas);
       ++iter; ++titer;
       observer.Reset();
-      decoder.SetWeights(lambdas);
       if (rank == 0) {
         converged = (iter == max_iteration);
-        SanityCheck(lambdas);
-        ShowLargestFeatures(lambdas);
+        Weights::SanityCheck(lambdas);
+        Weights::ShowLargestFeatures(lambdas);
         string fname = "weights.cur.gz";
         if (iter % write_weights_every_ith == 0) {
           ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
@@ -360,7 +339,7 @@ int main(int argc, char** argv) {
         vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
         const string svv = vv.str();
         cerr << svv << endl;
-        weights.WriteToFile(fname, true, &svv);
+        Weights::WriteToFile(fname, lambdas, true, &svv);
       }
 
       for (int i = 0; i < size_per_proc; ++i) {
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
index b931991d..15e28fa1 100644
--- a/training/mr_optimize_reduce.cc
+++ b/training/mr_optimize_reduce.cc
@@ -88,25 +88,19 @@ int main(int argc, char** argv) {
 
   const bool use_b64 = conf["input_format"].as<string>() == "b64";
 
-  Weights weights;
-  weights.InitFromFile(conf["input_weights"].as<string>());
+  vector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
   const string s_obj = "**OBJ**";
   int num_feats = FD::NumFeats();
   cerr << "Number of features: " << num_feats << endl;
   const bool gaussian_prior = conf.count("gaussian_prior");
-  vector<double> means(num_feats, 0);
+  vector<weight_t> means(num_feats, 0);
   if (conf.count("means")) {
     if (!gaussian_prior) {
       cerr << "Don't use --means without --gaussian_prior!\n";
       exit(1);
     }
-    Weights wm; 
-    wm.InitFromFile(conf["means"].as<string>());
-    if (num_feats != FD::NumFeats()) {
-      cerr << "[ERROR] Means file had unexpected features!\n";
-      exit(1);
-    }
-    wm.InitVector(&means);
+    Weights::InitFromFile(conf["means"].as<string>(), &means);
   }
   shared_ptr<BatchOptimizer> o;
   const string omethod = conf["optimization_method"].as<string>();
@@ -124,8 +118,6 @@ int main(int argc, char** argv) {
       cerr << "No state file found, assuming ITERATION 1\n";
   }
 
-  vector<double> lambdas(num_feats, 0);
-  weights.InitVector(&lambdas);
   double objective = 0;
   vector<double> gradient(num_feats, 0);
   // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
@@ -223,8 +215,7 @@ int main(int argc, char** argv) {
   old.clear();
   SanityCheck(lambdas);
   ShowLargestFeatures(lambdas);
-  weights.InitFromVector(lambdas);
-  weights.WriteToFile(conf["output_weights"].as<string>(), false);
+  Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false);
 
   const bool conv = o->HasConverged();
   if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
diff --git a/utils/fdict.h b/utils/fdict.h
index 771e8b91..f0871b9a 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -28,6 +28,8 @@ struct FD {
   }
   static void EnableHash(const std::string& cmph_file) {
 #ifdef HAVE_CMPH
+    assert(dict_.max() == 0);  // dictionary must not have
+                               // been added to
     hash_ = new PerfectHashFunction(cmph_file);
 #endif
   }
diff --git a/utils/phmt.cc b/utils/phmt.cc
index 1f59afaf..48d9f093 100644
--- a/utils/phmt.cc
+++ b/utils/phmt.cc
@@ -19,22 +19,18 @@ int main(int argc, char** argv) {
   cerr << "LexFE = " << FD::Convert("LexFE") << endl;
   cerr << "LexEF = " << FD::Convert("LexEF") << endl;
   {
-    Weights w;
     vector<weight_t> v(FD::NumFeats());
     v[FD::Convert("LexFE")] = 1.0;
     v[FD::Convert("LexEF")] = 0.5;
-    w.InitFromVector(v);
     cerr << "Writing...\n";
-    w.WriteToFile("weights.bin");
+    Weights::WriteToFile("weights.bin", v);
     cerr << "Done.\n";
   }
   {
-    Weights w;
     vector<weight_t> v(FD::NumFeats());
     cerr << "Reading...\n";
-    w.InitFromFile("weights.bin");
+    Weights::InitFromFile("weights.bin", &v);
     cerr << "Done.\n";
-    w.InitVector(&v);
     assert(v[FD::Convert("LexFE")] == 1.0);
     assert(v[FD::Convert("LexEF")] == 0.5);
   }
diff --git a/utils/weights.cc b/utils/weights.cc
index 0916b72a..c49000be 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -8,7 +8,10 @@
 
 using namespace std;
 
-void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+void Weights::InitFromFile(const string& filename,
+                           vector<weight_t>* pweights,
+                           vector<string>* feature_list) {
+  vector<weight_t>& weights = *pweights;
   if (!SILENT) cerr << "Reading weights from " << filename << endl;
   ReadFile in_file(filename);
   istream& in = *in_file.stream();
@@ -47,16 +50,16 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
       int end = 0;
       while(end < buf.size() && buf[end] != ' ') ++end;
       const int fid = FD::Convert(buf.substr(start, end - start));
+      if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); }
       while(end < buf.size() && buf[end] == ' ') ++end;
       val = strtod(&buf.c_str()[end], NULL);
       if (isnan(val)) {
         cerr << FD::Convert(fid) << " has weight NaN!\n";
         abort();
       }
-      if (wv_.size() <= fid)
-        wv_.resize(fid + 1);
-      wv_[fid] = val;
-      if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
+      if (weights.size() <= fid)
+        weights.resize(fid + 1);
+      weights[fid] = val;
       ++weight_count;
       if (!SILENT) {
         if (weight_count %   50000 == 0) { cerr << '.' << flush; fl = true; }
@@ -76,8 +79,8 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
       cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl;
       abort();
     }
-    wv_.resize(num_keys[0]);
-    in.get(reinterpret_cast<char*>(&wv_[0]), num_keys[0] * sizeof(weight_t));
+    weights.resize(num_keys[0]);
+    in.get(reinterpret_cast<char*>(&weights[0]), num_keys[0] * sizeof(weight_t));
     if (!in.good()) {
       cerr << "Error loading weights!\n";
       abort();
@@ -85,7 +88,10 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
   }
 }
 
-void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const {
+void Weights::WriteToFile(const string& fname,
+                          const vector<weight_t>& weights,
+                          bool hide_zero_value_features,
+                          const string* extra) {
   WriteFile out(fname);
   ostream& o = *out.stream();
   assert(o);
@@ -96,41 +102,54 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature
     o.precision(17);
     const int num_feats = FD::NumFeats();
     for (int i = 1; i < num_feats; ++i) {
-      const weight_t val = (i < wv_.size() ? wv_[i] : 0.0);
+      const weight_t val = (i < weights.size() ? weights[i] : 0.0);
       if (hide_zero_value_features && val == 0.0) continue;
       o << FD::Convert(i) << ' ' << val << endl;
     }
   } else {
     o.write("_PHWf", 5);
     const size_t keys = FD::NumFeats();
-    assert(keys <= wv_.size());
+    assert(keys <= weights.size());
     o.write(reinterpret_cast<const char*>(&keys), sizeof(keys));
-    o.write(reinterpret_cast<const char*>(&wv_[0]), keys * sizeof(weight_t));
+    o.write(reinterpret_cast<const char*>(&weights[0]), keys * sizeof(weight_t));
   }
 }
 
-void Weights::InitVector(std::vector<weight_t>* w) const {
-  *w = wv_;
+void Weights::InitSparseVector(const vector<weight_t>& dv,
+                               SparseVector<weight_t>* sv) {
+  sv->clear();
+  for (unsigned i = 1; i < dv.size(); ++i) {
+    if (dv[i]) sv->set_value(i, dv[i]);
+  }
 }
 
-void Weights::InitSparseVector(SparseVector<weight_t>* w) const {
-  for (int i = 1; i < wv_.size(); ++i) {
-    const weight_t& weight = wv_[i];
-    if (weight) w->set_value(i, weight);
+void Weights::SanityCheck(const vector<weight_t>& w) {
+  for (int i = 0; i < w.size(); ++i) {
+    assert(!isnan(w[i]));
+    assert(!isinf(w[i]));
   }
 }
 
-void Weights::InitFromVector(const std::vector<weight_t>& w) {
-  wv_ = w;
-  if (wv_.size() > FD::NumFeats())
-    cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
-  wv_.resize(FD::NumFeats(), 0);
-}
+struct FComp {
+  const vector<weight_t>& w_;
+  FComp(const vector<weight_t>& w) : w_(w) {}
+  bool operator()(int a, int b) const {
+    return fabs(w_[a]) > fabs(w_[b]);
+  }
+};
 
-void Weights::InitFromVector(const SparseVector<weight_t>& w) {
-  wv_.clear();
-  wv_.resize(FD::NumFeats(), 0.0);
-  for (int i = 1; i < FD::NumFeats(); ++i)
-    wv_[i] = w.value(i);
+void Weights::ShowLargestFeatures(const vector<weight_t>& w) {
+  vector<int> fnums(w.size());
+  for (int i = 0; i < w.size(); ++i)
+    fnums[i] = i;
+  vector<int>::iterator mid = fnums.begin();
+  mid += (w.size() > 10 ? 10 : w.size());
+  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
+  cerr << "TOP FEATURES:";
+  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
+    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
+  }
+  cerr << endl;
 }
 
+
diff --git a/utils/weights.h b/utils/weights.h
index 7664810b..30f71db0 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -10,15 +10,21 @@ typedef double weight_t;
 
 class Weights {
  public:
-  Weights() {}
-  void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
-  void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const;
-  void InitVector(std::vector<weight_t>* w) const;
-  void InitSparseVector(SparseVector<weight_t>* w) const;
-  void InitFromVector(const std::vector<weight_t>& w);
-  void InitFromVector(const SparseVector<weight_t>& w);
+  static void InitFromFile(const std::string& fname,
+                           std::vector<weight_t>* weights,
+                           std::vector<std::string>* feature_list = NULL);
+  static void WriteToFile(const std::string& fname,
+                          const std::vector<weight_t>& weights,
+                          bool hide_zero_value_features = true,
+                          const std::string* extra = NULL);
+  static void InitSparseVector(const std::vector<weight_t>& dv,
+                               SparseVector<weight_t>* sv);
+  // check for infinities, NaNs, etc
+  static void SanityCheck(const std::vector<weight_t>& w);
+  // write weights with largest magnitude to cerr
+  static void ShowLargestFeatures(const std::vector<weight_t>& w);
  private:
-  std::vector<weight_t> wv_;
+  Weights();
 };
 
 #endif
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index b84c44bc..0c094fd5 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -223,16 +223,16 @@ struct oracle_directions {
     cerr << "Forest repo: " << forest_repository << endl;
     assert(DirectoryExists(forest_repository));
     vector<string> features;
-    weights.InitFromFile(weights_file, &features);
+    vector<weight_t> dorigin;
+    Weights::InitFromFile(weights_file, &dorigin, &features);
     if (optimize_features.size())
       features=optimize_features;
-    weights.InitSparseVector(&origin);
+    Weights::InitSparseVector(dorigin, &origin);
     fids.clear();
     AddFeatureIds(features);
     oracles.resize(dev_set_size);
   }
 
-  Weights weights;
   void AddFeatureIds(vector<string> const& features) {
     int i = fids.size();
     fids.resize(fids.size()+features.size());
-- 
cgit v1.2.3


From b9d54044619b964467857b20921c19ab9135326c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Tue, 13 Sep 2011 21:55:02 +0100
Subject: binary to extract features encountered

---
 training/Makefile.am             |   4 ++
 training/mpi_extract_features.cc | 151 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 training/mpi_extract_features.cc

(limited to 'training')

diff --git a/training/Makefile.am b/training/Makefile.am
index 6e2c06f5..7ceeda34 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -11,6 +11,7 @@ bin_PROGRAMS = \
   collapse_weights \
   cllh_filter_grammar \
   mpi_online_optimize \
+  mpi_extract_features \
   mpi_batch_optimize \
   compute_cllh \
   augment_grammar
@@ -24,6 +25,9 @@ TESTS = lbfgs_test optimize_test
 mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc
 mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
+mpi_extract_features_SOURCES = mpi_extract_features.cc
+mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
 mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/training/mpi_extract_features.cc b/training/mpi_extract_features.cc
new file mode 100644
index 00000000..6750aa15
--- /dev/null
+++ b/training/mpi_extract_features.cc
@@ -0,0 +1,151 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <cassert>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ff_register.h"
+#include "verbose.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "decoder.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // fills *conf; returns false when the caller should exit
+  po::options_description opts("Configuration options");  // accepted on the command line and in the --config file
+  opts.add_options()
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+        ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
+        ("output_prefix,o",po::value<string>()->default_value("features"),"Output path prefix");
+  po::options_description clo("Command line options");  // accepted on the command line only
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  // The command line is stored first, then the optional --config file (parsed against dconfig_options only).
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): a failed open is not checked; <fstream> is not included directly by this file
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  // Print the description and usage, then fail, if help was requested or a required option is missing.
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
+    cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the feature strings encountered.\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+// Round-robin sharding: appends to *c every line of fname whose 0-based index i satisfies i % size == rank.
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+  ReadFile corpus_file(fname);
+  istream& input = *corpus_file.stream();
+  string sentence;
+  // getline() yields a failed stream once no further line can be read, so the
+  // loop ends at the same point where the explicit stream checks used to break.
+  for (int line_no = 0; getline(input, sentence); ++line_no) {
+    if (line_no % size != rank) continue;  // line belongs to another process
+    c->push_back(sentence);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;  // NOTE(review): not referenced anywhere else in this new file -- presumably a leftover; confirm before removing
+
+struct TrainingObserver : public DecoderObserver {
+  // No-op observer: every callback below has an empty body, so it records nothing.
+  virtual void NotifyDecodingStart(const SentenceMetadata&) {
+  }
+
+  // Empty body: nothing is computed from the translation forest.
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+  }
+
+  // Empty body: nothing is computed from the alignment forest.
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+  }
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+// Decodes this process's share of the corpus, then writes FD::Convert(i) for ids 1..NumFeats()-1 to <output_prefix>.<rank>_of_<size>.
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;  // non-MPI build: act as the only process
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;  // non-zero status; the former "return false" exited with 0 (success) on a bad command line
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+
+  if (FD::UsingPerfectHashFunction()) {
+    cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
+    return 1;
+  }
+
+  // load optional weights directly into the decoder's weight vector
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
+
+  vector<string> corpus;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);  // every process must receive at least one line
+
+  TrainingObserver observer;
+
+  if (rank == 0)
+    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+
+  for (size_t i = 0; i < corpus.size(); ++i)
+    decoder.Decode(corpus[i], &observer);
+
+  {
+    ostringstream os;
+    os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
+    WriteFile wf(os.str());
+    ostream& out = *wf.stream();
+    const unsigned num_feats = FD::NumFeats();
+    for (unsigned i = 1; i < num_feats; ++i) {  // id 0 is skipped
+      out << FD::Convert(i) << endl;
+    }
+    cerr << "Wrote " << os.str() << endl;
+  }
+
+#ifdef HAVE_MPI
+  world.barrier();
+#endif
+
+  return 0;
+}
+
-- 
cgit v1.2.3


From 08f1814923005f702300d661c4d67f4635fc901c Mon Sep 17 00:00:00 2001
From: Guest_account Guest_account prguest11 <prguest11@taipan.cs>
Date: Thu, 15 Sep 2011 12:52:59 +0100
Subject: script to filter reachable sentences, weight cleanup

---
 decoder/apply_models.cc           |   3 +-
 decoder/hg.h                      |   8 +-
 training/Makefile.am              |  10 +-
 training/cllh_filter_grammar.cc   | 197 --------------------------------------
 training/mpi_extract_reachable.cc | 163 +++++++++++++++++++++++++++++++
 utils/feature_vector.h            |   4 +-
 6 files changed, 174 insertions(+), 211 deletions(-)
 delete mode 100644 training/cllh_filter_grammar.cc
 create mode 100644 training/mpi_extract_reachable.cc

(limited to 'training')

diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 26cdb881..40fd27e4 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -276,8 +276,7 @@ public:
     make_heap(cand.begin(), cand.end(), HeapCandCompare());
     State2Node state2node;   // "buf" in Figure 2
     int pops = 0;
-    int pop_limit_eff=max(1,int(v.promise*pop_limit_));
-    while(!cand.empty() && pops < pop_limit_eff) {
+    while(!cand.empty() && pops < pop_limit_) {
       pop_heap(cand.begin(), cand.end(), HeapCandCompare());
       Candidate* item = cand.back();
       cand.pop_back();
diff --git a/decoder/hg.h b/decoder/hg.h
index e5ef05f8..f0ddbb76 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -49,16 +49,14 @@ public:
   // TODO get rid of cat_?
   // TODO keep cat_ and add span and/or state? :)
   struct Node {
-    Node() : id_(), cat_(), promise(1) {}
+    Node() : id_(), cat_() {}
     int id_; // equal to this object's position in the nodes_ vector
     WordID cat_;  // non-terminal category if <0, 0 if not set
     WordID NT() const { return -cat_; }
     EdgesVector in_edges_;   // an in edge is an edge with this node as its head.  (in edges come from the bottom up to us)  indices in edges_
     EdgesVector out_edges_;  // an out edge is an edge with this node as its tail.  (out edges leave us up toward the top/goal). indices in edges_
-    double promise; // set in global pruning; in [0,infty) so that mean is 1.  use: e.g. scale cube poplimit.  //TODO: appears to be useless, compile without this?  on the other hand, pretty cheap.
     void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting
       cat_=o.cat_;
-      promise=o.promise;
     }
     void copy_reindex(Node const& o,indices_after const& n2,indices_after const& e2) {
       copy_fixed(o);
@@ -81,7 +79,7 @@ public:
     int head_node_;               // refers to a position in nodes_
     TailNodeVector tail_nodes_;   // contents refer to positions in nodes_
     TRulePtr rule_;
-    FeatureVector feature_values_;
+    SparseVector<weight_t> feature_values_;
     prob_t edge_prob_;             // dot product of weights and feat_values
     int id_;   // equal to this object's position in the edges_ vector
 
@@ -468,7 +466,7 @@ public:
   /// drop edge i if edge_margin[i] < prune_below, unless preserve_mask[i]
   void MarginPrune(EdgeProbs const& edge_margin,prob_t prune_below,EdgeMask const* preserve_mask=0,bool safe_inside=false,bool verbose=false);
 
-  //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another.  at least make that an option.  i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs.  this would also make me more comfortable about allocating Node.promise
+  //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another.  at least make that an option.  i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs.
 
   // beam_alpha=0 means don't beam prune, otherwise drop things that are e^beam_alpha times worse than best -   // prunes any edge whose prob_t on the best path taking that edge is more than e^alpha times
   //density=0 means don't density prune:   // for density>=1.0, keep this many times the edges needed for the 1best derivation
diff --git a/training/Makefile.am b/training/Makefile.am
index 7ceeda34..5752859e 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -9,9 +9,9 @@ bin_PROGRAMS = \
   atools \
   plftools \
   collapse_weights \
-  cllh_filter_grammar \
-  mpi_online_optimize \
+  mpi_extract_reachable \
   mpi_extract_features \
+  mpi_online_optimize \
   mpi_batch_optimize \
   compute_cllh \
   augment_grammar
@@ -25,6 +25,9 @@ TESTS = lbfgs_test optimize_test
 mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc
 mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
+mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
+mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 mpi_extract_features_SOURCES = mpi_extract_features.cc
 mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
@@ -34,9 +37,6 @@ mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/
 compute_cllh_SOURCES = compute_cllh.cc
 compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-cllh_filter_grammar_SOURCES = cllh_filter_grammar.cc
-cllh_filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
 augment_grammar_SOURCES = augment_grammar.cc
 augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/training/cllh_filter_grammar.cc b/training/cllh_filter_grammar.cc
deleted file mode 100644
index 6998ec2b..00000000
--- a/training/cllh_filter_grammar.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <unistd.h>   // fork
-#include <sys/wait.h> // waitpid
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "tdict.h"
-#include "ff_register.h"
-#include "verbose.h"
-#include "hg.h"
-#include "decoder.h"
-#include "filelib.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
-        ("shards,s",po::value<unsigned>()->default_value(1),"Number of shards")
-        ("starting_shard,S",po::value<unsigned>()->default_value(0), "In this invocation only process shards >= S")
-        ("work_limit,l",po::value<unsigned>()->default_value(9999), "Process maximially this many shards")
-        ("ncpus,C",po::value<unsigned>()->default_value(1),"Number of CPUs to use");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  assert(size > 0);
-  assert(rank < size);
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) {
-      c->push_back(line);
-      ids->push_back(lc);
-    }
-    ++lc;
-  }
-}
-
-struct TrainingObserver : public DecoderObserver {
-  TrainingObserver() : s_lhs(-TD::Convert("S")), goal_lhs(-TD::Convert("Goal")) {}
-
-  void Reset() {
-    total_complete = 0;
-  } 
-
-  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
-    state = 1;
-    used.clear();
-    failed = true;
-  }
-
-  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 1);
-    for (int i = 0; i < hg->edges_.size(); ++i) {
-      const TRule* rule = hg->edges_[i].rule_.get();
-      if (rule->lhs_ == s_lhs || rule->lhs_ == goal_lhs)  // fragile hack to filter out glue rules
-        continue;
-      used.insert(rule);
-    }
-    state = 2;
-  }
-
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-  }
-
-  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
-    if (state == 3) {
-      failed = false;
-    } else {
-      failed = true;
-    }
-  }
-
-  set<const TRule*> used;
-
-  const int s_lhs;
-  const int goal_lhs;
-  bool failed;
-  int total_complete;
-  int state;
-};
-
-void work(const string& fname, int rank, int size, Decoder* decoder) {
-  cerr << "Worker " << rank << '/' << size << " starting.\n";
-  vector<string> corpus;
-  vector<int> ids;
-  ReadTrainingCorpus(fname, rank, size, &corpus, &ids);
-  assert(corpus.size() > 0);
-  assert(corpus.size() == ids.size());
-  cerr << "  " << rank << '/' << size << ": has " << corpus.size() << " sentences to process\n";
-  ostringstream oc; oc << "corpus." << rank << "_of_" << size;
-  WriteFile foc(oc.str());
-  ostringstream og; og << "grammar." << rank << "_of_" << size << ".gz";
-  WriteFile fog(og.str());
-
-  set<const TRule*> all_used;
-  TrainingObserver observer;
-  for (int i = 0; i < corpus.size(); ++i) {
-    const int sent_id = ids[i];
-    const string& input = corpus[i];
-    decoder->SetId(sent_id);
-    decoder->Decode(input, &observer);
-    if (observer.failed) {
-      // do nothing
-    } else {
-      (*foc.stream()) << input << endl;
-      for (set<const TRule*>::iterator it = observer.used.begin(); it != observer.used.end(); ++it) {
-        if (all_used.insert(*it).second)
-          (*fog.stream()) << **it << endl;
-      }
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  register_feature_functions();
-
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string fname = conf["training_data"].as<string>();
-  const unsigned ncpus = conf["ncpus"].as<unsigned>();
-  const unsigned shards = conf["shards"].as<unsigned>();
-  const unsigned start = conf["starting_shard"].as<unsigned>();
-  const unsigned work_limit = conf["work_limit"].as<unsigned>();
-  const unsigned eff_shards = min(start + work_limit, shards);
-  cerr << "Processing shards " << start << "/" << shards << " to " << eff_shards << "/" << shards << endl;
-  assert(ncpus > 0);
-  ReadFile ini_rf(conf["decoder_config"].as<string>());
-  Decoder decoder(ini_rf.stream());
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    abort();
-  }
-  SetSilent(true);  // turn off verbose decoder output
-  cerr << "Forking " << ncpus << " time(s)\n";
-  vector<pid_t> children;
-  for (int i = 0; i < ncpus; ++i) {
-    pid_t pid = fork();
-    if (pid < 0) {
-      cerr << "Fork failed!\n";
-      exit(1);
-    }
-    if (pid > 0) {
-      children.push_back(pid);
-    } else {
-      for (int j = start; j < eff_shards; ++j) {
-        if (j % ncpus == i) {
-          cerr << "  CPU " << i << " processing shard " << j << endl;
-          work(fname, j, shards, &decoder);
-          cerr << "  Shard " << j << "/" << shards << " finished.\n";
-        }
-      }
-      _exit(0);
-    }
-  }
-  for (int i = 0; i < children.size(); ++i) {
-    int status;
-    int w = waitpid(children[i], &status, 0);
-    if (w < 0) { cerr << "Error while waiting for children!"; return 1; }
-    if (WIFSIGNALED(status)) {
-      cerr << "Child " << i << " received signal " << WTERMSIG(status) << endl;
-      if (WTERMSIG(status) == 11) { cerr << " this is a SEGV- you may be trying to print temporarily created rules\n"; }
-    }
-  }
-  return 0;
-}
diff --git a/training/mpi_extract_reachable.cc b/training/mpi_extract_reachable.cc
new file mode 100644
index 00000000..2a7c2b9d
--- /dev/null
+++ b/training/mpi_extract_reachable.cc
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <cassert>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ff_register.h"
+#include "verbose.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "decoder.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // fills *conf; returns false (after printing usage) on --help or a missing required option
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+        ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
+        ("output_prefix,o",po::value<string>()->default_value("reachable"),"Output path prefix");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;  // opts is accepted in both places; clo only on the command line
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  // Parse the command line first, then the optional --config file (po::store keeps values that are already set).
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): the stream is not checked for open failure
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  // training_data and decoder_config are required; weights is optional and output_prefix has a default.
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
+    cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the inputs that produce reachable parallel parses.\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+  // Round-robin sharding: append to *c every line of fname whose zero-based
+  // line number is congruent to rank modulo size.
+  ReadFile corpus_file(fname);
+  istream& input = *corpus_file.stream();
+  string cur;
+  for (int line_no = 0; getline(input, cur); ++line_no) {
+    const bool mine = (line_no % size == rank);
+    if (!mine) continue;
+    c->push_back(cur);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct ReachabilityObserver : public DecoderObserver {
+  // Decoder callbacks that record whether the current input produced an alignment forest.
+  virtual void NotifyDecodingStart(const SentenceMetadata&) {
+    reachable = false;
+  }
+
+  // No-op: nothing is computed from the translation forest.
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+  }
+
+  // Marks the input as reachable.  Presumably the decoder only calls this when a reference-constrained parse exists -- TODO confirm.
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    reachable = true;
+  }
+
+  bool reachable;  // reset by NotifyDecodingStart; not initialized before the first call
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+// Decodes this process's shard of the training data and writes the reachable inputs to <output_prefix>.<rank>_of_<size>.
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;  // usage was printed; exit with a failure status
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+
+  if (FD::UsingPerfectHashFunction()) {
+    cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
+    return 1;
+  }
+
+  // load optional weights
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
+
+  vector<string> corpus;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);
+
+  if (rank == 0)
+    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+
+  size_t num_reached = 0;
+  {
+    ostringstream os;
+    os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
+    WriteFile wf(os.str());
+    ostream& out = *wf.stream();
+    ReachabilityObserver observer;
+    for (size_t i = 0; i < corpus.size(); ++i) {
+      decoder.Decode(corpus[i], &observer);
+      if (observer.reachable) {
+         out << corpus[i] << endl;
+         ++num_reached;
+      }
+      corpus[i].clear();  // the input is no longer needed once decoded
+    }
+    cerr << "Shard " << rank << '/' << size << " finished, wrote "
+         << num_reached << " instances to " << os.str() << endl;
+  }
+
+  size_t total = 0;
+#ifdef HAVE_MPI
+  reduce(world, num_reached, total, std::plus<size_t>(), 0);  // op type matches the size_t operands
+#else
+  total = num_reached;
+#endif
+  if (rank == 0) {
+    cerr << "-----------------------------------------\n";
+    cerr << "TOTAL = " << total << " instances\n";
+  }
+  return 0;
+}
+
diff --git a/utils/feature_vector.h b/utils/feature_vector.h
index 733aa99e..a7b61a66 100755
--- a/utils/feature_vector.h
+++ b/utils/feature_vector.h
@@ -3,9 +3,9 @@
 
 #include <vector>
 #include "sparse_vector.h"
-#include "fdict.h"
+#include "weights.h"
 
-typedef double Featval;
+typedef weight_t Featval;
 typedef SparseVector<Featval> FeatureVector;
 typedef SparseVector<Featval> WeightVector;
 typedef std::vector<Featval> DenseWeightVector;
-- 
cgit v1.2.3


From a28c48d07df4e426a875f5381c80ebf4fbbd1de2 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 17 Sep 2011 01:08:45 +0100
Subject: enable ramdisk scratch for per-sentence-grammars

---
 training/mpi_batch_optimize.cc | 35 +++++++++++++++++++++++++++++++++++
 utils/filelib.cc               | 19 +++++++++++++++++++
 utils/filelib.h                |  5 +----
 3 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'training')

diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
index cc5953f6..0ba8c530 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/mpi_batch_optimize.cc
@@ -22,6 +22,7 @@ namespace mpi = boost::mpi;
 #include "ff_register.h"
 #include "decoder.h"
 #include "filelib.h"
+#include "stringlib.h"
 #include "optimize.h"
 #include "fdict.h"
 #include "weights.h"
@@ -42,6 +43,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
         ("gaussian_prior,p","Use a Gaussian prior on the weights")
         ("means,u", po::value<string>(), "File containing the means for Gaussian prior")
+        ("per_sentence_grammar_scratch,P", po::value<string>(), "(Optional) location of scratch space to copy per-sentence grammars for fast access, useful if a RAM disk is available")
         ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
   po::options_description clo("Command line options");
   clo.add_options()
@@ -186,6 +188,36 @@ struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> >  {
   } 
 }; 
 
+// Copies every grammar named by a grammar="..." attribute in *c into root/psg.<rank>_of_<size>/
+// and rewrites each line so the attribute points at the copy.  Aborts if root does not exist.
+void MovePerSentenceGrammars(const string& root, int rank, int size, vector<string>* c) {
+  if (!DirectoryExists(root)) {
+    cerr << "Can't find scratch space at " << root << endl;
+    abort();
+  }
+  ostringstream os;
+  os << root << "/psg." << rank << "_of_" << size;
+  const string path = os.str();
+  MkDirP(path);
+  for (unsigned i = 0; i < c->size(); ++i) {
+    string sent = (*c)[i];
+    map<string, string> attr;
+    ProcessAndStripSGML(&sent, &attr);
+    map<string, string>::iterator it = attr.find("grammar");
+    if (it != attr.end()) {
+      const string src_file = it->second;
+      const bool is_gzipped = (src_file.size() > 3) && (src_file.rfind(".gz") == (src_file.size() - 3));
+      string new_name = path + "/" + md5(sent);
+      if (is_gzipped) new_name += ".gz";
+      CopyFile(src_file, new_name);
+      it->second = new_name;
+    }
+    ostringstream ns;
+    ns << SGMLOpenSegTag(attr) << ' ' << sent << " </seg>";
+    (*c)[i] = ns.str();
+  }
+}
+
 int main(int argc, char** argv) {
 #ifdef HAVE_MPI
   mpi::environment env(argc, argv);
@@ -257,6 +289,9 @@ int main(int argc, char** argv) {
   ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
   assert(corpus.size() > 0);
 
+  if (conf.count("per_sentence_grammar_scratch"))
+    MovePerSentenceGrammars(conf["per_sentence_grammar_scratch"].as<string>(), rank, size, &corpus);
+
   TrainingObserver observer;
   while (!converged) {
     observer.Reset();
diff --git a/utils/filelib.cc b/utils/filelib.cc
index a0969b1a..d206fc19 100644
--- a/utils/filelib.cc
+++ b/utils/filelib.cc
@@ -2,6 +2,12 @@
 
 #include <unistd.h>
 #include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <cstdlib>
+#include <cstdio>
+#include <sys/stat.h>
+#include <sys/types.h>
 
 using namespace std;
 
@@ -32,3 +38,16 @@ void MkDirP(const string& dir) {
   }
 }
 
+// Copies inf to outf byte-for-byte, truncating any existing outf; aborts with a
+// diagnostic if either file cannot be opened or the write fails.
+void CopyFile(const string& inf, const string& outf) {
+  ifstream in(inf.c_str(), fstream::binary);
+  if (!in) { perror(inf.c_str()); abort(); }
+  ofstream of(outf.c_str(), fstream::trunc|fstream::binary);
+  // operator<<(streambuf*) sets failbit if nothing is inserted, so skip it for
+  // an empty source file.
+  if (of && in.peek() != ifstream::traits_type::eof()) of << in.rdbuf();
+  of.close();
+  if (!of) { perror(outf.c_str()); abort(); }
+}
+
diff --git a/utils/filelib.h b/utils/filelib.h
index a8622246..bb6e7415 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -113,9 +113,6 @@ inline void CopyFile(std::string const& inf,std::ostream &out) {
   CopyFile(*r,out);
 }
 
-inline void CopyFile(std::string const& inf,std::string const& outf) {
-  WriteFile w(outf);
-  CopyFile(inf,*w);
-}
+void CopyFile(std::string const& inf,std::string const& outf);
 
 #endif
-- 
cgit v1.2.3


From 1afbff874473c79619ce74cdf90f3c312185e4e1 Mon Sep 17 00:00:00 2001
From: Guest_account Guest_account prguest11 <prguest11@taipan.cs>
Date: Sat, 17 Sep 2011 01:39:07 +0100
Subject: add dep

---
 training/cluster-em.pl          | 114 ----------------
 training/cluster-ptrain.pl      | 206 -----------------------------
 training/compute_cllh.cc        | 196 ---------------------------
 training/make-lexcrf-grammar.pl | 285 ----------------------------------------
 training/mpi_compute_cllh.cc    | 196 +++++++++++++++++++++++++++
 utils/stringlib.cc              |  14 +-
 6 files changed, 203 insertions(+), 808 deletions(-)
 delete mode 100755 training/cluster-em.pl
 delete mode 100755 training/cluster-ptrain.pl
 delete mode 100644 training/compute_cllh.cc
 delete mode 100755 training/make-lexcrf-grammar.pl
 create mode 100644 training/mpi_compute_cllh.cc

(limited to 'training')

diff --git a/training/cluster-em.pl b/training/cluster-em.pl
deleted file mode 100755
index 267ab642..00000000
--- a/training/cluster-em.pl
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-my $parallel = 0;
-
-my $CWD=`pwd`; chomp $CWD;
-my $BIN_DIR = "$CWD/..";
-my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce";
-my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights";
-my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter";
-my $DECODER = "$BIN_DIR/decoder/cdec";
-my $COMBINER_CACHE_SIZE = 10000000;
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $REDUCER" unless -f $REDUCER;
-die "Can't execute $REDUCER" unless -x $REDUCER;
-die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS;
-die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS;
-die "Can't find $ADAPTER" unless -f $ADAPTER;
-die "Can't execute $ADAPTER" unless -x $ADAPTER;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2);
-
-my $training_corpus = shift @ARGV;
-my $config = shift @ARGV;
-my $pmem="2500mb";
-my $nodes = 40;
-my $max_iteration = 1000;
-my $CFLAG = "-C 1";
-if ($parallel) {
-  die "Can't find $PARALLEL" unless -f $PARALLEL;
-  die "Can't execute $PARALLEL" unless -x $PARALLEL;
-} else { $CFLAG = "-C 500"; }
-
-my $initial_weights = '';
-
-print STDERR <<EOT;
-EM TRAIN CONFIGURATION INFORMATION
-
-      Config file: $config
-  Training corpus: $training_corpus
-  Initial weights: $initial_weights
-   Decoder memory: $pmem
-  Nodes requested: $nodes
-   Max iterations: $max_iteration
-          restart: $restart
-EOT
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-my $dir = "$CWD/emtrain";
-if ($restart) {
-  die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
-  my $o = `ls -t $dir/weights.*`;
-  my ($a, @x) = split /\n/, $o;
-  if ($a =~ /weights.(\d+)\.gz$/) {
-    $iter = $1;
-  } else {
-    die "Unexpected file: $a!\n";
-  }
-  print STDERR "Restarting at iteration $iter\n";
-} else {
-  die "$dir already exists!\n" if -e $dir;
-  mkdir $dir or die "Can't create $dir: $!";
-
-  if ($initial_weights) {
-    unless ($initial_weights =~ /\.gz$/) {
-      `cp $initial_weights $dir/weights.1`;
-      `gzip -9 $dir/weights.1`;
-    } else {
-      `cp $initial_weights $dir/weights.1.gz`;
-    }
-  }
-}
-
-while ($iter < $max_iteration) {
-  my $cur_time = `date`; chomp $cur_time;
-  print STDERR "\nStarting iteration $iter...\n";
-  print STDERR "  time: $cur_time\n";
-  my $start = time;
-  my $next_iter = $iter + 1;
-  my $WSTR = "-w $dir/weights.$iter.gz";
-  if ($iter == 1) { $WSTR = ''; }
-  my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter";
-  my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
-  my $cmd = "";
-  if ($parallel) { $cmd = $pcmd; }
-  $cmd .= "$dec_cmd";
-  $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz";
-  print STDERR "EXECUTING: $cmd\n";
-  my $result = `$cmd`;
-  if ($? != 0) {
-    die "Error running iteration $iter: $!";
-  }
-  chomp $result;
-  my $end = time;
-  my $diff = ($end - $start);
-  print STDERR "  ITERATION $iter TOOK $diff SECONDS\n";
-  $iter = $next_iter;
-  if ($result =~ /1$/) {
-    print STDERR "Training converged.\n";
-    last;
-  }
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
deleted file mode 100755
index 03122df9..00000000
--- a/training/cluster-ptrain.pl
+++ /dev/null
@@ -1,206 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-
-my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation
-my $CWD=getcwd();
-my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce";
-my $DECODER = "$SCRIPT_DIR/../decoder/cdec";
-my $COMBINER_CACHE_SIZE = 150;
-# This is a hack to run this on a weird cluster,
-# eventually, I'll provide Hadoop scripts.
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
-die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-my $pmem="2500mb";
-my $nodes = 1;
-my $max_iteration = 1000;
-my $PRIOR_FLAG = "";
-my $parallel = 1;
-my $CFLAG = "-C 1";
-my $LOCAL;
-my $DISTRIBUTED;
-my $PRIOR;
-my $OALG = "lbfgs";
-my $sigsq = 1;
-my $means_file;
-my $mem_buffers = 20;
-my $RESTART_IF_NECESSARY;
-GetOptions("cdec=s" => \$DECODER,
-           "distributed" => \$DISTRIBUTED,
-           "sigma_squared=f" => \$sigsq,
-           "lbfgs_memory_buffers=i" => \$mem_buffers,
-           "max_iteration=i" => \$max_iteration,
-           "means=s" => \$means_file,
-           "optimizer=s" => \$OALG,
-           "gaussian_prior" => \$PRIOR,
-           "restart_if_necessary" => \$RESTART_IF_NECESSARY,
-           "jobs=i" => \$nodes,
-           "pmem=s" => \$pmem
-          ) or usage();
-usage() unless scalar @ARGV==3;
-my $config_file = shift @ARGV;
-my $training_corpus = shift @ARGV;
-my $initial_weights = shift @ARGV;
-unless ($DISTRIBUTED) { $LOCAL = 1; }
-die "Can't find $config_file" unless -f $config_file;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
-if ($PRIOR) {
-  $PRIOR_FLAG="-p --sigma_squared $sigsq";
-  if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
-}
-
-if ($parallel) {
-  die "Can't find $PARALLEL" unless -f $PARALLEL;
-  die "Can't execute $PARALLEL" unless -x $PARALLEL;
-}
-unless ($parallel) { $CFLAG = "-C 500"; }
-unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
-my $clines = num_lines($training_corpus);
-my $dir = "$CWD/ptrain";
-
-if ($RESTART_IF_NECESSARY && -d $dir) {
-  $restart = 1;
-}
-
-print STDERR <<EOT;
-PTRAIN CONFIGURATION INFORMATION
-
-      Config file: $config_file
-  Training corpus: $training_corpus
-      Corpus size: $clines
-  Initial weights: $initial_weights
-   Decoder memory: $pmem
-   Max iterations: $max_iteration
-        Optimizer: $OALG
-   Jobs requested: $nodes
-           prior?: $PRIOR_FLAG
-         restart?: $restart
-EOT
-
-if ($OALG) { $OALG="-m $OALG"; }
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-if ($restart) {
-  die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
-  my $o = `ls -t $dir/weights.*`;
-  my ($a, @x) = split /\n/, $o;
-  if ($a =~ /weights.(\d+)\.gz$/) {
-    $iter = $1;
-  } else {
-    die "Unexpected file: $a!\n";
-  }
-  print STDERR "Restarting at iteration $iter\n";
-} else {
-  die "$dir already exists!\n" if -e $dir;
-  mkdir $dir or die "Can't create $dir: $!";
-
-  unless ($initial_weights =~ /\.gz$/) {
-    `cp $initial_weights $dir/weights.1`;
-    `gzip -9 $dir/weights.1`;
-  } else {
-    `cp $initial_weights $dir/weights.1.gz`;
-  }
-  open T, "<$training_corpus" or die "Can't read $training_corpus: $!";
-  open TO, ">$dir/training.in";
-  my $lc = 0;
-  while(<T>) {
-    chomp;
-    s/^\s+//;
-    s/\s+$//;
-    die "Expected A ||| B in input file" unless / \|\|\| /;
-    print TO "<seg id=\"$lc\">$_</seg>\n";
-    $lc++;
-  }
-  close T;
-  close TO;
-}
-$training_corpus = "$dir/training.in";
-
-my $iter_attempts = 1;
-while ($iter < $max_iteration) {
-  my $cur_time = `date`; chomp $cur_time;
-  print STDERR "\nStarting iteration $iter...\n";
-  print STDERR "  time: $cur_time\n";
-  my $start = time;
-  my $next_iter = $iter + 1;
-  my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
-  my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz";
-  my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
-  my $cmd = "";
-  if ($parallel) { $cmd = $pcmd; }
-  $cmd .= "$dec_cmd | $opt_cmd";
-
-  print STDERR "EXECUTING: $cmd\n";
-  my $result = `$cmd`;
-  my $exit_code = $? >> 8;
-  if ($exit_code == 99) {
-    $iter_attempts++;
-    if ($iter_attempts > $MAX_ITER_ATTEMPTS) {
-      die "Received restart request $iter_attempts times from optimizer, giving up\n";
-    }
-    print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n";
-    next;
-  }
-  if ($? != 0) {
-    die "Error running iteration $iter: $!";
-  }
-  chomp $result;
-  my $end = time;
-  my $diff = ($end - $start);
-  print STDERR "  ITERATION $iter TOOK $diff SECONDS\n";
-  $iter = $next_iter;
-  if ($result =~ /1$/) {
-    print STDERR "Training converged.\n";
-    last;
-  }
-  $iter_attempts = 1;
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-`mv $dir/weights.$iter.gz $dir/weights.final.gz`;
-
-sub usage {
-  die <<EOT;
-
-Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
-
-  Options:
-
-    --distributed      Parallelize function evaluation
-    --jobs N           Number of jobs to use
-    --cdec PATH        Path to cdec binary
-    --optimize OPT     lbfgs, rprop, sgd
-    --gaussian_prior   add Gaussian prior
-    --means FILE       if you want means other than 0
-    --sigma_squared S  variance on prior
-    --pmem MEM         Memory required for decoder
-    --lbfgs_memory_buffers Number of buffers to use
-                           with LBFGS optimizer
-
-EOT
-}
-
-sub num_lines {
-  my $file = shift;
-  my $fh;
-  if ($file=~ /\.gz$/) {
-    open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!";
-  } else {
-    open $fh, "<$file" or die "Couldn't read $file: $!";
-  }
-  my $lines = 0;
-  while(<$fh>) { $lines++; }
-  close $fh;
-  return $lines;
-}
diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc
deleted file mode 100644
index b496d196..00000000
--- a/training/compute_cllh.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include "config.h"
-#ifdef HAVE_MPI
-#include <boost/mpi.hpp>
-#endif
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "weights.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("weights,w",po::value<string>(),"Input feature weights file")
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) {
-      c->push_back(line);
-      ids->push_back(lc);
-    }
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    acc_obj = 0;
-  } 
-
-  virtual void NotifyDecodingStart(const SentenceMetadata&) {
-    cur_obj = 0;
-    state = 1;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
-    assert(state == 1);
-    state = 2;
-    SparseVector<prob_t> cur_model_exp;
-    const prob_t z = InsideOutside<prob_t,
-                                   EdgeProb,
-                                   SparseVector<prob_t>,
-                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
-    cur_obj = log(z);
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-    SparseVector<prob_t> ref_exp;
-    const prob_t ref_z = InsideOutside<prob_t,
-                                       EdgeProb,
-                                       SparseVector<prob_t>,
-                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
-
-    double log_ref_z;
-#if 0
-    if (crf_uniform_empirical) {
-      log_ref_z = ref_exp.dot(feature_weights);
-    } else {
-      log_ref_z = log(ref_z);
-    }
-#else
-    log_ref_z = log(ref_z);
-#endif
-
-    // rounding errors means that <0 is too strict
-    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
-      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
-      exit(1);
-    }
-    assert(!isnan(log_ref_z));
-    acc_obj += (cur_obj - log_ref_z);
-  }
-
-  double acc_obj;
-  double cur_obj;
-  int state;
-};
-
-#ifdef HAVE_MPI
-namespace mpi = boost::mpi;
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return false;
-
-  // load cdec.ini and set up decoder
-  ReadFile ini_rf(conf["decoder_config"].as<string>());
-  Decoder decoder(ini_rf.stream());
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    abort();
-  }
-
-  // load weights
-  vector<weight_t>& weights = decoder.CurrentWeightVector();
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
-
-  // freeze feature set
-  //const bool freeze_feature_set = conf.count("freeze_feature_set");
-  //if (freeze_feature_set) FD::Freeze();
-
-  vector<string> corpus; vector<int> ids;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
-  assert(corpus.size() > 0);
-  assert(corpus.size() == ids.size());
-
-  TrainingObserver observer;
-  double objective = 0;
-
-  observer.Reset();
-  if (rank == 0)
-    cerr << "Each processor is decoding " << corpus.size() << " training examples...\n";
-
-  for (int i = 0; i < corpus.size(); ++i) {
-    decoder.SetId(ids[i]);
-    decoder.Decode(corpus[i], &observer);
-  }
-
-#ifdef HAVE_MPI
-  reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
-#else
-  objective = observer.acc_obj;
-#endif
-
-  if (rank == 0)
-    cout << "OBJECTIVE: " << objective << endl;
-
-  return 0;
-}
-
diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl
deleted file mode 100755
index 8cdf7718..00000000
--- a/training/make-lexcrf-grammar.pl
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/usr/bin/perl -w
-use utf8;
-use strict;
-my ($effile, $model1) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1;
-
-open EF, "<$effile" or die;
-open M1, "<$model1" or die;
-binmode(EF,":utf8");
-binmode(M1,":utf8");
-binmode(STDOUT,":utf8");
-my %model1;
-while(<M1>) {
-  chomp;
-  my ($f, $e, $lp) = split /\s+/;
-  $model1{$f}->{$e} = $lp;
-}
-
-my $ADD_MODEL1 = 0;      # found that model1 hurts performance
-my $IS_FRENCH_F = 1;     # indicates that the f language is french
-my $IS_ARABIC_F = 0;     # indicates that the f language is arabic
-my $IS_URDU_F = 0;     # indicates that the f language is arabic
-my $ADD_PREFIX_ID = 0;
-my $ADD_LEN = 1;
-my $ADD_SIM = 1;
-my $ADD_DICE = 1;
-my $ADD_111 = 1;
-my $ADD_ID = 1;
-my $ADD_PUNC = 1;
-my $ADD_NUM_MM = 1;
-my $ADD_NULL = 1;
-my $ADD_STEM_ID = 1;
-my $BEAM_RATIO = 50;
-
-my %fdict;
-my %fcounts;
-my %ecounts;
-
-my %sdict;
-
-while(<EF>) {
-  chomp;
-  my ($f, $e) = split /\s*\|\|\|\s*/;
-  my @es = split /\s+/, $e;
-  my @fs = split /\s+/, $f;
-  for my $ew (@es){ $ecounts{$ew}++; }
-  push @fs, '<eps>' if $ADD_NULL;
-  for my $fw (@fs){ $fcounts{$fw}++; }
-  for my $fw (@fs){
-    for my $ew (@es){
-      $fdict{$fw}->{$ew}++;
-    }
-  }
-}
-
-print STDERR "Dice 0\n" if $ADD_DICE;
-print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111;
-print STDERR "Identical 0\n" if $ADD_ID;
-print STDERR "PuncMiss 0\n" if $ADD_PUNC;
-print STDERR "IsNull 0\n" if $ADD_NULL;
-print STDERR "Model1 0\n" if $ADD_MODEL1;
-print STDERR "DLen 0\n" if $ADD_LEN;
-print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM;
-print STDERR "OrthoSim 0\n" if $ADD_SIM;
-print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID);
-my $fc = 1000000;
-my $sids = 1000000;
-for my $f (sort keys %fdict) {
-  my $re = $fdict{$f};
-  my $max;
-  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
-    my $efcount = $re->{$e};
-    unless (defined $max) { $max = $efcount; }
-    my $m1 = $model1{$f}->{$e};
-    unless (defined $m1) { next; }
-    $fc++;
-    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
-    my $feats = "F$fc=1";
-    my $oe = $e;
-    my $of = $f;   # normalized form
-    if ($IS_FRENCH_F) {
-      # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
-      $of =~ s/â/as/g;
-      $of =~ s/ê/es/g;
-      $of =~ s/î/is/g;
-      $of =~ s/ô/os/g;
-      $of =~ s/û/us/g;
-    } elsif ($IS_ARABIC_F) {
-      if (length($of) > 1 && !($of =~ /\d/)) {
-        $of =~ s/\$/sh/g;
-      }
-    } elsif ($IS_URDU_F) {
-      if (length($of) > 1 && !($of =~ /\d/)) {
-        $of =~ s/\$/sh/g;
-      }
-      $oe =~ s/^-e-//;
-      $oe =~ s/^al-/al/;
-      $of =~ s/([a-z])\~/$1$1/g;
-      $of =~ s/E/'/g;
-      $of =~ s/^Aw/o/g;
-      $of =~ s/\|/a/g;
-      $of =~ s/@/h/g;
-      $of =~ s/c/ch/g;
-      $of =~ s/x/kh/g;
-      $of =~ s/\*/dh/g;
-      $of =~ s/w/o/g;
-      $of =~ s/Z/dh/g;
-      $of =~ s/y/i/g;
-      $of =~ s/Y/a/g;
-      $of = lc $of;
-    }
-    my $len_e = length($oe);
-    my $len_f = length($of);
-    $feats .= " Model1=$m1" if ($ADD_MODEL1);
-    $feats .= " Dice=$dice" if $ADD_DICE;
-    my $is_null = undef;
-    if ($ADD_NULL && $f eq '<eps>') {
-      $feats .= " IsNull=1";
-      $is_null = 1;
-    }
-    if ($ADD_LEN) {
-      if (!$is_null) {
-        my $dlen = abs($len_e - $len_f);
-        $feats .= " DLen=$dlen";
-      }
-    }
-    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
-    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
-    my $both_non_numeric = (!$e_num && !$f_num);
-    if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) {
-      $feats .= " NumMM=1";
-    }
-    if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) {
-      $feats .= " NumMatch=1";
-    }
-    if ($ADD_STEM_ID) {
-      my $el = 4;
-      my $fl = 4;
-      if ($oe =~ /^al|re|co/) { $el++; }
-      if ($of =~ /^al|re|co/) { $fl++; }
-      if ($oe =~ /^trans|inter/) { $el+=2; }
-      if ($of =~ /^trans|inter/) { $fl+=2; }
-      if ($fl > length($of)) { $fl = length($of); }
-      if ($el > length($oe)) { $el = length($oe); }
-      my $sf = substr $of, 0, $fl;
-      my $se = substr $oe, 0, $el;
-      my $id = $sdict{$sf}->{$se};
-      if (!$id) {
-        $sids++;
-	$sdict{$sf}->{$se} = $sids;
-	$id = $sids;
-	print STDERR "S$sids 0\n"
-      }
-      $feats .= " S$id=1";
-    }
-    if ($ADD_PREFIX_ID) {
-      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
-        my $pe = substr $oe, 0, 3;
-        my $pf = substr $of, 0, 3;
-        if ($pe eq $pf) { $feats .= " PfxIdentical=1"; }
-      }
-    }
-    if ($ADD_SIM) {
-      my $ld = 0;
-      my $eff = $len_e;
-      if ($eff < $len_f) { $eff = $len_f; }
-      if (!$is_null) {
-        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
-      }
-      $feats .= " OrthoSim=$ld";
-    }
-    my $ident = ($e eq $f);
-    if ($ident && $ADD_ID) { $feats .= " Identical=1"; }
-    if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) {
-      if ($ident && $ADD_ID) {
-        $feats .= " Id_OneOneOne=1";
-      }
-      $feats .= " OneOneOne=1";
-    }
-    if ($ADD_PUNC) {
-      if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) ||
-          ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) {
-        $feats .= " PuncMiss=1";
-      }
-    }
-    my $r = (0.5 - rand)/5;
-    print STDERR "F$fc $r\n";
-    print "$f ||| $e ||| $feats\n";
-  }
-}
-
-sub levenshtein
-{
-    # $s1 and $s2 are the two strings
-    # $len1 and $len2 are their respective lengths
-    #
-    my ($s1, $s2) = @_;
-    my ($len1, $len2) = (length $s1, length $s2);
-
-    # If one of the strings is empty, the distance is the length
-    # of the other string
-    #
-    return $len2 if ($len1 == 0);
-    return $len1 if ($len2 == 0);
-
-    my %mat;
-
-    # Init the distance matrix
-    #
-    # The first row to 0..$len1
-    # The first column to 0..$len2
-    # The rest to 0
-    #
-    # The first row and column are initialized so to denote distance
-    # from the empty string
-    #
-    for (my $i = 0; $i <= $len1; ++$i)
-    {
-        for (my $j = 0; $j <= $len2; ++$j)
-        {
-            $mat{$i}{$j} = 0;
-            $mat{0}{$j} = $j;
-        }
-
-        $mat{$i}{0} = $i;
-    }
-
-    # Some char-by-char processing is ahead, so prepare
-    # array of chars from the strings
-    #
-    my @ar1 = split(//, $s1);
-    my @ar2 = split(//, $s2);
-
-    for (my $i = 1; $i <= $len1; ++$i)
-    {
-        for (my $j = 1; $j <= $len2; ++$j)
-        {
-            # Set the cost to 1 iff the ith char of $s1
-            # equals the jth of $s2
-            # 
-            # Denotes a substitution cost. When the char are equal
-            # there is no need to substitute, so the cost is 0
-            #
-            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
-            # Cell $mat{$i}{$j} equals the minimum of:
-            #
-            # - The cell immediately above plus 1
-            # - The cell immediately to the left plus 1
-            # - The cell diagonally above and to the left plus the cost
-            #
-            # We can either insert a new char, delete a char or
-            # substitute an existing char (with an associated cost)
-            #
-            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
-                                $mat{$i}{$j-1} + 1,
-                                $mat{$i-1}{$j-1} + $cost]);
-        }
-    }
-
-    # Finally, the Levenshtein distance equals the rightmost bottom cell
-    # of the matrix
-    #
-    # Note that $mat{$x}{$y} denotes the distance between the substrings
-    # 1..$x and 1..$y
-    #
-    return $mat{$len1}{$len2};
-}
-
-
-# minimal element of a list
-#
-sub min
-{
-    my @list = @{$_[0]};
-    my $min = $list[0];
-
-    foreach my $i (@list)
-    {
-        $min = $i if ($i < $min);
-    }
-
-    return $min;
-}
-
diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc
new file mode 100644
index 00000000..b496d196
--- /dev/null
+++ b/training/mpi_compute_cllh.cc
@@ -0,0 +1,196 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w",po::value<string>(),"Input feature weights file")
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  int lc = 0;
+  while(in) {
+    getline(in, line);
+    if (!in) break;
+    if (lc % size == rank) {
+      c->push_back(line);
+      ids->push_back(lc);
+    }
+    ++lc;
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+  void Reset() {
+    acc_obj = 0;
+  } 
+
+  virtual void NotifyDecodingStart(const SentenceMetadata&) {
+    cur_obj = 0;
+    state = 1;
+  }
+
+  // compute model expectations, denominator of objective
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+    assert(state == 1);
+    state = 2;
+    SparseVector<prob_t> cur_model_exp;
+    const prob_t z = InsideOutside<prob_t,
+                                   EdgeProb,
+                                   SparseVector<prob_t>,
+                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+    cur_obj = log(z);
+  }
+
+  // compute "empirical" expectations, numerator of objective
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 2);
+    state = 3;
+    SparseVector<prob_t> ref_exp;
+    const prob_t ref_z = InsideOutside<prob_t,
+                                       EdgeProb,
+                                       SparseVector<prob_t>,
+                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
+
+    double log_ref_z;
+#if 0
+    if (crf_uniform_empirical) {
+      log_ref_z = ref_exp.dot(feature_weights);
+    } else {
+      log_ref_z = log(ref_z);
+    }
+#else
+    log_ref_z = log(ref_z);
+#endif
+
+    // rounding errors means that <0 is too strict
+    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
+      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
+      exit(1);
+    }
+    assert(!isnan(log_ref_z));
+    acc_obj += (cur_obj - log_ref_z);
+  }
+
+  double acc_obj;
+  double cur_obj;
+  int state;
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size(); 
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return false;
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+
+  // load weights
+  vector<weight_t>& weights = decoder.CurrentWeightVector();
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+
+  // freeze feature set
+  //const bool freeze_feature_set = conf.count("freeze_feature_set");
+  //if (freeze_feature_set) FD::Freeze();
+
+  vector<string> corpus; vector<int> ids;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+  assert(corpus.size() > 0);
+  assert(corpus.size() == ids.size());
+
+  TrainingObserver observer;
+  double objective = 0;
+
+  observer.Reset();
+  if (rank == 0)
+    cerr << "Each processor is decoding " << corpus.size() << " training examples...\n";
+
+  for (int i = 0; i < corpus.size(); ++i) {
+    decoder.SetId(ids[i]);
+    decoder.Decode(corpus[i], &observer);
+  }
+
+#ifdef HAVE_MPI
+  reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
+#else
+  objective = observer.acc_obj;
+#endif
+
+  if (rank == 0)
+    cout << "OBJECTIVE: " << objective << endl;
+
+  return 0;
+}
+
diff --git a/utils/stringlib.cc b/utils/stringlib.cc
index 3a56965c..1a152985 100644
--- a/utils/stringlib.cc
+++ b/utils/stringlib.cc
@@ -2,6 +2,7 @@
 
 #include <cstring>
 #include <cstdlib>
+#include <cstdio>
 #include <cassert>
 #include <iostream>
 #include <map>
@@ -104,11 +105,11 @@ public:
   typedef unsigned int size_type; // must be 32bit
 
   MD5();
-  MD5(const std::string& text);
+  MD5(const string& text);
   void update(const unsigned char *buf, size_type length);
   void update(const char *buf, size_type length);
   MD5& finalize();
-  std::string hexdigest() const;
+  string hexdigest() const;
 
 private:
   void init();
@@ -209,7 +210,7 @@ MD5::MD5()
 //////////////////////////////////////////////
 
 // nifty shortcut ctor, compute MD5 for string and finalize it right away
-MD5::MD5(const std::string &text)
+MD5::MD5(const string &text)
 {
   init();
   update(text.c_str(), text.length());
@@ -433,8 +434,7 @@ MD5& MD5::finalize()
 //////////////////////////////
 
 // return hex representation of digest as string
-std::string MD5::hexdigest() const
-{
+string MD5::hexdigest() const {
   if (!finalized)
     return "";
 
@@ -443,12 +443,12 @@ std::string MD5::hexdigest() const
     sprintf(buf+i*2, "%02x", digest[i]);
   buf[32]=0;
 
-  return std::string(buf);
+  return string(buf);
 }
 
 //////////////////////////////
 
-std::string md5(const std::string& str) {
+string md5(const string& str) {
     MD5 md5 = MD5(str);
     return md5.hexdigest();
 }
-- 
cgit v1.2.3


From ce830ec51477f345c811987e11a9ed4322edcac0 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 17 Sep 2011 16:19:11 +0100
Subject: make fix

---
 training/Makefile.am | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'training')

diff --git a/training/Makefile.am b/training/Makefile.am
index 5752859e..0b598fd5 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -13,7 +13,7 @@ bin_PROGRAMS = \
   mpi_extract_features \
   mpi_online_optimize \
   mpi_batch_optimize \
-  compute_cllh \
+  mpi_compute_cllh \
   augment_grammar
 
 noinst_PROGRAMS = \
@@ -34,8 +34,8 @@ mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteva
 mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
 mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-compute_cllh_SOURCES = compute_cllh.cc
-compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc
+mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
 augment_grammar_SOURCES = augment_grammar.cc
 augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-- 
cgit v1.2.3


From 0af7d663194beddcde420349bbd91430e0b2e423 Mon Sep 17 00:00:00 2001
From: Guest_account Guest_account prguest11 <prguest11@taipan.cs>
Date: Tue, 11 Oct 2011 16:16:53 +0100
Subject: remove implicit conversion-to-double operator from LogVal<T> that
 caused overflow errors, clean up some pf code

---
 decoder/aligner.cc              |  2 +-
 decoder/cfg.cc                  |  2 +-
 decoder/cfg_format.h            |  2 +-
 decoder/decoder.cc              | 10 ++++----
 decoder/hg.cc                   |  4 ++--
 decoder/rule_lexer.l            |  2 ++
 decoder/trule.h                 | 15 +++++++++++-
 gi/pf/brat.cc                   | 11 ---------
 gi/pf/cbgi.cc                   | 10 --------
 gi/pf/dpnaive.cc                | 12 ----------
 gi/pf/itg.cc                    | 11 ---------
 gi/pf/pfbrat.cc                 | 11 ---------
 gi/pf/pfdist.cc                 | 11 ---------
 gi/pf/pfnaive.cc                | 11 ---------
 mteval/mbr_kbest.cc             |  4 ++--
 phrasinator/ccrp_nt.h           | 24 +++++++++++++++----
 training/mpi_batch_optimize.cc  |  2 +-
 training/mpi_compute_cllh.cc    | 51 +++++++++++++++++++----------------------
 training/mpi_online_optimize.cc |  4 ++--
 utils/logval.h                  | 10 ++++----
 20 files changed, 78 insertions(+), 131 deletions(-)

(limited to 'training')

diff --git a/decoder/aligner.cc b/decoder/aligner.cc
index 292ee123..53e059fb 100644
--- a/decoder/aligner.cc
+++ b/decoder/aligner.cc
@@ -165,7 +165,7 @@ inline void WriteProbGrid(const Array2D<prob_t>& m, ostream* pos) {
       if (m(i,j) == prob_t::Zero()) {
         os << "\t---X---";
       } else {
-        snprintf(b, 1024, "%0.5f", static_cast<double>(m(i,j)));
+        snprintf(b, 1024, "%0.5f", m(i,j).as_float());
         os << '\t' << b;
       }
     }
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 651978d2..cd7e66e9 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -639,7 +639,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {
     o << '['<<f.goal_nt_name <<']';
     WordID rhs=-goal_nt;
     f.print_rhs(o,*this,&rhs,&rhs+1);
-    if (pushed_inside!=1)
+    if (pushed_inside!=prob_t::One())
       f.print_features(o,pushed_inside);
     o<<'\n';
   }
diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h
index c6a594b8..2f40d483 100755
--- a/decoder/cfg_format.h
+++ b/decoder/cfg_format.h
@@ -101,7 +101,7 @@ struct CFGFormat {
   }
 
   void print_features(std::ostream &o,prob_t p,FeatureVector const& fv=FeatureVector()) const {
-    bool logp=(logprob_feat && p!=1);
+    bool logp=(logprob_feat && p!=prob_t::One());
     if (features || logp) {
       o << partsep;
       if (logp)
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index c4fe3c4d..3b53fd6b 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -325,7 +325,7 @@ struct DecoderImpl {
 
   static void ConvertSV(const SparseVector<prob_t>& src, SparseVector<double>* trg) {
     for (SparseVector<prob_t>::const_iterator it = src.begin(); it != src.end(); ++it)
-      trg->set_value(it->first, it->second);
+      trg->set_value(it->first, it->second.as_float());
   }
 };
 
@@ -788,10 +788,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   const bool show_tree_structure=conf.count("show_tree_structure");
   if (!SILENT) forest_stats(forest,"  Init. forest",show_tree_structure,oracle.show_derivation);
   if (conf.count("show_expected_length")) {
-    const PRPair<double, double> res =
-      Inside<PRPair<double, double>,
-             PRWeightFunction<double, EdgeProb, double, ELengthWeightFunction> >(forest);
-    cerr << "  Expected length  (words): " << res.r / res.p << "\t" << res << endl;
+    const PRPair<prob_t, prob_t> res =
+      Inside<PRPair<prob_t, prob_t>,
+             PRWeightFunction<prob_t, EdgeProb, prob_t, ELengthWeightFunction> >(forest);
+    cerr << "  Expected length  (words): " << (res.r / res.p).as_float() << "\t" << res << endl;
   }
 
   if (conf.count("show_partition")) {
diff --git a/decoder/hg.cc b/decoder/hg.cc
index 3ad17f1a..180986d7 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -157,14 +157,14 @@ prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector<prob_t>* posts) co
   const ScaledEdgeProb weight(scale);
   const ScaledTransitionEventWeightFunction w2(scale);
   SparseVector<prob_t> pv;
-  const double inside = InsideOutside<prob_t,
+  const prob_t inside = InsideOutside<prob_t,
                   ScaledEdgeProb,
                   SparseVector<prob_t>,
                   ScaledTransitionEventWeightFunction>(*this, &pv, weight, w2);
   posts->resize(edges_.size());
   for (int i = 0; i < edges_.size(); ++i)
     (*posts)[i] = prob_t(pv.value(i));
-  return prob_t(inside);
+  return inside;
 }
 
 prob_t Hypergraph::ComputeBestPathThroughEdges(vector<prob_t>* post) const {
diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l
index 9331d8ed..083a5bb1 100644
--- a/decoder/rule_lexer.l
+++ b/decoder/rule_lexer.l
@@ -220,6 +220,8 @@ NT [^\t \[\],]+
                   std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n";
                   abort();
                 }
+		// const bool ignore_grammar_features = false;
+		// if (ignore_grammar_features) scfglex_num_feats = 0;
 		TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity, scfglex_als, scfglex_num_als));
     check_and_update_ctf_stack(rp);
     TRulePtr coarse_rp = ((ctf_level == 0) ? TRulePtr() : ctf_rule_stack.top());
diff --git a/decoder/trule.h b/decoder/trule.h
index 4df4ec90..8eb2a059 100644
--- a/decoder/trule.h
+++ b/decoder/trule.h
@@ -5,7 +5,9 @@
 #include <vector>
 #include <cassert>
 #include <iostream>
-#include <boost/shared_ptr.hpp>
+
+#include "boost/shared_ptr.hpp"
+#include "boost/functional/hash.hpp"
 
 #include "sparse_vector.h"
 #include "wordid.h"
@@ -162,4 +164,15 @@ class TRule {
   bool SanityCheck() const;
 };
 
+inline size_t hash_value(const TRule& r) {
+  size_t h = boost::hash_value(r.e_);
+  boost::hash_combine(h, -r.lhs_);
+  boost::hash_combine(h, boost::hash_value(r.f_));
+  return h;
+}
+
+inline bool operator==(const TRule& a, const TRule& b) {
+  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
+}
+
 #endif
diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc
index 4c6ba3ef..7b60ef23 100644
--- a/gi/pf/brat.cc
+++ b/gi/pf/brat.cc
@@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE;
 static unsigned kMAX_TRG_PHRASE;
 struct FSTState;
 
-size_t hash_value(const TRule& r) {
-  size_t h = 2 - r.lhs_;
-  boost::hash_combine(h, boost::hash_value(r.e_));
-  boost::hash_combine(h, boost::hash_value(r.f_));
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
 double log_poisson(unsigned x, const double& lambda) {
   assert(lambda > 0.0);
   return log(lambda) * x - lgamma(x + 1) - lambda;
diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc
index 20204e8a..97f1ba34 100644
--- a/gi/pf/cbgi.cc
+++ b/gi/pf/cbgi.cc
@@ -27,16 +27,6 @@ double log_decay(unsigned x, const double& b) {
   return log(b - 1) - x * log(b);
 }
 
-size_t hash_value(const TRule& r) {
-  // TODO fix hash function
-  size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_;
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
 struct SimpleBase {
   SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) :
     uniform_e(-log(esize)),
diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc
index 582d1be7..608f73d5 100644
--- a/gi/pf/dpnaive.cc
+++ b/gi/pf/dpnaive.cc
@@ -20,18 +20,6 @@ namespace po = boost::program_options;
 
 static unsigned kMAX_SRC_PHRASE;
 static unsigned kMAX_TRG_PHRASE;
-struct FSTState;
-
-size_t hash_value(const TRule& r) {
-  size_t h = 2 - r.lhs_;
-  boost::hash_combine(h, boost::hash_value(r.e_));
-  boost::hash_combine(h, boost::hash_value(r.f_));
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
 
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc
index 2c2a86f9..ac3c16a3 100644
--- a/gi/pf/itg.cc
+++ b/gi/pf/itg.cc
@@ -27,17 +27,6 @@ ostream& operator<<(ostream& os, const vector<WordID>& p) {
   return os << ']';
 }
 
-size_t hash_value(const TRule& r) {
-  size_t h = boost::hash_value(r.e_);
-  boost::hash_combine(h, -r.lhs_);
-  boost::hash_combine(h, boost::hash_value(r.f_));
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
 double log_poisson(unsigned x, const double& lambda) {
   assert(lambda > 0.0);
   return log(lambda) * x - lgamma(x + 1) - lambda;
diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc
index 4c6ba3ef..7b60ef23 100644
--- a/gi/pf/pfbrat.cc
+++ b/gi/pf/pfbrat.cc
@@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE;
 static unsigned kMAX_TRG_PHRASE;
 struct FSTState;
 
-size_t hash_value(const TRule& r) {
-  size_t h = 2 - r.lhs_;
-  boost::hash_combine(h, boost::hash_value(r.e_));
-  boost::hash_combine(h, boost::hash_value(r.f_));
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
 double log_poisson(unsigned x, const double& lambda) {
   assert(lambda > 0.0);
   return log(lambda) * x - lgamma(x + 1) - lambda;
diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc
index 18dfd03b..81abd61b 100644
--- a/gi/pf/pfdist.cc
+++ b/gi/pf/pfdist.cc
@@ -24,17 +24,6 @@ namespace po = boost::program_options;
 
 shared_ptr<MT19937> prng;
 
-size_t hash_value(const TRule& r) {
-  size_t h = boost::hash_value(r.e_);
-  boost::hash_combine(h, -r.lhs_);
-  boost::hash_combine(h, boost::hash_value(r.f_));
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc
index 43c604c3..c30e7c4f 100644
--- a/gi/pf/pfnaive.cc
+++ b/gi/pf/pfnaive.cc
@@ -24,17 +24,6 @@ namespace po = boost::program_options;
 
 shared_ptr<MT19937> prng;
 
-size_t hash_value(const TRule& r) {
-  size_t h = boost::hash_value(r.e_);
-  boost::hash_combine(h, -r.lhs_);
-  boost::hash_combine(h, boost::hash_value(r.f_));
-  return h;
-}
-
-bool operator==(const TRule& a, const TRule& b) {
-  return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_);
-}
-
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc
index 2867b36b..64a6a8bf 100644
--- a/mteval/mbr_kbest.cc
+++ b/mteval/mbr_kbest.cc
@@ -32,7 +32,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 }
 
 struct LossComparer {
-  bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const {
+  bool operator()(const pair<vector<WordID>, prob_t>& a, const pair<vector<WordID>, prob_t>& b) const {
     return a.second < b.second;
   }
 };
@@ -108,7 +108,7 @@ int main(int argc, char** argv) {
           ScoreP s = scorer->ScoreCandidate(list[j].first);
           double loss = 1.0 - s->ComputeScore();
           if (type == TER || type == AER) loss = 1.0 - loss;
-          double weighted_loss = loss * (joints[j] / marginal);
+          double weighted_loss = loss * (joints[j] / marginal).as_float();
           wl_acc += weighted_loss;
           if ((!output_list) && wl_acc > mbr_loss) break;
         }
diff --git a/phrasinator/ccrp_nt.h b/phrasinator/ccrp_nt.h
index 163b643a..811bce73 100644
--- a/phrasinator/ccrp_nt.h
+++ b/phrasinator/ccrp_nt.h
@@ -50,15 +50,26 @@ class CCRP_NoTable {
     return it->second;
   }
 
-  void increment(const Dish& dish) {
-    ++custs_[dish];
+  int increment(const Dish& dish) {
+    int table_diff = 0;
+    if (++custs_[dish] == 1)
+      table_diff = 1;
     ++num_customers_;
+    return table_diff;
   }
 
-  void decrement(const Dish& dish) {
-    if ((--custs_[dish]) == 0)
+  int decrement(const Dish& dish) {
+    int table_diff = 0;
+    int nc = --custs_[dish];
+    if (nc == 0) {
       custs_.erase(dish);
+      table_diff = -1;
+    } else if (nc < 0) {
+      std::cerr << "Dish counts dropped below zero for: " << dish << std::endl;
+      abort();
+    }
     --num_customers_;
+    return table_diff;
   }
 
   double prob(const Dish& dish, const double& p0) const {
@@ -66,6 +77,11 @@ class CCRP_NoTable {
     return (at_table + p0 * concentration_) / (num_customers_ + concentration_);
   }
 
+  double logprob(const Dish& dish, const double& logp0) const {
+    const unsigned at_table = num_customers(dish);
+    return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_);
+  }
+
   double log_crp_prob() const {
     return log_crp_prob(concentration_);
   }
diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
index 0ba8c530..046e921c 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/mpi_batch_optimize.cc
@@ -92,7 +92,7 @@ struct TrainingObserver : public DecoderObserver {
   void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
     *o = acc_obj;
     for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
-      (*g)[it->first] = it->second;
+      (*g)[it->first] = it->second.as_float();
   }
 
   virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc
index b496d196..d5caa745 100644
--- a/training/mpi_compute_cllh.cc
+++ b/training/mpi_compute_cllh.cc
@@ -1,6 +1,4 @@
-#include <sstream>
 #include <iostream>
-#include <fstream>
 #include <vector>
 #include <cassert>
 #include <cmath>
@@ -12,6 +10,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "sentence_metadata.h"
 #include "verbose.h"
 #include "hg.h"
 #include "prob.h"
@@ -52,7 +51,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   return true;
 }
 
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
+void ReadInstances(const string& fname, int rank, int size, vector<string>* c) {
+  assert(fname != "-");
   ReadFile rf(fname);
   istream& in = *rf.stream();
   string line;
@@ -60,20 +60,16 @@ void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>*
   while(in) {
     getline(in, line);
     if (!in) break;
-    if (lc % size == rank) {
-      c->push_back(line);
-      ids->push_back(lc);
-    }
+    if (lc % size == rank) c->push_back(line);
     ++lc;
   }
 }
 
 static const double kMINUS_EPSILON = -1e-6;
 
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    acc_obj = 0;
-  } 
+struct ConditionalLikelihoodObserver : public DecoderObserver {
+
+  ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {}
 
   virtual void NotifyDecodingStart(const SentenceMetadata&) {
     cur_obj = 0;
@@ -120,8 +116,10 @@ struct TrainingObserver : public DecoderObserver {
     }
     assert(!isnan(log_ref_z));
     acc_obj += (cur_obj - log_ref_z);
+    trg_words += smeta.GetReference().size();
   }
 
+  unsigned trg_words;
   double acc_obj;
   double cur_obj;
   int state;
@@ -161,35 +159,33 @@ int main(int argc, char** argv) {
   if (conf.count("weights"))
     Weights::InitFromFile(conf["weights"].as<string>(), &weights);
 
-  // freeze feature set
-  //const bool freeze_feature_set = conf.count("freeze_feature_set");
-  //if (freeze_feature_set) FD::Freeze();
-
-  vector<string> corpus; vector<int> ids;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+  vector<string> corpus;
+  ReadInstances(conf["training_data"].as<string>(), rank, size, &corpus);
   assert(corpus.size() > 0);
-  assert(corpus.size() == ids.size());
-
-  TrainingObserver observer;
-  double objective = 0;
 
-  observer.Reset();
   if (rank == 0)
-    cerr << "Each processor is decoding " << corpus.size() << " training examples...\n";
+    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
 
-  for (int i = 0; i < corpus.size(); ++i) {
-    decoder.SetId(ids[i]);
+  ConditionalLikelihoodObserver observer;
+  for (int i = 0; i < corpus.size(); ++i)
     decoder.Decode(corpus[i], &observer);
-  }
 
+  double objective = 0;
+  unsigned total_words = 0;
 #ifdef HAVE_MPI
   reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
+  reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
 #else
   objective = observer.acc_obj;
+  total_words = observer.trg_words;  // no reduce without MPI: use the local count so entropy/perplexity do not divide by 0
 #endif
 
-  if (rank == 0)
-    cout << "OBJECTIVE: " << objective << endl;
+  if (rank == 0) {
+    cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl;
+    cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl;
+    cout << "         CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl;
+    cout << "                  PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl;
+  }
 
   return 0;
 }
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 2ef4a2e7..f87b7274 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -94,7 +94,7 @@ struct TrainingObserver : public DecoderObserver {
   void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
     *o = acc_obj;
     for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
-      (*g)[it->first] = it->second;
+      (*g)[it->first] = it->second.as_float();
   }
 
   virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
@@ -158,7 +158,7 @@ struct TrainingObserver : public DecoderObserver {
   void GetGradient(SparseVector<double>* g) const {
     g->clear();
     for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
-      g->set_value(it->first, it->second);
+      g->set_value(it->first, it->second.as_float());
   }
 
   int total_complete;
diff --git a/utils/logval.h b/utils/logval.h
index 6fdc2c42..8a59d0b1 100644
--- a/utils/logval.h
+++ b/utils/logval.h
@@ -25,12 +25,13 @@ class LogVal {
   typedef LogVal<T> Self;
 
   LogVal() : s_(), v_(LOGVAL_LOG0) {}
-  explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {}
+  LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {}
+  const Self& operator=(double x) { s_ = std::signbit(x); v_ = s_ ? std::log(-x) : std::log(x); return *this; }
   LogVal(init_minus_1) : s_(true),v_(0) {  }
   LogVal(init_1) : s_(),v_(0) {  }
   LogVal(init_0) : s_(),v_(LOGVAL_LOG0) {  }
-  LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
-  LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
+  explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {}
+  explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { }
   LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {}
   LogVal(double lnx,init_lnx) : s_(),v_(lnx) {}
   static Self exp(T lnx) { return Self(lnx,false); }
@@ -141,9 +142,6 @@ class LogVal {
     return pow(1/root);
   }
 
-  operator T() const {
-    if (s_) return -std::exp(v_); else return std::exp(v_);
-  }
   T as_float() const {
     if (s_) return -std::exp(v_); else return std::exp(v_);
   }
-- 
cgit v1.2.3


From 171027795ba3a01ba2ed82d7036610ac397e1fe8 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 14 Oct 2011 11:51:12 +0100
Subject: remove FSA integration code. will have to be resurrected another day

---
 decoder/Makefile.am             |   1 -
 decoder/apply_fsa_models.cc     | 798 ----------------------------------------
 decoder/cdec_ff.cc              |  13 -
 decoder/feature_accum.h         | 129 -------
 decoder/ff_factory.h            |   2 -
 decoder/ff_from_fsa.h           | 304 ---------------
 decoder/ff_fsa.h                | 401 --------------------
 decoder/ff_fsa_data.h           | 131 -------
 decoder/ff_fsa_dynamic.h        | 208 -----------
 decoder/ff_lm.cc                |  48 ---
 decoder/ff_lm_fsa.h             | 140 -------
 decoder/ff_register.h           |  38 --
 decoder/hg_test.cc              |  16 +-
 training/mpi_online_optimize.cc |   2 +
 14 files changed, 10 insertions(+), 2221 deletions(-)
 delete mode 100755 decoder/apply_fsa_models.cc
 delete mode 100755 decoder/feature_accum.h
 delete mode 100755 decoder/ff_from_fsa.h
 delete mode 100755 decoder/ff_fsa.h
 delete mode 100755 decoder/ff_fsa_data.h
 delete mode 100755 decoder/ff_fsa_dynamic.h
 delete mode 100755 decoder/ff_lm_fsa.h

(limited to 'training')

diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index ede1cff0..6b9360d8 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -42,7 +42,6 @@ libcdec_a_SOURCES = \
   cfg.cc \
   dwarf.cc \
   ff_dwarf.cc \
-  apply_fsa_models.cc \
   rule_lexer.cc \
   fst_translator.cc \
   csplit.cc \
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc
deleted file mode 100755
index 3e93cadd..00000000
--- a/decoder/apply_fsa_models.cc
+++ /dev/null
@@ -1,798 +0,0 @@
-//see apply_fsa_models.README for notes on the l2r earley fsa+cfg intersection
-//implementation in this file (also some comments in this file)
-#define SAFE_VALGRIND 1
-
-#include "apply_fsa_models.h"
-#include <stdexcept>
-#include <cassert>
-#include <queue>
-#include <stdint.h>
-
-#include "writer.h"
-#include "hg.h"
-#include "ff_fsa_dynamic.h"
-#include "ff_from_fsa.h"
-#include "feature_vector.h"
-#include "stringlib.h"
-#include "apply_models.h"
-#include "cfg.h"
-#include "hg_cfg.h"
-#include "utoa.h"
-#include "hash.h"
-#include "value_array.h"
-#include "d_ary_heap.h"
-#include "agenda.h"
-#include "show.h"
-#include "string_to.h"
-
-
-#define DFSA(x) x
-//fsa earley chart
-
-#define DPFSA(x) x
-//prefix trie
-
-#define DBUILDTRIE(x)
-
-#define PRINT_PREFIX 1
-#if PRINT_PREFIX
-# define IF_PRINT_PREFIX(x) x
-#else
-# define IF_PRINT_PREFIX(x)
-#endif
-// keep backpointers in prefix trie so you can print a meaningful node id
-
-static const unsigned FSA_AGENDA_RESERVE=10; // TODO: increase to 1<<24 (16M)
-
-using namespace std;
-
-//impl details (not exported).  flat namespace for my ease.
-
-typedef CFG::RHS RHS;
-typedef CFG::BinRhs BinRhs;
-typedef CFG::NTs NTs;
-typedef CFG::NT NT;
-typedef CFG::NTHandle NTHandle;
-typedef CFG::Rules Rules;
-typedef CFG::Rule Rule;
-typedef CFG::RuleHandle RuleHandle;
-
-namespace {
-
-/*
-
-1) A -> x . * (trie)
-
-this is somewhat nice.  cost pushed for best first, of course.  similar benefit as left-branching binarization without the explicit predict/complete steps?
-
-vs. just
-
-2) * -> x . y
-
-here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete.  of course, the prediction of items -> . x y can occur lazy best-first.
-
-vs.
-
-3) * -> x . *
-
-with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse.  this is not a good option at all.
-
-I'm using option 1.
-*/
-
-// if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently.  if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion.  this is effectively left-branching binarization, of course.
-
-template <class K,class V,class Hash>
-struct fsa_map_type {
-  typedef std::map<K,V> type; // change to HASH_MAP ?
-};
-//template typedef - and macro to make it less painful
-#define FSA_MAP(k,v) fsa_map_type<k,v,boost::hash<k> >::type
-
-struct PrefixTrieNode;
-typedef PrefixTrieNode *NodeP;
-typedef PrefixTrieNode const *NodePc;
-
-// for debugging prints only
-struct TrieBackP {
-  WordID w;
-  NodePc from;
-  TrieBackP(WordID w=0,NodePc from=0) : w(w),from(from) {  }
-};
-
-FsaFeatureFunction const* print_fsa=0;
-CFG const* print_cfg=0;
-inline ostream& print_cfg_rhs(std::ostream &o,WordID w,CFG const*pcfg=print_cfg) {
-  if (pcfg)
-    pcfg->print_rhs_name(o,w);
-  else
-    CFG::static_print_rhs_name(o,w);
-  return o;
-}
-
-inline std::string nt_name(WordID n,CFG const*pcfg=print_cfg) {
-  if (pcfg) return pcfg->nt_name(n);
-  return CFG::static_nt_name(n);
-}
-
-template <class V>
-ostream& print_by_nt(std::ostream &o,V const& v,CFG const*pcfg=print_cfg,char const* header="\nNT -> X\n") {
-  o<<header;
-  for (int i=0;i<v.size();++i)
-    o << nt_name(i,pcfg) << " -> "<<v[i]<<"\n";
-  return o;
-}
-
-template <class V>
-ostream& print_map_by_nt(std::ostream &o,V const& v,CFG const*pcfg=print_cfg,char const* header="\nNT -> X\n") {
-  o<<header;
-  for (typename V::const_iterator i=v.begin(),e=v.end();i!=e;++i) {
-    print_cfg_rhs(o,i->first,pcfg) << " -> "<<i->second<<"\n";
-  }
-  return o;
-}
-
-struct PrefixTrieEdge {
-  PrefixTrieEdge()
-  //    : dest(0),w(TD::max_wordid)
-  {}
-  PrefixTrieEdge(WordID w,NodeP dest)
-    : dest(dest),w(w)
-  {}
-//  explicit PrefixTrieEdge(best_t p) : p(p),dest(0) {  }
-
-  best_t p;// viterbi additional prob, i.e. product over path incl. p_final = total rule prob.  note: for final edge, set this.
-  //DPFSA()
-  // we can probably just store deltas, but for debugging remember the full p
-  //    best_t delta; //
-  NodeP dest;
-  bool is_final() const { return dest==0; }
-  best_t p_dest() const;
-  WordID w; // for root and and is_final(), this will be (negated) NTHandle.
-
-  // for sorting most probable first in adj; actually >(p)
-  inline bool operator <(PrefixTrieEdge const& o) const {
-    return o.p<p;
-  }
-  PRINT_SELF(PrefixTrieEdge)
-  void print(std::ostream &o) const {
-    print_cfg_rhs(o,w);
-    o<<"{"<<p<<"}->"<<dest;
-  }
-};
-
-//note: ending a rule is handled with a special final edge, so that possibility can be explored in best-first order along with the rest (alternative: always finish a final rule by putting it on queue).  this edge has no symbol on it.
-struct PrefixTrieNode {
-  best_t p; // viterbi (max prob) of rule this node leads to - when building.  telescope later onto edges for best-first.
-//  bool final; // may also have successors, of course.  we don't really need to track this; a null dest edge in the adj list lets us encounter the fact in best first order.
-  void p_delta(int next,best_t &p) const {
-    p*=adj[next].p;
-  }
-  void inc_adj(int &next,best_t &p) const {
-    p/=adj[next].p; //TODO: cache deltas
-    ++next;
-    p*=adj[next].p;
-  }
-
-
-  typedef TrieBackP BP;
-  typedef std::vector<BP> BPs;
-  void back_vec(BPs &ns) const {
-    IF_PRINT_PREFIX(if(backp.from) { ns.push_back(backp); backp.from->back_vec(ns); })
-  }
-
-  BPs back_vec() const {
-    BPs ret;
-    back_vec(ret);
-    return ret;
-  }
-
-  unsigned size() const {
-    unsigned a=adj.size();
-    unsigned e=edge_for.size();
-    return a>e?a:e;
-  }
-
-  void print_back_str(std::ostream &o) const {
-    BPs back=back_vec();
-    unsigned i=back.size();
-    if (!i) {
-      o<<"PrefixTrieNode@"<<(uintptr_t)this;
-      return;
-    }
-    bool first=true;
-    while (i--<=0) {
-      if (!first) o<<',';
-      first=false;
-      WordID w=back[i].w;
-      print_cfg_rhs(o,w);
-    }
-  }
-  std::string back_str() const {
-    std::ostringstream o;
-    print_back_str(o);
-    return o.str();
-  }
-
-//  best_t p_final; // additional prob beyond what we already paid. while building, this is the total prob
-// instead of storing final, we'll say that an edge with a NULL dest is a final edge.  this way it gets sorted into the list of adj.
-
-  // instead of completed map, we have trie start w/ lhs.
-  NTHandle lhs; // nonneg. - instead of storing this in Item.
-  IF_PRINT_PREFIX(BP backp;)
-
-  enum { ROOT=-1 };
-  explicit PrefixTrieNode(NTHandle lhs=ROOT,best_t p=1) : p(p),lhs(lhs),IF_PRINT_PREFIX(backp()) {
-    //final=false;
-  }
-  bool is_root() const { return lhs==ROOT; } // means adj are the nonneg lhs indices, and we have the index edge_for still available
-
-  // outgoing edges will be ordered highest p to worst p
-
-  typedef FSA_MAP(WordID,PrefixTrieEdge) PrefixTrieEdgeFor;
-public:
-  PrefixTrieEdgeFor edge_for; //TODO: move builder elsewhere?  then need 2nd hash or edge include pointer to builder.  just clear this later
-  bool have_adj() const {
-    return adj.size()>=edge_for.size();
-  }
-  bool no_adj() const {
-    return adj.empty();
-  }
-
-  void index_adj() {
-    index_adj(edge_for);
-  }
-  template <class M>
-  void index_adj(M &m) {
-    assert(have_adj());
-    m.clear();
-    for (int i=0;i<adj.size();++i) {
-      PrefixTrieEdge const& e=adj[i];
-      SHOWM2(DPFSA,"index_adj",i,e);
-      m[e.w]=e;
-    }
-  }
-  template <class PV>
-  void index_lhs(PV &v) {
-    for (int i=0,e=adj.size();i!=e;++i) {
-      PrefixTrieEdge const& edge=adj[i];
-      // assert(edge.p.is_1());  // actually, after done_building, e will have telescoped dest->p/p.
-      NTHandle n=-edge.w;
-      assert(n>=0);
-//      SHOWM3(DPFSA,"index_lhs",i,edge,n);
-      v[n]=edge.dest;
-    }
-  }
-
-  template <class PV>
-  void done_root(PV &v) {
-    assert(is_root());
-    SHOWM1(DBUILDTRIE,"done_root",OSTRF1(print_map_by_nt,edge_for));
-    done_building_r(); //sets adj
-    SHOWM1(DBUILDTRIE,"done_root",OSTRF1(print_by_nt,adj));
-//    SHOWM1(DBUILDTRIE,done_root,adj);
-//    index_adj(); // we want an index for the root node?.  don't think so - index_lhs handles it.  also we stopped clearing edge_for.
-    index_lhs(v); // uses adj
-  }
-
-  // call only once.
-  void done_building_r() {
-    done_building();
-    for (int i=0;i<adj.size();++i)
-      if (adj[i].dest) // skip final edge
-        adj[i].dest->done_building_r();
-  }
-
-  // for done_building; compute incremental (telescoped) edge p
-  PrefixTrieEdge /*const&*/ operator()(PrefixTrieEdgeFor::value_type & pair) const {
-    PrefixTrieEdge &e=pair.second;//const_cast<PrefixTrieEdge&>(pair.second);
-    e.p=e.p_dest()/p;
-    return e;
-  }
-
-  // call only once.
-  void done_building() {
-    SHOWM3(DBUILDTRIE,"done_building",edge_for.size(),adj.size(),1);
-#if 1
-    adj.reinit_map(edge_for,*this);
-#else
-    adj.reinit(edge_for.size());
-    SHOWM3(DBUILDTRIE,"done_building_reinit",edge_for.size(),adj.size(),2);
-    Adj::iterator o=adj.begin();
-    for (PrefixTrieEdgeFor::iterator i=edge_for.begin(),e=edge_for.end();i!=e;++i) {
-      SHOWM3(DBUILDTRIE,"edge_for",o-adj.begin(),i->first,i->second);
-      PrefixTrieEdge &edge=i->second;
-      edge.p=(edge.dest->p)/p;
-      *o++=edge;
-//      (*this)(*i);
-    }
-#endif
-    SHOWM1(DBUILDTRIE,"done building adj",prange(adj.begin(),adj.end(),true));
-    assert(adj.size()==edge_for.size());
-//    if (final) p_final/=p;
-    std::sort(adj.begin(),adj.end());
-    //TODO: store adjacent differences on edges (compared to
-  }
-
-  typedef ValueArray<PrefixTrieEdge>  Adj;
-//  typedef vector<PrefixTrieEdge> Adj;
-  Adj adj;
-
-  typedef WordID W;
-
-  // let's compute p_min so that every rule reachable from the created node has p at least this low.
-  NodeP improve_edge(PrefixTrieEdge const& e,best_t rulep) {
-    NodeP d=e.dest;
-    maybe_improve(d->p,rulep);
-    return d;
-  }
-
-  inline NodeP build(W w,best_t rulep) {
-    return build(lhs,w,rulep);
-  }
-  inline NodeP build_lhs(NTHandle n,best_t rulep) {
-    return build(n,-n,rulep);
-  }
-
-  NodeP build(NTHandle lhs_,W w,best_t rulep) {
-    PrefixTrieEdgeFor::iterator i=edge_for.find(w);
-    if (i!=edge_for.end())
-      return improve_edge(i->second,rulep);
-    NodeP r=new PrefixTrieNode(lhs_,rulep);
-    IF_PRINT_PREFIX(r->backp=BP(w,this));
-//    edge_for.insert(i,PrefixTrieEdgeFor::value_type(w,PrefixTrieEdge(w,r)));
-    add(edge_for,w,PrefixTrieEdge(w,r));
-    SHOWM4(DBUILDTRIE,"built node",this,w,*r,r);
-    return r;
-  }
-
-  void set_final(NTHandle lhs_,best_t pf) {
-    assert(no_adj());
-//    final=true;
-    PrefixTrieEdge &e=edge_for[null_wordid];
-    e.p=pf;
-    e.dest=0;
-    e.w=lhs_;
-    maybe_improve(p,pf);
-  }
-
-private:
-  void destroy_children() {
-    assert(adj.size()>=edge_for.size());
-    for (int i=0,e=adj.size();i<e;++i) {
-      NodeP c=adj[i].dest;
-      if (c) { // final state has no end
-        delete c;
-      }
-    }
-  }
-public:
-  ~PrefixTrieNode() {
-    destroy_children();
-  }
-  void print(std::ostream &o) const {
-    o << "Node"<<this<< ": "<<lhs << "->" << p;
-    o << ',' << size() << ',';
-    print_back_str(o);
-  }
-  PRINT_SELF(PrefixTrieNode)
-};
-
-inline best_t PrefixTrieEdge::p_dest() const {
-  return dest ? dest->p : p; // for final edge, p was set (no sentinel node)
-}
-
-
-//Trie starts with lhs (nonneg index), then continues w/ rhs (mixed >0 word, else NT)
-// trie ends with final edge, which points to a per-lhs prefix node
-struct PrefixTrie {
-  void print(std::ostream &o) const {
-    o << cfgp << ' ' << root;
-  }
-  PRINT_SELF(PrefixTrie);
-  CFG *cfgp;
-  Rules const* rulesp;
-  Rules const& rules() const { return *rulesp; }
-  CFG const& cfg() const { return *cfgp; }
-  PrefixTrieNode root;
-  typedef std::vector<NodeP> LhsToTrie; // will have to check lhs2[lhs].p for best cost of some rule with that lhs, then use edge deltas after?  they're just caching a very cheap computation, really
-  LhsToTrie lhs2; // no reason to use a map or hash table; every NT in the CFG will have some rule rhses.  lhs_to_trie[i]=root.edge_for[i], i.e. we still have a root trie node conceptually, we just access through this since it's faster.
-  typedef LhsToTrie LhsToComplete;
-  LhsToComplete lhs2complete; // the sentinel "we're completing" node (dot at end) for that lhs.  special case of suffix-set=same trie minimization (aka right branching binarization) // these will be used to track kbest completions, along with a l state (r state will be in the list)
-  PrefixTrie(CFG &cfg) : cfgp(&cfg),rulesp(&cfg.rules),lhs2(cfg.nts.size(),0),lhs2complete(cfg.nts.size()) {
-//    cfg.SortLocalBestFirst(); // instead we'll sort in done_building_r
-    print_cfg=cfgp;
-    SHOWM2(DBUILDTRIE,"PrefixTrie()",rulesp->size(),lhs2.size());
-    cfg.VisitRuleIds(*this);
-    root.done_root(lhs2);
-    SHOWM3(DBUILDTRIE,"done w/ PrefixTrie: ",root,root.adj.size(),lhs2.size());
-    DBUILDTRIE(print_by_nt(cerr,lhs2,cfgp));
-    SHOWM1(DBUILDTRIE,"lhs2",OSTRF2(print_by_nt,lhs2,cfgp));
-  }
-
-  void operator()(int ri) {
-    Rule const& r=rules()[ri];
-    NTHandle lhs=r.lhs;
-    best_t p=r.p;
-//    NodeP n=const_cast<PrefixTrieNode&>(root).build_lhs(lhs,p);
-    NodeP n=root.build_lhs(lhs,p);
-    SHOWM4(DBUILDTRIE,"Prefixtrie rule id, root",ri,root,p,*n);
-    for (RHS::const_iterator i=r.rhs.begin(),e=r.rhs.end();;++i) {
-      SHOWM2(DBUILDTRIE,"PrefixTrie build or final",i-r.rhs.begin(),*n);
-      if (i==e) {
-        n->set_final(lhs,p);
-        break;
-      }
-      n=n->build(*i,p);
-      SHOWM2(DBUILDTRIE,"PrefixTrie built",*i,*n);
-    }
-//    root.build(lhs,r.p)->build(r.rhs,r.p);
-  }
-  inline NodeP lhs2_ex(NTHandle n) const {
-    NodeP r=lhs2[n];
-    if (!r) throw std::runtime_error("PrefixTrie: no CFG rule w/ lhs "+cfgp->nt_name(n));
-    return r;
-  }
-private:
-  PrefixTrie(PrefixTrie const& o);
-};
-
-
-
-typedef std::size_t ItemHash;
-
-
-struct ItemKey {
-  explicit ItemKey(NodeP start,Bytes const& start_state) : dot(start),q(start_state),r(start_state) {  }
-  explicit ItemKey(NodeP dot) : dot(dot) {  }
-  NodeP dot; // dot is a function of the stuff already recognized, and gives a set of suffixes y to complete to finish a rhs for lhs() -> dot y.  for a lhs A -> . *, this will point to lh2[A]
-  Bytes q,r; // (q->r are the fsa states; if r is empty it means
-  bool operator==(ItemKey const& o) const {
-    return dot==o.dot && q==o.q && r==o.r;
-  }
-  inline ItemHash hash() const {
-    ItemHash h=GOLDEN_MEAN_FRACTION*(ItemHash)(dot-NULL); // i.e. lower order bits of ptr are nonrandom
-    using namespace boost;
-    hash_combine(h,q);
-    hash_combine(h,r);
-    return h;
-  }
-  template<class O>
-  void print(O &o) const {
-    o<<"lhs="<<lhs();
-    if (dot)
-      dot->print_back_str(o);
-    if (print_fsa) {
-      o<<'/';
-      print_fsa->print_state(o,&q[0]);
-      o<<"->";
-      print_fsa->print_state(o,&r[0]);
-    }
-  }
-  NTHandle lhs() const { return dot->lhs; }
-  PRINT_SELF(ItemKey)
-};
-inline ItemHash hash_value(ItemKey const& x) {
-  return x.hash();
-}
-ItemKey null_item((PrefixTrieNode*)0);
-
-struct Item;
-typedef Item *ItemP;
-
-/* we use a single type of item so it can live in a single best-first queue.  we hold them by pointer so they can have mutable state, e.g. priority/location, but also lists of predictions and kbest completions (i.e. completions[L,r] = L -> * (r,s), by 1best for each possible s.  we may discover more s later.  we could use different subtypes since we hold by pointer, but for now everything will be packed as variants of Item */
-#undef INIT_LOCATION
-#if D_ARY_TRACK_OUT_OF_HEAP
-# define INIT_LOCATION                           , location(D_ARY_HEAP_NULL_INDEX)
-#elif !defined(NDEBUG) || SAFE_VALGRIND
- // avoid spurious valgrind warning - FIXME: still complains???
-# define INIT_LOCATION                           , location()
-#else
-# define INIT_LOCATION
-#endif
-
-// these should go in a global best-first queue
-struct ItemPrio {
-  // NOTE: sum = viterbi (max)
-  ItemPrio() : priority(init_0()),inner(init_0()) {  }
-  explicit ItemPrio(best_t priority) : priority(priority),inner(init_0()) {  }
-  best_t priority; // includes inner prob. (forward)
-  /* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all
-     constrained paths of length i that end in state X[k]->x.y*/
-  best_t inner;
-  /* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all
-     paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */
-  template<class O>
-  void print(O &o) const {
-    o<<priority; // TODO: show/use inner?
-  }
-  typedef ItemPrio self_type;
-  SELF_TYPE_PRINT
-};
-
-#define ITEM_TYPE(X,t)                              \
-  X(t,ADJ,=-1)                                 \
-
-#define ITEM_TYPE_TYPE ItemType
-
-DECLARE_NAMED_ENUM(ITEM_TYPE)
-DEFINE_NAMED_ENUM(ITEM_TYPE)
-
-struct Item : ItemPrio,ItemKey {
-/*  explicit Item(NodeP dot,best_t prio,int next) : ItemPrio(prio),ItemKey(dot),trienext(next),from(0)
-                                        INIT_LOCATION
-                                        {  }*/
-//  ItemType t;
-  // lazy queueing of succesors item:
-  bool is_trie_adj() const {
-    return trienext>=0;
-  }
-  explicit Item(FFState const& state,NodeP dot,best_t prio,int next=0) : ItemPrio(prio),ItemKey(dot,state),trienext(next),from(0)
-                                                             INIT_LOCATION
-  {
-//    t=ADJ;
-//    if (dot->adj.size())
-    dot->p_delta(next,priority);
-//    SHOWM1(DFSA,"Item(state,dot,prio)",prio);
-  }
-  typedef std::queue<ItemP> Predicted;
-//  Predicted predicted; // this is empty, unless this is a predicted L -> .asdf item, or a to-complete L -> asdf .
-  int trienext; // index of dot->adj to complete (if dest==0), or predict (if NT), or scan (if word).  note: we could store pointer inside adj since it and trie are @ fixed addrs.  less pointer arith, more space.
-  ItemP from; //backpointer - 0 for L -> . asdf for the rest; L -> a .sdf, it's the L -> .asdf item.
-  ItemP predicted_from() const {
-    ItemP p=(ItemP)this;
-    while(p->from) p=p->from;
-    return p;
-  }
-  template<class O>
-  void print(O &o) const {
-    o<< '[';
-    o<<this<<": ";
-    ItemKey::print(o);
-    o<<' ';
-    ItemPrio::print(o);
-    o<<" next="<<trienext;
-    o<< ']';
-  }
-  PRINT_SELF(Item)
-  unsigned location;
-};
-
-struct GetItemKey {
-  typedef Item argument_type;
-  typedef ItemKey result_type;
-  result_type const& operator()(Item const& i) const { return i; }
-  template <class T>
-  T const& operator()(T const& t) const { return t; }
-};
-
-/* here's what i imagine (best first):
-   all of these are looked up in a chart which includes the fsa states as part of the identity
-
-   perhaps some items are ephemeral and never reused (e.g. edge items of a cube, where we delay traversing trie based on probabilities), but in all ohter cases we make entirely new objects derived from the original one (memoizing).  let's ignore lazier edge items for now and always push all successors onto heap.
-
-   initial item (predicted): GOAL_NT -> . * (trie root for that lhs), start, start (fsa start states).  has a list of
-
-   completing item ( L -> * . needs to queue all the completions immediately.  when predicting before a completion happens, add to prediction list.  after it happens, immediately use the completed bests.  this is confusing to me: the completions for an original NT w/ a given r state may end up in several different ones.  we don't only care about the 1 best cost r item but all the different r.
-
-   the prediction's left/right uses the predictor's right
-
- */
-template <class FsaFF=FsaFeatureFunction>
-struct Chart {
-  //typedef HASH_MAP<Item,ItemP,boost::hash<Item> > Items;
-  //typedef Items::iterator FindItem;
-  //typedef std::pair<FindItem,bool> InsertItem;
-//  Items items;
-  CFG &cfg; // TODO: remove this from Chart
-  SentenceMetadata const& smeta;
-  FsaFF const& fsa;
-  NTHandle goal_nt;
-  PrefixTrie trie;
-  typedef Agenda<Item,BetterP,GetItemKey> A;
-  A a;
-
-  /* had to stop working on this for now - it's garbage/useless in this form - see NOTES.earley */
-
-  // p_partial is priority*p(rule) - excluding the FSA model score, and predicted
-  void succ(Item const& from,int adji,best_t p_partial) {
-    PrefixTrieEdge const& te=from.dot->adj[adji];
-    NodeP dest=te.dest;
-    if (te.is_final()) {
-      // complete
-      return;
-    }
-    WordID w=te.w;
-    if (w<0) {
-      NTHandle lhs=-w;
-    } else {
-
-    }
-  }
-
-  void extend1() {
-    BetterP better;
-    Item &t=*a.top();
-    best_t tp=t.priority;
-    if (t.is_trie_adj()) {
-      best_t pstop=a.second_best(); // remember; best_t a<b means a better than (higher prob) than b
-//      NodeP d=t.dot;
-      PrefixTrieNode::Adj const& adj=t.dot->adj;
-      int n=t.trienext,m=adj.size();
-      SHOWM3(DFSA,"popped",t,tp,pstop);
-      for (;n<m;++n) { // cube corner
-        PrefixTrieEdge const& te=adj[n];
-        SHOWM3(DFSA,"maybe try trie next",n,te.p,pstop);
-        if (better(te.p,pstop)) { // can get some improvement
-          SHOWM2(DFSA,"trying adj ",m,te);
-        } else {
-          goto done;
-        }
-      }
-      a.pop();
-    done:;
-    }
-  }
-
-  void best_first(unsigned kbest=1) {
-    assert(kbest==1); //TODO: k-best via best-first requires revisiting best things again and adjusting desc.  tricky.
-    while(!a.empty()) {
-      extend1();
-    }
-  }
-
-  Chart(CFG &cfg,SentenceMetadata const& smeta,FsaFF const& fsa,unsigned reserve=FSA_AGENDA_RESERVE)
-    : cfg(cfg),smeta(smeta),fsa(fsa),trie(cfg),a(reserve) {
-    assert(fsa.state_bytes());
-    print_fsa=&fsa;
-    goal_nt=cfg.goal_nt;
-    best_t prio=init_1();
-    SHOW1(DFSA,prio);
-    a.add(a.construct(fsa.start,trie.lhs2_ex(goal_nt),prio));
-  }
-};
-
-
-}//anon ns
-
-
-DEFINE_NAMED_ENUM(FSA_BY)
-
-template <class FsaFF=FsaFeatureFunction>
-struct ApplyFsa {
-  ApplyFsa(HgCFG &i,
-           const SentenceMetadata& smeta,
-           const FsaFeatureFunction& fsa,
-           DenseWeightVector const& weights,
-           ApplyFsaBy const& by,
-           Hypergraph* oh
-    )
-    :hgcfg(i),smeta(smeta),fsa(fsa),weights(weights),by(by),oh(oh)
-  {
-    stateless=!fsa.state_bytes();
-  }
-  void Compute() {
-    if (by.IsBottomUp() || stateless)
-      ApplyBottomUp();
-    else
-      ApplyEarley();
-  }
-  void ApplyBottomUp();
-  void ApplyEarley();
-  CFG const& GetCFG();
-private:
-  CFG cfg;
-  HgCFG &hgcfg;
-  SentenceMetadata const& smeta;
-  FsaFF const& fsa;
-//  WeightVector weight_vector;
-  DenseWeightVector weights;
-  ApplyFsaBy by;
-  Hypergraph* oh;
-  std::string cfg_out;
-  bool stateless;
-};
-
-template <class F>
-void ApplyFsa<F>::ApplyBottomUp()
-{
-  assert(by.IsBottomUp());
-  FeatureFunctionFromFsa<FsaFeatureFunctionFwd> buff(&fsa);
-  buff.Init(); // mandatory to call this (normally factory would do it)
-  vector<const FeatureFunction*> ffs(1,&buff);
-  ModelSet models(weights, ffs);
-  IntersectionConfiguration i(stateless ? BU_FULL : by.BottomUpAlgorithm(),by.pop_limit);
-  ApplyModelSet(hgcfg.ih,smeta,models,i,oh);
-}
-
-template <class F>
-void ApplyFsa<F>::ApplyEarley()
-{
-  hgcfg.GiveCFG(cfg);
-  print_cfg=&cfg;
-  print_fsa=&fsa;
-  Chart<F> chart(cfg,smeta,fsa);
-  // don't need to uniq - option to do that already exists in cfg_options
-  //TODO:
-  chart.best_first();
-  *oh=hgcfg.ih;
-}
-
-
-void ApplyFsaModels(HgCFG &i,
-                    const SentenceMetadata& smeta,
-                    const FsaFeatureFunction& fsa,
-                    DenseWeightVector const& weight_vector,
-                    ApplyFsaBy const& by,
-                    Hypergraph* oh)
-{
-  ApplyFsa<FsaFeatureFunction> a(i,smeta,fsa,weight_vector,by,oh);
-  a.Compute();
-}
-
-/*
-namespace {
-char const* anames[]={
-  "BU_CUBE",
-  "BU_FULL",
-  "EARLEY",
-  0
-};
-}
-*/
-
-//TODO: named enum type in boost?
-
-std::string ApplyFsaBy::name() const {
-//  return anames[algorithm];
-  return GetName(algorithm);
-}
-
-std::string ApplyFsaBy::all_names() {
-  return FsaByNames(" ");
-  /*
-  std::ostringstream o;
-  for (int i=0;i<N_ALGORITHMS;++i) {
-    assert(anames[i]);
-    if (i) o<<' ';
-    o<<anames[i];
-  }
-  return o.str();
-  */
-}
-
-ApplyFsaBy::ApplyFsaBy(std::string const& n, int pop_limit) : pop_limit(pop_limit) {
-  std::string uname=toupper(n);
-  algorithm=GetFsaBy(uname);
-/*anames=0;
-  while(anames[algorithm] && anames[algorithm] != uname) ++algorithm;
-  if (!anames[algorithm])
-    throw std::runtime_error("Unknown ApplyFsaBy type: "+n+" - legal types: "+all_names());
-*/
-}
-
-ApplyFsaBy::ApplyFsaBy(FsaBy i, int pop_limit) : pop_limit(pop_limit) {
-/*  if (i<0 || i>=N_ALGORITHMS)
-    throw std::runtime_error("Unknown ApplyFsaBy type id: "+itos(i)+" - legal types: "+all_names());
-*/
-  GetName(i); // checks validity
-  algorithm=i;
-}
-
-int ApplyFsaBy::BottomUpAlgorithm() const {
-  assert(IsBottomUp());
-  return algorithm==BU_CUBE ?
-    IntersectionConfiguration::CUBE
-    :IntersectionConfiguration::FULL;
-}
-
-void ApplyFsaModels(Hypergraph const& ih,
-                    const SentenceMetadata& smeta,
-                    const FsaFeatureFunction& fsa,
-                    DenseWeightVector const& weights, // pre: in is weighted by these (except with fsa featval=0 before this)
-                    ApplyFsaBy const& cfg,
-                    Hypergraph* out)
-{
-  HgCFG i(ih);
-  ApplyFsaModels(i,smeta,fsa,weights,cfg,out);
-}
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 69f40c93..4ce5749e 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -12,8 +12,6 @@
 #include "ff_rules.h"
 #include "ff_ruleshape.h"
 #include "ff_bleu.h"
-#include "ff_lm_fsa.h"
-#include "ff_sample_fsa.h"
 #include "ff_source_syntax.h"
 #include "ff_register.h"
 #include "ff_charset.h"
@@ -31,15 +29,6 @@ void register_feature_functions() {
   }
   registered = true;
 
-  //TODO: these are worthless example target FSA ffs.  remove later
-  RegisterFsaImpl<SameFirstLetter>(true);
-  RegisterFsaImpl<LongerThanPrev>(true);
-  RegisterFsaImpl<ShorterThanPrev>(true);
-//  ff_registry.Register("LanguageModelFsaDynamic",new FFFactory<FeatureFunctionFromFsa<FsaFeatureFunctionDynamic<LanguageModelFsa> > >); // to test correctness of FsaFeatureFunctionDynamic erasure
-  RegisterFsaDynToFF<LanguageModelFsa>();
-  RegisterFsaImpl<LanguageModelFsa>(true); // same as LM but using fsa wrapper
-  RegisterFsaDynToFF<SameFirstLetter>();
-
   RegisterFF<LanguageModel>();
 
   RegisterFF<WordPenalty>();
@@ -47,8 +36,6 @@ void register_feature_functions() {
   RegisterFF<ArityPenalty>();
   RegisterFF<BLEUModel>();
 
-  ff_registry.Register(new FFFactory<WordPenaltyFromFsa>); // same as WordPenalty, but implemented using ff_fsa
-
   //TODO: use for all features the new Register which requires static FF::usage(false,false) give name
 #ifdef HAVE_RANDLM
   ff_registry.Register("RandLM", new FFFactory<LanguageModelRandLM>);
diff --git a/decoder/feature_accum.h b/decoder/feature_accum.h
deleted file mode 100755
index 4b8338eb..00000000
--- a/decoder/feature_accum.h
+++ /dev/null
@@ -1,129 +0,0 @@
-#ifndef FEATURE_ACCUM_H
-#define FEATURE_ACCUM_H
-
-#include "ff.h"
-#include "sparse_vector.h"
-#include "value_array.h"
-
-struct SparseFeatureAccumulator : public FeatureVector {
-  typedef FeatureVector State;
-  SparseFeatureAccumulator() { assert(!"this code is disabled");  }
-  template <class FF>
-  FeatureVector const& describe(FF const& ) { return *this; }
-  void Store(FeatureVector *fv) const {
-//NO    fv->set_from(*this);
-  }
-  template <class FF>
-  void Store(FF const& /* ff */,FeatureVector *fv) const {
-//NO    fv->set_from(*this);
-  }
-  template <class FF>
-  void Add(FF const& /* ff */,FeatureVector const& fv) {
-    (*this)+=fv;
-  }
-  void Add(FeatureVector const& fv) {
-    (*this)+=fv;
-  }
-  /*
-  SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {}
-  FeatureAccumulator(Features const& fids) {}
-  FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {}
-  void Add(Features const& fids,FeatureVector const& fv) {
-    *this += fv;
-  }
-  */
-  void Add(int i,Featval v) {
-//NO    (*this)[i]+=v;
-  }
-  void Add(Features const& fids,int i,Featval v) {
-//NO    (*this)[i]+=v;
-  }
-};
-
-struct SingleFeatureAccumulator {
-  typedef Featval State;
-  typedef SingleFeatureAccumulator Self;
-  State v;
-  /*
-  void operator +=(State const& o) {
-    v+=o;
-  }
-  */
-  void operator +=(Self const& s) {
-    v+=s.v;
-  }
-  SingleFeatureAccumulator() : v() {}
-  template <class FF>
-  State const& describe(FF const& ) const { return v; }
-
-  template <class FF>
-  void Store(FF const& ff,FeatureVector *fv) const {
-    fv->set_value(ff.fid_,v);
-  }
-  void Store(Features const& fids,FeatureVector *fv) const {
-    assert(fids.size()==1);
-    fv->set_value(fids[0],v);
-  }
-  /*
-  SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); }
-  SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv)
-  {
-    assert(fids.size()==1);
-    v=fv.get_singleton();
-  }
-  */
-
-  template <class FF>
-  void Add(FF const& ff,FeatureVector const& fv) {
-    v+=fv.get(ff.fid_);
-  }
-  void Add(FeatureVector const& fv) {
-    v+=fv.get_singleton();
-  }
-
-  void Add(Features const& fids,FeatureVector const& fv) {
-    v += fv.get(fids[0]);
-  }
-  void Add(Featval dv) {
-    v+=dv;
-  }
-  void Add(int,Featval dv) {
-    v+=dv;
-  }
-  void Add(FeatureVector const& fids,int i,Featval dv) {
-    assert(fids.size()==1 && i==0);
-    v+=dv;
-  }
-};
-
-
-#if 0
-// omitting this so we can default construct an accum.  might be worth resurrecting in the future
-struct ArrayFeatureAccumulator : public ValueArray<Featval> {
-  typedef ValueArray<Featval> State;
-  template <class Fsa>
-  ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { }
-  ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) {  }
-  ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) {  }
-  ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) {
-    for (int i=0,e=i<fids.size();i<e;++i)
-      (*this)[i]=fv.get(i);
-  }
-  State const& describe(Features const& fids) const { return *this; }
-  void Store(Features const& fids,FeatureVector *fv) const {
-    assert(fids.size()==size());
-    for (int i=0,e=i<fids.size();i<e;++i)
-      fv->set_value(fids[i],(*this)[i]);
-  }
-  void Add(Features const& fids,FeatureVector const& fv) {
-    for (int i=0,e=i<fids.size();i<e;++i)
-      (*this)[i]+=fv.get(i);
-  }
-  void Add(FeatureVector const& fids,int i,Featval v) {
-    (*this)[i]+=v;
-  }
-};
-#endif
-
-
-#endif
diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h
index 92334396..5eb68c8b 100644
--- a/decoder/ff_factory.h
+++ b/decoder/ff_factory.h
@@ -20,8 +20,6 @@
 
 #include <boost/shared_ptr.hpp>
 
-#include "ff_fsa_dynamic.h"
-
 class FeatureFunction;
 
 class FsaFeatureFunction;
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
deleted file mode 100755
index f8d79e03..00000000
--- a/decoder/ff_from_fsa.h
+++ /dev/null
@@ -1,304 +0,0 @@
-#ifndef FF_FROM_FSA_H
-#define FF_FROM_FSA_H
-
-#include "ff_fsa.h"
-
-#ifndef TD__none
-// replacing dependency on SRILM
-#define TD__none -1
-#endif
-
-#ifndef FSA_FF_DEBUG
-# define FSA_FF_DEBUG 0
-#endif
-#if FSA_FF_DEBUG
-# define FSAFFDBG(e,x) FSADBGif(debug(),e,x)
-# define FSAFFDBGnl(e) FSADBGif_nl(debug(),e)
-#else
-# define FSAFFDBG(e,x)
-# define FSAFFDBGnl(e)
-#endif
-
-/* regular bottom up scorer from Fsa feature
-   uses guarantee about markov order=N to score ASAP
-   encoding of state: if less than N-1 (ctxlen) words
-
-   usage:
-   typedef FeatureFunctionFromFsa<LanguageModelFsa> LanguageModelFromFsa;
-*/
-
-template <class Impl>
-class FeatureFunctionFromFsa : public FeatureFunction {
-  typedef void const* SP;
-  typedef WordID *W;
-  typedef WordID const* WP;
-public:
-  template <class I>
-  FeatureFunctionFromFsa(I const& param) : ff(param) {
-    debug_=true; // because factory won't set until after we construct.
-  }
-  template <class I>
-  FeatureFunctionFromFsa(I & param) : ff(param) {
-    debug_=true; // because factory won't set until after we construct.
-  }
-
-  static std::string usage(bool args,bool verbose) {
-    return Impl::usage(args,verbose);
-  }
-  void init_name_debug(std::string const& n,bool debug) {
-    FeatureFunction::init_name_debug(n,debug);
-    ff.init_name_debug(n,debug);
-  }
-
-  // this should override
-  Features features() const {
-    DBGINIT("FeatureFunctionFromFsa features() name="<<ff.name()<<" features="<<FD::Convert(ff.features()));
-    return ff.features();
-  }
-
-  // Log because it potentially stores info in edge.  otherwise the same as regular TraversalFeatures.
-  void TraversalFeaturesLog(const SentenceMetadata& smeta,
-                             Hypergraph::Edge& edge,
-                             const std::vector<const void*>& ant_contexts,
-                             FeatureVector* features,
-                             FeatureVector* estimated_features,
-                             void* out_state) const
-  {
-    TRule const& rule=*edge.rule_;
-    Sentence const& e = rule.e();  // items in target side of rule
-    typename Impl::Accum accum,h_accum;
-    if (!ssz) { // special case for no state - but still build up longer phrases to score in case FSA overrides ScanPhraseAccum
-      if (Impl::simple_phrase_score) {
-        // save the effort of building up the contiguous rule phrases - probably can just use the else branch, now that phrases aren't copied but are scanned off e directly.
-        for (int j=0,ee=e.size();j<ee;++j) {
-          if (e[j]>=1) // token
-            ff.ScanAccum(smeta,edge,(WordID)e[j],NULL,NULL,&accum);
-          FSAFFDBG(edge," "<<TD::Convert(e[j]));
-        }
-      } else {
-#undef RHS_WORD
-#define RHS_WORD(j) (e[j]>=1)
-        for (int j=0,ee=e.size();;++j) { // items in target side of rule
-          for(;;++j) {
-            if (j>=ee) goto rhs_done; // j may go 1 past ee due to k possibly getting to end
-            if (RHS_WORD(j)) break;
-          }
-          // word @j
-          int k=j;
-          while(k<ee) if (!RHS_WORD(++k)) break;
-          //end or nonword @k - [j,k) is phrase
-          FSAFFDBG(edge," ["<<TD::GetString(&e[j],&e[k])<<']');
-          ff.ScanPhraseAccum(smeta,edge,&e[j],&e[k],0,0,&accum);
-          j=k;
-        }
-      }
-    rhs_done:
-      accum.Store(ff,features);
-      FSAFFDBG(edge,"="<<accum.describe(ff));
-      FSAFFDBGnl(edge);
-      return;
-    }
-
-    // bear with me, because this is hard to understand.  reminder: ant_contexts and out_state are left-words first (up to M, TD__none padded).  if all M words are present, then FSA state follows.  otherwise 0 bytes to keep memcmp/hash happy.
-
-//why do we compute heuristic in so many places?  well, because that's how we know what state we should score words in once we're full on our left context (because of markov order bound, we know the score will be the same no matter what came before that left context)
-    // these left_* refer to our output (out_state):
-    W left_begin=(W)out_state;
-    W left_out=left_begin; // [left,fsa_state) = left ctx words.  if left words aren't full, then null wordid
-    WP left_full=left_end_full(out_state);
-    FsaScanner<Impl> fsa(ff,smeta,edge);
-    /* fsa holds our current state once we've seen our first M rule or child left-context words.  that state scores up the rest of the words at the time, and is replaced by the right state of any full child.  at the end, if we've got at least M left words in all, it becomes our right state (otherwise, we don't bother storing the partial state, which might seem useful any time we're built on by a rule that has our variable in the initial position - but without also storing the heuristic for that case, we just end up rescanning from scratch anyway to produce the heuristic.  so we just store all 0 bytes if we have less than M left words at the end. */
-    for (int j = 0,ee=e.size(); j < ee; ++j) { // items in target side of rule
-    s_rhs_next:
-      if (!RHS_WORD(j)) { // variable
-        // variables a* are referring to this child derivation state.
-        SP a = ant_contexts[-e[j]];
-        WP al=(WP)a,ale=left_end(a); // the child left words
-        int anw=ale-al;
-        FSAFFDBG(edge,' '<<describe_state(a));
-// anw left words in child.  full if == M.  we will use them to fill our left words, and then score the rest fully, knowing what state we're in based on h_state -> our left words -> any number of interior words which are scored then hidden
-        if (left_out+anw<left_full) { // still short of M after adding - nothing to score (not even our heuristic)
-          wordcpy(left_out,al,anw);
-          left_out+=anw;
-        } else if (left_out<left_full) { // we had less than M before, and will have a tleast M after adding.  so score heuristic and the rest M+1,... score inside.
-          int ntofill=left_full-left_out;
-          assert(ntofill==M-(left_out-left_begin));
-          wordcpy(left_out,al,ntofill);
-          left_out=(W)left_full;
-          // heuristic known now
-          fsa.reset(ff.heuristic_start_state());
-          fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens once only)
-          fsa.scan(al+ntofill,ale,&accum); // because of markov order, fully filled left words scored starting at h_start put us in the right state to score the extra words (which are forgotten)
-          al+=ntofill; // we used up the first ntofill words of al to end up in some known state via exactly M words total (M-ntofill were there beforehand).  now we can scan the remaining al words of this child
-        } else { // more to score / state to update (left already full)
-          fsa.scan(al,ale,&accum);
-        }
-        if (anw==M)
-          fsa.reset(fsa_state(a));
-        // if child had full state already, we must assume there was a gap and use its right state (note: if the child derivation was exactly M words, then we still use its state even though it will be equal to our current; there's no way to distinguish between such an M word item and an e.g. 2*M+k word item, although it's extremely unlikely that you'd have a >M word item that happens to have the same left and right boundary words).
-        assert(anw<=M); // of course, we never store more than M left words in an item.
-      } else { // single word
-        WordID ew=e[j];
-        // some redundancy: non-vectorized version of above handling of left words of child item
-        if (left_out<left_full) {
-          *left_out++=ew;
-          if (left_out==left_full) { // handle heuristic, once only, establish state
-            fsa.reset(ff.heuristic_start_state());
-            fsa.scan(left_begin,left_full,&h_accum); // save heuristic (happens only once)
-          }
-        } else {
-          if (Impl::simple_phrase_score) {
-            fsa.scan(ew,&accum); // single word scan isn't optimal if phrase is different
-            FSAFFDBG(edge,' '<<TD::Convert(ew));
-          } else {
-            int k=j;
-            while(k<ee) if (!RHS_WORD(++k)) break;
-            FSAFFDBG(edge," rule-phrase["<<TD::GetString(&e[j],&e[k])<<']');
-            fsa.scan(&e[j],&e[k],&accum);
-            if (k==ee) goto s_rhs_done;
-            j=k;
-            goto s_rhs_next;
-          }
-        }
-      }
-    }
-#undef RHS_WORD
-  s_rhs_done:
-    void *out_fsa_state=fsa_state(out_state);
-    if (left_out<left_full) { // finally: partial heuristic for unfilled items
-//      fsa.reset(ff.heuristic_start_state());      fsa.scan(left_begin,left_out,&h_accum);
-      ff.ScanPhraseAccumOnly(smeta,edge,left_begin,left_out,ff.heuristic_start_state(),&h_accum);
-      do { *left_out++=TD__none; } while(left_out<left_full); // none-terminate so left_end(out_state) will know how many words
-      ff.state_zero(out_fsa_state); // so we compare / hash correctly. don't know state yet because left context isn't full
-    } else // or else store final right-state.  heuristic was already assigned
-      ff.state_copy(out_fsa_state,fsa.cs);
-    accum.Store(ff,features);
-    h_accum.Store(ff,estimated_features);
-    FSAFFDBG(edge," = " << describe_state(out_state)<<" "<<name<<"="<<accum.describe(ff)<<" h="<<h_accum.describe(ff)<<")");
-    FSAFFDBGnl(edge);
-  }
-
-  void print_state(std::ostream &o,void const*ant) const {
-    WP l=(WP)ant,le=left_end(ant),lf=left_end_full(ant);
-    o<<'['<<Sentence(l,le);
-    if (le==lf) {
-      o<<" : ";
-      ff.print_state(o,lf);
-    }
-    o << ']';
-  }
-
-  std::string describe_state(void const*ant) const {
-    std::ostringstream o;
-    print_state(o,ant);
-    return o.str();
-  }
-
-  //FIXME: it's assumed that the final rule is just a unary no-target-terminal rewrite (same as ff_lm)
-  virtual void FinalTraversalFeatures(const SentenceMetadata& smeta,
-                                      Hypergraph::Edge& edge,
-                                      const void* residual_state,
-                                      FeatureVector* final_features) const
-  {
-    Sentence const& ends=ff.end_phrase();
-    typename Impl::Accum accum;
-    if (!ssz) {
-      FSAFFDBG(edge," (final,0state,end="<<ends<<")");
-      ff.ScanPhraseAccumOnly(smeta,edge,begin(ends),end(ends),0,&accum);
-    } else {
-      SP ss=ff.start_state();
-      WP l=(WP)residual_state,lend=left_end(residual_state);
-      SP rst=fsa_state(residual_state);
-      FSAFFDBG(edge," (final");// "<<name);//<< " before="<<*final_features);
-      if (lend==rst) { // implying we have an fsa state
-        ff.ScanPhraseAccumOnly(smeta,edge,l,lend,ss,&accum); // e.g. <s> score(full left unscored phrase)
-        FSAFFDBG(edge," start="<<ff.describe_state(ss)<<"->{"<<Sentence(l,lend)<<"}");
-        ff.ScanPhraseAccumOnly(smeta,edge,begin(ends),end(ends),rst,&accum); // e.g. [ctx for last M words] score("</s>")
-        FSAFFDBG(edge," end="<<ff.describe_state(rst)<<"->{"<<ends<<"}");
-      } else { // all we have is a single short phrase < M words before adding ends
-        int nl=lend-l;
-        Sentence whole(ends.size()+nl);
-        WordID *wb=begin(whole);
-        wordcpy(wb,l,nl);
-        wordcpy(wb+nl,begin(ends),ends.size());
-        FSAFFDBG(edge," whole={"<<whole<<"}");
-        // whole = left-words + end-phrase
-        ff.ScanPhraseAccumOnly(smeta,edge,wb,end(whole),ss,&accum);
-      }
-    }
-    FSAFFDBG(edge,' '<<name<<"="<<accum.describe(ff));
-    FSAFFDBGnl(edge);
-    accum.Store(ff,final_features);
-  }
-
-  bool rule_feature() const {
-    return StateSize()==0; // Fsa features don't get info about span
-  }
-
-  static void test() {
-    WordID w1[1],w1b[1],w2[2];
-    w1[0]=w2[0]=TD::Convert("hi");
-    w2[1]=w1b[0]=TD__none;
-    assert(left_end(w1,w1+1)==w1+1);
-    assert(left_end(w1b,w1b+1)==w1b);
-    assert(left_end(w2,w2+2)==w2+1);
-  }
-
-  // override from FeatureFunction; should be called by factory after constructor.  we'll also call in our own ctor
-  void Init() {
-    ff.Init();
-    ff.sync();
-    DBGINIT("base (single feature) FsaFeatureFunctionBase::Init name="<<name_<<" features="<<FD::Convert(features()));
-//    FeatureFunction::name_=Impl::usage(false,false); // already achieved by ff_factory.cc
-    M=ff.markov_order();
-    ssz=ff.state_bytes();
-    state_offset=sizeof(WordID)*M;
-    SetStateSize(ssz+state_offset);
-    assert(!ssz == !M); // no fsa state <=> markov order 0
-  }
-
-private:
-  Impl ff;
-  int M; // markov order (ctx len)
-  FeatureFunctionFromFsa(); // not allowed.
-
-  int state_offset; // NOTE: in bytes (add to char* only). store left-words first, then fsa state
-  int ssz; // bytes in fsa state
-  /*
-    state layout: left WordIds, followed by fsa state
-    left words have never been scored.  last ones remaining will be scored on FinalTraversalFeatures only.
-    right state is unknown until we have all M left words (less than M means TD__none will pad out right end).  unk right state will be zeroed out for proper hash/equal recombination.
-  */
-
-  static inline WordID const* left_end(WordID const* left, WordID const* e) {
-    for (;e>left;--e)
-      if (e[-1]!=TD__none) break;
-    //post: [left,e] are the seen left words
-    return e;
-  }
-  inline WP left_end(SP ant) const {
-    return left_end((WP)ant,(WP)fsa_state(ant));
-  }
-  inline WP left_end_full(SP ant) const {
-    return (WP)fsa_state(ant);
-  }
-  inline SP fsa_state(SP ant) const {
-    return ((char const*)ant+state_offset);
-  }
-  inline void *fsa_state(void * ant) const {
-    return ((char *)ant+state_offset);
-  }
-};
-
-#ifdef TEST_FSA
-# include "tdict.cc"
-# include "ff_sample_fsa.h"
-int main() {
-  std::cerr<<"Testing left_end...\n";
-  std::cerr<<"sizeof(FeatureVector)="<<sizeof(FeatureVector)<<"\n";
-  WordPenaltyFromFsa::test();
-  return 0;
-}
-#endif
-
-#endif
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
deleted file mode 100755
index 18e90bf1..00000000
--- a/decoder/ff_fsa.h
+++ /dev/null
@@ -1,401 +0,0 @@
-#ifndef FF_FSA_H
-#define FF_FSA_H
-
-/*
-  features whose score is just some PFSA over target string.  however, PFSA can use edge and smeta info (e.g. spans on edge) - not usually useful.
-
-//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
-
-  state is some fixed width byte array.  could actually be a void *, WordID sequence, whatever.
-
-  TODO: specify Scan return code or feature value = -inf for failure state (e.g. for hard intersection with desired target lattice?)
-
-  TODO: maybe ff that wants to know about SentenceMetadata should store a ref to
-  it permanently rather than get passed it for every operation.  we're never
-  decoding more than 1 sentence at once and it's annoying to pass it.  same
-  could apply for result edge as well since so far i only use it for logging
-  when USE_INFO_EDGE 1 - would make the most sense if the same change happened
-  to ff.h at the same time.
-
-  TODO: there are a confusing array of default-implemented supposedly slightly more efficient overrides enabled; however, the two key differences are: do you score a phrase, or just word at a time (the latter constraining you to obey markov_order() everywhere.  you have to implement the word case no matter what.
-
-  TODO: considerable simplification of implementation if Scan implementors are required to update state in place (using temporary copy if they need it), or e.g. using memmove (copy from end to beginning) to rotate state right.
-
-  TODO: at what sizes is memcpy/memmove better than looping over 2-3 ints and assigning?
-
-  TODO: fsa ff scores phrases not just words
-  TODO: fsa feature aggregator that presents itself as a single fsa; benefit: when wrapped in ff_from_fsa, only one set of left words is stored.  downside: compared to separate ff, the inside portion of lower-order models is incorporated later.  however, the full heuristic is already available and exact for those words.  so don't sweat it.
-
-  TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters.  this is on top of the nice heuristic for the unscored words, of course.  in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict.  probably not worth the effort.
-*/
-
-#define FSA_DEBUG 0
-
-#if USE_INFO_EDGE
-#define FSA_DEBUG_CERR 0
-#else
-#define FSA_DEBUG_CERR 1
-#endif
-
-#define FSA_DEBUG_DEBUG 0
-# define FSADBGif(i,e,x) do { if (i) { if (FSA_DEBUG_CERR){std::cerr<<x;}  INFO_EDGE(e,x); if (FSA_DEBUG_DEBUG){std::cerr<<"FSADBGif edge.info "<<&e<<" = "<<e.info()<<std::endl;}} } while(0)
-# define FSADBGif_nl(i,e) do { if (i) { if (FSA_DEBUG_CERR) std::cerr<<std::endl; INFO_EDGE(e,"; "); } } while(0)
-#if FSA_DEBUG
-# include <iostream>
-# define FSADBG(e,x) FSADBGif(d().debug(),e,x)
-# define FSADBGnl(e) FSADBGif_nl(d().debug(),e,x)
-#else
-# define FSADBG(e,x)
-# define FSADBGnl(e)
-#endif
-
-#include "fast_lexical_cast.hpp"
-#include <sstream>
-#include <string>
-#include "ff.h"
-#include "sparse_vector.h"
-#include "tdict.h"
-#include "hg.h"
-#include "ff_fsa_data.h"
-
-/*
-usage: see ff_sample_fsa.h or ff_lm_fsa.h
-
- then, to decode, see ff_from_fsa.h (or TODO: left->right target-earley style rescoring)
-
- */
-
-
-template <class Impl>
-struct FsaFeatureFunctionBase : public FsaFeatureFunctionData {
-  Impl const& d() const { return static_cast<Impl const&>(*this); }
-  Impl & d()  { return static_cast<Impl &>(*this); }
-
-  // this will get called by factory - override if you have multiple or dynamically named features.  note: may be called repeatedly
-  void Init() {
-    Init(name());
-    DBGINIT("base (single feature) FsaFeatureFunctionBase::Init name="<<name()<<" features="<<FD::Convert(features_));
-  }
-  void Init(std::string const& fname) {
-    fid_=FD::Convert(fname);
-    InitHaveFid();
-  }
-  void InitHaveFid() {
-    features_=FeatureFunction::single_feature(fid_);
-  }
-  Features features() const {
-    DBGINIT("FeatureFunctionBase::features() name="<<name()<<" features="<<FD::Convert(features_));
-    return features_;
-  }
-
-public:
-  int fid_; // you can have more than 1 feature of course.
-
-  std::string describe() const {
-    std::ostringstream o;
-    o<<*this;
-    return o.str();
-  }
-
-  // can override to different return type, e.g. just return feats:
-  Featval describe_features(FeatureVector const& feats) const {
-    return feats.get(fid_);
-  }
-
-  bool debug() const { return true; }
-  int fid() const { return fid_; } // return the one most important feature (for debugging)
-  std::string name() const {
-    return Impl::usage(false,false);
-  }
-
-  void print_state(std::ostream &o,void const*state) const {
-    char const* i=(char const*)state;
-    char const* e=i+ssz;
-    for (;i!=e;++i)
-      print_hex_byte(o,*i);
-  }
-
-  std::string describe_state(void const* state) const {
-    std::ostringstream o;
-    d().print_state(o,state);
-    return o.str();
-  }
-  typedef SingleFeatureAccumulator Accum;
-
-  // return m: all strings x with the same final m+1 letters must end in this state
-  /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */
-  int markov_order() const { return 0; } // override if you use state.  order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though)
-  //TODO: if we wanted, we could mark certain states as maximal-context, but this would lose our fixed amount of left context in ff_from_fsa, and lose also our vector operations (have to scan left words 1 at a time, checking always to see where you change from h to inside - BUT, could detect equivalent LM states, which would be nice).
-
-
-
-  // if [i,end) are unscored words of length <= markov_order, score some of them on the right, and return the number scored, i.e. [end-r,end) will have been scored for return r.  CAREFUL: for ngram you have to sometimes remember to pay all of the backoff once you see a few more words to the left.
-  template <class Accum>
-  int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const {
-    return 0;
-  }
-
-  // this isn't currently used at all.  this left-shortening is not recommended (wasn't worth the computation expense for ngram): specifically for bottom up scoring (ff_from_fsa), you can return a shorter left-words context - but this means e.g. for ngram tracking that a backoff occurred where the final BO cost isn't yet known.  you would also have to remember any necessary info in your own state - in the future, ff_from_fsa on a list of fsa features would only shorten it to the max
-
-
-  // override this (static)
-  static std::string usage(bool param,bool verbose) {
-    return FeatureFunction::usage_helper("unnamed_fsa_feature","","",param,verbose);
-  }
-
-  // move from state to next_state after seeing word x, while emitting features->set_value(fid,val) possibly with duplicates.  state and next_state will never be the same memory.
-  //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support.  however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping)
-
-protected:
-  // overrides have different name because of inheritance method hiding;
-
-  // simple/common case; 1 fid.  these need not be overriden if you have multiple feature ids
-  Featval Scan1(WordID w,void const* state,void *next_state) const {
-    assert(0);
-    return 0;
-  }
-  Featval Scan1Meta(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,
-                    WordID w,void const* state,void *next_state) const {
-    return d().Scan1(w,state,next_state);
-  }
-public:
-
-  // must override this or Scan1Meta or Scan1
-  template <class Accum>
-  inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                        WordID w,void const* state,void *next_state,Accum *a) const {
-    Add(d().Scan1Meta(smeta,edge,w,state,next_state),a);
-  }
-
-  // bounce back and forth between two state vars starting at cs, returning end state location.  if we required src=dest addr safe state updating, this concept wouldn't need to exist.
-  // required that you override this if you score phrases differently than word-by-word, however, you can just use the SCAN_PHRASE_ACCUM_OVERRIDE macro to do that in terms of ScanPhraseAccum
-  template <class Accum>
-  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const {
-    // extra code - IT'S FOR EFFICIENCY, MAN!  IT'S OK!  definitely no bugs here.
-    if (!ssz) {
-      for (;i<end;++i)
-        d().ScanAccum(smeta,edge,*i,0,0,accum);
-      return 0;
-    }
-    void *os,*es;
-    if ((end-i)&1) { // odd # of words
-      os=cs;
-      es=ns;
-      goto odd;
-    } else {
-      i+=1;
-      es=cs;
-      os=ns;
-    }
-    for (;i<end;i+=2) {
-      d().ScanAccum(smeta,edge,i[-1],es,os,accum); // e->o
-    odd:
-      d().ScanAccum(smeta,edge,i[0],os,es,accum); // o->e
-    }
-    return es;
-  }
-
-
-  static const bool simple_phrase_score=true; // if d().simple_phrase_score_, then you should expect different Phrase scores for phrase length > M.  so, set this false if you provide ScanPhraseAccum (SCAN_PHRASE_ACCUM_OVERRIDE macro does this)
-
-  // override this (and use SCAN_PHRASE_ACCUM_OVERRIDE  ) if you want e.g. maximum possible order ngram scores with markov_order < n-1.  in the future SparseFeatureAccumulator will probably be the only option for type-erased FSA ffs.
-  // note you'll still have to override ScanAccum
-  template <class Accum>
-  void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                              WordID const* i, WordID const* end,
-                              void const* state,void *next_state,Accum *accum) const {
-    if (!ssz) {
-      for (;i<end;++i)
-        d().ScanAccum(smeta,edge,*i,0,0,accum);
-      return;
-    }
-    char tstate[ssz];
-    void *tst=tstate;
-    bool odd=(end-i)&1;
-    void *cs,*ns;
-    // we're going to use Bounce (word by word alternating of states) such that the final place is next_state
-    if (odd) {
-      cs=tst;
-      ns=next_state;
-    } else {
-      cs=next_state;
-      ns=tst;
-    }
-    state_copy(cs,state);
-    void *est=d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,accum);
-    assert(est==next_state);
-  }
-
-
-
-  // could replace this with a CRTP subclass providing these impls.
-  // the d() subclass dispatch is not needed because these will be defined in the subclass
-#define SCAN_PHRASE_ACCUM_OVERRIDE \
-  static const bool simple_phrase_score=false; \
-  template <class Accum> \
-  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { \
-    ScanPhraseAccum(smeta,edge,i,end,cs,ns,accum);  \
-    return ns; \
-  } \
-  template <class Accum> \
-  void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, \
-                              WordID const* i, WordID const* end, \
-                              void const* state,Accum *accum) const { \
-    char s2[ssz]; ScanPhraseAccum(smeta,edge,i,end,state,(void*)s2,accum); \
-  }
-
-  // override this or bounce along with above.  note: you can just call ScanPhraseAccum
-  // doesn't set state (for heuristic in ff_from_fsa)
-  template <class Accum>
-  void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                           WordID const* i, WordID const* end,
-                           void const* state,Accum *accum) const {
-    char s1[ssz];
-    char s2[ssz];
-    state_copy(s1,state);
-    d().ScanPhraseAccumBounce(smeta,edge,i,end,(void*)s1,(void*)s2,accum);
-  }
-
-  // for single-feat only.  but will work for different accums
-  template <class Accum>
-  inline void Add(Featval v,Accum *a) const {
-    a->Add(fid_,v);
-  }
-  inline void set_feat(FeatureVector *features,Featval v) const {
-    features->set_value(fid_,v);
-  }
-
-  // don't set state-bytes etc. in ctor because it may depend on parsing param string
-  FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence())
-    : FsaFeatureFunctionData(statesz,end_sentence_phrase)
-  {
-    name_=name(); // should allow FsaDynamic wrapper to get name copied to it with sync
-  }
-
-};
-
-template <class Impl>
-struct MultipleFeatureFsa : public FsaFeatureFunctionBase<Impl> {
-  typedef SparseFeatureAccumulator Accum;
-};
-
-
-
-
-// if State is pod.  sets state size and allocs start, h_start
-// usage:
-// struct ShorterThanPrev : public FsaTypedBase<int,ShorterThanPrev>
-// i.e. Impl is a CRTP
-template <class St,class Impl>
-struct FsaTypedBase : public FsaFeatureFunctionBase<Impl> {
-  Impl const& d() const { return static_cast<Impl const&>(*this); }
-  Impl & d()  { return static_cast<Impl &>(*this); }
-protected:
-  typedef FsaFeatureFunctionBase<Impl> Base;
-  typedef St State;
-  static inline State & state(void *state) {
-    return *(State*)state;
-  }
-  static inline State const& state(void const* state) {
-    return *(State const*)state;
-  }
-  void set_starts(State const& s,State const& heuristic_s) {
-    if (0) { // already in ctor
-      Base::start.resize(sizeof(State));
-      Base::h_start.resize(sizeof(State));
-    }
-    assert(Base::start.size()==sizeof(State));
-    assert(Base::h_start.size()==sizeof(State));
-    state(Base::start.begin())=s;
-    state(Base::h_start.begin())=heuristic_s;
-  }
-  FsaTypedBase(St const& start_st=St()
-               ,St const& h_start_st=St()
-               ,Sentence const& end_sentence_phrase=Sentence())
-    : Base(sizeof(State),end_sentence_phrase) {
-    set_starts(start_st,h_start_st);
-  }
-public:
-  void print_state(std::ostream &o,void const*st) const {
-    o<<state(st);
-  }
-  int markov_order() const { return 1; }
-
-  // override this
-  Featval ScanT1S(WordID w,St const& /* from */ ,St & /* to */) const {
-    return 0;
-  }
-
-  // or this
-  Featval ScanT1(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,WordID w,St const& from ,St & to) const {
-    return d().ScanT1S(w,from,to);
-  }
-
-  // or this (most general)
-  template <class Accum>
-  inline void ScanT(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,St const& prev_st,St &new_st,Accum *a) const {
-    Add(d().ScanT1(smeta,edge,w,prev_st,new_st),a);
-  }
-
-  // note: you're on your own when it comes to Phrase overrides.  see FsaFeatureFunctionBase.  sorry.
-
-  template <class Accum>
-  inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,void const* st,void *next_state,Accum *a) const {
-    Impl const& im=d();
-    FSADBG(edge,"Scan "<<FD::Convert(im.fid_)<<" = "<<a->describe(im)<<" "<<im.state(st)<<"->"<<TD::Convert(w)<<" ");
-    im.ScanT(smeta,edge,w,state(st),state(next_state),a);
-    FSADBG(edge,state(next_state)<<" = "<<a->describe(im));
-    FSADBGnl(edge);
-  }
-};
-
-
-// keep a "current state" (bouncing back and forth)
-template <class FF>
-struct FsaScanner {
-//  enum {ALIGN=8};
-  static const int ALIGN=8;
-  FF const& ff;
-  SentenceMetadata const& smeta;
-  int ssz;
-  Bytes states; // first is at begin, second is at (char*)begin+stride
-  void *st0; // states
-  void *st1; // states+stride
-  void *cs; // initially st0, alternates between st0 and st1
-  inline void *nexts() const {
-    return (cs==st0)?st1:st0;
-  }
-  Hypergraph::Edge const& edge;
-  FsaScanner(FF const& ff,SentenceMetadata const& smeta,Hypergraph::Edge const& edge) : ff(ff),smeta(smeta),edge(edge)
-  {
-    ssz=ff.state_bytes();
-    int stride=((ssz+ALIGN-1)/ALIGN)*ALIGN; // round up to multiple of ALIGN
-    states.resize(stride+ssz);
-    st0=states.begin();
-    st1=(char*)st0+stride;
-//    for (int i=0;i<2;++i) st[i]=cs+(i*stride);
-  }
-  void reset(void const* state) {
-    cs=st0;
-    std::memcpy(st0,state,ssz);
-  }
-  template <class Accum>
-  void scan(WordID w,Accum *a) {
-    void *ns=nexts();
-    ff.ScanAccum(smeta,edge,w,cs,ns,a);
-    cs=ns;
-  }
-  template <class Accum>
-  void scan(WordID const* i,WordID const* end,Accum *a) {
-    // faster. and allows greater-order excursions
-    cs=ff.ScanPhraseAccumBounce(smeta,edge,i,end,cs,nexts(),a);
-  }
-};
-
-
-//TODO: combine 2 FsaFeatures typelist style (can recurse for more)
-
-
-
-
-#endif
diff --git a/decoder/ff_fsa_data.h b/decoder/ff_fsa_data.h
deleted file mode 100755
index d215e940..00000000
--- a/decoder/ff_fsa_data.h
+++ /dev/null
@@ -1,131 +0,0 @@
-#ifndef FF_FSA_DATA_H
-#define FF_FSA_DATA_H
-
-#include <stdint.h> //C99
-#include <sstream>
-#include "sentences.h"
-#include "feature_accum.h"
-#include "value_array.h"
-#include "ff.h" //debug
-typedef ValueArray<uint8_t> Bytes;
-
-// stuff I see no reason to have virtual.  but because it's impossible (w/o virtual inheritance to have dynamic fsa ff know where the impl's data starts, implemented a sync (copy) method that needs to be called.  init_name_debug was already necessary to keep state in sync between ff and ff_from_fsa, so no sync should be needed after it.  supposing all modifications were through setters, then no explicit sync call would ever be needed; updates could be mirrored.
-struct FsaFeatureFunctionData
-{
-  void init_name_debug(std::string const& n,bool debug) {
-    name_=n;
-    debug_=debug;
-  }
-  //HACK for diamond inheritance (w/o costing performance)
-  FsaFeatureFunctionData *sync_to_;
-
-  void sync() const { // call this if you modify any fields after your constructor is done
-    if (sync_to_) {
-      DBGINIT("sync to "<<*sync_to_);
-      *sync_to_=*this;
-      DBGINIT("synced result="<<*sync_to_<< " from this="<<*this);
-    } else {
-      DBGINIT("nobody to sync to - from FeatureFunctionData this="<<*this);
-    }
-  }
-
-  friend std::ostream &operator<<(std::ostream &o,FsaFeatureFunctionData const& d) {
-    o << "[FSA "<<d.name_<<" features="<<FD::Convert(d.features_)<<" state_bytes="<<d.state_bytes()<<" end='"<<d.end_phrase()<<"' start=";
-    d.print_state(o,d.start_state());
-    o<<"]";
-    return o;
-  }
-
-  FsaFeatureFunctionData(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase),ssz(statesz) {
-    debug_=true;
-    sync_to_=0;
-  }
-
-  std::string name_;
-  std::string name() const {
-    return name_;
-  }
-  typedef SparseFeatureAccumulator Accum;
-  bool debug_;
-  bool debug() const { return debug_; }
-  void state_copy(void *to,void const*from) const {
-    if (ssz)
-      std::memcpy(to,from,ssz);
-  }
-  void state_zero(void *st) const { // you should call this if you don't know the state yet and want it to be hashed/compared properly
-    std::memset(st,0,ssz);
-  }
-  Features features() const {
-    return features_;
-  }
-  int n_features() const {
-    return features_.size();
-  }
-  int state_bytes() const { return ssz; }
-  void const* start_state() const {
-    return start.begin();
-  }
-  void const * heuristic_start_state() const {
-    return h_start.begin();
-  }
-  Sentence const& end_phrase() const { return end_phrase_; }
-  template <class T>
-  static inline T* state_as(void *p) { return (T*)p; }
-  template <class T>
-  static inline T const* state_as(void const* p) { return (T*)p; }
-  std::string describe_features(FeatureVector const& feats) {
-    std::ostringstream o;
-    o<<feats;
-    return o.str();
-  }
-  void print_state(std::ostream &o,void const*state) const {
-    char const* i=(char const*)state;
-    char const* e=i+ssz;
-    for (;i!=e;++i)
-      print_hex_byte(o,*i);
-  }
-
-  Features features_;
-  Bytes start,h_start; // start state and estimated-features (heuristic) start state.  set these.  default empty.
-  Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm.
-protected:
-  int ssz; // don't forget to set this. default 0 (it may depend on params of course)
-  // this can be called instead or after constructor (also set bytes and end_phrase_)
-  void set_state_bytes(int sb=0) {
-    if (start.size()!=sb) start.resize(sb);
-    if (h_start.size()!=sb) h_start.resize(sb);
-    ssz=sb;
-  }
-  void set_end_phrase(WordID single) {
-    end_phrase_=singleton_sentence(single);
-  }
-
-  inline void static to_state(void *state,char const* begin,char const* end) {
-    std::memcpy(state,begin,end-begin);
-  }
-  inline void static to_state(void *state,char const* begin,int n) {
-    std::memcpy(state,begin,n);
-  }
-  template <class T>
-  inline void static to_state(void *state,T const* begin,int n=1) {
-    to_state(state,(char const*)begin,n*sizeof(T));
-  }
-  template <class T>
-  inline void static to_state(void *state,T const* begin,T const* end) {
-    to_state(state,(char const*)begin,(char const*)end);
-  }
-  inline static char hexdigit(int i) {
-    int j=i-10;
-    return j>=0?'a'+j:'0'+i;
-  }
-  inline static void print_hex_byte(std::ostream &o,unsigned c) {
-    o<<hexdigit(c>>4);
-    o<<hexdigit(c&0x0f);
-  }
-  inline static void Add(Featval v,SingleFeatureAccumulator *a) {
-    a->Add(v);
-  }
-
-};
-
-#endif
diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h
deleted file mode 100755
index 6f75bbe5..00000000
--- a/decoder/ff_fsa_dynamic.h
+++ /dev/null
@@ -1,208 +0,0 @@
-#ifndef FF_FSA_DYNAMIC_H
-#define FF_FSA_DYNAMIC_H
-
-struct SentenceMetadata;
-
-#include "ff_fsa_data.h"
-#include "hg.h" // can't forward declare nested Hypergraph::Edge class
-#include <sstream>
-
-// the type-erased interface
-
-//FIXME: diamond inheritance problem.  make a copy of the fixed data?  or else make the dynamic version not wrap but rather be templated CRTP base (yuck)
-struct FsaFeatureFunction : public FsaFeatureFunctionData {
-  static const bool simple_phrase_score=false;
-  virtual int markov_order() const = 0;
-
-  // see ff_fsa.h - FsaFeatureFunctionBase<Impl> gives you reasonable impls of these if you override just ScanAccum
-  virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                        WordID w,void const* state,void *next_state,Accum *a) const = 0;
-  virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                              WordID const* i, WordID const* end,
-                              void const* state,void *next_state,Accum *accum) const = 0;
-  virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                           WordID const* i, WordID const* end,
-                           void const* state,Accum *accum) const = 0;
-  virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const = 0;
-
-  virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { return 0; }
-  // called after constructor, before use
-  virtual void Init() = 0;
-  virtual std::string usage_v(bool param,bool verbose) const {
-    return FeatureFunction::usage_helper("unnamed_dynamic_fsa_feature","","",param,verbose);
-  }
-  virtual void init_name_debug(std::string const& n,bool debug) {
-    FsaFeatureFunctionData::init_name_debug(n,debug);
-  }
-
-  virtual void print_state(std::ostream &o,void const*state) const {
-    FsaFeatureFunctionData::print_state(o,state);
-  }
-  virtual std::string describe() const { return "[FSA unnamed_dynamic_fsa_feature]"; }
-
-  //end_phrase()
-  virtual ~FsaFeatureFunction() {}
-
-  // no need to override:
-  std::string describe_state(void const* state) const {
-    std::ostringstream o;
-    print_state(o,state);
-    return o.str();
-  }
-};
-
-// conforming to above interface, type erases FsaImpl
-// you might be wondering: why do this?  answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch
-// usage: typedef FsaFeatureFunctionDynamic<MyFsa> MyFsaDyn;
-template <class Impl>
-struct FsaFeatureFunctionDynamic : public FsaFeatureFunction {
-  static const bool simple_phrase_score=Impl::simple_phrase_score;
-  Impl& d() { return impl;//static_cast<Impl&>(*this);
-  }
-  Impl const& d() const { return impl;
-    //static_cast<Impl const&>(*this);
-  }
-  int markov_order() const { return d().markov_order(); }
-
-  std::string describe() const   {
-    return d().describe();
-  }
-
-  virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                        WordID w,void const* state,void *next_state,Accum *a) const {
-    return d().ScanAccum(smeta,edge,w,state,next_state,a);
-  }
-
-  virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                              WordID const* i, WordID const* end,
-                              void const* state,void *next_state,Accum *a) const {
-    return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a);
-  }
-
-  virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                           WordID const* i, WordID const* end,
-                           void const* state,Accum *a) const {
-    return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a);
-  }
-
-  virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const {
-    return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a);
-  }
-
-  virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const {
-    return d().early_score_words(smeta,edge,i,end,accum);
-  }
-
-  static std::string usage(bool param,bool verbose) {
-    return Impl::usage(param,verbose);
-  }
-
-  std::string usage_v(bool param,bool verbose) const {
-    return Impl::usage(param,verbose);
-  }
-
-  virtual void print_state(std::ostream &o,void const*state) const {
-    return d().print_state(o,state);
-  }
-
-  void init_name_debug(std::string const& n,bool debug) {
-    FsaFeatureFunction::init_name_debug(n,debug);
-    d().init_name_debug(n,debug);
-  }
-
-  virtual void Init() {
-    d().sync_to_=(FsaFeatureFunctionData*)this;
-    d().Init();
-    d().sync();
-  }
-
-  template <class I>
-  FsaFeatureFunctionDynamic(I const& param) : impl(param) {
-    Init();
-  }
-private:
-  Impl impl;
-};
-
-// constructor takes ptr or shared_ptr to Impl, otherwise same as above - note: not virtual
-template <class Impl>
-struct FsaFeatureFunctionPimpl : public FsaFeatureFunctionData {
-  typedef boost::shared_ptr<Impl const> Pimpl;
-  static const bool simple_phrase_score=Impl::simple_phrase_score;
-  Impl const& d() const { return *p_; }
-  int markov_order() const { return d().markov_order(); }
-
-  std::string describe() const   {
-    return d().describe();
-  }
-
-  void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                        WordID w,void const* state,void *next_state,Accum *a) const {
-    return d().ScanAccum(smeta,edge,w,state,next_state,a);
-  }
-
-  void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge,
-                              WordID const* i, WordID const* end,
-                              void const* state,void *next_state,Accum *a) const {
-    return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a);
-  }
-
-  void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,
-                           WordID const* i, WordID const* end,
-                           void const* state,Accum *a) const {
-    return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a);
-  }
-
-  void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const {
-    return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a);
-  }
-
-  int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const {
-    return d().early_score_words(smeta,edge,i,end,accum);
-  }
-
-  static std::string usage(bool param,bool verbose) {
-    return Impl::usage(param,verbose);
-  }
-
-  std::string usage_v(bool param,bool verbose) const {
-    return Impl::usage(param,verbose);
-  }
-
-  void print_state(std::ostream &o,void const*state) const {
-    return d().print_state(o,state);
-  }
-
-#if 0
-  // this and Init() don't touch p_ because we want to leave the original alone.
-      void init_name_debug(std::string const& n,bool debug) {
-    FsaFeatureFunctionData::init_name_debug(n,debug);
-  }
-#endif
-  void Init() {
-    p_=hold_pimpl_.get();
-#if 0
-    d().sync_to_=static_cast<FsaFeatureFunctionData*>(this);
-    d().Init();
-#endif
-    *static_cast<FsaFeatureFunctionData*>(this)=d();
-  }
-
-  FsaFeatureFunctionPimpl(Impl const* const p) : hold_pimpl_(p,null_deleter()) {
-    Init();
-  }
-  FsaFeatureFunctionPimpl(Pimpl const& p) : hold_pimpl_(p) {
-    Init();
-  }
-private:
-  Impl const* p_;
-  Pimpl hold_pimpl_;
-};
-
-typedef FsaFeatureFunctionPimpl<FsaFeatureFunction> FsaFeatureFunctionFwd; // allow ff_from_fsa for an existing dynamic-type ff (as opposed to usual register a wrapped known-type FSA in ff_register, which is more efficient)
-//typedef FsaFeatureFunctionDynamic<FsaFeatureFunctionFwd> DynamicFsaFeatureFunctionFwd;  //if you really need to have a dynamic fsa facade that's also a dynamic fsa
-
-//TODO: combine 2 (or N) FsaFeatureFunction (type erased)
-
-
-#endif
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index afa36b96..5e16d4e3 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -46,7 +46,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 #endif
 
 #include "ff_lm.h"
-#include "ff_lm_fsa.h"
 
 #include <sstream>
 #include <unistd.h>
@@ -69,10 +68,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 
 using namespace std;
 
-string LanguageModelFsa::usage(bool param,bool verbose) {
-  return FeatureFunction::usage_helper("LanguageModelFsa",usage_short,usage_verbose,param,verbose);
-}
-
 string LanguageModel::usage(bool param,bool verbose) {
   return FeatureFunction::usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
 }
@@ -524,49 +519,6 @@ LanguageModel::LanguageModel(const string& param) {
   SetStateSize(LanguageModelImpl::OrderToStateSize(order));
 }
 
-//TODO: decide whether to waste a word of space so states are always none-terminated for SRILM.  otherwise we have to copy
-void LanguageModelFsa::set_ngram_order(int i) {
-  assert(i>0);
-  ngram_order_=i;
-  ctxlen_=i-1;
-  set_state_bytes(ctxlen_*sizeof(WordID));
-  WordID *ss=(WordID*)start.begin();
-  WordID *hs=(WordID*)h_start.begin();
-  if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
-    set_end_phrase(TD::Convert("</s>"));
-// se is pretty boring in unigram case, just adds constant prob.  check that this is what we want
-    ss[0]=TD::Convert("<s>"); // start-sentence context (length 1)
-    hs[0]=0; // empty context
-    for (int i=1;i<ctxlen_;++i) {
-      ss[i]=hs[i]=0; // need this so storage is initialized for hashing.
-      //TODO: reevaluate whether state space comes cleared by allocator or not.
-    }
-  }
-  sync(); // for dynamic markov_order copy etc
-}
-
-LanguageModelFsa::LanguageModelFsa(string const& param) {
-  int lmorder;
-  pimpl_ = make_lm_impl(param,&lmorder,&fid_);
-  Init();
-  floor_=pimpl_->floor_;
-  set_ngram_order(lmorder);
-}
-
-void LanguageModelFsa::print_state(ostream &o,void const* st) const {
-  WordID const *wst=(WordID const*)st;
-  o<<'[';
-  bool sp=false;
-  for (int i=ctxlen_;i>0;sp=true) {
-    --i;
-    WordID w=wst[i];
-    if (w==0) continue;
-    if (sp) o<<' ';
-    o << TD::Convert(w);
-  }
-  o<<']';
-}
-
 Features LanguageModel::features() const {
   return single_feature(fid_);
 }
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
deleted file mode 100755
index 85b7ef44..00000000
--- a/decoder/ff_lm_fsa.h
+++ /dev/null
@@ -1,140 +0,0 @@
-#ifndef FF_LM_FSA_H
-#define FF_LM_FSA_H
-
-//FIXME: when FSA_LM_PHRASE 1, 3gram fsa has differences, especially with unk words, in about the 4th decimal digit (about .05%), compared to regular ff_lm.  this is USUALLY a bug (there's way more actual precision in there).  this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that).  also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
-
-// enabling for now - retest unigram+ more, solve above puzzle
-
-// some impls in ff_lm.cc
-
-#define FSA_LM_PHRASE 1
-
-#define FSA_LM_DEBUG 0
-#if FSA_LM_DEBUG
-# define FSALMDBG(e,x) FSADBGif(debug(),e,x)
-# define FSALMDBGnl(e) FSADBGif_nl(debug(),e)
-#else
-# define FSALMDBG(e,x)
-# define FSALMDBGnl(e)
-#endif
-
-#include "ff_fsa.h"
-#include "ff_lm.h"
-
-#ifndef TD__none
-// replacing dependency on SRILM
-#define TD__none -1
-#endif
-
-namespace {
-WordID empty_context=TD__none;
-}
-
-struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
-  typedef WordID * W;
-  typedef WordID const* WP;
-
-  // overrides; implementations in ff_lm.cc
-  typedef SingleFeatureAccumulator Accum;
-  static std::string usage(bool,bool);
-  LanguageModelFsa(std::string const& param);
-  int markov_order() const { return ctxlen_; }
-  void print_state(std::ostream &,void const *) const;
-  inline Featval floored(Featval p) const {
-    return p<floor_?floor_:p;
-  }
-  static inline WordID const* left_end(WordID const* left, WordID const* e) {
-    for (;e>left;--e)
-      if (e[-1]!=TD__none) break;
-    //post: [left,e] are the seen left words
-    return e;
-  }
-
-  template <class Accum>
-  void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const {
-#if USE_INFO_EDGE
-    Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
-#endif
-    if (!ctxlen_) {
-      Add(floored(pimpl_->WordProb(w,&empty_context)),a);
-    } else {
-      WordID ctx[ngram_order_]; //alloca if you don't have C99
-      state_copy(ctx,old_st);
-      ctx[ctxlen_]=TD__none;
-      Featval p=floored(pimpl_->WordProb(w,ctx));
-      FSALMDBG(de,"p("<<TD::Convert(w)<<"|"<<TD::Convert(ctx,ctx+ctxlen_)<<")="<<p);FSALMDBGnl(de);
-      // states are srilm contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
-      WordID *nst=(WordID *)new_st;
-      nst[0]=w; // new most recent word
-      to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
-#if LM_FSA_SHORTEN_CONTEXT
-      p+=pimpl_->ShortenContext(nst,ctxlen_);
-#endif
-      Add(p,a);
-    }
-  }
-
-#if FSA_LM_PHRASE
-  //FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?)
-  template <class Accum>
-  void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
-    Hypergraph::Edge &de=(Hypergraph::Edge &)edge;(void)de;
-    if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
-    /* // this is forcing unigram prob always.  we will instead build the phrase
-    if (!ctxlen_) {
-      Featval p=0;
-      for (;i<end;++i)
-        p+=floored(pimpl_->WordProb(*i,e&mpty_context));
-      Add(p,a);
-      return;
-      } */
-    int nw=end-begin;
-    WP st=(WP)old_st;
-    WP st_end=st+ctxlen_; // may include some null already (or none if full)
-    int nboth=nw+ctxlen_;
-    WordID ctx[nboth+1];
-    ctx[nboth]=TD__none;
-    // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0]
-    W ctx_score_end=wordcpy_reverse(ctx,begin,end);
-    wordcpy(ctx_score_end,st,st_end); // st already reversed.
-    assert(ctx_score_end==ctx+nw);
-    // we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones.  most contexts are full except for the shortest source spans.
-    FSALMDBG(de," scan.r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
-    Featval p=0;
-    FSALMDBGnl(edge);
-    for(;ctx_score_end>ctx;--ctx_score_end)
-      p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end));
-    //TODO: look for score discrepancy -
-    // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that.  perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
-#if LM_FSA_SHORTEN_CONTEXT
-    p+=pimpl_->ShortenContext(ctx,nboth<ctxlen_?nboth:ctxlen_);
-#endif
-    state_copy(new_st,ctx);
-    FSALMDBG(de," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
-    FSALMDBGnl(edge);
-    Add(p,a);
-  }
-
-  SCAN_PHRASE_ACCUM_OVERRIDE
-#endif
-
-  // impl details:
-  void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows.  otherwise, it's the same as a "-o i" argument to constructor
-  // note: if you adjust ngram_order, ff_from_fsa won't notice.
-
-  double floor_; // log10prob minimum used (e.g. unk words)
-
-  // because we might have a custom fid due to lm name option:
-  void Init() {
-    InitHaveFid();
-  }
-
-private:
-  int ngram_order_;
-  int ctxlen_; // 1 less than above
-  LanguageModelInterface *pimpl_;
-
-};
-
-
-#endif
diff --git a/decoder/ff_register.h b/decoder/ff_register.h
index eff23537..80b1457e 100755
--- a/decoder/ff_register.h
+++ b/decoder/ff_register.h
@@ -2,50 +2,12 @@
 #define FF_FSA_REGISTER_H
 
 #include "ff_factory.h"
-#include "ff_from_fsa.h"
-#include "ff_fsa_dynamic.h"
-
-inline std::string prefix_fsa(std::string const& name,bool fsa_prefix_ff) {
-  return fsa_prefix_ff ? "Fsa"+name : name;
-}
-
-//FIXME: problem with FeatureFunctionFromFsa<FsaFeatureFunction> - need to use factory rather than ctor.
-#if 0
-template <class DynFsa>
-inline void RegisterFsa(bool ff_also=true,bool fsa_prefix_ff=true) {
-  assert(!ff_also);
-//  global_fsa_ff_registry->RegisterFsa<DynFsa>();
-//if (ff_also) ff_registry.RegisterFF<FeatureFunctionFromFsa<DynFsa> >(prefix_fsa(DynFsa::usage(false,false)),fsa_prefix_ff);
-}
-#endif
-
-//TODO: ff from fsa that uses pointer to fsa impl?  e.g. in LanguageModel we share underlying lm file by recognizing same param, but without that effort, otherwise stateful ff may duplicate state if we enable both fsa and ff_from_fsa
-template <class FsaImpl>
-inline void RegisterFsaImpl(bool ff_also=true,bool fsa_prefix_ff=false) {
-  typedef FsaFeatureFunctionDynamic<FsaImpl> DynFsa;
-  typedef FeatureFunctionFromFsa<FsaImpl> FFFrom;
-  std::string name=FsaImpl::usage(false,false);
-  fsa_ff_registry.Register(new FsaFactory<DynFsa>);
-  if (ff_also)
-    ff_registry.Register(prefix_fsa(name,fsa_prefix_ff),new FFFactory<FFFrom>);
-}
 
 template <class Impl>
 inline void RegisterFF() {
   ff_registry.Register(new FFFactory<Impl>);
 }
 
-template <class FsaImpl>
-inline void RegisterFsaDynToFF(std::string name,bool prefix=true) {
-  typedef FsaFeatureFunctionDynamic<FsaImpl> DynFsa;
-  ff_registry.Register(prefix?"DynamicFsa"+name:name,new FFFactory<FeatureFunctionFromFsa<DynFsa> >);
-}
-
-template <class FsaImpl>
-inline void RegisterFsaDynToFF(bool prefix=true) {
-  RegisterFsaDynToFF<FsaImpl>(FsaImpl::usage(false,false),prefix);
-}
-
 void register_feature_functions();
 
 #endif
diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc
index 3be5b82d..5d1910fb 100644
--- a/decoder/hg_test.cc
+++ b/decoder/hg_test.cc
@@ -57,7 +57,7 @@ TEST_F(HGTest,Union) {
   c3 = ViterbiESentence(hg1, &t3);
   int l3 = ViterbiPathLength(hg1);
   cerr << c3 << "\t" << TD::GetString(t3) << endl;
-  EXPECT_FLOAT_EQ(c2, c3);
+  EXPECT_FLOAT_EQ(c2.as_float(), c3.as_float());
   EXPECT_EQ(TD::GetString(t2), TD::GetString(t3));
   EXPECT_EQ(l2, l3);
 
@@ -117,7 +117,7 @@ TEST_F(HGTest,InsideScore) {
   cerr << "cost: " << cost << "\n";
   hg.PrintGraphviz();
   prob_t inside = Inside<prob_t, EdgeProb>(hg);
-  EXPECT_FLOAT_EQ(1.7934048, inside);  // computed by hand
+  EXPECT_FLOAT_EQ(1.7934048, inside.as_float());  // computed by hand
   vector<prob_t> post;
   inside = hg.ComputeBestPathThroughEdges(&post);
   EXPECT_FLOAT_EQ(-0.3, log(inside));  // computed by hand
@@ -282,13 +282,13 @@ TEST_F(HGTest, TestGenericInside) {
   hg.Reweight(wts);
   vector<prob_t> inside;
   prob_t ins = Inside<prob_t, EdgeProb>(hg, &inside);
-  EXPECT_FLOAT_EQ(1.7934048, ins);  // computed by hand
+  EXPECT_FLOAT_EQ(1.7934048, ins.as_float());  // computed by hand
   vector<prob_t> outside;
   Outside<prob_t, EdgeProb>(hg, inside, &outside);
   EXPECT_EQ(3, outside.size());
-  EXPECT_FLOAT_EQ(1.7934048, outside[0]);
-  EXPECT_FLOAT_EQ(1.3114071, outside[1]);
-  EXPECT_FLOAT_EQ(1.0, outside[2]);
+  EXPECT_FLOAT_EQ(1.7934048, outside[0].as_float());
+  EXPECT_FLOAT_EQ(1.3114071, outside[1].as_float());
+  EXPECT_FLOAT_EQ(1.0, outside[2].as_float());
 }
 
 TEST_F(HGTest,TestGenericInside2) {
@@ -327,8 +327,8 @@ TEST_F(HGTest,TestAddExpectations) {
   SparseVector<prob_t> feat_exps;
   prob_t z = InsideOutside<prob_t, EdgeProb,
                   SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(hg, &feat_exps);
-  EXPECT_FLOAT_EQ(-2.5439765, feat_exps.value(FD::Convert("f1")) / z);
-  EXPECT_FLOAT_EQ(-2.6357865, feat_exps.value(FD::Convert("f2")) / z);
+  EXPECT_FLOAT_EQ(-2.5439765, (feat_exps.value(FD::Convert("f1")) / z).as_float());
+  EXPECT_FLOAT_EQ(-2.6357865, (feat_exps.value(FD::Convert("f2")) / z).as_float());
   cerr << feat_exps << endl;
   cerr << "Z=" << z << endl;
 }
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index f87b7274..993627f0 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -9,6 +9,7 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "stringlib.h"
 #include "verbose.h"
 #include "hg.h"
 #include "prob.h"
@@ -204,6 +205,7 @@ bool LoadAgenda(const string& file, vector<pair<string, int> >* a) {
 }
 
 int main(int argc, char** argv) {
+  cerr << "THIS SOFTWARE IS DEPRECATED YOU SHOULD USE mpi_flex_optimize\n";
 #ifdef HAVE_MPI
   mpi::environment env(argc, argv);
   mpi::communicator world;
-- 
cgit v1.2.3


From f036d4ec5c79db95df3470adb7cd317ff258ab7d Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 14 Oct 2011 22:39:37 +0100
Subject: le optimizer

---
 training/Makefile.am          |   4 +
 training/mpi_flex_optimize.cc | 346 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 training/mpi_flex_optimize.cc

(limited to 'training')

diff --git a/training/Makefile.am b/training/Makefile.am
index 0b598fd5..2a11ae52 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -12,6 +12,7 @@ bin_PROGRAMS = \
   mpi_extract_reachable \
   mpi_extract_features \
   mpi_online_optimize \
+  mpi_flex_optimize \
   mpi_batch_optimize \
   mpi_compute_cllh \
   augment_grammar
@@ -25,6 +26,9 @@ TESTS = lbfgs_test optimize_test
 mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc
 mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
+mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc online_optimizer.cc optimize.cc
+mpi_flex_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
 mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
 mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc
new file mode 100644
index 00000000..87c5f331
--- /dev/null
+++ b/training/mpi_flex_optimize.cc
@@ -0,0 +1,346 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "stringlib.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "optimize.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses command-line and optional config-file options into *conf; prints usage and returns false when --help is given or a required option is missing.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("cdec_config,c",po::value<string>(),"Decoder configuration file")
+        ("weights,w",po::value<string>(),"Initial feature weights")
+        ("training_data,d",po::value<string>(),"Training data")
+        ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(6), "Number of training instances evaluated per processor in each minibatch")
+        ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (options: lbfgs, sgd, rprop)")
+        ("minibatch_iterations,i", po::value<unsigned>()->default_value(10), "Number of optimization iterations per minibatch (1 = standard SGD)")
+        ("iterations,I", po::value<unsigned>()->default_value(50), "Number of passes through the training data before termination")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("lbfgs_memory_buffers,M", po::value<unsigned>()->default_value(10), "Number of memory buffers for LBFGS history")
+        ("eta_0,e", po::value<double>()->default_value(0.1), "Initial learning rate for SGD")
+        ("L1,1","Use L1 regularization")
+        ("L2,2","Use L2 regularization")
+        ("regularization_strength,C", po::value<double>()->default_value(1.0), "Regularization strength (C)");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);  // options accepted in the config file
+  dcmdline_options.add(opts).add(clo);  // command line additionally accepts --config and --help
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): no check that the file opened
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("cdec_config")) {
+    cerr << "General-purpose minibatch online optimizer (MPI support "
+#ifdef HAVE_MPI  // same test the rest of this file uses to compile the MPI code paths
+         << "enabled"
+#else
+         << "not enabled"
+#endif
+         << ")\n" << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+// Reads the training corpus from fname, keeping only the lines assigned to this
+// process: line i (0-based) is kept iff i % size == rank.  Kept lines are
+// appended to *c and their global line indices to *order.
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
+  ReadFile rf(fname);
+  istream& input = *rf.stream();
+  string sentence;
+  // getline's stream result is false exactly when no further line could be read
+  for (int line_no = 0; getline(input, sentence); ++line_no) {
+    const bool mine = (line_no % size == rank);
+    if (!mine) continue;
+    c->push_back(sentence);
+    order->push_back(line_no);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+// Decoder observer that copies the translation forest into *hg_ and the alignment forest into *gold_hg_.
+struct CopyHGsObserver : public DecoderObserver {
+  Hypergraph* hg_;       // destination for the translation forest (never deleted here)
+  Hypergraph* gold_hg_;  // destination for the alignment forest (never deleted here)
+
+  CopyHGsObserver() : hg_(NULL), gold_hg_(NULL), state(0) {}  // no indeterminate members
+
+  // drops the rule pointer on every edge; this can free up some memory
+  void RemoveRules(Hypergraph* h) {
+    for (unsigned i = 0; i < h->edges_.size(); ++i)
+      h->edges_[i].rule_.reset();
+  }
+
+  void SetCurrentHypergraphs(Hypergraph* h, Hypergraph* gold_h) {  // call before decoding
+    hg_ = h;
+    gold_hg_ = gold_h;
+  }
+
+  virtual void NotifyDecodingStart(const SentenceMetadata&) { state = 1; }
+
+  // keep a rule-stripped copy of the model forest (objective's denominator)
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+    *hg_ = *hg;
+    RemoveRules(hg_);
+    assert(state == 1);
+    state = 2;
+  }
+
+  // keep a rule-stripped copy of the "empirical" forest (objective's numerator)
+  virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) {
+    assert(state == 2);
+    state = 3;
+    *gold_hg_ = *hg;
+    RemoveRules(gold_hg_);
+  }
+
+  virtual void NotifyDecodingComplete(const SentenceMetadata&) {
+    if (state != 3) {  // both forests were not delivered: discard whatever was copied
+      hg_->clear();
+      gold_hg_->clear();
+    }
+  }
+
+  int state;  // 0 = idle, 1 = decoding started, 2 = translation forest seen, 3 = alignment forest seen
+};
+
+// Slurps the decoder configuration file `ini` and installs its text as the
+// contents of *out; every line read is re-emitted followed by a newline.
+void ReadConfig(const string& ini, istringstream* out) {
+  ReadFile rf(ini);
+  istream& src = *rf.stream();
+  ostringstream buffered;
+  string text;
+  // the loop ends on the first getline that fails to produce a line
+  while (getline(src, text))
+    buffered << text << endl;
+  out->str(buffered.str());
+}
+
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {
+  template<>
+  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > 
+    : mpl::true_ { };  // declares std::plus on SparseVector<double> commutative to Boost.MPI (used by the reduce() in main)
+} } // end namespace boost::mpi
+#endif
+
+// Adds s * x to *acc, converting each prob_t entry with as_float(); x is taken by const reference so the vector is not copied per call.
+void AddGrad(const SparseVector<prob_t>& x, double s, SparseVector<double>* acc) {
+  for (SparseVector<prob_t>::const_iterator it = x.begin(); it != x.end(); ++it) acc->add_value(it->first, it->second.as_float() * s);
+}
+
+int main(int argc, char** argv) {  // minibatch training driver: decode a sample, then run minibatch_iterations optimizer steps on it
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size(); 
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+  MT19937* rng = NULL;  // NOTE(review): allocated with new below and never deleted in this function
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;
+
+  boost::shared_ptr<BatchOptimizer> o;  // reset at the start of every outer iteration, re-created lazily on rank 0
+  const unsigned lbfgs_memory_buffers = conf["lbfgs_memory_buffers"].as<unsigned>();
+
+  istringstream ins;
+  ReadConfig(conf["cdec_config"].as<string>(), &ins);
+  Decoder decoder(&ins);
+
+  // load initial weights
+  vector<weight_t> init_weights;
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &init_weights);
+
+  vector<string> corpus;  // this process's share of the training lines
+  vector<int> ids;        // global line index of each entry in corpus
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+  assert(corpus.size() > 0);
+
+  const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
+  if (size_per_proc > corpus.size()) {
+    cerr << "Minibatch size must be smaller than corpus size!\n";
+    return 1;
+  }
+
+  size_t total_corpus_size = 0;
+#ifdef HAVE_MPI
+  reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);  // result is delivered to root rank 0 only
+#else
+  total_corpus_size = corpus.size();
+#endif
+
+  if (conf.count("random_seed"))
+    rng = new MT19937(conf["random_seed"].as<uint32_t>());
+  else
+    rng = new MT19937;
+
+  const unsigned minibatch_iterations = conf["minibatch_iterations"].as<unsigned>();
+
+  if (rank == 0) {
+    cerr << "Total corpus size: " << total_corpus_size << endl;
+    const unsigned batch_size = size_per_proc * size;  // NOTE(review): computed but never used
+  }
+
+  SparseVector<double> x;
+  Weights::InitSparseVector(init_weights, &x);
+  CopyHGsObserver observer;
+
+  int write_weights_every_ith = 100; // TODO configure; NOTE(review): only referenced inside the #if 0 block below
+  int titer = -1;
+
+  vector<weight_t>& lambdas = decoder.CurrentWeightVector();  // reference into the decoder, so updates to lambdas are seen by it
+  lambdas.swap(init_weights);
+  init_weights.clear();
+
+  int iter = -1;
+  bool converged = false;  // NOTE(review): only assigned inside the #if 0 block, and the "iterations" option is never read, so this loop has no normal exit
+  while (!converged) {
+#ifdef HAVE_MPI
+    mpi::timer timer;
+#endif
+    x.init_vector(&lambdas);  // presumably copies x into the dense lambdas vector — TODO confirm against SparseVector
+    ++iter; ++titer;
+#if 0
+    if (rank == 0) {
+      converged = (iter == max_iteration);
+      Weights::SanityCheck(lambdas);
+      Weights::ShowLargestFeatures(lambdas);
+        string fname = "weights.cur.gz";
+        if (iter % write_weights_every_ith == 0) {
+          ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
+          fname = o.str();
+        }
+        if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
+        ostringstream vv;
+        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
+        const string svv = vv.str();
+        cerr << svv << endl;
+        Weights::WriteToFile(fname, lambdas, true, &svv);
+      }
+#endif
+
+      vector<Hypergraph> hgs(size_per_proc);       // model forests for this minibatch
+      vector<Hypergraph> gold_hgs(size_per_proc);  // matching gold forests
+      for (int i = 0; i < size_per_proc; ++i) {  // NOTE(review): signed/unsigned comparison (size_per_proc is unsigned)
+        int ei = corpus.size() * rng->next();  // presumably next() is uniform in [0,1), giving a random local index — TODO confirm
+        int id = ids[ei];
+        observer.SetCurrentHypergraphs(&hgs[i], &gold_hgs[i]);
+        decoder.SetId(id);
+        decoder.Decode(corpus[ei], &observer);
+      }
+
+      SparseVector<double> local_grad, g;
+      double local_obj = 0;
+      o.reset();  // discard optimizer state from the previous minibatch
+      for (unsigned mi = 0; mi < minibatch_iterations; ++mi) {
+        local_grad.clear();
+        g.clear();
+        local_obj = 0;
+
+        for (unsigned i = 0; i < size_per_proc; ++i) {
+          Hypergraph& hg = hgs[i];
+          Hypergraph& hg_gold = gold_hgs[i];
+          if (hg.edges_.size() < 2) continue;  // skips forests with fewer than two edges, e.g. ones the observer cleared
+
+          hg.Reweight(lambdas);
+          hg_gold.Reweight(lambdas);
+          SparseVector<prob_t> model_exp, gold_exp;
+          const prob_t z = InsideOutside<prob_t,
+                                         EdgeProb,
+                                         SparseVector<prob_t>,
+                                         EdgeFeaturesAndProbWeightFunction>(hg, &model_exp);
+          local_obj += log(z);
+          model_exp /= z;  // normalize by z before accumulating
+          AddGrad(model_exp, 1.0, &local_grad);
+          model_exp.clear();
+
+          const prob_t goldz = InsideOutside<prob_t,
+                                         EdgeProb,
+                                         SparseVector<prob_t>,
+                                         EdgeFeaturesAndProbWeightFunction>(hg_gold, &gold_exp);
+          local_obj -= log(goldz);
+
+          if (log(z) - log(goldz) < kMINUS_EPSILON) {  // gold mass exceeding model mass (beyond tolerance) is treated as fatal
+            cerr << "DIFF. ERR! log_model_z < log_gold_z: " << log(z) << " " << log(goldz) << endl;
+            return 1;
+          }
+
+          gold_exp /= goldz;
+          AddGrad(gold_exp, -1.0, &local_grad);  // gradient accumulates model expectations minus gold expectations
+        }
+
+        double obj = 0;  // NOTE(review): stays 0 in MPI builds because local_obj is never reduced (see TODO below)
+#ifdef HAVE_MPI
+        // TODO obj
+        reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
+#else
+        obj = local_obj;
+        g.swap(local_grad);
+#endif
+        local_grad.clear();
+        if (rank == 0) {
+          g /= (size_per_proc * size);  // average over the global minibatch
+          if (!o)
+            o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
+          vector<double> gg(FD::NumFeats());  // dense copy of the gradient for the optimizer
+          if (gg.size() != lambdas.size()) { lambdas.resize(gg.size()); }
+          for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it)
+            if (it->first) { gg[it->first] = it->second; }  // feature id 0 is skipped
+          cerr << "OBJ: " << obj << endl;
+          o->Optimize(obj, gg, &lambdas);
+        }
+#ifdef HAVE_MPI
+        broadcast(world, x, 0);  // NOTE(review): broadcasts x, but Optimize() wrote to lambdas — confirm non-root ranks receive the new weights
+        broadcast(world, converged, 0);
+        world.barrier();
+        if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+    }
+  }
+  return 0;
+}
-- 
cgit v1.2.3