Diffstat (limited to 'training')
-rw-r--r--  training/Makefile.am              |   8
-rw-r--r--  training/atools.cc                | 370
-rw-r--r--  training/em_utils.h               |  24
-rw-r--r--  training/lbl_model.cc             | 421
-rw-r--r--  training/model1.cc                | 109
-rw-r--r--  training/mpi_flex_optimize.cc     |  13
-rw-r--r--  training/mr_em_adapted_reduce.cc  |   6
-rw-r--r--  training/ttables.h                |   4
8 files changed, 539 insertions, 416 deletions
diff --git a/training/Makefile.am b/training/Makefile.am
index 2a11ae52..991ac210 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,12 +1,12 @@
 bin_PROGRAMS = \
   model1 \
+  lbl_model \
   test_ngram \
   mr_em_map_adapter \
   mr_em_adapted_reduce \
   mr_reduce_to_weights \
   mr_optimize_reduce \
   grammar_convert \
-  atools \
   plftools \
   collapse_weights \
   mpi_extract_reachable \
@@ -47,12 +47,12 @@ augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/lib
 test_ngram_SOURCES = test_ngram.cc
 test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz

-atools_SOURCES = atools.cc
-atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
 model1_SOURCES = model1.cc ttables.cc
 model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz

+lbl_model_SOURCES = lbl_model.cc optimize.cc
+lbl_model_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+
 grammar_convert_SOURCES = grammar_convert.cc
 grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/training/atools.cc b/training/atools.cc
deleted file mode 100644
index 42579627..00000000
--- a/training/atools.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include <queue>
-#include <map>
-#include <boost/program_options.hpp>
-#include <boost/shared_ptr.hpp>
-
-#include "filelib.h"
-#include "aligner.h"
-#include "alignment_pharaoh.h"
-
-namespace po = boost::program_options;
-using namespace std;
-using boost::shared_ptr;
-
-struct Command {
-  virtual ~Command() {}
-  virtual string Name() const = 0;
-
-  // returns 1 for alignment grid output [default]
-  // returns 2 if Summary() should be called [for AER, etc]
-  virtual int Result() const { return 1; }
-
-  virtual bool RequiresTwoOperands() const { return true; }
-  virtual void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) = 0;
-  void EnsureSize(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    x->resize(max(a.width(), b.width()), max(a.height(), b.height()));
-  }
-  static bool Safe(const Array2D<bool>& a, int i, int j) {
-    if (i >= 0 && j >= 0 && i < a.width() && j < a.height())
-      return a(i,j);
-    else
-      return false;
-  }
-  virtual void Summary() { assert(!"Summary should have been overridden"); }
-};
-
-// compute fmeasure, second alignment is reference, first is hyp
-struct FMeasureCommand : public Command {
-  FMeasureCommand() : matches(), num_predicted(), num_in_ref() {}
-  int Result() const { return 2; }
-  string Name() const { return "fmeasure"; }
-  bool RequiresTwoOperands() const { return true; }
-  void Apply(const Array2D<bool>& hyp, const Array2D<bool>& ref, Array2D<bool>* x) {
-    (void) x; // AER just computes statistics, not an alignment
-    int i_len = ref.width();
-    int j_len = ref.height();
-    for (int i = 0; i < i_len; ++i) {
-      for (int j = 0; j < j_len; ++j) {
-        if (ref(i,j)) {
-          ++num_in_ref;
-          if (Safe(hyp, i, j)) ++matches;
-        }
-      }
-    }
-    for (int i = 0; i < hyp.width(); ++i)
-      for (int j = 0; j < hyp.height(); ++j)
-        if (hyp(i,j)) ++num_predicted;
-  }
-  void Summary() {
-    if (num_predicted == 0 || num_in_ref == 0) {
-      cerr << "Insufficient statistics to compute f-measure!\n";
-      abort();
-    }
-    const double prec = static_cast<double>(matches) / num_predicted;
-    const double rec = static_cast<double>(matches) / num_in_ref;
-    cout << "P: " << prec << endl;
-    cout << "R: " << rec << endl;
-    const double f = (2.0 * prec * rec) / (rec + prec);
-    cout << "F: " << f << endl;
-  }
-  int matches;
-  int num_predicted;
-  int num_in_ref;
-};
-
-struct DisplayCommand : public Command {
-  string Name() const { return "display"; }
-  bool RequiresTwoOperands() const { return false; }
-  void Apply(const Array2D<bool>& in, const Array2D<bool>& not_used, Array2D<bool>* x) {
-    *x = in;
-    cout << *x << endl;
-  }
-};
-
-struct ConvertCommand : public Command {
-  string Name() const { return "convert"; }
-  bool RequiresTwoOperands() const { return false; }
-  void Apply(const Array2D<bool>& in, const Array2D<bool>& not_used, Array2D<bool>* x) {
-    *x = in;
-  }
-};
-
-struct InvertCommand : public Command {
-  string Name() const { return "invert"; }
-  bool RequiresTwoOperands() const { return false; }
-  void Apply(const Array2D<bool>& in, const Array2D<bool>& not_used, Array2D<bool>* x) {
-    Array2D<bool>& res = *x;
-    res.resize(in.height(), in.width());
-    for (int i = 0; i < in.height(); ++i)
-      for (int j = 0; j < in.width(); ++j)
-        res(i, j) = in(j, i);
-  }
-};
-
-struct IntersectCommand : public Command {
-  string Name() const { return "intersect"; }
-  bool RequiresTwoOperands() const { return true; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    EnsureSize(a, b, x);
-    Array2D<bool>& res = *x;
-    for (int i = 0; i < a.width(); ++i)
-      for (int j = 0; j < a.height(); ++j)
-        res(i, j) = Safe(a, i, j) && Safe(b, i, j);
-  }
-};
-
-struct UnionCommand : public Command {
-  string Name() const { return "union"; }
-  bool RequiresTwoOperands() const { return true; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    EnsureSize(a, b, x);
-    Array2D<bool>& res = *x;
-    for (int i = 0; i < res.width(); ++i)
-      for (int j = 0; j < res.height(); ++j)
-        res(i, j) = Safe(a, i, j) || Safe(b, i, j);
-  }
-};
-
-struct RefineCommand : public Command {
-  RefineCommand() {
-    neighbors_.push_back(make_pair(1,0));
-    neighbors_.push_back(make_pair(-1,0));
-    neighbors_.push_back(make_pair(0,1));
-    neighbors_.push_back(make_pair(0,-1));
-  }
-  bool RequiresTwoOperands() const { return true; }
-
-  void Align(int i, int j) {
-    res_(i, j) = true;
-    is_i_aligned_[i] = true;
-    is_j_aligned_[j] = true;
-  }
-
-  bool IsNeighborAligned(int i, int j) const {
-    for (int k = 0; k < neighbors_.size(); ++k) {
-      const int di = neighbors_[k].first;
-      const int dj = neighbors_[k].second;
-      if (Safe(res_, i + di, j + dj))
-        return true;
-    }
-    return false;
-  }
-
-  bool IsNeitherAligned(int i, int j) const {
-    return !(is_i_aligned_[i] || is_j_aligned_[j]);
-  }
-
-  bool IsOneOrBothUnaligned(int i, int j) const {
-    return !(is_i_aligned_[i] && is_j_aligned_[j]);
-  }
-
-  bool KoehnAligned(int i, int j) const {
-    return IsOneOrBothUnaligned(i, j) && IsNeighborAligned(i, j);
-  }
-
-  typedef bool (RefineCommand::*Predicate)(int i, int j) const;
-
- protected:
-  void InitRefine(
-      const Array2D<bool>& a,
-      const Array2D<bool>& b) {
-    res_.clear();
-    EnsureSize(a, b, &res_);
-    in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear();
-    EnsureSize(a, b, &in_);
-    EnsureSize(a, b, &un_);
-    is_i_aligned_.resize(res_.width(), false);
-    is_j_aligned_.resize(res_.height(), false);
-    for (int i = 0; i < in_.width(); ++i)
-      for (int j = 0; j < in_.height(); ++j) {
-        un_(i, j) = Safe(a, i, j) || Safe(b, i, j);
-        in_(i, j) = Safe(a, i, j) && Safe(b, i, j);
-        if (in_(i, j)) Align(i, j);
-      }
-  }
-  // "grow" the resulting alignment using the points in adds
-  // if they match the constraints determined by pred
-  void Grow(Predicate pred, bool idempotent, const Array2D<bool>& adds) {
-    if (idempotent) {
-      for (int i = 0; i < adds.width(); ++i)
-        for (int j = 0; j < adds.height(); ++j) {
-          if (adds(i, j) && !res_(i, j) &&
-              (this->*pred)(i, j)) Align(i, j);
-        }
-      return;
-    }
-    set<pair<int, int> > p;
-    for (int i = 0; i < adds.width(); ++i)
-      for (int j = 0; j < adds.height(); ++j)
-        if (adds(i, j) && !res_(i, j))
-          p.insert(make_pair(i, j));
-    bool keep_going = !p.empty();
-    while (keep_going) {
-      keep_going = false;
-      for (set<pair<int, int> >::iterator pi = p.begin();
-           pi != p.end(); ++pi) {
-        if ((this->*pred)(pi->first, pi->second)) {
-          Align(pi->first, pi->second);
-          p.erase(pi);
-          keep_going = true;
-        }
-      }
-    }
-  }
-  Array2D<bool> res_;  // refined alignment
-  Array2D<bool> in_;   // intersection alignment
-  Array2D<bool> un_;   // union alignment
-  vector<bool> is_i_aligned_;
-  vector<bool> is_j_aligned_;
-  vector<pair<int,int> > neighbors_;
-};
-
-struct DiagCommand : public RefineCommand {
-  DiagCommand() {
-    neighbors_.push_back(make_pair(1,1));
-    neighbors_.push_back(make_pair(-1,1));
-    neighbors_.push_back(make_pair(1,-1));
-    neighbors_.push_back(make_pair(-1,-1));
-  }
-};
-
-struct GDCommand : public DiagCommand {
-  string Name() const { return "grow-diag"; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    InitRefine(a, b);
-    Grow(&RefineCommand::KoehnAligned, false, un_);
-    *x = res_;
-  }
-};
-
-struct GDFCommand : public DiagCommand {
-  string Name() const { return "grow-diag-final"; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    InitRefine(a, b);
-    Grow(&RefineCommand::KoehnAligned, false, un_);
-    Grow(&RefineCommand::IsOneOrBothUnaligned, true, a);
-    Grow(&RefineCommand::IsOneOrBothUnaligned, true, b);
-    *x = res_;
-  }
-};
-
-struct GDFACommand : public DiagCommand {
-  string Name() const { return "grow-diag-final-and"; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    InitRefine(a, b);
-    Grow(&RefineCommand::KoehnAligned, false, un_);
-    Grow(&RefineCommand::IsNeitherAligned, true, a);
-    Grow(&RefineCommand::IsNeitherAligned, true, b);
-    *x = res_;
-  }
-};
-
-map<string, boost::shared_ptr<Command> > commands;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  ostringstream os;
-  os << "[REQ] Operation to perform:";
-  for (map<string, boost::shared_ptr<Command> >::iterator it = commands.begin();
-       it != commands.end(); ++it) {
-    os << ' ' << it->first;
-  }
-  string cstr = os.str();
-  opts.add_options()
-        ("input_1,i", po::value<string>(), "[REQ] Alignment 1 file, - for STDIN")
-        ("input_2,j", po::value<string>(), "[OPT] Alignment 2 file, - for STDIN")
-        ("command,c", po::value<string>()->default_value("convert"), cstr.c_str())
-        ("help,h", "Print this help message and exit");
-  po::options_description clo("Command line options");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  po::notify(*conf);
-
-  if (conf->count("help") || conf->count("input_1") == 0 || conf->count("command") == 0) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-  const string cmd = (*conf)["command"].as<string>();
-  if (commands.count(cmd) == 0) {
-    cerr << "Don't understand command: " << cmd << endl;
-    exit(1);
-  }
-  if (commands[cmd]->RequiresTwoOperands()) {
-    if (conf->count("input_2") == 0) {
-      cerr << "Command '" << cmd << "' requires two alignment files\n";
-      exit(1);
-    }
-    if ((*conf)["input_1"].as<string>() == "-" && (*conf)["input_2"].as<string>() == "-") {
-      cerr << "Both inputs cannot be STDIN\n";
-      exit(1);
-    }
-  } else {
-    if (conf->count("input_2") != 0) {
-      cerr << "Command '" << cmd << "' requires only one alignment file\n";
-      exit(1);
-    }
-  }
-}
-
-template<class C> static void AddCommand() {
-  C* c = new C;
-  commands[c->Name()].reset(c);
-}
-
-int main(int argc, char **argv) {
-  AddCommand<ConvertCommand>();
-  AddCommand<DisplayCommand>();
-  AddCommand<InvertCommand>();
-  AddCommand<IntersectCommand>();
-  AddCommand<UnionCommand>();
-  AddCommand<GDCommand>();
-  AddCommand<GDFCommand>();
-  AddCommand<GDFACommand>();
-  AddCommand<FMeasureCommand>();
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  Command& cmd = *commands[conf["command"].as<string>()];
-  boost::shared_ptr<ReadFile> rf1(new ReadFile(conf["input_1"].as<string>()));
-  boost::shared_ptr<ReadFile> rf2;
-  if (cmd.RequiresTwoOperands())
-    rf2.reset(new ReadFile(conf["input_2"].as<string>()));
-  istream* in1 = rf1->stream();
-  istream* in2 = NULL;
-  if (rf2) in2 = rf2->stream();
-  while(*in1) {
-    string line1;
-    string line2;
-    getline(*in1, line1);
-    if (in2) {
-      getline(*in2, line2);
-      if ((*in1 && !*in2) || (*in2 && !*in1)) {
-        cerr << "Mismatched number of lines!\n";
-        exit(1);
-      }
-    }
-    if (line1.empty() && !*in1) break;
-    shared_ptr<Array2D<bool> > out(new Array2D<bool>);
-    shared_ptr<Array2D<bool> > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1);
-    if (in2) {
-      shared_ptr<Array2D<bool> > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2);
-      cmd.Apply(*a1, *a2, out.get());
-    } else {
-      Array2D<bool> dummy;
-      cmd.Apply(*a1, dummy, out.get());
-    }
-
-    if (cmd.Result() == 1) {
-      AlignmentPharaoh::SerializePharaohFormat(*out, &cout);
-    }
-  }
-  if (cmd.Result() == 2)
-    cmd.Summary();
-  return 0;
-}
diff --git a/training/em_utils.h b/training/em_utils.h
deleted file mode 100644
index 37762978..00000000
--- a/training/em_utils.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _EM_UTILS_H_
-#define _EM_UTILS_H_
-
-#include "config.h"
-#ifdef HAVE_BOOST_DIGAMMA
-#include <boost/math/special_functions/digamma.hpp>
-using boost::math::digamma;
-#else
-#warning Using Mark Johnsons digamma()
-#include <cmath>
-inline double digamma(double x) {
-  double result = 0, xx, xx2, xx4;
-  assert(x > 0);
-  for ( ; x < 7; ++x)
-    result -= 1/x;
-  x -= 1.0/2.0;
-  xx = 1.0/x;
-  xx2 = xx*xx;
-  xx4 = xx2*xx2;
-  result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
-  return result;
-}
-#endif
-#endif
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
new file mode 100644
index 00000000..a46ce33c
--- /dev/null
+++ b/training/lbl_model.cc
@@ -0,0 +1,421 @@
+#include <iostream>
+
+#include "config.h"
+#ifndef HAVE_EIGEN
+  int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; }
+#else
+
+#include <cstdlib>
+#include <algorithm>
+#include <cmath>
+#include <set>
+#include <cstring> // memset
+#include <ctime>
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+#include <boost/archive/text_oarchive.hpp>
+namespace mpi = boost::mpi;
+#endif
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <Eigen/Dense>
+
+#include "corpus_tools.h"
+#include "optimize.h"
+#include "array2d.h"
+#include "m.h"
+#include "lattice.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+
+namespace po = boost::program_options;
+using namespace std;
+
+#define kDIMENSIONS 10
+typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector;
+typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector;
+typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix;
+vector<RVector> r_src, r_trg;
+
+#if HAVE_MPI
+namespace boost {
+namespace serialization {
+
+template<class Archive>
+void serialize(Archive & ar, RVector & v, const unsigned int version) {
+  for (unsigned i = 0; i < kDIMENSIONS; ++i)
+    ar & v[i];
+}
+
+} // namespace serialization
+} // namespace boost
+#endif
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("input,i",po::value<string>(),"Input file")
+        ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
+        ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
+        ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD")
+        ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)")
+        ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)")
+        ("random_seed,s", po::value<unsigned>(), "Random seed")
+        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
+        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (argc < 2 || conf->count("help")) {
+    cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void Normalize(RVector* v) {
+  double norm = v->norm();
+  assert(norm > 0.0f);
+  *v /= norm;
+}
+
+void Flatten(const TMatrix& m, vector<double>* v) {
+  unsigned c = 0;
+  v->resize(kDIMENSIONS * kDIMENSIONS);
+  for (unsigned i = 0; i < kDIMENSIONS; ++i)
+    for (unsigned j = 0; j < kDIMENSIONS; ++j) {
+      assert(boost::math::isfinite(m(i, j)));
+      (*v)[c++] = m(i,j);
+    }
+}
+
+void Unflatten(const vector<double>& v, TMatrix* m) {
+  unsigned c = 0;
+  for (unsigned i = 0; i < kDIMENSIONS; ++i)
+    for (unsigned j = 0; j < kDIMENSIONS; ++j) {
+      assert(boost::math::isfinite(v[c]));
+      (*m)(i, j) = v[c++];
+    }
+}
+
+double ApplyRegularization(const double C,
+                           const vector<double>& weights,
+                           vector<double>* g) {
+  assert(weights.size() == g->size());
+  double reg = 0;
+  for (size_t i = 0; i < weights.size(); ++i) {
+    const double& w_i = weights[i];
+    double& g_i = (*g)[i];
+    reg += C * w_i * w_i;
+    g_i += 2 * C * w_i;
+  }
+  return reg;
+}
+
+void LoadEmbeddings(const string& filename, vector<RVector>* pv) {
+  vector<RVector>& v = *pv;
+  cerr << "Reading embeddings from " << filename << " ...\n";
+  ReadFile rf(filename);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  while(getline(in, line)) {
+    ++lc;
+    size_t cur = line.find(' ');
+    if (cur == string::npos || cur == 0) {
+      cerr << "Parse error reading line " << lc << ":\n" << line << endl;
+      abort();
+    }
+    WordID w = TD::Convert(line.substr(0, cur));
+    if (w >= v.size()) continue;
+    RVector& curv = v[w];
+    line[cur] = 0;
+    size_t start = cur + 1;
+    cur = start + 1;
+    size_t c = 0;
+    while(cur < line.size()) {
+      if (line[cur] == ' ') {
+        line[cur] = 0;
+        curv[c++] = strtod(&line[start], NULL);
+        start = cur + 1;
+        cur = start;
+        if (c == kDIMENSIONS) break;
+      }
+      ++cur;
+    }
+    if (c < kDIMENSIONS && cur != start) {
+      if (cur < line.size()) line[cur] = 0;
+      curv[c++] = strtod(&line[start], NULL);
+    }
+    if (c != kDIMENSIONS) {
+      static bool first = true;
+      if (first) {
+        cerr << "  read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n";
+        first = false;
+      }
+      for (; c < kDIMENSIONS; ++c) curv[c] = rand();
+    }
+    if (c == kDIMENSIONS && cur != line.size()) {
+      static bool first = true;
+      if (first) {
+        cerr << "  embedding file contains more dimensions than configured with, truncating.\n";
+        first = false;
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  std::cerr << "**MPI enabled.\n";
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  std::cerr << "**MPI disabled.\n";
+  const int rank = 0;
+  const int size = 1;
+#endif
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+  const string fname = conf["input"].as<string>();
+  const double reg_strength = conf["regularization_strength"].as<double>();
+  const bool has_l2 = reg_strength;
+  assert(reg_strength >= 0.0f);
+  const int ITERATIONS = conf["iterations"].as<unsigned>();
+  const double eta = conf["eta"].as<double>();
+  const double diagonal_tension = conf["diagonal_tension"].as<double>();
+  bool SGD = false;
+  if (diagonal_tension < 0.0) {
+    cerr << "Invalid value for diagonal_tension: must be >= 0\n";
+    return 1;
+  }
+  string testset;
+  if (conf.count("testset")) testset = conf["testset"].as<string>();
+
+  unsigned lc = 0;
+  vector<double> unnormed_a_i;
+  bool flag = false;
+  vector<vector<WordID> > srcs, trgs;
+  vector<WordID> vocab_e;
+  {
+    set<WordID> svocab_e, svocab_f;
+    CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size);
+    copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
+  }
+  cerr << "Number of target word types: " << vocab_e.size() << endl;
+  const double num_examples = lc;
+
+  boost::shared_ptr<LBFGSOptimizer> lbfgs;
+  if (rank == 0)
+    lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100));
+  r_trg.resize(TD::NumWords() + 1);
+  r_src.resize(TD::NumWords() + 1);
+  vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
+
+  if (conf.count("random_seed")) {
+    srand(conf["random_seed"].as<unsigned>());
+  } else {
+    unsigned seed = time(NULL) + rank * 100;
+    cerr << "Random seed: " << seed << endl;
+    srand(seed);
+  }
+
+  TMatrix t = TMatrix::Zero();
+  if (rank == 0) {
+    t = TMatrix::Random() / 50.0;
+    for (unsigned i = 1; i < r_trg.size(); ++i) {
+      r_trg[i] = RVector::Random();
+      r_src[i] = RVector::Random();
+    }
+    if (conf.count("source_embeddings"))
+      LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
+    if (conf.count("target_embeddings"))
+      LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
+  }
+
+  // do optimization
+  TMatrix g = TMatrix::Zero();
+  vector<TMatrix> exp_src;
+  vector<double> z_src;
+  vector<double> flat_g, flat_t, rcv_grad;
+  Flatten(t, &flat_t);
+  bool converged = false;
+#if HAVE_MPI
+  mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
+  mpi::broadcast(world, r_trg, 0);
+  mpi::broadcast(world, r_src, 0);
+#endif
+  cerr << "rank=" << rank << ": " << r_trg[0][4] << endl;
+  for (int iter = 0; !converged && iter < ITERATIONS; ++iter) {
+    if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl;
+    Unflatten(flat_t, &t);
+    double likelihood = 0;
+    double denom = 0.0;
+    lc = 0;
+    flag = false;
+    g *= 0;
+    for (unsigned i = 0; i < srcs.size(); ++i) {
+      const vector<WordID>& src = srcs[i];
+      const vector<WordID>& trg = trgs[i];
+      ++lc;
+      if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; }
+      if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
+      denom += trg.size();
+
+      exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
+      z_src.clear(); z_src.resize(src.size(), 0.0);
+      Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
+      Array2D<double> z_refs(src.size(), trg.size(), 0.0);
+      for (unsigned j = 0; j < trg.size(); ++j)
+        trg_pos[trg[j]].insert(j);
+
+      for (unsigned i = 0; i < src.size(); ++i) {
+        const RVector& r_s = r_src[src[i]];
+        const RTVector pred = r_s.transpose() * t;
+        TMatrix& exp_m = exp_src[i];
+        double& z = z_src[i];
+        for (unsigned k = 0; k < vocab_e.size(); ++k) {
+          const WordID v_k = vocab_e[k];
+          const RVector& r_t = r_trg[v_k];
+          const double dot_prod = pred * r_t;
+          const double u = exp(dot_prod);
+          z += u;
+          const TMatrix v = r_s * r_t.transpose() * u;
+          exp_m += v;
+          set<unsigned>& ref_locs = trg_pos[v_k];
+          if (!ref_locs.empty()) {
+            for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) {
+              TMatrix& exp_ref_ij = exp_refs(i, *it);
+              double& z_ref_ij = z_refs(i, *it);
+              z_ref_ij += u;
+              exp_ref_ij += v;
+            }
+          }
+        }
+      }
+      for (unsigned j = 0; j < trg.size(); ++j)
+        trg_pos[trg[j]].clear();
+
+      // model expectations for a single target generation with
+      // uniform alignment prior
+      // TODO: when using a non-uniform alignment, m_exp will be
+      // a function of j (below)
+      double m_z = 0;
+      TMatrix m_exp = TMatrix::Zero();
+      for (unsigned i = 0; i < src.size(); ++i) {
+        m_exp += exp_src[i];
+        m_z += z_src[i];
+      }
+      m_exp /= m_z;
+
+      Array2D<bool> al(src.size(), trg.size(), false);
+      for (unsigned j = 0; j < trg.size(); ++j) {
+        double ref_z = 0;
+        TMatrix ref_exp = TMatrix::Zero();
+        int max_i = 0;
+        double max_s = -9999999;
+        for (unsigned i = 0; i < src.size(); ++i) {
+          ref_exp += exp_refs(i, j);
+          ref_z += z_refs(i, j);
+          if (log(z_refs(i, j)) > max_s) {
+            max_s = log(z_refs(i, j));
+            max_i = i;
+          }
+          // TODO handle alignment prob
+        }
+        if (ref_z <= 0) {
+          cerr << "TRG=" << TD::Convert(trg[j]) << endl;
+          cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl;
+          cerr << " REF_EXP=\n" << ref_exp << endl;
+          cerr << " M_EXP=\n" << m_exp << endl;
+          abort();
+        }
+        al(max_i, j) = true;
+        ref_exp /= ref_z;
+        g += m_exp - ref_exp;
+        likelihood += log(ref_z) - log(m_z);
+        if (SGD) {
+          t -= g * eta / num_examples;
+          g *= 0;
+        }
+      }
+
+      if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; }
+    }
+    if (flag && rank == 0) { cerr << endl; }
+
+    double obj = 0;
+    if (!SGD) {
+      Flatten(g, &flat_g);
+      obj = -likelihood;
+#if HAVE_MPI
+      rcv_grad.resize(flat_g.size(), 0.0);
+      mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0);
+      swap(flat_g, rcv_grad);
+      rcv_grad.clear();
+
+      double to = 0;
+      mpi::reduce(world, obj, to, plus<double>(), 0);
+      obj = to;
+      double tlh = 0;
+      mpi::reduce(world, likelihood, tlh, plus<double>(), 0);
+      likelihood = tlh;
+      double td = 0;
+      mpi::reduce(world, denom, td, plus<double>(), 0);
+      denom = td;
+#endif
+    }
+
+    if (rank == 0) {
+      double gn = 0;
+      for (unsigned i = 0; i < flat_g.size(); ++i)
+        gn += flat_g[i]*flat_g[i];
+      const double base2_likelihood = likelihood / log(2);
+      cerr << "  log_e likelihood: " << likelihood << endl;
+      cerr << "  log_2 likelihood: " << base2_likelihood << endl;
+      cerr << "     cross entropy: " << (-base2_likelihood / denom) << endl;
+      cerr << "        perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
+      cerr << "     gradient norm: " << sqrt(gn) << endl;
+      if (!SGD) {
+        if (has_l2) {
+          const double r = ApplyRegularization(reg_strength,
+                                               flat_t,
+                                               &flat_g);
+          obj += r;
+          cerr << "    regularization: " << r << endl;
+        }
+        lbfgs->Optimize(obj, flat_g, &flat_t);
+        converged = (lbfgs->HasConverged());
+      }
+    }
+#ifdef HAVE_MPI
+    mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
+    mpi::broadcast(world, converged, 0);
+#endif
+  }
+  if (rank == 0)
+    cerr << "TRANSLATION MATRIX:" << endl << t << endl;
+  return 0;
+}
+
+#endif
diff --git a/training/model1.cc b/training/model1.cc
index b9590ece..73104304 100644
--- a/training/model1.cc
+++ b/training/model1.cc
@@ -4,12 +4,12 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>

+#include "m.h"
 #include "lattice.h"
 #include "stringlib.h"
 #include "filelib.h"
 #include "ttables.h"
 #include "tdict.h"
-#include "em_utils.h"

 namespace po = boost::program_options;
 using namespace std;
@@ -20,7 +20,12 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("iterations,i",po::value<unsigned>()->default_value(5),"Number of iterations of EM training")
         ("beam_threshold,t",po::value<double>()->default_value(-4),"log_10 of beam threshold (-10000 to include everything, 0 max)")
         ("no_null_word,N","Do not generate from the null token")
+        ("write_alignments,A", "Write alignments instead of parameters")
+        ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal")
+        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)")
+        ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?")
         ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights")
+        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model")
         ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior")
         ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)");
   po::options_description clo("Command line options");
@@ -56,7 +61,14 @@ int main(int argc, char** argv) {
   const WordID kNULL = TD::Convert("<eps>");
   const bool add_viterbi = (conf.count("no_add_viterbi") == 0);
   const bool variational_bayes = (conf.count("variational_bayes") > 0);
+  const bool write_alignments = (conf.count("write_alignments") > 0);
+  const double diagonal_tension = conf["diagonal_tension"].as<double>();
+  const double prob_align_null = conf["prob_align_null"].as<double>();
+  string testset;
+  if (conf.count("testset")) testset = conf["testset"].as<string>();
+  const double prob_align_not_null = 1.0 - prob_align_null;
   const double alpha = conf["alpha"].as<double>();
+  const bool favor_diagonal = conf.count("favor_diagonal");
   if (variational_bayes && alpha <= 0.0) {
     cerr << "--alpha must be > 0\n";
     return 1;
@@ -64,6 +76,9 @@ int main(int argc, char** argv) {
   TTable tt;
   TTable::Word2Word2Double was_viterbi;
+  double tot_len_ratio = 0;
+  double mean_srclen_multiplier = 0;
+  vector<double> unnormed_a_i;
   for (int iter = 0; iter < ITERATIONS; ++iter) {
     const bool final_iteration = (iter == (ITERATIONS - 1));
     cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl;
@@ -74,13 +89,13 @@ int main(int argc, char** argv) {
     int lc = 0;
     bool flag = false;
     string line;
+    string ssrc, strg;
     while(true) {
       getline(in, line);
       if (!in) break;
       ++lc;
       if (lc % 1000 == 0) { cerr << '.'; flag = true; }
       if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
-      string ssrc, strg;
       ParseTranslatorInput(line, &ssrc, &strg);
       Lattice src, trg;
       LatticeTools::ConvertTextToLattice(ssrc, &src);
@@ -90,34 +105,60 @@ int main(int argc, char** argv) {
         assert(src.size() > 0);
         assert(trg.size() > 0);
       }
+      if (src.size() > unnormed_a_i.size())
+        unnormed_a_i.resize(src.size());
+      if (iter == 0)
+        tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size());
       denom += trg.size();
       vector<double> probs(src.size() + 1);
-      const double src_logprob = -log(src.size() + 1);
+      bool first_al = true;  // used for write_alignments
      for (int j = 0; j < trg.size(); ++j) {
        const WordID& f_j = trg[j][0].label;
        double sum = 0;
+       const double j_over_ts = double(j) / trg.size();
+       double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
        if (use_null) {
-         probs[0] = tt.prob(kNULL, f_j);
+         if (favor_diagonal) prob_a_i = prob_align_null;
+         probs[0] = tt.prob(kNULL, f_j) * prob_a_i;
          sum += probs[0];
        }
+       double az = 0;
+       if (favor_diagonal) {
+         for (int ta = 0; ta < src.size(); ++ta) {
+           unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
+           az += unnormed_a_i[ta];
+         }
+         az /= prob_align_not_null;
+       }
        for (int i = 1; i <= src.size(); ++i) {
-         probs[i] = tt.prob(src[i-1][0].label, f_j);
+         if (favor_diagonal)
+           prob_a_i = unnormed_a_i[i-1] / az;
+         probs[i] = tt.prob(src[i-1][0].label, f_j) * prob_a_i;
          sum += probs[i];
        }
        if (final_iteration) {
-         if (add_viterbi) {
+         if (add_viterbi || write_alignments) {
            WordID max_i = 0;
            double max_p = -1;
+           int max_index = -1;
            if (use_null) {
              max_i = kNULL;
+             max_index = 0;
              max_p = probs[0];
            }
            for (int i = 1; i <= src.size(); ++i) {
              if (probs[i] > max_p) {
+               max_index = i;
                max_p = probs[i];
                max_i = src[i-1][0].label;
              }
            }
+           if (write_alignments) {
+             if (max_index > 0) {
+               if (first_al) first_al = false; else cout << ' ';
+               cout << (max_index - 1) << "-" << j;
+             }
+           }
            was_viterbi[max_i][f_j] = 1.0;
          }
        } else {
@@ -126,14 +167,19 @@ int main(int argc, char** argv) {
          for (int i = 1; i <= src.size(); ++i)
            tt.Increment(src[i-1][0].label, f_j, probs[i] / sum);
        }
-       likelihood += log(sum) + src_logprob;
+       likelihood += log(sum);
      }
+     if (write_alignments && final_iteration) cout << endl;
    }
    // log(e) = 1.0
    double base2_likelihood = likelihood / log(2);
    if (flag) { cerr << endl; }
+   if (iter == 0) {
+     mean_srclen_multiplier = tot_len_ratio / lc;
+     cerr << "expected target length = source length * " << mean_srclen_multiplier << endl;
+   }
    cerr << "  log_e likelihood: " << likelihood << endl;
    cerr << "  log_2 likelihood: " << base2_likelihood << endl;
    cerr << "     cross entropy: " << (-base2_likelihood / denom) << endl;
@@ -145,6 +191,55 @@ int main(int argc, char** argv) {
       tt.Normalize();
     }
   }
+  if (testset.size()) {
+    ReadFile rf(testset);
+    istream& in = *rf.stream();
+    int lc = 0;
+    double tlp = 0;
+    string ssrc, strg, line;
+    while (getline(in, line)) {
+      ++lc;
+      ParseTranslatorInput(line, &ssrc, &strg);
+      Lattice src, trg;
+      LatticeTools::ConvertTextToLattice(ssrc, &src);
+      LatticeTools::ConvertTextToLattice(strg, &trg);
+      double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier);
+      if (src.size() > unnormed_a_i.size())
+        unnormed_a_i.resize(src.size());
+
+      // compute likelihood
+      for (int j = 0; j < trg.size(); ++j) {
+        const WordID& f_j = trg[j][0].label;
+        double sum = 0;
+        const double j_over_ts = double(j) / trg.size();
+        double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
+        if (use_null) {
+          if (favor_diagonal) prob_a_i = prob_align_null;
+          sum += tt.prob(kNULL, f_j) * prob_a_i;
+        }
+        double az = 0;
+        if (favor_diagonal) {
+          for (int ta = 0; ta < src.size(); ++ta) {
+            unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
+            az += unnormed_a_i[ta];
+          }
+          az /= prob_align_not_null;
+        }
+        for (int i = 1; i <= src.size(); ++i) {
+          if (favor_diagonal)
+            prob_a_i = unnormed_a_i[i-1] / az;
+          sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i;
+        }
+        log_prob += log(sum);
+      }
+      tlp += log_prob;
+      cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl;
+    }
+    cerr << "TOTAL LOG PROB " << tlp << endl;
+  }
+
+  if (write_alignments) return 0;
+
   for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) {
     const TTable::Word2Double& cpd = ei->second;
     const TTable::Word2Double& vit = was_viterbi[ei->first];
diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc
index 00746532..a9197208 100644
--- a/training/mpi_flex_optimize.cc
+++ b/training/mpi_flex_optimize.cc
@@ -205,7 +205,7 @@ int main(int argc, char** argv) {
   const int size = 1;
   const int rank = 0;
 #endif
-  if (size > 0) SetSilent(true);  // turn off verbose decoder output
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
   register_feature_functions();
   MT19937* rng = NULL;

@@ -272,6 +272,7 @@ int main(int argc, char** argv) {

   int iter = -1;
   bool converged = false;
+  vector<double> gg;
   while (!converged) {
 #ifdef HAVE_MPI
     mpi::timer timer;
@@ -343,7 +344,7 @@ int main(int argc, char** argv) {

       double obj = 0;
 #ifdef HAVE_MPI
-      // TODO obj
+      reduce(world, local_obj, obj, std::plus<double>(), 0);
       reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
 #else
       obj = local_obj;
@@ -354,13 +355,14 @@ int main(int argc, char** argv) {

       // g /= (size_per_proc * size);
       if (!o) o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
-      vector<double> gg(FD::NumFeats());
+      gg.clear();
+      gg.resize(FD::NumFeats());
       if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); }
       for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it)
         if (it->first) { gg[it->first] = it->second; }
       g.clear();
       double r = ApplyRegularizationTerms(regularization_strength,
-                                          time_series_strength * (iter == 0 ? 0.0 : 1.0),
+                                          time_series_strength, // * (iter == 0 ? 0.0 : 1.0),
                                           cur_weights,
                                           prev_weights,
                                           &gg);
@@ -375,10 +377,9 @@ int main(int argc, char** argv) {
       o->Optimize(obj, gg, &cur_weights);
     }
 #ifdef HAVE_MPI
-    // broadcast(world, x, 0);
+    broadcast(world, cur_weights, 0);
     broadcast(world, converged, 0);
     world.barrier();
-    if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
 #endif
   }
   prev_weights = cur_weights;
diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc
index d4c16a2f..f65b5440 100644
--- a/training/mr_em_adapted_reduce.cc
+++ b/training/mr_em_adapted_reduce.cc
@@ -10,7 +10,7 @@
 #include "fdict.h"
 #include "weights.h"
 #include "sparse_vector.h"
-#include "em_utils.h"
+#include "m.h"

 using namespace std;
 namespace po = boost::program_options;
@@ -63,11 +63,11 @@ void Maximize(const bool use_vb,
   assert(tot > 0.0);
   double ltot = log(tot);
   if (use_vb)
-    ltot = digamma(tot + total_event_types * alpha);
+    ltot = Md::digamma(tot + total_event_types * alpha);
   for (SparseVector<double>::const_iterator it = counts.begin();
        it != counts.end(); ++it) {
     if (use_vb) {
-      pc->set_value(it->first, NoZero(digamma(it->second + alpha) - ltot));
+      pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot));
     } else {
       pc->set_value(it->first, NoZero(log(it->second) - ltot));
     }
diff --git a/training/ttables.h b/training/ttables.h
index 50d85a68..bf3351d2 100644
--- a/training/ttables.h
+++ b/training/ttables.h
@@ -4,9 +4,9 @@
 #include <iostream>
 #include <tr1/unordered_map>

+#include "m.h"
 #include "wordid.h"
 #include "tdict.h"
-#include "em_utils.h"

 class TTable {
  public:
@@ -39,7 +39,7 @@ class TTable {
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
         tot += it->second + alpha;
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
-        it->second = exp(digamma(it->second + alpha) - digamma(tot));
+        it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot));
     }
     counts.clear();
   }
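
Note: the favor_diagonal / diagonal_tension logic added to model1.cc above replaces the uniform Model 1 alignment prior with a static one, p(a_j = i) proportional to exp(-|i/|f| - j/|e|| * T), where the non-null source positions are renormalized to share 1 - prob_align_null of the mass. Below is a minimal standalone sketch of that computation, extracted for clarity; the function name DiagonalPrior and the demo values in main are illustrative only, not part of this commit.

#include <cmath>
#include <cstdio>
#include <vector>

// Returns p[0..src_len], where p[0] is the null-alignment probability and
// p[i] (i >= 1) is the prior that target position j aligns to source word i-1.
std::vector<double> DiagonalPrior(int src_len, int j, int trg_len,
                                  double tension, double prob_align_null) {
  std::vector<double> p(src_len + 1);
  p[0] = prob_align_null;
  const double j_over_ts = double(j) / trg_len;
  double az = 0;  // normalizer, as in the patch
  for (int ta = 0; ta < src_len; ++ta) {
    p[ta + 1] = exp(-fabs(double(ta) / src_len - j_over_ts) * tension);
    az += p[ta + 1];
  }
  az /= (1.0 - prob_align_null);  // non-null mass sums to 1 - prob_align_null
  for (int ta = 0; ta < src_len; ++ta) p[ta + 1] /= az;
  return p;
}

int main() {
  // With a 5-word source and target position 2 of 5, mass peaks near i = 3
  // (source index 2), matching the diagonal; tension 4.0 and null prob 0.08
  // are the patch's defaults.
  std::vector<double> p = DiagonalPrior(5, 2, 5, 4.0, 0.08);
  for (size_t i = 0; i < p.size(); ++i) printf("p[%zu] = %.4f\n", i, p[i]);
  return 0;
}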