From 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Sun, 18 Nov 2012 13:35:42 -0500
Subject: major restructure of the training code

---
 training/Makefile.am                               |  100 +-
 training/add-model1-features-to-scfg.pl            |   93 --
 training/candidate_set.cc                          |  169 ---
 training/candidate_set.h                           |   60 -
 training/cllh_observer.cc                          |   52 -
 training/cllh_observer.h                           |   26 -
 training/collapse_weights.cc                       |  110 --
 training/crf/Makefile.am                           |   27 +
 training/crf/cllh_observer.cc                      |   52 +
 training/crf/cllh_observer.h                       |   26 +
 training/crf/mpi_batch_optimize.cc                 |  372 +++++
 training/crf/mpi_compute_cllh.cc                   |  134 ++
 training/crf/mpi_extract_features.cc               |  151 ++
 training/crf/mpi_extract_reachable.cc              |  163 +++
 training/crf/mpi_flex_optimize.cc                  |  386 ++++++
 training/crf/mpi_online_optimize.cc                |  374 +++++
 training/dep-reorder/conll2reordering-forest.pl    |   65 -
 training/dep-reorder/george.conll                  |    4 -
 training/dep-reorder/scripts/conll2simplecfg.pl    |   57 -
 training/dpmert/Makefile.am                        |   25 +
 training/dpmert/ces.cc                             |   90 ++
 training/dpmert/ces.h                              |   16 +
 training/dpmert/divide_refs.py                     |   15 +
 training/dpmert/dpmert.pl                          |  618 +++++++++
 training/dpmert/error_surface.cc                   |   42 +
 training/dpmert/error_surface.h                    |   24 +
 training/dpmert/line_mediator.pl                   |  116 ++
 training/dpmert/line_optimizer.cc                  |  114 ++
 training/dpmert/line_optimizer.h                   |   48 +
 training/dpmert/lo_test.cc                         |  229 +++
 training/dpmert/mert_geometry.cc                   |  185 +++
 training/dpmert/mert_geometry.h                    |   81 ++
 training/dpmert/mr_dpmert_generate_mapper_input.cc |   81 ++
 training/dpmert/mr_dpmert_map.cc                   |  112 ++
 training/dpmert/mr_dpmert_reduce.cc                |   77 ++
 training/dpmert/test_aer/README                    |    8 +
 training/dpmert/test_aer/cdec.ini                  |    3 +
 training/dpmert/test_aer/corpus.src                |    3 +
 training/dpmert/test_aer/grammar                   |   12 +
 training/dpmert/test_aer/ref.0                     |    3 +
 training/dpmert/test_aer/weights                   |   13 +
 training/dpmert/test_data/0.json.gz                |  Bin 0 -> 13709 bytes
 training/dpmert/test_data/1.json.gz                |  Bin 0 -> 204803 bytes
 training/dpmert/test_data/c2e.txt.0                |    2 +
 training/dpmert/test_data/c2e.txt.1                |    2 +
 training/dpmert/test_data/c2e.txt.2                |    2 +
 training/dpmert/test_data/c2e.txt.3                |    2 +
 training/dpmert/test_data/re.txt.0                 |    5 +
 training/dpmert/test_data/re.txt.1                 |    5 +
 training/dpmert/test_data/re.txt.2                 |    5 +
 training/dpmert/test_data/re.txt.3                 |    5 +
 training/dtrain/Makefile.am                        |    7 +
 training/dtrain/README.md                          |   48 +
 training/dtrain/dtrain.cc                          |  657 +++++++++
 training/dtrain/dtrain.h                           |   97 ++
 training/dtrain/hstreaming/avg.rb                  |   32 +
 training/dtrain/hstreaming/cdec.ini                |   22 +
 training/dtrain/hstreaming/dtrain.ini              |   15 +
 training/dtrain/hstreaming/dtrain.sh               |    9 +
 training/dtrain/hstreaming/hadoop-streaming-job.sh |   30 +
 training/dtrain/hstreaming/lplp.rb                 |  131 ++
 training/dtrain/hstreaming/red-test                |    9 +
 training/dtrain/kbestget.h                         |  152 ++
 training/dtrain/ksampler.h                         |   61 +
 training/dtrain/pairsampling.h                     |  149 ++
 training/dtrain/parallelize.rb                     |   79 ++
 training/dtrain/parallelize/test/cdec.ini          |   22 +
 training/dtrain/parallelize/test/dtrain.ini        |   15 +
 training/dtrain/parallelize/test/in                |   10 +
 training/dtrain/parallelize/test/refs              |   10 +
 training/dtrain/score.cc                           |  254 ++++
 training/dtrain/score.h                            |  212 +++
 training/dtrain/test/example/README                |    8 +
 training/dtrain/test/example/cdec.ini              |   25 +
 training/dtrain/test/example/dtrain.ini            |   22 +
 training/dtrain/test/example/expected-output       |   89 ++
 training/dtrain/test/parallelize/cdec.ini          |   22 +
 training/dtrain/test/parallelize/dtrain.ini        |   15 +
 training/dtrain/test/parallelize/in                |   10 +
 training/dtrain/test/parallelize/refs              |   10 +
 training/dtrain/test/toy/cdec.ini                  |    2 +
 training/dtrain/test/toy/dtrain.ini                |   12 +
 training/dtrain/test/toy/input                     |    2 +
 training/entropy.cc                                |   41 -
 training/entropy.h                                 |   22 -
 training/fast_align.cc                             |  281 ----
 training/feature_expectations.cc                   |  232 ----
 training/grammar_convert.cc                        |  348 -----
 training/lbfgs.h                                   | 1459 --------------------
 training/lbfgs_test.cc                             |  117 --
 training/lbl_model.cc                              |  421 ------
 training/minrisk/Makefile.am                       |    6 +
 training/minrisk/minrisk.pl                        |  540 ++++++++
 training/minrisk/minrisk_generate_input.pl         |   18 +
 training/minrisk/minrisk_optimize.cc               |  197 +++
 training/mira/Makefile.am                          |    6 +
 training/mira/kbest_mira.cc                        |  309 +++++
 training/mpi_batch_optimize.cc                     |  372 -----
 training/mpi_compute_cllh.cc                       |  134 --
 training/mpi_em_optimize.cc                        |  389 ------
 training/mpi_extract_features.cc                   |  151 --
 training/mpi_extract_reachable.cc                  |  163 ---
 training/mpi_flex_optimize.cc                      |  386 ------
 training/mpi_online_optimize.cc                    |  374 -----
 training/mr_em_adapted_reduce.cc                   |  173 ---
 training/mr_em_map_adapter.cc                      |  160 ---
 training/mr_optimize_reduce.cc                     |  231 ----
 training/mr_reduce_to_weights.cc                   |  109 --
 training/online_optimizer.cc                       |   16 -
 training/online_optimizer.h                        |  129 --
 training/optimize.cc                               |  102 --
 training/optimize.h                                |   92 --
 training/optimize_test.cc                          |  118 --
 training/pro/Makefile.am                           |   11 +
 training/pro/mr_pro_generate_mapper_input.pl       |   18 +
 training/pro/mr_pro_map.cc                         |  201 +++
 training/pro/mr_pro_reduce.cc                      |  286 ++++
 training/pro/pro.pl                                |  555 ++++++++
 training/rampion/Makefile.am                       |    6 +
 training/rampion/rampion.pl                        |  540 ++++++++
 training/rampion/rampion_cccp.cc                   |  168 +++
 training/rampion/rampion_generate_input.pl         |   18 +
 training/risk.cc                                   |   45 -
 training/risk.h                                    |   26 -
 training/ttables.cc                                |   31 -
 training/ttables.h                                 |  101 --
 training/utils/candidate_set.cc                    |  169 +++
 training/utils/candidate_set.h                     |   60 +
 training/utils/decode-and-evaluate.pl              |  246 ++++
 training/utils/entropy.cc                          |   41 +
 training/utils/entropy.h                           |   22 +
 training/utils/grammar_convert.cc                  |  348 +++++
 training/utils/lbfgs.h                             | 1459 ++++++++++++++++++++
 training/utils/lbfgs_test.cc                       |  117 ++
 training/utils/libcall.pl                          |   71 +
 training/utils/online_optimizer.cc                 |   16 +
 training/utils/online_optimizer.h                  |  129 ++
 training/utils/optimize.cc                         |  102 ++
 training/utils/optimize.h                          |   92 ++
 training/utils/optimize_test.cc                    |  118 ++
 training/utils/parallelize.pl                      |  423 ++++++
 training/utils/risk.cc                             |   45 +
 training/utils/risk.h                              |   26 +
 training/utils/sentclient.c                        |   76 +
 training/utils/sentserver.c                        |  515 +++++++
 training/utils/sentserver.h                        |    6 +
 146 files changed, 12836 insertions(+), 6949 deletions(-)
 delete mode 100755 training/add-model1-features-to-scfg.pl
 delete mode 100644 training/candidate_set.cc
 delete mode 100644 training/candidate_set.h
 delete mode 100644 training/cllh_observer.cc
 delete mode 100644 training/cllh_observer.h
 delete mode 100644 training/collapse_weights.cc
 create mode 100644 training/crf/Makefile.am
 create mode 100644 training/crf/cllh_observer.cc
 create mode 100644 training/crf/cllh_observer.h
 create mode 100644 training/crf/mpi_batch_optimize.cc
 create mode 100644 training/crf/mpi_compute_cllh.cc
 create mode 100644 training/crf/mpi_extract_features.cc
 create mode 100644 training/crf/mpi_extract_reachable.cc
 create mode 100644 training/crf/mpi_flex_optimize.cc
 create mode 100644 training/crf/mpi_online_optimize.cc
 delete mode 100755 training/dep-reorder/conll2reordering-forest.pl
 delete mode 100644 training/dep-reorder/george.conll
 delete mode 100755 training/dep-reorder/scripts/conll2simplecfg.pl
 create mode 100644 training/dpmert/Makefile.am
 create mode 100644 training/dpmert/ces.cc
 create mode 100644 training/dpmert/ces.h
 create mode 100755 training/dpmert/divide_refs.py
 create mode 100755 training/dpmert/dpmert.pl
 create mode 100644 training/dpmert/error_surface.cc
 create mode 100644 training/dpmert/error_surface.h
 create mode 100755 training/dpmert/line_mediator.pl
 create mode 100644 training/dpmert/line_optimizer.cc
 create mode 100644 training/dpmert/line_optimizer.h
 create mode 100644 training/dpmert/lo_test.cc
 create mode 100644 training/dpmert/mert_geometry.cc
 create mode 100644 training/dpmert/mert_geometry.h
 create mode 100644 training/dpmert/mr_dpmert_generate_mapper_input.cc
 create mode 100644 training/dpmert/mr_dpmert_map.cc
 create mode 100644 training/dpmert/mr_dpmert_reduce.cc
 create mode 100644 training/dpmert/test_aer/README
 create mode 100644 training/dpmert/test_aer/cdec.ini
 create mode 100644 training/dpmert/test_aer/corpus.src
 create mode 100644 training/dpmert/test_aer/grammar
 create mode 100644 training/dpmert/test_aer/ref.0
 create mode 100644 training/dpmert/test_aer/weights
 create mode 100644 training/dpmert/test_data/0.json.gz
 create mode 100644 training/dpmert/test_data/1.json.gz
 create mode 100644 training/dpmert/test_data/c2e.txt.0
 create mode 100644 training/dpmert/test_data/c2e.txt.1
 create mode 100644 training/dpmert/test_data/c2e.txt.2
 create mode 100644 training/dpmert/test_data/c2e.txt.3
 create mode 100644 training/dpmert/test_data/re.txt.0
 create mode 100644 training/dpmert/test_data/re.txt.1
 create mode 100644 training/dpmert/test_data/re.txt.2
 create mode 100644 training/dpmert/test_data/re.txt.3
 create mode 100644 training/dtrain/Makefile.am
 create mode 100644 training/dtrain/README.md
 create mode 100644 training/dtrain/dtrain.cc
 create mode 100644 training/dtrain/dtrain.h
 create mode 100755 training/dtrain/hstreaming/avg.rb
 create mode 100644 training/dtrain/hstreaming/cdec.ini
 create mode 100644 training/dtrain/hstreaming/dtrain.ini
 create mode 100755 training/dtrain/hstreaming/dtrain.sh
 create mode 100755 training/dtrain/hstreaming/hadoop-streaming-job.sh
 create mode 100755 training/dtrain/hstreaming/lplp.rb
 create mode 100644 training/dtrain/hstreaming/red-test
 create mode 100644 training/dtrain/kbestget.h
 create mode 100644 training/dtrain/ksampler.h
 create mode 100644 training/dtrain/pairsampling.h
 create mode 100755 training/dtrain/parallelize.rb
 create mode 100644 training/dtrain/parallelize/test/cdec.ini
 create mode 100644 training/dtrain/parallelize/test/dtrain.ini
 create mode 100644 training/dtrain/parallelize/test/in
 create mode 100644 training/dtrain/parallelize/test/refs
 create mode 100644 training/dtrain/score.cc
 create mode 100644 training/dtrain/score.h
 create mode 100644 training/dtrain/test/example/README
 create mode 100644 training/dtrain/test/example/cdec.ini
 create mode 100644 training/dtrain/test/example/dtrain.ini
 create mode 100644 training/dtrain/test/example/expected-output
 create mode 100644 training/dtrain/test/parallelize/cdec.ini
 create mode 100644 training/dtrain/test/parallelize/dtrain.ini
 create mode 100644 training/dtrain/test/parallelize/in
 create mode 100644 training/dtrain/test/parallelize/refs
 create mode 100644 training/dtrain/test/toy/cdec.ini
 create mode 100644 training/dtrain/test/toy/dtrain.ini
 create mode 100644 training/dtrain/test/toy/input
 delete mode 100644 training/entropy.cc
 delete mode 100644 training/entropy.h
 delete mode 100644 training/fast_align.cc
 delete mode 100644 training/feature_expectations.cc
 delete mode 100644 training/grammar_convert.cc
 delete mode 100644 training/lbfgs.h
 delete mode 100644 training/lbfgs_test.cc
 delete mode 100644 training/lbl_model.cc
 create mode 100644 training/minrisk/Makefile.am
 create mode 100755 training/minrisk/minrisk.pl
 create mode 100755 training/minrisk/minrisk_generate_input.pl
 create mode 100644 training/minrisk/minrisk_optimize.cc
 create mode 100644 training/mira/Makefile.am
 create mode 100644 training/mira/kbest_mira.cc
 delete mode 100644 training/mpi_batch_optimize.cc
 delete mode 100644 training/mpi_compute_cllh.cc
 delete mode 100644 training/mpi_em_optimize.cc
 delete mode 100644 training/mpi_extract_features.cc
 delete mode 100644 training/mpi_extract_reachable.cc
 delete mode 100644 training/mpi_flex_optimize.cc
 delete mode 100644 training/mpi_online_optimize.cc
 delete mode 100644 training/mr_em_adapted_reduce.cc
 delete mode 100644 training/mr_em_map_adapter.cc
 delete mode 100644 training/mr_optimize_reduce.cc
 delete mode 100644 training/mr_reduce_to_weights.cc
 delete mode 100644 training/online_optimizer.cc
 delete mode 100644 training/online_optimizer.h
 delete mode 100644 training/optimize.cc
 delete mode 100644 training/optimize.h
 delete mode 100644 training/optimize_test.cc
 create mode 100644 training/pro/Makefile.am
 create mode 100755 training/pro/mr_pro_generate_mapper_input.pl
 create mode 100644 training/pro/mr_pro_map.cc
 create mode 100644 training/pro/mr_pro_reduce.cc
 create mode 100755 training/pro/pro.pl
 create mode 100644 training/rampion/Makefile.am
 create mode 100755 training/rampion/rampion.pl
 create mode 100644 training/rampion/rampion_cccp.cc
 create mode 100755 training/rampion/rampion_generate_input.pl
 delete mode 100644 training/risk.cc
 delete mode 100644 training/risk.h
 delete mode 100644 training/ttables.cc
 delete mode 100644 training/ttables.h
 create mode 100644 training/utils/candidate_set.cc
 create mode 100644 training/utils/candidate_set.h
 create mode 100755 training/utils/decode-and-evaluate.pl
 create mode 100644 training/utils/entropy.cc
 create mode 100644 training/utils/entropy.h
 create mode 100644 training/utils/grammar_convert.cc
 create mode 100644 training/utils/lbfgs.h
 create mode 100644 training/utils/lbfgs_test.cc
 create mode 100644 training/utils/libcall.pl
 create mode 100644 training/utils/online_optimizer.cc
 create mode 100644 training/utils/online_optimizer.h
 create mode 100644 training/utils/optimize.cc
 create mode 100644 training/utils/optimize.h
 create mode 100644 training/utils/optimize_test.cc
 create mode 100755 training/utils/parallelize.pl
 create mode 100644 training/utils/risk.cc
 create mode 100644 training/utils/risk.h
 create mode 100644 training/utils/sentclient.c
 create mode 100644 training/utils/sentserver.c
 create mode 100644 training/utils/sentserver.h

(limited to 'training')

diff --git a/training/Makefile.am b/training/Makefile.am
index f9c25391..e95e045f 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,91 +1,11 @@
-bin_PROGRAMS = \
-  fast_align \
-  lbl_model \
-  test_ngram \
-  mr_em_map_adapter \
-  mr_em_adapted_reduce \
-  mr_reduce_to_weights \
-  mr_optimize_reduce \
-  grammar_convert \
-  plftools \
-  collapse_weights \
-  mpi_extract_reachable \
-  mpi_extract_features \
-  mpi_online_optimize \
-  mpi_flex_optimize \
-  mpi_batch_optimize \
-  mpi_compute_cllh \
-  augment_grammar
+SUBDIRS = \
+  liblbfgs \
+  utils \
+  crf \
+  minrisk \
+  dpmert \
+  pro \
+  dtrain \
+  mira \
+  rampion
 
-noinst_PROGRAMS = \
-  lbfgs_test \
-  optimize_test
-
-TESTS = lbfgs_test optimize_test
-
-noinst_LIBRARIES = libtraining.a
-libtraining_a_SOURCES = \
-  candidate_set.cc \
-  entropy.cc \
-  optimize.cc \
-  online_optimizer.cc \
-  risk.cc
-
-mpi_online_optimize_SOURCES = mpi_online_optimize.cc
-mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
-mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
-mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_features_SOURCES = mpi_extract_features.cc
-mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc
-mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc
-mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-augment_grammar_SOURCES = augment_grammar.cc
-augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-test_ngram_SOURCES = test_ngram.cc
-test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-fast_align_SOURCES = fast_align.cc ttables.cc
-fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-lbl_model_SOURCES = lbl_model.cc
-lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-optimize_test_SOURCES = optimize_test.cc
-optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc
-mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
-mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
-mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
-mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm
diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl
deleted file mode 100755
index a0074317..00000000
--- a/training/add-model1-features-to-scfg.pl
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/perl -w
-
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5
-
-die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n  (use trianing/model1 to extract the model files)\n" unless scalar @ARGV == 2;
-
-my $fm1 = shift @ARGV;
-die unless $fm1;
-my $frm1 = shift @ARGV;
-die unless $frm1;
-open M1,"<$fm1" or die;
-open RM1,"<$frm1" or die;
-print STDERR "Loading Model 1 probs from $fm1...\n";
-my %m1;
-while(<M1>) {
-  chomp;
-  my ($f, $e, $lp) = split /\s+/;
-  $m1{$e}->{$f} = exp($lp);
-}
-close M1;
-
-print STDERR "Loading Inverse Model 1 probs from $frm1...\n";
-my %rm1;
-while(<RM1>) {
-  chomp;
-  my ($e, $f, $lp) = split /\s+/;
-  $rm1{$f}->{$e} = exp($lp);
-}
-close RM1;
-
-my @label = qw( EGivenF LexFGivenE LexEGivenF );
-while(<>) {
-  chomp;
-  my ($l, $f, $e, $sscores, $al) = split / \|\|\| /;
-  my @scores = split /\s+/, $sscores;
-  unless ($sscores =~ /=/) {
-    for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; }
-  }
-  push @scores, "RuleCount=1";
-  my @fs = split /\s+/, $f;
-  my @es = split /\s+/, $e;
-  my $flen = scalar @fs;
-  my $elen = scalar @es;
-  my $pgen = 0;
-  my $nongen = 0;
-  for (my $i =0; $i < $flen; $i++) {
-    my $ftot = 0;
-    next if ($fs[$i] =~ /\[X/);
-    my $cr = $rm1{$fs[$i]};
-    for (my $j=0; $j <= $elen; $j++) {
-      my $ej = '<eps>';
-      if ($j < $elen) { $ej = $es[$j]; }
-      my $p = $cr->{$ej};
-      if (defined $p) { $ftot += $p; }
-    }
-    if ($ftot == 0) { $nongen = 1; last; }
-    $pgen += log($ftot) - log($elen);
-  }
-  my $bad = 0;
-  my $good = 0;
-  unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; }
-
-  $nongen = 0;
-  $pgen = 0;
-  for (my $i =0; $i < $elen; $i++) {
-    my $etot = 0;
-    next if ($es[$i] =~ /\[X/);
-    my $cr = $m1{$es[$i]};
-#    print STDERR "$es[$i]\n";
-    for (my $j=0; $j <= $flen; $j++) {
-      my $fj = '<eps>';
-      if ($j < $flen) { $fj = $fs[$j]; }
-      my $p = $cr->{$fj};
-#      print STDERR "  $fs[$j] : $p\n";
-      if (defined $p) { $etot += $p; }
-    }
-    if ($etot == 0) { $nongen = 1; last; }
-    $pgen += log($etot) - log($flen);
-  }
-  unless ($nongen) {
-    push @scores, "FGood=1";
-    if ($good) { push @scores, "BothGood=1"; } else { push @scores, "SusDel=1"; }
-  } else {
-    push @scores, "FBad=1";
-    if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; }
-  }
-  print "$l ||| $f ||| $e ||| @scores";
-  if (defined $al) { print " ||| $al\n"; } else { print "\n"; }
-}
-
diff --git a/training/candidate_set.cc b/training/candidate_set.cc
deleted file mode 100644
index 087efec3..00000000
--- a/training/candidate_set.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-#include "candidate_set.h"
-
-#include <tr1/unordered_set>
-
-#include <boost/functional/hash.hpp>
-
-#include "verbose.h"
-#include "ns.h"
-#include "filelib.h"
-#include "wordid.h"
-#include "tdict.h"
-#include "hg.h"
-#include "kbest.h"
-#include "viterbi.h"
-
-using namespace std;
-
-namespace training {
-
-struct ApproxVectorHasher {
-  static const size_t MASK = 0xFFFFFFFFull;
-  union UType {
-    double f;   // leave as double
-    size_t i;
-  };
-  static inline double round(const double x) {
-    UType t;
-    t.f = x;
-    size_t r = t.i & MASK;
-    if ((r << 1) > MASK)
-      t.i += MASK - r + 1;
-    else
-      t.i &= (1ull - MASK);
-    return t.f;
-  }
-  size_t operator()(const SparseVector<double>& x) const {
-    size_t h = 0x573915839;
-    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
-      UType t;
-      t.f = it->second;
-      if (t.f) {
-        size_t z = (t.i >> 32);
-        boost::hash_combine(h, it->first);
-        boost::hash_combine(h, z);
-      }
-    }
-    return h;
-  }
-};
-
-struct ApproxVectorEquals {
-  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
-    SparseVector<double>::const_iterator bit = b.begin();
-    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
-      if (bit == b.end() ||
-          ait->first != bit->first ||
-          ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
-        return false;
-      ++bit;
-    }
-    if (bit != b.end()) return false;
-    return true;
-  }
-};
-
-struct CandidateCompare {
-  bool operator()(const Candidate& a, const Candidate& b) const {
-    ApproxVectorEquals eq;
-    return (a.ewords == b.ewords && eq(a.fmap,b.fmap));
-  }
-};
-
-struct CandidateHasher {
-  size_t operator()(const Candidate& x) const {
-    boost::hash<vector<WordID> > hhasher;
-    ApproxVectorHasher vhasher;
-    size_t ha = hhasher(x.ewords);
-    boost::hash_combine(ha, vhasher(x.fmap));
-    return ha;
-  }
-};
-
-static void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
-  SparseVector<double>& x = *out;
-  size_t last_start = cur;
-  size_t last_comma = string::npos;
-  while(cur <= line.size()) {
-    if (line[cur] == ' ' || cur == line.size()) {
-      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
-        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
-        exit(1);
-      }
-      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
-      if (cur < line.size()) line[cur] = 0;
-      const double val = strtod(&line[last_comma + 1], NULL);
-      x.set_value(fid, val);
-
-      last_comma = string::npos;
-      last_start = cur+1;
-    } else {
-      if (line[cur] == '=')
-        last_comma = cur;
-    }
-    ++cur;
-  }
-}
-
-void CandidateSet::WriteToFile(const string& file) const {
-  WriteFile wf(file);
-  ostream& out = *wf.stream();
-  out.precision(10);
-  string ss;
-  for (unsigned i = 0; i < cs.size(); ++i) {
-    out << TD::GetString(cs[i].ewords) << endl;
-    out << cs[i].fmap << endl;
-    cs[i].eval_feats.Encode(&ss);
-    out << ss << endl;
-  }
-}
-
-void CandidateSet::ReadFromFile(const string& file) {
-  if(!SILENT) cerr << "Reading candidates from " << file << endl;
-  ReadFile rf(file);
-  istream& in = *rf.stream();
-  string cand;
-  string feats;
-  string ss;
-  while(getline(in, cand)) {
-    getline(in, feats);
-    getline(in, ss);
-    assert(in);
-    cs.push_back(Candidate());
-    TD::ConvertSentence(cand, &cs.back().ewords);
-    ParseSparseVector(feats, 0, &cs.back().fmap);
-    cs.back().eval_feats = SufficientStats(ss);
-  }
-  if(!SILENT) cerr << "  read " << cs.size() << " candidates\n";
-}
-
-void CandidateSet::Dedup() {
-  if(!SILENT) cerr << "Dedup in=" << cs.size();
-  tr1::unordered_set<Candidate, CandidateHasher, CandidateCompare> u;
-  while(cs.size() > 0) {
-    u.insert(cs.back());
-    cs.pop_back();
-  }
-  tr1::unordered_set<Candidate, CandidateHasher, CandidateCompare>::iterator it = u.begin();
-  while (it != u.end()) {
-    cs.push_back(*it);
-    it = u.erase(it);
-  }
-  if(!SILENT) cerr << "  out=" << cs.size() << endl;
-}
-
-void CandidateSet::AddKBestCandidates(const Hypergraph& hg, size_t kbest_size, const SegmentEvaluator* scorer) {
-  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
-  for (unsigned i = 0; i < kbest_size; ++i) {
-    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-      kbest.LazyKthBest(hg.nodes_.size() - 1, i);
-    if (!d) break;
-    cs.push_back(Candidate(d->yield, d->feature_values));
-    if (scorer)
-      scorer->Evaluate(d->yield, &cs.back().eval_feats);
-  }
-  Dedup();
-}
-
-}
diff --git a/training/candidate_set.h b/training/candidate_set.h
deleted file mode 100644
index 9d326ed0..00000000
--- a/training/candidate_set.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _CANDIDATE_SET_H_
-#define _CANDIDATE_SET_H_
-
-#include <vector>
-#include <algorithm>
-
-#include "ns.h"
-#include "wordid.h"
-#include "sparse_vector.h"
-
-class Hypergraph;
-
-namespace training {
-
-struct Candidate {
-  Candidate() {}
-  Candidate(const std::vector<WordID>& e, const SparseVector<double>& fm) :
-      ewords(e),
-      fmap(fm) {}
-  Candidate(const std::vector<WordID>& e,
-            const SparseVector<double>& fm,
-            const SegmentEvaluator& se) :
-      ewords(e),
-      fmap(fm) {
-    se.Evaluate(ewords, &eval_feats);
-  }
-
-  void swap(Candidate& other) {
-    eval_feats.swap(other.eval_feats);
-    ewords.swap(other.ewords);
-    fmap.swap(other.fmap);
-  }
-
-  std::vector<WordID> ewords;
-  SparseVector<double> fmap;
-  SufficientStats eval_feats;
-};
-
-// represents some kind of collection of translation candidates, e.g.
-// aggregated k-best lists, sample lists, etc.
-class CandidateSet {
- public:
-  CandidateSet() {}
-  inline size_t size() const { return cs.size(); }
-  const Candidate& operator[](size_t i) const { return cs[i]; }
-
-  void ReadFromFile(const std::string& file);
-  void WriteToFile(const std::string& file) const;
-  void AddKBestCandidates(const Hypergraph& hg, size_t kbest_size, const SegmentEvaluator* scorer = NULL);
-  // TODO add code to do unique k-best
-  // TODO add code to draw k samples
-
- private:
-  void Dedup();
-  std::vector<Candidate> cs;
-};
-
-}
-
-#endif
diff --git a/training/cllh_observer.cc b/training/cllh_observer.cc
deleted file mode 100644
index 4ec2fa65..00000000
--- a/training/cllh_observer.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "cllh_observer.h"
-
-#include <cmath>
-#include <cassert>
-
-#include "inside_outside.h"
-#include "hg.h"
-#include "sentence_metadata.h"
-
-using namespace std;
-
-static const double kMINUS_EPSILON = -1e-6;
-
-ConditionalLikelihoodObserver::~ConditionalLikelihoodObserver() {}
-
-void ConditionalLikelihoodObserver::NotifyDecodingStart(const SentenceMetadata&) {
-  cur_obj = 0;
-  state = 1;
-}
-
-void ConditionalLikelihoodObserver::NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
-  assert(state == 1);
-  state = 2;
-  SparseVector<prob_t> cur_model_exp;
-  const prob_t z = InsideOutside<prob_t,
-                                 EdgeProb,
-                                 SparseVector<prob_t>,
-                                 EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
-  cur_obj = log(z);
-}
-
-void ConditionalLikelihoodObserver::NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-  assert(state == 2);
-  state = 3;
-  SparseVector<prob_t> ref_exp;
-  const prob_t ref_z = InsideOutside<prob_t,
-                                     EdgeProb,
-                                     SparseVector<prob_t>,
-                                     EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
-
-  double log_ref_z = log(ref_z);
-
-  // rounding errors means that <0 is too strict
-  if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
-    cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
-    exit(1);
-  }
-  assert(!std::isnan(log_ref_z));
-  acc_obj += (cur_obj - log_ref_z);
-  trg_words += smeta.GetReference().size();
-}
-
diff --git a/training/cllh_observer.h b/training/cllh_observer.h
deleted file mode 100644
index 0de47331..00000000
--- a/training/cllh_observer.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _CLLH_OBSERVER_H_
-#define _CLLH_OBSERVER_H_
-
-#include "decoder.h"
-
-struct ConditionalLikelihoodObserver : public DecoderObserver {
-
-  ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {}
-  ~ConditionalLikelihoodObserver();
-
-  void Reset() {
-    acc_obj = 0;
-    trg_words = 0;
-  }
- 
-  virtual void NotifyDecodingStart(const SentenceMetadata&);
-  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg);
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg);
-
-  unsigned trg_words;
-  double acc_obj;
-  double cur_obj;
-  int state;
-};
-
-#endif
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
deleted file mode 100644
index c03eb031..00000000
--- a/training/collapse_weights.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-char const* NOTES =
-  "ZF_and_E means unnormalized scaled features.\n"
-  "For grammars with one nonterminal: F_and_E is joint,\n"
-  "F_given_E and E_given_F are conditional.\n"
-  "TODO: group rules by root nonterminal and then normalize.\n";
-
-
-#include <iostream>
-#include <fstream>
-#include <tr1/unordered_map>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "prob.h"
-#include "filelib.h"
-#include "trule.h"
-#include "weights.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("grammar,g", po::value<string>(), "Grammar file")
-        ("weights,w", po::value<string>(), "Weights file")
-    ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)")
-    ;
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config,c", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    const string cfg = (*conf)["config"].as<string>();
-    cerr << "Configuration file: " << cfg << endl;
-    ifstream config(cfg.c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) {
-    cerr << dcmdline_options << endl;
-    cerr << NOTES << endl;
-    exit(1);
-  }
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string wfile = conf["weights"].as<string>();
-  const string gfile = conf["grammar"].as<string>();
-  vector<weight_t> w;
-  Weights::InitFromFile(wfile, &w);
-  MarginalMap e_tots;
-  MarginalMap f_tots;
-  prob_t tot;
-  {
-    ReadFile rf(gfile);
-    assert(*rf.stream());
-    istream& in = *rf.stream();
-    cerr << "Computing marginals...\n";
-    int lc = 0;
-    while(in) {
-      string line;
-      getline(in, line);
-      ++lc;
-      if (line.empty()) continue;
-      TRule tr(line, true);
-      if (tr.GetFeatureValues().empty())
-        cerr << "Line " << lc << ": empty features - may introduce bias\n";
-      prob_t prob;
-      prob.logeq(tr.GetFeatureValues().dot(w));
-      e_tots[tr.e_] += prob;
-      f_tots[tr.f_] += prob;
-      tot += prob;
-    }
-  }
-  bool normalized = (fabs(log(tot)) < 0.001);
-  cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl;
-  ReadFile rf(gfile);
-  istream&in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    TRule tr(line, true);
-    const double lp = tr.GetFeatureValues().dot(w);
-    if (std::isinf(lp)) { continue; }
-    tr.scores_.clear();
-
-    cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot);
-    if (!normalized || conf.count("unnormalized")) {
-      cout << ";ZF_and_E=" << lp;
-    }
-    cout << ";F_given_E=" << lp - log(e_tots[tr.e_])
-         << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl;
-  }
-  return 0;
-}
-
diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am
new file mode 100644
index 00000000..d203df25
--- /dev/null
+++ b/training/crf/Makefile.am
@@ -0,0 +1,27 @@
+bin_PROGRAMS = \
+  mpi_batch_optimize \
+  mpi_compute_cllh \
+  mpi_extract_features \
+  mpi_extract_reachable \
+  mpi_flex_optimize \
+  mpi_online_optimize
+
+mpi_online_optimize_SOURCES = mpi_online_optimize.cc
+mpi_online_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
+mpi_flex_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
+mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_extract_features_SOURCES = mpi_extract_features.cc
+mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc
+mpi_batch_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc
+mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/crf/cllh_observer.cc b/training/crf/cllh_observer.cc
new file mode 100644
index 00000000..4ec2fa65
--- /dev/null
+++ b/training/crf/cllh_observer.cc
@@ -0,0 +1,52 @@
+#include "cllh_observer.h"
+
+#include <cmath>
+#include <cassert>
+
+#include "inside_outside.h"
+#include "hg.h"
+#include "sentence_metadata.h"
+
+using namespace std;
+
+static const double kMINUS_EPSILON = -1e-6;
+
+ConditionalLikelihoodObserver::~ConditionalLikelihoodObserver() {}  // empty destructor, defined out of line
+
+void ConditionalLikelihoodObserver::NotifyDecodingStart(const SentenceMetadata&) {  // per-sentence reset
+  cur_obj = 0;  // overwritten later by NotifyTranslationForest
+  state = 1;  // 1 = decoding started; NotifyTranslationForest asserts on this value
+}
+
+void ConditionalLikelihoodObserver::NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {  // stores log(z) for *hg in cur_obj
+  assert(state == 1);  // must come right after NotifyDecodingStart
+  state = 2;
+  SparseVector<prob_t> cur_model_exp;  // passed by pointer to InsideOutside; not read afterwards in this function
+  const prob_t z = InsideOutside<prob_t,
+                                 EdgeProb,
+                                 SparseVector<prob_t>,
+                                 EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+  cur_obj = log(z);
+}
+
+void ConditionalLikelihoodObserver::NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {  // adds (cur_obj - log ref_z) to acc_obj and the reference length to trg_words
+  assert(state == 2);  // must come after NotifyTranslationForest
+  state = 3;
+  SparseVector<prob_t> ref_exp;  // passed by pointer to InsideOutside; not read afterwards in this function
+  const prob_t ref_z = InsideOutside<prob_t,
+                                     EdgeProb,
+                                     SparseVector<prob_t>,
+                                     EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
+
+  double log_ref_z = log(ref_z);
+
+  // rounding errors means that <0 is too strict
+  if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {  // tolerate differences down to kMINUS_EPSILON (-1e-6)
+    cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
+    exit(1);  // terminates the whole process
+  }
+  assert(!std::isnan(log_ref_z));  // a NaN fails the '<' test above, so it is caught here (assert-enabled builds only)
+  acc_obj += (cur_obj - log_ref_z);
+  trg_words += smeta.GetReference().size();
+}
+
diff --git a/training/crf/cllh_observer.h b/training/crf/cllh_observer.h
new file mode 100644
index 00000000..0de47331
--- /dev/null
+++ b/training/crf/cllh_observer.h
@@ -0,0 +1,26 @@
+#ifndef _CLLH_OBSERVER_H_
+#define _CLLH_OBSERVER_H_
+
+#include "decoder.h"
+
+struct ConditionalLikelihoodObserver : public DecoderObserver {
+  // Decoder observer with running totals (acc_obj, trg_words) that are updated in NotifyAlignmentForest.
+  ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj(), state() {}  // state is zero-initialized too, so the asserts never read an indeterminate value
+  ~ConditionalLikelihoodObserver();
+
+  void Reset() {  // clears the running totals only; cur_obj and state are set again in NotifyDecodingStart
+    acc_obj = 0;
+    trg_words = 0;
+  }
+
+  virtual void NotifyDecodingStart(const SentenceMetadata&);
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg);
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg);
+
+  unsigned trg_words;  // sum of smeta.GetReference().size() over processed sentences
+  double acc_obj;      // running sum of (cur_obj - log ref_z)
+  double cur_obj;      // log(z) stored by the latest NotifyTranslationForest call
+  int state;           // callback progress: 1 start, 2 translation forest seen, 3 alignment forest seen
+};
+
+#endif
diff --git a/training/crf/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc
new file mode 100644
index 00000000..2eff07e4
--- /dev/null
+++ b/training/crf/mpi_batch_optimize.cc
@@ -0,0 +1,372 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "cllh_observer.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "optimize.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses options into *conf; prints usage to cerr and returns false on --help or a missing required option
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("input_weights,w",po::value<string>(),"Input feature weights file")
+        ("training_data,t",po::value<string>(),"Training data")
+        ("test_data,T",po::value<string>(),"(optional) test data")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+        ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
+        ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
+	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
+        ("gaussian_prior,p","Use a Gaussian prior on the weights")
+        ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior")
+        ("means,u", po::value<string>(), "(optional) file containing the means for Gaussian prior");
+  po::options_description clo("Command line options");  // only added to dcmdline_options below, i.e. not accepted from the config file
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // the command line is stored before any config file
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): a failed open is not checked here
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {  // required: input_weights, training_data, decoder_config
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+  // Round-robin sharding: the line with 0-based index i is appended to *c
+  // only when i % size == rank (rank r keeps lines r, r+size, r+2*size, ...).
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  // A failed getline() (end of input) ends the loop, so nothing is stored
+  // for the failed read.
+  for (int lc = 0; getline(in, line); ++lc) {
+    if (lc % size == rank) c->push_back(line);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {  // accumulates the objective and gradient over the sentences it observes
+  void Reset() {  // clears all running totals; members have no other initializer, so call this before first use
+    acc_grad.clear();
+    acc_obj = 0;
+    total_complete = 0;
+    trg_words = 0;
+  } 
+
+  void SetLocalGradientAndObjective(vector<double>* g, double* o) const {  // copies acc_obj into *o and the entries present in acc_grad into *g
+    *o = acc_obj;
+    for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
+      (*g)[it->first] = it->second.as_float();  // entries absent from acc_grad are left untouched; *g must already be large enough
+  }
+
+  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {  // per-sentence reset
+    cur_model_exp.clear();
+    cur_obj = 0;
+    state = 1;  // 1 = decoding started
+  }
+
+  // compute model expectations, denominator of objective
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 1);  // must follow NotifyDecodingStart
+    state = 2;
+    const prob_t z = InsideOutside<prob_t,
+                                   EdgeProb,
+                                   SparseVector<prob_t>,
+                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+    cur_obj = log(z);
+    cur_model_exp /= z;  // normalize by z
+  }
+
+  // compute "empirical" expectations, numerator of objective
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 2);  // must follow NotifyTranslationForest
+    state = 3;
+    SparseVector<prob_t> ref_exp;
+    const prob_t ref_z = InsideOutside<prob_t,
+                                       EdgeProb,
+                                       SparseVector<prob_t>,
+                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
+    ref_exp /= ref_z;  // normalize by ref_z
+
+    double log_ref_z;
+#if 0
+    if (crf_uniform_empirical) {
+      log_ref_z = ref_exp.dot(feature_weights);
+    } else {
+      log_ref_z = log(ref_z);
+    }
+#else
+    log_ref_z = log(ref_z);  // the only branch that is compiled
+#endif
+
+    // rounding errors means that <0 is too strict
+    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
+      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
+      exit(1);  // terminates the whole process
+    }
+    assert(!std::isnan(log_ref_z));
+    ref_exp -= cur_model_exp;  // ref_exp now holds (reference - model) expectations
+    acc_grad -= ref_exp;  // so acc_grad accumulates (model - reference) expectations
+    acc_obj += (cur_obj - log_ref_z);
+    trg_words += smeta.GetReference().size();
+  }
+
+  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+    if (state == 3) {  // counted only when the alignment-forest callback ran for this sentence
+      ++total_complete;
+    } else {
+    }
+  }
+
+  int total_complete;  // number of sentences that reached state 3
+  SparseVector<prob_t> cur_model_exp;  // current sentence: InsideOutside output divided by z
+  SparseVector<prob_t> acc_grad;  // running gradient
+  double acc_obj;  // running sum of (cur_obj - log ref_z)
+  double cur_obj;  // log(z) of the current sentence's translation forest
+  unsigned trg_words;  // running sum of reference lengths
+  int state;  // callback progress: 1 start, 2 translation forest seen, 3 alignment forest seen
+};
+
+void ReadConfig(const string& ini, vector<string>* out) {
+  // Append every line of the file `ini` to *out, in file order.
+  ReadFile rf(ini);
+  istream& in = *rf.stream();
+  string line;
+  // A failed getline() (end of input) ends the loop; nothing is appended for it.
+  while (getline(in, line)) {
+    out->push_back(line);
+  }
+}
+
+void StoreConfig(const vector<string>& cfg, istringstream* o) {
+  ostringstream joined;  // every entry of cfg followed by a newline
+  for (vector<string>::const_iterator it = cfg.begin(); it != cfg.end(); ++it) joined << *it << endl;
+  o->str(joined.str());  // replace the contents of *o with the joined text
+}
+
+template <typename T>  // element-wise sum functor for two equal-length vector<T>
+struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> >  {
+  vector<T> operator()(const vector<T>& a, const vector<T>& b) const {  // operands use the element type T (previously hard-coded to int)
+    assert(a.size() == b.size());  // operands must have the same length
+    vector<T> v(a.size());
+    transform(a.begin(), a.end(), b.begin(), v.begin(), plus<T>());
+    return v;
+  }
+};
+
+int main(int argc, char** argv) {  // each rank decodes its shard of the corpus; rank 0 runs the optimizer and the new weights are broadcast
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size(); 
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+
+  // load cdec.ini and set up decoder
+  vector<string> cdec_ini;
+  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
+  istringstream ini;
+  StoreConfig(cdec_ini, &ini);
+  if (rank == 0) cerr << "Loading grammar...\n";
+  Decoder* decoder = new Decoder(&ini);  // NOTE(review): never deleted in this function
+  if (decoder->GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    return 1;
+  }
+  if (rank == 0) cerr << "Done loading grammar!\n";
+
+  // load initial weights
+  if (rank == 0) { cerr << "Loading weights...\n"; }
+  vector<weight_t>& lambdas = decoder->CurrentWeightVector();  // reference: updates to lambdas modify the decoder's weight vector
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
+  if (rank == 0) { cerr << "Done loading weights.\n"; }
+
+  // freeze feature set (should be optional?)
+  const bool freeze_feature_set = true;
+  if (freeze_feature_set) FD::Freeze();
+
+  const int num_feats = FD::NumFeats();
+  if (rank == 0) cerr << "Number of features: " << num_feats << endl;
+  lambdas.resize(num_feats);
+
+  const bool gaussian_prior = conf.count("gaussian_prior");
+  vector<weight_t> means(num_feats, 0);  // prior means default to 0 unless --means is given
+  if (conf.count("means")) {
+    if (!gaussian_prior) {
+      cerr << "Don't use --means without --gaussian_prior!\n";
+      exit(1);
+    }
+    Weights::InitFromFile(conf["means"].as<string>(), &means);
+  }
+  boost::shared_ptr<BatchOptimizer> o;  // only set on rank 0
+  if (rank == 0) {
+    const string omethod = conf["optimization_method"].as<string>();
+    if (omethod == "rprop")
+      o.reset(new RPropOptimizer(num_feats));  // TODO add configuration
+    else
+      o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>()));  // any value other than "rprop" selects LBFGS
+    cerr << "Optimizer: " << o->Name() << endl;
+  }
+  double objective = 0;
+  vector<double> gradient(num_feats, 0.0);
+  vector<double> rcv_grad;
+  rcv_grad.clear();
+  bool converged = false;
+
+  vector<string> corpus, test_corpus;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);
+  if (conf.count("test_data"))
+    ReadTrainingCorpus(conf["test_data"].as<string>(), rank, size, &test_corpus);
+
+  TrainingObserver observer;
+  ConditionalLikelihoodObserver cllh_observer;
+  while (!converged) {  // one pass over the local training (and test) shard per iteration
+    observer.Reset();
+    cllh_observer.Reset();
+#ifdef HAVE_MPI
+    mpi::timer timer;
+    world.barrier();
+#endif
+    if (rank == 0) {
+      cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
+      cerr << "  Testset size: " << test_corpus.size() << " sentences / proc)\n";  // NOTE(review): message has a ')' without a matching '('
+    }
+    for (int i = 0; i < corpus.size(); ++i)
+      decoder->Decode(corpus[i], &observer);
+    cerr << "  process " << rank << '/' << size << " done\n";
+    fill(gradient.begin(), gradient.end(), 0);
+    observer.SetLocalGradientAndObjective(&gradient, &objective);
+
+    unsigned total_words = 0;
+#ifdef HAVE_MPI
+    double to = 0;
+    rcv_grad.resize(num_feats, 0.0);
+    mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);  // sum the per-rank gradients onto rank 0
+    swap(gradient, rcv_grad);
+    rcv_grad.clear();
+
+    reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
+    mpi::reduce(world, objective, to, plus<double>(), 0);
+    objective = to;
+#else
+    total_words = observer.trg_words;
+#endif
+    if (rank == 0)
+      cerr << "TRAINING CORPUS: ln p(f|e)=" << objective << "\t log_2 p(f|e) = " << (objective/log(2)) << "\t cond. entropy = " << (objective/log(2) / total_words) << "\t ppl = " << pow(2, (objective/log(2) / total_words)) << endl;
+
+    for (int i = 0; i < test_corpus.size(); ++i)
+      decoder->Decode(test_corpus[i], &cllh_observer);
+
+    double test_objective = 0;
+    unsigned test_total_words = 0;
+#ifdef HAVE_MPI
+    reduce(world, cllh_observer.acc_obj, test_objective, std::plus<double>(), 0);
+    reduce(world, cllh_observer.trg_words, test_total_words, std::plus<unsigned>(), 0);
+#else
+    test_objective = cllh_observer.acc_obj;
+    test_total_words = cllh_observer.trg_words;
+#endif
+
+    if (rank == 0) {  // run optimizer only on rank=0 node
+      if (test_corpus.size())
+        cerr << "    TEST CORPUS: ln p(f|e)=" << test_objective << "\t log_2 p(f|e) = " << (test_objective/log(2)) << "\t cond. entropy = " << (test_objective/log(2) / test_total_words) << "\t ppl = " << pow(2, (test_objective/log(2) / test_total_words)) << endl;
+      if (gaussian_prior) {
+        const double sigsq = conf["sigma_squared"].as<double>();
+        double norm = 0;
+        for (int k = 1; k < lambdas.size(); ++k) {  // index 0 is skipped
+          const double& lambda_k = lambdas[k];
+          if (lambda_k) {  // weights that are exactly 0 contribute nothing to the prior
+            const double param = (lambda_k - means[k]);
+            norm += param * param;
+            gradient[k] += param / sigsq;
+          }
+        }
+        const double reg = norm / (2.0 * sigsq);
+        cerr << "REGULARIZATION TERM: " << reg << endl;
+        objective += reg;
+      }
+      cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl;
+      double gnorm = 0;
+      for (int i = 0; i < gradient.size(); ++i)
+        gnorm += gradient[i] * gradient[i];
+      cerr << "  GNORM=" << sqrt(gnorm) << endl;
+      vector<weight_t> old = lambdas;
+      int c = 0;
+      while (old == lambdas) {  // repeat until the optimizer actually changes the weights
+        ++c;
+        if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; }
+        o->Optimize(objective, gradient, &lambdas);
+        assert(c < 5);  // assert-enabled builds abort once c reaches 5
+      }
+      old.clear();
+      Weights::SanityCheck(lambdas);
+      Weights::ShowLargestFeatures(lambdas);
+
+      converged = o->HasConverged();
+      if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
+
+      string fname = "weights.cur.gz";
+      if (converged) { fname = "weights.final.gz"; }
+      ostringstream vv;
+      vv << "Objective = " << objective << "  (eval count=" << o->EvaluationCount() << ")";
+      const string svv = vv.str();
+      Weights::WriteToFile(fname, lambdas, true, &svv);
+    }  // rank == 0
+    int cint = converged;  // int copy of the flag for the broadcast below
+#ifdef HAVE_MPI
+    mpi::broadcast(world, &lambdas[0], lambdas.size(), 0);  // send rank 0's weights to every rank
+    mpi::broadcast(world, cint, 0);
+    if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+    converged = cint;
+  }
+  return 0;
+}
+
diff --git a/training/crf/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc
new file mode 100644
index 00000000..066389d0
--- /dev/null
+++ b/training/crf/mpi_compute_cllh.cc
@@ -0,0 +1,134 @@
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "cllh_observer.h"
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses options into *conf; prints usage to cerr and returns false on --help or a missing required option
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w",po::value<string>(),"Input feature weights file")
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");  // only added to dcmdline_options below, i.e. not accepted from the config file
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // the command line is stored before any config file
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): a failed open is not checked here
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {  // required: training_data, decoder_config ("weights" is not required)
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadInstances(const string& fname, int rank, int size, vector<string>* c) {
+  // Round-robin sharding: the line with 0-based index i is appended to *c
+  // only when i % size == rank (rank r keeps lines r, r+size, r+2*size, ...).
+  assert(fname != "-");  // the name "-" is rejected (assert-enabled builds only)
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  // A failed getline() (end of input) ends the loop, so nothing is stored
+  // for the failed read.
+  for (int lc = 0; getline(in, line); ++lc) {
+    if (lc % size == rank) c->push_back(line);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+int main(int argc, char** argv) {  // decodes this rank's shard and reports the summed objective, entropy and perplexity on rank 0
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;  // non-zero exit status on usage errors (`return false` would exit with 0)
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+
+  // load weights
+  vector<weight_t>& weights = decoder.CurrentWeightVector();
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
+
+  vector<string> corpus;
+  ReadInstances(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);
+
+  if (rank == 0)
+    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+
+  ConditionalLikelihoodObserver observer;
+  for (int i = 0; i < corpus.size(); ++i)
+    decoder.Decode(corpus[i], &observer);
+
+  double objective = 0;
+  unsigned total_words = 0;
+#ifdef HAVE_MPI
+  reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
+  reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
+#else
+  objective = observer.acc_obj;
+  total_words = observer.trg_words;  // previously left at 0 here, so the per-word figures below divided by zero
+#endif
+
+  if (rank == 0) {
+    cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl;
+    cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl;
+    cout << "         CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl;
+    cout << "                  PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl;
+  }
+
+  return 0;
+}
+
diff --git a/training/crf/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc
new file mode 100644
index 00000000..6750aa15
--- /dev/null
+++ b/training/crf/mpi_extract_features.cc
@@ -0,0 +1,151 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <cassert>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ff_register.h"
+#include "verbose.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "decoder.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses options into *conf; prints usage to cerr and returns false on --help or a missing required option
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+        ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
+        ("output_prefix,o",po::value<string>()->default_value("features"),"Output path prefix");
+  po::options_description clo("Command line options");  // only added to dcmdline_options below, i.e. not accepted from the config file
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // the command line is stored before any config file
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());  // NOTE(review): a failed open is not checked here
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {  // required: training_data, decoder_config
+    cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the feature strings encountered.\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+  // Round-robin sharding: the line with 0-based index i is appended to *c
+  // only when i % size == rank (rank r keeps lines r, r+size, r+2*size, ...).
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  // A failed getline() (end of input) ends the loop, so nothing is stored
+  // for the failed read.
+  for (int lc = 0; getline(in, line); ++lc) {
+    if (lc % size == rank) c->push_back(line);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+  // Observer whose callbacks are all empty: it is passed to Decode() but records nothing itself.
+  virtual void NotifyDecodingStart(const SentenceMetadata&) {
+  }
+
+  // intentionally empty: nothing is computed from the translation forest here
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+  }
+
+  // intentionally empty: nothing is computed from the alignment forest here
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+  }
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+int main(int argc, char** argv) {  // decodes this rank's shard, then writes the feature names known to FD to <output_prefix>.<rank>_of_<size>
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;  // non-zero exit status on usage errors (`return false` would exit with 0)
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+
+  if (FD::UsingPerfectHashFunction()) {
+    cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
+    return 1;
+  }
+
+  // load optional weights
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
+
+  vector<string> corpus;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);
+
+  TrainingObserver observer;
+
+  if (rank == 0)
+    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+
+  for (int i = 0; i < corpus.size(); ++i)
+    decoder.Decode(corpus[i], &observer);
+
+  // write one feature name per line for ids 1 .. num_feats-1 (id 0 is skipped)
+  {
+    ostringstream os;
+    os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
+    WriteFile wf(os.str());
+    ostream& out = *wf.stream();
+    const unsigned num_feats = FD::NumFeats();
+    for (unsigned i = 1; i < num_feats; ++i) {
+      out << FD::Convert(i) << endl;
+    }
+    cerr << "Wrote " << os.str() << endl;
+  }
+
+  // synchronize all ranks before exiting (nothing to do without MPI)
+#ifdef HAVE_MPI
+  world.barrier();
+#endif
+
+  return 0;
+}
+
diff --git a/training/crf/mpi_extract_reachable.cc b/training/crf/mpi_extract_reachable.cc
new file mode 100644
index 00000000..2a7c2b9d
--- /dev/null
+++ b/training/crf/mpi_extract_reachable.cc
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <cassert>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ff_register.h"
+#include "verbose.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "decoder.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the command line (and the file named by --config, if given) into
+// *conf. Prints usage and returns false when --help is present or a required
+// option (training_data, decoder_config) is missing; returns true otherwise.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description shared_opts("Configuration options");
+  shared_opts.add_options()
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+        ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
+        ("output_prefix,o",po::value<string>()->default_value("reachable"),"Output path prefix");
+  po::options_description cmdline_only("Command line options");
+  cmdline_only.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description file_opts, all_opts;
+  file_opts.add(shared_opts);
+  all_opts.add(shared_opts).add(cmdline_only);
+
+  po::store(parse_command_line(argc, argv, all_opts), *conf);
+  if (conf->count("config")) {
+    ifstream config_in((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config_in, file_opts), *conf);
+  }
+  po::notify(*conf);
+  if (!conf->count("help") && conf->count("training_data") && conf->count("decoder_config")) return true;
+  cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the inputs that produce reachable parallel parses.\n";
+  cerr << all_opts << endl;
+  return false;
+}
+
+// Reads fname line by line and appends to *c the lines belonging to this
+// rank: line number lc (zero-based) is kept iff lc % size == rank.
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  // getline's result is the loop condition, so the loop stops at end of input
+  for (int lc = 0; getline(in, line); ++lc) {
+    if (lc % size != rank) continue;
+    c->push_back(line);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;  // NOTE(review): not referenced anywhere in mpi_extract_reachable.cc
+
+// Decoder observer that records whether the most recently decoded input
+// produced an alignment forest, i.e. whether NotifyAlignmentForest() was
+// called after the last NotifyDecodingStart().
+struct ReachabilityObserver : public DecoderObserver {
+  bool reachable;
+
+  // every input starts out as not reachable
+  virtual void NotifyDecodingStart(const SentenceMetadata&) { reachable = false; }
+
+  // the translation forest is ignored by this observer
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph*) {}
+
+  // an alignment forest was reported for the input: mark it reachable
+  virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph*) {
+    reachable = true;
+  }
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+// Decodes this rank's shard of the training data and copies every input for
+// which the observer saw an alignment forest to
+// <output_prefix>.<rank>_of_<size>; rank 0 finally reports the global count.
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;  // `return false` would exit with status 0, i.e. report success
+
+  // load cdec.ini and set up decoder
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+  if (decoder.GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    abort();
+  }
+  if (FD::UsingPerfectHashFunction()) {
+    cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
+    return 1;
+  }
+
+  // load optional weights
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
+
+  vector<string> corpus;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);
+
+  if (rank == 0)
+    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+  size_t num_reached = 0;  // inputs of this shard written to the output file
+  {
+    ostringstream os;
+    os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
+    WriteFile wf(os.str());
+    ostream& out = *wf.stream();
+    ReachabilityObserver observer;
+    for (size_t i = 0; i < corpus.size(); ++i) {
+      decoder.Decode(corpus[i], &observer);
+      if (observer.reachable) {
+         out << corpus[i] << endl;
+         ++num_reached;
+      }
+      corpus[i].clear();  // the sentence text is not needed again
+    }
+    cerr << "Shard " << rank << '/' << size << " finished, wrote "
+         << num_reached << " instances to " << os.str() << endl;
+  }
+
+  size_t total = 0;
+#ifdef HAVE_MPI
+  reduce(world, num_reached, total, std::plus<size_t>(), 0);  // was plus<double>: counts were summed through double
+#else
+  total = num_reached;
+#endif
+  if (rank == 0) {
+    cerr << "-----------------------------------------\n";
+    cerr << "TOTAL = " << total << " instances\n";
+  }
+  return 0;
+}
+
diff --git a/training/crf/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc
new file mode 100644
index 00000000..b52decdc
--- /dev/null
+++ b/training/crf/mpi_flex_optimize.cc
@@ -0,0 +1,386 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "stringlib.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "optimize.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the command line (and the --config file, if any) into *conf; prints
+// usage and returns false on --help or when training_data/cdec_config is missing.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("cdec_config,c",po::value<string>(),"Decoder configuration file")
+        ("weights,w",po::value<string>(),"Initial feature weights")
+        ("training_data,d",po::value<string>(),"Training data")
+        ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(6), "Number of training instances evaluated per processor in each minibatch")
+        ("minibatch_iterations,i", po::value<unsigned>()->default_value(10), "Number of optimization iterations per minibatch")
+        ("iterations,I", po::value<unsigned>()->default_value(50), "Number of passes through the training data before termination")
+        ("regularization_strength,C", po::value<double>()->default_value(0.2), "Regularization strength")
+        ("time_series_strength,T", po::value<double>()->default_value(0.0), "Time series regularization strength")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("lbfgs_memory_buffers,M", po::value<unsigned>()->default_value(10), "Number of memory buffers for LBFGS history");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  if (conf->count("help") || !conf->count("training_data") || !conf->count("cdec_config")) {
+    cerr << "LBFGS minibatch online optimizer (MPI support "
+#ifdef HAVE_MPI  // same test as the rest of this file (was `#if HAVE_MPI`)
+         << "enabled"
+#else
+         << "not enabled"
+#endif
+         << ")\n" << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+// Loads this rank's shard of the corpus in fname. A line whose zero-based
+// line number id satisfies id % size == rank is appended to *c and its line
+// number to *order; all other lines are skipped.
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  // id counts every line read, including those owned by other ranks
+  for (int id = 0; getline(in, line); ++id) {
+    const bool mine = (id % size == rank);
+    if (!mine) continue;
+    c->push_back(line);
+    order->push_back(id);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;  // main() reports an error once log(z) - log(goldz) falls below this
+
+// Decoder observer that copies the translation forest and the alignment
+// forest of the current sentence into the hypergraphs registered through
+// SetCurrentHypergraphs(). state: 1 = started, 2 = forest seen, 3 = both seen.
+struct CopyHGsObserver : public DecoderObserver {
+  Hypergraph* hg_;       // receives the translation forest
+  Hypergraph* gold_hg_;  // receives the alignment forest
+
+  // resets the rule pointer of every edge; this can free up some memory
+  void RemoveRules(Hypergraph* h) {
+    for (unsigned e = 0; e != h->edges_.size(); ++e) { h->edges_[e].rule_.reset(); }
+  }
+
+  // selects where the forests of the next decoded sentence are stored
+  void SetCurrentHypergraphs(Hypergraph* h, Hypergraph* gold_h) {
+    gold_hg_ = gold_h;
+    hg_ = h;
+  }
+
+  virtual void NotifyDecodingStart(const SentenceMetadata&) { state = 1; }
+
+  // keep a rule-stripped copy of the translation forest
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+    *hg_ = *hg;
+    RemoveRules(hg_);
+    assert(state == 1);
+    state = 2;
+  }
+
+  // keep a rule-stripped copy of the alignment forest
+  virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) {
+    assert(state == 2);
+    state = 3;
+    *gold_hg_ = *hg;
+    RemoveRules(gold_hg_);
+  }
+
+  // unless both forests were reported, empty both destination hypergraphs
+  virtual void NotifyDecodingComplete(const SentenceMetadata&) {
+    if (state == 3) return;
+    hg_->clear();
+    gold_hg_->clear();
+  }
+
+  int state;
+};
+
+// Reads the file named ini and stores its text in *out, line by line, with
+// every line (including the last) terminated by a newline.
+void ReadConfig(const string& ini, istringstream* out) {
+  ReadFile rf(ini);
+  istream& in = *rf.stream();
+  ostringstream buf;
+  string line;
+  while (getline(in, line)) {
+    buf << line << endl;
+  }
+  out->str(buf.str());
+}
+
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {
+  template<>
+  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >  // marks SparseVector<double> addition as commutative for Boost.MPI
+    : mpl::true_ { };
+} } // end namespace boost::mpi
+#endif
+
+// Adds s * x (entries converted with as_float()) into *acc; x is taken by reference to avoid a copy.
+void AddGrad(const SparseVector<prob_t>& x, double s, SparseVector<double>* acc) {
+  for (SparseVector<prob_t>::const_iterator it = x.begin(); it != x.end(); ++it) acc->add_value(it->first, it->second.as_float() * s);
+}
+
+// p-norm (sum_i |v_i|^p)^(1/p); |.| keeps negative entries valid for any p.
+double PNorm(const vector<double>& v, const double p) {
+  double acc = 0;
+  for (size_t i = 0; i < v.size(); ++i) acc += pow(fabs(v[i]), p);
+  return pow(acc, 1.0 / p);
+}
+
+// Debug helper: writes "name=value " to os for every nonzero entry of v (index 0 is skipped).
+void VV(ostream&os, const vector<double>& v) {
+  for (int fid = 1; fid < v.size(); ++fid) { if (v[fid] != 0) os << FD::Convert(fid) << "=" << v[fid] << " "; }
+}
+
+// Returns sum_k C*w_k^2 + T*(w_k - prev_k)^2 and adds the gradient of that
+// penalty into g[0 .. weights.size()). Entries that prev_weights does not
+// have are treated as 0.
+double ApplyRegularizationTerms(const double C, const double T,
+                                const vector<double>& weights,
+                                const vector<double>& prev_weights, double* g) {
+  double penalty = 0;
+  for (size_t k = 0; k < weights.size(); ++k) {
+    const double w = weights[k];
+    const double delta = w - (k < prev_weights.size() ? prev_weights[k] : 0.0);
+    penalty += C * w * w;
+    penalty += T * delta * delta;
+    g[k] += 2 * C * w;
+    g[k] += 2 * T * delta;
+  }
+  return penalty;
+}
+
+int main(int argc, char** argv) {  // minibatch LBFGS training driver: returns 0 when finished, 1 on error
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();  // number of processes
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+  MT19937* rng = NULL;
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;
+
+  boost::shared_ptr<BatchOptimizer> o;
+  const unsigned lbfgs_memory_buffers = conf["lbfgs_memory_buffers"].as<unsigned>();
+  const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
+  const unsigned minibatch_iterations = conf["minibatch_iterations"].as<unsigned>();
+  const double regularization_strength = conf["regularization_strength"].as<double>();
+  const double time_series_strength = conf["time_series_strength"].as<double>();
+  const bool use_time_series_reg = time_series_strength > 0.0;
+  const unsigned max_iteration = conf["iterations"].as<unsigned>();
+
+  vector<string> corpus;
+  vector<int> ids;  // original line number of each entry of corpus
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+  assert(corpus.size() > 0);
+
+  if (size_per_proc > corpus.size()) {
+    cerr << "Minibatch size (per processor) must be smaller or equal to the local corpus size!\n";
+    return 1;
+  }
+
+  // initialize decoder (loads hash functions if necessary)
+  istringstream ins;
+  ReadConfig(conf["cdec_config"].as<string>(), &ins);
+  Decoder decoder(&ins);
+
+  // load initial weights
+  vector<weight_t> prev_weights;
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &prev_weights);
+
+  if (conf.count("random_seed"))
+    rng = new MT19937(conf["random_seed"].as<uint32_t>());
+  else
+    rng = new MT19937;
+
+  size_t total_corpus_size = 0;  // only filled in on rank 0 (root of the reduce)
+#ifdef HAVE_MPI
+  reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
+#else
+  total_corpus_size = corpus.size();
+#endif
+
+  if (rank == 0)
+    cerr << "Total corpus size: " << total_corpus_size << endl;
+
+  CopyHGsObserver observer;
+
+  int write_weights_every_ith = 100; // TODO configure
+  int titer = -1;  // incremented together with iter below
+
+  vector<weight_t>& cur_weights = decoder.CurrentWeightVector();
+  if (use_time_series_reg) {
+    cur_weights = prev_weights;  // keep the initial weights in prev_weights as well
+  } else {
+    cur_weights.swap(prev_weights);  // hand the initial weights to the decoder
+    prev_weights.clear();
+  }
+
+  int iter = -1;
+  bool converged = false;
+  vector<double> gg;  // dense gradient buffer, used on rank 0 only
+  while (!converged) {
+#ifdef HAVE_MPI
+    mpi::timer timer;
+#endif
+    ++iter; ++titer;
+    if (rank == 0) {
+      converged = (iter == max_iteration);  // decided on rank 0, broadcast to the others below
+        string fname = "weights.cur.gz";
+        if (iter % write_weights_every_ith == 0) {
+          ostringstream o; o << "weights.epoch_" << iter << ".gz";
+          fname = o.str();
+        }
+        if (converged) { fname = "weights.final.gz"; }
+        ostringstream vv;
+        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size()));
+        const string svv = vv.str();
+        Weights::WriteToFile(fname, cur_weights, true, &svv);
+      }  // end of the rank == 0 block opened above
+
+      vector<Hypergraph> hgs(size_per_proc);  // translation forests of this minibatch
+      vector<Hypergraph> gold_hgs(size_per_proc);  // alignment forests of this minibatch
+      for (int i = 0; i < size_per_proc; ++i) {
+        int ei = corpus.size() * rng->next();  // random local sentence; presumably next() is in [0,1) -- TODO confirm
+        int id = ids[ei];
+        observer.SetCurrentHypergraphs(&hgs[i], &gold_hgs[i]);
+        decoder.SetId(id);
+        decoder.Decode(corpus[ei], &observer);
+      }
+
+      SparseVector<double> local_grad, g;
+      double local_obj = 0;
+      o.reset();  // drop the previous optimizer; rank 0 creates a new one below
+      for (unsigned mi = 0; mi < minibatch_iterations; ++mi) {
+        local_grad.clear();
+        g.clear();
+        local_obj = 0;
+
+        for (unsigned i = 0; i < size_per_proc; ++i) {
+          Hypergraph& hg = hgs[i];
+          Hypergraph& hg_gold = gold_hgs[i];
+          if (hg.edges_.size() < 2) continue;  // forests with fewer than two edges are skipped
+
+          hg.Reweight(cur_weights);
+          hg_gold.Reweight(cur_weights);
+          SparseVector<prob_t> model_exp, gold_exp;
+          const prob_t z = InsideOutside<prob_t,
+                                         EdgeProb,
+                                         SparseVector<prob_t>,
+                                         EdgeFeaturesAndProbWeightFunction>(hg, &model_exp);
+          local_obj += log(z);
+          model_exp /= z;
+          AddGrad(model_exp, 1.0, &local_grad);  // + expectations from the translation forest
+          model_exp.clear();
+
+          const prob_t goldz = InsideOutside<prob_t,
+                                         EdgeProb,
+                                         SparseVector<prob_t>,
+                                         EdgeFeaturesAndProbWeightFunction>(hg_gold, &gold_exp);
+          local_obj -= log(goldz);
+
+          if (log(z) - log(goldz) < kMINUS_EPSILON) {
+            cerr << "DIFF. ERR! log_model_z < log_gold_z: " << log(z) << " " << log(goldz) << endl;
+            return 1;
+          }
+
+          gold_exp /= goldz;
+          AddGrad(gold_exp, -1.0, &local_grad);  // - expectations from the alignment forest
+        }
+
+        double obj = 0;
+#ifdef HAVE_MPI
+        reduce(world, local_obj, obj, std::plus<double>(), 0);
+        reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
+#else
+        obj = local_obj;
+        g.swap(local_grad);
+#endif
+        local_grad.clear();
+        if (rank == 0) {
+          // g /= (size_per_proc * size);
+          if (!o)
+            o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
+          gg.clear();
+          gg.resize(FD::NumFeats());
+          if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); }
+          for (SparseVector<double>::iterator it = g.begin(); it != g.end(); ++it)
+            if (it->first) { gg[it->first] = it->second; }  // entry 0 of the sparse gradient is not copied
+          g.clear();
+          double r = ApplyRegularizationTerms(regularization_strength,
+                                time_series_strength, // * (iter == 0 ? 0.0 : 1.0),
+                                cur_weights,
+                                prev_weights,
+                                &gg[0]);
+          obj += r;
+          if (mi == 0 || mi == (minibatch_iterations - 1)) {
+            if (!mi) cerr << iter << ' '; else cerr << ' ';
+            cerr << "OBJ=" << obj << " (REG=" << r << ")" << " |g|=" << PNorm(gg, 2) << " |w|=" << PNorm(cur_weights, 2);  // progress on stderr
+            if (mi > 0) cerr << endl << flush; else cerr << ' ';
+          } else { cerr << '.' << flush; }
+          // cerr << "w = "; VV(cerr, cur_weights); cerr << endl;
+          // cerr << "g = "; VV(cerr, gg); cerr << endl;
+          o->Optimize(obj, gg, &cur_weights);
+        }
+#ifdef HAVE_MPI
+        broadcast(world, cur_weights, 0);
+        broadcast(world, converged, 0);
+        world.barrier();
+#endif
+    }  // end of the loop over minibatch_iterations
+    prev_weights = cur_weights;  // reference point of the time-series penalty for the next minibatch
+  }
+  return 0;
+}
diff --git a/training/crf/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc
new file mode 100644
index 00000000..d6968848
--- /dev/null
+++ b/training/crf/mpi_online_optimize.cc
@@ -0,0 +1,374 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <tr1/memory>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "stringlib.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "online_optimizer.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the command line (and the file named by --config, if given) into
+// *conf. Prints the option summary and returns false when --help is present
+// or training_data / training_agenda is missing; returns true otherwise.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description shared_opts("Configuration options");
+  shared_opts.add_options()
+        ("input_weights,w",po::value<string>(),"Input feature weights file")
+        ("frozen_features,z",po::value<string>(), "List of features not to optimize")
+        ("training_data,t",po::value<string>(),"Training data corpus")
+        ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
+        ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
+        ("optimization_method,m", po::value<string>()->default_value("sgd"), "Optimization method (sgd)")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("eta_0,e", po::value<double>()->default_value(0.2), "Initial learning rate for SGD (eta_0)")
+        ("L1,1","Use L1 regularization")
+        ("regularization_strength,C", po::value<double>()->default_value(1.0), "Regularization strength (C)");
+  po::options_description cmdline_only("Command line options");
+  cmdline_only.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description file_opts, all_opts;
+  file_opts.add(shared_opts);
+  all_opts.add(shared_opts).add(cmdline_only);
+
+  po::store(parse_command_line(argc, argv, all_opts), *conf);
+  if (conf->count("config")) {
+    ifstream config_in((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config_in, file_opts), *conf);
+  }
+  po::notify(*conf);
+  if (!conf->count("help") && conf->count("training_data") && conf->count("training_agenda")) return true;
+  cerr << all_opts << endl;
+  return false;
+}
+
+// Loads this rank's shard of the corpus in fname. A line whose zero-based
+// line number id satisfies id % size == rank is appended to *c and its line
+// number to *order; all other lines are skipped.
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  // id counts every line read, including those owned by other ranks
+  for (int id = 0; getline(in, line); ++id) {
+    const bool mine = (id % size == rank);
+    if (!mine) continue;
+    c->push_back(line);
+    order->push_back(id);
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;  // TrainingObserver exits once (log model Z - log reference Z) falls below this
+
+// Decoder observer that accumulates, across decoded sentences, the gradient
+// (reference expectations minus model expectations) and the objective
+// (log model Z minus log reference Z). state records the callbacks seen for
+// the current sentence: 1 = start, 2 = translation forest, 3 = alignment forest.
+struct TrainingObserver : public DecoderObserver {
+  // discards everything accumulated so far
+  void Reset() {
+    total_complete = 0;
+    acc_obj = 0;
+    acc_grad.clear();
+  }
+
+  // copies the accumulated objective to *o and each gradient entry into (*g)
+  // at its feature index; *g is indexed directly, so it must be large enough
+  void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
+    *o = acc_obj;
+    vector<double>& dense = *g;
+    SparseVector<prob_t>::const_iterator it = acc_grad.begin();
+    for (; it != acc_grad.end(); ++it) { dense[it->first] = it->second.as_float(); }
+  }
+
+  // a new sentence begins: clear the per-sentence values
+  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
+    state = 1;
+    cur_obj = 0;
+    cur_model_exp.clear();
+  }
+
+  // model side: expectations over the translation forest and its log partition
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 1);
+    state = 2;
+    const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>,
+                                   EdgeFeaturesAndProbWeightFunction>(
+        *hg, &cur_model_exp);
+    cur_model_exp /= z;
+    cur_obj = log(z);
+  }
+
+  // reference side: expectations over the alignment forest; adds this
+  // sentence's contribution to the accumulated gradient and objective
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 2);
+    state = 3;
+    SparseVector<prob_t> ref_exp;
+    const prob_t ref_z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>,
+                                       EdgeFeaturesAndProbWeightFunction>(
+        *hg, &ref_exp);
+    ref_exp /= ref_z;
+    const double log_ref_z = log(ref_z);
+    const double gap = cur_obj - log_ref_z;
+
+    // rounding errors means that <0 is too strict
+    if (gap < kMINUS_EPSILON) {
+      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
+      exit(1);
+    }
+    assert(!std::isnan(log_ref_z));
+    ref_exp -= cur_model_exp;
+    acc_grad += ref_exp;
+    acc_obj += gap;
+  }
+
+  // a sentence counts as complete only once its alignment forest was seen
+  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+    if (state == 3) {
+      ++total_complete;
+    }
+  }
+
+  // replaces *g with the accumulated gradient, converted via as_float()
+  void GetGradient(SparseVector<double>* g) const {
+    g->clear();
+    SparseVector<prob_t>::const_iterator it = acc_grad.begin();
+    for (; it != acc_grad.end(); ++it) { g->set_value(it->first, it->second.as_float()); }
+  }
+
+  int total_complete;                  // sentences that reached state 3 since Reset()
+  SparseVector<prob_t> cur_model_exp;  // model expectations of the current sentence
+  SparseVector<prob_t> acc_grad;       // accumulated gradient
+  double acc_obj;                      // accumulated objective
+  double cur_obj;                      // log z of the current sentence
+  int state;
+};
+
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {
+  template<>
+  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >  // marks SparseVector<double> addition as commutative for Boost.MPI
+    : mpl::true_ { };
+} } // end namespace boost::mpi
+#endif
+
+// Loads the training agenda from file: each entry is "<config> <iterations>"
+// with exactly one space; empty lines and '#' lines are ignored. Appends the
+// entries to *a; prints a diagnostic and returns false on any invalid entry.
+bool LoadAgenda(const string& file, vector<pair<string, int> >* a) {
+  ReadFile rf(file);
+  istream& in = *rf.stream();
+  string line;
+  while (getline(in, line)) {
+    if (line.empty() || line[0] == '#') continue;
+    size_t sc = 0;  // number of spaces in the line
+    for (size_t i = 0; i < line.size(); ++i) { if (line[i] == ' ') ++sc; }
+    if (line.size() < 3 || sc != 1) {  // used to fail silently, or to claim "Too many spaces" when there were none
+      cerr << "Malformed agenda line (expected '<config> <iterations>'): " << line << endl;
+      return false;
+    }
+    const size_t d = line.find(' ');
+    const pair<string, int> x(line.substr(0, d), atoi(line.substr(d + 1).c_str()));
+    a->push_back(x);
+    if (!FileExists(x.first)) {
+      cerr << "Can't find file " << x.first << endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Online minibatch trainer (deprecated in favour of mpi_flex_optimize): for
+// each agenda entry it builds a decoder from that config and runs minibatch
+// updates, with rank 0 owning the optimizer. Returns 0 when done, 1 on error.
+int main(int argc, char** argv) {
+  cerr << "THIS SOFTWARE IS DEPRECATED YOU SHOULD USE mpi_flex_optimize\n";
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+  std::tr1::shared_ptr<MT19937> rng;
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf))
+    return 1;
+
+  vector<pair<string, int> > agenda;
+  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
+    return 1;
+  if (rank == 0)
+    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
+  assert(agenda.size() > 0);
+
+  {  // hack to load the feature hash functions -- TODO this should not be in cdec.ini
+    // a throwaway decoder is constructed from the first agenda entry's config
+    ReadFile ini_rf(agenda[0].first);
+    Decoder decoder(ini_rf.stream());
+  }
+
+  // load initial weights
+  vector<weight_t> init_weights;
+  if (conf.count("input_weights"))
+    Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
+
+  vector<int> frozen_fids;
+  if (conf.count("frozen_features")) {
+    ReadFile rf(conf["frozen_features"].as<string>());
+    istream& in = *rf.stream();
+    string line;
+    // test getline itself: with `while(in)` a last line lacking '\n' was processed twice
+    while (getline(in, line)) {
+      if (line.empty()) continue;
+      if (line[0] == ' ' || line[line.size() - 1] == ' ') { line = Trim(line); }
+      frozen_fids.push_back(FD::Convert(line));
+    }
+    if (rank == 0) cerr << "Freezing " << frozen_fids.size() << " features.\n";
+  }
+
+  vector<string> corpus;
+  vector<int> ids;  // original line number of each entry of corpus
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+  assert(corpus.size() > 0);
+
+  std::tr1::shared_ptr<OnlineOptimizer> o;  // only set on rank 0
+  std::tr1::shared_ptr<LearningRateSchedule> lr;  // only set on rank 0
+  const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
+  if (size_per_proc > corpus.size()) {
+    cerr << "Minibatch size must be smaller than corpus size!\n";
+    return 1;
+  }
+
+  size_t total_corpus_size = 0;
+#ifdef HAVE_MPI
+  reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
+#else
+  total_corpus_size = corpus.size();
+#endif
+
+  if (rank == 0) {
+    cerr << "Total corpus size: " << total_corpus_size << endl;
+    const unsigned batch_size = size_per_proc * size;
+    // TODO config
+    lr.reset(new ExponentialDecayLearningRate(batch_size, conf["eta_0"].as<double>()));
+
+    const string omethod = conf["optimization_method"].as<string>();
+    if (omethod == "sgd") {
+      const double C = conf["regularization_strength"].as<double>();
+      o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C, frozen_fids));
+    } else {
+      assert(!"fail");
+    }
+  }
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+
+  SparseVector<double> x;  // current weights in sparse form
+  Weights::InitSparseVector(init_weights, &x);
+  TrainingObserver observer;
+
+  int write_weights_every_ith = 100; // TODO configure
+  int titer = -1;  // iterations across all agenda entries
+
+  for (int ai = 0; ai < agenda.size(); ++ai) {
+    const string& cur_config = agenda[ai].first;
+    const unsigned max_iteration = agenda[ai].second;
+    if (rank == 0)
+      cerr << "STARTING TRAINING EPOCH " << (ai+1) << ". CONFIG=" << cur_config << endl;
+    // load cdec.ini and set up decoder
+    ReadFile ini_rf(cur_config);
+    Decoder decoder(ini_rf.stream());
+    vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+    if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); }
+
+    if (rank == 0)
+      o->ResetEpoch(); // resets the learning rate-- TODO is this good?
+
+    int iter = -1;
+    bool converged = false;
+    while (!converged) {
+#ifdef HAVE_MPI
+      mpi::timer timer;
+#endif
+      x.init_vector(&lambdas);
+      ++iter; ++titer;
+      observer.Reset();
+      if (rank == 0) {
+        converged = (iter == max_iteration);  // decided on rank 0, broadcast to the others below
+        Weights::SanityCheck(lambdas);
+        static int cc = 0; ++cc; if (cc > 1) { Weights::ShowLargestFeatures(lambdas); }
+        string fname = "weights.cur.gz";
+        if (iter % write_weights_every_ith == 0) {
+          ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
+          fname = o.str();
+        }
+        if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
+        ostringstream vv;
+        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
+        const string svv = vv.str();
+        cerr << svv << endl;
+        Weights::WriteToFile(fname, lambdas, true, &svv);
+      }
+
+      for (unsigned i = 0; i < size_per_proc; ++i) {
+        int ei = corpus.size() * rng->next();  // random local sentence; presumably next() is in [0,1) -- TODO confirm
+        int id = ids[ei];
+        decoder.SetId(id);
+        decoder.Decode(corpus[ei], &observer);
+      }
+      SparseVector<double> local_grad, g;
+      observer.GetGradient(&local_grad);
+#ifdef HAVE_MPI
+      reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
+#else
+      g.swap(local_grad);
+#endif
+      local_grad.clear();
+      if (rank == 0) {
+        g /= (size_per_proc * size);  // average over the whole minibatch
+        o->UpdateWeights(g, FD::NumFeats(), &x);
+      }
+#ifdef HAVE_MPI
+      broadcast(world, x, 0);
+      broadcast(world, converged, 0);
+      world.barrier();
+      if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+    }
+  }
+  return 0;
+}
diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl
deleted file mode 100755
index 3cd226be..00000000
--- a/training/dep-reorder/conll2reordering-forest.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
-my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl";
-my $CDEC = "$script_dir/../../decoder/cdec";
-
-our $tfile1 = "grammar1.$$";
-our $tfile2 = "text.$$";
-
-die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1;
-open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
-
-END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; }
-
-my $first = 1;
-open T, ">$tfile1" or die "Can't write $tfile1: $!";
-my $lc = 0;
-my $flag = 0;
-my @words = ();
-while(<C>) {
-  print T;
-  chomp;
-  if (/^$/) {
-    if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } }
-    $first = undef;
-    close T;
-    open SO, ">$tfile2" or die "Can't write $tfile2: $!";
-    print SO "@words\n";
-    close SO;
-    @words=();
-    `$FIRST_CONV < $tfile1 > $tfile1.cfg`;
-    if ($? != 0) {
-      die "Error code: $?";
-    }
-    my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`;
-    if ($? != 0) {
-      die "Error code: $?";
-    }
-    my @rules = split /\n/, $cfg;
-    shift @rules; # get rid of output
-    for my $rule (@rules) {
-      my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule;
-      $f =~ s/,\d\]/\]/g;
-      $feats = 'TOP=1' unless $feats;
-      if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; }
-      print "$lhs ||| $f ||| $feats\n";
-      if ($e eq '[1] [2]') {
-        my ($a, $b) = split /\s+/, $f;
-        $feats =~ s/=1$//;
-        my ($x, $y) = split /_/, $feats;
-        print "$lhs ||| $b $a ||| ${y}_$x=1\n";
-      }
-      $flag = 1;
-    }
-    open T, ">$tfile1" or die "Can't write $tfile1: $!";
-    $lc = -1;
-  } else {
-    my ($ind, $word, @dmmy) = split /\s+/;
-    push @words, $word;
-  }
-  $lc++;
-}
-close T;
-
diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll
deleted file mode 100644
index 7eebb360..00000000
--- a/training/dep-reorder/george.conll
+++ /dev/null
@@ -1,4 +0,0 @@
-1	George	_	GEORGE	_	_	2	X	_	_
-2	hates	_	HATES	_	_	0	X	_	_
-3	broccoli	_	BROC	_	_	2	X	_	_
-
diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl
deleted file mode 100755
index b101347a..00000000
--- a/training/dep-reorder/scripts/conll2simplecfg.pl
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-# 1	在	_	10	_	_	4	X	_	_
-# 2	门厅	_	3	_	_	1	X	_	_
-# 3	下面	_	23	_	_	4	X	_	_
-# 4	。	_	45	_	_	0	X	_	_
-
-my @ldeps;
-my @rdeps;
-@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
-@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
-my $rootcat = 0;
-my @cats = ('S');
-my $len = 0;
-my @noposcats = ('S');
-while(<>) {
-  chomp;
-  if (/^\s*$/) {
-    write_cfg($len);
-    $len = 0;
-    @cats=('S');
-    @noposcats = ('S');
-    @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
-    @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
-    next;
-  }
-  $len++;
-  my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/;
-  my $cat = "C$xcat";
-  my $catpos = $cat . "_$pos";
-  push @cats, $catpos;
-  push @noposcats, $cat;
-  print "[$catpos] ||| $word ||| $word ||| Word=1\n";
-  if ($headpos == 0) { $rootcat = $pos; }
-  if ($pos < $headpos) {
-    push @{$ldeps[$headpos]}, $pos;
-  } else {
-    push @{$rdeps[$headpos]}, $pos;
-  }
-}
-
-sub write_cfg {
-  my $len = shift;
-  for (my $i = 1; $i <= $len; $i++) {
-    my @lds = @{$ldeps[$i]};
-    for my $ld (@lds) {
-      print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n";
-    }
-    my @rds = @{$rdeps[$i]};
-    for my $rd (@rds) {
-      print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n";
-    }
-  }
-  print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n";
-}
-
diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am
new file mode 100644
index 00000000..ff318bef
--- /dev/null
+++ b/training/dpmert/Makefile.am
@@ -0,0 +1,25 @@
+bin_PROGRAMS = \
+  mr_dpmert_map \
+  mr_dpmert_reduce \
+  mr_dpmert_generate_mapper_input
+
+noinst_PROGRAMS = \
+  lo_test
+TESTS = lo_test
+
+mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc
+mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+# nbest2hg_SOURCES = nbest2hg.cc
+# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz
+
+mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc
+mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc
+mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc
+lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/dpmert/ces.cc b/training/dpmert/ces.cc
new file mode 100644
index 00000000..157b2d17
--- /dev/null
+++ b/training/dpmert/ces.cc
@@ -0,0 +1,90 @@
+#include "ces.h"
+
+#include <vector>
+#include <sstream>
+#include <boost/shared_ptr.hpp>
+
+// TODO, if AER is to be optimized again, we will need this
+// #include "aligner.h"
+#include "lattice.h"
+#include "mert_geometry.h"
+#include "error_surface.h"
+#include "ns.h"
+
+using namespace std;
+
+const bool minimize_segments = true;    // if adjacent segments have equal scores, merge them
+
+void ComputeErrorSurface(const SegmentEvaluator& ss,     // scores each candidate translation
+                         const ConvexHull& ve,           // supplies the sorted segments walked below
+                         ErrorSurface* env,              // output: filled with (x, delta) segments and resized to the number written
+                         const EvaluationMetric* metric, // not referenced by the live code in this function
+                         const Hypergraph& hg) {         // referenced only inside the disabled #if 0 block
+  vector<WordID> prev_trans;
+  const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs();
+  env->resize(ienv.size());  // upper bound; shrunk to j at the end
+  SufficientStats prev_score; // defaults to 0
+  int j = 0;  // number of output segments written so far
+  for (unsigned i = 0; i < ienv.size(); ++i) {
+    const MERTPoint& seg = *ienv[i];
+    vector<WordID> trans;
+#if 0
+    if (type == AER) {
+      vector<bool> edges(hg.edges_.size(), false);
+      seg.CollectEdgesUsed(&edges);  // get the set of edges in the viterbi
+                                     // alignment
+      ostringstream os;
+      const string* psrc = ss.GetSource();
+      if (psrc == NULL) {
+        cerr << "AER scoring in VEST requires source, but it is missing!\n";
+        abort();
+      }
+      size_t pos = psrc->rfind(" ||| ");
+      if (pos == string::npos) {
+        cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl;
+        abort();
+      }
+      Lattice src;
+      Lattice ref;
+      LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src);
+      LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref);
+      AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges);
+      string tstr = os.str();
+      TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
+    } else {
+#endif
+      seg.ConstructTranslation(&trans);
+    //}
+    //cerr << "Scoring: " << TD::GetString(trans) << endl;
+    if (trans == prev_trans) {  // same translation as the previous segment: no need to re-score
+      if (!minimize_segments) {
+        ErrorSegment& out = (*env)[j];
+        out.delta.fields.clear();
+        out.x = seg.x;
+	++j;
+      }
+      //cerr << "Identical translation, skipping scoring\n";
+    } else {
+      SufficientStats score;
+      ss.Evaluate(trans, &score);
+      // cerr << "score= " << score->ComputeScore() << "\n";
+      //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl;
+      const SufficientStats delta = score - prev_score;  // change relative to the previously scored translation
+      //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl;
+      //string xx; delta.Encode(&xx); cerr << xx << endl;
+      prev_trans.swap(trans);
+      prev_score = score;
+      if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {  // when minimizing, drop segments whose delta is the additive identity
+        ErrorSegment& out = (*env)[j];
+        out.delta = delta;
+        out.x = seg.x;
+        ++j;
+      }
+    }
+  }
+  // cerr << " In segments: " << ienv.size() << endl;
+  // cerr << "Out segments: " << j << endl;
+  assert(j > 0);  // at least one segment must have been written
+  env->resize(j);
+}
+
diff --git a/training/dpmert/ces.h b/training/dpmert/ces.h
new file mode 100644
index 00000000..e4fa2080
--- /dev/null
+++ b/training/dpmert/ces.h
@@ -0,0 +1,16 @@
+#ifndef _CES_H_
+#define _CES_H_
+
+class ConvexHull;
+class Hypergraph;
+class SegmentEvaluator;
+class ErrorSurface;
+class EvaluationMetric;
+
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+                         const ConvexHull& convex_hull,
+                         ErrorSurface* es,
+                         const EvaluationMetric* metric,
+                         const Hypergraph& hg);
+
+#endif
diff --git a/training/dpmert/divide_refs.py b/training/dpmert/divide_refs.py
new file mode 100755
index 00000000..b478f918
--- /dev/null
+++ b/training/dpmert/divide_refs.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+import sys
+
+(numRefs, outPrefix) = sys.argv[1:]  # exactly two arguments: number of output files, output filename prefix
+numRefs = int(numRefs)
+
+outs = [open(outPrefix+str(i), "w") for i in range(numRefs)]  # one file per index: <outPrefix>0, <outPrefix>1, ...
+
+i = 0
+for line in sys.stdin:
+  outs[i].write(line)  # deal stdin lines round-robin across the output files
+  i = (i + 1) % numRefs
+
+for out in outs:
+  out.close()
diff --git a/training/dpmert/dpmert.pl b/training/dpmert/dpmert.pl
new file mode 100755
index 00000000..559420f5
--- /dev/null
+++ b/training/dpmert/dpmert.pl
@@ -0,0 +1,618 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use File::Basename qw(basename);
+require "libcall.pl";
+
+my $QSUB_CMD = qsub_args(mert_memory());
+
+# Default settings
+my $srcFile;  # deprecated
+my $refFiles; # deprecated
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+my $util_dir = "$SCRIPT_DIR/../utils";
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input";
+my $MAPPER = "$bin_dir/mr_dpmert_map";
+my $REDUCER = "$bin_dir/mr_dpmert_reduce";
+my $parallelize = "$util_dir/parallelize.pl";
+my $libcall = "$util_dir/libcall.pl";
+my $sentserver = "$util_dir/sentserver";
+my $sentclient = "$util_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 200;
+my $rand_directions = 15;
+my $iteration = 1;
+my $best_weights;
+my $max_iterations = 15;
+my $optimization_iters = 6;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "9g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $bleu_weight=1;
+my $use_make = 1;  # use make to parallelize line search
+my $useqsub;
+my $pass_suffix = '';
+my $devset;
+# Process command-line options
+if (GetOptions(
+	"config=s" => \$iniFile,
+	"weights=s" => \$initialWeights,
+        "devset=s" => \$devset,
+	"jobs=i" => \$jobs,
+	"pass-suffix=s" => \$pass_suffix,
+	"help" => \$help,
+	"qsub" => \$useqsub,
+	"iterations=i" => \$max_iterations,
+	"pmem=s" => \$pmem,
+	"random-directions=i" => \$rand_directions,
+	"metric=s" => \$metric,
+	"source-file=s" => \$srcFile,
+	"output-dir=s" => \$dir,
+) == 0 || @ARGV!=0 || $help) {
+	print_help();
+	exit;
+}
+
+if ($useqsub) {
+  $use_make = 0;
+  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (defined $srcFile || defined $refFiles) {
+  die <<EOT;
+
+  The options --ref-files and --source-file are no longer supported.
+  Please specify the input file and its reference translations with
+  --devset FILE
+
+EOT
+}
+
+if (!defined $iniFile) { push @missing_args, "--config"; }
+if (!defined $devset) { push @missing_args, "--devset"; }
+if (!defined $initialWeights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+  $lines_per_mapper = 40;
+} elsif ($metric =~ /^meteor$/i) {
+  $lines_per_mapper = 2000;   # start up time is really high for METEOR
+}
+
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+  $DIR_FLAG = '';
+}
+
+unless ($dir){
+	$dir = "dpmert";
+}
+unless ($dir =~ /^\//){  # convert relative path to absolute path
+	my $basedir = check_output("pwd");
+	chomp $basedir;
+	$dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {  # kills the recorded child pids, runs the queued cleanup commands, then exits with status 1
+	print STDERR "Cleanup...\n";
+	for my $pid (@childpids){ unchecked_call("kill $pid"); }
+	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+	exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit = sub{ cleanup(); }; 
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = basename($decoder); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+
+sub dirsize {  # NOTE(review): not called anywhere in the visible part of this script
+    opendir ISEMPTY,$_[0];  # result of opendir is not checked and the handle is never closed
+    return scalar(readdir(ISEMPTY))-1;  # NOTE(review): scalar-context readdir yields one entry name, not an entry count -- confirm intent
+}
+if (-e $dir) {
+	# allow preexisting logfile, binaries, but not dist-dpmert.pl outputs
+	die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n";
+} else {
+	mkdir "$dir" or die "Can't mkdir $dir: $!";
+	mkdir "$dir/hgs" or die;
+	mkdir "$dir/scripts" or die;
+	print STDERR <<EOT;
+	DECODER:          $decoder
+	INI FILE:         $iniFile
+	WORKING DIR:      $dir
+	DEVSET:           $devset
+	EVAL METRIC:      $metric
+	MAX ITERATIONS:   $max_iterations
+	PARALLEL JOBS:    $jobs
+	HEAD NODE:        $host
+	PMEM (DECODING):  $pmem
+	INITIAL WEIGHTS:  $initialWeights
+EOT
+}
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+check_call("cp $initialWeights $dir/weights.0");
+$iniFile = $newIniFile;
+
+split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs");
+my $refs = "-r $dir/dev.refs";
+my $newsrc = "$dir/dev.input";
+enseg("$dir/dev.input.raw", $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+	print STDERR "\n\nITERATION $iteration\n==========\n";
+
+	if ($iteration > $max_iterations){
+		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+		last;
+	}
+	# iteration-specific files
+	my $runFile="$dir/run.raw.$iteration";
+	my $onebestFile="$dir/1best.$iteration";
+	my $logdir="$dir/logs.$iteration";
+	my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+	my $scorerLog="$logdir/scorer.log.$iteration";
+	check_call("mkdir -p $logdir");
+
+
+	#decode
+	print STDERR "RUNNING DECODER AT ";
+	print STDERR unchecked_output("date");
+	my $im1 = $iteration - 1;
+	my $weightsFile="$dir/weights.$im1";
+	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+	my $pcmd;
+	if ($use_make) {
+		$pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+	} else {
+		$pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+	}
+	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+        my $num_hgs;
+        my $num_topbest;
+        my $retries = 0;
+	while($retries < 5) {
+	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+	    $num_topbest = check_output("wc -l < $runFile");
+	    print STDERR "NUMBER OF HGs: $num_hgs\n";
+	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+	    if($devSize == $num_hgs && $devSize == $num_topbest) {
+		last;
+	    } else {
+		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+		sleep(3);
+	    }
+	    $retries++;
+	}
+	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+	my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric");
+	chomp $dec_score;
+	print STDERR "DECODER SCORE: $dec_score\n";
+
+	# save space
+	check_call("gzip -f $runFile");
+	check_call("gzip -f $decoderLog");
+
+	# run optimizer
+	print STDERR "RUNNING OPTIMIZER AT ";
+	print STDERR unchecked_output("date");
+	my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+	my $score = 0;
+	my $icc = 0;
+	my $inweights="$dir/weights.$im1";
+	for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
+		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
+		print STDERR unchecked_output("date");
+		$icc++;
+		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
+		print STDERR "COMMAND:\n$cmd\n";
+		check_call($cmd);
+		check_call("mkdir -p $dir/splag.$im1");
+		$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
+		print STDERR "COMMAND:\n$cmd\n";
+		check_call($cmd);
+		opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+		my @shards = grep { /^mapinput\./ } readdir(DIR);
+		closedir DIR;
+		die "No shards!" unless scalar @shards > 0;
+		my $joblist = "";
+		my $nmappers = 0;
+		my @mapoutputs = ();
+		@cleanupcmds = ();
+		my %o2i = ();
+		my $first_shard = 1;
+		my $mkfile; # only used with makefiles
+		my $mkfilename;
+		if ($use_make) {
+			$mkfilename = "$dir/splag.$im1/domap.mk";
+			open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+			print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+		}
+		my @mkouts = ();  # only used with makefiles
+		for my $shard (@shards) {  # one mapper job per agenda shard
+			my $mapoutput = $shard;
+			my $client_name = $shard;
+			$client_name =~ s/mapinput\.//;
+			$client_name = "dpmert.$client_name";
+			$mapoutput =~ s/mapinput/mapoutput/;
+			push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+			$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";  # remember which input produced which output
+			my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+			if ($use_make) {
+				# make mode: write an executable script and add a makefile rule that runs it
+				my $script_file = "$dir/scripts/map.$shard";
+				open F, ">$script_file" or die "Can't write $script_file: $!";
+				print F "#!/bin/bash\n$script\n";
+				close F;
+				my $output = "$dir/splag.$im1/$mapoutput";
+				push @mkouts, $output;
+				chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+				print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+			} else {
+				my $script_file = "$dir/scripts/map.$shard";
+				open F, ">$script_file" or die "Can't write $script_file: $!";
+				print F "$script\n";
+				close F;
+				if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+				# qsub mode: submit the script and record the job id for polling and cleanup
+				$nmappers++;
+				my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+				my $jobid = check_output("$qcmd");
+				chomp $jobid;
+				$jobid =~ s/^(\d+)(.*?)$/$1/g;
+				$jobid =~ s/^Your job (\d+) .*$/$1/;
+				push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+				print STDERR " $jobid";
+				# string comparison: a numeric == would treat any non-numeric job list as empty
+				if ($joblist eq "") { $joblist = $jobid; } else { $joblist = $joblist . "\|" . $jobid; }
+			}
+		}
+		if ($use_make) {
+			print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+			close $mkfile;
+			my $mcmd = "make -j $jobs -f $mkfilename";
+			print STDERR "\nExecuting: $mcmd\n";
+			check_call($mcmd);
+		} else {
+			print STDERR "\nLaunched $nmappers mappers.\n";
+      			sleep 8;
+			print STDERR "Waiting for mappers to complete...\n";
+			while ($nmappers > 0) {
+			  sleep 5;
+			  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+			  $nmappers = scalar @livejobs;
+			}
+			print STDERR "All mappers complete.\n";
+		}
+		my $tol = 0;
+		my $til = 0;
+		for my $mo (@mapoutputs) {
+		  my $olines = get_lines($mo);
+		  my $ilines = get_lines($o2i{$mo});
+		  $tol += $olines;
+		  $til += $ilines;
+		  die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
+		}
+		print STDERR "Results for $tol/$til lines\n";
+		print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
+		print STDERR unchecked_output("date");
+		$cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";
+		print STDERR "COMMAND:\n$cmd\n";
+		check_bash_call($cmd);
+		$cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
+		# sort returns failure even when it doesn't fail for some reason
+		my $best=unchecked_output("$cmd"); chomp $best;
+		print STDERR "$best\n";
+		my ($oa, $x, $xscore) = split /\|/, $best;
+		$score = $xscore;
+		print STDERR "PROJECTED SCORE: $score\n";
+		if (abs($x) < $epsilon) {
+			print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
+			last;
+		}
+                my $psd = $score - $last_score;
+                $last_score = $score;
+		if (abs($psd) < $epsilon) {
+			print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
+			last;
+		}
+		my ($origin, $axis) = split /\s+/, $oa;
+
+		my %ori = convert($origin);
+		my %axi = convert($axis);
+
+		my $finalFile="$dir/weights.$im1-$opt_iter";
+		open W, ">$finalFile" or die "Can't write: $finalFile: $!";
+                my $norm = 0;
+		for my $k (sort keys %ori) {
+			my $dd = $ori{$k} + $axi{$k} * $x;
+                        $norm += $dd * $dd;
+		}
+                $norm = sqrt($norm);
+		$norm = 1;
+		for my $k (sort keys %ori) {
+			my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
+			print W "$k $v\n";
+		}
+		check_call("rm $dir/splag.$im1/*");
+		$inweights = $finalFile;
+	}
+	$lastWeightsFile = "$dir/weights.$iteration";
+	check_call("cp $inweights $lastWeightsFile");
+	if ($icc < 2) {
+		print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
+		last;
+	}
+	$lastPScore = $score;
+	$iteration++;
+	print STDERR "\n==========\n";
+}
+
+check_call("cp $lastWeightsFile $dir/weights.final");
+print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT "$dir/weights.final\n";
+exit 0;
+
+
+sub get_lines {  # returns the number of lines in file $fn; dies if the file cannot be opened
+  my $fn = shift @_;
+  open my $fh, '<', $fn or die "Couldn't read $fn: $!";  # lexical handle is closed automatically on return
+  my $lc = 0;
+  $lc++ while <$fh>;
+  return $lc;
+}
+
+sub read_weights_file {  # returns the second field of every data line in $file, joined by single spaces
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;  # NOTE(review): never reassigned below, so the ordering check always compares against -1
+  while(<F>) {
+    next if /^#/;  # skip comment lines
+    next if /^\s*$/;  # skip blank lines
+    chomp;
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;
+      my $w = $2;
+      die "Weights out of order: $m <= $pm" unless $m > $pm;  # NOTE(review): numeric comparison applied to the first field -- confirm it is meant to be a number
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+sub update_weights_file {
+  # Writes one "<feature> <weight>" line per index to the file named $neww.
+  # $rfn and $rpts are array references holding feature names and weights.
+  my ($neww, $rfn, $rpts) = @_;
+  my ($num_feats, $num_pts) = (scalar @$rfn, scalar @$rpts);
+  # Names and weights are paired by position, so the two lists must be equally long.
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open G, ">$neww" or die;
+  my $idx = 0;
+  while ($idx < $num_feats) {
+    print G "$rfn->[$idx] $rpts->[$idx]\n";
+    $idx++;
+  }
+  close G;
+}
+
+sub enseg {  # copies $src to $newsrc, wrapping every line that is not already a <seg> element in <seg id="N">...</seg>
+	my $src = shift;
+	my $newsrc = shift;
+	open(SRC, "<$src") or die "Can't read $src: $!";
+	open(NEWSRC, ">$newsrc") or die "Can't write $newsrc: $!";
+	my $i=0;  # zero-based line index, used as the generated segment id
+	while (my $line=<SRC>){
+		chomp $line;
+		if ($line =~ /^\s*<seg/i) {
+		    if($line =~ /id="[0-9]+"/) {  # pre-tagged lines pass through unchanged, but must carry a numeric id
+			print NEWSRC "$line\n";
+		    } else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		    }
+		} else {
+			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+		}
+		$i++;
+	}
+	close SRC;
+	close NEWSRC or die "Can't finish writing $newsrc: $!";  # surface buffered write errors such as a full disk
+}
+
+sub print_help {
+	# Prints the usage message to STDOUT; option names and defaults mirror the GetOptions table and the variable initialisers at the top of the script.
+	my $executable = basename($0); chomp $executable;
+	print << "Help";
+
+Usage: $executable [options]
+
+	$executable [options]
+		Runs a complete MERT optimization. Required options are --weights,
+		--devset, and --config.
+
+Options:
+
+	--config <file>   [-c <file>]
+		The decoder configuration file.
+
+	--devset <file>   [-d <file>]
+		The source *and* references for the development set.
+
+	--weights <file>  [-w <file>]
+		A file specifying initial feature weights.  The format is
+		FeatureName_1 value1
+		FeatureName_2 value2
+		**All and only the weights listed in <file> will be optimized!**
+
+	--metric <name>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+	--iterations <M>
+		Maximum number of iterations to run.  If not specified, defaults
+		to 15.
+
+	--pass-suffix <S>
+		If the decoder is doing multi-pass decoding, the pass suffix "2",
+		"3", etc., is used to control what iteration of weights is set.
+
+	--random-directions <num>
+		MERT will attempt to optimize along all of the principal directions,
+		set this parameter to explore other directions. Defaults to 15.
+
+	--output-dir <dir>
+		Directory for intermediate and output files.
+
+	--help
+		Print this message and exit.
+
+Job control options:
+
+	--jobs <I>
+		Number of decoder processes to run in parallel. [default=$default_jobs]
+
+	--qsub
+		Use qsub to run jobs in parallel (qsub must be configured in
+		environment/LocalConfig.pm)
+
+	--pmem <N>
+		Amount of physical memory requested for parallel decoding jobs
+		(used with qsub requests only)
+
+Help
+}
+
+sub convert {
+  # Parses "k1=v1;k2=v2;..." into a hash and returns it as a flat key/value list.
+  my ($str) = @_;
+  my %dict;
+  foreach my $pair (split /;/, $str) {
+    my ($key, $val) = split /=/, $pair;
+    $dict{$key} = $val;
+  }
+  return %dict;
+}
+
+
+
+sub cmdline {  # returns the script name and the original arguments joined by single spaces (no shell escaping)
+    return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+    # Returns $arg unchanged when it has no shell-special characters; otherwise
+    # backslash-escapes the characters that stay special inside double quotes
+    # and wraps the result in double quotes.  undef is passed through as undef.
+    my ($word) = @_;
+    return undef unless defined $word;
+    return $word unless $word =~ /$is_shell_special/;
+    $word =~ s/($shell_escape_in_quote)/\\$1/g; return "\"$word\"";
+}
+
+sub escaped_shell_args {  # returns a chomped, escape_shell()-processed copy of every argument; the caller's values are not modified
+    return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {  # same as escaped_shell_args, but joined into one space-separated string
+    return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {  # returns the script name followed by the shell-escaped original arguments
+    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {
+  my ($infile, $outsrc, $outref) = @_;  # input devset, output source file, output references file
+  open my $in, "<$infile" or die "Can't read $infile: $!";
+  open my $src_out, ">$outsrc" or die "Can't write $outsrc: $!";
+  open my $ref_out, ">$outref" or die "Can't write $outref: $!";
+  while (defined(my $line = <$in>)) {
+    chomp $line;
+    # each line is "source ||| ref1 ||| ref2 ..."; at least one reference is required
+    my ($src, @refs) = split /\s*\|\|\|\s*/, $line;
+    die "Malformed devset line: $line\n" unless @refs;
+    print $src_out "$src\n";
+    print $ref_out join(' ||| ', @refs) . "\n";
+  }
+  close $ref_out; close $src_out;
+  close $in;
+}
+
diff --git a/training/dpmert/error_surface.cc b/training/dpmert/error_surface.cc
new file mode 100644
index 00000000..515b67f8
--- /dev/null
+++ b/training/dpmert/error_surface.cc
@@ -0,0 +1,42 @@
+#include "error_surface.h"
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+ErrorSurface::~ErrorSurface() {}
+
+void ErrorSurface::Serialize(std::string* out) const {  // writes: int segment count, then per segment {x, 1-byte length, encoded delta}
+  const int segments = this->size();
+  ostringstream os(ios::binary);
+  os.write((const char*)&segments,sizeof(segments));
+  for (int i = 0; i < segments; ++i) {
+    const ErrorSegment& cur = (*this)[i];
+    string senc;
+    cur.delta.Encode(&senc);
+    assert(senc.size() < 256);  // the length is stored in a single unsigned char, so larger encodings would be truncated
+    const unsigned char len = static_cast<unsigned char>(senc.size());
+    os.write((const char*)&cur.x, sizeof(cur.x));
+    os.write((const char*)&len, sizeof(len));
+    os.write(senc.data(), len);
+  }
+  *out = os.str();  // raw bytes in host byte order
+}
+
+void ErrorSurface::Deserialize(const std::string& in) {  // rebuilds this surface from a string in the layout written by Serialize()
+  istringstream is(in, ios::binary);
+  int segments;
+  is.read((char*)&segments, sizeof(segments));  // NOTE(review): none of the reads in this function are checked for failure -- confirm inputs are always well-formed
+  this->resize(segments);
+  for (int i = 0; i < segments; ++i) {
+    ErrorSegment& cur = (*this)[i];
+    unsigned char len;  // one-byte length of the encoded statistics that follow
+    is.read((char*)&cur.x, sizeof(cur.x));
+    is.read((char*)&len, sizeof(len));
+    string senc(len, '\0'); assert(senc.size() == len);
+    is.read((char*)&senc[0], len);
+    cur.delta = SufficientStats(senc);  // decode the statistics from their string encoding
+  }
+}
+
diff --git a/training/dpmert/error_surface.h b/training/dpmert/error_surface.h
new file mode 100644
index 00000000..bb65847b
--- /dev/null
+++ b/training/dpmert/error_surface.h
@@ -0,0 +1,24 @@
+#ifndef _ERROR_SURFACE_H_
+#define _ERROR_SURFACE_H_
+
+#include <vector>
+#include <string>
+
+#include "ns.h"
+
+class Score;
+
+struct ErrorSegment {  // one entry of an ErrorSurface
+  double x;  // position of the segment boundary (copied from the MERTPoint's x in ComputeErrorSurface)
+  SufficientStats delta;  // change in sufficient statistics relative to the previously scored translation
+  ErrorSegment() : x(0), delta() {}
+};
+
+class ErrorSurface : public std::vector<ErrorSegment> {  // a vector of ErrorSegments with binary (de)serialization
+ public:
+  ~ErrorSurface();
+  void Serialize(std::string* out) const;  // writes the segments into *out as a binary string
+  void Deserialize(const std::string& in);  // replaces the contents with the segments read from a string produced by Serialize
+};
+
+#endif
diff --git a/training/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl
new file mode 100755
index 00000000..bc2bb24c
--- /dev/null
+++ b/training/dpmert/line_mediator.pl
@@ -0,0 +1,116 @@
+#!/usr/bin/perl -w
+#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication
+
+# if you don't know how to fork/exec in a C program, this could be helpful under limited circumstances (would be ok to liaise with sentserver)
+
+#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism.
+
+use strict;
+use IPC::Open2;
+use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO);
+
+my $quiet=!$ENV{DEBUG};
+$quiet=1 if $ENV{QUIET};
+sub info {
+    return if $quiet;            # diagnostics are suppressed while $quiet is set
+    local $,=' '; print STDERR @_;   # arguments are joined with single spaces
+}
+
+my $mode='CROSS';                   # topology: CROSS (default), PIPE or SNAKE
+my $ser='DIRECT';                   # transport: DIRECT (default) or SERIAL
+$mode='PIPE' if $ENV{PIPE};
+$mode='SNAKE' if $ENV{SNAKE};       # later assignments win: SNAKE beats PIPE ...
+$mode='CROSS' if $ENV{CROSS};       # ... and CROSS beats both
+$ser='SERIAL' if $ENV{SERIAL};
+$ser='DIRECT' if $ENV{DIRECT};      # DIRECT beats SERIAL ...
+$ser='SERIAL' if $mode eq 'SNAKE';  # ... except that SNAKE always runs SERIAL
+info("mode: $mode\n");
+info("connection: $ser\n");
+
+
+my @c1;                                        # words of the first command
+if (scalar @ARGV) {
+    do {
+        push @c1,shift
+    } while scalar @ARGV && $c1[$#c1] ne '--';  # stops right after consuming the separator
+}
+pop @c1;                                       # drop the '--' (without one, @c2 is empty and usage is shown)
+my @c2=@ARGV;                                  # everything after the separator is the second command
+@ARGV=();
+(scalar @c1 && scalar @c2) || die qq{
+usage: $0 cmd1 args -- cmd2 args
+all options are environment variables.
+DEBUG=1 env var enables debugging output.
+CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication.  crosses stdin/stdout of cmd1 and cmd2 line by line (both must flush on newline and output).  cmd1 initiates the conversation (sends the first line).    default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork).
+SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG.
+if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout.
+if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2).
+DIRECT=1 (default) will override SERIAL=1.
+CROSS=1 (default) will override SNAKE or PIPE.
+};
+
+info("1 cmd:",@c1,"\n");
+info("2 cmd:",@c2,"\n");
+
+sub lineto {
+    my $prev=select $_[0];   # make the target handle current so that $| applies to it
+    $|=1;                    # autoflush, so the line is written out immediately
+    shift; print @_;         # send everything after the handle argument
+    select $prev;            # restore the old default so later bare prints are unaffected
+}
+
+if ($ser eq 'SERIAL') {   # this script relays every line itself
+    my ($R1,$W1,$R2,$W2);
+    my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3.
+    my $c2p=open2($R2,$W2,@c2);
+    if ($mode eq 'CROSS') {
+        while(<$R1>) {
+            info("1:",$_);
+            lineto($W2,$_);
+            last unless defined ($_=<$R2>);
+            info("1|2:",$_);
+            lineto($W1,$_);
+        }
+    } else {
+        my $snake=$mode eq 'SNAKE';
+        while(<STDIN>) {
+            info("IN:",$_);
+            lineto($W1,$_);
+            last unless defined ($_=<$R1>);
+            info("IN|1:",$_);
+            lineto($W2,$_);
+            last unless defined ($_=<$R2>);
+            info("IN|1|2:",$_);
+            if ($snake) {
+                lineto($W1,$_);
+                last unless defined ($_=<$R1>);
+                info("IN|1|2|1:",$_);
+            }
+            lineto(*STDOUT,$_);
+        }
+    }
+} else {
+    info("DIRECT mode\n");
+    (my @rw1=POSIX::pipe()) or die "$0: pipe failed: $!\n";   # carries c1 stdout -> c2 stdin
+    (my @rw2=POSIX::pipe()) or die "$0: pipe failed: $!\n";   # carries c2 stdout -> c1 stdin (unused for PIPE)
+    my $pid=undef;
+    $SIG{CHLD} = sub { wait };
+    while (not defined ($pid=fork())) {
+        sleep 1;
+    }
+    my $pipe = $mode eq 'PIPE';
+    # dup2 replaces an already-open target, so stdin/stdout need no explicit close first.
+    # The original pipe ends must be closed, or neither command ever sees EOF from its peer.
+    my @spare=grep { $_ > 2 } @rw1,@rw2;
+    if ($pid) {
+        POSIX::dup2($rw1[1],STDOUT_FILENO);
+        POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe;
+        POSIX::close($_) for @spare;
+        exec @c1 or die "$0: cannot exec $c1[0]: $!\n";
+    } else {
+        POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe;
+        POSIX::dup2($rw1[0],STDIN_FILENO);
+        POSIX::close($_) for @spare;
+        exec @c2 or die "$0: cannot exec $c2[0]: $!\n";
+    }
+}
diff --git a/training/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc
new file mode 100644
index 00000000..9cf33502
--- /dev/null
+++ b/training/dpmert/line_optimizer.cc
@@ -0,0 +1,114 @@
+#include "line_optimizer.h"
+
+#include <limits>
+#include <algorithm>
+
+#include "sparse_vector.h"
+#include "ns.h"
+
+using namespace std;
+
+typedef ErrorSurface::const_iterator ErrorIter;
+
+// sort by increasing x-ints
+struct IntervalComp {
+  bool operator() (const ErrorIter& lhs, const ErrorIter& rhs) const {
+    return (*lhs).x < (*rhs).x;  // order segment iterators by boundary position
+  }
+};
+
+double LineOptimizer::LineOptimize(
+    const EvaluationMetric* metric,
+    const vector<ErrorSurface>& surfaces,
+    const LineOptimizer::ScoreType type,
+    float* best_score,
+    const double epsilon) {
+  // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << "  MINE=" << type << endl;
+  vector<ErrorIter> all_ints;
+  for (vector<ErrorSurface>::const_iterator i = surfaces.begin();
+       i != surfaces.end(); ++i) {
+    const ErrorSurface& surface = *i;
+    for (ErrorIter j = surface.begin(); j != surface.end(); ++j)
+      all_ints.push_back(j);
+  }
+  sort(all_ints.begin(), all_ints.end(), IntervalComp());
+  double last_boundary = all_ints.front()->x;
+  SufficientStats acc;
+  float& cur_best_score = *best_score;
+  cur_best_score = (type == MAXIMIZE_SCORE ?
+    -numeric_limits<float>::max() : numeric_limits<float>::max());
+  bool left_edge = true;
+  double pos = numeric_limits<double>::quiet_NaN();
+  for (vector<ErrorIter>::iterator i = all_ints.begin();
+       i != all_ints.end(); ++i) {
+    const ErrorSegment& seg = **i;
+    if (seg.x - last_boundary > epsilon) {
+      float sco = metric->ComputeScore(acc);
+      if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+          (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+        cur_best_score = sco;
+	if (left_edge) {
+	  pos = seg.x - 0.1;
+	  left_edge = false;
+	} else {
+	  pos = last_boundary + (seg.x - last_boundary) / 2;
+	}
+	//cerr << "NEW BEST: " << pos << "  (score=" << cur_best_score << ")\n";
+      }
+      // string xx = metric->DetailedScore(acc); cerr << "---- " << xx;
+#undef SHOW_ERROR_SURFACES
+#ifdef SHOW_ERROR_SURFACES
+      cerr << "x=" << seg.x << "\ts=" << sco << "\n";
+#endif
+      last_boundary = seg.x;
+    }
+    // cerr << "x-boundary=" << seg.x << "\n";
+    //string x2; acc.Encode(&x2); cerr << "   ACC: " << x2 << endl;
+    //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl;
+    acc += seg.delta;
+  }
+  float sco = metric->ComputeScore(acc);
+  if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+      (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+    cur_best_score = sco;
+    if (left_edge) {
+      pos = 0;
+    } else {
+      pos = last_boundary + 1000.0;
+    }
+  }
+  return pos;
+}
+
+void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize,
+                                     SparseVector<double>* axis,
+                                     RandomNumberGenerator<boost::mt19937>* rng) {
+  axis->clear();
+  for (vector<int>::const_iterator f = features_to_optimize.begin(); f != features_to_optimize.end(); ++f)
+    axis->set_value(*f, rng->NextNormal(0.0,1.0));  // one NextNormal(0,1) draw per listed feature
+  (*axis) /= axis->l2norm();  // rescale by the vector's L2 norm
+}
+
+// Fills *dirs with one axis-aligned unit vector per feature (only when
+// include_orthogonal is set) followed by additional_random_directions random unit vectors.
+void LineOptimizer::CreateOptimizationDirections(
+     const vector<int>& features_to_optimize,
+     int additional_random_directions,
+     RandomNumberGenerator<boost::mt19937>* rng,
+     vector<SparseVector<double> >* dirs,
+     bool include_orthogonal) {
+  dirs->clear();
+  if (include_orthogonal) {
+    for (size_t f = 0; f < features_to_optimize.size(); ++f) {
+      SparseVector<double> unit;
+      unit.set_value(features_to_optimize[f],1.);
+      dirs->push_back(unit);
+    }
+  }
+  const size_t first_random = dirs->size();
+  dirs->resize(first_random + additional_random_directions);
+  for (size_t r = first_random; r < dirs->size(); ++r)
+    RandomUnitVector(features_to_optimize, &(*dirs)[r], rng);
+  cerr << "Generated " << dirs->size() << " total axes to optimize along.\n";
+}
+
diff --git a/training/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h
new file mode 100644
index 00000000..83819f41
--- /dev/null
+++ b/training/dpmert/line_optimizer.h
@@ -0,0 +1,48 @@
+#ifndef LINE_OPTIMIZER_H_
+#define LINE_OPTIMIZER_H_
+
+#include <vector>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "sampler.h"
+
+class EvaluationMetric;
+class Weights;
+
+struct LineOptimizer {
+
+  // use MINIMIZE_SCORE for things like TER, WER
+  // MAXIMIZE_SCORE for things like BLEU
+  enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE };
+
+  // merge all the error surfaces together into a global error surface and
+  // return a position inside the best segment; its score is stored in *best_score
+  static double LineOptimize(
+     const EvaluationMetric* metric,
+     const std::vector<ErrorSurface>& envs,
+     const LineOptimizer::ScoreType type,
+     float* best_score,
+     const double epsilon = 1.0/65536.0);  // boundaries no more than epsilon apart are not scored separately
+
+  // return a random vector of length 1 where all dimensions
+  // not listed in dimensions will be 0.
+  static void RandomUnitVector(const std::vector<int>& dimensions,
+                               SparseVector<double>* axis,
+                               RandomNumberGenerator<boost::mt19937>* rng);
+
+  // generate a list of directions to optimize; when include_primary is true the
+  // list starts with the orthogonal vectors corresponding to the dimensions in
+  // primary; it always ends with additional_random_directions random directions
+  // in those dimensions.  All vectors will be length 1.
+  static void CreateOptimizationDirections(
+     const std::vector<int>& primary,
+     int additional_random_directions,
+     RandomNumberGenerator<boost::mt19937>* rng,
+     std::vector<SparseVector<double> >* dirs
+     , bool include_primary=true
+    );
+
+};
+
+#endif
diff --git a/training/dpmert/lo_test.cc b/training/dpmert/lo_test.cc
new file mode 100644
index 00000000..95a08d3d
--- /dev/null
+++ b/training/dpmert/lo_test.cc
@@ -0,0 +1,229 @@
+#define BOOST_TEST_MODULE LineOptimizerTest
+#include <boost/test/unit_test.hpp>
+#include <boost/test/floating_point_comparison.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <fstream>
+
+#include <boost/shared_ptr.hpp>
+
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "ces.h"
+#include "fdict.h"
+#include "hg.h"
+#include "kbest.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "inside_outside.h"
+#include "viterbi.h"
+#include "mert_geometry.h"
+#include "line_optimizer.h"
+
+using namespace std;
+
+const char* ref11 = "australia reopens embassy in manila";
+const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .";
+const char* ref21 = "australia reopened manila embassy";
+const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .";
+const char* ref31 = "australia to reopen embassy in manila";
+const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats .";
+const char* ref41 = "australia to re - open its embassy to manila";
+const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago .";
+
+BOOST_AUTO_TEST_CASE( TestCheckNaN) {
+  // dividing zero by zero must give a value that std::isnan reports as NaN
+  double numerator = 0, denominator = 0;
+  double quotient = numerator / denominator;
+  BOOST_CHECK_EQUAL(true, std::isnan(quotient));
+}
+
+BOOST_AUTO_TEST_CASE(TestConvexHull) {
+  typedef boost::shared_ptr<MERTPoint> PointPtr;
+  // two hulls of two lines each, every line given as (slope, intercept)
+  vector<PointPtr> first_lines, second_lines;
+  first_lines.push_back(PointPtr(new MERTPoint(-1, 0)));
+  first_lines.push_back(PointPtr(new MERTPoint(1, 0)));
+  second_lines.push_back(PointPtr(new MERTPoint(-1, 1)));
+  second_lines.push_back(PointPtr(new MERTPoint(1, -1)));
+  ConvexHull a(first_lines);
+  cerr << a << endl;
+  ConvexHull b(second_lines);
+  ConvexHull c = a; c *= b;  // semiring product of the two hulls
+  cerr << a << " (*) " << b << " = " << c << endl;
+  BOOST_CHECK_EQUAL(3, c.size());
+}
+
+BOOST_AUTO_TEST_CASE(TestConvexHullInside) {  // smoke test: prints results to stderr, contains no BOOST_CHECK
+  const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}";
+  Hypergraph hg;
+  istringstream instr(json);
+  HypergraphIO::ReadFromJSON(&instr, &hg);
+  SparseVector<double> wts;  // starting weights: f1=0.4, f2=1.0
+  wts.set_value(FD::Convert("f1"), 0.4);
+  wts.set_value(FD::Convert("f2"), 1.0);
+  hg.Reweight(wts);
+  vector<pair<vector<WordID>, prob_t> > list;  // not used below
+  std::vector<SparseVector<double> > features;  // not used below
+  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+  for (int i = 0; i < 10; ++i) {  // print up to 10 derivations under the starting weights
+    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+      kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+    if (!d) break;
+    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+  }
+  SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0);  // search direction: f1 only
+  ConvexHullWeightFunction wf(wts, dir);
+  ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+  cerr << env << endl;
+  const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs();
+  dir *= segs[1]->x;  // indexes segs[1] without checking that a second segment exists
+  wts += dir;         // move the weights to the start of the second segment
+  hg.Reweight(wts);
+  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10);
+  for (int i = 0; i < 10; ++i) {  // print the derivations again under the moved weights
+    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+      kbest2.LazyKthBest(hg.nodes_.size() - 1, i);
+    if (!d) break;
+    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+  }
+  for (unsigned i = 0; i < segs.size(); ++i) {  // print the translation attached to every segment
+    cerr << "seg=" << i << endl;
+    vector<WordID> trans;
+    segs[i]->ConstructTranslation(&trans);
+    cerr << TD::GetString(trans) << endl;
+  }
+}
+
+BOOST_AUTO_TEST_CASE( TestS1) {  // line optimization over two forests read from test_data
+  int fPhraseModel_0 = FD::Convert("PhraseModel_0");
+  int fPhraseModel_1 = FD::Convert("PhraseModel_1");
+  int fPhraseModel_2 = FD::Convert("PhraseModel_2");
+  int fLanguageModel = FD::Convert("LanguageModel");
+  int fWordPenalty = FD::Convert("WordPenalty");
+  int fPassThrough = FD::Convert("PassThrough");
+  SparseVector<double> wts;
+  wts.set_value(fWordPenalty, 4.25);
+  wts.set_value(fLanguageModel, -1.1165);
+  wts.set_value(fPhraseModel_0, -0.96);
+  wts.set_value(fPhraseModel_1, -0.65);
+  wts.set_value(fPhraseModel_2, -0.77);
+  wts.set_value(fPassThrough, -10.0);
+
+  vector<int> to_optimize;  // PassThrough is deliberately not listed
+  to_optimize.push_back(fWordPenalty);
+  to_optimize.push_back(fLanguageModel);
+  to_optimize.push_back(fPhraseModel_0);
+  to_optimize.push_back(fPhraseModel_1);
+  to_optimize.push_back(fPhraseModel_2);
+
+  std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : "test_data");
+
+  Hypergraph hg;
+  ReadFile rf(path + "/0.json.gz");
+  HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+  hg.Reweight(wts);
+
+  Hypergraph hg2;
+  ReadFile rf2(path + "/1.json.gz");
+  HypergraphIO::ReadFromJSON(rf2.stream(), &hg2);
+  hg2.Reweight(wts);
+
+  vector<vector<WordID> > refs1(4);
+  TD::ConvertSentence(ref11, &refs1[0]);
+  TD::ConvertSentence(ref21, &refs1[1]);
+  TD::ConvertSentence(ref31, &refs1[2]);
+  TD::ConvertSentence(ref41, &refs1[3]);
+  vector<vector<WordID> > refs2(4);
+  TD::ConvertSentence(ref12, &refs2[0]);
+  TD::ConvertSentence(ref22, &refs2[1]);
+  TD::ConvertSentence(ref32, &refs2[2]);
+  TD::ConvertSentence(ref42, &refs2[3]);
+  vector<ConvexHull> envs(2);
+
+  RandomNumberGenerator<boost::mt19937> rng;
+
+  vector<SparseVector<double> > axes; // directions to search
+  LineOptimizer::CreateOptimizationDirections(
+     to_optimize,
+     10,
+     &rng,
+     &axes);
+  // REQUIRE (not assert): stays active under NDEBUG and stops this test before axes[0] is read
+  BOOST_REQUIRE_EQUAL(axes.size(), 10 + to_optimize.size());
+  for (unsigned i = 0; i < axes.size(); ++i)
+    cerr << axes[i] << endl;
+  const SparseVector<double>& axis = axes[0];
+
+  cerr << "Computing Viterbi envelope using inside algorithm...\n";
+  cerr << "axis: " << axis << endl;
+  clock_t t_start=clock();
+  ConvexHullWeightFunction wf(wts, axis);  // wts = starting point, axis = search direction
+  envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+  envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf);
+
+  vector<ErrorSurface> es(2);
+  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1);
+  boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2);
+  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+  ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2);
+  cerr << envs[0].size() << " " << envs[1].size() << endl;
+  cerr << es[0].size() << " " << es[1].size() << endl;
+  envs.clear();
+  clock_t t_env=clock();
+  float score;
+  double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score);
+  clock_t t_opt=clock();
+  cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
+  BOOST_CHECK_CLOSE(0.48719698, score, 1e-5);
+  SparseVector<double> res = axis;  // res = wts + m * axis
+  res *= m;
+  res += wts;
+  cerr << "res: " << res << endl;
+  cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl;
+  cerr << "  LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl;
+  hg.Reweight(res);
+  hg2.Reweight(res);
+  vector<WordID> t1,t2;
+  ViterbiESentence(hg, &t1);
+  ViterbiESentence(hg2, &t2);
+  cerr << TD::GetString(t1) << endl;
+  cerr << TD::GetString(t2) << endl;
+}
+
+BOOST_AUTO_TEST_CASE(TestZeroOrigin) {  // smoke test: contains no BOOST_CHECK, it only has to run through
+  const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| 
[1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}";
+  Hypergraph hg;
+  istringstream instr(json);
+  HypergraphIO::ReadFromJSON(&instr, &hg);
+  SparseVector<double> wts;  // only PassThrough gets a weight here
+  wts.set_value(FD::Convert("PassThrough"), -0.929201533002898);
+  hg.Reweight(wts);
+
+  vector<pair<vector<WordID>, prob_t> > list;  // not used below
+  std::vector<SparseVector<double> > features;  // not used below
+  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+  for (int i = 0; i < 10; ++i) {  // print up to 10 derivations
+    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+      kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+    if (!d) break;
+    cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+  }
+ 
+  SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0);
+  ConvexHullWeightFunction wf(wts, axis);  // wts = starting point, axis = search direction
+  vector<ConvexHull> envs(1);
+  envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+
+  vector<vector<WordID> > mr(4);  // four reference translations
+  TD::ConvertSentence("untitled", &mr[0]);
+  TD::ConvertSentence("with no title", &mr[1]);
+  TD::ConvertSentence("without a title", &mr[2]);
+  TD::ConvertSentence("without title", &mr[3]);
+  EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+  boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr);
+  vector<ErrorSurface> es(1);
+  ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);  // the resulting surface is not inspected
+}
+
diff --git a/training/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc
new file mode 100644
index 00000000..d6973658
--- /dev/null
+++ b/training/dpmert/mert_geometry.cc
@@ -0,0 +1,185 @@
+#include "mert_geometry.h"
+
+#include <cassert>
+#include <limits>
+
+using namespace std;
+
+ConvexHull::ConvexHull(int i) : is_sorted(true) {  // was left uninitialised; 0 or 1 points need no sorting
+  if (i == 0) {
+    // do nothing - <>
+  } else if (i == 1) {
+    points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>())));
+    assert(this->IsMultiplicativeIdentity());
+  } else {
+    cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n";
+    abort();
+  }
+}
+
+const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const {
+  // slope comes from the search direction, intercept from the origin weights
+  const double slope = direction.dot(e.feature_values_);
+  const double intercept = origin.dot(e.feature_values_);
+  return ConvexHull(1, new MERTPoint(slope, intercept, e));
+}
+
+ostream& operator<<(ostream& os, const ConvexHull& env) {
+  os << '<';
+  const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs();
+  for (size_t k = 0; k < segs.size(); ++k)  // segments are separated by '|'
+    os << (k ? "|" : "") << "x=" << segs[k]->x << ",b=" << segs[k]->b << ",m=" << segs[k]->m << ",p1=" << segs[k]->p1 << ",p2=" << segs[k]->p2;
+  return os << '>';
+}
+
+#define ORIGINAL_MERT_IMPLEMENTATION 1
+#ifdef ORIGINAL_MERT_IMPLEMENTATION
+
+struct SlopeCompare {
+  bool operator() (const boost::shared_ptr<MERTPoint>& lhs, const boost::shared_ptr<MERTPoint>& rhs) const {
+    return (*lhs).m < (*rhs).m;  // order lines by increasing slope
+  }
+};
+
+const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) {
+  // semiring addition: pool the lines of both hulls; pruning is left to Sort()
+  if (!other.is_sorted) { other.Sort(); }
+  if (!points.empty()) {
+    is_sorted = false;
+    const int offset = points.size(), extra = other.points.size();
+    points.resize(offset + extra);
+    for (int k = 0; k < extra; ++k) { points[offset + k] = other.points[k]; }
+    assert(offset + extra == static_cast<int>(points.size()));
+    return *this;
+  }
+  points = other.points;
+  return *this;
+}
+
+void ConvexHull::Sort() const {  // rewrites points and is_sorted, which the header declares mutable
+  sort(points.begin(), points.end(), SlopeCompare());  // ascending slope
+  const int k = points.size();
+  int j = 0;  // number of lines kept so far; points[0..j) is the hull being built
+  for (int i = 0; i < k; ++i) {
+    MERTPoint l = *points[i];  // work on a copy; it is written back into slot j below
+    l.x = kMinusInfinity;
+    // the candidate has no left boundary until an intersection is computed below
+    if (0 < j) {
+      if (points[j-1]->m == l.m) {   // lines are parallel
+        if (l.b <= points[j-1]->b) continue;  // the kept line has the larger-or-equal intercept: drop l
+        --j;  // otherwise l replaces the kept line
+      }
+      while(0 < j) {
+        l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m);  // x where l crosses the last kept line
+        if (points[j-1]->x < l.x) break;  // crossing lies right of that line's start: keep both
+        --j;  // otherwise the last kept line is discarded
+      }
+      if (0 == j) l.x = kMinusInfinity;  // l became the leftmost line
+    }
+    *points[j++] = l;  // overwrites the pointed-to MERTPoint, not the pointer itself
+  }
+  points.resize(j);
+  is_sorted = true;
+}
+
+const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) {  // semiring multiplication
+  if (other.IsMultiplicativeIdentity()) { return *this; }
+  if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; }
+
+  if (!is_sorted) Sort();
+  if (!other.is_sorted) other.Sort();
+
+  if (this->IsEdgeEnvelope()) {
+    // this hull is a single line that carries an edge: add its slope and
+    // intercept to every line of other, keeping other's x boundaries
+    boost::shared_ptr<MERTPoint> edge_parent = points[0];
+    const double& edge_b = edge_parent->b;
+    const double& edge_m = edge_parent->m;
+    points.clear();  // edge_parent keeps the old point alive, so the references above stay valid
+    for (int i = 0; i < other.points.size(); ++i) {
+      const MERTPoint& p = *other.points[i];
+      const double m = p.m + edge_m;
+      const double b = p.b + edge_b;
+      const double& x = p.x;       // x's don't change with *
+      points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i])));
+      assert(points.back()->p1->edge);
+    }
+    // each new point records the edge point as p1 and the matching point of
+    // other as p2
+  } else {
+    vector<boost::shared_ptr<MERTPoint> > new_points;
+    int this_i = 0;
+    int other_i = 0;
+    const int this_size  = points.size();
+    const int other_size = other.points.size();
+    double cur_x = kMinusInfinity;   // moves from left to right across the
+                                     // real numbers, stopping for all inter-
+                                     // sections
+    double this_next_val  = (1 < this_size  ? points[1]->x       : kPlusInfinity);
+    double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity);
+    while (this_i < this_size && other_i < other_size) {
+      const MERTPoint& this_point = *points[this_i];
+      const MERTPoint& other_point= *other.points[other_i];
+      const double m = this_point.m + other_point.m;  // sum of the two lines active at cur_x
+      const double b = this_point.b + other_point.b;
+ 
+      new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i])));
+      int comp = 0;  // -1: this hull changes line next, 1: other does, 0: both at the same x
+      if (this_next_val < other_next_val) comp = -1; else
+        if (this_next_val > other_next_val) comp = 1;
+      if (0 == comp) {  // the next values are equal, advance both indices
+        ++this_i;
+	++other_i;
+        cur_x = this_next_val;  // could be other_next_val (they're equal!)
+        this_next_val  = (this_i+1  < this_size  ? points[this_i+1]->x        : kPlusInfinity);
+        other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
+      } else {  // advance the i with the lower x, update cur_x
+        if (-1 == comp) {
+          ++this_i;
+          cur_x = this_next_val;
+          this_next_val =  (this_i+1  < this_size  ? points[this_i+1]->x        : kPlusInfinity);
+        } else {
+          ++other_i;
+          cur_x = other_next_val;
+          other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
+        }
+      }
+    }
+    points.swap(new_points);  // the merged segments replace this hull's points
+  }
+  // is_sorted is not modified after the Sort() calls above
+  return *this;
+}
+
+// recursively construct translation
+void MERTPoint::ConstructTranslation(vector<WordID>* trans) const {
+  // follow the p1 chain down to the point that owns an edge, translating each p2 on the way
+  vector<vector<WordID> > collected;
+  const MERTPoint* node = this;
+  for (; !node->edge; node = node->p1.get()) {
+    collected.push_back(vector<WordID>());
+    node->p2->ConstructTranslation(&collected.back());
+  }
+  const size_t n = collected.size();
+  assert(n == node->edge->tail_nodes_.size());
+  // the rule receives the sub-translations in the reverse of collection order
+  vector<const vector<WordID>*> antecedents(n);
+  for (size_t k = 0; k < n; ++k) antecedents[n - 1 - k] = &collected[k];
+  node->edge->rule_->ESubstitute(antecedents, trans);
+}
+
+void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const {
+  std::vector<bool>& used = *edges_used;
+  if (edge) {  // flag the hypergraph edge attached to this point, if there is one
+    assert(edge->id_ < used.size()); used[edge->id_] = true;
+  }
+  if (p1) { p1->CollectEdgesUsed(&used); }
+  if (p2) { p2->CollectEdgesUsed(&used); }
+}
+
+#else
+
+// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS
+
+#endif
+
diff --git a/training/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h
new file mode 100644
index 00000000..a8b6959e
--- /dev/null
+++ b/training/dpmert/mert_geometry.h
@@ -0,0 +1,81 @@
+#ifndef _MERT_GEOMETRY_H_
+#define _MERT_GEOMETRY_H_
+
+#include <vector>
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+
+#include "hg.h"
+#include "sparse_vector.h"
+
+static const double kMinusInfinity = -std::numeric_limits<double>::infinity();
+static const double kPlusInfinity = std::numeric_limits<double>::infinity();
+
+struct MERTPoint {
+  MERTPoint() : x(), m(), b(), edge() {}  // value-initializes x, m, b and edge; both parents stay null
+  MERTPoint(double _m, double _b) :  // line without an edge or parents
+    x(kMinusInfinity), m(_m), b(_b), edge() {}
+  MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) :
+    x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {}  // line derived from two parent points
+  MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) :  // line for one hypergraph edge
+    x(kMinusInfinity), m(_m), b(_b), edge(&edge) {}  // stores the address of the edge, not a copy
+
+  double x;                   // x intersection with previous segment in env, or -inf if none
+  double m;                   // this line's slope
+  double b;                   // intercept with y-axis
+
+  // we keep a pointer to the "parents" of this segment so we can reconstruct
+  // the Viterbi translation corresponding to this segment
+  boost::shared_ptr<MERTPoint> p1;
+  boost::shared_ptr<MERTPoint> p2;
+
+  // only MERTPoints created from an edge using the ConvexHullWeightFunction
+  // have rules
+  // TRulePtr rule;
+  const Hypergraph::Edge* edge;
+
+  // recursively recover the Viterbi translation that will result from setting
+  // the weights to origin + axis * x, where x is any value from this->x up
+  // until the next largest x in the containing ConvexHull
+  void ConstructTranslation(std::vector<WordID>* trans) const;
+  void CollectEdgesUsed(std::vector<bool>* edges_used) const;  // sets (*edges_used)[id] for every edge reachable from this point
+};
+
+// this is the semiring value type,
+// it defines constructors for 0, 1, and the operations + and *
+struct ConvexHull {
+  // create semiring zero
+  ConvexHull() : is_sorted(true) {}  // zero
+  // for debugging:
+  ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); }
+  // create semiring 1 or 0
+  explicit ConvexHull(int i);
+  ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {}  // n entries sharing ownership of point
+  const ConvexHull& operator+=(const ConvexHull& other);  // semiring addition
+  const ConvexHull& operator*=(const ConvexHull& other);  // semiring multiplication
+  bool IsMultiplicativeIdentity() const {  // exactly one point with m == b == 0, no edge and no parents
+    return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); }
+  const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const {
+    if (!is_sorted) Sort();  // sorting is lazy
+    return points;
+  }
+  size_t size() const { return points.size(); }  // counts stored points without sorting first
+
+ private:
+  bool IsEdgeEnvelope() const {  // exactly one point, and it carries an edge
+    return points.size() == 1 && points[0]->edge; }
+  void Sort() const;
+  mutable bool is_sorted;  // mutable so that the const Sort() can update it
+  mutable std::vector<boost::shared_ptr<MERTPoint> > points;
+};
+std::ostream& operator<<(std::ostream& os, const ConvexHull& env);
+
+struct ConvexHullWeightFunction {  // maps a hypergraph edge to a one-line ConvexHull
+  ConvexHullWeightFunction(const SparseVector<double>& ori,
+                           const SparseVector<double>& dir) : origin(ori), direction(dir) {}  // both vectors are copied
+  const ConvexHull operator()(const Hypergraph::Edge& e) const;
+  const SparseVector<double> origin;     // its dot product with the edge features gives the intercept
+  const SparseVector<double> direction;  // its dot product with the edge features gives the slope
+};
+
+#endif
diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc
new file mode 100644
index 00000000..199cd23a
--- /dev/null
+++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc
@@ -0,0 +1,81 @@
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "line_optimizer.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the command line into *conf.  Names each missing required option
+// (-s, -w, -r) on stderr; if any is missing, or --help was given, prints the
+// usage text and exits with status 1.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)")
+        ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+        ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+        ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+        ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  const bool no_dev_size = conf->count("dev_set_size") == 0;
+  const bool no_weights = conf->count("weights") == 0;
+  const bool no_forests = conf->count("forest_repository") == 0;
+  // Report every missing option, not just the first, so they can all be fixed at once.
+  if (no_dev_size)
+    cerr << "Please specify the size of the development set using -s N\n";
+  if (no_weights)
+    cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+  if (no_forests)
+    cerr << "Please specify the forest repository location using -r <DIR>\n";
+  if (no_dev_size || no_weights || no_forests || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// Writes one mapper-input line per (dev sentence, search direction) pair:
+//   <forest_repository>/<i>.json.gz <i> <origin> <direction>
+int main(int argc, char** argv) {
+  RandomNumberGenerator<boost::mt19937> rng;
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  vector<string> features;
+  vector<weight_t> w;
+  Weights::InitFromFile(conf["weights"].as<string>(), &w, &features);
+  SparseVector<weight_t> origin;
+  Weights::InitSparseVector(w, &origin);
+  const string forest_repository = conf["forest_repository"].as<string>();
+  if (!DirectoryExists(forest_repository)) {
+    cerr << "Forest repository directory " << forest_repository << " not found!\n";
+    return 1;
+  }
+  // An explicit -o list replaces whatever InitFromFile stored in features.
+  if (conf.count("optimize_feature") > 0)
+    features = conf["optimize_feature"].as<vector<string> >();
+  vector<int> fids(features.size());
+  for (size_t f = 0; f < features.size(); ++f)
+    fids[f] = FD::Convert(features[f]);
+  vector<SparseVector<weight_t> > directions;
+  const unsigned num_random = conf["random_directions"].as<unsigned int>();
+  LineOptimizer::CreateOptimizationDirections(fids, num_random, &rng, &directions);
+  const unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
+  for (unsigned sent = 0; sent < dev_set_size; ++sent) {
+    for (unsigned d = 0; d < directions.size(); ++d) {
+      cout << forest_repository << '/' << sent << ".json.gz " << sent << ' ';
+      print(cout, origin, "=", ";");
+      cout << ' ';
+      print(cout, directions[d], "=", ";");
+      cout << endl;
+    }
+  }
+  return 0;
+}
diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc
new file mode 100644
index 00000000..d1efcf96
--- /dev/null
+++ b/training/dpmert/mr_dpmert_map.cc
@@ -0,0 +1,112 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "ces.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "sparse_vector.h"
+#include "mert_geometry.h"
+#include "inside_outside.h"
+#include "error_surface.h"
+#include "b64tools.h"
+#include "hg_io.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the mapper's command line into *conf; at least one -r reference is
+// required.  On a missing reference or --help, prints usage and exits with 1.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+        ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  const bool missing_reference = conf->count("reference") == 0;
+  if (missing_reference)
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+  if (missing_reference || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// Parses s, a ';'-separated list of "name=value" fields, into *v (one
+// set_value call per field).  Returns false when s has no fields or a field
+// does not tokenize into exactly two parts on '='; values set before the
+// offending field are left in *v.
+// NOTE: `std::istringstream i(s); i >> *v;` should also work, but is untested.
+bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
+  vector<string> entries;
+  Tokenize(s, ';', &entries);
+  if (entries.empty()) return false;
+  for (vector<string>::const_iterator entry = entries.begin(); entry != entries.end(); ++entry) {
+    vector<string> name_value(2);
+    Tokenize(*entry, '=', &name_value);
+    if (name_value.size() != 2) {
+      cerr << "Error parsing vector string: " << *entry << endl;
+      return false;
+    }
+    // atof yields 0.0 for a non-numeric value; that case is not reported.
+    v->set_value(FD::Convert(name_value[0]), atof(name_value[1].c_str()));
+  }
+  return true;
+}
+
+// Mapper: each input line is "<forest file> <sent_id> <origin> <direction>"; for each
+// one, writes "M <origin> <direction>\t<base64-encoded error surface>" to stdout.
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+  Hypergraph hg;
+  string last_file;  // file whose forest is currently in hg; reloaded only when the file changes
+  ReadFile in_read(conf["input"].as<string>());
+  istream &in=*in_read.stream();
+  string line;
+  while (getline(in, line)) {
+    if (line.empty()) continue;
+    istringstream is(line);
+    int sent_id = -1;
+    string file, s_origin, s_direction;
+    SparseVector<double> origin, direction;
+    // path-to-file (JSON) sent_id starting-point search-direction
+    const bool parsed = (is >> file >> sent_id >> s_origin >> s_direction) &&
+        ReadSparseVectorString(s_origin, &origin) && ReadSparseVectorString(s_direction, &direction);
+    // sent_id indexes ds below, so reject anything outside [0, ds.size()).
+    if (!parsed || sent_id < 0 || static_cast<size_t>(sent_id) >= ds.size()) {
+      cerr << "Bad input line (unparsable, or sentence id out of range)! Skipping: " << line << endl;
+      continue;
+    }
+    if (last_file != file) {
+      last_file = file;
+      ReadFile rf(file);
+      HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    }
+    const ConvexHullWeightFunction wf(origin, direction);
+    const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+    ErrorSurface es;
+    ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg);
+    string val;
+    es.Serialize(&val);
+    cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t';
+    B64::b64encode(val.c_str(), val.size(), &cout);
+    cout << endl << flush;
+  }
+  return 0;
+}
diff --git a/training/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc
new file mode 100644
index 00000000..31512a03
--- /dev/null
+++ b/training/dpmert/mr_dpmert_reduce.cc
@@ -0,0 +1,77 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "b64tools.h"
+#include "stringlib.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses the reducer's command line into *conf; prints usage and exits with status 1 if -m is absent or --help is given.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("evaluation_metric,m",po::value<string>(), "[REQD] Evaluation metric (IBM_BLEU, etc.)")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("evaluation_metric") == 0 || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// Reducer: for each run of consecutive "M <key>\t<base64 error surface>" lines sharing a key, prints "<key>|<x>|<score>".
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  const LineOptimizer::ScoreType opt_type = metric->IsErrorMetric() ? LineOptimizer::MINIMIZE_SCORE : LineOptimizer::MAXIMIZE_SCORE;
+  vector<ErrorSurface> esv;
+  string last_key, line, key, val;
+  while(getline(cin, line)) {
+    const size_t ks = line.find("\t");
+    if (ks == string::npos || ks <= 2) {  // checked at run time so it also holds in NDEBUG builds
+      cerr << "Malformed input line (expected 'M <key>\\t<value>'): " << line << endl;
+      return 1;
+    }
+    key = line.substr(2, ks - 2);
+    val = line.substr(ks + 1);
+    if (key != last_key) {
+      if (!esv.empty()) {  // same guard as the final flush: never optimize over zero surfaces
+        float score;
+        double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
+        cout << last_key << "|" << x << "|" << score << endl;
+      }
+      last_key.swap(key);
+      esv.clear();
+    }
+    if (val.empty() || val.size() % 4 != 0) {
+      cerr << "B64 encoding error 1! Skipping.\n";
+      continue;
+    }
+    string encoded(val.size() / 4 * 3, '\0');
+    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+      cerr << "B64 encoding error 2! Skipping.\n";
+      continue;
+    }
+    esv.push_back(ErrorSurface());
+    esv.back().Deserialize(encoded);
+  }
+  if (!esv.empty()) {
+    float score;
+    double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
+    cout << last_key << "|" << x << "|" << score << endl;
+  }
+  return 0;
+}
diff --git a/training/dpmert/test_aer/README b/training/dpmert/test_aer/README
new file mode 100644
index 00000000..819b2e32
--- /dev/null
+++ b/training/dpmert/test_aer/README
@@ -0,0 +1,8 @@
+To run the test:
+
+../dpmert.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights
+
+This will optimize the parameters of the tiny lexical translation model
+so as to minimize the AER of the Viterbi alignment on the development
+set in corpus.src according to the reference alignments in ref.0.
+
diff --git a/training/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini
new file mode 100644
index 00000000..08187848
--- /dev/null
+++ b/training/dpmert/test_aer/cdec.ini
@@ -0,0 +1,3 @@
+formalism=lextrans
+grammar=grammar
+aligner=true
diff --git a/training/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src
new file mode 100644
index 00000000..31b23971
--- /dev/null
+++ b/training/dpmert/test_aer/corpus.src
@@ -0,0 +1,3 @@
+el gato negro ||| the black cat
+el gato ||| the cat
+el libro ||| the book
diff --git a/training/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar
new file mode 100644
index 00000000..9d857824
--- /dev/null
+++ b/training/dpmert/test_aer/grammar
@@ -0,0 +1,12 @@
+el ||| cat ||| F1=1
+el ||| the ||| F2=1
+el ||| black ||| F3=1
+el ||| book ||| F11=1
+gato ||| cat ||| F4=1 NN=1
+gato ||| black ||| F5=1
+gato ||| the ||| F6=1
+negro ||| the ||| F7=1
+negro ||| cat ||| F8=1
+negro ||| black ||| F9=1
+libro ||| the ||| F10=1
+libro ||| book ||| F12=1 NN=1
diff --git a/training/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0
new file mode 100644
index 00000000..734a9c5b
--- /dev/null
+++ b/training/dpmert/test_aer/ref.0
@@ -0,0 +1,3 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 1-1
diff --git a/training/dpmert/test_aer/weights b/training/dpmert/test_aer/weights
new file mode 100644
index 00000000..afc9282e
--- /dev/null
+++ b/training/dpmert/test_aer/weights
@@ -0,0 +1,13 @@
+F1 0.1
+F2 -.5980815
+F3 0.24235
+F4 0.625
+F5 0.4514
+F6 0.112316
+F7 -0.123415
+F8 -0.25390285
+F9 -0.23852
+F10 0.646
+F11 0.413141
+F12 0.343216
+NN -0.1215
diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz
new file mode 100644
index 00000000..30f8dd77
Binary files /dev/null and b/training/dpmert/test_data/0.json.gz differ
diff --git a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz
new file mode 100644
index 00000000..c82cc179
Binary files /dev/null and b/training/dpmert/test_data/1.json.gz differ
diff --git a/training/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0
new file mode 100644
index 00000000..12c4abe9
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.0
@@ -0,0 +1,2 @@
+australia reopens embassy in manila
+( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .
diff --git a/training/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1
new file mode 100644
index 00000000..4ac12df1
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.1
@@ -0,0 +1,2 @@
+australia reopened manila embassy
+( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .
diff --git a/training/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2
new file mode 100644
index 00000000..2f67b72f
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.2
@@ -0,0 +1,2 @@
+australia to reopen embassy in manila
+( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats .
diff --git a/training/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3
new file mode 100644
index 00000000..5483cef6
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.3
@@ -0,0 +1,2 @@
+australia to re - open its embassy to manila
+( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago .
diff --git a/training/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0
new file mode 100644
index 00000000..86eff087
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.0
@@ -0,0 +1,5 @@
+erdogan states turkey to reject any pressures to urge it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened .
+erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus .
+we will discuss this dossier in the course of membership negotiations . "
+he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . "
diff --git a/training/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1
new file mode 100644
index 00000000..2140f198
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.1
@@ -0,0 +1,5 @@
+erdogan confirms turkey will resist any pressure to recognize cyprus
+ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara .
+erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus .
+we shall discuss this issue in the course of the membership negotiations . "
+he added : " let me be clear - i cannot confine turkey . this is something we do not accept . "
diff --git a/training/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2
new file mode 100644
index 00000000..94e46286
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.2
@@ -0,0 +1,5 @@
+erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus
+ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara .
+erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus .
+we shall discuss this dossier during the negotiations on joining . "
+and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . "
diff --git a/training/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3
new file mode 100644
index 00000000..f87c3308
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.3
@@ -0,0 +1,5 @@
+erdogan stresses that turkey will reject all pressures to force it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not .
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus .
+we will discuss this file during the negotiations on joining . "
+he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . "
diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am
new file mode 100644
index 00000000..5b48e756
--- /dev/null
+++ b/training/dtrain/Makefile.am
@@ -0,0 +1,7 @@
+bin_PROGRAMS = dtrain
+
+dtrain_SOURCES = dtrain.cc score.cc
+dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
new file mode 100644
index 00000000..7edabbf1
--- /dev/null
+++ b/training/dtrain/README.md
@@ -0,0 +1,48 @@
+This is a simple (and parallelizable) tuning method for cdec
+which is able to train the weights of very many (sparse) features.
+It was used here:
+  "Joint Feature Selection in Distributed Stochastic
+   Learning for Large-Scale Discriminative Training in
+   SMT"
+(Simianer, Riezler, Dyer; ACL 2012)
+
+
+Building
+--------
+Builds when building cdec, see ../BUILDING .
+To build only parts needed for dtrain do
+```
+  autoreconf -ifv
+  ./configure [--disable-gtest]
+  cd dtrain/; make
+```
+
+Running
+-------
+To run this on a dev set locally:
+```
+    #define DTRAIN_LOCAL
+```
+Otherwise remove that line (or #undef it) and recompile. You need a single
+grammar file or input annotated with per-sentence grammars (psg) as you
+would use with cdec. Additionally you need to give dtrain a file with
+references (--refs) when running locally.
+
+The input for use with hadoop streaming looks like this:
+```
+    <sid>\t<source>\t<ref>\t<grammar rules separated by \t>
+```
+To convert a psg to this format you need to replace all "\n"
+by "\t". Make sure there are no tabs in your data.
+
+For an example of local usage (with the 'distributed' format)
+see test/example/ . This expects dtrain to be built without
+DTRAIN_LOCAL.
+
+Legal
+-----
+Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
+
+See the file ../LICENSE.txt for the licensing terms that this software is
+released under.
+
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
new file mode 100644
index 00000000..18286668
--- /dev/null
+++ b/training/dtrain/dtrain.cc
@@ -0,0 +1,657 @@
+#include "dtrain.h"
+
+// Parses the command line (and the optional --config file) into *cfg; returns false after printing a message or the usage text to stderr if the settings are unusable.
+bool
+dtrain_init(int argc, char** argv, po::variables_map* cfg)
+{
+  po::options_description ini("Configuration File Options");
+  ini.add_options()
+    ("input",             po::value<string>()->default_value("-"),                                                   "input file")
+    ("output",            po::value<string>()->default_value("-"),                          "output weights file, '-' for STDOUT")
+    ("input_weights",     po::value<string>(),                                "input weights file (e.g. from previous iteration)")
+    ("decoder_config",    po::value<string>(),                                                      "configuration file for cdec")
+    ("print_weights",     po::value<string>(),                                               "weights to print on each iteration")
+    ("stop_after",        po::value<unsigned>()->default_value(0),                                 "stop after X input sentences")
+    ("tmp",               po::value<string>()->default_value("/tmp"),                                           "temp dir to use")
+    ("keep",              po::value<bool>()->zero_tokens(),                               "keep weights files for each iteration")
+    ("hstreaming",        po::value<string>(),                                   "run in hadoop streaming mode, arg is a task id")
+    ("epochs",            po::value<unsigned>()->default_value(10),                               "# of iterations T (per shard)")
+    ("k",                 po::value<unsigned>()->default_value(100),                            "how many translations to sample")
+    ("sample_from",       po::value<string>()->default_value("kbest"),     "where to sample translations from: 'kbest', 'forest'")
+    ("filter",            po::value<string>()->default_value("uniq"),                          "filter kbest list: 'not', 'uniq'")
+    ("pair_sampling",     po::value<string>()->default_value("XYX"),                 "how to sample pairs: 'all', 'XYX' or 'PRO'")
+    ("hi_lo",             po::value<float>()->default_value(0.1),                   "hi and lo (X) for XYX (default 0.1), <= 0.5")
+    ("pair_threshold",    po::value<score_t>()->default_value(0.),                         "bleu [0,1] threshold to filter pairs")
+    ("N",                 po::value<unsigned>()->default_value(4),                                          "N for Ngrams (BLEU)")
+    ("scorer",            po::value<string>()->default_value("stupid_bleu"),      "scoring: bleu, stupid_, smooth_, approx_, lc_")
+    ("learning_rate",     po::value<weight_t>()->default_value(1.0),                                              "learning rate")
+    ("gamma",             po::value<weight_t>()->default_value(0.),                            "gamma for SVM (0 for perceptron)")
+    ("select_weights",    po::value<string>()->default_value("last"),     "output best, last, avg weights ('VOID' to throw away)")
+    ("rescale",           po::value<bool>()->zero_tokens(),                              "rescale weight vector after each input")
+    ("l1_reg",            po::value<string>()->default_value("none"),     "apply l1 regularization as in 'Tsuruoka et al' (2010)")
+    ("l1_reg_strength",   po::value<weight_t>(),                                                     "l1 regularization strength")
+    ("fselect",           po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
+    ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                                   "discount for approx. BLEU")
+    ("scale_bleu_diff",   po::value<bool>()->zero_tokens(),                      "learning rate <- bleu diff of a misranked pair")
+    ("loss_margin",       po::value<weight_t>()->default_value(0.),  "update if no error in pref pair but model scores this near")
+    ("max_pairs",         po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
+#ifdef DTRAIN_LOCAL
+    ("refs,r",            po::value<string>(),                                                         "references in local mode")
+#endif
+    ("noup",              po::value<bool>()->zero_tokens(),                                               "do not update weights");
+  po::options_description cl("Command Line Options");
+  cl.add_options()
+    ("config,c",         po::value<string>(),              "dtrain config file")
+    ("quiet,q",          po::value<bool>()->zero_tokens(),           "be quiet")
+    ("verbose,v",        po::value<bool>()->zero_tokens(),         "be verbose");
+  cl.add(ini);
+  po::store(parse_command_line(argc, argv, cl), *cfg);
+  if (cfg->count("config")) {
+    ifstream ini_f((*cfg)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(ini_f, ini), *cfg);
+  }
+  po::notify(*cfg);
+  if (!cfg->count("decoder_config")) {
+    cerr << cl << endl;
+    return false;
+  }
+  if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") {
+    cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl;
+    return false;
+  }
+#ifdef DTRAIN_LOCAL
+  if ((*cfg)["input"].as<string>() == "-") {
+    cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl;
+    return false;
+  }
+#endif
+  if ((*cfg)["sample_from"].as<string>() != "kbest" && (*cfg)["sample_from"].as<string>() != "forest") {
+    cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl;
+    return false;
+  }
+  if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq" && (*cfg)["filter"].as<string>() != "not") {
+    cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl;
+    return false;
+  }
+  if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" && (*cfg)["pair_sampling"].as<string>() != "PRO") {
+    cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl;
+    return false;
+  }
+  if (!(*cfg)["hi_lo"].defaulted() && (*cfg)["pair_sampling"].as<string>() != "XYX")  // hi_lo has a default, so count() is always 1; defaulted() tells whether the user set it
+    cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl;
+  if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) {
+    cerr << "hi_lo must lie in [0.01, 0.5]" << endl;
+    return false;
+  }
+  if ((*cfg)["pair_threshold"].as<score_t>() < 0) {
+    cerr << "The threshold must be >= 0!" << endl;
+    return false;
+  }
+  if ((*cfg)["select_weights"].as<string>() != "last" && (*cfg)["select_weights"].as<string>() != "best" &&
+        (*cfg)["select_weights"].as<string>() != "avg" && (*cfg)["select_weights"].as<string>() != "VOID") {
+    cerr << "Wrong 'select_weights' param: '" << (*cfg)["select_weights"].as<string>() << "', use 'last', 'best', 'avg' or 'VOID'." << endl;
+    return false;
+  }
+  if ((*cfg)["l1_reg"].as<string>() != "none" && !cfg->count("l1_reg_strength")) {  // main() reads l1_reg_strength whenever l1_reg != "none"
+    cerr << "'l1_reg' requires 'l1_reg_strength' to be set." << endl;
+    return false;
+  }
+  return true;
+}
+
+int
+main(int argc, char** argv)
+{
+  // handle most parameters
+  po::variables_map cfg;
+  if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong
+  bool quiet = false;
+  if (cfg.count("quiet")) quiet = true;
+  bool verbose = false;
+  if (cfg.count("verbose")) verbose = true;
+  bool noup = false;
+  if (cfg.count("noup")) noup = true;
+  bool hstreaming = false;
+  string task_id;
+  if (cfg.count("hstreaming")) {
+    hstreaming = true;
+    quiet = true;
+    task_id = cfg["hstreaming"].as<string>();
+    cerr.precision(17);
+  }
+  bool rescale = false;
+  if (cfg.count("rescale")) rescale = true;
+  HSReporter rep(task_id);
+  bool keep = false;
+  if (cfg.count("keep")) keep = true;
+
+  const unsigned k = cfg["k"].as<unsigned>();
+  const unsigned N = cfg["N"].as<unsigned>();
+  const unsigned T = cfg["epochs"].as<unsigned>();
+  const unsigned stop_after = cfg["stop_after"].as<unsigned>();
+  const string filter_type = cfg["filter"].as<string>();
+  const string sample_from = cfg["sample_from"].as<string>();
+  const string pair_sampling = cfg["pair_sampling"].as<string>();
+  const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();
+  const string select_weights = cfg["select_weights"].as<string>();
+  const float hi_lo = cfg["hi_lo"].as<float>();
+  const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+  const unsigned max_pairs = cfg["max_pairs"].as<unsigned>();
+  weight_t loss_margin = cfg["loss_margin"].as<weight_t>();
+  if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max();
+  bool scale_bleu_diff = false;
+  if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
+  bool average = false;
+  if (select_weights == "avg")
+    average = true;
+  vector<string> print_weights;
+  if (cfg.count("print_weights"))
+    boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
+
+  // setup decoder
+  register_feature_functions();
+  SetSilent(true);
+  ReadFile ini_rf(cfg["decoder_config"].as<string>());
+  if (!quiet)
+    cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
+  Decoder decoder(ini_rf.stream());
+
+  // scoring metric/scorer
+  string scorer_str = cfg["scorer"].as<string>();
+  LocalScorer* scorer;
+  if (scorer_str == "bleu") {
+    scorer = dynamic_cast<BleuScorer*>(new BleuScorer);
+  } else if (scorer_str == "stupid_bleu") {
+    scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
+  } else if (scorer_str == "smooth_bleu") {
+    scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+  } else if (scorer_str == "sum_bleu") {
+    scorer = dynamic_cast<SumBleuScorer*>(new SumBleuScorer);
+  } else if (scorer_str == "sumexp_bleu") {
+    scorer = dynamic_cast<SumExpBleuScorer*>(new SumExpBleuScorer);
+  } else if (scorer_str == "sumwhatever_bleu") {
+    scorer = dynamic_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer);
+  } else if (scorer_str == "approx_bleu") {
+    scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
+  } else if (scorer_str == "lc_bleu") {
+    scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N));
+  } else {
+    cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
+    exit(1);
+  }
+  vector<score_t> bleu_weights;
+  scorer->Init(N, bleu_weights);
+
+  // setup decoder observer
+  MT19937 rng; // random number generator, only for forest sampling
+  HypSampler* observer;
+  if (sample_from == "kbest")
+    observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
+  else
+    observer = dynamic_cast<KSampler*>(new KSampler(k, &rng));
+  observer->SetScorer(scorer);
+
+  // init weights
+  vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+  SparseVector<weight_t> lambdas, cumulative_penalties, w_average;
+  if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &dense_weights);
+  Weights::InitSparseVector(dense_weights, &lambdas);
+
+  // meta params for perceptron, SVM
+  weight_t eta = cfg["learning_rate"].as<weight_t>();
+  weight_t gamma = cfg["gamma"].as<weight_t>();
+
+  // l1 regularization
+  bool l1naive = false;
+  bool l1clip = false;
+  bool l1cumul = false;
+  weight_t l1_reg = 0;
+  if (cfg["l1_reg"].as<string>() != "none") {
+    string s = cfg["l1_reg"].as<string>();
+    if (s == "naive") l1naive = true;
+    else if (s == "clip") l1clip = true;
+    else if (s == "cumul") l1cumul = true;
+    l1_reg = cfg["l1_reg_strength"].as<weight_t>();
+  }
+
+  // output
+  string output_fn = cfg["output"].as<string>();
+  // input
+  string input_fn = cfg["input"].as<string>();
+  ReadFile input(input_fn);
+  // buffer input for t > 0
+  vector<string> src_str_buf;          // source strings (decoder takes only strings)
+  vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
+  // where temp files go
+  string tmp_path = cfg["tmp"].as<string>();
+#ifdef DTRAIN_LOCAL
+  string refs_fn = cfg["refs"].as<string>();
+  ReadFile refs(refs_fn);
+#else
+  string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars");
+  ogzstream grammar_buf_out;
+  grammar_buf_out.open(grammar_buf_fn.c_str());
+#endif
+
+  unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
+  vector<pair<score_t, score_t> > all_scores;
+  score_t max_score = 0.;
+  unsigned best_it = 0;
+  float overall_time = 0.;
+
+  // output cfg
+  if (!quiet) {
+    cerr << _p5;
+    cerr << endl << "dtrain" << endl << "Parameters:" << endl;
+    cerr << setw(25) << "k " << k << endl;
+    cerr << setw(25) << "N " << N << endl;
+    cerr << setw(25) << "T " << T << endl;
+    cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
+    if (scorer_str == "approx_bleu")
+      cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl;
+    cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
+    if (sample_from == "kbest")
+      cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
+    if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
+    else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
+    cerr << setw(25) << "gamma " << gamma << endl;
+    cerr << setw(25) << "loss margin " << loss_margin << endl;
+    cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
+    if (pair_sampling == "XYX")
+      cerr << setw(25) << "hi lo " << hi_lo << endl;
+    cerr << setw(25) << "pair threshold " << pair_threshold << endl;
+    cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
+    if (cfg.count("l1_reg"))
+      cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
+    if (rescale)
+      cerr << setw(25) << "rescale " << rescale << endl;
+    cerr << setw(25) << "max pairs " << max_pairs << endl;
+    cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
+    cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
+#ifdef DTRAIN_LOCAL
+    cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
+#endif
+    cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
+    if (cfg.count("input_weights"))
+      cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
+    if (stop_after > 0)
+      cerr << setw(25) << "stop_after " << stop_after << endl;
+    if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
+  }
+
+
+  for (unsigned t = 0; t < T; t++) // T epochs
+  {
+
+  if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
+
+  time_t start, end;
+  time(&start);
+#ifndef DTRAIN_LOCAL
+  igzstream grammar_buf_in;
+  if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str());
+#endif
+  score_t score_sum = 0.;
+  score_t model_sum(0);
+  unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
+  if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
+
+  while(true)
+  {
+
+    string in;
+    bool next = false, stop = false; // next iteration or premature stop
+    if (t == 0) {
+      if(!getline(*input, in)) next = true;
+    } else {
+      if (ii == in_sz) next = true; // stop if we reach the end of our input
+    }
+    // stop after X sentences (but still go on for those)
+    if (stop_after > 0 && stop_after == ii && !next) stop = true;
+
+    // produce some pretty output
+    if (!quiet && !verbose) {
+      if (ii == 0) cerr << " ";
+      if ((ii+1) % (DTRAIN_DOTS) == 0) {
+        cerr << ".";
+        cerr.flush();
+      }
+      if ((ii+1) % (20*DTRAIN_DOTS) == 0) {
+        cerr << " " << ii+1 << endl;
+        if (!next && !stop) cerr << " ";
+      }
+      if (stop) {
+        if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl;
+        cerr << "Stopping after " << stop_after << " input sentences." << endl;
+      } else {
+        if (next) {
+          if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl;
+        }
+      }
+    }
+
+    // next iteration
+    if (next || stop) break;
+
+    // weights
+    lambdas.init_vector(&dense_weights);
+
+    // getting input
+    vector<WordID> ref_ids; // reference as vector<WordID>
+#ifndef DTRAIN_LOCAL
+    vector<string> in_split; // input: sid\tsrc\tref\tpsg
+    if (t == 0) {
+      // handling input
+      split_in(in, in_split);
+      if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl;
+      // getting reference
+      vector<string> ref_tok;
+      boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
+      register_and_convert(ref_tok, ref_ids);
+      ref_ids_buf.push_back(ref_ids);
+      // process and set grammar
+      bool broken_grammar = true; // ignore broken grammars
+      for (string::iterator it = in.begin(); it != in.end(); it++) {
+        if (!isspace(*it)) {
+          broken_grammar = false;
+          break;
+        }
+      }
+      if (broken_grammar) {
+        cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
+        continue;
+      }
+      boost::replace_all(in, "\t", "\n");
+      in += "\n";
+      grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
+      decoder.AddSupplementalGrammarFromString(in);
+      src_str_buf.push_back(in_split[1]);
+      // decode
+      observer->SetRef(ref_ids);
+      decoder.Decode(in_split[1], observer);
+    } else {
+      // get buffered grammar
+      string grammar_str;
+      while (true) {
+        string rule;
+        getline(grammar_buf_in, rule);
+        if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break;
+        grammar_str += rule + "\n";
+      }
+      decoder.AddSupplementalGrammarFromString(grammar_str);
+      // decode
+      observer->SetRef(ref_ids_buf[ii]);
+      decoder.Decode(src_str_buf[ii], observer);
+    }
+#else
+    if (t == 0) {
+      string r_;
+      getline(*refs, r_);
+      vector<string> ref_tok;
+      boost::split(ref_tok, r_, boost::is_any_of(" "));
+      register_and_convert(ref_tok, ref_ids);
+      ref_ids_buf.push_back(ref_ids);
+      src_str_buf.push_back(in);
+    } else {
+      ref_ids = ref_ids_buf[ii];
+    }
+    observer->SetRef(ref_ids);
+    if (t == 0)
+      decoder.Decode(in, observer);
+    else
+      decoder.Decode(src_str_buf[ii], observer);
+#endif
+
+    // get (scored) samples
+    vector<ScoredHyp>* samples = observer->GetSamples();
+
+    if (verbose) {
+      cerr << "--- ref for " << ii << ": ";
+      if (t > 0) printWordIDVec(ref_ids_buf[ii]);
+      else printWordIDVec(ref_ids);
+      cerr << endl;
+      for (unsigned u = 0; u < samples->size(); u++) {
+        cerr << _p2 << _np << "[" << u << ". '";
+        printWordIDVec((*samples)[u].w);
+        cerr << "'" << endl;
+        cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl;
+        cerr << "F{" << (*samples)[u].f << "} ]" << endl << endl;
+      }
+    }
+
+    score_sum += (*samples)[0].score; // stats for 1best
+    model_sum += (*samples)[0].model;
+
+    f_count += observer->get_f_count();
+    list_sz += observer->get_sz();
+
+    // weight updates
+    if (!noup) {
+      // get pairs
+      vector<pair<ScoredHyp,ScoredHyp> > pairs;
+      if (pair_sampling == "all")
+        all_pairs(samples, pairs, pair_threshold, max_pairs);
+      if (pair_sampling == "XYX")
+        partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo);
+      if (pair_sampling == "PRO")
+        PROsampling(samples, pairs, pair_threshold, max_pairs);
+      npairs += pairs.size();
+
+      for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
+           it != pairs.end(); it++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+        bool rank_error = true; // pair sampling already did this for us
+        rank_errors++;
+        score_t margin = std::numeric_limits<float>::max();
+#else
+        bool rank_error = it->first.model <= it->second.model;
+        if (rank_error) rank_errors++;
+        score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model));
+        if (!rank_error && margin < loss_margin) margin_violations++;
+#endif
+        if (scale_bleu_diff) eta = it->first.score - it->second.score;
+        if (rank_error || margin < loss_margin) {
+          SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
+          lambdas.plus_eq_v_times_s(diff_vec, eta);
+          if (gamma)
+            lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs));
+        }
+      }
+
+      // l1 regularization
+      if (l1naive) {
+        for (unsigned d = 0; d < lambdas.size(); d++) {
+          weight_t v = lambdas.get(d);
+          lambdas.set_value(d, v - sign(v) * l1_reg);
+        }
+      } else if (l1clip) {
+        for (unsigned d = 0; d < lambdas.size(); d++) {
+          if (lambdas.nonzero(d)) {
+            weight_t v = lambdas.get(d);
+            if (v > 0) {
+              lambdas.set_value(d, max(0., v - l1_reg));
+            } else {
+              lambdas.set_value(d, min(0., v + l1_reg));
+            }
+          }
+        }
+      } else if (l1cumul) {
+        weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input
+        for (unsigned d = 0; d < lambdas.size(); d++) {
+          if (lambdas.nonzero(d)) {
+            weight_t v = lambdas.get(d);
+            weight_t penalty = 0;
+            if (v > 0) {
+              penalty = max(0., v-(acc_penalty + cumulative_penalties.get(d)));
+            } else {
+              penalty = min(0., v+(acc_penalty - cumulative_penalties.get(d)));
+            }
+            lambdas.set_value(d, penalty);
+            cumulative_penalties.set_value(d, cumulative_penalties.get(d)+penalty);
+          }
+        }
+      }
+
+    }
+
+    if (rescale) lambdas /= lambdas.l2norm();
+
+    ++ii;
+
+    if (hstreaming) {
+      rep.update_counter("Seen #"+boost::lexical_cast<string>(t+1), 1u);
+      rep.update_counter("Seen", 1u);
+    }
+
+  } // input loop
+
+  if (average) w_average += lambdas;
+
+  if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset();
+
+  if (t == 0) {
+    in_sz = ii; // remember size of input (# lines)
+    if (hstreaming) {
+      rep.update_counter("|Input|", ii);
+      rep.update_gcounter("|Input|", ii);
+      rep.update_gcounter("Shards", 1u);
+    }
+  }
+
+#ifndef DTRAIN_LOCAL
+  if (t == 0) {
+    grammar_buf_out.close();
+  } else {
+    grammar_buf_in.close();
+  }
+#endif
+
+  // print some stats
+  score_t score_avg = score_sum/(score_t)in_sz;
+  score_t model_avg = model_sum/(score_t)in_sz;
+  score_t score_diff, model_diff;
+  if (t > 0) {
+    score_diff = score_avg - all_scores[t-1].first;
+    model_diff = model_avg - all_scores[t-1].second;
+  } else {
+    score_diff = score_avg;
+    model_diff = model_avg;
+  }
+
+  unsigned nonz = 0;
+  if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero();
+
+  if (!quiet) {
+    cerr << _p5 << _p << "WEIGHTS" << endl;
+    for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
+      cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
+    }
+    cerr << "        ---" << endl;
+    cerr << _np << "       1best avg score: " << score_avg;
+    cerr << _p << " (" << score_diff << ")" << endl;
+    cerr << _np << " 1best avg model score: " << model_avg;
+    cerr << _p << " (" << model_diff << ")" << endl;
+    cerr << "           avg # pairs: ";
+    cerr << _np << npairs/(float)in_sz << endl;
+    cerr << "        avg # rank err: ";
+    cerr << rank_errors/(float)in_sz << endl;
+#ifndef DTRAIN_FASTER_PERCEPTRON
+    cerr << "     avg # margin viol: ";
+    cerr << margin_violations/(float)in_sz << endl;
+#endif
+    cerr << "    non0 feature count: " <<  nonz << endl;
+    cerr << "           avg list sz: " << list_sz/(float)in_sz << endl;
+    cerr << "           avg f count: " << f_count/(float)list_sz << endl;
+  }
+
+  if (hstreaming) {
+    rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*DTRAIN_SCALE));
+    rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*DTRAIN_SCALE));
+    rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE));
+    rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE));
+    rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE));
+    rep.update_counter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
+    rep.update_gcounter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
+  }
+
+  pair<score_t,score_t> remember;
+  remember.first = score_avg;
+  remember.second = model_avg;
+  all_scores.push_back(remember);
+  if (score_avg > max_score) {
+    max_score = score_avg;
+    best_it = t;
+  }
+  time (&end);
+  float time_diff = difftime(end, start);
+  overall_time += time_diff;
+  if (!quiet) {
+    cerr << _p2 << _np << "(time " << time_diff/60. << " min, ";
+    cerr << time_diff/in_sz << " s/S)" << endl;
+  }
+  if (t+1 != T && !quiet) cerr << endl;
+
+  if (noup) break;
+
+  // write weights to file
+  if (select_weights == "best" || keep) {
+    lambdas.init_vector(&dense_weights);
+    string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz";
+    Weights::WriteToFile(w_fn, dense_weights, true);
+  }
+
+  } // outer loop
+
+  if (average) w_average /= (weight_t)T;
+
+#ifndef DTRAIN_LOCAL
+  unlink(grammar_buf_fn.c_str());
+#endif
+
+  if (!noup) {
+    if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl;
+    if (select_weights == "last" || average) { // last, average
+      WriteFile of(output_fn); // works with '-'
+      ostream& o = *of.stream();
+      o.precision(17);
+      o << _np;
+      if (average) {
+        for (SparseVector<weight_t>::iterator it = w_average.begin(); it != w_average.end(); ++it) {
+	      if (it->second == 0) continue;
+          o << FD::Convert(it->first) << '\t' << it->second << endl;
+        }
+      } else {
+        for (SparseVector<weight_t>::iterator it = lambdas.begin(); it != lambdas.end(); ++it) {
+	      if (it->second == 0) continue;
+          o << FD::Convert(it->first) << '\t' << it->second << endl;
+        }
+      }
+    } else if (select_weights == "VOID") { // do nothing with the weights
+    } else { // best
+      if (output_fn != "-") {
+        CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn);
+      } else {
+        ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz");
+        string o;
+        cout.precision(17);
+        cout << _np;
+        while(getline(*bestw, o)) cout << o << endl;
+      }
+      if (!keep) {
+        for (unsigned i = 0; i < T; i++) {
+          string s = "weights." + boost::lexical_cast<string>(i) + ".gz";
+          unlink(s.c_str());
+        }
+      }
+    }
+    if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
+    if (!quiet) cerr << "done" << endl;
+  }
+
+  if (!quiet) {
+    cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
+    cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl;
+    cerr << "This took " << overall_time/60. << " min." << endl;
+  }
+}
+
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
new file mode 100644
index 00000000..4b6f415c
--- /dev/null
+++ b/training/dtrain/dtrain.h
@@ -0,0 +1,97 @@
+#ifndef _DTRAIN_H_
+#define _DTRAIN_H_
+
+#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
+                                 // DO NOT USE WITH SVM!
+//#define DTRAIN_LOCAL
+#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
+#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+#define DTRAIN_SCALE 100000
+
+#include <iomanip>
+#include <climits>
+#include <string.h>
+#include <unistd.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+
+#include "ksampler.h"
+#include "pairsampling.h"
+
+#include "filelib.h"
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  // appends TD::Convert(token) to ids for every token, in input order
+  for (size_t i = 0; i != strs.size(); ++i)
+    ids.push_back(TD::Convert(strs[i]));
+}
+
+inline string gettmpf(const string path, const string infix)
+{ // creates a unique file "<path>/<infix>-XXXXXX" and returns its name
+  string tmpl = path + "/" + infix + "-XXXXXX";
+  vector<char> fn(tmpl.begin(), tmpl.end());
+  fn.push_back('\0'); // mkstemp() rewrites this buffer in place
+  int fd = mkstemp(&fn[0]); // -1 on failure, otherwise an open descriptor
+  if (fd == -1) {
+    cerr << "Cannot make temp file in" << path << " , exiting." << endl;
+    exit(1);
+  }
+  close(fd); // only the name is returned, so the descriptor must not leak
+  return string(&fn[0]);
+}
+
+inline void split_in(string& s, vector<string>& parts)
+{ // moves the first three tab-separated fields of s into parts (absent fields: "")
+  size_t start = 0; // offset of the current field; npos once s is exhausted
+  for (unsigned i = 0; i < 3; i++) {
+    size_t tab = (start == string::npos) ? start : s.find('\t', start);
+    parts.push_back(start == string::npos ? string() : s.substr(start, tab - start));
+    start = (tab == string::npos) ? tab : tab + 1;
+  }
+  if (start == string::npos) s.clear(); // nothing follows the third field
+  else s.erase(0, start);
+}
+
+struct HSReporter
+{
+  string task_id_; // printed as the counter group by update_counter()
+  HSReporter(string task_id) : task_id_(task_id) {}
+  // writes one "reporter:counter:<task id>,<name>,<amount>" line to stderr
+  void update_counter(string name, unsigned amount) {
+    cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl;
+  }
+  // same line format, but with the fixed group "Global" instead of the task id
+  void update_gcounter(string name, unsigned amount) {
+    cerr << "reporter:counter:Global," << name << "," << amount << endl;
+  }
+};
+
+inline ostream& _np(ostream& out) { out.unsetf(ios::showpos); return out; } // clears showpos
+inline ostream& _p(ostream& out)  { out.setf(ios::showpos); return out; }   // sets showpos
+inline ostream& _p2(ostream& out) { out.precision(2); return out; }
+inline ostream& _p5(ostream& out) { out.precision(5); return out; }
+
+inline void printWordIDVec(vector<WordID>& v)
+{ // prints the words of v to stderr, separated by single spaces
+  for (unsigned i = 0; i < v.size(); i++) {
+    if (i > 0) cerr << " ";
+    cerr << TD::Convert(v[i]);
+  }
+}
+
+template<typename T>
+inline T sign(T z)
+{ // -1 for negative z, 0 for zero, +1 otherwise
+  if (z < 0) return -1;
+  return z == 0 ? 0 : +1;
+}
+
+#endif
+
diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb
new file mode 100755
index 00000000..2599c732
--- /dev/null
+++ b/training/dtrain/hstreaming/avg.rb
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+# Sums the values of "<key><whitespace><value>" lines read from STDIN per key
+# and prints each sum divided by the shard count.
+# first arg may be an int of custom shard count
+
+shard_count_key = "__SHARD_COUNT__"
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+# per-key sum of values and number of lines seen; missing keys read as 0
+sums = Hash.new(0)
+seen = Hash.new(0)
+
+STDIN.each_line do |line|
+  name, value = line.split(/\s/)
+  sums[name] += value.to_f
+  seen[name] += 1
+end
+
+# without an argument the divisor is whatever was summed up under the
+# shard count key
+shard_count = ARGV.empty? ? sums[shard_count_key] : ARGV[0].to_f
+
+# the shard count entry itself is not printed
+sums.each_key do |name|
+  next if name == shard_count_key
+
+  puts "#{name}\t#{sums[name]/shard_count}"
+  #puts "# #{seen[name]}"
+end
+
diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini
new file mode 100644
index 00000000..d4f5cecd
--- /dev/null
+++ b/training/dtrain/hstreaming/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=30
+feature_function=WordPenalty
+feature_function=KLanguageModel nc-wmt11.en.srilm.gz
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini
new file mode 100644
index 00000000..a2c219a1
--- /dev/null
+++ b/training/dtrain/hstreaming/dtrain.ini
@@ -0,0 +1,15 @@
+input=-
+output=-
+decoder_config=cdec.ini
+tmp=/var/hadoop/mapred/local/
+epochs=1
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+pair_threshold=0
+select_weights=last
diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh
new file mode 100755
index 00000000..877ff94c
--- /dev/null
+++ b/training/dtrain/hstreaming/dtrain.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# script to run dtrain with a task id
+
+# The task id is the name of the parent directory (attempt_...). It is
+# derived without changing directory, and every expansion is quoted so
+# that a path containing whitespace is not split into several words.
+ID=$(basename "$(dirname "$(pwd)")")
+./dtrain -c dtrain.ini --hstreaming "$ID"
+
diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh
new file mode 100755
index 00000000..92419956
--- /dev/null
+++ b/training/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+EXP=a_simple_test # experiment name, appended to the job name below
+
+# change these vars to fit your hadoop installation
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+ IN=input_on_hdfs
+OUT=output_weights_on_hdfs
+
+# you can set -reducer to NONE if you want to
+# do feature selection/averaging locally (e.g. to
+# keep weights of all epochs)
+$HSTREAMING \
+    -mapper "dtrain.sh" \
+    -reducer "ruby lplp.rb l2 select_k 100000" \
+    -input $IN \
+    -output $OUT \
+    -file dtrain.sh \
+    -file lplp.rb \
+    -file ../dtrain \
+    -file dtrain.ini \
+    -file cdec.ini \
+    -file ../test/example/nc-wmt11.en.srilm.gz \
+    -jobconf mapred.reduce.tasks=30 \
+    -jobconf mapred.max.map.failures.percent=0 \
+    -jobconf mapred.job.name="dtrain $EXP"
+
diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb
new file mode 100755
index 00000000..f0cd58c5
--- /dev/null
+++ b/training/dtrain/hstreaming/lplp.rb
@@ -0,0 +1,131 @@
+# lplp.rb
+
+# norms
+def l0(feature_column, n)
+  feature_column.size >= n ? 1 : 0 # 1 iff the column has at least n entries
+end
+
+def l1(feature_column, n=-1)
+  feature_column.map(&:abs).reduce(:+) # sum of absolute values; n is unused
+end
+
+def l2(feature_column, n=-1)
+  Math.sqrt(feature_column.map(&:abs2).reduce(:+)) # root of the sum of squares; n is unused
+end
+
+def linfty(feature_column, n=-1)
+  feature_column.map(&:abs).max # largest absolute value; n is unused
+end
+
+# stats
+def median(feature_column, n) # sorts a copy padded with 0s up to n entries; the argument is not modified
+  (feature_column + [0] * [n.to_i - feature_column.size, 0].max).sort[[n.to_i, feature_column.size].max / 2]
+end
+
+def mean(feature_column, n)
+  feature_column.reduce(:+) / n # sum of the entries divided by n, not by the column size
+end
+
+# selection
+def select_k(weights, norm_fun, n, k=10000)
+  # prints "name<TAB>mean" for the k features with the largest norm; each norm
+  # is computed once, k is truncated to an Integer and k <= 0 selects nothing
+  weights.sort_by { |_, column| -norm_fun.call(column, n) }.first([k.to_i, 0].max).each { |name, column|
+    puts "#{name}\t#{mean(column, n)}"
+  }
+end
+
+def cut(weights, norm_fun, n, epsilon=0.0001)
+  # prints "name<TAB>mean" for every feature whose |norm| reaches epsilon
+  weights.each_pair do |name, column|
+    next unless norm_fun.call(column, n).abs >= epsilon
+
+    puts "#{name}\t#{mean(column, n)}"
+  end
+end
+
+# test
+def _test() # manual self-test: prints each expectation followed by the actual output, then exits
+  puts
+  w = {}
+  w["a"] = [1, 2, 3]
+  w["b"] = [1, 2]
+  w["c"] = [66]
+  w["d"] = [10, 20, 30]
+  n = 3 # passed as n to every call below
+  puts w.to_s
+  puts
+  puts "select_k"
+  puts "l0 expect ad"
+  select_k(w, method(:l0), n, 2)
+  puts "l1 expect cd"
+  select_k(w, method(:l1), n, 2)
+  puts "l2 expect c"
+  select_k(w, method(:l2), n, 1)
+  puts
+  puts "cut"
+  puts "l1 expect cd"
+  cut(w, method(:l1), n, 7)
+  puts
+  puts "median"
+  a = [1,2,3,4,5]
+  puts a.to_s
+  puts median(a, 5)
+  puts
+  puts "#{median(a, 7)} <- that's because we add missing 0s:"
+  puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
+  puts
+  puts "mean expect bc"
+  w.clear
+  w["a"] = [2]
+  w["b"] = [2.1]
+  w["c"] = [2.2]
+  cut(w, method(:mean), 1, 2.05)
+ exit
+end
+#_test()
+
+# actually do something
+def usage()
+  puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> [n] < <input>",
+       "   l0...: norms for selection",
+       "select_k: only output top k (according to the norm of their column vector) features",
+       "     cut: output features with weight >= threshold",
+       "       n: if we do not have a shard count use this number for averaging"
+  exit
+end
+
+usage if ARGV.size < 3
+# the norm is looked up by name among the methods defined in this file
+norm_fun = method(ARGV[0].to_sym)
+type = ARGV[1]
+x = ARGV[2].to_f
+
+shard_count_key = "__SHARD_COUNT__"
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+# feature name => list of the values seen for it, one per input line
+columns = {}
+shard_count = 0
+STDIN.each_line do |line|
+  name, value = line.split(/\s+/)
+  if name == shard_count_key
+    shard_count += 1 # every shard count line counts as one shard
+  else
+    (columns[name] ||= []).push value.to_f
+  end
+end
+
+# an explicit fourth argument replaces the counted number of shards
+shard_count = ARGV[3].to_f if ARGV.size == 4
+
+case type
+when 'cut'
+  cut(columns, norm_fun, shard_count, x)
+when 'select_k'
+  select_k(columns, norm_fun, shard_count, x)
+else
+  puts "oh oh"
+end
+
diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test
new file mode 100644
index 00000000..2623d697
--- /dev/null
+++ b/training/dtrain/hstreaming/red-test
@@ -0,0 +1,9 @@
+a	1
+b	2
+c	3.5
+a	1
+b	2
+c	3.5
+d	1
+e	2
+__SHARD_COUNT__	2
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h
new file mode 100644
index 00000000..dd8882e1
--- /dev/null
+++ b/training/dtrain/kbestget.h
@@ -0,0 +1,152 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+#include "kbest.h" // cdec
+#include "sentence_metadata.h"
+
+#include "verbose.h"
+#include "viterbi.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+#include "logval.h"
+
+using namespace std;
+
+namespace dtrain
+{
+
+
+typedef double score_t;
+
+struct ScoredHyp
+{
+  vector<WordID> w;       // words of the hypothesis
+  SparseVector<double> f; // feature values of the hypothesis
+  score_t model;          // log of the model score
+  score_t score;          // value returned by the scorer's Score()
+  unsigned rank;          // position in the sample list
+};
+
+struct LocalScorer
+{
+  unsigned N_;        // set by Init()
+  vector<score_t> w_; // set by Init(): the given weights, or N_ times 1/N_
+
+  virtual score_t
+  Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
+
+  virtual void Reset() {} // virtual, so a scorer's own Reset() is reached through a LocalScorer*
+
+  inline void
+  Init(unsigned N, vector<score_t> weights)
+  {
+    assert(N > 0);
+    N_ = N;
+    if (weights.empty()) w_.assign(N_, 1./N_); // assign() keeps a repeated Init() from growing w_
+    else w_ = weights;
+  }
+
+  inline score_t
+  brevity_penalty(const unsigned hyp_len, const unsigned ref_len)
+  {
+    if (hyp_len > ref_len) return 1;
+    return exp(1 - (score_t)ref_len/hyp_len);
+  }
+};
+
+struct HypSampler : public DecoderObserver
+{
+  LocalScorer* scorer_;  // stored as given to SetScorer()
+  vector<WordID>* ref_;  // address of the vector passed to SetRef()
+  unsigned f_count_, sz_;
+  virtual vector<ScoredHyp>* GetSamples()=0;
+  void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
+  void SetRef(vector<WordID>& ref) { ref_ = &ref; }
+  unsigned get_f_count() { return f_count_; }
+  unsigned get_sz() { return sz_; }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+struct KBestGetter : public HypSampler
+{
+  const unsigned k_;
+  const string filter_type_;
+  vector<ScoredHyp> s_;
+  unsigned src_len_;
+
+  // k: upper bound on the list size; filter_type: see KBestScored()
+  KBestGetter(const unsigned k, const string filter_type) :
+    k_(k), filter_type_(filter_type) {}
+
+  // remembers the source length, then rebuilds the k-best list from hg
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    src_len_ = smeta.GetSourceLength();
+    KBestScored(*hg);
+  }
+
+  vector<ScoredHyp>* GetSamples() { return &s_; }
+
+  // dispatches on filter_type_; any value other than "uniq" or "not" leaves
+  // s_, sz_ and f_count_ untouched
+  void
+  KBestScored(const Hypergraph& forest)
+  {
+    if (filter_type_ == "uniq") KBestUnique(forest);
+    else if (filter_type_ == "not") KBestNoFilter(forest);
+  }
+
+  // k-best list extracted with KBest::FilterUnique
+  void
+  KBestUnique(const Hypergraph& forest)
+  {
+    typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+      KBest::FilterUnique, prob_t, EdgeProb> UniqueKBest;
+    Collect<UniqueKBest>(forest);
+  }
+
+  // k-best list extracted with KBestDerivations' default template arguments
+  void
+  KBestNoFilter(const Hypergraph& forest)
+  {
+    typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> PlainKBest;
+    Collect<PlainKBest>(forest);
+  }
+
+  // Shared body of KBestUnique() and KBestNoFilter(): clears the previous
+  // samples and counters, then scores up to k_ derivations of the forest's
+  // last node and appends them to s_.
+  // Extractor is the KBest::KBestDerivations instantiation to run.
+  template<typename Extractor>
+  void
+  Collect(const Hypergraph& forest)
+  {
+    s_.clear(); sz_ = f_count_ = 0;
+    Extractor kbest(forest, k_);
+    for (unsigned i = 0; i < k_; ++i) {
+      const typename Extractor::Derivation* d =
+            kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+      if (!d) break; // no further derivation available
+      ScoredHyp h;
+      h.w = d->yield;
+      h.f = d->feature_values;
+      h.model = log(d->score);
+      h.rank = i;
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+      s_.push_back(h);
+      sz_++;
+      f_count_ += h.f.size();
+    }
+  }
+};
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h
new file mode 100644
index 00000000..bc2f56cd
--- /dev/null
+++ b/training/dtrain/ksampler.h
@@ -0,0 +1,61 @@
+#ifndef _DTRAIN_KSAMPLER_H_
+#define _DTRAIN_KSAMPLER_H_
+
+#include "hg_sampler.h" // cdec
+#include "kbestget.h"
+#include "score.h"
+
+namespace dtrain
+{
+
+inline bool // inline: defined in a header, so it may end up in several translation units
+cmp_hyp_by_model_d(const ScoredHyp& a, const ScoredHyp& b) // by reference: no copies per comparison
+{
+  return a.model > b.model; // orders by model score, highest first
+}
+
+struct KSampler : public HypSampler
+{
+  const unsigned k_; // number of samples requested per forest
+  vector<ScoredHyp> s_;
+  MT19937* prng_;
+  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+  unsigned src_len_;
+
+  explicit KSampler(const unsigned k, MT19937* prng) :
+    k_(k), prng_(prng) {}
+
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    src_len_ = smeta.GetSourceLength();
+    ScoredSamples(*hg);
+  }
+
+  vector<ScoredHyp>* GetSamples() { return &s_; }
+
+  void ScoredSamples(const Hypergraph& forest) { // rebuilds s_ from fresh samples of forest
+    s_.clear(); sz_ = f_count_ = 0;
+    std::vector<HypergraphSampler::Hypothesis> samples;
+    HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
+    for (unsigned i = 0; i < k_ && i < samples.size(); ++i) { // never index past what was returned
+      ScoredHyp h;
+      h.w = samples[i].words;
+      h.f = samples[i].fmap;
+      h.model = log(samples[i].model_score);
+      h.rank = i;
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+      s_.push_back(h);
+      sz_++;
+      f_count_ += h.f.size();
+    }
+    sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
+    for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i; // ranks follow the sorted order
+  }
+};
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h
new file mode 100644
index 00000000..84be1efb
--- /dev/null
+++ b/training/dtrain/pairsampling.h
@@ -0,0 +1,149 @@
+#ifndef _DTRAIN_PAIRSAMPLING_H_
+#define _DTRAIN_PAIRSAMPLING_H_
+
+namespace dtrain
+{
+
+
+inline bool // inline: defined in a header, like the sampling functions of this file
+accept_pair(score_t a, score_t b, score_t threshold)
+{
+  // A pair qualifies for training unless its score gap is below the threshold.
+  return !(fabs(a - b) < threshold);
+}
+
+inline bool // inline: the function is defined in a header and must not yield one symbol per including file
+cmp_hyp_by_score_d(const ScoredHyp& a, const ScoredHyp& b)
+{
+  return a.score > b.score; // strict "greater than": sorts by score in descending order
+}
+
+inline void // pairs up the hypotheses of the score-sorted list, looking at no more than 'max' combinations
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
+{
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+  unsigned sz = s->size();
+  bool b = false;
+  unsigned count = 0;
+  for (unsigned i = 0; i+1 < sz; i++) { // not 'i < sz-1': sz-1 wraps around for an empty list
+    for (unsigned j = i+1; j < sz; j++) {
+      if (threshold > 0) {
+        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      } else {
+        if ((*s)[i].score != (*s)[j].score)
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      }
+      if (++count == max) { // counts every combination looked at, accepted or not
+        b = true;
+        break;
+      }
+    }
+    if (b) break;
+  }
+}
+
+/*
+ * multipartite ranking
+ *  sort (descending) by bleu
+ *  compare top X to middle Y and low X
+ *  cmp middle Y to low X
+ */
+
+inline void
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float hi_lo)
+{
+  unsigned sz = s->size();
+  if (sz < 2) return;
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+  unsigned sep = min((unsigned)round(sz*hi_lo), sz); // size of the top and of the bottom part, at most sz
+  unsigned sep_hi = sep; // index of the first hypothesis below the top part
+  if (sz > 4) while (sep_hi > 0 && sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; // ties stay in the top part
+  else sep_hi = 1;
+  bool b = false;
+  unsigned count = 0;
+  for (unsigned i = 0; i < sep_hi; i++) { // top part vs. everything below it
+    for (unsigned j = sep_hi; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      if ((*s)[i].model <= (*s)[j].model) {
+#endif
+      if (threshold > 0) {
+        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      } else {
+        if ((*s)[i].score != (*s)[j].score)
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      }
+      if (++count == max) {
+        b = true;
+        break;
+      }
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      }
+#endif
+    }
+    if (b) return; // limit reached: the second stage must not add pairs beyond 'max'
+  }
+  unsigned sep_lo = sz-sep; // index of the first hypothesis of the bottom part
+  while (sep_lo > 0 && sep_lo < sz && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; // ties join the bottom part
+  for (unsigned i = sep_hi; i < sep_lo; i++) { // middle part vs. bottom part
+    for (unsigned j = sep_lo; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      if ((*s)[i].model <= (*s)[j].model) {
+#endif
+      if (threshold > 0) {
+        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      } else {
+        if ((*s)[i].score != (*s)[j].score)
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      }
+      if (++count == max) return;
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      }
+#endif
+    }
+  }
+}
+
+/*
+ * pair sampling as in
+ * 'Tuning as Ranking' (Hopkins & May, 2011)
+ *     count = 5000
+ * threshold = 5% BLEU (0.05 for param 3)
+ *       cut = top 50
+ */
+inline bool // inline: the function is defined in a header and must not yield one symbol per including file
+_PRO_cmp_pair_by_diff_d(const pair<ScoredHyp,ScoredHyp>& a, const pair<ScoredHyp,ScoredHyp>& b)
+{
+  return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); // largest score gap first
+}
+inline void // 'max' is not read here: the limits are fixed at 5000 accepted pairs and the 50 largest score gaps
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
+{
+  unsigned max_count = 5000, count = 0, sz = s->size();
+  bool b = false;
+  for (unsigned i = 0; i+1 < sz; i++) { // not 'i < sz-1': sz-1 wraps around for an empty list
+    for (unsigned j = i+1; j < sz; j++) {
+      if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+        training.push_back(make_pair((*s)[i], (*s)[j]));
+        if (++count == max_count) { // only accepted pairs are counted
+          b = true;
+          break;
+        }
+      }
+    }
+    if (b) break;
+  }
+  if (training.size() > 50) { // keep the 50 pairs with the largest score difference
+    sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
+    training.erase(training.begin()+50, training.end());
+  }
+  return;
+}
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
new file mode 100755
index 00000000..1d277ff6
--- /dev/null
+++ b/training/dtrain/parallelize.rb
@@ -0,0 +1,79 @@
+#!/usr/bin/env ruby
+
+
+if ARGV.size != 5
+  STDERR.write "Usage: "
+  STDERR.write "ruby parallelize.rb <#shards> <input> <refs> <epochs> <dtrain.ini>\n"
+  exit 1 # a wrong invocation is an error, so do not report success
+end
+# hard-coded absolute paths of the external tools and the arguments for lplp.rb
+dtrain_bin = '/home/pks/bin/dtrain_local'
+ruby       = '/usr/bin/ruby'
+lplp_rb    = '/home/pks/mt/cdec-dtrain/dtrain/hstreaming/lplp.rb'
+lplp_args  = 'l2 select_k 100000'
+gzip       = '/bin/gzip'
+# command line arguments, in the order given by the usage message
+num_shards = ARGV[0].to_i
+input      = ARGV[1]
+refs       = ARGV[2]
+epochs     = ARGV[3].to_i
+ini        = ARGV[4]
+
+# all shard, log and weights files are written below work/
+`mkdir work`
+
+def make_shards(input, refs, num_shards)
+  # Deal the lines of input and refs out to num_shards file pairs under work/;
+  # lines left over after the even split are appended to the last shard.
+  line_count = `wc -l #{input}`.split.first.to_i
+  per_shard, remainder = line_count.divmod(num_shards)
+  source = File.new input, 'r'
+  target = File.new refs, 'r'
+  source_shards = []
+  target_shards = []
+  num_shards.times do |shard|
+    source_shard = File.new "work/shard.#{shard}.in", 'w+'
+    target_shard = File.new "work/shard.#{shard}.refs", 'w+'
+    per_shard.times do
+      source_shard.write source.gets
+      target_shard.write target.gets
+    end
+    source_shards << source_shard
+    target_shards << target_shard
+  end
+  remainder.times do
+    source_shards.last.write source.gets
+    target_shards.last.write target.gets
+  end
+  (source_shards + target_shards).each { |f| f.close }
+  source.close
+  target.close
+end
+
+make_shards input, refs, num_shards
+
+0.upto(epochs-1) { |epoch|
+  pids = []
+  input_weights = ''
+  if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end # continue from the previous epoch's weights
+  weights_files = [] # backticks run /bin/sh: use the portable '> file 2>&1' below, not bash's '&>'
+  0.upto(num_shards-1) { |shard|
+    pids << Kernel.fork { # one dtrain process per shard
+      `#{dtrain_bin} -c #{ini}\
+        --input work/shard.#{shard}.in\
+        --refs work/shard.#{shard}.refs #{input_weights}\
+        --output work/weights.#{shard}.#{epoch}\
+        > work/out.#{shard}.#{epoch} 2>&1`
+    }
+    weights_files << "work/weights.#{shard}.#{epoch}"
+  }
+  pids.each { |pid| Process.wait(pid) }
+  cat = File.new('work/weights_cat', 'w+')
+  weights_files.each { |f| cat.write File.read(f) } # File.read closes the file again
+  cat.close
+  `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch} 2>&1`
+}
+
+`rm work/weights_cat`
+`#{gzip} work/*`
+
diff --git a/training/dtrain/parallelize/test/cdec.ini b/training/dtrain/parallelize/test/cdec.ini
new file mode 100644
index 00000000..72e99dc5
--- /dev/null
+++ b/training/dtrain/parallelize/test/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/parallelize/test/dtrain.ini b/training/dtrain/parallelize/test/dtrain.ini
new file mode 100644
index 00000000..03f9d240
--- /dev/null
+++ b/training/dtrain/parallelize/test/dtrain.ini
@@ -0,0 +1,15 @@
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+loss_margin=0
+epochs=1
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=last
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+decoder_config=cdec.ini
diff --git a/training/dtrain/parallelize/test/in b/training/dtrain/parallelize/test/in
new file mode 100644
index 00000000..a312809f
--- /dev/null
+++ b/training/dtrain/parallelize/test/in
@@ -0,0 +1,10 @@
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/training/dtrain/parallelize/test/refs b/training/dtrain/parallelize/test/refs
new file mode 100644
index 00000000..4d3128cb
--- /dev/null
+++ b/training/dtrain/parallelize/test/refs
@@ -0,0 +1,10 @@
+barack obama becomes the fourth american president to receive the nobel peace prize
+the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
+he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
+the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
+first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
+the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
+he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
+the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
+the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
diff --git a/training/dtrain/score.cc b/training/dtrain/score.cc
new file mode 100644
index 00000000..34fc86a9
--- /dev/null
+++ b/training/dtrain/score.cc
@@ -0,0 +1,254 @@
+#include "score.h"
+
+namespace dtrain
+{
+
+
+/*
+ * bleu
+ *
+ * as in "BLEU: a Method for Automatic Evaluation
+ *        of Machine Translation"
+ * (Papineni et al. '02)
+ *
+ * NOTE: 0 if for one n \in {1..N} count is 0
+ */
+score_t
+BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
+{
+  if (hyp_len == 0 || ref_len == 0) return 0.;
+  const unsigned orders = (ref_len < N_) ? ref_len : N_;
+  vector<score_t> weights = w_;
+  if (ref_len < N_) { // reference too short for N_ orders: use ref_len orders, weighted uniformly
+    const score_t uniform = 1/((score_t)orders);
+    for (unsigned n = 0; n < orders; n++) weights[n] = uniform;
+  }
+  score_t log_precision = 0;
+  for (unsigned n = 0; n < orders; n++) {
+    if (counts.sum_[n] == 0 || counts.clipped_[n] == 0) return 0.;
+    log_precision += weights[n] * log((score_t)counts.clipped_[n]/counts.sum_[n]);
+  }
+  return brevity_penalty(hyp_len, ref_len) * exp(log_precision);
+}
+
+score_t
+BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                  const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!hyp_words || !ref_words) return 0.; // an empty side scores zero
+  NgramCounts stats = make_ngram_counts(hyp, ref, N_);
+  return Bleu(stats, hyp_words, ref_words);
+}
+
+/*
+ * 'stupid' bleu
+ *
+ * as in "ORANGE: a Method for Evaluating
+ *        Automatic Evaluation Metrics
+ *        for Machine Translation"
+ * (Lin & Och '04)
+ *
+ * NOTE: 0 iff no 1gram match
+ */
+score_t
+StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!hyp_words || !ref_words) return 0.;
+  NgramCounts stats = make_ngram_counts(hyp, ref, N_);
+  const unsigned orders = (ref_words < N_) ? ref_words : N_;
+  vector<score_t> weights = w_;
+  if (ref_words < N_) { // reference too short for N_ orders: weight the remaining ones uniformly
+    const score_t uniform = 1/((score_t)orders);
+    for (unsigned n = 0; n < orders; n++) weights[n] = uniform;
+  }
+  score_t log_precision = 0;
+  for (unsigned n = 0; n < orders; n++) {
+    if (n == 0 && (stats.sum_[n] == 0 || stats.clipped_[n] == 0)) return 0.;
+    const score_t add = (n == 0) ? 0 : 1; // add-one smoothing for every order above unigrams
+    log_precision += weights[n] * log(((score_t)stats.clipped_[n] + add)/((stats.sum_[n] + add)));
+  }
+  return brevity_penalty(hyp_words, ref_words) * exp(log_precision);
+}
+
+/*
+ * smooth bleu
+ *
+ * as in "An End-to-End Discriminative Approach
+ *        to Machine Translation"
+ * (Liang et al. '06)
+ *
+ * NOTE: max is 0.9375 (with N=4)
+ */
+score_t
+SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!hyp_words || !ref_words) return 0.;
+  NgramCounts stats = make_ngram_counts(hyp, ref, N_);
+  const unsigned orders = (ref_words < N_) ? ref_words : N_;
+  // log_bleu[m] accumulates the log of the geometric mean of the
+  // precisions of orders 1..m+1
+  vector<score_t> log_bleu(orders, 0.);
+  score_t total = 0.;
+  for (unsigned n = 0; n < orders; n++) {
+    // the first order without any match ends the summation;
+    // nothing is added for that order or any higher one
+    if (stats.sum_[n] == 0 || stats.clipped_[n] == 0) break;
+    const score_t log_precision = log((score_t)stats.clipped_[n]/stats.sum_[n]);
+    for (unsigned m = n; m < orders; m++) {
+      log_bleu[m] += (1/((score_t)m+1)) * log_precision;
+    }
+    // the BLEU of orders 1..n+1 enters with weight 1/2^(N_-n)
+    total += exp(log_bleu[n])/pow(2.0, (double)(N_-n));
+  }
+  return brevity_penalty(hyp_words, ref_words) * total;
+}
+
+/*
+ * 'sum' bleu
+ *
+ * sum up Ngram precisions
+ */
+score_t
+SumBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!hyp_words || !ref_words) return 0.;
+  NgramCounts stats = make_ngram_counts(hyp, ref, N_);
+  const unsigned orders = (ref_words < N_) ? ref_words : N_;
+  score_t total = 0.;
+  for (unsigned n = 0; n < orders; n++) {
+    // the first order without any match ends the summation
+    if (stats.sum_[n] == 0 || stats.clipped_[n] == 0) break;
+    const score_t precision = (score_t)stats.clipped_[n]/stats.sum_[n];
+    // the precision of order n+1 enters with weight 1/2^(N_-n)
+    total += precision/pow(2.0, (double)(N_-n));
+  }
+  return brevity_penalty(hyp_words, ref_words) * total;
+}
+
+/*
+ * 'sum' (exp) bleu
+ *
+ * sum up exp(Ngram precisions)
+ */
+score_t
+SumExpBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!hyp_words || !ref_words) return 0.;
+  NgramCounts stats = make_ngram_counts(hyp, ref, N_);
+  const unsigned orders = (ref_words < N_) ? ref_words : N_;
+  score_t total = 0.;
+  for (unsigned n = 0; n < orders; n++) {
+    // the first order without any match ends the summation
+    if (stats.sum_[n] == 0 || stats.clipped_[n] == 0) break;
+    const score_t precision = (score_t)stats.clipped_[n]/stats.sum_[n];
+    // exp(precision) of order n+1 enters with weight 1/2^(N_-n)
+    total += exp(precision)/pow(2.0, (double)(N_-n));
+  }
+  return brevity_penalty(hyp_words, ref_words) * total;
+}
+
+/*
+ * 'sum' (whatever) bleu
+ *
+ * sum up exp(weight * log(Ngram precisions))
+ */
+score_t
+SumWhateverBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!hyp_words || !ref_words) return 0.;
+  NgramCounts stats = make_ngram_counts(hyp, ref, N_);
+  const unsigned orders = (ref_words < N_) ? ref_words : N_;
+  vector<score_t> weights = w_;
+  if (ref_words < N_) { // reference too short for N_ orders: weight the remaining ones uniformly
+    const score_t uniform = 1/((score_t)orders);
+    for (unsigned n = 0; n < orders; n++) weights[n] = uniform;
+  }
+  score_t total = 0.;
+  for (unsigned n = 0; n < orders; n++) {
+    if (stats.sum_[n] == 0 || stats.clipped_[n] == 0) break;
+    const score_t precision = (score_t)stats.clipped_[n]/stats.sum_[n];
+    // exp(weight * log(precision)) of order n+1 enters with weight 1/2^(N_-n)
+    total += exp(weights[n] * log(precision))/pow(2.0, (double)(N_-n));
+  }
+  return brevity_penalty(hyp_words, ref_words) * total;
+}
+
+/*
+ * approx. bleu
+ *
+ * as in "Online Large-Margin Training of Syntactic
+ *        and Structural Translation Features"
+ * (Chiang et al. '08)
+ *
+ * NOTE: Needs some more code in dtrain.cc .
+ *       No scaling by src len.
+ */
+score_t
+ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned rank, const unsigned src_len)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!ref_words) return 0.;
+  score_t result = 0.;
+  NgramCounts stats(N_); // stays all-zero for an empty hypothesis
+  if (hyp_words) {
+    stats = make_ngram_counts(hyp, ref, N_);
+    NgramCounts in_context = glob_onebest_counts_ + stats; // sentence counts on top of the running 1best counts
+    result = Bleu(in_context, hyp_words, ref_words);
+  }
+  if (!rank) { // rank 0 extends the 'context of 1best translations'
+    glob_onebest_counts_ += stats;
+    glob_onebest_counts_ *= discount_;
+    glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_words);
+    glob_ref_len_ = discount_ * (glob_ref_len_ + ref_words);
+    glob_src_len_ = discount_ * (glob_src_len_ + src_len);
+  }
+  return result;
+}
+
+/*
+ * Linear (Corpus) Bleu
+ *
+ * as in "Lattice Minimum Bayes-Risk Decoding
+ *        for Statistical Machine Translation"
+ * (Tromble et al. '08)
+ *
+ */
+score_t
+LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned rank, const unsigned /*src_len*/)
+{
+  const unsigned hyp_words = hyp.size(), ref_words = ref.size();
+  if (!ref_words) return 0.;
+  const unsigned orders = (ref_words < N_) ? ref_words : N_;
+  NgramCounts stats(orders);
+  if (hyp_words) stats = make_ngram_counts(hyp, ref, orders);
+  // gain: per-order ratios of this hypothesis' n-gram counts to the
+  // running 1best counts, summed until the first order with a zero count
+  score_t gain = 0.;
+  for (unsigned n = 0; n < orders; n++) {
+    if (stats.sum_[n] == 0 || onebest_counts_.sum_[n] == 0) break;
+    gain += stats.sum_[n]/onebest_counts_.sum_[n];
+  }
+  const score_t result = -(hyp_words/(score_t)onebest_len_) + (1./orders) * gain;
+  if (!rank) { // only rank 0 hypotheses feed the running 1best statistics
+    onebest_len_ += hyp_words;
+    onebest_counts_ += stats;
+  }
+  return result;
+}
+
+
+} // namespace
+
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
new file mode 100644
index 00000000..f317c903
--- /dev/null
+++ b/training/dtrain/score.h
@@ -0,0 +1,212 @@
+#ifndef _DTRAIN_SCORE_H_
+#define _DTRAIN_SCORE_H_
+
+#include "kbestget.h"
+
+using namespace std;
+
+namespace dtrain
+{
+
+
+struct NgramCounts // per-order n-gram statistics; index i holds the counts for (i+1)-grams
+{
+  unsigned N_; // number of n-gram orders tracked
+  map<unsigned, score_t> clipped_; // counts clipped by the reference count, see Add()
+  map<unsigned, score_t> sum_; // unclipped counts
+
+  NgramCounts(const unsigned N) : N_(N) { Zero(); }
+
+  inline void
+  operator+=(const NgramCounts& rhs)
+  {
+    if (rhs.N_ > N_) Resize(rhs.N_);
+    for (unsigned i = 0; i < rhs.N_; i++) { // only the orders rhs tracks: find() yields end() for any other
+      this->clipped_[i] += rhs.clipped_.find(i)->second;
+      this->sum_[i] += rhs.sum_.find(i)->second;
+    }
+  }
+
+  inline const NgramCounts
+  operator+(const NgramCounts &other) const
+  {
+    NgramCounts result = *this;
+    result += other;
+    return result;
+  }
+
+  inline void
+  operator*=(const score_t rhs)
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      this->clipped_[i] *= rhs;
+      this->sum_[i] *= rhs;
+    }
+  }
+
+  inline void
+  Add(const unsigned count, const unsigned ref_count, const unsigned i)
+  {
+    assert(i < N_);
+    if (count > ref_count) { // clipping: credit at most as many occurrences as the reference has
+      clipped_[i] += ref_count;
+    } else {
+      clipped_[i] += count;
+    }
+    sum_[i] += count;
+  }
+
+  inline void
+  Zero()
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      clipped_[i] = 0.;
+      sum_[i] = 0.;
+    }
+  }
+
+  inline void
+  One()
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      clipped_[i] = 1.;
+      sum_[i] = 1.;
+    }
+  }
+
+  inline void
+  Print()
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
+    }
+  }
+
+  inline void Resize(unsigned N)
+  {
+    if (N == N_) return;
+    else if (N > N_) {
+      for (unsigned i = N_; i < N; i++) {
+        clipped_[i] = 0.;
+        sum_[i] = 0.;
+      }
+    } else { // N < N_
+      for (unsigned i = N_; i > N; i--) { // drops orders N_-1 down to N; 'N-1' would wrap around for N == 0
+        clipped_.erase(i-1);
+        sum_.erase(i-1);
+      }
+    }
+    N_ = N;
+  }
+};
+
+typedef map<vector<WordID>, unsigned> Ngrams;
+
+inline Ngrams
+make_ngrams(const vector<WordID>& s, const unsigned N)
+{
+  Ngrams table; // occurrence count of every n-gram of s with at most N words
+  for (size_t start = 0; start < s.size(); start++) {
+    const size_t stop = min(start+N, s.size());
+    vector<WordID> prefix; // grows word by word from 'start'
+    for (unsigned pos = start; pos < stop; pos++) {
+      prefix.push_back(s[pos]);
+      table[prefix]++;
+    }
+  }
+  return table;
+}
+
+inline NgramCounts
+make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N)
+{
+  // For each distinct hypothesis n-gram, record its count together with its
+  // count in the reference (0 when the reference lacks it); Add() does the clipping.
+  Ngrams in_hyp = make_ngrams(hyp, N);
+  Ngrams in_ref = make_ngrams(ref, N);
+  NgramCounts result(N);
+  for (Ngrams::iterator h = in_hyp.begin(); h != in_hyp.end(); ++h) {
+    const Ngrams::iterator r = in_ref.find(h->first);
+    const unsigned hyp_count = h->second;
+    unsigned ref_count = 0;
+    if (r != in_ref.end()) ref_count = r->second;
+    const unsigned order_index = h->first.size() - 1;
+    result.Add(hyp_count, ref_count, order_index);
+  }
+  return result;
+}
+
+struct BleuScorer : public LocalScorer // BLEU as in Papineni et al. '02; the member functions are defined in score.cc
+{
+  score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); // BLEU from precomputed n-gram counts
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); // counts the n-grams, then calls Bleu()
+};
+
+struct StupidBleuScorer : public LocalScorer // 'stupid' BLEU as in Lin & Och '04; Score() is defined in score.cc
+{
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SmoothBleuScorer : public LocalScorer // smooth BLEU as in Liang et al. '06; Score() is defined in score.cc
+{
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumBleuScorer : public LocalScorer // 'sum' BLEU: sums up the n-gram precisions; Score() is defined in score.cc
+{
+   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumExpBleuScorer : public LocalScorer // 'sum' (exp) BLEU: sums up exp(n-gram precision); Score() is defined in score.cc
+{
+   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumWhateverBleuScorer : public LocalScorer // sums up exp(weight * log(n-gram precision)); Score() is defined in score.cc
+{
+   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct ApproxBleuScorer : public BleuScorer // approx. BLEU as in Chiang et al. '08; Score() is defined in score.cc
+{
+  NgramCounts glob_onebest_counts_; // discounted n-gram counts accumulated by Score() for rank 0 hypotheses
+  unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_; // NOTE(review): Score() assigns discount_ * (...) to these, which truncates if score_t is a floating-point type -- confirm this is intended
+  score_t discount_; // factor applied to the accumulated statistics on every rank 0 update
+
+  ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d)
+  {
+    glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0;
+  }
+
+  inline void Reset() { // clears everything accumulated so far
+    glob_onebest_counts_.Zero();
+    glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
+  }
+
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
+};
+
+struct LinearBleuScorer : public BleuScorer // linear (corpus) BLEU as in Tromble et al. '08; Score() is defined in score.cc
+{
+  unsigned onebest_len_; // hypothesis lengths summed over the rank 0 calls of Score(), starting at 1
+  NgramCounts onebest_counts_; // n-gram counts summed over the rank 0 calls of Score(), every count starting at 1
+
+  LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N)
+  {
+    onebest_counts_.One();
+  }
+
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+
+  inline void Reset() { // back to the initial state set up by the constructor
+    onebest_len_ = 1;
+    onebest_counts_.One();
+  }
+};
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README
new file mode 100644
index 00000000..6937b11b
--- /dev/null
+++ b/training/dtrain/test/example/README
@@ -0,0 +1,8 @@
+Small example of input format for distributed training.
+Call dtrain from cdec/training/dtrain/ with ./dtrain -c test/example/dtrain.ini .
+
+For this to work, undef 'DTRAIN_LOCAL' in dtrain.h
+and recompile.
+
+Data is here: http://simianer.de/#dtrain
+
diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini
new file mode 100644
index 00000000..d5955f0e
--- /dev/null
+++ b/training/dtrain/test/example/cdec.ini
@@ -0,0 +1,25 @@
+formalism=scfg
+add_pass_through_rules=true
+scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=30
+feature_function=WordPenalty
+feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+# all currently working feature functions for translation:
+# (with those features active that were used in the ACL paper)
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+feature_function=RuleIdentityFeatures
+feature_function=RuleSourceBigramFeatures
+feature_function=RuleTargetBigramFeatures
+feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini
new file mode 100644
index 00000000..72d50ca1
--- /dev/null
+++ b/training/dtrain/test/example/dtrain.ini
@@ -0,0 +1,22 @@
+input=test/example/nc-wmt11.1k.gz    # use '-' for STDIN
+output=-                             # a weights file (add .gz for gzip compression) or STDOUT '-'
+select_weights=VOID                  # don't output weights
+decoder_config=test/example/cdec.ini # config for cdec
+# weights for these features will be printed on each iteration
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+stop_after=10 # stop epoch after 10 inputs
+
+# interesting stuff
+epochs=2                # run over input 2 times
+k=100                   # use 100best lists
+N=4                     # optimize (approx) BLEU4
+scorer=stupid_bleu      # use 'stupid' BLEU+1
+learning_rate=1.0       # learning rate, don't care if gamma=0 (perceptron)
+gamma=0                 # use SVM reg
+sample_from=kbest       # use kbest lists (as opposed to forest)
+filter=uniq             # only unique entries in kbest (surface form)
+pair_sampling=XYX
+hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
+pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
+loss_margin=0
diff --git a/training/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output
new file mode 100644
index 00000000..05326763
--- /dev/null
+++ b/training/dtrain/test/example/expected-output
@@ -0,0 +1,89 @@
+                cdec cfg 'test/example/cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading test/example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+  Example feature: Shape_S00000_T00000
+Seeding random number sequence to 2912000813
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 2
+                 scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 1
+                   gamma 0
+             loss margin 0
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'VOID'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'test/example/cdec.ini'
+                   input 'test/example/nc-wmt11.1k.gz'
+                  output '-'
+              stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+              Glue = -637
+       WordPenalty = +1064
+     LanguageModel = +1175.3
+ LanguageModel_OOV = -1437
+     PhraseModel_0 = +1935.6
+     PhraseModel_1 = +2499.3
+     PhraseModel_2 = +964.96
+     PhraseModel_3 = +1410.8
+     PhraseModel_4 = -5977.9
+     PhraseModel_5 = +522
+     PhraseModel_6 = +1089
+       PassThrough = -1308
+        ---
+       1best avg score: 0.16963 (+0.16963)
+ 1best avg model score: 64485 (+64485)
+           avg # pairs: 1494.4
+        avg # rank err: 702.6
+     avg # margin viol: 0
+    non0 feature count: 528
+           avg list sz: 85.7
+           avg f count: 102.75
+(time 0.083 min, 0.5 s/S)
+
+Iteration #2 of 2.
+ . 10
+WEIGHTS
+              Glue = -1196
+       WordPenalty = +809.52
+     LanguageModel = +3112.1
+ LanguageModel_OOV = -1464
+     PhraseModel_0 = +3895.5
+     PhraseModel_1 = +4683.4
+     PhraseModel_2 = +1092.8
+     PhraseModel_3 = +1079.6
+     PhraseModel_4 = -6827.7
+     PhraseModel_5 = -888
+     PhraseModel_6 = +142
+       PassThrough = -1335
+        ---
+       1best avg score: 0.277 (+0.10736)
+ 1best avg model score: -3110.5 (-67595)
+           avg # pairs: 1144.2
+        avg # rank err: 529.1
+     avg # margin viol: 0
+    non0 feature count: 859
+           avg list sz: 74.9
+           avg f count: 112.84
+(time 0.067 min, 0.4 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 2 [SCORE 'stupid_bleu'=0.277].
+This took 0.15 min.
diff --git a/training/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini
new file mode 100644
index 00000000..72e99dc5
--- /dev/null
+++ b/training/dtrain/test/parallelize/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini
new file mode 100644
index 00000000..03f9d240
--- /dev/null
+++ b/training/dtrain/test/parallelize/dtrain.ini
@@ -0,0 +1,15 @@
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+loss_margin=0
+epochs=1
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=last
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+decoder_config=cdec.ini
diff --git a/training/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in
new file mode 100644
index 00000000..a312809f
--- /dev/null
+++ b/training/dtrain/test/parallelize/in
@@ -0,0 +1,10 @@
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/training/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs
new file mode 100644
index 00000000..4d3128cb
--- /dev/null
+++ b/training/dtrain/test/parallelize/refs
@@ -0,0 +1,10 @@
+barack obama becomes the fourth american president to receive the nobel peace prize
+the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
+he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
+the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
+first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
+the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
+he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
+the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
+the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
diff --git a/training/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini
new file mode 100644
index 00000000..98b02d44
--- /dev/null
+++ b/training/dtrain/test/toy/cdec.ini
@@ -0,0 +1,2 @@
+formalism=scfg
+add_pass_through_rules=true
diff --git a/training/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini
new file mode 100644
index 00000000..a091732f
--- /dev/null
+++ b/training/dtrain/test/toy/dtrain.ini
@@ -0,0 +1,12 @@
+decoder_config=test/toy/cdec.ini
+input=test/toy/input
+output=-
+print_weights=logp shell_rule house_rule small_rule little_rule PassThrough
+k=4
+N=4
+epochs=2
+scorer=bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=all
+learning_rate=1
diff --git a/training/dtrain/test/toy/input b/training/dtrain/test/toy/input
new file mode 100644
index 00000000..4d10a9ea
--- /dev/null
+++ b/training/dtrain/test/toy/input
@@ -0,0 +1,2 @@
+0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0	[NP] ||| ich ||| i ||| logp=0	[NP] ||| ein [NN,1] ||| a [1] ||| logp=0	[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1	[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1	[JJ] ||| kleines ||| small ||| logp=0 small_rule=1	[JJ] ||| kleines ||| little ||| logp=0 little_rule=1	[JJ] ||| grosses ||| big ||| logp=0	[JJ] ||| grosses ||| large ||| logp=0	[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0	[V] ||| sah ||| saw ||| logp=0	[V] ||| fand ||| found ||| logp=0
+1	ich fand ein kleines haus	i found a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0	[NP] ||| ich ||| i ||| logp=0	[NP] ||| ein [NN,1] ||| a [1] ||| logp=0	[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1	[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1	[JJ] ||| kleines ||| small ||| logp=0 small_rule=1	[JJ] ||| kleines ||| little ||| logp=0 little_rule=1	[JJ] ||| grosses ||| big ||| logp=0	[JJ] ||| grosses ||| large ||| logp=0	[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0	[V] ||| sah ||| saw ||| logp=0	[V] ||| fand ||| found ||| logp=0
diff --git a/training/entropy.cc b/training/entropy.cc
deleted file mode 100644
index 4fdbe2be..00000000
--- a/training/entropy.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-#include "entropy.h"
-
-#include "prob.h"
-#include "candidate_set.h"
-
-using namespace std;
-
-namespace training {
-
-// see Mann and McCallum "Efficient Computation of Entropy Gradient ..." for
-// a mostly clear derivation of:
-//   g = E[ F(x,y) * log p(y|x) ] + H(y | x) * E[ F(x,y) ]
-double CandidateSetEntropy::operator()(const vector<double>& params,
-                                       SparseVector<double>* g) const {
-  prob_t z;
-  vector<double> dps(cands_.size());
-  for (unsigned i = 0; i < cands_.size(); ++i) {
-    dps[i] = cands_[i].fmap.dot(params);
-    const prob_t u(dps[i], init_lnx());
-    z += u;
-  }
-  const double log_z = log(z);
-
-  SparseVector<double> exp_feats;
-  double entropy = 0;
-  for (unsigned i = 0; i < cands_.size(); ++i) {
-    const double log_prob = cands_[i].fmap.dot(params) - log_z;
-    const double prob = exp(log_prob);
-    const double e_logprob = prob * log_prob;
-    entropy -= e_logprob;
-    if (g) {
-      (*g) += cands_[i].fmap * e_logprob;
-      exp_feats += cands_[i].fmap * prob;
-    }
-  }
-  if (g) (*g) += exp_feats * entropy;
-  return entropy;
-}
-
-}
-
diff --git a/training/entropy.h b/training/entropy.h
deleted file mode 100644
index 796589ca..00000000
--- a/training/entropy.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _CSENTROPY_H_
-#define _CSENTROPY_H_
-
-#include <vector>
-#include "sparse_vector.h"
-
-namespace training {
-  class CandidateSet;
-
-  class CandidateSetEntropy {
-   public:
-    explicit CandidateSetEntropy(const CandidateSet& cs) : cands_(cs) {}
-    // compute the entropy (expected log likelihood) of a CandidateSet
-    // (optional) the gradient of the entropy with respect to params
-    double operator()(const std::vector<double>& params,
-                      SparseVector<double>* g = NULL) const;
-   private:
-    const CandidateSet& cands_;
-  };
-};
-
-#endif
diff --git a/training/fast_align.cc b/training/fast_align.cc
deleted file mode 100644
index 7492d26f..00000000
--- a/training/fast_align.cc
+++ /dev/null
@@ -1,281 +0,0 @@
-#include <iostream>
-#include <cmath>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "m.h"
-#include "corpus_tools.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "ttables.h"
-#include "tdict.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input,i",po::value<string>(),"Parallel corpus input file")
-        ("reverse,r","Reverse estimation (swap source and target during training)")
-        ("iterations,I",po::value<unsigned>()->default_value(5),"Number of iterations of EM training")
-        //("bidir,b", "Run bidirectional alignment")
-        ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal")
-        ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?")
-        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)")
-        ("variational_bayes,v","Infer VB estimate of parameters under a symmetric Dirichlet prior")
-        ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior")
-        ("no_null_word,N","Do not generate from a null token")
-        ("output_parameters,p", "Write model parameters instead of alignments")
-        ("beam_threshold,t",po::value<double>()->default_value(-4),"When writing parameters, log_10 of beam threshold for writing parameter (-10000 to include everything, 0 max parameter only)")
-        ("hide_training_alignments,H", "Hide training alignments (only useful if you want to use -x option and just compute testset statistics)")
-        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model")
-        ("no_add_viterbi,V","When writing model parameters, do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || conf->count("input") == 0) {
-    cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf)) return 1;
-  const string fname = conf["input"].as<string>();
-  const bool reverse = conf.count("reverse") > 0;
-  const int ITERATIONS = conf["iterations"].as<unsigned>();
-  const double BEAM_THRESHOLD = pow(10.0, conf["beam_threshold"].as<double>());
-  const bool use_null = (conf.count("no_null_word") == 0);
-  const WordID kNULL = TD::Convert("<eps>");
-  const bool add_viterbi = (conf.count("no_add_viterbi") == 0);
-  const bool variational_bayes = (conf.count("variational_bayes") > 0);
-  const bool write_alignments = (conf.count("output_parameters") == 0);
-  const double diagonal_tension = conf["diagonal_tension"].as<double>();
-  const double prob_align_null = conf["prob_align_null"].as<double>();
-  const bool hide_training_alignments = (conf.count("hide_training_alignments") > 0);
-  string testset;
-  if (conf.count("testset")) testset = conf["testset"].as<string>();
-  const double prob_align_not_null = 1.0 - prob_align_null;
-  const double alpha = conf["alpha"].as<double>();
-  const bool favor_diagonal = conf.count("favor_diagonal");
-  if (variational_bayes && alpha <= 0.0) {
-    cerr << "--alpha must be > 0\n";
-    return 1;
-  }
-
-  TTable s2t, t2s;
-  TTable::Word2Word2Double s2t_viterbi;
-  double tot_len_ratio = 0;
-  double mean_srclen_multiplier = 0;
-  vector<double> unnormed_a_i;
-  for (int iter = 0; iter < ITERATIONS; ++iter) {
-    const bool final_iteration = (iter == (ITERATIONS - 1));
-    cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl;
-    ReadFile rf(fname);
-    istream& in = *rf.stream();
-    double likelihood = 0;
-    double denom = 0.0;
-    int lc = 0;
-    bool flag = false;
-    string line;
-    string ssrc, strg;
-    vector<WordID> src, trg;
-    while(true) {
-      getline(in, line);
-      if (!in) break;
-      ++lc;
-      if (lc % 1000 == 0) { cerr << '.'; flag = true; }
-      if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
-      src.clear(); trg.clear();
-      CorpusTools::ReadLine(line, &src, &trg);
-      if (reverse) swap(src, trg);
-      if (src.size() == 0 || trg.size() == 0) {
-        cerr << "Error: " << lc << "\n" << line << endl;
-        return 1;
-      }
-      if (src.size() > unnormed_a_i.size())
-        unnormed_a_i.resize(src.size());
-      if (iter == 0)
-        tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size());
-      denom += trg.size();
-      vector<double> probs(src.size() + 1);
-      bool first_al = true;  // used for write_alignments
-      for (int j = 0; j < trg.size(); ++j) {
-        const WordID& f_j = trg[j];
-        double sum = 0;
-        const double j_over_ts = double(j) / trg.size();
-        double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
-        if (use_null) {
-          if (favor_diagonal) prob_a_i = prob_align_null;
-          probs[0] = s2t.prob(kNULL, f_j) * prob_a_i;
-          sum += probs[0];
-        }
-        double az = 0;
-        if (favor_diagonal) {
-          for (int ta = 0; ta < src.size(); ++ta) {
-            unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
-            az += unnormed_a_i[ta];
-          }
-          az /= prob_align_not_null;
-        }
-        for (int i = 1; i <= src.size(); ++i) {
-          if (favor_diagonal)
-            prob_a_i = unnormed_a_i[i-1] / az;
-          probs[i] = s2t.prob(src[i-1], f_j) * prob_a_i;
-          sum += probs[i];
-        }
-        if (final_iteration) {
-          if (add_viterbi || write_alignments) {
-            WordID max_i = 0;
-            double max_p = -1;
-            int max_index = -1;
-            if (use_null) {
-              max_i = kNULL;
-              max_index = 0;
-              max_p = probs[0];
-            }
-            for (int i = 1; i <= src.size(); ++i) {
-              if (probs[i] > max_p) {
-                max_index = i;
-                max_p = probs[i];
-                max_i = src[i-1];
-              }
-            }
-            if (!hide_training_alignments && write_alignments) {
-              if (max_index > 0) {
-                if (first_al) first_al = false; else cout << ' ';
-                if (reverse)
-                  cout << j << '-' << (max_index - 1);
-                else
-                  cout << (max_index - 1) << '-' << j;
-              }
-            }
-            s2t_viterbi[max_i][f_j] = 1.0;
-          }
-        } else {
-          if (use_null)
-            s2t.Increment(kNULL, f_j, probs[0] / sum);
-          for (int i = 1; i <= src.size(); ++i)
-            s2t.Increment(src[i-1], f_j, probs[i] / sum);
-        }
-        likelihood += log(sum);
-      }
-      if (write_alignments && final_iteration && !hide_training_alignments) cout << endl;
-    }
-
-    // log(e) = 1.0
-    double base2_likelihood = likelihood / log(2);
-
-    if (flag) { cerr << endl; }
-    if (iter == 0) {
-      mean_srclen_multiplier = tot_len_ratio / lc;
-      cerr << "expected target length = source length * " << mean_srclen_multiplier << endl;
-    }
-    cerr << "  log_e likelihood: " << likelihood << endl;
-    cerr << "  log_2 likelihood: " << base2_likelihood << endl;
-    cerr << "   cross entropy: " << (-base2_likelihood / denom) << endl;
-    cerr << "      perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
-    if (!final_iteration) {
-      if (variational_bayes)
-        s2t.NormalizeVB(alpha);
-      else
-        s2t.Normalize();
-    }
-  }
-  if (testset.size()) {
-    ReadFile rf(testset);
-    istream& in = *rf.stream();
-    int lc = 0;
-    double tlp = 0;
-    string line;
-    while (getline(in, line)) {
-      ++lc;
-      vector<WordID> src, trg;
-      CorpusTools::ReadLine(line, &src, &trg);
-      cout << TD::GetString(src) << " ||| " << TD::GetString(trg) << " |||";
-      if (reverse) swap(src, trg);
-      double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier);
-      if (src.size() > unnormed_a_i.size())
-        unnormed_a_i.resize(src.size());
-
-      // compute likelihood
-      for (int j = 0; j < trg.size(); ++j) {
-        const WordID& f_j = trg[j];
-        double sum = 0;
-        int a_j = 0;
-        double max_pat = 0;
-        const double j_over_ts = double(j) / trg.size();
-        double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
-        if (use_null) {
-          if (favor_diagonal) prob_a_i = prob_align_null;
-          max_pat = s2t.prob(kNULL, f_j) * prob_a_i;
-          sum += max_pat;
-        }
-        double az = 0;
-        if (favor_diagonal) {
-          for (int ta = 0; ta < src.size(); ++ta) {
-            unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
-            az += unnormed_a_i[ta];
-          }
-          az /= prob_align_not_null;
-        }
-        for (int i = 1; i <= src.size(); ++i) {
-          if (favor_diagonal)
-            prob_a_i = unnormed_a_i[i-1] / az;
-          double pat = s2t.prob(src[i-1], f_j) * prob_a_i;
-          if (pat > max_pat) { max_pat = pat; a_j = i; }
-          sum += pat;
-        }
-        log_prob += log(sum);
-        if (write_alignments) {
-          if (a_j > 0) {
-            cout << ' ';
-            if (reverse)
-              cout << j << '-' << (a_j - 1);
-            else
-              cout << (a_j - 1) << '-' << j;
-          }
-        }
-      }
-      tlp += log_prob;
-      cout << " ||| " << log_prob << endl << flush;
-    } // loop over test set sentences
-    cerr << "TOTAL LOG PROB " << tlp << endl;
-  }
-
-  if (write_alignments) return 0;
-
-  for (TTable::Word2Word2Double::iterator ei = s2t.ttable.begin(); ei != s2t.ttable.end(); ++ei) {
-    const TTable::Word2Double& cpd = ei->second;
-    const TTable::Word2Double& vit = s2t_viterbi[ei->first];
-    const string& esym = TD::Convert(ei->first);
-    double max_p = -1;
-    for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi)
-      if (fi->second > max_p) max_p = fi->second;
-    const double threshold = max_p * BEAM_THRESHOLD;
-    for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) {
-      if (fi->second > threshold || (vit.find(fi->first) != vit.end())) {
-        cout << esym << ' ' << TD::Convert(fi->first) << ' ' << log(fi->second) << endl;
-      }
-    } 
-  }
-  return 0;
-}
-
diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc
deleted file mode 100644
index f1a85495..00000000
--- a/training/feature_expectations.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-#include <tr1/memory>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "online_optimizer.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-namespace mpi = boost::mpi;
-#endif
-
-using namespace std;
-namespace po = boost::program_options;
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  sort(fnums.begin(), fnums.end(), FComp(w));
-  for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) {
-    if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl;
-  }
-}
-
-void ReadConfig(const string& ini, vector<string>* out) {
-  ReadFile rf(ini);
-  istream& in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (!in) continue;
-    out->push_back(line);
-  }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
-  ostringstream os;
-  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
-  o->str(os.str());
-}
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input,i",po::value<string>(),"Corpus of source language sentences")
-        ("weights,w",po::value<string>(),"Input feature weights file")
-        ("decoder_config,c",po::value<string>(), "cdec.ini file");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int id = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (id % size == rank) {
-      c->push_back(line);
-      order->push_back(id);
-    }
-    ++id;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    acc_exp.clear();
-    total_complete = 0;
-  } 
-
-  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
-    cur_model_exp.clear();
-    state = 1;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 1);
-    state = 2;
-    const prob_t z = InsideOutside<prob_t,
-                                   EdgeProb,
-                                   SparseVector<prob_t>,
-                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
-    cur_model_exp /= z;
-    acc_exp += cur_model_exp;
-  }
-
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    cerr << "IGNORING ALIGNMENT FOREST!\n";
-  }
-
-  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
-    if (state == 2) {
-      ++total_complete;
-    }
-  }
-
-  void GetExpectations(SparseVector<double>* g) const {
-    g->clear();
-    for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it)
-      g->set_value(it->first, it->second);
-  }
-
-  int total_complete;
-  SparseVector<prob_t> cur_model_exp;
-  SparseVector<prob_t> acc_exp;
-  int state;
-};
-
-#ifdef HAVE_MPI
-namespace boost { namespace mpi {
-  template<>
-  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > 
-    : mpl::true_ { };
-} } // end namespace boost::mpi
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return 1;
-
-  // load initial weights
-  Weights weights;
-  if (conf.count("weights"))
-    weights.InitFromFile(conf["weights"].as<string>());
-
-  vector<string> corpus;
-  vector<int> ids;
-  ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids);
-  assert(corpus.size() > 0);
-
-  vector<string> cdec_ini;
-  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
-  istringstream ini;
-  StoreConfig(cdec_ini, &ini);
-  Decoder decoder(&ini);
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    return 1;
-  }
-
-  SparseVector<double> x;
-  weights.InitSparseVector(&x);
-  TrainingObserver observer;
-
-  weights.InitFromVector(x);
-  vector<double> lambdas;
-  weights.InitVector(&lambdas);
-  decoder.SetWeights(lambdas);
-  observer.Reset();
-  for (unsigned i = 0; i < corpus.size(); ++i) {
-    int id = ids[i];
-    decoder.SetId(id);
-    decoder.Decode(corpus[i], &observer);
-  }
-  SparseVector<double> local_exps, exps;
-  observer.GetExpectations(&local_exps);
-#ifdef HAVE_MPI
-  reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0);
-#else
-  exps.swap(local_exps);
-#endif
-
-  weights.InitFromVector(exps);
-  weights.InitVector(&lambdas);
-  ShowFeatures(lambdas);
-
-  return 0;
-}
diff --git a/training/grammar_convert.cc b/training/grammar_convert.cc
deleted file mode 100644
index 607a7cb9..00000000
--- a/training/grammar_convert.cc
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
-  this program modifies cfg hypergraphs (forests) and extracts kbests?
-  what are: json, split ?
- */
-#include <iostream>
-#include <algorithm>
-#include <sstream>
-
-#include <boost/lexical_cast.hpp>
-#include <boost/program_options.hpp>
-
-#include "inside_outside.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "kbest.h"
-#include "viterbi.h"
-#include "weights.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-WordID kSTART;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input,i", po::value<string>()->default_value("-"), "Input file")
-        ("format,f", po::value<string>()->default_value("cfg"), "Input format. Values: cfg, json, split")
-        ("output,o", po::value<string>()->default_value("json"), "Output command. Values: json, 1best")
-        ("reorder,r", "Add Yamada & Knight (2002) reorderings")
-        ("weights,w", po::value<string>(), "Feature weights for k-best derivations [optional]")
-        ("collapse_weights,C", "Collapse order features into a single feature whose value is all of the locally applying feature weights")
-        ("k_derivations,k", po::value<int>(), "Show k derivations and their features")
-        ("max_reorder,m", po::value<int>()->default_value(999), "Move a constituent at most this far")
-        ("help,h", "Print this help message and exit");
-  po::options_description clo("Command line options");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  po::notify(*conf);
-
-  if (conf->count("help") || conf->count("input") == 0) {
-    cerr << "\nUsage: grammar_convert [-options]\n\nConverts a grammar file (in Hiero format) into JSON hypergraph.\n";
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-int GetOrCreateNode(const WordID& lhs, map<WordID, int>* lhs2node, Hypergraph* hg) {
-  int& node_id = (*lhs2node)[lhs];
-  if (!node_id)
-    node_id = hg->AddNode(lhs)->id_ + 1;
-  return node_id - 1;
-}
-
-void FilterAndCheckCorrectness(int goal, Hypergraph* hg) {
-  if (goal < 0) {
-    cerr << "Error! [S] not found in grammar!\n";
-    exit(1);
-  }
-  if (hg->nodes_[goal].in_edges_.size() != 1) {
-    cerr << "Error! [S] has more than one rewrite!\n";
-    exit(1);
-  }
-  int old_size = hg->nodes_.size();
-  hg->TopologicallySortNodesAndEdges(goal);
-  if (hg->nodes_.size() != old_size) {
-    cerr << "Warning! During sorting " << (old_size - hg->nodes_.size()) << " disappeared!\n";
-  }
-  vector<double> inside; // inside score at each node
-  double p = Inside<double, TransitionCountWeightFunction>(*hg, &inside);
-  if (!p) {
-    cerr << "Warning! Grammar defines the empty language!\n";
-    hg->clear();
-    return;
-  }
-  vector<bool> prune(hg->edges_.size(), false);
-  int bad_edges = 0;
-  for (unsigned i = 0; i < hg->edges_.size(); ++i) {
-    Hypergraph::Edge& edge = hg->edges_[i];
-    bool bad = false;
-    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
-      if (!inside[edge.tail_nodes_[j]]) {
-        bad = true;
-        ++bad_edges;
-      }
-    }
-    prune[i] = bad;
-  }
-  cerr << "Removing " << bad_edges << " bad edges from the grammar.\n";
-  for (unsigned i = 0; i < hg->edges_.size(); ++i) {
-    if (prune[i])
-      cerr << "   " << hg->edges_[i].rule_->AsString() << endl;
-  }
-  hg->PruneEdges(prune);
-}
-
-void CreateEdge(const TRulePtr& r, const Hypergraph::TailNodeVector& tail, Hypergraph::Node* head_node, Hypergraph* hg) {
-  Hypergraph::Edge* new_edge = hg->AddEdge(r, tail);
-  hg->ConnectEdgeToHeadNode(new_edge, head_node);
-  new_edge->feature_values_ = r->scores_;
-}
-
-// from a category label like "NP_2", return "NP"
-string PureCategory(WordID cat) {
-  assert(cat < 0);
-  string c = TD::Convert(cat*-1);
-  size_t p = c.find("_");
-  if (p == string::npos) return c;
-  return c.substr(0, p);
-};
-
-string ConstituentOrderFeature(const TRule& rule, const vector<int>& pi) {
-  const static string kTERM_VAR = "x";
-  const vector<WordID>& f = rule.f();
-  map<string, int> used;
-  vector<string> terms(f.size());
-  for (int i = 0; i < f.size(); ++i) {
-    const string term = (f[i] < 0 ? PureCategory(f[i]) : kTERM_VAR);
-    int& count = used[term];
-    if (!count) {
-      terms[i] = term;
-    } else {
-      ostringstream os;
-      os << term << count;
-      terms[i] = os.str();
-    }
-    ++count;
-  }
-  ostringstream os;
-  os << PureCategory(rule.GetLHS()) << ':';
-  for (int i = 0; i < f.size(); ++i) {
-    if (i > 0) os << '_';
-    os << terms[pi[i]];
-  }
-  return os.str();
-}
-
-bool CheckPermutationMask(const vector<int>& mask, const vector<int>& pi) {
-  assert(mask.size() == pi.size());
-
-  int req_min = -1;
-  int cur_max = 0;
-  int cur_mask = -1;
-  for (int i = 0; i < mask.size(); ++i) {
-    if (mask[i] != cur_mask) {
-      cur_mask = mask[i];
-      req_min = cur_max - 1;
-    }
-    if (pi[i] > req_min) {
-      if (pi[i] > cur_max) cur_max = pi[i];
-    } else {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-void PermuteYKRecursive(int nodeid, const WordID& parent, const int max_reorder, Hypergraph* hg) {
-  // Hypergraph tmp = *hg;
-  Hypergraph::Node* node = &hg->nodes_[nodeid];
-  if (node->in_edges_.size() != 1) {
-    cerr << "Multiple rewrites of [" << TD::Convert(node->cat_ * -1) << "] (parent is [" << TD::Convert(parent*-1) << "])\n";
-    cerr << "  not recursing!\n";
-    return;
-  }
-//  for (int eii = 0; eii < node->in_edges_.size(); ++eii) {
-    const int oe_index = node->in_edges_.front();
-    const TRule& rule = *hg->edges_[oe_index].rule_;
-    const Hypergraph::TailNodeVector orig_tail = hg->edges_[oe_index].tail_nodes_;
-    const int tail_size = orig_tail.size();
-    for (int i = 0; i < tail_size; ++i) {
-      PermuteYKRecursive(hg->edges_[oe_index].tail_nodes_[i], node->cat_, max_reorder, hg);
-    }
-    const vector<WordID>& of = rule.f_;
-    if (of.size() == 1) return;
-  //  cerr << "Permuting [" << TD::Convert(node->cat_ * -1) << "]\n";
-  //  cerr << "ORIG: " << rule.AsString() << endl;
-    vector<WordID> pi(of.size(), 0);
-    for (int i = 0; i < pi.size(); ++i) pi[i] = i;
-
-    vector<int> permutation_mask(of.size(), 0);
-    const bool dont_reorder_across_PU = true;  // TODO add configuration
-    if (dont_reorder_across_PU) {
-      int cur = 0;
-      for (int i = 0; i < pi.size(); ++i) {
-        if (of[i] >= 0) continue;
-        const string cat = PureCategory(of[i]);
-        if (cat == "PU" || cat == "PU!H" || cat == "PUNC" || cat == "PUNC!H" || cat == "CC") {
-          ++cur;
-          permutation_mask[i] = cur;
-          ++cur;
-        } else {
-          permutation_mask[i] = cur;
-        }
-      }
-    }
-    int fid = FD::Convert(ConstituentOrderFeature(rule, pi));
-    hg->edges_[oe_index].feature_values_.set_value(fid, 1.0);
-    while (next_permutation(pi.begin(), pi.end())) {
-      if (!CheckPermutationMask(permutation_mask, pi))
-        continue;
-      vector<WordID> nf(pi.size(), 0);
-      Hypergraph::TailNodeVector tail(pi.size(), 0);
-      bool skip = false;
-      for (int i = 0; i < pi.size(); ++i) {
-        int dist = pi[i] - i; if (dist < 0) dist *= -1;
-        if (dist > max_reorder) { skip = true; break; }
-        nf[i] = of[pi[i]];
-        tail[i] = orig_tail[pi[i]];
-      }
-      if (skip) continue;
-      TRulePtr nr(new TRule(rule));
-      nr->f_ = nf;
-      int fid = FD::Convert(ConstituentOrderFeature(rule, pi));
-      nr->scores_.set_value(fid, 1.0);
-  //    cerr << "PERM: " << nr->AsString() << endl;
-      CreateEdge(nr, tail, node, hg);
-    }
- // }
-}
-
-void PermuteYamadaAndKnight(Hypergraph* hg, int max_reorder) {
-  assert(hg->nodes_.back().cat_ == kSTART);
-  assert(hg->nodes_.back().in_edges_.size() == 1);
-  PermuteYKRecursive(hg->nodes_.size() - 1, kSTART, max_reorder, hg);
-}
-
-void CollapseWeights(Hypergraph* hg) {
-  int fid = FD::Convert("Reordering");
-  for (int i = 0; i < hg->edges_.size(); ++i) {
-    Hypergraph::Edge& edge = hg->edges_[i];
-    edge.feature_values_.clear();
-    if (edge.edge_prob_ != prob_t::Zero()) {
-      edge.feature_values_.set_value(fid, log(edge.edge_prob_));
-    }
-  }
-}
-
-void ProcessHypergraph(const vector<double>& w, const po::variables_map& conf, const string& ref, Hypergraph* hg) {
-  if (conf.count("reorder"))
-    PermuteYamadaAndKnight(hg, conf["max_reorder"].as<int>());
-  if (w.size() > 0) { hg->Reweight(w); }
-  if (conf.count("collapse_weights")) CollapseWeights(hg);
-  if (conf["output"].as<string>() == "json") {
-    HypergraphIO::WriteToJSON(*hg, false, &cout);
-    if (!ref.empty()) { cerr << "REF: " << ref << endl; }
-  } else {
-    vector<WordID> onebest;
-    ViterbiESentence(*hg, &onebest);
-    if (ref.empty()) {
-      cout << TD::GetString(onebest) << endl;
-    } else {
-      cout << TD::GetString(onebest) << " ||| " << ref << endl;
-    }
-  }
-  if (conf.count("k_derivations")) {
-    const int k = conf["k_derivations"].as<int>();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(*hg, k);
-    for (int i = 0; i < k; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest(hg->nodes_.size() - 1, i);
-      if (!d) break;
-      cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
-    }
-  }
-}
-
-int main(int argc, char **argv) {
-  kSTART = TD::Convert("S") * -1;
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  string infile = conf["input"].as<string>();
-  const bool is_split_input = (conf["format"].as<string>() == "split");
-  const bool is_json_input = is_split_input || (conf["format"].as<string>() == "json");
-  const bool collapse_weights = conf.count("collapse_weights");
-  vector<double> w;
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &w);
-
-  if (collapse_weights && !w.size()) {
-    cerr << "--collapse_weights requires a weights file to be specified!\n";
-    exit(1);
-  }
-  ReadFile rf(infile);
-  istream* in = rf.stream();
-  assert(*in);
-  int lc = 0;
-  Hypergraph hg;
-  map<WordID, int> lhs2node;
-  while(*in) {
-    string line;
-    ++lc;
-    getline(*in, line);
-    if (is_json_input) {
-      if (line.empty() || line[0] == '#') continue;
-      string ref;
-      if (is_split_input) {
-        size_t pos = line.rfind("}}");
-        assert(pos != string::npos);
-        size_t rstart = line.find("||| ", pos);
-        assert(rstart != string::npos);
-        ref = line.substr(rstart + 4);
-        line = line.substr(0, pos + 2);
-      }
-      istringstream is(line);
-      if (HypergraphIO::ReadFromJSON(&is, &hg)) {
-        ProcessHypergraph(w, conf, ref, &hg);
-        hg.clear();
-      } else {
-        cerr << "Error reading grammar from JSON: line " << lc << endl;
-        exit(1);
-      }
-    } else {
-      if (line.empty()) {
-        int goal = lhs2node[kSTART] - 1;
-        FilterAndCheckCorrectness(goal, &hg);
-        ProcessHypergraph(w, conf, "", &hg);
-        hg.clear();
-        lhs2node.clear();
-        continue;
-      }
-      if (line[0] == '#') continue;
-      if (line[0] != '[') {
-        cerr << "Line " << lc << ": bad format\n";
-        exit(1);
-      }
-      TRulePtr tr(TRule::CreateRuleMonolingual(line));
-      Hypergraph::TailNodeVector tail;
-      for (int i = 0; i < tr->f_.size(); ++i) {
-        WordID var_cat = tr->f_[i];
-        if (var_cat < 0)
-          tail.push_back(GetOrCreateNode(var_cat, &lhs2node, &hg));
-      }
-      const WordID lhs = tr->GetLHS();
-      int head = GetOrCreateNode(lhs, &lhs2node, &hg);
-      Hypergraph::Edge* edge = hg.AddEdge(tr, tail);
-      edge->feature_values_ = tr->scores_;
-      Hypergraph::Node* node = &hg.nodes_[head];
-      hg.ConnectEdgeToHeadNode(edge, node);
-    }
-  }
-}
-
diff --git a/training/lbfgs.h b/training/lbfgs.h
deleted file mode 100644
index e8baecab..00000000
--- a/training/lbfgs.h
+++ /dev/null
@@ -1,1459 +0,0 @@
-#ifndef SCITBX_LBFGS_H
-#define SCITBX_LBFGS_H
-
-#include <cstdio>
-#include <cstddef>
-#include <cmath>
-#include <stdexcept>
-#include <algorithm>
-#include <vector>
-#include <string>
-#include <iostream>
-#include <sstream>
-
-namespace scitbx {
-
-//! Limited-memory Broyden-Fletcher-Goldfarb-Shanno (LBFGS) %minimizer.
-/*! Implementation of the
-    Limited-memory Broyden-Fletcher-Goldfarb-Shanno (LBFGS)
-    algorithm for large-scale multidimensional minimization
-    problems.
-
-    This code was manually derived from Java code which was
-    in turn derived from the Fortran program
-    <code>lbfgs.f</code>.  The Java translation was
-    effected mostly mechanically, with some manual
-    clean-up; in particular, array indices start at 0
-    instead of 1.  Most of the comments from the Fortran
-    code have been pasted in.
-
-    Information on the original LBFGS Fortran source code is
-    available at
-    http://www.netlib.org/opt/lbfgs_um.shar . The following
-    information is taken verbatim from the Netlib documentation
-    for the Fortran source.
-
-    <pre>
-    file    opt/lbfgs_um.shar
-    for     unconstrained optimization problems
-    alg     limited memory BFGS method
-    by      J. Nocedal
-    contact nocedal@eecs.nwu.edu
-    ref     D. C. Liu and J. Nocedal, ``On the limited memory BFGS method for
-    ,       large scale optimization methods'' Mathematical Programming 45
-    ,       (1989), pp. 503-528.
-    ,       (Postscript file of this paper is available via anonymous ftp
-    ,       to eecs.nwu.edu in the directory pub/%lbfgs/lbfgs_um.)
-    </pre>
-
-    @author Jorge Nocedal: original Fortran version, including comments
-    (July 1990).<br>
-    Robert Dodier: Java translation, August 1997.<br>
-    Ralf W. Grosse-Kunstleve: C++ port, March 2002.<br>
-    Chris Dyer: serialize/deserialize functionality
- */
-namespace lbfgs {
-
-  //! Generic exception class for %lbfgs %error messages.
-  /*! All exceptions thrown by the minimizer are derived from this class.
-   */
-  class error : public std::exception {
-    public:
-      //! Constructor.
-      error(std::string const& msg) throw()
-        : msg_("lbfgs error: " + msg)
-      {}
-      //! Access to error message.
-      virtual const char* what() const throw() { return msg_.c_str(); }
-    protected:
-      virtual ~error() throw() {}
-      std::string msg_;
-    public:
-      static std::string itoa(unsigned long i) {
-        std::ostringstream os;
-        os << i;
-        return os.str();
-      }
-  };
-
-  //! Specific exception class.
-  class error_internal_error : public error {
-    public:
-      //! Constructor.
-      error_internal_error(const char* file, unsigned long line) throw()
-        : error(
-            "Internal Error: " + std::string(file) + "(" + itoa(line) + ")")
-      {}
-  };
-
-  //! Specific exception class.
-  class error_improper_input_parameter : public error {
-    public:
-      //! Constructor.
-      error_improper_input_parameter(std::string const& msg) throw()
-        : error("Improper input parameter: " + msg)
-      {}
-  };
-
-  //! Specific exception class.
-  class error_improper_input_data : public error {
-    public:
-      //! Constructor.
-      error_improper_input_data(std::string const& msg) throw()
-        : error("Improper input data: " + msg)
-      {}
-  };
-
-  //! Specific exception class.
-  class error_search_direction_not_descent : public error {
-    public:
-      //! Constructor.
-      error_search_direction_not_descent() throw()
-        : error("The search direction is not a descent direction.")
-      {}
-  };
-
-  //! Specific exception class.
-  class error_line_search_failed : public error {
-    public:
-      //! Constructor.
-      error_line_search_failed(std::string const& msg) throw()
-        : error("Line search failed: " + msg)
-      {}
-  };
-
-  //! Specific exception class.
-  class error_line_search_failed_rounding_errors
-  : public error_line_search_failed {
-    public:
-      //! Constructor.
-      error_line_search_failed_rounding_errors(std::string const& msg) throw()
-        : error_line_search_failed(msg)
-      {}
-  };
-
-  namespace detail {
-
-    template <typename NumType>
-    inline
-    NumType
-    pow2(NumType const& x) { return x * x; }
-
-    template <typename NumType>
-    inline
-    NumType
-    abs(NumType const& x) {
-      if (x < NumType(0)) return -x;
-      return x;
-    }
-
-    // This class implements an algorithm for multi-dimensional line search.
-    template <typename FloatType, typename SizeType = std::size_t>
-    class mcsrch
-    {
-      protected:
-        int infoc;
-        FloatType dginit;
-        bool brackt;
-        bool stage1;
-        FloatType finit;
-        FloatType dgtest;
-        FloatType width;
-        FloatType width1;
-        FloatType stx;
-        FloatType fx;
-        FloatType dgx;
-        FloatType sty;
-        FloatType fy;
-        FloatType dgy;
-        FloatType stmin;
-        FloatType stmax;
-
-        static FloatType const& max3(
-          FloatType const& x,
-          FloatType const& y,
-          FloatType const& z)
-        {
-          return x < y ? (y < z ? z : y ) : (x < z ? z : x );
-        }
-
-      public:
-        /* Minimize a function along a search direction. This code is
-           a Java translation of the function <code>MCSRCH</code> from
-           <code>lbfgs.f</code>, which in turn is a slight modification
-           of the subroutine <code>CSRCH</code> of More' and Thuente.
-           The changes are to allow reverse communication, and do not
-           affect the performance of the routine. This function, in turn,
-           calls <code>mcstep</code>.<p>
-
-           The Java translation was effected mostly mechanically, with
-           some manual clean-up; in particular, array indices start at 0
-           instead of 1.  Most of the comments from the Fortran code have
-           been pasted in here as well.<p>
-
-           The purpose of <code>mcsrch</code> is to find a step which
-           satisfies a sufficient decrease condition and a curvature
-           condition.<p>
-
-           At each stage this function updates an interval of uncertainty
-           with endpoints <code>stx</code> and <code>sty</code>. The
-           interval of uncertainty is initially chosen so that it
-           contains a minimizer of the modified function
-           <pre>
-                f(x+stp*s) - f(x) - ftol*stp*(gradf(x)'s).
-           </pre>
-           If a step is obtained for which the modified function has a
-           nonpositive function value and nonnegative derivative, then
-           the interval of uncertainty is chosen so that it contains a
-           minimizer of <code>f(x+stp*s)</code>.<p>
-
-           The algorithm is designed to find a step which satisfies
-           the sufficient decrease condition
-           <pre>
-                 f(x+stp*s) &lt;= f(X) + ftol*stp*(gradf(x)'s),
-           </pre>
-           and the curvature condition
-           <pre>
-                 abs(gradf(x+stp*s)'s)) &lt;= gtol*abs(gradf(x)'s).
-           </pre>
-           If <code>ftol</code> is less than <code>gtol</code> and if,
-           for example, the function is bounded below, then there is
-           always a step which satisfies both conditions. If no step can
-           be found which satisfies both conditions, then the algorithm
-           usually stops when rounding errors prevent further progress.
-           In this case <code>stp</code> only satisfies the sufficient
-           decrease condition.<p>
-
-           @author Original Fortran version by Jorge J. More' and
-             David J. Thuente as part of the Minpack project, June 1983,
-             Argonne National Laboratory. Java translation by Robert
-             Dodier, August 1997.
-
-           @param n The number of variables.
-
-           @param x On entry this contains the base point for the line
-             search. On exit it contains <code>x + stp*s</code>.
-
-           @param f On entry this contains the value of the objective
-             function at <code>x</code>. On exit it contains the value
-             of the objective function at <code>x + stp*s</code>.
-
-           @param g On entry this contains the gradient of the objective
-             function at <code>x</code>. On exit it contains the gradient
-             at <code>x + stp*s</code>.
-
-           @param s The search direction.
-
-           @param stp On entry this contains an initial estimate of a
-             satifactory step length. On exit <code>stp</code> contains
-             the final estimate.
-
-           @param ftol Tolerance for the sufficient decrease condition.
-
-           @param xtol Termination occurs when the relative width of the
-             interval of uncertainty is at most <code>xtol</code>.
-
-           @param maxfev Termination occurs when the number of evaluations
-             of the objective function is at least <code>maxfev</code> by
-             the end of an iteration.
-
-           @param info This is an output variable, which can have these
-             values:
-             <ul>
-             <li><code>info = -1</code> A return is made to compute
-                 the function and gradient.
-             <li><code>info = 1</code> The sufficient decrease condition
-                 and the directional derivative condition hold.
-             </ul>
-
-           @param nfev On exit, this is set to the number of function
-             evaluations.
-
-           @param wa Temporary storage array, of length <code>n</code>.
-         */
-        void run(
-          FloatType const& gtol,
-          FloatType const& stpmin,
-          FloatType const& stpmax,
-          SizeType n,
-          FloatType* x,
-          FloatType f,
-          const FloatType* g,
-          FloatType* s,
-          SizeType is0,
-          FloatType& stp,
-          FloatType ftol,
-          FloatType xtol,
-          SizeType maxfev,
-          int& info,
-          SizeType& nfev,
-          FloatType* wa);
-
-        /* The purpose of this function is to compute a safeguarded step
-           for a linesearch and to update an interval of uncertainty for
-           a minimizer of the function.<p>
-
-           The parameter <code>stx</code> contains the step with the
-           least function value. The parameter <code>stp</code> contains
-           the current step. It is assumed that the derivative at
-           <code>stx</code> is negative in the direction of the step. If
-           <code>brackt</code> is <code>true</code> when
-           <code>mcstep</code> returns then a minimizer has been
-           bracketed in an interval of uncertainty with endpoints
-           <code>stx</code> and <code>sty</code>.<p>
-
-           Variables that must be modified by <code>mcstep</code> are
-           implemented as 1-element arrays.
-
-           @param stx Step at the best step obtained so far.
-             This variable is modified by <code>mcstep</code>.
-           @param fx Function value at the best step obtained so far.
-             This variable is modified by <code>mcstep</code>.
-           @param dx Derivative at the best step obtained so far.
-             The derivative must be negative in the direction of the
-             step, that is, <code>dx</code> and <code>stp-stx</code> must
-             have opposite signs.  This variable is modified by
-             <code>mcstep</code>.
-
-           @param sty Step at the other endpoint of the interval of
-             uncertainty. This variable is modified by <code>mcstep</code>.
-           @param fy Function value at the other endpoint of the interval
-             of uncertainty. This variable is modified by
-             <code>mcstep</code>.
-
-           @param dy Derivative at the other endpoint of the interval of
-             uncertainty. This variable is modified by <code>mcstep</code>.
-
-           @param stp Step at the current step. If <code>brackt</code> is set
-             then on input <code>stp</code> must be between <code>stx</code>
-             and <code>sty</code>. On output <code>stp</code> is set to the
-             new step.
-           @param fp Function value at the current step.
-           @param dp Derivative at the current step.
-
-           @param brackt Tells whether a minimizer has been bracketed.
-             If the minimizer has not been bracketed, then on input this
-             variable must be set <code>false</code>. If the minimizer has
-             been bracketed, then on output this variable is
-             <code>true</code>.
-
-           @param stpmin Lower bound for the step.
-           @param stpmax Upper bound for the step.
-
-           If the return value is 1, 2, 3, or 4, then the step has
-           been computed successfully. A return value of 0 indicates
-           improper input parameters.
-
-           @author Jorge J. More, David J. Thuente: original Fortran version,
-             as part of Minpack project. Argonne Nat'l Laboratory, June 1983.
-             Robert Dodier: Java translation, August 1997.
-         */
-        static int mcstep(
-          FloatType& stx,
-          FloatType& fx,
-          FloatType& dx,
-          FloatType& sty,
-          FloatType& fy,
-          FloatType& dy,
-          FloatType& stp,
-          FloatType fp,
-          FloatType dp,
-          bool& brackt,
-          FloatType stpmin,
-          FloatType stpmax);
-
-        void serialize(std::ostream* out) const {
-          out->write((const char*)&infoc,sizeof(infoc));
-          out->write((const char*)&dginit,sizeof(dginit));
-          out->write((const char*)&brackt,sizeof(brackt));
-          out->write((const char*)&stage1,sizeof(stage1));
-          out->write((const char*)&finit,sizeof(finit));
-          out->write((const char*)&dgtest,sizeof(dgtest));
-          out->write((const char*)&width,sizeof(width));
-          out->write((const char*)&width1,sizeof(width1));
-          out->write((const char*)&stx,sizeof(stx));
-          out->write((const char*)&fx,sizeof(fx));
-          out->write((const char*)&dgx,sizeof(dgx));
-          out->write((const char*)&sty,sizeof(sty));
-          out->write((const char*)&fy,sizeof(fy));
-          out->write((const char*)&dgy,sizeof(dgy));
-          out->write((const char*)&stmin,sizeof(stmin));
-          out->write((const char*)&stmax,sizeof(stmax));
-        }
-
-        void deserialize(std::istream* in) const {
-          in->read((char*)&infoc, sizeof(infoc));
-          in->read((char*)&dginit, sizeof(dginit));
-          in->read((char*)&brackt, sizeof(brackt));
-          in->read((char*)&stage1, sizeof(stage1));
-          in->read((char*)&finit, sizeof(finit));
-          in->read((char*)&dgtest, sizeof(dgtest));
-          in->read((char*)&width, sizeof(width));
-          in->read((char*)&width1, sizeof(width1));
-          in->read((char*)&stx, sizeof(stx));
-          in->read((char*)&fx, sizeof(fx));
-          in->read((char*)&dgx, sizeof(dgx));
-          in->read((char*)&sty, sizeof(sty));
-          in->read((char*)&fy, sizeof(fy));
-          in->read((char*)&dgy, sizeof(dgy));
-          in->read((char*)&stmin, sizeof(stmin));
-          in->read((char*)&stmax, sizeof(stmax));
-        }
-    };
-
-    template <typename FloatType, typename SizeType>
-    void mcsrch<FloatType, SizeType>::run(
-      FloatType const& gtol,
-      FloatType const& stpmin,
-      FloatType const& stpmax,
-      SizeType n,
-      FloatType* x,
-      FloatType f,
-      const FloatType* g,
-      FloatType* s,
-      SizeType is0,
-      FloatType& stp,
-      FloatType ftol,
-      FloatType xtol,
-      SizeType maxfev,
-      int& info,
-      SizeType& nfev,
-      FloatType* wa)
-    {
-      if (info != -1) {
-        infoc = 1;
-        if (   n == 0
-            || maxfev == 0
-            || gtol < FloatType(0)
-            || xtol < FloatType(0)
-            || stpmin < FloatType(0)
-            || stpmax < stpmin) {
-          throw error_internal_error(__FILE__, __LINE__);
-        }
-        if (stp <= FloatType(0) || ftol < FloatType(0)) {
-          throw error_internal_error(__FILE__, __LINE__);
-        }
-        // Compute the initial gradient in the search direction
-        // and check that s is a descent direction.
-        dginit = FloatType(0);
-        for (SizeType j = 0; j < n; j++) {
-          dginit += g[j] * s[is0+j];
-        }
-        if (dginit >= FloatType(0)) {
-          throw error_search_direction_not_descent();
-        }
-        brackt = false;
-        stage1 = true;
-        nfev = 0;
-        finit = f;
-        dgtest = ftol*dginit;
-        width = stpmax - stpmin;
-        width1 = FloatType(2) * width;
-        std::copy(x, x+n, wa);
-        // The variables stx, fx, dgx contain the values of the step,
-        // function, and directional derivative at the best step.
-        // The variables sty, fy, dgy contain the value of the step,
-        // function, and derivative at the other endpoint of
-        // the interval of uncertainty.
-        // The variables stp, f, dg contain the values of the step,
-        // function, and derivative at the current step.
-        stx = FloatType(0);
-        fx = finit;
-        dgx = dginit;
-        sty = FloatType(0);
-        fy = finit;
-        dgy = dginit;
-      }
-      for (;;) {
-        if (info != -1) {
-          // Set the minimum and maximum steps to correspond
-          // to the present interval of uncertainty.
-          if (brackt) {
-            stmin = std::min(stx, sty);
-            stmax = std::max(stx, sty);
-          }
-          else {
-            stmin = stx;
-            stmax = stp + FloatType(4) * (stp - stx);
-          }
-          // Force the step to be within the bounds stpmax and stpmin.
-          stp = std::max(stp, stpmin);
-          stp = std::min(stp, stpmax);
-          // If an unusual termination is to occur then let
-          // stp be the lowest point obtained so far.
-          if (   (brackt && (stp <= stmin || stp >= stmax))
-              || nfev >= maxfev - 1 || infoc == 0
-              || (brackt && stmax - stmin <= xtol * stmax)) {
-            stp = stx;
-          }
-          // Evaluate the function and gradient at stp
-          // and compute the directional derivative.
-          // We return to main program to obtain F and G.
-          for (SizeType j = 0; j < n; j++) {
-            x[j] = wa[j] + stp * s[is0+j];
-          }
-          info=-1;
-          break;
-        }
-        info = 0;
-        nfev++;
-        FloatType dg(0);
-        for (SizeType j = 0; j < n; j++) {
-          dg += g[j] * s[is0+j];
-        }
-        FloatType ftest1 = finit + stp*dgtest;
-        // Test for convergence.
-        if ((brackt && (stp <= stmin || stp >= stmax)) || infoc == 0) {
-          throw error_line_search_failed_rounding_errors(
-            "Rounding errors prevent further progress."
-            " There may not be a step which satisfies the"
-            " sufficient decrease and curvature conditions."
-            " Tolerances may be too small.");
-        }
-        if (stp == stpmax && f <= ftest1 && dg <= dgtest) {
-          throw error_line_search_failed(
-            "The step is at the upper bound stpmax().");
-        }
-        if (stp == stpmin && (f > ftest1 || dg >= dgtest)) {
-          throw error_line_search_failed(
-            "The step is at the lower bound stpmin().");
-        }
-        if (nfev >= maxfev) {
-          throw error_line_search_failed(
-            "Number of function evaluations has reached maxfev().");
-        }
-        if (brackt && stmax - stmin <= xtol * stmax) {
-          throw error_line_search_failed(
-            "Relative width of the interval of uncertainty"
-            " is at most xtol().");
-        }
-        // Check for termination.
-        if (f <= ftest1 && abs(dg) <= gtol * (-dginit)) {
-          info = 1;
-          break;
-        }
-        // In the first stage we seek a step for which the modified
-        // function has a nonpositive value and nonnegative derivative.
-        if (   stage1 && f <= ftest1
-            && dg >= std::min(ftol, gtol) * dginit) {
-          stage1 = false;
-        }
-        // A modified function is used to predict the step only if
-        // we have not obtained a step for which the modified
-        // function has a nonpositive function value and nonnegative
-        // derivative, and if a lower function value has been
-        // obtained but the decrease is not sufficient.
-        if (stage1 && f <= fx && f > ftest1) {
-          // Define the modified function and derivative values.
-          FloatType fm = f - stp*dgtest;
-          FloatType fxm = fx - stx*dgtest;
-          FloatType fym = fy - sty*dgtest;
-          FloatType dgm = dg - dgtest;
-          FloatType dgxm = dgx - dgtest;
-          FloatType dgym = dgy - dgtest;
-          // Call cstep to update the interval of uncertainty
-          // and to compute the new step.
-          infoc = mcstep(stx, fxm, dgxm, sty, fym, dgym, stp, fm, dgm,
-                         brackt, stmin, stmax);
-          // Reset the function and gradient values for f.
-          fx = fxm + stx*dgtest;
-          fy = fym + sty*dgtest;
-          dgx = dgxm + dgtest;
-          dgy = dgym + dgtest;
-        }
-        else {
-          // Call mcstep to update the interval of uncertainty
-          // and to compute the new step.
-          infoc = mcstep(stx, fx, dgx, sty, fy, dgy, stp, f, dg,
-                         brackt, stmin, stmax);
-        }
-        // Force a sufficient decrease in the size of the
-        // interval of uncertainty.
-        if (brackt) {
-          if (abs(sty - stx) >= FloatType(0.66) * width1) {
-            stp = stx + FloatType(0.5) * (sty - stx);
-          }
-          width1 = width;
-          width = abs(sty - stx);
-        }
-      }
-    }
-
-    template <typename FloatType, typename SizeType>
-    int mcsrch<FloatType, SizeType>::mcstep(
-      FloatType& stx,
-      FloatType& fx,
-      FloatType& dx,
-      FloatType& sty,
-      FloatType& fy,
-      FloatType& dy,
-      FloatType& stp,
-      FloatType fp,
-      FloatType dp,
-      bool& brackt,
-      FloatType stpmin,
-      FloatType stpmax)
-    {
-      bool bound;
-      FloatType gamma, p, q, r, s, sgnd, stpc, stpf, stpq, theta;
-      int info = 0;
-      if (   (   brackt && (stp <= std::min(stx, sty)
-              || stp >= std::max(stx, sty)))
-          || dx * (stp - stx) >= FloatType(0) || stpmax < stpmin) {
-        return 0;
-      }
-      // Determine if the derivatives have opposite sign.
-      sgnd = dp * (dx / abs(dx));
-      if (fp > fx) {
-        // First case. A higher function value.
-        // The minimum is bracketed. If the cubic step is closer
-        // to stx than the quadratic step, the cubic step is taken,
-        // else the average of the cubic and quadratic steps is taken.
-        info = 1;
-        bound = true;
-        theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp;
-        s = max3(abs(theta), abs(dx), abs(dp));
-        gamma = s * std::sqrt(pow2(theta / s) - (dx / s) * (dp / s));
-        if (stp < stx) gamma = - gamma;
-        p = (gamma - dx) + theta;
-        q = ((gamma - dx) + gamma) + dp;
-        r = p/q;
-        stpc = stx + r * (stp - stx);
-        stpq = stx
-          + ((dx / ((fx - fp) / (stp - stx) + dx)) / FloatType(2))
-            * (stp - stx);
-        if (abs(stpc - stx) < abs(stpq - stx)) {
-          stpf = stpc;
-        }
-        else {
-          stpf = stpc + (stpq - stpc) / FloatType(2);
-        }
-        brackt = true;
-      }
-      else if (sgnd < FloatType(0)) {
-        // Second case. A lower function value and derivatives of
-        // opposite sign. The minimum is bracketed. If the cubic
-        // step is closer to stx than the quadratic (secant) step,
-        // the cubic step is taken, else the quadratic step is taken.
-        info = 2;
-        bound = false;
-        theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp;
-        s = max3(abs(theta), abs(dx), abs(dp));
-        gamma = s * std::sqrt(pow2(theta / s) - (dx / s) * (dp / s));
-        if (stp > stx) gamma = - gamma;
-        p = (gamma - dp) + theta;
-        q = ((gamma - dp) + gamma) + dx;
-        r = p/q;
-        stpc = stp + r * (stx - stp);
-        stpq = stp + (dp / (dp - dx)) * (stx - stp);
-        if (abs(stpc - stp) > abs(stpq - stp)) {
-          stpf = stpc;
-        }
-        else {
-          stpf = stpq;
-        }
-        brackt = true;
-      }
-      else if (abs(dp) < abs(dx)) {
-        // Third case. A lower function value, derivatives of the
-        // same sign, and the magnitude of the derivative decreases.
-        // The cubic step is only used if the cubic tends to infinity
-        // in the direction of the step or if the minimum of the cubic
-        // is beyond stp. Otherwise the cubic step is defined to be
-        // either stpmin or stpmax. The quadratic (secant) step is also
-        // computed and if the minimum is bracketed then the the step
-        // closest to stx is taken, else the step farthest away is taken.
-        info = 3;
-        bound = true;
-        theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp;
-        s = max3(abs(theta), abs(dx), abs(dp));
-        gamma = s * std::sqrt(
-          std::max(FloatType(0), pow2(theta / s) - (dx / s) * (dp / s)));
-        if (stp > stx) gamma = -gamma;
-        p = (gamma - dp) + theta;
-        q = (gamma + (dx - dp)) + gamma;
-        r = p/q;
-        if (r < FloatType(0) && gamma != FloatType(0)) {
-          stpc = stp + r * (stx - stp);
-        }
-        else if (stp > stx) {
-          stpc = stpmax;
-        }
-        else {
-          stpc = stpmin;
-        }
-        stpq = stp + (dp / (dp - dx)) * (stx - stp);
-        if (brackt) {
-          if (abs(stp - stpc) < abs(stp - stpq)) {
-            stpf = stpc;
-          }
-          else {
-            stpf = stpq;
-          }
-        }
-        else {
-          if (abs(stp - stpc) > abs(stp - stpq)) {
-            stpf = stpc;
-          }
-          else {
-            stpf = stpq;
-          }
-        }
-      }
-      else {
-        // Fourth case. A lower function value, derivatives of the
-        // same sign, and the magnitude of the derivative does
-        // not decrease. If the minimum is not bracketed, the step
-        // is either stpmin or stpmax, else the cubic step is taken.
-        info = 4;
-        bound = false;
-        if (brackt) {
-          theta = FloatType(3) * (fp - fy) / (sty - stp) + dy + dp;
-          s = max3(abs(theta), abs(dy), abs(dp));
-          gamma = s * std::sqrt(pow2(theta / s) - (dy / s) * (dp / s));
-          if (stp > sty) gamma = -gamma;
-          p = (gamma - dp) + theta;
-          q = ((gamma - dp) + gamma) + dy;
-          r = p/q;
-          stpc = stp + r * (sty - stp);
-          stpf = stpc;
-        }
-        else if (stp > stx) {
-          stpf = stpmax;
-        }
-        else {
-          stpf = stpmin;
-        }
-      }
-      // Update the interval of uncertainty. This update does not
-      // depend on the new step or the case analysis above.
-      if (fp > fx) {
-        sty = stp;
-        fy = fp;
-        dy = dp;
-      }
-      else {
-        if (sgnd < FloatType(0)) {
-          sty = stx;
-          fy = fx;
-          dy = dx;
-        }
-        stx = stp;
-        fx = fp;
-        dx = dp;
-      }
-      // Compute the new step and safeguard it.
-      stpf = std::min(stpmax, stpf);
-      stpf = std::max(stpmin, stpf);
-      stp = stpf;
-      if (brackt && bound) {
-        if (sty > stx) {
-          stp = std::min(stx + FloatType(0.66) * (sty - stx), stp);
-        }
-        else {
-          stp = std::max(stx + FloatType(0.66) * (sty - stx), stp);
-        }
-      }
-      return info;
-    }
-
-    /* Compute the sum of a vector times a scalar plus another vector.
-       Adapted from the subroutine <code>daxpy</code> in
-       <code>lbfgs.f</code>.
-     */
-    template <typename FloatType, typename SizeType>
-    void daxpy(
-      SizeType n,
-      FloatType da,
-      const FloatType* dx,
-      SizeType ix0,
-      SizeType incx,
-      FloatType* dy,
-      SizeType iy0,
-      SizeType incy)
-    {
-      SizeType i, ix, iy, m;
-      if (n == 0) return;
-      if (da == FloatType(0)) return;
-      if  (!(incx == 1 && incy == 1)) {
-        ix = 0;
-        iy = 0;
-        for (i = 0; i < n; i++) {
-          dy[iy0+iy] += da * dx[ix0+ix];
-          ix += incx;
-          iy += incy;
-        }
-        return;
-      }
-      m = n % 4;
-      for (i = 0; i < m; i++) {
-        dy[iy0+i] += da * dx[ix0+i];
-      }
-      for (; i < n;) {
-        dy[iy0+i] += da * dx[ix0+i]; i++;
-        dy[iy0+i] += da * dx[ix0+i]; i++;
-        dy[iy0+i] += da * dx[ix0+i]; i++;
-        dy[iy0+i] += da * dx[ix0+i]; i++;
-      }
-    }
-
-    template <typename FloatType, typename SizeType>
-    inline
-    void daxpy(
-      SizeType n,
-      FloatType da,
-      const FloatType* dx,
-      SizeType ix0,
-      FloatType* dy)
-    {
-      daxpy(n, da, dx, ix0, SizeType(1), dy, SizeType(0), SizeType(1));
-    }
-
-    /* Compute the dot product of two vectors.
-       Adapted from the subroutine <code>ddot</code>
-       in <code>lbfgs.f</code>.
-     */
-    template <typename FloatType, typename SizeType>
-    FloatType ddot(
-      SizeType n,
-      const FloatType* dx,
-      SizeType ix0,
-      SizeType incx,
-      const FloatType* dy,
-      SizeType iy0,
-      SizeType incy)
-    {
-      SizeType i, ix, iy, m;
-      FloatType dtemp(0);
-      if (n == 0) return FloatType(0);
-      if (!(incx == 1 && incy == 1)) {
-        ix = 0;
-        iy = 0;
-        for (i = 0; i < n; i++) {
-          dtemp += dx[ix0+ix] * dy[iy0+iy];
-          ix += incx;
-          iy += incy;
-        }
-        return dtemp;
-      }
-      m = n % 5;
-      for (i = 0; i < m; i++) {
-        dtemp += dx[ix0+i] * dy[iy0+i];
-      }
-      for (; i < n;) {
-        dtemp += dx[ix0+i] * dy[iy0+i]; i++;
-        dtemp += dx[ix0+i] * dy[iy0+i]; i++;
-        dtemp += dx[ix0+i] * dy[iy0+i]; i++;
-        dtemp += dx[ix0+i] * dy[iy0+i]; i++;
-        dtemp += dx[ix0+i] * dy[iy0+i]; i++;
-      }
-      return dtemp;
-    }
-
-    template <typename FloatType, typename SizeType>
-    inline
-    FloatType ddot(
-      SizeType n,
-      const FloatType* dx,
-      const FloatType* dy)
-    {
-      return ddot(
-        n, dx, SizeType(0), SizeType(1), dy, SizeType(0), SizeType(1));
-    }
-
-  } // namespace detail
-
-  //! Interface to the LBFGS %minimizer.
-  /*! This class solves the unconstrained minimization problem
-      <pre>
-          min f(x),  x = (x1,x2,...,x_n),
-      </pre>
-      using the limited-memory BFGS method. The routine is
-      especially effective on problems involving a large number of
-      variables. In a typical iteration of this method an
-      approximation Hk to the inverse of the Hessian
-      is obtained by applying <code>m</code> BFGS updates to a
-      diagonal matrix Hk0, using information from the
-      previous <code>m</code> steps.  The user specifies the number
-      <code>m</code>, which determines the amount of storage
-      required by the routine. The user may also provide the
-      diagonal matrices Hk0 (parameter <code>diag</code> in the run()
-      function) if not satisfied with the default choice. The
-      algorithm is described in "On the limited memory BFGS method for
-      large scale optimization", by D. Liu and J. Nocedal, Mathematical
-      Programming B 45 (1989) 503-528.
-
-      The user is required to calculate the function value
-      <code>f</code> and its gradient <code>g</code>. In order to
-      allow the user complete control over these computations,
-      reverse communication is used. The routine must be called
-      repeatedly under the control of the member functions
-      <code>requests_f_and_g()</code>,
-      <code>requests_diag()</code>.
-      If neither requests_f_and_g() nor requests_diag() is
-      <code>true</code> the user should check for convergence
-      (using class traditional_convergence_test or any
-      other custom test). If the convergence test is negative,
-      the minimizer may be called again for the next iteration.
-
-      The steplength (stp()) is determined at each iteration
-      by means of the line search routine <code>mcsrch</code>, which is
-      a slight modification of the routine <code>CSRCH</code> written
-      by More' and Thuente.
-
-      The only variables that are machine-dependent are
-      <code>xtol</code>,
-      <code>stpmin</code> and
-      <code>stpmax</code>.
-
-      Fatal errors cause <code>error</code> exceptions to be thrown.
-      The generic class <code>error</code> is sub-classed (e.g.
-      class <code>error_line_search_failed</code>) to facilitate
-      granular %error handling.
-
-      A note on performance: Using Compaq Fortran V5.4 and
-      Compaq C++ V6.5, the C++ implementation is about 15% slower
-      than the Fortran implementation.
-   */
-  template <typename FloatType, typename SizeType = std::size_t>
-  class minimizer
-  {
-    public:
-      //! Default constructor. Some members are not initialized!
-      minimizer()
-      : n_(0), m_(0), maxfev_(0),
-        gtol_(0), xtol_(0),
-        stpmin_(0), stpmax_(0),
-        ispt(0), iypt(0)
-      {}
-
-      //! Constructor.
-      /*! @param n The number of variables in the minimization problem.
-             Restriction: <code>n &gt; 0</code>.
-
-          @param m The number of corrections used in the BFGS update.
-             Values of <code>m</code> less than 3 are not recommended;
-             large values of <code>m</code> will result in excessive
-             computing time. <code>3 &lt;= m &lt;= 7</code> is
-             recommended.
-             Restriction: <code>m &gt; 0</code>.
-
-          @param maxfev Maximum number of function evaluations
-             <b>per line search</b>.
-             Termination occurs when the number of evaluations
-             of the objective function is at least <code>maxfev</code> by
-             the end of an iteration.
-
-          @param gtol Controls the accuracy of the line search.
-            If the function and gradient evaluations are inexpensive with
-            respect to the cost of the iteration (which is sometimes the
-            case when solving very large problems) it may be advantageous
-            to set <code>gtol</code> to a small value. A typical small
-            value is 0.1.
-            Restriction: <code>gtol</code> should be greater than 1e-4.
-
-          @param xtol An estimate of the machine precision (e.g. 10e-16
-            on a SUN station 3/60). The line search routine will
-            terminate if the relative width of the interval of
-            uncertainty is less than <code>xtol</code>.
-
-          @param stpmin Specifies the lower bound for the step
-            in the line search.
-            The default value is 1e-20. This value need not be modified
-            unless the exponent is too large for the machine being used,
-            or unless the problem is extremely badly scaled (in which
-            case the exponent should be increased).
-
-          @param stpmax specifies the upper bound for the step
-            in the line search.
-            The default value is 1e20. This value need not be modified
-            unless the exponent is too large for the machine being used,
-            or unless the problem is extremely badly scaled (in which
-            case the exponent should be increased).
-       */
-      explicit
-      minimizer(
-        SizeType n,
-        SizeType m = 5,
-        SizeType maxfev = 20,
-        FloatType gtol = FloatType(0.9),
-        FloatType xtol = FloatType(1.e-16),
-        FloatType stpmin = FloatType(1.e-20),
-        FloatType stpmax = FloatType(1.e20))
-        : n_(n), m_(m), maxfev_(maxfev),
-          gtol_(gtol), xtol_(xtol),
-          stpmin_(stpmin), stpmax_(stpmax),
-          iflag_(0), requests_f_and_g_(false), requests_diag_(false),
-          iter_(0), nfun_(0), stp_(0),
-          stp1(0), ftol(0.0001), ys(0), point(0), npt(0),
-          ispt(n+2*m), iypt((n+2*m)+n*m),
-          info(0), bound(0), nfev(0)
-      {
-        if (n_ == 0) {
-          throw error_improper_input_parameter("n = 0.");
-        }
-        if (m_ == 0) {
-          throw error_improper_input_parameter("m = 0.");
-        }
-        if (maxfev_ == 0) {
-         throw error_improper_input_parameter("maxfev = 0.");
-        }
-        if (gtol_ <= FloatType(1.e-4)) {
-          throw error_improper_input_parameter("gtol <= 1.e-4.");
-        }
-        if (xtol_ < FloatType(0)) {
-          throw error_improper_input_parameter("xtol < 0.");
-        }
-        if (stpmin_ < FloatType(0)) {
-          throw error_improper_input_parameter("stpmin < 0.");
-        }
-        if (stpmax_ < stpmin) {
-          throw error_improper_input_parameter("stpmax < stpmin");
-        }
-        w_.resize(n_*(2*m_+1)+2*m_);
-        scratch_array_.resize(n_);
-      }
-
-      //! Number of free parameters (as passed to the constructor).
-      SizeType n() const { return n_; }
-
-      //! Number of corrections kept (as passed to the constructor).
-      SizeType m() const { return m_; }
-
-      /*! \brief Maximum number of evaluations of the objective function
-          per line search (as passed to the constructor).
-       */
-      SizeType maxfev() const { return maxfev_; }
-
-      /*! \brief Control of the accuracy of the line search.
-          (as passed to the constructor).
-       */
-      FloatType gtol() const { return gtol_; }
-
-      //! Estimate of the machine precision (as passed to the constructor).
-      FloatType xtol() const { return xtol_; }
-
-      /*! \brief Lower bound for the step in the line search.
-          (as passed to the constructor).
-       */
-      FloatType stpmin() const { return stpmin_; }
-
-      /*! \brief Upper bound for the step in the line search.
-          (as passed to the constructor).
-       */
-      FloatType stpmax() const { return stpmax_; }
-
-      //! Status indicator for reverse communication.
-      /*! <code>true</code> if the run() function returns to request
-          evaluation of the objective function (<code>f</code>) and
-          gradients (<code>g</code>) for the current point
-          (<code>x</code>). To continue the minimization the
-          run() function is called again with the updated values for
-          <code>f</code> and <code>g</code>.
-          <p>
-          See also: requests_diag()
-       */
-      bool requests_f_and_g() const { return requests_f_and_g_; }
-
-      //! Status indicator for reverse communication.
-      /*! <code>true</code> if the run() function returns to request
-          evaluation of the diagonal matrix (<code>diag</code>)
-          for the current point (<code>x</code>).
-          To continue the minimization the run() function is called
-          again with the updated values for <code>diag</code>.
-          <p>
-          See also: requests_f_and_g()
-       */
-      bool requests_diag() const { return requests_diag_; }
-
-      //! Number of iterations so far.
-      /*! Note that one iteration may involve multiple evaluations
-          of the objective function.
-          <p>
-          See also: nfun()
-       */
-      SizeType iter() const { return iter_; }
-
-      //! Total number of evaluations of the objective function so far.
-      /*! The total number of function evaluations increases by the
-          number of evaluations required for the line search. The total
-          is only increased after a successful line search.
-          <p>
-          See also: iter()
-       */
-      SizeType nfun() const { return nfun_; }
-
-      //! Norm of gradient given gradient array of length n().
-      FloatType euclidean_norm(const FloatType* a) const {
-        return std::sqrt(detail::ddot(n_, a, a));
-      }
-
-      //! Current stepsize.
-      FloatType stp() const { return stp_; }
-
-      //! Execution of one step of the minimization.
-      /*! @param x On initial entry this must be set by the user to
-             the values of the initial estimate of the solution vector.
-
-          @param f Before initial entry or on re-entry under the
-             control of requests_f_and_g(), <code>f</code> must be set
-             by the user to contain the value of the objective function
-             at the current point <code>x</code>.
-
-          @param g Before initial entry or on re-entry under the
-             control of requests_f_and_g(), <code>g</code> must be set
-             by the user to contain the components of the gradient at
-             the current point <code>x</code>.
-
-          The return value is <code>true</code> if either
-          requests_f_and_g() or requests_diag() is <code>true</code>.
-          Otherwise the user should check for convergence
-          (e.g. using class traditional_convergence_test) and
-          call the run() function again to continue the minimization.
-          If the return value is <code>false</code> the user
-          should <b>not</b> update <code>f</code>, <code>g</code> or
-          <code>diag</code> (other overload) before calling
-          the run() function again.
-
-          Note that <code>x</code> is always modified by the run()
-          function. Depending on the situation it can therefore be
-          necessary to evaluate the objective function one more time
-          after the minimization is terminated.
-       */
-      bool run(
-        FloatType* x,
-        FloatType f,
-        const FloatType* g)
-      {
-        return generic_run(x, f, g, false, 0);
-      }
-
-      //! Execution of one step of the minimization.
-      /*! @param x See other overload.
-
-          @param f See other overload.
-
-          @param g See other overload.
-
-          @param diag On initial entry or on re-entry under the
-             control of requests_diag(), <code>diag</code> must be set by
-             the user to contain the values of the diagonal matrix Hk0.
-             The routine will return at each iteration of the algorithm
-             with requests_diag() set to <code>true</code>.
-             <p>
-             Restriction: all elements of <code>diag</code> must be
-             positive.
-       */
-      bool run(
-        FloatType* x,
-        FloatType f,
-        const FloatType* g,
-        const FloatType* diag)
-      {
-        return generic_run(x, f, g, true, diag);
-      }
-
-      void serialize(std::ostream* out) const {
-        out->write((const char*)&n_, sizeof(n_)); // sanity check
-        out->write((const char*)&m_, sizeof(m_)); // sanity check
-        SizeType fs = sizeof(FloatType);
-        out->write((const char*)&fs, sizeof(fs)); // sanity check
-
-        mcsrch_instance.serialize(out);
-        out->write((const char*)&iflag_, sizeof(iflag_));
-        out->write((const char*)&requests_f_and_g_, sizeof(requests_f_and_g_));
-        out->write((const char*)&requests_diag_, sizeof(requests_diag_));
-        out->write((const char*)&iter_, sizeof(iter_));
-        out->write((const char*)&nfun_, sizeof(nfun_));
-        out->write((const char*)&stp_, sizeof(stp_));
-        out->write((const char*)&stp1, sizeof(stp1));
-        out->write((const char*)&ftol, sizeof(ftol));
-        out->write((const char*)&ys, sizeof(ys));
-        out->write((const char*)&point, sizeof(point));
-        out->write((const char*)&npt, sizeof(npt));
-        out->write((const char*)&info, sizeof(info));
-        out->write((const char*)&bound, sizeof(bound));
-        out->write((const char*)&nfev, sizeof(nfev));
-        out->write((const char*)&w_[0], sizeof(FloatType) * w_.size());
-        out->write((const char*)&scratch_array_[0], sizeof(FloatType) * scratch_array_.size());
-      }
-
-      void deserialize(std::istream* in) {
-        SizeType n, m, fs;
-        in->read((char*)&n, sizeof(n));
-        in->read((char*)&m, sizeof(m));
-        in->read((char*)&fs, sizeof(fs));
-        assert(n == n_);
-        assert(m == m_);
-        assert(fs == sizeof(FloatType));
-
-        mcsrch_instance.deserialize(in);
-        in->read((char*)&iflag_, sizeof(iflag_));
-        in->read((char*)&requests_f_and_g_, sizeof(requests_f_and_g_));
-        in->read((char*)&requests_diag_, sizeof(requests_diag_));
-        in->read((char*)&iter_, sizeof(iter_));
-        in->read((char*)&nfun_, sizeof(nfun_));
-        in->read((char*)&stp_, sizeof(stp_));
-        in->read((char*)&stp1, sizeof(stp1));
-        in->read((char*)&ftol, sizeof(ftol));
-        in->read((char*)&ys, sizeof(ys));
-        in->read((char*)&point, sizeof(point));
-        in->read((char*)&npt, sizeof(npt));
-        in->read((char*)&info, sizeof(info));
-        in->read((char*)&bound, sizeof(bound));
-        in->read((char*)&nfev, sizeof(nfev));
-        in->read((char*)&w_[0], sizeof(FloatType) * w_.size());
-        in->read((char*)&scratch_array_[0], sizeof(FloatType) * scratch_array_.size());
-      }
-
-    protected:
-      static void throw_diagonal_element_not_positive(SizeType i) {
-        throw error_improper_input_data(
-          "The " + error::itoa(i) + ". diagonal element of the"
-          " inverse Hessian approximation is not positive.");
-      }
-
-      bool generic_run(
-        FloatType* x,
-        FloatType f,
-        const FloatType* g,
-        bool diagco,
-        const FloatType* diag);
-
-      detail::mcsrch<FloatType, SizeType> mcsrch_instance;
-      const SizeType n_;
-      const SizeType m_;
-      const SizeType maxfev_;
-      const FloatType gtol_;
-      const FloatType xtol_;
-      const FloatType stpmin_;
-      const FloatType stpmax_;
-      int iflag_;
-      bool requests_f_and_g_;
-      bool requests_diag_;
-      SizeType iter_;
-      SizeType nfun_;
-      FloatType stp_;
-      FloatType stp1;
-      FloatType ftol;
-      FloatType ys;
-      SizeType point;
-      SizeType npt;
-      const SizeType ispt;
-      const SizeType iypt;
-      int info;
-      SizeType bound;
-      SizeType nfev;
-      std::vector<FloatType> w_;
-      std::vector<FloatType> scratch_array_;
-  };
-
-  template <typename FloatType, typename SizeType>
-  bool minimizer<FloatType, SizeType>::generic_run(
-    FloatType* x,
-    FloatType f,
-    const FloatType* g,
-    bool diagco,
-    const FloatType* diag)
-  {
-    bool execute_entire_while_loop = false;
-    if (!(requests_f_and_g_ || requests_diag_)) {
-      execute_entire_while_loop = true;
-    }
-    requests_f_and_g_ = false;
-    requests_diag_ = false;
-    FloatType* w = &(*(w_.begin()));
-    if (iflag_ == 0) { // Initialize.
-      nfun_ = 1;
-      if (diagco) {
-        for (SizeType i = 0; i < n_; i++) {
-          if (diag[i] <= FloatType(0)) {
-            throw_diagonal_element_not_positive(i);
-          }
-        }
-      }
-      else {
-        std::fill_n(scratch_array_.begin(), n_, FloatType(1));
-        diag = &(*(scratch_array_.begin()));
-      }
-      for (SizeType i = 0; i < n_; i++) {
-        w[ispt + i] = -g[i] * diag[i];
-      }
-      FloatType gnorm = std::sqrt(detail::ddot(n_, g, g));
-      if (gnorm == FloatType(0)) return false;
-      stp1 = FloatType(1) / gnorm;
-      execute_entire_while_loop = true;
-    }
-    if (execute_entire_while_loop) {
-      bound = iter_;
-      iter_++;
-      info = 0;
-      if (iter_ != 1) {
-        if (iter_ > m_) bound = m_;
-        ys = detail::ddot(
-          n_, w, iypt + npt, SizeType(1), w, ispt + npt, SizeType(1));
-        if (!diagco) {
-          FloatType yy = detail::ddot(
-            n_, w, iypt + npt, SizeType(1), w, iypt + npt, SizeType(1));
-          std::fill_n(scratch_array_.begin(), n_, ys / yy);
-          diag = &(*(scratch_array_.begin()));
-        }
-        else {
-          iflag_ = 2;
-          requests_diag_ = true;
-          return true;
-        }
-      }
-    }
-    if (execute_entire_while_loop || iflag_ == 2) {
-      if (iter_ != 1) {
-        if (diag == 0) {
-          throw error_internal_error(__FILE__, __LINE__);
-        }
-        if (diagco) {
-          for (SizeType i = 0; i < n_; i++) {
-            if (diag[i] <= FloatType(0)) {
-              throw_diagonal_element_not_positive(i);
-            }
-          }
-        }
-        SizeType cp = point;
-        if (point == 0) cp = m_;
-        w[n_ + cp -1] = 1 / ys;
-        SizeType i;
-        for (i = 0; i < n_; i++) {
-          w[i] = -g[i];
-        }
-        cp = point;
-        for (i = 0; i < bound; i++) {
-          if (cp == 0) cp = m_;
-          cp--;
-          FloatType sq = detail::ddot(
-            n_, w, ispt + cp * n_, SizeType(1), w, SizeType(0), SizeType(1));
-          SizeType inmc=n_+m_+cp;
-          SizeType iycn=iypt+cp*n_;
-          w[inmc] = w[n_ + cp] * sq;
-          detail::daxpy(n_, -w[inmc], w, iycn, w);
-        }
-        for (i = 0; i < n_; i++) {
-          w[i] *= diag[i];
-        }
-        for (i = 0; i < bound; i++) {
-          FloatType yr = detail::ddot(
-            n_, w, iypt + cp * n_, SizeType(1), w, SizeType(0), SizeType(1));
-          FloatType beta = w[n_ + cp] * yr;
-          SizeType inmc=n_+m_+cp;
-          beta = w[inmc] - beta;
-          SizeType iscn=ispt+cp*n_;
-          detail::daxpy(n_, beta, w, iscn, w);
-          cp++;
-          if (cp == m_) cp = 0;
-        }
-        std::copy(w, w+n_, w+(ispt + point * n_));
-      }
-      stp_ = FloatType(1);
-      if (iter_ == 1) stp_ = stp1;
-      std::copy(g, g+n_, w);
-    }
-    mcsrch_instance.run(
-      gtol_, stpmin_, stpmax_, n_, x, f, g, w, ispt + point * n_,
-      stp_, ftol, xtol_, maxfev_, info, nfev, &(*(scratch_array_.begin())));
-    if (info == -1) {
-      iflag_ = 1;
-      requests_f_and_g_ = true;
-      return true;
-    }
-    if (info != 1) {
-      throw error_internal_error(__FILE__, __LINE__);
-    }
-    nfun_ += nfev;
-    npt = point*n_;
-    for (SizeType i = 0; i < n_; i++) {
-      w[ispt + npt + i] = stp_ * w[ispt + npt + i];
-      w[iypt + npt + i] = g[i] - w[i];
-    }
-    point++;
-    if (point == m_) point = 0;
-    return false;
-  }
-
-  //! Traditional LBFGS convergence test.
-  /*! This convergence test is equivalent to the test embedded
-      in the <code>lbfgs.f</code> Fortran code. The test assumes that
-      there is a meaningful relation between the Euclidean norm of the
-      parameter vector <code>x</code> and the norm of the gradient
-      vector <code>g</code>. Therefore this test should not be used if
-      this assumption is not correct for a given problem.
-   */
-  template <typename FloatType, typename SizeType = std::size_t>
-  class traditional_convergence_test
-  {
-    public:
-      //! Default constructor.
-      traditional_convergence_test()
-      : n_(0), eps_(0)
-      {}
-
-      //! Constructor.
-      /*! @param n The number of variables in the minimization problem.
-             Restriction: <code>n &gt; 0</code>.
-
-          @param eps Determines the accuracy with which the solution
-            is to be found.
-       */
-      explicit
-      traditional_convergence_test(
-        SizeType n,
-        FloatType eps = FloatType(1.e-5))
-      : n_(n), eps_(eps)
-      {
-        if (n_ == 0) {
-          throw error_improper_input_parameter("n = 0.");
-        }
-        if (eps_ < FloatType(0)) {
-          throw error_improper_input_parameter("eps < 0.");
-        }
-      }
-
-      //! Number of free parameters (as passed to the constructor).
-      SizeType n() const { return n_; }
-
-      /*! \brief Accuracy with which the solution is to be found
-          (as passed to the constructor).
-       */
-      FloatType eps() const { return eps_; }
-
-      //! Execution of the convergence test for the given parameters.
-      /*! Returns <code>true</code> if
-          <pre>
-            ||g|| &lt; eps * max(1,||x||),
-          </pre>
-          where <code>||.||</code> denotes the Euclidean norm.
-
-          @param x Current solution vector.
-
-          @param g Components of the gradient at the current
-            point <code>x</code>.
-       */
-      bool
-      operator()(const FloatType* x, const FloatType* g) const
-      {
-        FloatType xnorm = std::sqrt(detail::ddot(n_, x, x));
-        FloatType gnorm = std::sqrt(detail::ddot(n_, g, g));
-        if (gnorm <= eps_ * std::max(FloatType(1), xnorm)) return true;
-        return false;
-      }
-    protected:
-      const SizeType n_;
-      const FloatType eps_;
-  };
-
-}} // namespace scitbx::lbfgs
-
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const scitbx::lbfgs::minimizer<T>& min) {
-  return os << "ITER=" << min.iter() << "\tNFUN=" << min.nfun() << "\tSTP=" << min.stp() << "\tDIAG=" << min.requests_diag() << "\tF&G=" << min.requests_f_and_g();
-}
-
-
-#endif // SCITBX_LBFGS_H
diff --git a/training/lbfgs_test.cc b/training/lbfgs_test.cc
deleted file mode 100644
index 9678e788..00000000
--- a/training/lbfgs_test.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-#include <cassert>
-#include <iostream>
-#include <sstream>
-#include <cmath>
-#include "lbfgs.h"
-#include "sparse_vector.h"
-#include "fdict.h"
-
-using namespace std;
-
-double TestOptimizer() {
-  cerr << "TESTING NON-PERSISTENT OPTIMIZER\n";
-
-  // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
-  // df/dx1 = 8*x1 + x2
-  // df/dx2 = 2*x2 + x1
-  // df/dx3 = 2*x3 + 6
-  double x[3];
-  double g[3];
-  scitbx::lbfgs::minimizer<double> opt(3);
-  scitbx::lbfgs::traditional_convergence_test<double> converged(3);
-  x[0] = 8;
-  x[1] = 8;
-  x[2] = 8;
-  double obj = 0;
-  do {
-    g[0] = 8 * x[0] + x[1];
-    g[1] = 2 * x[1] + x[0];
-    g[2] = 2 * x[2] + 6;
-    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
-    opt.run(x, obj, g);
-    if (!opt.requests_f_and_g()) {
-      if (converged(x,g)) break;
-      opt.run(x, obj, g);
-    }
-    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
-    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
-    cerr << opt << endl;
-  } while (true);
-  return obj;
-}
-
-double TestPersistentOptimizer() {
-  cerr << "\nTESTING PERSISTENT OPTIMIZER\n";
-  // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
-  // df/dx1 = 8*x1 + x2
-  // df/dx2 = 2*x2 + x1
-  // df/dx3 = 2*x3 + 6
-  double x[3];
-  double g[3];
-  scitbx::lbfgs::traditional_convergence_test<double> converged(3);
-  x[0] = 8;
-  x[1] = 8;
-  x[2] = 8;
-  double obj = 0;
-  string state;
-  do {
-    g[0] = 8 * x[0] + x[1];
-    g[1] = 2 * x[1] + x[0];
-    g[2] = 2 * x[2] + 6;
-    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
-
-    {
-      scitbx::lbfgs::minimizer<double> opt(3);
-      if (state.size() > 0) {
-        istringstream is(state, ios::binary);
-        opt.deserialize(&is);
-      }
-      opt.run(x, obj, g);
-      ostringstream os(ios::binary); opt.serialize(&os); state = os.str();
-    }
-
-    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
-    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
-  } while (!converged(x, g));
-  return obj;
-}
-
-void TestSparseVector() {
-  cerr << "Testing SparseVector<double> serialization.\n";
-  int f1 = FD::Convert("Feature_1");
-  int f2 = FD::Convert("Feature_2");
-  FD::Convert("LanguageModel");
-  int f4 = FD::Convert("SomeFeature");
-  int f5 = FD::Convert("SomeOtherFeature");
-  SparseVector<double> g;
-  g.set_value(f2, log(0.5));
-  g.set_value(f4, log(0.125));
-  g.set_value(f1, 0);
-  g.set_value(f5, 23.777);
-  ostringstream os;
-  double iobj = 1.5;
-  B64::Encode(iobj, g, &os);
-  cerr << iobj << "\t" << g << endl;
-  string data = os.str();
-  cout << data << endl;
-  SparseVector<double> v;
-  double obj;
-  bool decode_b64 = B64::Decode(&obj, &v, &data[0], data.size());
-  cerr << obj << "\t" << v << endl;
-  assert(decode_b64);
-  assert(obj == iobj);
-  assert(g.size() == v.size());
-}
-
-int main() {
-  double o1 = TestOptimizer();
-  double o2 = TestPersistentOptimizer();
-  if (fabs(o1 - o2) > 1e-5) {
-    cerr << "OPTIMIZERS PERFORMED DIFFERENTLY!\n" << o1 << " vs. " << o2 << endl;
-    return 1;
-  }
-  TestSparseVector();
-  cerr << "SUCCESS\n";
-  return 0;
-}
-
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
deleted file mode 100644
index a46ce33c..00000000
--- a/training/lbl_model.cc
+++ /dev/null
@@ -1,421 +0,0 @@
-#include <iostream>
-
-#include "config.h"
-#ifndef HAVE_EIGEN
-  int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; }
-#else
-
-#include <cstdlib>
-#include <algorithm>
-#include <cmath>
-#include <set>
-#include <cstring> // memset
-#include <ctime>
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-#include <boost/archive/text_oarchive.hpp>
-namespace mpi = boost::mpi;
-#endif
-#include <boost/math/special_functions/fpclassify.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <Eigen/Dense>
-
-#include "corpus_tools.h"
-#include "optimize.h"
-#include "array2d.h"
-#include "m.h"
-#include "lattice.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "tdict.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-#define kDIMENSIONS 10
-typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector;
-typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector;
-typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix;
-vector<RVector> r_src, r_trg;
-
-#if HAVE_MPI
-namespace boost {
-namespace serialization {
-
-template<class Archive>
-void serialize(Archive & ar, RVector & v, const unsigned int version) {
-  for (unsigned i = 0; i < kDIMENSIONS; ++i)
-    ar & v[i];
-}
-
-} // namespace serialization
-} // namespace boost
-#endif
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input,i",po::value<string>(),"Input file")
-        ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
-        ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
-        ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD")
-        ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)")
-        ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)")
-        ("random_seed,s", po::value<unsigned>(), "Random seed")
-        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
-        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (argc < 2 || conf->count("help")) {
-    cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void Normalize(RVector* v) {
-  double norm = v->norm();
-  assert(norm > 0.0f);
-  *v /= norm;
-}
-
-void Flatten(const TMatrix& m, vector<double>* v) {
-  unsigned c = 0;
-  v->resize(kDIMENSIONS * kDIMENSIONS);
-  for (unsigned i = 0; i < kDIMENSIONS; ++i)
-    for (unsigned j = 0; j < kDIMENSIONS; ++j) {
-      assert(boost::math::isfinite(m(i, j)));
-      (*v)[c++] = m(i,j);
-    }
-}
-
-void Unflatten(const vector<double>& v, TMatrix* m) {
-  unsigned c = 0;
-  for (unsigned i = 0; i < kDIMENSIONS; ++i)
-    for (unsigned j = 0; j < kDIMENSIONS; ++j) {
-      assert(boost::math::isfinite(v[c]));
-      (*m)(i, j) = v[c++];
-    }
-}
-
-double ApplyRegularization(const double C,
-                           const vector<double>& weights,
-                           vector<double>* g) {
-  assert(weights.size() == g->size());
-  double reg = 0;
-  for (size_t i = 0; i < weights.size(); ++i) {
-    const double& w_i = weights[i];
-    double& g_i = (*g)[i];
-    reg += C * w_i * w_i;
-    g_i += 2 * C * w_i;
-  }
-  return reg;
-}
-
-void LoadEmbeddings(const string& filename, vector<RVector>* pv) {
-  vector<RVector>& v = *pv;
-  cerr << "Reading embeddings from " << filename << " ...\n";
-  ReadFile rf(filename);
-  istream& in = *rf.stream();
-  string line;
-  unsigned lc = 0;
-  while(getline(in, line)) {
-    ++lc;
-    size_t cur = line.find(' ');
-    if (cur == string::npos || cur == 0) {
-      cerr << "Parse error reading line " << lc << ":\n" << line << endl;
-      abort();
-    }
-    WordID w = TD::Convert(line.substr(0, cur));
-    if (w >= v.size()) continue;
-    RVector& curv = v[w];
-    line[cur] = 0;
-    size_t start = cur + 1;
-    cur = start + 1;
-    size_t c = 0;
-    while(cur < line.size()) {
-      if (line[cur] == ' ') {
-        line[cur] = 0;
-        curv[c++] = strtod(&line[start], NULL);
-        start = cur + 1;
-        cur = start;
-        if (c == kDIMENSIONS) break;
-      }
-      ++cur;
-    }
-    if (c < kDIMENSIONS && cur != start) {
-      if (cur < line.size()) line[cur] = 0;
-      curv[c++] = strtod(&line[start], NULL);
-    }
-    if (c != kDIMENSIONS) {
-      static bool first = true;
-      if (first) {
-        cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n";
-        first = false;
-      }
-      for (; c < kDIMENSIONS; ++c) curv[c] = rand();
-    }
-    if (c == kDIMENSIONS && cur != line.size()) {
-      static bool first = true;
-      if (first) {
-        cerr << " embedding file contains more dimensions than configured with, truncating.\n";
-        first = false;
-      }
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  std::cerr << "**MPI enabled.\n";
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  std::cerr << "**MPI disabled.\n";
-  const int rank = 0;
-  const int size = 1;
-#endif
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf)) return 1;
-  const string fname = conf["input"].as<string>();
-  const double reg_strength = conf["regularization_strength"].as<double>();
-  const bool has_l2 = reg_strength;
-  assert(reg_strength >= 0.0f);
-  const int ITERATIONS = conf["iterations"].as<unsigned>();
-  const double eta = conf["eta"].as<double>();
-  const double diagonal_tension = conf["diagonal_tension"].as<double>();
-  bool SGD = false;
-  if (diagonal_tension < 0.0) {
-    cerr << "Invalid value for diagonal_tension: must be >= 0\n";
-    return 1;
-  }
-  string testset;
-  if (conf.count("testset")) testset = conf["testset"].as<string>();
-
-  unsigned lc = 0;
-  vector<double> unnormed_a_i;
-  bool flag = false;
-  vector<vector<WordID> > srcs, trgs;
-  vector<WordID> vocab_e;
-  {
-    set<WordID> svocab_e, svocab_f;
-    CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size);
-    copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
-  }
-  cerr << "Number of target word types: " << vocab_e.size() << endl;
-  const double num_examples = lc;
-
-  boost::shared_ptr<LBFGSOptimizer> lbfgs;
-  if (rank == 0)
-    lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100));
-  r_trg.resize(TD::NumWords() + 1);
-  r_src.resize(TD::NumWords() + 1);
-  vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
-
-  if (conf.count("random_seed")) {
-    srand(conf["random_seed"].as<unsigned>());
-  } else {
-    unsigned seed = time(NULL) + rank * 100;
-    cerr << "Random seed: " << seed << endl;
-    srand(seed);
-  }
-  
-  TMatrix t = TMatrix::Zero();
-  if (rank == 0) {
-    t = TMatrix::Random() / 50.0;
-    for (unsigned i = 1; i < r_trg.size(); ++i) {
-      r_trg[i] = RVector::Random();
-      r_src[i] = RVector::Random();
-    }
-    if (conf.count("source_embeddings"))
-      LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
-    if (conf.count("target_embeddings"))
-      LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
-  }
-
-  // do optimization
-  TMatrix g = TMatrix::Zero();
-  vector<TMatrix> exp_src;
-  vector<double> z_src;
-  vector<double> flat_g, flat_t, rcv_grad;
-  Flatten(t, &flat_t);
-  bool converged = false;
-#if HAVE_MPI
-  mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
-  mpi::broadcast(world, r_trg, 0);
-  mpi::broadcast(world, r_src, 0);
-#endif
-  cerr << "rank=" << rank << ": " << r_trg[0][4] << endl;
-  for (int iter = 0; !converged && iter < ITERATIONS; ++iter) {
-    if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl;
-    Unflatten(flat_t, &t);
-    double likelihood = 0;
-    double denom = 0.0;
-    lc = 0;
-    flag = false;
-    g *= 0;
-    for (unsigned i = 0; i < srcs.size(); ++i) {
-      const vector<WordID>& src = srcs[i];
-      const vector<WordID>& trg = trgs[i];
-      ++lc;
-      if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; }
-      if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
-      denom += trg.size();
-
-      exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
-      z_src.clear(); z_src.resize(src.size(), 0.0);
-      Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
-      Array2D<double> z_refs(src.size(), trg.size(), 0.0);
-      for (unsigned j = 0; j < trg.size(); ++j)
-        trg_pos[trg[j]].insert(j);
-
-      for (unsigned i = 0; i < src.size(); ++i) {
-        const RVector& r_s = r_src[src[i]];
-        const RTVector pred = r_s.transpose() * t;
-        TMatrix& exp_m = exp_src[i];
-        double& z = z_src[i];
-        for (unsigned k = 0; k < vocab_e.size(); ++k) {
-          const WordID v_k = vocab_e[k];
-          const RVector& r_t = r_trg[v_k];
-          const double dot_prod = pred * r_t;
-          const double u = exp(dot_prod);
-          z += u;
-          const TMatrix v = r_s * r_t.transpose() * u;
-          exp_m += v;
-          set<unsigned>& ref_locs = trg_pos[v_k];
-          if (!ref_locs.empty()) {
-            for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) {
-              TMatrix& exp_ref_ij = exp_refs(i, *it);
-              double& z_ref_ij = z_refs(i, *it);
-              z_ref_ij += u;
-              exp_ref_ij += v;
-            }
-          }
-        }
-      }
-      for (unsigned j = 0; j < trg.size(); ++j)
-        trg_pos[trg[j]].clear();
-
-      // model expectations for a single target generation with
-      // uniform alignment prior
-      // TODO: when using a non-uniform alignment, m_exp will be
-      // a function of j (below)
-      double m_z = 0;
-      TMatrix m_exp = TMatrix::Zero();
-      for (unsigned i = 0; i < src.size(); ++i) {
-        m_exp += exp_src[i];
-        m_z += z_src[i];
-      }
-      m_exp /= m_z;
-
-      Array2D<bool> al(src.size(), trg.size(), false);
-      for (unsigned j = 0; j < trg.size(); ++j) {
-        double ref_z = 0;
-        TMatrix ref_exp = TMatrix::Zero();
-        int max_i = 0;
-        double max_s = -9999999;
-        for (unsigned i = 0; i < src.size(); ++i) {
-          ref_exp += exp_refs(i, j);
-          ref_z += z_refs(i, j);
-          if (log(z_refs(i, j)) > max_s) {
-            max_s = log(z_refs(i, j));
-            max_i = i;
-          }
-          // TODO handle alignment prob
-        }
-        if (ref_z <= 0) { 
-          cerr << "TRG=" << TD::Convert(trg[j]) << endl;
-          cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl;
-          cerr << " REF_EXP=\n" << ref_exp << endl;
-          cerr << " M_EXP=\n" << m_exp << endl;
-          abort();
-        }
-        al(max_i, j) = true;
-        ref_exp /= ref_z;
-        g += m_exp - ref_exp;
-        likelihood += log(ref_z) - log(m_z);
-        if (SGD) {
-          t -= g * eta / num_examples;
-          g *= 0;
-        }
-      }
-      
-      if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; }
-    }
-    if (flag && rank == 0) { cerr << endl; }
-
-    double obj = 0;
-    if (!SGD) {
-      Flatten(g, &flat_g);
-      obj = -likelihood;
-#if HAVE_MPI
-      rcv_grad.resize(flat_g.size(), 0.0);
-      mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0);
-      swap(flat_g, rcv_grad);
-      rcv_grad.clear();
-
-      double to = 0;
-      mpi::reduce(world, obj, to, plus<double>(), 0);
-      obj = to;
-      double tlh = 0;
-      mpi::reduce(world, likelihood, tlh, plus<double>(), 0);
-      likelihood = tlh;
-      double td = 0;
-      mpi::reduce(world, denom, td, plus<double>(), 0);
-      denom = td;
-#endif
-    }
-
-    if (rank == 0) {
-      double gn = 0;
-      for (unsigned i = 0; i < flat_g.size(); ++i)
-        gn += flat_g[i]*flat_g[i];
-      const double base2_likelihood = likelihood / log(2);
-      cerr << "  log_e likelihood: " << likelihood << endl;
-      cerr << "  log_2 likelihood: " << base2_likelihood << endl;
-      cerr << "     cross entropy: " << (-base2_likelihood / denom) << endl;
-      cerr << "        perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
-      cerr << "     gradient norm: " << sqrt(gn) << endl;
-      if (!SGD) {
-        if (has_l2) {
-          const double r = ApplyRegularization(reg_strength,
-                                               flat_t,
-                                               &flat_g);
-          obj += r;
-          cerr << "    regularization: " << r << endl;
-        }
-        lbfgs->Optimize(obj, flat_g, &flat_t);
-        converged = (lbfgs->HasConverged());
-      }
-    }
-#ifdef HAVE_MPI
-    mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
-    mpi::broadcast(world, converged, 0);
-#endif
-  }
-  if (rank == 0)
-    cerr << "TRANSLATION MATRIX:" << endl << t << endl;
-  return 0;
-}
-
-#endif
-
diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am
new file mode 100644
index 00000000..a15e821e
--- /dev/null
+++ b/training/minrisk/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = minrisk_optimize
+
+minrisk_optimize_SOURCES = minrisk_optimize.cc
+minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz
+
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils
diff --git a/training/minrisk/minrisk.pl b/training/minrisk/minrisk.pl
new file mode 100755
index 00000000..0f8bacd0
--- /dev/null
+++ b/training/minrisk/minrisk.pl
@@ -0,0 +1,540 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl";
+my $MAPPER = "$bin_dir/minrisk_optimize";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1;  # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 500;
+my $reg_previous = 5000;
+my $dont_accum = 0;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+	"jobs=i" => \$jobs,
+	"dont-clean" => \$disable_clean,
+	"dont-accumulate" => \$dont_accum,
+	"pass-suffix=s" => \$pass_suffix,
+        "qsub" => \$useqsub,
+	"dry-run" => \$dryrun,
+	"epsilon=s" => \$epsilon,
+	"help" => \$help,
+        "weights=s" => \$initial_weights,
+	"reg=f" => \$reg,
+	"use-make=i" => \$use_make,
+	"max-iterations=i" => \$max_iterations,
+	"pmem=s" => \$pmem,
+        "cpbin!" => \$cpbin,
+	"ref-files=s" => \$refFiles,
+	"metric=s" => \$metric,
+	"source-file=s" => \$srcFile,
+	"workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+	print_help();
+	exit;
+}
+
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {  # --qsub turns off $use_make, so parallelize.pl is later run without --use-fork
+  $use_make = 0;
+  die "LocalConfig.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+  $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+  $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+	$dir = "minrisk";
+}
+unless ($dir =~ /^\//){  # convert relative path to absolute path
+	my $basedir = check_output("pwd");
+	chomp $basedir;
+	$dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {  # kill every pid in @childpids, run every command in @cleanupcmds, then exit with status 1
+	print STDERR "Cleanup...\n";
+	foreach my $child (@childpids) { unchecked_call("kill $child"); }
+	foreach my $todo (@cleanupcmds) { unchecked_call("$todo"); }
+	exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit = 
+    sub{ cleanup(); }; 
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {  # copy each program named by the scalar refs after $bindir into $bindir and repoint the ref at the copy
+    local $_;
+    my ($bindir, @prog_refs) = @_;
+    check_call("mkdir -p $bindir");
+    die "couldn't make bindir $bindir" unless -d $bindir;
+    foreach my $ref (@prog_refs) {
+        my $original = $$ref;
+        $$ref = "$bindir/" . basename($original);
+        check_call("cp -p $original $$ref");
+    }
+}
+sub dirsize {  # number of entries readdir reports for directory $_[0], minus one; 0 if it cannot be opened
+    opendir(my $dh, $_[0]) or return 0;
+    return scalar(() = readdir($dh)) - 1;  # list assignment forces list context so we count entries, not fetch one name
+}
+my @allweights;
+if ($dryrun){
+	write_config(*STDERR);
+	exit 0;
+} else {
+	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
+	  die "ERROR: working dir $dir already exists\n\n";
+	} else {
+		-e $dir || mkdir $dir;
+		mkdir "$dir/hgs";
+        modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+    mkdir "$dir/scripts";
+        my $cmdfile="$dir/rerun-pro.sh";
+        open CMD,'>',$cmdfile;
+        print CMD "cd ",&getcwd,"\n";
+#        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+        my $cline=&cmdline."\n";
+        print CMD $cline;
+        close CMD;
+        print STDERR $cline;
+        chmod(0755,$cmdfile);
+	check_call("cp $initial_weights $dir/weights.0");
+	die "Can't find weights.0" unless (-e "$dir/weights.0");
+	}
+	write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+my $kbest = "$dir/kbest";
+if ($dont_accum) {
+  $kbest = '';
+} else {
+  check_call("mkdir -p $kbest");
+  $kbest = "--kbest_repository $kbest";
+}
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){  # each pass decodes the dev set with weights.<i-1>, then runs the optimizer to write weights.<i>
+	print STDERR "\n\nITERATION $iteration\n==========\n";
+
+	if ($iteration > $max_iterations){
+		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+		last;
+	}
+	# iteration-specific files
+	my $runFile="$dir/run.raw.$iteration";
+	my $onebestFile="$dir/1best.$iteration";
+	my $logdir="$dir/logs.$iteration";
+	my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+	my $scorerLog="$logdir/scorer.log.$iteration";
+	check_call("mkdir -p $logdir");
+
+
+	#decode
+	print STDERR "RUNNING DECODER AT ";
+	print STDERR unchecked_output("date");
+	my $im1 = $iteration - 1;
+	my $weightsFile="$dir/weights.$im1";
+        push @allweights, "-w $dir/weights.$im1";
+        `rm -f $dir/hgs/*.gz`;  # start from an empty hypergraph directory
+	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+	my $pcmd;
+	if ($use_make) {
+		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+	} else {
+		$pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+	}
+	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+        my $num_hgs;
+        my $num_topbest;
+        my $retries = 0;
+	while($retries < 5) {  # re-count up to 5 times, 3 seconds apart, before giving up
+	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+	    $num_topbest = check_output("wc -l < $runFile");
+	    print STDERR "NUMBER OF HGs: $num_hgs\n";
+	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+	    if($devSize == $num_hgs && $devSize == $num_topbest) {
+		last;
+	    } else {
+		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+		sleep(3);
+	    }
+	    $retries++;
+	}
+	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+	chomp $dec_score;
+	print STDERR "DECODER SCORE: $dec_score\n";
+
+	# save space
+	check_call("gzip -f $runFile");
+	check_call("gzip -f $decoderLog");
+
+	# run optimizer
+	print STDERR "RUNNING OPTIMIZER AT ";
+	print STDERR unchecked_output("date");
+	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+	my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+	my $score = 0;
+	my $icc = 0;
+	my $inweights="$dir/weights.$im1";
+	my $outweights="$dir/weights.$iteration";
+	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	$cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights";
+	check_call($cmd);
+	$lastWeightsFile = $outweights;
+	$iteration++;
+	`rm -f $dir/hgs/*.gz`;  # remove this iteration's hypergraphs from the work dir (not from ./hgs of the cwd)
+	print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {  # returns the number of lines in file $fn; dies if the file cannot be opened for reading
+  my $fn = shift @_;
+  open(my $fh, '<', $fn) or die "Couldn't read $fn: $!";
+  my $lc = 0;
+  while (defined(my $line = <$fh>)) { $lc++; }  # lexical $line keeps the caller's $_ intact
+  return $lc;  # the lexical handle $fh is closed automatically when it goes out of scope
+}
+
+sub get_comma_sep_refs {  # runs `echo $p` via check_output and prefixes each whitespace-separated word with "-$r"
+  my ($flag, $pattern) = @_;
+  my $expanded = check_output("echo $pattern");
+  chomp $expanded;
+  my @files = split /\s+/, $expanded;
+  return "-$flag " . join(" -$flag ", @files);
+}
+
+sub read_weights_file {  # reads "<name> <weight>" lines from $file and returns the weights joined by single spaces
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;  # never reassigned below, so it stays -1 for the whole file
+  while(<F>) {
+    next if /^#/;  # skip comment lines
+    next if /^\s*$/;  # skip blank lines
+    chomp;
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;  # first field (greedy: everything before the last whitespace run)
+      my $w = $2;  # last field, the value that is collected
+      die "Weights out of order: $m <= $pm" unless $m > $pm;  # NOTE(review): numeric compare of the first field against the constant -1 -- confirm intent
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+# subs
+sub write_config {
+	# Prints the current run settings (file-level variables) to the handle passed as the only argument.
+	my ($fh) = @_;
+	my $cleanup = $disable_clean ? "no" : "yes";
+
+	print {$fh} "\n";
+	print {$fh} "DECODER:          $decoder\n";
+	print {$fh} "INI FILE:         $iniFile\n";
+	print {$fh} "WORKING DIR:      $dir\n";
+	print {$fh} "SOURCE (DEV):     $srcFile\n";
+	print {$fh} "REFS (DEV):       $refFiles\n";
+	print {$fh} "EVAL METRIC:      $metric\n";
+	print {$fh} "MAX ITERATIONS:   $max_iterations\n";
+	print {$fh} "JOBS:             $jobs\n";
+	print {$fh} "HEAD NODE:        $host\n";
+	print {$fh} "PMEM (DECODING):  $pmem\n";
+	print {$fh} "CLEANUP:          $cleanup\n";
+}
+
+sub update_weights_file {
+  # Writes one "<feature> <weight>" line to $neww per index of @$rfn / @$rpts.
+  my ($neww, $rfn, $rpts) = @_;
+  my $num_feats = scalar @$rfn;
+  my $num_pts = scalar @$rpts;
+  # The two lists are paired by position, so they must have the same length.
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open G, ">$neww" or die;
+  foreach my $idx (0 .. $num_feats - 1) {
+    my $name = $rfn->[$idx];
+    my $value = $rpts->[$idx];
+    print G "$name $value\n";
+  }
+  close G;
+}
+
+sub enseg {  # copies $src to $newsrc, wrapping every line that lacks a <seg> tag in <seg id="N">...</seg>
+	my $src = shift;
+	my $newsrc = shift;
+	open(SRC, '<', $src) or die "Can't read $src: $!";
+	open(NEWSRC, '>', $newsrc) or die "Can't write $newsrc: $!";
+	my $i=0;  # zero-based line counter, used as the generated segment id
+	while (my $line=<SRC>){
+		chomp $line;
+		if ($line =~ /^\s*<seg/i) {
+		    if($line =~ /id="[0-9]+"/) {
+			print NEWSRC "$line\n";
+		    } else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		    }
+		} else {
+			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+		}
+		$i++;
+	}
+	close SRC;
+	close(NEWSRC) or die "Can't finish writing $newsrc: $!";  # surface buffered write errors
+	die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+	# Prints the usage text to STDOUT; $default_max_iter and $default_jobs are interpolated into it.
+	my $executable = check_output("basename $0"); chomp $executable;
+	print << "Help";
+
+Usage: $executable [options] <ini file>
+
+	$executable [options] <ini file>
+		Runs a complete minimum-risk optimization using the ini file specified.
+
+Required:
+
+	--ref-files <files>
+		Dev set ref files.  This option takes only a single string argument.
+		To use multiple files (including file globbing), this argument should
+		be quoted.
+
+	--source-file <file>
+		Dev set source file.
+
+	--weights <file>
+		Initial weights file (use empty file to start from 0)
+
+General options:
+
+	--help
+		Print this message and exit.
+
+	--dont-accumulate
+		Don't accumulate k-best lists from multiple iterations.
+
+	--max-iterations <M>
+		Maximum number of iterations to run.  If not specified, defaults
+		to $default_max_iter.
+
+	--metric <method>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+	--pass-suffix <S>
+		If the decoder is doing multi-pass decoding, the pass suffix "2",
+		"3", etc., is used to control what iteration of weights is set.
+
+	--workdir <dir>
+		Directory for intermediate and output files.  If not specified, a
+		directory named "minrisk" is used.  A relative path is resolved
+		against the current working directory.  Per-iteration weights are
+		written to weights.<N> in this directory, and the path of the
+		final weights file is printed on standard output when the run
+		completes.
+
+Regularization options:
+
+	--reg <F>
+		l2 regularization strength [default=500]. NOTE: this value is
+		currently not passed to the optimizer, so it has no effect.
+
+Job control options:
+
+	--jobs <I>
+		Number of decoder processes to run in parallel. [default=$default_jobs]
+
+	--qsub
+		Use qsub to run jobs in parallel (qsub must be configured in
+		environment/LocalConfig.pm)
+
+	--pmem <N>
+		Amount of physical memory requested for parallel decoding jobs
+		(used with qsub requests only)
+
+Help
+}
+
+sub convert {
+  # Parses a "k1=v1;k2=v2" string and returns the pairs as a key => value list.
+  my ($str) = @_;
+  my %dict = ();
+  foreach my $pair (split /;/, $str) {
+    my ($k, $v) = split /=/, $pair;
+    $dict{$k} = $v;
+  }
+  return %dict;
+}
+
+
+sub cmdline {  # script path followed by the original arguments, space-separated (no quoting)
+    return join(' ', $0, @ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+    # Returns $arg unchanged unless it matches $is_shell_special; otherwise backslash-escapes
+    # every match of $shell_escape_in_quote and wraps the result in double quotes.
+    my $word = shift;
+    return undef if !defined $word;
+    return $word unless $word =~ /$is_shell_special/;
+    $word =~ s/($shell_escape_in_quote)/\\$1/g;
+    return '"' . $word . '"';
+}
+
+sub escaped_shell_args {  # chomps a copy of each argument and passes it through escape_shell
+    return map { my $arg = $_; chomp $arg; escape_shell($arg) } @_;
+}
+
+sub escaped_shell_args_str {  # the escaped arguments joined by single spaces
+    return join(' ', escaped_shell_args(@_));
+}
+
+sub escaped_cmdline {  # script path followed by the shell-escaped original arguments
+    return $0 . ' ' . escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/training/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/training/minrisk/minrisk_generate_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Print one "PATH ID" line for every *.gz entry found in the directory HG_DIR.
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+opendir(my $dh, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir($dh);
+closedir $dh;
+
+for my $hg (@hgs) {
+  my $id = $hg;
+  # Strip only the trailing extension, so a ".gz" inside the name is kept.
+  $id =~ s/(\.json)?\.gz$//;
+  print "$d/$hg $id\n";
+}
+
diff --git a/training/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc
new file mode 100644
index 00000000..da8b5260
--- /dev/null
+++ b/training/minrisk/minrisk_optimize.cc
@@ -0,0 +1,197 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "liblbfgs/lbfgs++.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "candidate_set.h"
+#include "risk.h"
+#include "entropy.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// Parses argv into *conf.  Prints the option summary to stderr and exits with
+// status 1 when --help is given or a required option (-r, -w) is missing.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+        ("temperature,T",po::value<double>()->default_value(0.0), "Temperature parameter for objective (>0 increases the entropy)")
+        ("l1_strength,C",po::value<double>()->default_value(0.0), "L1 regularization strength")
+        ("memory_buffers,M",po::value<unsigned>()->default_value(20), "Memory buffers used in LBFGS")
+        ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)")
+        ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  // Report every missing required option before printing the usage text.
+  const bool no_refs = !conf->count("reference");
+  const bool no_weights = !conf->count("weights");
+  if (no_refs)
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+  if (no_weights)
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+  if (no_refs || no_weights || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+EvaluationMetric* metric = NULL;  // assigned in main() before any RiskObjective is evaluated
+
+// Objective functor given to LBFGS: returns summed risk minus T times summed
+// entropy, and accumulates the risk gradient plus T times the entropy functor's gradient into g.
+struct RiskObjective {
+  explicit RiskObjective(const vector<training::CandidateSet>& tr, const double temp) : training(tr), T(temp) {}
+  double operator()(const vector<double>& x, double* g) const {
+    typedef SparseVector<double>::iterator GradIter;
+    fill(g, g + x.size(), 0.0);
+    double total_risk = 0;
+    double total_entropy = 0;
+    for (unsigned s = 0; s < training.size(); ++s) {
+      training::CandidateSetRisk risk(training[s], *metric);
+      training::CandidateSetEntropy entropy(training[s]);
+      SparseVector<double> risk_grad, entropy_grad;
+      total_risk += risk(x, &risk_grad);
+      total_entropy += entropy(x, &entropy_grad);
+      for (GradIter it = risk_grad.begin(); it != risk_grad.end(); ++it)
+        g[it->first] += it->second;
+      if (T == 0) continue;  // entropy term contributes nothing to the gradient
+      for (GradIter it = entropy_grad.begin(); it != entropy_grad.end(); ++it)
+        g[it->first] += T * it->second;
+    }
+    cerr << (1-(total_risk / training.size())) << "  H=" << total_entropy << endl;
+    return total_risk - T * total_entropy;
+  }
+  const vector<training::CandidateSet>& training;
+  const double T; // temperature for entropy regularization
+};
+
+// Runs LBFGS on RiskObjective with px as the parameter vector; always returns 0.
+// temp > 0 increases the entropy, < 0 decreases it; C1 is passed on to LBFGS.
+double LearnParameters(const vector<training::CandidateSet>& training,
+                       const double temp, const double C1,
+                       const unsigned memory_buffers, vector<weight_t>* px) {
+  RiskObjective objective(training, temp);
+  LBFGS<RiskObjective> optimizer(px, objective, memory_buffers, C1);
+  optimizer.MinimizeFunction();
+  return 0;
+}
+
+#if 0  // disabled: scratch objective used only by the "#if 0" block at the top of main()
+struct FooLoss {
+  double operator()(const vector<double>& x, double* g) const {
+    fill(g, g + x.size(), 0.0);  // start from a zero gradient
+    training::CandidateSet cs;
+    training::CandidateSetEntropy cse(cs);
+    cs.cs.resize(3);  // three hand-built candidates over features F1 and F2
+    cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0);
+    cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0);
+    cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0);
+    cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5);
+    SparseVector<double> xx;
+    double h = cse(x, &xx);
+    cerr << cse(x, &xx) << endl; cerr << "G: " << xx << endl;  // NOTE(review): calls cse a second time with the same xx
+    for (SparseVector<double>::iterator i = xx.begin(); i != xx.end(); ++i)
+      g[i->first] += i->second;
+    return -h;  // negated value returned by the entropy functor
+  }
+};
+#endif
+
+// Reads "<hypergraph file> <sentence id>" lines, builds per-sentence k-best sets, runs LBFGS, writes the weights to "-".
+int main(int argc, char** argv) {
+#if 0
+  training::CandidateSet cs;
+  training::CandidateSetEntropy cse(cs);
+  cs.cs.resize(3);
+  cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0);
+  cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0);
+  cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0);
+  cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5);
+  FooLoss foo;
+  vector<double> ww(FD::NumFeats()); ww[FD::Convert("F1")] = 1.0;
+  LBFGS<FooLoss> lbfgs(&ww, foo, 100, 0.0);
+  lbfgs.MinimizeFunction();
+  return 1;
+#endif
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+  metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+
+  Hypergraph hg;
+  ReadFile in_read(conf["input"].as<string>());
+  string kbest_repo;
+  if (conf.count("kbest_repository")) {
+    kbest_repo = conf["kbest_repository"].as<string>();
+    MkDirP(kbest_repo);
+  }
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  vector<weight_t> weights;
+  const string weightsf = conf["weights"].as<string>();
+  Weights::InitFromFile(weightsf, &weights);
+  double t = 0;  // squared L2 norm of the initial weights
+  for (unsigned i = 0; i < weights.size(); ++i)
+    t += weights[i] * weights[i];
+  const double norm = sqrt(t);
+  for (unsigned i = 0; norm > 0 && i < weights.size(); ++i)
+    weights[i] /= norm;  // rescale to unit length; skipped for an all-zero vector
+  string line, file;
+  vector<training::CandidateSet> kis;
+  cerr << "Loading hypergraphs...\n";
+  while(getline(in, line)) {
+    istringstream is(line);
+    int sent_id;
+    // The id must be parsed before the repository file name that embeds it is built.
+    if (!(is >> file >> sent_id)) { cerr << "Malformed input line: " << line << endl; return 1; }
+    kis.resize(kis.size() + 1);
+    training::CandidateSet& curkbest = kis.back();
+    string kbest_file;
+    if (kbest_repo.size()) {
+      ostringstream os;
+      os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+      kbest_file = os.str();
+      if (FileExists(kbest_file))
+        curkbest.ReadFromFile(kbest_file);
+    }
+    ReadFile rf(file);
+    if (kis.size() % 5 == 0) { cerr << '.'; }
+    if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+    if (kbest_file.size())
+      curkbest.WriteToFile(kbest_file);
+  }
+  cerr << "\nHypergraphs loaded.\n";
+  weights.resize(FD::NumFeats());
+
+  double c1 = conf["l1_strength"].as<double>();
+  double temp = conf["temperature"].as<double>();
+  unsigned m = conf["memory_buffers"].as<unsigned>();
+  LearnParameters(kis, temp, c1, m, &weights);
+  Weights::WriteToFile("-", weights);
+  return 0;
+}
+
diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am
new file mode 100644
index 00000000..ae609ede
--- /dev/null
+++ b/training/mira/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = kbest_mira
+
+kbest_mira_SOURCES = kbest_mira.cc
+kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/mira/kbest_mira.cc b/training/mira/kbest_mira.cc
new file mode 100644
index 00000000..8b7993dd
--- /dev/null
+++ b/training/mira/kbest_mira.cc
@@ -0,0 +1,309 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <tr1/memory>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "hg_sampler.h"
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool invert_score;  // set in main(): true only when the metric type is TER; scores are then multiplied by -1
+std::tr1::shared_ptr<MT19937> rng;  // shared random source, created in main() (optionally from --random_seed)
+
+// Fills *p_ids with a random permutation of 0..len-1 drawn from the global rng.
+void RandomPermutation(int len, vector<int>* p_ids) {
+  p_ids->resize(len);
+  for (int i = 0; i < len; ++i) (*p_ids)[i] = i;
+  for (int i = len; i > 0; --i) {
+    int j = rng->next() * i;
+    if (j >= i) j = i - 1;  // a draw of exactly 1.0 must not index past slot i-1
+    swap((*p_ids)[i-1], (*p_ids)[j]);
+  }
+}
+
+// Parses the command line (and the optional --config file) into *conf.  Prints the
+// option summary and returns false on --help or when a required option is missing.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("input_weights,w",po::value<string>(),"Input feature weights file")
+        ("source,i",po::value<string>(),"Source file for development set")
+        ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data")
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
+        ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
+        ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)")
+        ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by")
+        ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+        ("sample_forest,f", "Instead of a k-best list, sample k hypotheses from the decoder's forest")
+        ("sample_forest_unit_weight_vector,x", "Before sampling (must use -f option), rescale the weight vector used so it has unit length; this may improve the quality of the samples")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  const bool complete = conf->count("input_weights") && conf->count("source") &&
+                        conf->count("decoder_config") && conf->count("reference");
+  if (complete && !conf->count("help")) return true;
+  cerr << dcmdline_options << endl;
+  return false;
+}
+
+static const double kMINUS_EPSILON = -1e-6;  // NOTE(review): not referenced anywhere else in kbest_mira.cc
+
+struct HypothesisInfo {  // one scored hypothesis, built by TrainingObserver::MakeHypothesisInfo
+  SparseVector<double> features;  // feature vector of the hypothesis
+  double mt_metric;  // metric score that was passed to MakeHypothesisInfo for this hypothesis
+};
+
+struct GoodBadOracle {  // per-sentence oracle pair maintained by TrainingObserver::UpdateOracles
+  std::tr1::shared_ptr<HypothesisInfo> good;  // highest-scoring hypothesis kept so far for the sentence
+  std::tr1::shared_ptr<HypothesisInfo> bad;   // lowest-scoring hypothesis of the latest UpdateOracles call (reset each call)
+};
+
+// Decoder observer that scores the hypotheses of each translation forest and
+// tracks the model-best hypothesis plus the per-sentence good/bad oracles.
+struct TrainingObserver : public DecoderObserver {
+  TrainingObserver(const int k, const DocScorer& d, bool sf, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k), sample_forest(sf) {}
+  const DocScorer& ds;
+  vector<GoodBadOracle>& oracles;
+  std::tr1::shared_ptr<HypothesisInfo> cur_best;
+  const int kbest_size;
+  const bool sample_forest;
+
+  const HypothesisInfo& GetCurrentBestHypothesis() const {
+    return *cur_best;
+  }
+
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    UpdateOracles(smeta.GetSentenceID(), *hg);
+  }
+
+  std::tr1::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score) {
+    std::tr1::shared_ptr<HypothesisInfo> h(new HypothesisInfo);
+    h->features = feats;
+    h->mt_metric = score;
+    return h;
+  }
+
+  // `good` persists across calls and is only replaced by a better score; `bad` is rebuilt on every call.
+  void UpdateOracles(int sent_id, const Hypergraph& forest) {
+    std::tr1::shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good;
+    std::tr1::shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad;
+    cur_bad.reset();  // TODO get rid of??
+
+    if (sample_forest) {
+      vector<WordID> cur_prediction;
+      ViterbiESentence(forest, &cur_prediction);
+      float sentscore = ds[sent_id]->ScoreCandidate(cur_prediction)->ComputeScore();
+      if (invert_score) sentscore *= -1.0;  // keep cur_best on the same scale as the oracles and the k-best branch
+      cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore);
+
+      vector<HypergraphSampler::Hypothesis> samples;
+      HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples);
+      for (unsigned i = 0; i < samples.size(); ++i) {
+        sentscore = ds[sent_id]->ScoreCandidate(samples[i].words)->ComputeScore();
+        if (invert_score) sentscore *= -1.0;
+        if (!cur_good || sentscore > cur_good->mt_metric)
+          cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore);
+        if (!cur_bad || sentscore < cur_bad->mt_metric)
+          cur_bad = MakeHypothesisInfo(samples[i].fmap, sentscore);
+      }
+    } else {
+      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
+      for (int i = 0; i < kbest_size; ++i) {
+        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+          kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+        if (!d) break;
+        float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
+        if (invert_score) sentscore *= -1.0;
+        if (i == 0)
+          cur_best = MakeHypothesisInfo(d->feature_values, sentscore);
+        if (!cur_good || sentscore > cur_good->mt_metric)
+          cur_good = MakeHypothesisInfo(d->feature_values, sentscore);
+        if (!cur_bad || sentscore < cur_bad->mt_metric)
+          cur_bad = MakeHypothesisInfo(d->feature_values, sentscore);
+      }
+    }
+  }
+};
+
+// Appends every line of the file named fname to *c, in file order.  The file
+// is opened through ReadFile and read with getline().
+void ReadTrainingCorpus(const string& fname, vector<string>* c) {
+  ReadFile rf(fname);
+  istream& src = *rf.stream();
+  string buf;
+  // getline() reports failure through the stream, so a failed read ends the loop.
+  while (getline(src, buf))
+    c->push_back(buf);
+}
+
+// True when a == b, or when |a-b| relative to |b| is below 1e-6.
+bool ApproxEqual(double a, double b) {
+  return a == b || (fabs(a-b)/fabs(b)) < 0.000001;
+}
+
+// Online k-best MIRA: decodes the development set for --passes passes in random order, updating and averaging the weights.
+int main(int argc, char** argv) {
+  register_feature_functions();
+  SetSilent(true);  // turn off verbose decoder output
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+  const bool sample_forest = conf.count("sample_forest") > 0;
+  const bool sample_forest_unit_weight_vector = conf.count("sample_forest_unit_weight_vector") > 0;
+  if (sample_forest_unit_weight_vector && !sample_forest) {
+    cerr << "Cannot --sample_forest_unit_weight_vector without --sample_forest" << endl;
+    return 1;
+  }
+  vector<string> corpus;
+  ReadTrainingCorpus(conf["source"].as<string>(), &corpus);
+  const string metric_name = conf["mt_metric"].as<string>();
+  ScoreType type = ScoreTypeFromString(metric_name);
+  invert_score = (type == TER);  // scores are negated for TER only (presumably because lower TER is better)
+  DocScorer ds(type, conf["reference"].as<vector<string> >(), "");
+  cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl;
+  if (ds.size() != corpus.size()) {
+    cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
+    return 1;
+  }
+
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+
+  // load initial weights
+  vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+  SparseVector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights);
+  Weights::InitSparseVector(dense_weights, &lambdas);
+
+  const double max_step_size = conf["max_step_size"].as<double>();
+  const double mt_metric_scale = conf["mt_metric_scale"].as<double>();
+
+  assert(corpus.size() > 0);
+  vector<GoodBadOracle> oracles(corpus.size());
+
+  TrainingObserver observer(conf["k_best_size"].as<int>(), ds, sample_forest, &oracles);
+  int cur_sent = 0;
+  int lcount = 0;
+  int normalizer = 0;
+  double tot_loss = 0;
+  int dots = 0;
+  int cur_pass = 0;
+  SparseVector<double> tot;
+  tot += lambdas;          // initial weights
+  normalizer++;            // count for initial weights
+  int max_iteration = conf["passes"].as<int>() * corpus.size();
+  string msg = "# MIRA tuned weights";
+  string msga = "# MIRA tuned weights AVERAGED";
+  vector<int> order;
+  RandomPermutation(corpus.size(), &order);
+  while (lcount <= max_iteration) {
+    lambdas.init_vector(&dense_weights);
+    if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; }
+    if (corpus.size() == cur_sent) {
+      cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n";
+      Weights::ShowLargestFeatures(dense_weights);
+      cur_sent = 0;
+      tot_loss = 0;
+      dots = 0;
+      ostringstream os;
+      os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz";
+      SparseVector<double> x = tot;
+      x /= normalizer;
+      ostringstream sa;
+      sa << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz";
+      // Dump the current weights first, then the running average under the "-avg" name.
+      Weights::WriteToFile(os.str(), dense_weights, true, &msg);
+      x.init_vector(&dense_weights);
+      Weights::WriteToFile(sa.str(), dense_weights, true, &msga);
+      lambdas.init_vector(&dense_weights);  // decoding must continue with the current, not the averaged, weights
+      ++cur_pass;
+      RandomPermutation(corpus.size(), &order);
+    }
+    if (cur_sent == 0) {
+      cerr << "PASS " << (lcount / corpus.size() + 1) << endl;
+    }
+    decoder.SetId(order[cur_sent]);
+    double sc = 1.0;
+    if (sample_forest_unit_weight_vector) {
+      sc = lambdas.l2norm();
+      if (sc > 0) {
+        for (unsigned i = 0; i < dense_weights.size(); ++i)
+          dense_weights[i] /= sc;
+      }
+    }
+    decoder.Decode(corpus[order[cur_sent]], &observer);  // update oracles
+    if (sc && sc != 1.0) {
+      for (unsigned i = 0; i < dense_weights.size(); ++i)
+        dense_weights[i] *= sc;
+    }
+    const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis();
+    const HypothesisInfo& cur_good = *oracles[order[cur_sent]].good;
+    const HypothesisInfo& cur_bad = *oracles[order[cur_sent]].bad;
+    tot_loss += cur_hyp.mt_metric;
+    if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) {
+      // loss = model-score gap (bad minus good) plus the scaled metric gap (good minus bad)
+      const double loss = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights) +
+          mt_metric_scale * (cur_good.mt_metric - cur_bad.mt_metric);
+      if (loss > 0.0) {
+        SparseVector<double> diff = cur_good.features;
+        diff -= cur_bad.features;
+        double step_size = loss / diff.l2norm_sq();
+        // the step is capped at max_step_size (option -C)
+        if (step_size > max_step_size) step_size = max_step_size;
+        lambdas += (cur_good.features * step_size);
+        lambdas -= (cur_bad.features * step_size);
+      }
+    }
+    tot += lambdas;
+    ++normalizer;
+    ++lcount;
+    ++cur_sent;
+  }
+  cerr << endl;
+  lambdas.init_vector(&dense_weights);  // include the update made in the last iteration
+  Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg);
+  tot /= normalizer;
+  tot.init_vector(dense_weights);
+  msg = "# MIRA tuned weights (averaged vector)";
+  Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg);
+  cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n";
+  return 0;
+}
+
diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
deleted file mode 100644
index 2eff07e4..00000000
--- a/training/mpi_batch_optimize.cc
+++ /dev/null
@@ -1,372 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include "config.h"
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-namespace mpi = boost::mpi;
-#endif
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sentence_metadata.h"
-#include "cllh_observer.h"
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "stringlib.h"
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input_weights,w",po::value<string>(),"Input feature weights file")
-        ("training_data,t",po::value<string>(),"Training data")
-        ("test_data,T",po::value<string>(),"(optional) test data")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
-        ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
-        ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
-	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
-        ("gaussian_prior,p","Use a Gaussian prior on the weights")
-        ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior")
-        ("means,u", po::value<string>(), "(optional) file containing the means for Gaussian prior");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) c->push_back(line);
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    acc_grad.clear();
-    acc_obj = 0;
-    total_complete = 0;
-    trg_words = 0;
-  } 
-
-  void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
-    *o = acc_obj;
-    for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
-      (*g)[it->first] = it->second.as_float();
-  }
-
-  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
-    cur_model_exp.clear();
-    cur_obj = 0;
-    state = 1;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 1);
-    state = 2;
-    const prob_t z = InsideOutside<prob_t,
-                                   EdgeProb,
-                                   SparseVector<prob_t>,
-                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
-    cur_obj = log(z);
-    cur_model_exp /= z;
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-    SparseVector<prob_t> ref_exp;
-    const prob_t ref_z = InsideOutside<prob_t,
-                                       EdgeProb,
-                                       SparseVector<prob_t>,
-                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
-    ref_exp /= ref_z;
-
-    double log_ref_z;
-#if 0
-    if (crf_uniform_empirical) {
-      log_ref_z = ref_exp.dot(feature_weights);
-    } else {
-      log_ref_z = log(ref_z);
-    }
-#else
-    log_ref_z = log(ref_z);
-#endif
-
-    // rounding errors means that <0 is too strict
-    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
-      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
-      exit(1);
-    }
-    assert(!std::isnan(log_ref_z));
-    ref_exp -= cur_model_exp;
-    acc_grad -= ref_exp;
-    acc_obj += (cur_obj - log_ref_z);
-    trg_words += smeta.GetReference().size();
-  }
-
-  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
-    if (state == 3) {
-      ++total_complete;
-    } else {
-    }
-  }
-
-  int total_complete;
-  SparseVector<prob_t> cur_model_exp;
-  SparseVector<prob_t> acc_grad;
-  double acc_obj;
-  double cur_obj;
-  unsigned trg_words;
-  int state;
-};
-
-void ReadConfig(const string& ini, vector<string>* out) {
-  ReadFile rf(ini);
-  istream& in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (!in) continue;
-    out->push_back(line);
-  }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
-  ostringstream os;
-  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
-  o->str(os.str());
-}
-
-template <typename T>
-struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> >  {
-  vector<T> operator()(const vector<int>& a, const vector<int>& b) const {
-    assert(a.size() == b.size());
-    vector<T> v(a.size());
-    transform(a.begin(), a.end(), b.begin(), v.begin(), plus<T>()); 
-    return v;
-  } 
-}; 
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf)) return 1;
-
-  // load cdec.ini and set up decoder
-  vector<string> cdec_ini;
-  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
-  istringstream ini;
-  StoreConfig(cdec_ini, &ini);
-  if (rank == 0) cerr << "Loading grammar...\n";
-  Decoder* decoder = new Decoder(&ini);
-  if (decoder->GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    return 1;
-  }
-  if (rank == 0) cerr << "Done loading grammar!\n";
-
-  // load initial weights
-  if (rank == 0) { cerr << "Loading weights...\n"; }
-  vector<weight_t>& lambdas = decoder->CurrentWeightVector();
-  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
-  if (rank == 0) { cerr << "Done loading weights.\n"; }
-
-  // freeze feature set (should be optional?)
-  const bool freeze_feature_set = true;
-  if (freeze_feature_set) FD::Freeze();
-
-  const int num_feats = FD::NumFeats();
-  if (rank == 0) cerr << "Number of features: " << num_feats << endl;
-  lambdas.resize(num_feats);
-
-  const bool gaussian_prior = conf.count("gaussian_prior");
-  vector<weight_t> means(num_feats, 0);
-  if (conf.count("means")) {
-    if (!gaussian_prior) {
-      cerr << "Don't use --means without --gaussian_prior!\n";
-      exit(1);
-    }
-    Weights::InitFromFile(conf["means"].as<string>(), &means);
-  }
-  boost::shared_ptr<BatchOptimizer> o;
-  if (rank == 0) {
-    const string omethod = conf["optimization_method"].as<string>();
-    if (omethod == "rprop")
-      o.reset(new RPropOptimizer(num_feats));  // TODO add configuration
-    else
-      o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>()));
-    cerr << "Optimizer: " << o->Name() << endl;
-  }
-  double objective = 0;
-  vector<double> gradient(num_feats, 0.0);
-  vector<double> rcv_grad;
-  rcv_grad.clear();
-  bool converged = false;
-
-  vector<string> corpus, test_corpus;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
-  assert(corpus.size() > 0);
-  if (conf.count("test_data"))
-    ReadTrainingCorpus(conf["test_data"].as<string>(), rank, size, &test_corpus);
-
-  TrainingObserver observer;
-  ConditionalLikelihoodObserver cllh_observer;
-  while (!converged) {
-    observer.Reset();
-    cllh_observer.Reset();
-#ifdef HAVE_MPI
-    mpi::timer timer;
-    world.barrier();
-#endif
-    if (rank == 0) {
-      cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
-      cerr << "  Testset size: " << test_corpus.size() << " sentences / proc)\n";
-    }
-    for (int i = 0; i < corpus.size(); ++i)
-      decoder->Decode(corpus[i], &observer);
-    cerr << "  process " << rank << '/' << size << " done\n";
-    fill(gradient.begin(), gradient.end(), 0);
-    observer.SetLocalGradientAndObjective(&gradient, &objective);
-
-    unsigned total_words = 0;
-#ifdef HAVE_MPI
-    double to = 0;
-    rcv_grad.resize(num_feats, 0.0);
-    mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);
-    swap(gradient, rcv_grad);
-    rcv_grad.clear();
-
-    reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
-    mpi::reduce(world, objective, to, plus<double>(), 0);
-    objective = to;
-#else
-    total_words = observer.trg_words;
-#endif
-    if (rank == 0)
-      cerr << "TRAINING CORPUS: ln p(f|e)=" << objective << "\t log_2 p(f|e) = " << (objective/log(2)) << "\t cond. entropy = " << (objective/log(2) / total_words) << "\t ppl = " << pow(2, (objective/log(2) / total_words)) << endl;
-
-    for (int i = 0; i < test_corpus.size(); ++i)
-      decoder->Decode(test_corpus[i], &cllh_observer);
-
-    double test_objective = 0;
-    unsigned test_total_words = 0;
-#ifdef HAVE_MPI
-    reduce(world, cllh_observer.acc_obj, test_objective, std::plus<double>(), 0);
-    reduce(world, cllh_observer.trg_words, test_total_words, std::plus<unsigned>(), 0);
-#else
-    test_objective = cllh_observer.acc_obj;
-    test_total_words = cllh_observer.trg_words;
-#endif
-
-    if (rank == 0) {  // run optimizer only on rank=0 node
-      if (test_corpus.size())
-        cerr << "    TEST CORPUS: ln p(f|e)=" << test_objective << "\t log_2 p(f|e) = " << (test_objective/log(2)) << "\t cond. entropy = " << (test_objective/log(2) / test_total_words) << "\t ppl = " << pow(2, (test_objective/log(2) / test_total_words)) << endl;
-      if (gaussian_prior) {
-        const double sigsq = conf["sigma_squared"].as<double>();
-        double norm = 0;
-        for (int k = 1; k < lambdas.size(); ++k) {
-          const double& lambda_k = lambdas[k];
-          if (lambda_k) {
-            const double param = (lambda_k - means[k]);
-            norm += param * param;
-            gradient[k] += param / sigsq;
-          }
-        }
-        const double reg = norm / (2.0 * sigsq);
-        cerr << "REGULARIZATION TERM: " << reg << endl;
-        objective += reg;
-      }
-      cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl;
-      double gnorm = 0;
-      for (int i = 0; i < gradient.size(); ++i)
-        gnorm += gradient[i] * gradient[i];
-      cerr << "  GNORM=" << sqrt(gnorm) << endl;
-      vector<weight_t> old = lambdas;
-      int c = 0;
-      while (old == lambdas) {
-        ++c;
-        if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; }
-        o->Optimize(objective, gradient, &lambdas);
-        assert(c < 5);
-      }
-      old.clear();
-      Weights::SanityCheck(lambdas);
-      Weights::ShowLargestFeatures(lambdas);
-
-      converged = o->HasConverged();
-      if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-
-      string fname = "weights.cur.gz";
-      if (converged) { fname = "weights.final.gz"; }
-      ostringstream vv;
-      vv << "Objective = " << objective << "  (eval count=" << o->EvaluationCount() << ")";
-      const string svv = vv.str();
-      Weights::WriteToFile(fname, lambdas, true, &svv);
-    }  // rank == 0
-    int cint = converged;
-#ifdef HAVE_MPI
-    mpi::broadcast(world, &lambdas[0], lambdas.size(), 0);
-    mpi::broadcast(world, cint, 0);
-    if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
-#endif
-    converged = cint;
-  }
-  return 0;
-}
-
diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc
deleted file mode 100644
index 066389d0..00000000
--- a/training/mpi_compute_cllh.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include "config.h"
-#ifdef HAVE_MPI
-#include <boost/mpi.hpp>
-#endif
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "cllh_observer.h"
-#include "sentence_metadata.h"
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "weights.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("weights,w",po::value<string>(),"Input feature weights file")
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadInstances(const string& fname, int rank, int size, vector<string>* c) {
-  assert(fname != "-");
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) c->push_back(line);
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-#ifdef HAVE_MPI
-namespace mpi = boost::mpi;
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return false;
-
-  // load cdec.ini and set up decoder
-  ReadFile ini_rf(conf["decoder_config"].as<string>());
-  Decoder decoder(ini_rf.stream());
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    abort();
-  }
-
-  // load weights
-  vector<weight_t>& weights = decoder.CurrentWeightVector();
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &weights);
-
-  vector<string> corpus;
-  ReadInstances(conf["training_data"].as<string>(), rank, size, &corpus);
-  assert(corpus.size() > 0);
-
-  if (rank == 0)
-    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
-
-  ConditionalLikelihoodObserver observer;
-  for (int i = 0; i < corpus.size(); ++i)
-    decoder.Decode(corpus[i], &observer);
-
-  double objective = 0;
-  unsigned total_words = 0;
-#ifdef HAVE_MPI
-  reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
-  reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
-#else
-  objective = observer.acc_obj;
-#endif
-
-  if (rank == 0) {
-    cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl;
-    cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl;
-    cout << "         CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl;
-    cout << "                  PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl;
-  }
-
-  return 0;
-}
-
diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc
deleted file mode 100644
index 48683b15..00000000
--- a/training/mpi_em_optimize.cc
+++ /dev/null
@@ -1,389 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#ifdef HAVE_MPI
-#include <mpi.h>
-#endif
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!isnan(w[i]));
-    assert(!isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input_weights,w",po::value<string>(),"Input feature weights file")
-        ("training_data,t",po::value<string>(),"Training data")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
-        ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
-    cerr << dcmdline_options << endl;
-#ifdef HAVE_MPI
-    MPI::Finalize();
-#endif
-    exit(1);
-  }
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) c->push_back(line);
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    total_complete = 0;
-    cur_obj = 0;
-    tot_obj = 0;
-    tot.clear();
-  } 
-
-  void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) const {
-    *o = tot_obj;
-    *g = tot;
-  }
-
-  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
-    cur_obj = 0;
-    state = 1;
-  }
-
-  void ExtractExpectedCounts(Hypergraph* hg) {
-    vector<prob_t> posts;
-    cur.clear();
-    const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts);
-    cur_obj = log(z);
-    for (int i = 0; i < posts.size(); ++i) {
-      const SparseVector<double>& efeats = hg->edges_[i].feature_values_;
-      const double post = static_cast<double>(posts[i] / z);
-      for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j)
-        cur.add_value(j->first, post);
-    }
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 1);
-    state = 2;
-    ExtractExpectedCounts(hg);
-  }
-
-  // replace translation forest, since we're doing EM training (we don't know which)
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-    ExtractExpectedCounts(hg);
-  }
-
-  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
-    ++total_complete;
-    tot_obj += cur_obj;
-    tot += cur;
-  }
-
-  int total_complete;
-  double cur_obj;
-  double tot_obj;
-  SparseVector<double> cur, tot;
-  int state;
-};
-
-void ReadConfig(const string& ini, vector<string>* out) {
-  ReadFile rf(ini);
-  istream& in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (!in) continue;
-    out->push_back(line);
-  }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
-  ostringstream os;
-  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
-  o->str(os.str());
-}
-
-struct OptimizableMultinomialFamily {
-  struct CPD {
-    CPD() : z() {}
-    double z;
-    map<WordID, double> c2counts;
-  };
-  map<WordID, CPD> counts;
-  double Value(WordID conditioning, WordID generated) const {
-    map<WordID, CPD>::const_iterator it = counts.find(conditioning);
-    assert(it != counts.end());
-    map<WordID,double>::const_iterator r = it->second.c2counts.find(generated);
-    if (r == it->second.c2counts.end()) return 0;
-    return r->second;
-  }
-  void Increment(WordID conditioning, WordID generated, double count) {
-    CPD& cc = counts[conditioning];
-    cc.z += count;
-    cc.c2counts[generated] += count;
-  }
-  void Optimize() {
-    for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) {
-      CPD& cpd = i->second;
-      for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) {
-        j->second /= cpd.z;
-        // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) =  " << j->second << endl;
-      }
-    }
-  }
-  void Clear() {
-    counts.clear();
-  }
-};
-
-struct CountManager {
-  CountManager(size_t num_types) : oms_(num_types) {}
-  virtual ~CountManager();
-  virtual void AddCounts(const SparseVector<double>& c) = 0;
-  void Optimize(SparseVector<double>* weights) {
-    for (int i = 0; i < oms_.size(); ++i) {
-      oms_[i].Optimize();
-    }
-    GetOptimalValues(weights);
-    for (int i = 0; i < oms_.size(); ++i) {
-      oms_[i].Clear();
-    }
-  }
-  virtual void GetOptimalValues(SparseVector<double>* wv) const = 0;
-  vector<OptimizableMultinomialFamily> oms_;
-};
-CountManager::~CountManager() {}
-
-struct TaggerCountManager : public CountManager {
-  // 0 = transitions, 2 = emissions
-  TaggerCountManager() : CountManager(2) {}
-  void AddCounts(const SparseVector<double>& c);
-  void GetOptimalValues(SparseVector<double>* wv) const {
-    for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) {
-      int ftype;
-      WordID cond, gen;
-      bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen);
-      assert(is_optimized);
-      wv->set_value(*it, log(oms_[ftype].Value(cond, gen)));
-    }
-  }
-  // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1
-  static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) {
-    const string& feat = FD::Convert(fid);
-    if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') {
-      // emission
-      const size_t p = feat.rfind(':');
-      assert(p != string::npos);
-      *cond = TD::Convert(feat.substr(p+1));
-      *gen = TD::Convert(feat.substr(3, p - 3));
-      *feature_type = 1;
-      return true;
-    } else if (feat[0] == 'B' && feat.size() > 5 && feat[2] == ':' && feat[1] == 'i') {
-      // transition
-      const size_t p = feat.rfind('_');
-      assert(p != string::npos);
-      *gen = TD::Convert(feat.substr(p+1));
-      *cond = TD::Convert(feat.substr(3, p - 3));
-      *feature_type = 0;
-      return true;
-    } else if (feat[0] == 'U' && feat.size() > 4 && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') {
-      // ignore
-      return false;
-    } else {
-      cerr << "Don't know how to deal with feature of type: " << feat << endl;
-      abort();
-    }
-  }
-  set<int> fids_;
-};
-
-void TaggerCountManager::AddCounts(const SparseVector<double>& c) {
-  for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) {
-    const double& val = it->second;
-    int ftype;
-    WordID cond, gen;
-    if (GetFeature(it->first, &ftype, &cond, &gen)) {
-      oms_[ftype].Increment(cond, gen, val);
-      fids_.insert(it->first);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  MPI::Init(argc, argv);
-  const int size = MPI::COMM_WORLD.Get_size(); 
-  const int rank = MPI::COMM_WORLD.Get_rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  TaggerCountManager tcm;
-
-  // load cdec.ini and set up decoder
-  vector<string> cdec_ini;
-  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
-  istringstream ini;
-  StoreConfig(cdec_ini, &ini);
-  if (rank == 0) cerr << "Loading grammar...\n";
-  Decoder* decoder = new Decoder(&ini);
-  if (decoder->GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-#ifdef HAVE_MPI
-    MPI::COMM_WORLD.Abort(1);
-#endif
-  }
-  if (rank == 0) cerr << "Done loading grammar!\n";
-  Weights w;
-  if (conf.count("input_weights"))
-    w.InitFromFile(conf["input_weights"].as<string>());
-
-  double objective = 0;
-  bool converged = false;
-
-  vector<double> lambdas;
-  w.InitVector(&lambdas);
-  vector<string> corpus;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
-  assert(corpus.size() > 0);
-
-  int iteration = 0;
-  TrainingObserver observer;
-  while (!converged) {
-    ++iteration;
-    observer.Reset();
-    if (rank == 0) {
-      cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
-    }
-    decoder->SetWeights(lambdas);
-    for (int i = 0; i < corpus.size(); ++i)
-      decoder->Decode(corpus[i], &observer);
-
-    SparseVector<double> x;
-    observer.SetLocalGradientAndObjective(&x, &objective);
-    cerr << "COUNTS = " << x << endl;
-    cerr << "   OBJ = " << objective << endl;
-    tcm.AddCounts(x);
-
-#if 0
-#ifdef HAVE_MPI
-    MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0);
-    MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0);
-    swap(gradient, rcv_grad);
-    objective = to;
-#endif
-#endif
-
-    if (rank == 0) {
-      SparseVector<double> wsv;
-      tcm.Optimize(&wsv);
-
-      w.InitFromVector(wsv);
-      w.InitVector(&lambdas);
-
-      ShowLargestFeatures(lambdas);
-
-      converged = iteration > 100;
-      if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-
-      string fname = "weights.cur.gz";
-      if (converged) { fname = "weights.final.gz"; }
-      ostringstream vv;
-      vv << "Objective = " << objective << "  (ITERATION=" << iteration << ")";
-      const string svv = vv.str();
-      w.WriteToFile(fname, true, &svv);
-    }  // rank == 0
-    int cint = converged;
-#ifdef HAVE_MPI
-    MPI::COMM_WORLD.Bcast(const_cast<double*>(&lambdas.data()[0]), num_feats, MPI::DOUBLE, 0);
-    MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0);
-    MPI::COMM_WORLD.Barrier();
-#endif
-    converged = cint;
-  }
-#ifdef HAVE_MPI
-  MPI::Finalize(); 
-#endif
-  return 0;
-}
diff --git a/training/mpi_extract_features.cc b/training/mpi_extract_features.cc
deleted file mode 100644
index 6750aa15..00000000
--- a/training/mpi_extract_features.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <cassert>
-
-#include "config.h"
-#ifdef HAVE_MPI
-#include <boost/mpi.hpp>
-#endif
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "ff_register.h"
-#include "verbose.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "decoder.h"
-#include "weights.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
-        ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
-        ("output_prefix,o",po::value<string>()->default_value("features"),"Output path prefix");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
-    cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the feature strings encountered.\n";
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) c->push_back(line);
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-
-  virtual void NotifyDecodingStart(const SentenceMetadata&) {
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-  }
-};
-
-#ifdef HAVE_MPI
-namespace mpi = boost::mpi;
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return false;
-
-  // load cdec.ini and set up decoder
-  ReadFile ini_rf(conf["decoder_config"].as<string>());
-  Decoder decoder(ini_rf.stream());
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    abort();
-  }
-
-  if (FD::UsingPerfectHashFunction()) {
-    cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
-    return 1;
-  }
-
-  // load optional weights
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
-
-  vector<string> corpus;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
-  assert(corpus.size() > 0);
-
-  TrainingObserver observer;
-
-  if (rank == 0)
-    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
-
-  for (int i = 0; i < corpus.size(); ++i)
-    decoder.Decode(corpus[i], &observer);
-
-  {
-    ostringstream os;
-    os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
-    WriteFile wf(os.str());
-    ostream& out = *wf.stream();
-    const unsigned num_feats = FD::NumFeats();
-    for (unsigned i = 1; i < num_feats; ++i) {
-      out << FD::Convert(i) << endl;
-    }
-    cerr << "Wrote " << os.str() << endl;
-  }
-
-#ifdef HAVE_MPI
-  world.barrier();
-#else
-#endif
-
-  return 0;
-}
-
diff --git a/training/mpi_extract_reachable.cc b/training/mpi_extract_reachable.cc
deleted file mode 100644
index 2a7c2b9d..00000000
--- a/training/mpi_extract_reachable.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <cassert>
-
-#include "config.h"
-#ifdef HAVE_MPI
-#include <boost/mpi.hpp>
-#endif
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "ff_register.h"
-#include "verbose.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "decoder.h"
-#include "weights.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("decoder_config,c",po::value<string>(),"Decoder configuration file")
-        ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
-        ("output_prefix,o",po::value<string>()->default_value("reachable"),"Output path prefix");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
-    cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the inputs that produce reachable parallel parses.\n";
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int lc = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (lc % size == rank) c->push_back(line);
-    ++lc;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct ReachabilityObserver : public DecoderObserver {
-
-  virtual void NotifyDecodingStart(const SentenceMetadata&) {
-    reachable = false;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    reachable = true;
-  }
-
-  bool reachable;
-};
-
-#ifdef HAVE_MPI
-namespace mpi = boost::mpi;
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return false;
-
-  // load cdec.ini and set up decoder
-  ReadFile ini_rf(conf["decoder_config"].as<string>());
-  Decoder decoder(ini_rf.stream());
-  if (decoder.GetConf()["input"].as<string>() != "-") {
-    cerr << "cdec.ini must not set an input file\n";
-    abort();
-  }
-
-  if (FD::UsingPerfectHashFunction()) {
-    cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
-    return 1;
-  }
-
-  // load optional weights
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
-
-  vector<string> corpus;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
-  assert(corpus.size() > 0);
-
-
-  if (rank == 0)
-    cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
-
-  size_t num_reached = 0;
-  {
-    ostringstream os;
-    os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
-    WriteFile wf(os.str());
-    ostream& out = *wf.stream();
-    ReachabilityObserver observer;
-    for (int i = 0; i < corpus.size(); ++i) {
-      decoder.Decode(corpus[i], &observer);
-      if (observer.reachable) {
-         out << corpus[i] << endl;
-         ++num_reached;
-      }
-      corpus[i].clear();
-    }
-    cerr << "Shard " << rank << '/' << size << " finished, wrote "
-         << num_reached << " instances to " << os.str() << endl;
-  }
-
-  size_t total = 0;
-#ifdef HAVE_MPI
-  reduce(world, num_reached, total, std::plus<double>(), 0);
-#else
-  total = num_reached;
-#endif
-  if (rank == 0) {
-    cerr << "-----------------------------------------\n";
-    cerr << "TOTAL = " << total << " instances\n";
-  }
-  return 0;
-}
-
diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc
deleted file mode 100644
index b52decdc..00000000
--- a/training/mpi_flex_optimize.cc
+++ /dev/null
@@ -1,386 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "stringlib.h"
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-namespace mpi = boost::mpi;
-#endif
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("cdec_config,c",po::value<string>(),"Decoder configuration file")
-        ("weights,w",po::value<string>(),"Initial feature weights")
-        ("training_data,d",po::value<string>(),"Training data")
-        ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(6), "Number of training instances evaluated per processor in each minibatch")
-        ("minibatch_iterations,i", po::value<unsigned>()->default_value(10), "Number of optimization iterations per minibatch")
-        ("iterations,I", po::value<unsigned>()->default_value(50), "Number of passes through the training data before termination")
-        ("regularization_strength,C", po::value<double>()->default_value(0.2), "Regularization strength")
-        ("time_series_strength,T", po::value<double>()->default_value(0.0), "Time series regularization strength")
-        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
-        ("lbfgs_memory_buffers,M", po::value<unsigned>()->default_value(10), "Number of memory buffers for LBFGS history");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("cdec_config")) {
-    cerr << "LBFGS minibatch online optimizer (MPI support "
-#if HAVE_MPI
-         << "enabled"
-#else
-         << "not enabled"
-#endif
-         << ")\n" << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int id = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (id % size == rank) {
-      c->push_back(line);
-      order->push_back(id);
-    }
-    ++id;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct CopyHGsObserver : public DecoderObserver {
-  Hypergraph* hg_;
-  Hypergraph* gold_hg_;
-
-  // this can free up some memory
-  void RemoveRules(Hypergraph* h) {
-    for (unsigned i = 0; i < h->edges_.size(); ++i)
-      h->edges_[i].rule_.reset();
-  }
-
-  void SetCurrentHypergraphs(Hypergraph* h, Hypergraph* gold_h) {
-    hg_ = h;
-    gold_hg_ = gold_h;
-  }
-
-  virtual void NotifyDecodingStart(const SentenceMetadata&) {
-    state = 1;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
-    *hg_ = *hg;
-    RemoveRules(hg_);
-    assert(state == 1);
-    state = 2;
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-    *gold_hg_ = *hg;
-    RemoveRules(gold_hg_);
-  }
-
-  virtual void NotifyDecodingComplete(const SentenceMetadata&) {
-    if (state == 3) {
-    } else {
-      hg_->clear();
-      gold_hg_->clear();
-    }
-  }
-
-  int state;
-};
-
-void ReadConfig(const string& ini, istringstream* out) {
-  ReadFile rf(ini);
-  istream& in = *rf.stream();
-  ostringstream os;
-  while(in) {
-    string line;
-    getline(in, line);
-    if (!in) continue;
-    os << line << endl;
-  }
-  out->str(os.str());
-}
-
-#ifdef HAVE_MPI
-namespace boost { namespace mpi {
-  template<>
-  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > 
-    : mpl::true_ { };
-} } // end namespace boost::mpi
-#endif
-
-void AddGrad(const SparseVector<prob_t> x, double s, SparseVector<double>* acc) {
-  for (SparseVector<prob_t>::const_iterator it = x.begin(); it != x.end(); ++it)
-    acc->add_value(it->first, it->second.as_float() * s);
-}
-
-double PNorm(const vector<double>& v, const double p) {
-  double acc = 0;
-  for (int i = 0; i < v.size(); ++i)
-    acc += pow(v[i], p);
-  return pow(acc, 1.0 / p);
-}
-
-void VV(ostream&os, const vector<double>& v) {
-  for (int i = 1; i < v.size(); ++i)
-    if (v[i]) os << FD::Convert(i) << "=" << v[i] << " ";
-}
-
-double ApplyRegularizationTerms(const double C,
-                                const double T,
-                                const vector<double>& weights,
-                                const vector<double>& prev_weights,
-                                double* g) {
-  double reg = 0;
-  for (size_t i = 0; i < weights.size(); ++i) {
-    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
-    const double& w_i = weights[i];
-    reg += C * w_i * w_i;
-    g[i] += 2 * C * w_i;
-
-    reg += T * (w_i - prev_w_i) * (w_i - prev_w_i);
-    g[i] += 2 * T * (w_i - prev_w_i);
-  }
-  return reg;
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-  MT19937* rng = NULL;
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return 1;
-
-  boost::shared_ptr<BatchOptimizer> o;
-  const unsigned lbfgs_memory_buffers = conf["lbfgs_memory_buffers"].as<unsigned>();
-  const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
-  const unsigned minibatch_iterations = conf["minibatch_iterations"].as<unsigned>();
-  const double regularization_strength = conf["regularization_strength"].as<double>();
-  const double time_series_strength = conf["time_series_strength"].as<double>();
-  const bool use_time_series_reg = time_series_strength > 0.0;
-  const unsigned max_iteration = conf["iterations"].as<unsigned>();
-
-  vector<string> corpus;
-  vector<int> ids;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
-  assert(corpus.size() > 0);
-
-  if (size_per_proc > corpus.size()) {
-    cerr << "Minibatch size (per processor) must be smaller or equal to the local corpus size!\n";
-    return 1;
-  }
-
-  // initialize decoder (loads hash functions if necessary)
-  istringstream ins;
-  ReadConfig(conf["cdec_config"].as<string>(), &ins);
-  Decoder decoder(&ins);
-
-  // load initial weights
-  vector<weight_t> prev_weights;
-  if (conf.count("weights"))
-    Weights::InitFromFile(conf["weights"].as<string>(), &prev_weights);
-
-  if (conf.count("random_seed"))
-    rng = new MT19937(conf["random_seed"].as<uint32_t>());
-  else
-    rng = new MT19937;
-
-  size_t total_corpus_size = 0;
-#ifdef HAVE_MPI
-  reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
-#else
-  total_corpus_size = corpus.size();
-#endif
-
-  if (rank == 0)
-    cerr << "Total corpus size: " << total_corpus_size << endl;
-
-  CopyHGsObserver observer;
-
-  int write_weights_every_ith = 100; // TODO configure
-  int titer = -1;
-
-  vector<weight_t>& cur_weights = decoder.CurrentWeightVector();
-  if (use_time_series_reg) {
-    cur_weights = prev_weights;
-  } else {
-    cur_weights.swap(prev_weights);
-    prev_weights.clear();
-  }
-
-  int iter = -1;
-  bool converged = false;
-  vector<double> gg;
-  while (!converged) {
-#ifdef HAVE_MPI
-    mpi::timer timer;
-#endif
-    ++iter; ++titer;
-    if (rank == 0) {
-      converged = (iter == max_iteration);
-        string fname = "weights.cur.gz";
-        if (iter % write_weights_every_ith == 0) {
-          ostringstream o; o << "weights.epoch_" << iter << ".gz";
-          fname = o.str();
-        }
-        if (converged) { fname = "weights.final.gz"; }
-        ostringstream vv;
-        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size()));
-        const string svv = vv.str();
-        Weights::WriteToFile(fname, cur_weights, true, &svv);
-      }
-
-      vector<Hypergraph> hgs(size_per_proc);
-      vector<Hypergraph> gold_hgs(size_per_proc);
-      for (int i = 0; i < size_per_proc; ++i) {
-        int ei = corpus.size() * rng->next();
-        int id = ids[ei];
-        observer.SetCurrentHypergraphs(&hgs[i], &gold_hgs[i]);
-        decoder.SetId(id);
-        decoder.Decode(corpus[ei], &observer);
-      }
-
-      SparseVector<double> local_grad, g;
-      double local_obj = 0;
-      o.reset();
-      for (unsigned mi = 0; mi < minibatch_iterations; ++mi) {
-        local_grad.clear();
-        g.clear();
-        local_obj = 0;
-
-        for (unsigned i = 0; i < size_per_proc; ++i) {
-          Hypergraph& hg = hgs[i];
-          Hypergraph& hg_gold = gold_hgs[i];
-          if (hg.edges_.size() < 2) continue;
-
-          hg.Reweight(cur_weights);
-          hg_gold.Reweight(cur_weights);
-          SparseVector<prob_t> model_exp, gold_exp;
-          const prob_t z = InsideOutside<prob_t,
-                                         EdgeProb,
-                                         SparseVector<prob_t>,
-                                         EdgeFeaturesAndProbWeightFunction>(hg, &model_exp);
-          local_obj += log(z);
-          model_exp /= z;
-          AddGrad(model_exp, 1.0, &local_grad);
-          model_exp.clear();
-
-          const prob_t goldz = InsideOutside<prob_t,
-                                         EdgeProb,
-                                         SparseVector<prob_t>,
-                                         EdgeFeaturesAndProbWeightFunction>(hg_gold, &gold_exp);
-          local_obj -= log(goldz);
-
-          if (log(z) - log(goldz) < kMINUS_EPSILON) {
-            cerr << "DIFF. ERR! log_model_z < log_gold_z: " << log(z) << " " << log(goldz) << endl;
-            return 1;
-          }
-
-          gold_exp /= goldz;
-          AddGrad(gold_exp, -1.0, &local_grad);
-        }
-
-        double obj = 0;
-#ifdef HAVE_MPI
-        reduce(world, local_obj, obj, std::plus<double>(), 0);
-        reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
-#else
-        obj = local_obj;
-        g.swap(local_grad);
-#endif
-        local_grad.clear();
-        if (rank == 0) {
-          // g /= (size_per_proc * size);
-          if (!o)
-            o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
-          gg.clear();
-          gg.resize(FD::NumFeats());
-          if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); }
-          for (SparseVector<double>::iterator it = g.begin(); it != g.end(); ++it)
-            if (it->first) { gg[it->first] = it->second; }
-          g.clear();
-          double r = ApplyRegularizationTerms(regularization_strength,
-                                time_series_strength, // * (iter == 0 ? 0.0 : 1.0),
-                                cur_weights,
-                                prev_weights,
-                                &gg[0]);
-          obj += r;
-          if (mi == 0 || mi == (minibatch_iterations - 1)) {
-            if (!mi) cerr << iter << ' '; else cerr << ' ';
-            cerr << "OBJ=" << obj << " (REG=" << r << ")" << " |g|=" << PNorm(gg, 2) << " |w|=" << PNorm(cur_weights, 2); 
-            if (mi > 0) cerr << endl << flush; else cerr << ' ';
-          } else { cerr << '.' << flush; }
-          // cerr << "w = "; VV(cerr, cur_weights); cerr << endl;
-          // cerr << "g = "; VV(cerr, gg); cerr << endl;
-          o->Optimize(obj, gg, &cur_weights);
-        }
-#ifdef HAVE_MPI
-        broadcast(world, cur_weights, 0);
-        broadcast(world, converged, 0);
-        world.barrier();
-#endif
-    }
-    prev_weights = cur_weights;
-  }
-  return 0;
-}
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
deleted file mode 100644
index d6968848..00000000
--- a/training/mpi_online_optimize.cc
+++ /dev/null
@@ -1,374 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-#include <tr1/memory>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "stringlib.h"
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "online_optimizer.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-namespace mpi = boost::mpi;
-#endif
-
-using namespace std;
-namespace po = boost::program_options;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input_weights,w",po::value<string>(),"Input feature weights file")
-        ("frozen_features,z",po::value<string>(), "List of features not to optimize")
-        ("training_data,t",po::value<string>(),"Training data corpus")
-        ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
-        ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
-        ("optimization_method,m", po::value<string>()->default_value("sgd"), "Optimization method (sgd)")
-        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
-        ("eta_0,e", po::value<double>()->default_value(0.2), "Initial learning rate for SGD (eta_0)")
-        ("L1,1","Use L1 regularization")
-        ("regularization_strength,C", po::value<double>()->default_value(1.0), "Regularization strength (C)");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("training_data") || !conf->count("training_agenda")) {
-    cerr << dcmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
-  ReadFile rf(fname);
-  istream& in = *rf.stream();
-  string line;
-  int id = 0;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (id % size == rank) {
-      c->push_back(line);
-      order->push_back(id);
-    }
-    ++id;
-  }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
-  void Reset() {
-    acc_grad.clear();
-    acc_obj = 0;
-    total_complete = 0;
-  } 
-
-  void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
-    *o = acc_obj;
-    for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
-      (*g)[it->first] = it->second.as_float();
-  }
-
-  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
-    cur_model_exp.clear();
-    cur_obj = 0;
-    state = 1;
-  }
-
-  // compute model expectations, denominator of objective
-  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 1);
-    state = 2;
-    const prob_t z = InsideOutside<prob_t,
-                                   EdgeProb,
-                                   SparseVector<prob_t>,
-                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
-    cur_obj = log(z);
-    cur_model_exp /= z;
-  }
-
-  // compute "empirical" expectations, numerator of objective
-  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
-    assert(state == 2);
-    state = 3;
-    SparseVector<prob_t> ref_exp;
-    const prob_t ref_z = InsideOutside<prob_t,
-                                       EdgeProb,
-                                       SparseVector<prob_t>,
-                                       EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp);
-    ref_exp /= ref_z;
-
-    double log_ref_z;
-#if 0
-    if (crf_uniform_empirical) {
-      log_ref_z = ref_exp.dot(feature_weights);
-    } else {
-      log_ref_z = log(ref_z);
-    }
-#else
-    log_ref_z = log(ref_z);
-#endif
-
-    // rounding errors means that <0 is too strict
-    if ((cur_obj - log_ref_z) < kMINUS_EPSILON) {
-      cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
-      exit(1);
-    }
-    assert(!std::isnan(log_ref_z));
-    ref_exp -= cur_model_exp;
-    acc_grad += ref_exp;
-    acc_obj += (cur_obj - log_ref_z);
-  }
-
-  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
-    if (state == 3) {
-      ++total_complete;
-    } else {
-    }
-  }
-
-  void GetGradient(SparseVector<double>* g) const {
-    g->clear();
-    for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
-      g->set_value(it->first, it->second.as_float());
-  }
-
-  int total_complete;
-  SparseVector<prob_t> cur_model_exp;
-  SparseVector<prob_t> acc_grad;
-  double acc_obj;
-  double cur_obj;
-  int state;
-};
-
-#ifdef HAVE_MPI
-namespace boost { namespace mpi {
-  template<>
-  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > 
-    : mpl::true_ { };
-} } // end namespace boost::mpi
-#endif
-
-bool LoadAgenda(const string& file, vector<pair<string, int> >* a) {
-  ReadFile rf(file);
-  istream& in = *rf.stream();
-  string line;
-  while(in) {
-    getline(in, line);
-    if (!in) break;
-    if (line.empty()) continue;
-    if (line[0] == '#') continue;
-    int sc = 0;
-    if (line.size() < 3) return false;
-    for (int i = 0; i < line.size(); ++i) { if (line[i] == ' ') ++sc; }
-    if (sc != 1) { cerr << "Too many spaces in line: " << line << endl; return false; }
-    size_t d = line.find(" ");
-    pair<string, int> x;
-    x.first = line.substr(0,d);
-    x.second = atoi(line.substr(d+1).c_str());
-    a->push_back(x);
-    if (!FileExists(x.first)) {
-      cerr << "Can't find file " << x.first << endl;
-      return false;
-    }
-  }
-  return true;
-}
-
-int main(int argc, char** argv) {
-  cerr << "THIS SOFTWARE IS DEPRECATED YOU SHOULD USE mpi_flex_optimize\n";
-#ifdef HAVE_MPI
-  mpi::environment env(argc, argv);
-  mpi::communicator world;
-  const int size = world.size(); 
-  const int rank = world.rank();
-#else
-  const int size = 1;
-  const int rank = 0;
-#endif
-  if (size > 1) SetSilent(true);  // turn off verbose decoder output
-  register_feature_functions();
-  std::tr1::shared_ptr<MT19937> rng;
-
-  po::variables_map conf;
-  if (!InitCommandLine(argc, argv, &conf))
-    return 1;
-
-  vector<pair<string, int> > agenda;
-  if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
-    return 1;
-  if (rank == 0)
-    cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
-
-  assert(agenda.size() > 0);
-
-  if (1) {  // hack to load the feature hash functions -- TODO this should not be in cdec.ini
-    const string& cur_config = agenda[0].first;
-    const unsigned max_iteration = agenda[0].second;
-    ReadFile ini_rf(cur_config);
-    Decoder decoder(ini_rf.stream());
-  }
-
-  // load initial weights
-  vector<weight_t> init_weights;
-  if (conf.count("input_weights"))
-    Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
-
-  vector<int> frozen_fids;
-  if (conf.count("frozen_features")) {
-    ReadFile rf(conf["frozen_features"].as<string>());
-    istream& in = *rf.stream();
-    string line;
-    while(in) {
-      getline(in, line);
-      if (line.empty()) continue;
-      if (line[0] == ' ' || line[line.size() - 1] == ' ') { line = Trim(line); }
-      frozen_fids.push_back(FD::Convert(line));
-    }
-    if (rank == 0) cerr << "Freezing " << frozen_fids.size() << " features.\n";
-  }
-
-  vector<string> corpus;
-  vector<int> ids;
-  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
-  assert(corpus.size() > 0);
-
-  std::tr1::shared_ptr<OnlineOptimizer> o;
-  std::tr1::shared_ptr<LearningRateSchedule> lr;
-
-  const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
-  if (size_per_proc > corpus.size()) {
-    cerr << "Minibatch size must be smaller than corpus size!\n";
-    return 1;
-  }
-
-  size_t total_corpus_size = 0;
-#ifdef HAVE_MPI
-  reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
-#else
-  total_corpus_size = corpus.size();
-#endif
-
-  if (rank == 0) {
-    cerr << "Total corpus size: " << total_corpus_size << endl;
-    const unsigned batch_size = size_per_proc * size;
-    // TODO config
-    lr.reset(new ExponentialDecayLearningRate(batch_size, conf["eta_0"].as<double>()));
-
-    const string omethod = conf["optimization_method"].as<string>();
-    if (omethod == "sgd") {
-      const double C = conf["regularization_strength"].as<double>();
-      o.reset(new CumulativeL1OnlineOptimizer(lr, total_corpus_size, C, frozen_fids));
-    } else {
-      assert(!"fail");
-    }
-  }
-  if (conf.count("random_seed"))
-    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
-  else
-    rng.reset(new MT19937);
-
-  SparseVector<double> x;
-  Weights::InitSparseVector(init_weights, &x);
-  TrainingObserver observer;
-
-  int write_weights_every_ith = 100; // TODO configure
-  int titer = -1;
-
-  for (int ai = 0; ai < agenda.size(); ++ai) {
-    const string& cur_config = agenda[ai].first;
-    const unsigned max_iteration = agenda[ai].second;
-    if (rank == 0)
-      cerr << "STARTING TRAINING EPOCH " << (ai+1) << ". CONFIG=" << cur_config << endl;
-    // load cdec.ini and set up decoder
-    ReadFile ini_rf(cur_config);
-    Decoder decoder(ini_rf.stream());
-    vector<weight_t>& lambdas = decoder.CurrentWeightVector();
-    if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); }
-
-    if (rank == 0)
-      o->ResetEpoch(); // resets the learning rate-- TODO is this good?
-
-    int iter = -1;
-    bool converged = false;
-    while (!converged) {
-#ifdef HAVE_MPI
-      mpi::timer timer;
-#endif
-      x.init_vector(&lambdas);
-      ++iter; ++titer;
-      observer.Reset();
-      if (rank == 0) {
-        converged = (iter == max_iteration);
-        Weights::SanityCheck(lambdas);
-        static int cc = 0; ++cc; if (cc > 1) { Weights::ShowLargestFeatures(lambdas); }
-        string fname = "weights.cur.gz";
-        if (iter % write_weights_every_ith == 0) {
-          ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
-          fname = o.str();
-        }
-        if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
-        ostringstream vv;
-        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
-        const string svv = vv.str();
-        cerr << svv << endl;
-        Weights::WriteToFile(fname, lambdas, true, &svv);
-      }
-
-      for (int i = 0; i < size_per_proc; ++i) {
-        int ei = corpus.size() * rng->next();
-        int id = ids[ei];
-        decoder.SetId(id);
-        decoder.Decode(corpus[ei], &observer);
-      }
-      SparseVector<double> local_grad, g;
-      observer.GetGradient(&local_grad);
-#ifdef HAVE_MPI
-      reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
-#else
-      g.swap(local_grad);
-#endif
-      local_grad.clear();
-      if (rank == 0) {
-        g /= (size_per_proc * size);
-        o->UpdateWeights(g, FD::NumFeats(), &x);
-      }
-#ifdef HAVE_MPI
-      broadcast(world, x, 0);
-      broadcast(world, converged, 0);
-      world.barrier();
-      if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
-#endif
-    }
-  }
-  return 0;
-}
diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc
deleted file mode 100644
index f65b5440..00000000
--- a/training/mr_em_adapted_reduce.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "m.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)")
-        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-double NoZero(const double& x) {
-  if (x) return x;
-  return 1e-35;
-}
-
-void Maximize(const bool use_vb,
-              const double& alpha,
-              const int total_event_types,
-              SparseVector<double>* pc) {
-  const SparseVector<double>& counts = *pc;
-
-  if (use_vb)
-    assert(total_event_types >= counts.size());
-
-  double tot = 0;
-  for (SparseVector<double>::const_iterator it = counts.begin();
-       it != counts.end(); ++it)
-    tot += it->second;
-//  cerr << " = " << tot << endl;
-  assert(tot > 0.0);
-  double ltot = log(tot);
-  if (use_vb)
-    ltot = Md::digamma(tot + total_event_types * alpha);
-  for (SparseVector<double>::const_iterator it = counts.begin();
-       it != counts.end(); ++it) {
-    if (use_vb) {
-      pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot));
-    } else {
-      pc->set_value(it->first, NoZero(log(it->second) - ltot));
-    }
-  }
-#if 0
-  if (counts.size() < 50) {
-    for (SparseVector<double>::const_iterator it = counts.begin();
-         it != counts.end(); ++it) {
-      cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second);
-    }
-    cerr << endl;
-  }
-#endif
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  const bool use_b64 = conf["input_format"].as<string>() == "b64";
-  const bool use_vb = conf["optimization_method"].as<string>() == "vb";
-  const double alpha = 1e-09;
-  if (use_vb)
-    cerr << "Using variational Bayes, make sure alphas are set\n";
-
-  const string s_obj = "**OBJ**";
-  // E-step
-  string cur_key = "";
-  SparseVector<double> acc;
-  double logprob = 0;
-  while(cin) {
-    string line;
-    getline(cin, line);
-    if (line.empty()) continue;
-    int feat;
-    double val;
-    size_t i = line.find("\t");
-    const string key = line.substr(0, i);
-    assert(i != string::npos);
-    ++i;
-    if (key != cur_key) {
-      if  (cur_key.size() > 0) {
-        // TODO shouldn't be num_active, should be total number
-        // of events
-        Maximize(use_vb, alpha, acc.size(), &acc);
-        cout << cur_key << '\t';
-        if (use_b64)
-          B64::Encode(0.0, acc, &cout);
-        else
-          cout << acc;
-        cout << endl;
-        acc.clear();
-      }
-      cur_key = key;
-    }
-    if (use_b64) {
-      SparseVector<double> g;
-      double obj;
-      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
-        cerr << "B64 decoder returned error, skipping!\n";
-        continue;
-      }
-      logprob += obj;
-      acc += g;
-    } else {       // text encoding - your counts will not be accurate!
-      while (i < line.size()) {
-        size_t start = i;
-        while (line[i] != '=' && i < line.size()) ++i;
-        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
-        string fname = line.substr(start, i - start);
-        if (fname == s_obj) {
-          feat = -1;
-        } else {
-          feat = FD::Convert(line.substr(start, i - start));
-        }
-        ++i;
-        start = i;
-        while (line[i] != ';' && i < line.size()) ++i;
-        if (i - start == 0) continue;
-        val = atof(line.substr(start, i - start).c_str());
-        ++i;
-        if (feat == -1) {
-          logprob += val;
-        } else {
-          acc.add_value(feat, val);
-        }
-      }
-    }
-  }
-  // TODO shouldn't be num_active, should be total number
-  // of events
-  Maximize(use_vb, alpha, acc.size(), &acc);
-  cout << cur_key << '\t';
-  if (use_b64)
-    B64::Encode(0.0, acc, &cout);
-  else
-    cout << acc;
-  cout << endl << flush;
-
-  cerr << "LOGPROB: " << logprob << endl;
-
-  return 0;
-}
diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc
deleted file mode 100644
index ead4598d..00000000
--- a/training/mr_em_map_adapter.cc
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include <cmath>
-
-#include <boost/utility.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include "boost/tuple/tuple.hpp"
-
-#include "fdict.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// useful for EM models parameterized by a bunch of multinomials
-// this converts event counts (returned from cdec as feature expectations)
-// into different keys and values (which are lists of all the events,
-// conditioned on the key) for summing and normalization by a reducer
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts")
-        ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-struct EventMapper {
-  int Map(int fid) {
-    int& cv = map_[fid];
-    if (!cv) {
-      cv = GetConditioningVariable(fid);
-    }
-    return cv;
-  }
-  void Clear() { map_.clear(); }
- protected:
-  virtual int GetConditioningVariable(int fid) const = 0;
- private:
-  map<int, int> map_;
-};
-
-struct LexAlignEventMapper : public EventMapper {
- protected:
-  virtual int GetConditioningVariable(int fid) const {
-    const string& str = FD::Convert(fid);
-    size_t pos = str.rfind("_");
-    if (pos == string::npos || pos == 0 || pos >= str.size() - 1) {
-      cerr << "Bad feature for EM adapter: " << str << endl;
-      abort();
-    }
-    return FD::Convert(str.substr(0, pos));
-  }
-};
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  const bool use_b64 = conf["format"].as<string>() == "b64";
-  const int buffer_size = conf["buffer_size"].as<int>();
-
-  const string s_obj = "**OBJ**";
-  // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
-  // 0<TAB>**OBJ**=1.1;Feat1=1.0;
-
-  EventMapper* event_mapper = new LexAlignEventMapper;
-  map<int, SparseVector<double> > counts;
-  size_t total = 0;
-  while(cin) {
-    string line;
-    getline(cin, line);
-    if (line.empty()) continue;
-    int feat;
-    double val;
-    size_t i = line.find("\t");
-    assert(i != string::npos);
-    ++i;
-    SparseVector<double> g;
-    double obj = 0;
-    if (use_b64) {
-      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
-        cerr << "B64 decoder returned error, skipping!\n";
-        continue;
-      }
-    } else {       // text encoding - your counts will not be accurate!
-      while (i < line.size()) {
-        size_t start = i;
-        while (line[i] != '=' && i < line.size()) ++i;
-        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
-        string fname = line.substr(start, i - start);
-        if (fname == s_obj) {
-          feat = -1;
-        } else {
-          feat = FD::Convert(line.substr(start, i - start));
-        }
-        ++i;
-        start = i;
-        while (line[i] != ';' && i < line.size()) ++i;
-        if (i - start == 0) continue;
-        val = atof(line.substr(start, i - start).c_str());
-        ++i;
-        if (feat == -1) {
-          obj = val;
-        } else {
-          g.set_value(feat, val);
-        }
-      }
-    }
-    //cerr << "OBJ: " << obj << endl;
-    const SparseVector<double>& cg = g;
-    for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
-      const int cond_var = event_mapper->Map(it->first);
-      SparseVector<double>& cond_counts = counts[cond_var];
-      int delta = cond_counts.size();
-      cond_counts.add_value(it->first, it->second);
-      delta = cond_counts.size() - delta;
-      total += delta;
-    }
-    if (total > buffer_size) {
-      for (map<int, SparseVector<double> >::iterator it = counts.begin();
-           it != counts.end(); ++it) {
-        const SparseVector<double>& cc = it->second;
-        cout << FD::Convert(it->first) << '\t';
-        if (use_b64) {
-          B64::Encode(0.0, cc, &cout);
-        } else {
-          abort();
-        }
-        cout << endl;
-      }
-      cout << flush;
-      total = 0;
-      counts.clear();
-    }
-  }
-
-  return 0;
-}
-
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
deleted file mode 100644
index d490192f..00000000
--- a/training/mr_optimize_reduce.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void SanityCheck(const vector<double>& w) {
-  for (int i = 0; i < w.size(); ++i) {
-    assert(!std::isnan(w[i]));
-    assert(!std::isinf(w[i]));
-  }
-}
-
-struct FComp {
-  const vector<double>& w_;
-  FComp(const vector<double>& w) : w_(w) {}
-  bool operator()(int a, int b) const {
-    return fabs(w_[a]) > fabs(w_[b]);
-  }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
-  vector<int> fnums(w.size());
-  for (int i = 0; i < w.size(); ++i)
-    fnums[i] = i;
-  vector<int>::iterator mid = fnums.begin();
-  mid += (w.size() > 10 ? 10 : w.size());
-  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
-  cerr << "TOP FEATURES:";
-  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
-    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
-  }
-  cerr << endl;
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input_weights,i",po::value<string>(),"Input feature weights file")
-        ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
-        ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
-        ("state,s",po::value<string>(),"Read (and write if output_state is not set) optimizer state from this state file. In the first iteration, the file should not exist.")
-        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)")
-        ("output_state,S", po::value<string>(), "Output state file (optional override)")
-	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
-        ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)")
-        ("gaussian_prior,p","Use a Gaussian prior on the weights")
-        ("means,u", po::value<string>(), "File containing the means for Gaussian prior")
-        ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  const bool use_b64 = conf["input_format"].as<string>() == "b64";
-
-  vector<weight_t> lambdas;
-  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
-  const string s_obj = "**OBJ**";
-  int num_feats = FD::NumFeats();
-  cerr << "Number of features: " << num_feats << endl;
-  const bool gaussian_prior = conf.count("gaussian_prior");
-  vector<weight_t> means(num_feats, 0);
-  if (conf.count("means")) {
-    if (!gaussian_prior) {
-      cerr << "Don't use --means without --gaussian_prior!\n";
-      exit(1);
-    }
-    Weights::InitFromFile(conf["means"].as<string>(), &means);
-  }
-  boost::shared_ptr<BatchOptimizer> o;
-  const string omethod = conf["optimization_method"].as<string>();
-  if (omethod == "rprop")
-    o.reset(new RPropOptimizer(num_feats));  // TODO add configuration
-  else
-    o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>()));
-  cerr << "Optimizer: " << o->Name() << endl;
-  string state_file = conf["state"].as<string>();
-  {
-    ifstream in(state_file.c_str(), ios::binary);
-    if (in)
-      o->Load(&in);
-    else
-      cerr << "No state file found, assuming ITERATION 1\n";
-  }
-
-  double objective = 0;
-  vector<double> gradient(num_feats, 0);
-  // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
-  // 0<TAB>**OBJ**=1.1;Feat1=1.0;
-  int total_lines = 0;  // TODO - this should be a count of the
-                        // training instances!!
-  while(cin) {
-    string line;
-    getline(cin, line);
-    if (line.empty()) continue;
-    ++total_lines;
-    int feat;
-    double val;
-    size_t i = line.find("\t");
-    assert(i != string::npos);
-    ++i;
-    if (use_b64) {
-      SparseVector<double> g;
-      double obj;
-      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
-        cerr << "B64 decoder returned error, skipping gradient!\n";
-	cerr << "  START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl;
-	if (line.size() > 200)
-	  cerr << "    END: " << line.substr(line.size() - 200, 200) << endl;
-        cout << "-1\tRESTART\n";
-        exit(99);
-      }
-      objective += obj;
-      const SparseVector<double>& cg = g;
-      for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
-        if (it->first >= num_feats) {
-	  cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl;
-	  abort();
-        }
-        gradient[it->first] -= it->second;
-      }
-    } else {       // text encoding - your gradients will not be accurate!
-      while (i < line.size()) {
-        size_t start = i;
-        while (line[i] != '=' && i < line.size()) ++i;
-        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
-        string fname = line.substr(start, i - start);
-        if (fname == s_obj) {
-          feat = -1;
-        } else {
-          feat = FD::Convert(line.substr(start, i - start));
-          if (feat >= num_feats) {
-	    cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl;
-	    abort();
-	  }
-        }
-        ++i;
-        start = i;
-        while (line[i] != ';' && i < line.size()) ++i;
-        if (i - start == 0) continue;
-        val = atof(line.substr(start, i - start).c_str());
-        ++i;
-        if (feat == -1) {
-          objective += val;
-        } else {
-          gradient[feat] -= val;
-        }
-      }
-    }
-  }
-
-  if (gaussian_prior) {
-    const double sigsq = conf["sigma_squared"].as<double>();
-    double norm = 0;
-    for (int k = 1; k < lambdas.size(); ++k) {
-      const double& lambda_k = lambdas[k];
-      if (lambda_k) {
-        const double param = (lambda_k - means[k]);
-        norm += param * param;
-        gradient[k] += param / sigsq;
-      }
-    }
-    const double reg = norm / (2.0 * sigsq);
-    cerr << "REGULARIZATION TERM: " << reg << endl;
-    objective += reg;
-  }
-  cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl;
-  double gnorm = 0;
-  for (int i = 0; i < gradient.size(); ++i)
-    gnorm += gradient[i] * gradient[i];
-  cerr << "  GNORM=" << sqrt(gnorm) << endl;
-  vector<double> old = lambdas;
-  int c = 0;
-  while (old == lambdas) {
-    ++c;
-    if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; }
-    o->Optimize(objective, gradient, &lambdas);
-    assert(c < 5);
-  }
-  old.clear();
-  SanityCheck(lambdas);
-  ShowLargestFeatures(lambdas);
-  Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false);
-
-  const bool conv = o->HasConverged();
-  if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-  
-  if (conf.count("output_state"))
-    state_file = conf["output_state"].as<string>();
-  ofstream out(state_file.c_str(), ios::binary);
-  cerr << "Writing state to: " << state_file << endl;
-  o->Save(&out);
-  out.close();
-
-  cout << o->EvaluationCount() << "\t" << conv << endl;
-  return 0;
-}
diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc
deleted file mode 100644
index 16b47720..00000000
--- a/training/mr_reduce_to_weights.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)")
-        ("input,i",po::value<string>()->default_value("-"),"Read file from")
-        ("output,o",po::value<string>()->default_value("-"),"Write weights to");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-  
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    ifstream config((*conf)["config"].as<string>().c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-void WriteWeights(const SparseVector<double>& weights, ostream* out) {
-  for (SparseVector<double>::const_iterator it = weights.begin();
-       it != weights.end(); ++it) {
-    (*out) << FD::Convert(it->first) << " " << it->second << endl;
-  }
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-
-  const bool use_b64 = conf["input_format"].as<string>() == "b64";
-
-  const string s_obj = "**OBJ**";
-  // E-step
-  ReadFile rf(conf["input"].as<string>());
-  istream* in = rf.stream();
-  assert(*in);
-  WriteFile wf(conf["output"].as<string>());
-  ostream* out = wf.stream();
-  out->precision(17);
-  while(*in) {
-    string line;
-    getline(*in, line);
-    if (line.empty()) continue;
-    int feat;
-    double val;
-    size_t i = line.find("\t");
-    assert(i != string::npos);
-    ++i;
-    if (use_b64) {
-      SparseVector<double> g;
-      double obj;
-      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
-        cerr << "B64 decoder returned error, skipping!\n";
-        continue;
-      }
-      WriteWeights(g, out);
-    } else {       // text encoding - your counts will not be accurate!
-      SparseVector<double> weights;
-      while (i < line.size()) {
-        size_t start = i;
-        while (line[i] != '=' && i < line.size()) ++i;
-        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
-        string fname = line.substr(start, i - start);
-        if (fname == s_obj) {
-          feat = -1;
-        } else {
-          feat = FD::Convert(line.substr(start, i - start));
-        }
-        ++i;
-        start = i;
-        while (line[i] != ';' && i < line.size()) ++i;
-        if (i - start == 0) continue;
-        val = atof(line.substr(start, i - start).c_str());
-        ++i;
-        if (feat != -1) {
-          weights.set_value(feat, val);
-        }
-      }
-      WriteWeights(weights, out);
-    }
-  }
-
-  return 0;
-}
diff --git a/training/online_optimizer.cc b/training/online_optimizer.cc
deleted file mode 100644
index 3ed95452..00000000
--- a/training/online_optimizer.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "online_optimizer.h"
-
-LearningRateSchedule::~LearningRateSchedule() {}
-
-double StandardLearningRate::eta(int k) const {
-  return eta_0_ / (1.0 + k / N_);
-}
-
-double ExponentialDecayLearningRate::eta(int k) const {
-  return eta_0_ * pow(alpha_, k / N_);
-}
-
-OnlineOptimizer::~OnlineOptimizer() {}
-
-void OnlineOptimizer::ResetEpochImpl() {}
-
diff --git a/training/online_optimizer.h b/training/online_optimizer.h
deleted file mode 100644
index 28d89344..00000000
--- a/training/online_optimizer.h
+++ /dev/null
@@ -1,129 +0,0 @@
-#ifndef _ONL_OPTIMIZE_H_
-#define _ONL_OPTIMIZE_H_
-
-#include <tr1/memory>
-#include <set>
-#include <string>
-#include <cmath>
-#include "sparse_vector.h"
-
-struct LearningRateSchedule {
-  virtual ~LearningRateSchedule();
-  // returns the learning rate for the kth iteration
-  virtual double eta(int k) const = 0;
-};
-
-// TODO in the Tsoruoaka et al. (ACL 2009) paper, they use N
-// to mean the batch size in most places, but it doesn't completely
-// make sense to me in the learning rate schedules-- this needs
-// to be worked out to make sure they didn't mean corpus size
-// in some places and batch size in others (since in the paper they
-// only ever work with batch sizes of 1)
-struct StandardLearningRate : public LearningRateSchedule {
-  StandardLearningRate(
-      size_t batch_size,        // batch size, not corpus size!
-      double eta_0 = 0.2) :
-    eta_0_(eta_0),
-    N_(static_cast<double>(batch_size)) {}
-
-  virtual double eta(int k) const;
-
- private:
-  const double eta_0_;
-  const double N_;
-};
-
-struct ExponentialDecayLearningRate : public LearningRateSchedule {
-  ExponentialDecayLearningRate(
-      size_t batch_size,        // batch size, not corpus size!
-      double eta_0 = 0.2,
-      double alpha = 0.85       // recommended by Tsuruoka et al. (ACL 2009)
-    ) : eta_0_(eta_0),
-        N_(static_cast<double>(batch_size)),
-        alpha_(alpha) {
-    assert(alpha > 0);
-    assert(alpha < 1.0);
-  }
-
-  virtual double eta(int k) const;
-
- private:
-  const double eta_0_;
-  const double N_;
-  const double alpha_;
-};
-
-class OnlineOptimizer {
- public:
-  virtual ~OnlineOptimizer();
-  OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
-                  size_t batch_size,
-                  const std::vector<int>& frozen_feats = std::vector<int>())
-      : N_(batch_size),schedule_(s),k_() {
-    for (int i = 0; i < frozen_feats.size(); ++i)
-      frozen_.insert(frozen_feats[i]);
-  }
-  void ResetEpoch() { k_ = 0; ResetEpochImpl(); }
-  void UpdateWeights(const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
-    ++k_;
-    const double eta = schedule_->eta(k_);
-    UpdateWeightsImpl(eta, approx_g, max_feat, weights);
-  }
-
- protected:
-  virtual void ResetEpochImpl();
-  virtual void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) = 0;
-  const size_t N_; // number of training instances per batch
-  std::set<int> frozen_;  // frozen (non-optimizing) features
-
- private:
-  std::tr1::shared_ptr<LearningRateSchedule> schedule_;
-  int k_;  // iteration count
-};
-
-class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
- public:
-  CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
-                              size_t training_instances, double C,
-                              const std::vector<int>& frozen) :
-    OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {}
-
- protected:
-  void ResetEpochImpl() { u_ = 0; }
-  void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
-    u_ += eta * C_ / N_;
-    for (SparseVector<double>::const_iterator it = approx_g.begin(); 
-         it != approx_g.end(); ++it) {
-      if (frozen_.count(it->first) == 0)
-        weights->add_value(it->first, eta * it->second);
-    }
-    for (int i = 1; i < max_feat; ++i)
-      if (frozen_.count(i) == 0) ApplyPenalty(i, weights);
-  }
-
- private:
-  void ApplyPenalty(int i, SparseVector<double>* w) {
-    const double z = w->value(i);
-    double w_i = z;
-    double q_i = q_.value(i);
-    if (w_i > 0.0)
-      w_i = std::max(0.0, w_i - (u_ + q_i));
-    else if (w_i < 0.0)
-      w_i = std::min(0.0, w_i + (u_ - q_i));
-    q_i += w_i - z;
-    if (q_i == 0.0)
-      q_.erase(i);
-    else
-      q_.set_value(i, q_i);
-    if (w_i == 0.0)
-      w->erase(i);
-    else
-      w->set_value(i, w_i);
-  }
-
-  const double C_;  // reguarlization strength
-  double u_;
-  SparseVector<double> q_;
-};
-
-#endif
diff --git a/training/optimize.cc b/training/optimize.cc
deleted file mode 100644
index 41ac90d8..00000000
--- a/training/optimize.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-#include "optimize.h"
-
-#include <iostream>
-#include <cassert>
-
-#include "lbfgs.h"
-
-using namespace std;
-
-BatchOptimizer::~BatchOptimizer() {}
-
-void BatchOptimizer::Save(ostream* out) const {
-  out->write((const char*)&eval_, sizeof(eval_));
-  out->write((const char*)&has_converged_, sizeof(has_converged_));
-  SaveImpl(out);
-  unsigned int magic = 0xABCDDCBA;  // should be uint32_t
-  out->write((const char*)&magic, sizeof(magic));
-}
-
-void BatchOptimizer::Load(istream* in) {
-  in->read((char*)&eval_, sizeof(eval_));
-  in->read((char*)&has_converged_, sizeof(has_converged_));
-  LoadImpl(in);
-  unsigned int magic = 0;           // should be uint32_t
-  in->read((char*)&magic, sizeof(magic));
-  assert(magic == 0xABCDDCBA);
-  cerr << Name() << " EVALUATION #" << eval_ << endl;
-}
-
-void BatchOptimizer::SaveImpl(ostream* out) const {
-  (void)out;
-}
-
-void BatchOptimizer::LoadImpl(istream* in) {
-  (void)in;
-}
-
-string RPropOptimizer::Name() const {
-  return "RPropOptimizer";
-}
-
-void RPropOptimizer::OptimizeImpl(const double& obj,
-                              const vector<double>& g,
-                              vector<double>* x) {
-  for (int i = 0; i < g.size(); ++i) {
-    const double g_i = g[i];
-    const double sign_i = (signbit(g_i) ? -1.0 : 1.0);
-    const double prod = g_i * prev_g_[i];
-    if (prod > 0.0) {
-      const double dij = min(delta_ij_[i] * eta_plus_, delta_max_);
-      (*x)[i] -= dij * sign_i;
-      delta_ij_[i] = dij;
-      prev_g_[i] = g_i;
-    } else if (prod < 0.0) {
-      delta_ij_[i] = max(delta_ij_[i] * eta_minus_, delta_min_);
-      prev_g_[i] = 0.0;
-    } else {
-      (*x)[i] -= delta_ij_[i] * sign_i;
-      prev_g_[i] = g_i;
-    }
-  }
-}
-
-void RPropOptimizer::SaveImpl(ostream* out) const {
-  const size_t n = prev_g_.size();
-  out->write((const char*)&n, sizeof(n));
-  out->write((const char*)&prev_g_[0], sizeof(double) * n);
-  out->write((const char*)&delta_ij_[0], sizeof(double) * n);
-}
-
-void RPropOptimizer::LoadImpl(istream* in) {
-  size_t n;
-  in->read((char*)&n, sizeof(n));
-  assert(n == prev_g_.size());
-  assert(n == delta_ij_.size());
-  in->read((char*)&prev_g_[0], sizeof(double) * n);
-  in->read((char*)&delta_ij_[0], sizeof(double) * n);
-}
-
-string LBFGSOptimizer::Name() const {
-  return "LBFGSOptimizer";
-}
-
-LBFGSOptimizer::LBFGSOptimizer(int num_feats, int memory_buffers) :
-  opt_(num_feats, memory_buffers) {}
-
-void LBFGSOptimizer::SaveImpl(ostream* out) const {
-  opt_.serialize(out);
-}
-
-void LBFGSOptimizer::LoadImpl(istream* in) {
-  opt_.deserialize(in);
-}
-
-void LBFGSOptimizer::OptimizeImpl(const double& obj,
-                                  const vector<double>& g,
-                                  vector<double>* x) {
-  opt_.run(&(*x)[0], obj, &g[0]);
-  if (!opt_.requests_f_and_g()) opt_.run(&(*x)[0], obj, &g[0]);
-  // cerr << opt_ << endl;
-}
-
diff --git a/training/optimize.h b/training/optimize.h
deleted file mode 100644
index 07943b44..00000000
--- a/training/optimize.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _OPTIMIZE_H_
-#define _OPTIMIZE_H_
-
-#include <iostream>
-#include <vector>
-#include <string>
-#include <cassert>
-
-#include "lbfgs.h"
-
-// abstract base class for first order optimizers
-// order of invocation: new, Load(), Optimize(), Save(), delete
-class BatchOptimizer {
- public:
-  BatchOptimizer() : eval_(1), has_converged_(false) {}
-  virtual ~BatchOptimizer();
-  virtual std::string Name() const = 0;
-  int EvaluationCount() const { return eval_; }
-  bool HasConverged() const { return has_converged_; }
-
-  void Optimize(const double& obj,
-                const std::vector<double>& g,
-                std::vector<double>* x) {
-    assert(g.size() == x->size());
-    ++eval_;
-    OptimizeImpl(obj, g, x);
-    scitbx::lbfgs::traditional_convergence_test<double> converged(g.size());
-    has_converged_ = converged(&(*x)[0], &g[0]);
-  }
-
-  void Save(std::ostream* out) const;
-  void Load(std::istream* in);
- protected:
-  virtual void SaveImpl(std::ostream* out) const;
-  virtual void LoadImpl(std::istream* in);
-  virtual void OptimizeImpl(const double& obj,
-                            const std::vector<double>& g,
-                            std::vector<double>* x) = 0;
-
-  int eval_;
- private:
-  bool has_converged_;
-};
-
-class RPropOptimizer : public BatchOptimizer {
- public:
-  explicit RPropOptimizer(int num_vars,
-                          double eta_plus = 1.2,
-                          double eta_minus = 0.5,
-                          double delta_0 = 0.1,
-                          double delta_max = 50.0,
-                          double delta_min = 1e-6) :
-      prev_g_(num_vars, 0.0),
-      delta_ij_(num_vars, delta_0),
-      eta_plus_(eta_plus),
-      eta_minus_(eta_minus),
-      delta_max_(delta_max),
-      delta_min_(delta_min) {
-    assert(eta_plus > 1.0);
-    assert(eta_minus > 0.0 && eta_minus < 1.0);
-    assert(delta_max > 0.0);
-    assert(delta_min > 0.0);
-  }
-  std::string Name() const;
-  void OptimizeImpl(const double& obj,
-                    const std::vector<double>& g,
-                    std::vector<double>* x);
-  void SaveImpl(std::ostream* out) const;
-  void LoadImpl(std::istream* in);
- private:
-  std::vector<double> prev_g_;
-  std::vector<double> delta_ij_;
-  const double eta_plus_;
-  const double eta_minus_;
-  const double delta_max_;
-  const double delta_min_;
-};
-
-class LBFGSOptimizer : public BatchOptimizer {
- public:
-  explicit LBFGSOptimizer(int num_vars, int memory_buffers = 10);
-  std::string Name() const;
-  void SaveImpl(std::ostream* out) const;
-  void LoadImpl(std::istream* in);
-  void OptimizeImpl(const double& obj,
-                    const std::vector<double>& g,
-                    std::vector<double>* x);
- private:
-  scitbx::lbfgs::minimizer<double> opt_;
-};
-
-#endif
diff --git a/training/optimize_test.cc b/training/optimize_test.cc
deleted file mode 100644
index bff2ca03..00000000
--- a/training/optimize_test.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <cassert>
-#include <iostream>
-#include <sstream>
-#include <boost/program_options/variables_map.hpp>
-#include "optimize.h"
-#include "online_optimizer.h"
-#include "sparse_vector.h"
-#include "fdict.h"
-
-using namespace std;
-
-double TestOptimizer(BatchOptimizer* opt) {
-  cerr << "TESTING NON-PERSISTENT OPTIMIZER\n";
-
-  // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
-  // df/dx1 = 8*x1 + x2
-  // df/dx2 = 2*x2 + x1
-  // df/dx3 = 2*x3 + 6
-  vector<double> x(3);
-  vector<double> g(3);
-  x[0] = 8;
-  x[1] = 8;
-  x[2] = 8;
-  double obj = 0;
-  do {
-    g[0] = 8 * x[0] + x[1];
-    g[1] = 2 * x[1] + x[0];
-    g[2] = 2 * x[2] + 6;
-    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
-    opt->Optimize(obj, g, &x);
-
-    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
-    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
-  } while (!opt->HasConverged());
-  return obj;
-}
-
-double TestPersistentOptimizer(BatchOptimizer* opt) {
-  cerr << "\nTESTING PERSISTENT OPTIMIZER\n";
-  // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
-  // df/dx1 = 8*x1 + x2
-  // df/dx2 = 2*x2 + x1
-  // df/dx3 = 2*x3 + 6
-  vector<double> x(3);
-  vector<double> g(3);
-  x[0] = 8;
-  x[1] = 8;
-  x[2] = 8;
-  double obj = 0;
-  string state;
-  bool converged = false;
-  while (!converged) {
-    g[0] = 8 * x[0] + x[1];
-    g[1] = 2 * x[1] + x[0];
-    g[2] = 2 * x[2] + 6;
-    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
-
-    {
-      if (state.size() > 0) {
-        istringstream is(state, ios::binary);
-        opt->Load(&is);
-      }
-      opt->Optimize(obj, g, &x);
-      ostringstream os(ios::binary); opt->Save(&os); state = os.str();
-
-    }
-
-    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
-    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
-    converged = opt->HasConverged();
-    if (!converged) {
-      // now screw up the state (should be undone by Load)
-      obj += 2.0;
-      g[1] = -g[2];
-      vector<double> x2 = x;
-      try {
-        opt->Optimize(obj, g, &x2);
-      } catch (...) { }
-    }
-  }
-  return obj;
-}
-
-template <class O>
-void TestOptimizerVariants(int num_vars) {
-  O oa(num_vars);
-  cerr << "-------------------------------------------------------------------------\n";
-  cerr << "TESTING: " << oa.Name() << endl;
-  double o1 = TestOptimizer(&oa);
-  O ob(num_vars);
-  double o2 = TestPersistentOptimizer(&ob);
-  if (o1 != o2) {
-    cerr << oa.Name() << " VARIANTS PERFORMED DIFFERENTLY!\n" << o1 << " vs. " << o2 << endl;
-    exit(1);
-  }
-  cerr << oa.Name() << " SUCCESS\n";
-}
-
-using namespace std::tr1;
-
-void TestOnline() {
-  size_t N = 20;
-  double C = 1.0;
-  double eta0 = 0.2;
-  std::tr1::shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85));
-  //shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0));
-  CumulativeL1OnlineOptimizer opt(r, N, C, std::vector<int>());
-  assert(r->eta(10) < r->eta(1));
-}
-
-int main() {
-  int n = 3;
-  TestOptimizerVariants<LBFGSOptimizer>(n);
-  TestOptimizerVariants<RPropOptimizer>(n);
-  TestOnline();
-  return 0;
-}
-
diff --git a/training/pro/Makefile.am b/training/pro/Makefile.am
new file mode 100644
index 00000000..1916b6b2
--- /dev/null
+++ b/training/pro/Makefile.am
@@ -0,0 +1,11 @@
+bin_PROGRAMS = \
+  mr_pro_map \
+  mr_pro_reduce
+
+mr_pro_map_SOURCES = mr_pro_map.cc
+mr_pro_map_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+mr_pro_reduce_SOURCES = mr_pro_reduce.cc
+mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training
diff --git a/training/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/training/pro/mr_pro_generate_mapper_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+for my $hg (@hgs) {  # one output line per hypergraph file: "<path> <id>"
+  my $file = $hg;
+  my $id = $hg;
+  $id =~ s/(\.json)?\.gz$//;  # anchored so only the trailing extension is stripped
+  print "$d/$file $id\n";
+}
+
diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc
new file mode 100644
index 00000000..eef40b8a
--- /dev/null
+++ b/training/pro/mr_pro_map.cc
@@ -0,0 +1,201 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <tr1/unordered_map>
+
+#include <boost/functional/hash.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "candidate_set.h"
+#include "sampler.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "inside_outside.h"
+#include "hg_io.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+
+// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
+
+using namespace std;
+namespace po = boost::program_options;
+
+boost::shared_ptr<MT19937> rng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
+        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+        ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
+        ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
+        ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  bool flag = false;
+  if (!conf->count("reference")) {
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+    flag = true;
+  }
+  if (!conf->count("weights")) {
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+    flag = true;
+  }
+  if (flag || conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+struct ThresholdAlpha {  // step function: 0.0 below the threshold, 1.0 at or above it
+  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
+  double operator()(double mag) const {
+    if (mag < threshold) return 0.0; else return 1.0;
+  }
+  const double threshold;  // cut-off that mag is compared against
+};
+
+struct TrainingInstance {  // one labelled example: feature vector, binary label, and score gap
+  TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff) : x(feats), y(positive), gdiff(diff) {}
+  SparseVector<weight_t> x;  // feature vector of the example
+#undef DEBUGGING_PRO
+#ifdef DEBUGGING_PRO
+  vector<WordID> a;  // debug only: compiled out because DEBUGGING_PRO is #undef'd just above
+  vector<WordID> b;
+#endif
+  bool y;  // label
+  float gdiff;  // magnitude used by DiffOrder to rank instances
+};
+#ifdef DEBUGGING_PRO
+ostream& operator<<(ostream& os, const TrainingInstance& d) {
+  return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
+}
+#endif
+
+struct DiffOrder {  // comparator: sorts TrainingInstances by descending gdiff
+  bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
+    return a.gdiff > b.gdiff;
+  }
+};
+
+// Draws up to gamma random candidate pairs from J_i, subsamples them, and appends the (at most) xi pairs with the largest metric gap to *pv.
+void Sample(const unsigned gamma, const unsigned xi,
+            const training::CandidateSet& J_i,
+            const EvaluationMetric* metric,
+            vector<TrainingInstance>* pv) {
+  if (J_i.size() == 0) return;  // nothing to pair up; also keeps size() - 1 below from wrapping around
+  const bool invert_score = metric->IsErrorMetric();
+  vector<TrainingInstance> v1, v2;
+  float avg_diff = 0;
+  for (unsigned i = 0; i < gamma; ++i) {
+    const size_t a = rng->inclusive(0, J_i.size() - 1)();
+    const size_t b = rng->inclusive(0, J_i.size() - 1)();
+    if (a == b) continue;
+    float ga = metric->ComputeScore(J_i[a].eval_feats);
+    float gb = metric->ComputeScore(J_i[b].eval_feats);
+    bool positive = gb < ga;
+    if (invert_score) positive = !positive;
+    const float gdiff = fabs(ga - gb);
+    if (!gdiff) continue;
+    avg_diff += gdiff;
+    SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
+    if (xdiff.empty()) {
+      cerr << "Empty diff:\n  " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl;
+      cerr << "  " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl;
+      continue;
+    }
+    v1.push_back(TrainingInstance(xdiff, positive, gdiff));
+#ifdef DEBUGGING_PRO
+    v1.back().a = J_i[a].hyp;
+    v1.back().b = J_i[b].hyp;
+    cerr << "N: " << v1.back() << endl;
+#endif
+  }
+  if (!v1.empty()) avg_diff /= v1.size();  // avoid 0/0 when no pair survived the filters above
+  // keep each pair with probability sigmoid(avg_diff + gdiff), so larger gaps survive more often
+  for (unsigned i = 0; i < v1.size(); ++i) {
+    double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+    // the pair is retained when the random draw falls below p
+    if (rng->next() < p) v2.push_back(v1[i]);
+  }
+  vector<TrainingInstance>::iterator mid = v2.begin() + min<size_t>(xi, v2.size());  // clamp first: never form an iterator past end()
+  partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+  copy(v2.begin(), mid, back_inserter(*pv));
+#ifdef DEBUGGING_PRO
+  if (v2.size() >= 5) {
+    for (int i =0; i < (mid - v2.begin()); ++i) {
+      cerr << v2[i] << endl;
+    }
+    cerr << pv->back() << endl;
+  }
+#endif
+}
+
+int main(int argc, char** argv) {  // mapper: reads "<hypergraph file> <sentence id>" lines, writes labelled feature-difference vectors
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+
+  Hypergraph hg;
+  string last_file;
+  ReadFile in_read(conf["input"].as<string>());
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
+  const unsigned xi = conf["best_pairs"].as<unsigned>();
+  string weightsf = conf["weights"].as<string>();
+  vector<weight_t> weights;
+  Weights::InitFromFile(weightsf, &weights);
+  string kbest_repo = conf["kbest_repository"].as<string>();
+  MkDirP(kbest_repo);
+  while(in) {
+    vector<TrainingInstance> v;
+    string line;
+    getline(in, line);
+    if (line.empty()) continue;
+    istringstream is(line);
+    int sent_id;
+    string file;
+    // each input line is: path-to-file (JSON) sent_id; reject anything else rather than index ds with an uninitialised id
+    if (!(is >> file >> sent_id)) { cerr << "Malformed input line: " << line << endl; return 1; }
+    ReadFile rf(file);
+    ostringstream os;
+    training::CandidateSet J_i;
+    os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+    const string kbest_file = os.str();
+    if (FileExists(kbest_file))  // candidates saved for this sentence by an earlier run are loaded first
+      J_i.ReadFromFile(kbest_file);
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+    J_i.WriteToFile(kbest_file);
+
+    Sample(gamma, xi, J_i, metric, &v);
+    for (unsigned i = 0; i < v.size(); ++i) {
+      const TrainingInstance& vi = v[i];
+      cout << vi.y << "\t" << vi.x << endl;  // every instance is emitted twice: as is ...
+      cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;  // ... and mirrored (label flipped, features negated)
+    }
+  }
+  return 0;
+}
+
diff --git a/training/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc
new file mode 100644
index 00000000..5ef9b470
--- /dev/null
+++ b/training/pro/mr_pro_reduce.cc
@@ -0,0 +1,286 @@
+#include <cstdlib>
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "optimize.h"
+#include "liblbfgs/lbfgs++.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples, so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses argv into *conf; prints usage and exits on --help
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)")
+        ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength")
+        ("l1",po::value<double>()->default_value(0.0), "l1 regularization strength")
+        ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
+        ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)")
+        ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strength")
+        ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strength")
+        ("testset,t",po::value<string>(), "Optional held-out test set")
+        ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
+        ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("help")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// Parses space-separated "name=value" tokens of line, starting at offset cur, and stores each pair in *out.
+// NOTE: line is modified in place (each separating space is overwritten with NUL so strtod stops at the token end).
+void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) {
+  SparseVector<weight_t>& x = *out;
+  size_t last_start = cur;
+  size_t last_comma = string::npos;  // position of the '=' inside the current token, npos until one is seen
+  while(cur <= line.size()) {
+    if (cur == line.size() || line[cur] == ' ') {  // test the bound first so line[line.size()] is never read
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      const weight_t val = strtod(&line[last_comma + 1], NULL);
+      x.set_value(fid, val);
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else if (line[cur] == '=') {
+      last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
+void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) {  // replaces *corpus with the (label, features) pairs read from *pin
+  istream& in = *pin;
+  corpus->clear();
+  bool flag = false;  // true while a progress-dot line on stderr still needs its newline
+  int lc = 0;
+  string line;
+  SparseVector<weight_t> x;
+  while(getline(in, line)) {
+    ++lc;
+    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+    if (line.empty()) continue;
+    const size_t ks = line.find("\t");
+    assert(string::npos != ks);
+    assert(ks == 1);  // the label must be exactly one character, followed by a tab
+    const bool y = line[0] == '1';  // any label character other than '1' is read as false
+    x.clear();
+    ParseSparseVector(line, ks + 1, &x);
+    corpus->push_back(make_pair(y, x));
+  }
+  if (flag) cerr << endl;
+}
+
+void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {  // acc[k] += scale * value for every (k, value) entry of v
+  for (SparseVector<weight_t>::const_iterator it = v.begin();
+       it != v.end(); ++it) {
+    acc[it->first] += it->second * scale;  // no bounds check: acc must be indexable by every key in v
+  }
+}
+
+double ApplyRegularizationTerms(const double C,  // strength of the penalty on the weights themselves
+                                const double T,  // strength of the penalty on the distance to prev_weights
+                                const vector<weight_t>& weights,
+                                const vector<weight_t>& prev_weights,
+                                weight_t* g) {  // gradient accumulator, updated in place; must hold weights.size() entries
+  double reg = 0;
+  for (size_t i = 0; i < weights.size(); ++i) {  // starts at 0, so every index (including 0) is penalized
+    const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);  // missing previous weights count as 0
+    const double& w_i = weights[i];
+    reg += C * w_i * w_i;
+    g[i] += 2 * C * w_i;
+    // second term: squared distance to the previous weight, with the matching gradient
+    const double diff_i = w_i - prev_w_i;
+    reg += T * diff_i * diff_i;
+    g[i] += 2 * T * diff_i;
+  }
+  return reg;  // total penalty: sum over i of C*w_i^2 + T*(w_i - prev_w_i)^2
+}
+
+double TrainingInference(const vector<weight_t>& x,  // returns the negative log-likelihood of corpus under weights x
+                         const vector<pair<bool, SparseVector<weight_t> > >& corpus,
+                         weight_t* g = NULL) {  // when non-NULL, the gradient is accumulated into g (not reset here)
+  double cll = 0;
+  for (int i = 0; i < corpus.size(); ++i) {
+    const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
+    double lp_false = dotprod;
+    double lp_true = -dotprod;
+    if (0 < lp_true) {  // both branches compute log(1+exp(.)) in the form whose exp() argument is non-positive
+      lp_true += log1p(exp(-lp_true));
+      lp_false = log1p(exp(lp_false));
+    } else {
+      lp_true = log1p(exp(lp_true));
+      lp_false += log1p(exp(-lp_false));
+    }
+    lp_true*=-1;  // lp_true is now -log(1+exp(-dotprod)), the log-probability of the true label
+    lp_false*=-1;  // lp_false is now -log(1+exp(dotprod)), the log-probability of the false label
+    if (corpus[i].first) {  // true label
+      cll -= lp_true;
+      if (g) {
+        // in vector form: g -= corpus[i].second * exp(lp_false);
+        GradAdd(corpus[i].second, -exp(lp_false), g);
+        g[0] -= exp(lp_false); // bias
+      }
+    } else {                  // false label
+      cll -= lp_false;
+      if (g) {
+        // in vector form: g += corpus[i].second * exp(lp_true);
+        GradAdd(corpus[i].second, exp(lp_true), g);
+        g[0] += exp(lp_true); // bias
+      }
+    }
+  }
+  return cll;
+}
+
+// Objective handed to LBFGS: training negative log-likelihood plus the regularization terms,
+// with the gradient written to g. Each call also stores the held-out perplexity in tppl (0 if no test data).
+struct ProLoss {
+  ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
+          const vector<pair<bool, SparseVector<weight_t> > >& te,
+          const double c,
+          const double t,
+          const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
+  double operator()(const vector<double>& x, double* g) const {
+    fill(g, g + x.size(), 0.0);
+    double cll = TrainingInference(x, training, g);
+    tppl = 0;
+    // held-out data is only scored: no gradient pointer is passed, so it cannot leak into g
+    if (testing.size())
+      tppl = pow(2.0, TrainingInference(x, testing) / (log(2) * testing.size()));
+    double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
+    return cll + reg;
+  }
+  const vector<pair<bool, SparseVector<weight_t> > >& training, testing;  // references: the corpora must outlive this object
+  const double C, T;  // forwarded unchanged to ApplyRegularizationTerms
+  const vector<double>& prev_x;  // reference to the weights the T term regularizes towards
+  mutable double tppl;  // written from the const operator(), hence mutable
+};
+
+// Optimizes *px in place with LBFGS and returns loss.tppl, the held-out value ProLoss stores (a perplexity, 0 without test data)
+double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
+                       const vector<pair<bool, SparseVector<weight_t> > >& testing,
+                       const double C,
+                       const double C1,
+                       const double T,
+                       const unsigned memory_buffers,
+                       const vector<weight_t>& prev_x,
+                       vector<weight_t>* px) {
+  assert(px->size() == prev_x.size());  // current and previous weight vectors must have the same length
+  ProLoss loss(training, testing, C, T, prev_x);
+  LBFGS<ProLoss> lbfgs(px, loss, memory_buffers, C1);  // C1 is handed to the optimizer, not to ProLoss
+  lbfgs.MinimizeFunction();
+  return loss.tppl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  string line;
+  vector<pair<bool, SparseVector<weight_t> > > training, testing;
+  const bool tune_regularizer = conf.count("tune_regularizer");
+  if (tune_regularizer && !conf.count("testset")) {
+    cerr << "--tune_regularizer requires --testset to be set\n";
+    return 1;
+  }
+  const double min_reg = conf["min_reg"].as<double>();
+  const double max_reg = conf["max_reg"].as<double>();
+  double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
+  double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned
+  const double T = conf["regularize_to_weights"].as<double>();
+  assert(C >= 0.0);
+  assert(min_reg >= 0.0);
+  assert(max_reg >= 0.0);
+  assert(max_reg > min_reg);
+  const double psi = conf["interpolate_with_weights"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
+  ReadCorpus(&cin, &training);
+  if (conf.count("testset")) {
+    ReadFile rf(conf["testset"].as<string>());
+    ReadCorpus(rf.stream(), &testing);
+  }
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+
+  vector<weight_t> x, prev_x;  // x[0] is bias
+  if (conf.count("weights")) {
+    Weights::InitFromFile(conf["weights"].as<string>(), &x);
+    x.resize(FD::NumFeats());
+    prev_x = x;
+  } else {
+    x.resize(FD::NumFeats());
+    prev_x = x;
+  }
+  cerr << "         Number of features: " << x.size() << endl;
+  cerr << "Number of training examples: " << training.size() << endl;
+  cerr << "Number of  testing examples: " << testing.size() << endl;
+  double tppl = 0.0;
+  vector<pair<double,double> > sp;
+  vector<double> smoothed;
+  if (tune_regularizer) {
+    C = min_reg;
+    const double steps = 18;
+    double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
+    cerr << "SWEEP FACTOR: " << sweep_factor << endl;
+    while(C < max_reg) {
+      cerr << "C=" << C << "\tT=" <<T << endl;
+      tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+      sp.push_back(make_pair(C, tppl));
+      C *= sweep_factor;
+    }
+    smoothed.resize(sp.size(), 0);
+    smoothed[0] = sp[0].second;
+    smoothed.back() = sp.back().second; 
+    for (int i = 1; i < sp.size()-1; ++i) {
+      double prev = sp[i-1].second;
+      double next = sp[i+1].second;
+      double cur = sp[i].second;
+      smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
+    }
+    double best_ppl = 9999999;
+    unsigned best_i = 0;
+    for (unsigned i = 0; i < sp.size(); ++i) {
+      if (smoothed[i] < best_ppl) {
+        best_ppl = smoothed[i];
+        best_i = i;
+      }
+    }
+    C = sp[best_i].first;
+  }  // tune regularizer
+  tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+  if (conf.count("weights")) {
+    for (int i = 1; i < x.size(); ++i) {
+      x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
+    }
+  }
+  cout.precision(15);
+  cout << "# C=" << C << "\theld out perplexity=";
+  if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
+  if (sp.size()) {
+    cout << "# Parameter sweep:\n";
+    for (int i = 0; i < sp.size(); ++i) {
+      cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
+    }
+  }
+  Weights::WriteToFile("-", x);
+  return 0;
+}
diff --git a/training/pro/pro.pl b/training/pro/pro.pl
new file mode 100755
index 00000000..3b30c379
--- /dev/null
+++ b/training/pro/pro.pl
@@ -0,0 +1,555 @@
+#!/usr/bin/env perl
+use strict;
+use File::Basename qw(basename);
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
+my $MAPPER = "$bin_dir/mr_pro_map";
+my $REDUCER = "$bin_dir/mr_pro_reduce";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1;  # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $devset;
+
+# regularization strength
+my $reg = 500;
+my $reg_previous = 5000;
+
+# Process command-line options
+if (GetOptions(
+	"config=s" => \$iniFile,
+	"weights=s" => \$initial_weights,
+        "devset=s" => \$devset,
+	"jobs=i" => \$jobs,
+	"max-iterations=i" => \$max_iterations,  # documented in print_help; previously rejected as an unknown option
+	"metric=s" => \$metric,
+	"pass-suffix=s" => \$pass_suffix,
+        "qsub" => \$useqsub,
+	"help" => \$help,
+	"reg=f" => \$reg,
+	"reg-previous=f" => \$reg_previous,
+	"output-dir=s" => \$dir,
+) == 0 || @ARGV!=0 || $help) {  # bad option, stray positional argument, or --help
+	print_help(); exit;
+}
+
+if ($useqsub) {
+  $use_make = 0;
+  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $iniFile) { push @missing_args, "--config"; }
+if (!defined $devset) { push @missing_args, "--devset"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+  $lines_per_mapper = 5;
+}
+
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+  $DIR_FLAG = '';
+}
+
+unless ($dir){
+	$dir = 'pro';
+}
+unless ($dir =~ /^\//){  # convert relative path to absolute path
+	my $basedir = check_output("pwd");
+	chomp $basedir;
+	$dir = "$basedir/$dir";
+}
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+	print STDERR "Cleanup...\n";
+	for my $pid (@childpids){ unchecked_call("kill $pid"); }
+	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+	exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit = 
+    sub{ cleanup(); }; 
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+if (-e $dir) {
+	die "ERROR: working dir $dir already exists\n\n";
+} else {
+	mkdir "$dir" or die "Can't mkdir $dir: $!";
+	mkdir "$dir/hgs" or die;
+	mkdir "$dir/scripts" or die;
+	print STDERR <<EOT;
+	DECODER:          $decoder
+	INI FILE:         $iniFile
+	WORKING DIR:      $dir
+	DEVSET:           $devset
+	EVAL METRIC:      $metric
+	MAX ITERATIONS:   $max_iterations
+	PARALLEL JOBS:    $jobs
+	HEAD NODE:        $host
+	PMEM (DECODING):  $pmem
+	INITIAL WEIGHTS:  $initial_weights
+EOT
+}
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+check_call("cp $initial_weights $dir/weights.0");
+$iniFile = $newIniFile;
+
+my $refs = "$dir/dev.refs";
+split_devset($devset, "$dir/dev.input.raw", $refs);
+my $newsrc = "$dir/dev.input";
+enseg("$dir/dev.input.raw", $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+my @allweights;
+while (1){
+	print STDERR "\n\nITERATION $iteration\n==========\n";
+
+	if ($iteration > $max_iterations){
+		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+		last;
+	}
+	# iteration-specific files
+	my $runFile="$dir/run.raw.$iteration";
+	my $onebestFile="$dir/1best.$iteration";
+	my $logdir="$dir/logs.$iteration";
+	my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+	my $scorerLog="$logdir/scorer.log.$iteration";
+	check_call("mkdir -p $logdir");
+
+
+	#decode
+	print STDERR "RUNNING DECODER AT ";
+	print STDERR unchecked_output("date");
+	my $im1 = $iteration - 1;
+	my $weightsFile="$dir/weights.$im1";
+        push @allweights, "-w $dir/weights.$im1";
+        `rm -f $dir/hgs/*.gz`;
+	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+	my $pcmd;
+	if ($use_make) {
+		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+	} else {
+		$pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+	}
+	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+        my $num_hgs;
+        my $num_topbest;
+        my $retries = 0;
+	while($retries < 5) {
+	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+	    $num_topbest = check_output("wc -l < $runFile");
+	    print STDERR "NUMBER OF HGs: $num_hgs\n";
+	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+	    if($devSize == $num_hgs && $devSize == $num_topbest) {
+		last;
+	    } else {
+		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+		sleep(3);
+	    }
+	    $retries++;
+	}
+	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+	my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric");
+	chomp $dec_score;
+	print STDERR "DECODER SCORE: $dec_score\n";
+
+	# save space
+	check_call("gzip -f $runFile");
+	check_call("gzip -f $decoderLog");
+
+	# run optimizer
+	print STDERR "RUNNING OPTIMIZER AT ";
+	print STDERR unchecked_output("date");
+	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+	my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+	my $score = 0;
+	my $icc = 0;
+	my $inweights="$dir/weights.$im1";
+	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	check_call("mkdir -p $dir/splag.$im1");
+	$cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+	my @shards = grep { /^mapinput\./ } readdir(DIR);
+	closedir DIR;
+	die "No shards!" unless scalar @shards > 0;
+	my $joblist = "";
+	my $nmappers = 0;
+	@cleanupcmds = ();
+	my %o2i = ();
+	my $first_shard = 1;
+	my $mkfile; # only used with makefiles
+	my $mkfilename;
+	if ($use_make) {
+		$mkfilename = "$dir/splag.$im1/domap.mk";
+		open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+		print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+	}
+	my @mkouts = ();  # only used with makefiles
+	my @mapoutputs = ();
+	for my $shard (@shards) {  # one mapper invocation per shard, via a make rule or a qsub job
+		my $mapoutput = $shard;
+		my $client_name = $shard;
+		$client_name =~ s/mapinput\.//;  # literal dot
+		$client_name = "pro.$client_name";
+		$mapoutput =~ s/mapinput/mapoutput/;
+		push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+		$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+		my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+		if ($use_make) {
+			my $script_file = "$dir/scripts/map.$shard";
+			open F, ">$script_file" or die "Can't write $script_file: $!";
+			print F "#!/bin/bash\n";
+			print F "$script\n";
+			close F;
+			my $output = "$dir/splag.$im1/$mapoutput";
+			push @mkouts, $output;
+			chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+			print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+		} else {
+			my $script_file = "$dir/scripts/map.$shard";
+			open F, ">$script_file" or die "Can't write $script_file: $!";
+			print F "$script\n";
+			close F;
+			if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+			$nmappers++;
+			my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+			my $jobid = check_output("$qcmd");
+			chomp $jobid;
+			$jobid =~ s/^(\d+)(.*?)$/$1/g;  # keep only a leading numeric id
+			$jobid =~ s/^Your job (\d+) .*$/$1/;  # or the id out of a "Your job N ..." message
+			push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+			print STDERR " $jobid";
+			if ($joblist eq "") { $joblist = $jobid; }  # string test: a numeric == would treat any non-numeric list as empty
+			else {$joblist = $joblist . "\|" . $jobid; }  # alternation pattern later matched against qstat output
+		}
+	}
+	my @dev_outs = ();
+	my @devtest_outs = ();
+	@dev_outs = @mapoutputs;
+	if ($use_make) {
+		print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+		close $mkfile;
+		my $mcmd = "make -j $jobs -f $mkfilename";
+		print STDERR "\nExecuting: $mcmd\n";
+		check_call($mcmd);
+	} else {
+		print STDERR "\nLaunched $nmappers mappers.\n";
+      		sleep 8;
+		print STDERR "Waiting for mappers to complete...\n";
+		while ($nmappers > 0) {
+		  sleep 5;
+		  my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+		  $nmappers = scalar @livejobs;
+		}
+		print STDERR "All mappers complete.\n";
+	}
+	my $tol = 0;
+	my $til = 0;
+	my $dev_test_file = "$dir/splag.$im1/devtest.gz";
+	print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
+	print STDERR unchecked_output("date");
+	$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
+        $cmd .= " > $dir/weights.$iteration";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+	$lastWeightsFile = "$dir/weights.$iteration";
+	$lastPScore = $score;
+	$iteration++;
+	print STDERR "\n==========\n";
+}
+
+
+check_call("cp $lastWeightsFile $dir/weights.final");
+print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT "$dir/weights.final\n";
+
+exit 0;
+
+sub read_weights_file {  # returns the weight column of $file joined by single spaces
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;
+  while(<F>) {
+    next if /^#/;  # skip comment lines
+    next if /^\s*$/;  # skip blank lines
+    chomp;
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;
+      my $w = $2;
+      die "Weights out of order: $m <= $pm" unless $m > $pm;  # NOTE(review): $pm is never updated, so this only compares $m numerically with -1
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+# Copy $src to $newsrc, one <seg> element per line.  Lines that already start
+# with a <seg tag are kept verbatim (they must carry a numeric id attribute);
+# all other lines are wrapped in <seg id="N">, N being the zero-based line number.
+sub enseg {
+	my ($src, $newsrc) = @_;
+	open(SRC, '<', $src) or die "Can't read $src: $!";
+	open(NEWSRC, '>', $newsrc) or die "Can't write $newsrc: $!";
+	my $i=0;
+	while (my $line=<SRC>){
+		chomp $line;
+		if ($line !~ /^\s*<seg/i) {
+			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+		} elsif ($line =~ /id="[0-9]+"/) {
+			print NEWSRC "$line\n";
+		} else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		}
+		$i++;
+	}
+	close SRC;
+	close(NEWSRC) or die "Can't finish writing $newsrc: $!";
+	die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+
+	my $executable = basename($0); chomp $executable;
+	print << "Help";
+
+Usage: $executable [options]
+
+	$executable [options]
+		Runs a complete PRO optimization using the ini file specified.
+
+Required:
+
+	--config <cdec.ini>
+		Decoder configuration file.
+
+	--devset <files>
+		Dev set source and reference data.
+
+	--weights <file>
+		Initial weights file (use empty file to start from 0)
+
+General options:
+
+	--help
+		Print this message and exit.
+
+	--max-iterations <M>
+		Maximum number of iterations to run.  If not specified, defaults
+		to $default_max_iter.
+
+	--metric <method>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+	--pass-suffix <S>
+		If the decoder is doing multi-pass decoding, the pass suffix "2",
+		"3", etc., is used to control what iteration of weights is set.
+
+	--workdir <dir>
+		Directory for intermediate and output files.  If not specified, the
+		name is derived from the ini filename.  Assuming that the ini
+		filename begins with the decoder name and ends with ini, the default
+		name of the working directory is inferred from the middle part of
+		the filename.  E.g. an ini file named decoder.foo.ini would have
+		a default working directory name foo.
+
+Regularization options:
+
+	--reg <F>
+		l2 regularization strength [default=500]. The greater this value,
+		the closer to zero the weights will be.
+
+	--reg-previous <F>
+		l2 penalty for moving away from the weights from the previous
+		iteration. [default=5000]. The greater this value, the closer
+		to the previous iteration's weights the next iteration's weights
+		will be.
+
+Job control options:
+
+	--jobs <I>
+		Number of decoder processes to run in parallel. [default=$default_jobs]
+
+	--qsub
+		Use qsub to run jobs in parallel (qsub must be configured in
+		environment/LocalEnvironment.pm)
+
+	--pmem <N>
+		Amount of physical memory requested for parallel decoding jobs
+		(used with qsub requests only)
+
+Deprecated options:
+
+	--interpolate-with-weights <F>
+		[deprecated] At each iteration the resulting weights are
+		interpolated with the weights from the previous iteration, with
+		this factor. [default=1.0, i.e., no effect]
+
+Help
+}
+
+# Parse "k1=v1;k2=v2" into a hash, returned as a flat key/value list.
+sub convert {
+  my ($str) = @_;
+  my %pairs;
+  foreach my $field (split /;/, $str) {
+    my ($key, $val) = split /=/, $field;
+    $pairs{$key} = $val;
+  }
+  return %pairs;
+}
+
+
+sub cmdline {
+    return join ' ',($0,@ORIG_ARGV);
+}
+
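+# FIXME: these two assignments sit below the script's "exit 0", so they never run and both patterns stay undef; "$!" is also interpolated inside qr{}.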
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {  # returns $arg, double-quoted and backslash-escaped if it contains shell metacharacters
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /[ \t\n\\><|&;"'`~*?{}\$!()]/) {  # literal class: the file-level qr{} is assigned after "exit 0" and stays undef
+        $arg =~ s/([\\"\$`!])/\\$1/g;  # characters that stay special inside double quotes
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+sub escaped_shell_args {
+    return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+    return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {  # split "src ||| ref1 ||| ref2 ..." lines of $infile into $outsrc and $outref
+  my ($infile, $outsrc, $outref) = @_;
+  open F, "<$infile" or die "Can't read $infile: $!";
+  open S, ">$outsrc" or die "Can't write $outsrc: $!";
+  open R, ">$outref" or die "Can't write $outref: $!";
+  while(<F>) {
+    chomp;
+    my ($source, @references) = split /\s*\|\|\|\s*/;
+    die "Malformed devset line: $_\n" if @references == 0;  # every line needs at least one reference
+    print S "$source\n";
+    print R join(' ||| ', @references), "\n";
+  }
+  close R;
+  close S;
+  close F;
+}
+
diff --git a/training/rampion/Makefile.am b/training/rampion/Makefile.am
new file mode 100644
index 00000000..1633d0f7
--- /dev/null
+++ b/training/rampion/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = rampion_cccp
+
+rampion_cccp_SOURCES = rampion_cccp.cc
+rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils
diff --git a/training/rampion/rampion.pl b/training/rampion/rampion.pl
new file mode 100755
index 00000000..ae084db6
--- /dev/null
+++ b/training/rampion/rampion.pl
@@ -0,0 +1,540 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/rampion_generate_input.pl";
+my $MAPPER = "$bin_dir/rampion_cccp";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1;  # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 500;
+my $reg_previous = 5000;
+my $dont_accum = 0;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+	"jobs=i" => \$jobs,
+	"dont-clean" => \$disable_clean,
+	"dont-accumulate" => \$dont_accum,
+	"pass-suffix=s" => \$pass_suffix,
+        "qsub" => \$useqsub,
+	"dry-run" => \$dryrun,
+	"epsilon=s" => \$epsilon,
+	"help" => \$help,
+        "weights=s" => \$initial_weights,
+	"reg=f" => \$reg,
+	"use-make=i" => \$use_make,
+	"max-iterations=i" => \$max_iterations,
+	"pmem=s" => \$pmem,
+        "cpbin!" => \$cpbin,
+	"ref-files=s" => \$refFiles,
+	"metric=s" => \$metric,
+	"source-file=s" => \$srcFile,
+	"workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+	print_help();
+	exit;
+}
+
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {
+  $use_make = 0;
+  die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+  $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+  $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+	$dir = "rampion";
+}
+unless ($dir =~ /^\//){  # convert relative path to absolute path
+	my $basedir = check_output("pwd");
+	chomp $basedir;
+	$dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+	print STDERR "Cleanup...\n";
+	for my $pid (@childpids){ unchecked_call("kill $pid"); }
+	for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+	exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit = 
+    sub{ cleanup(); }; 
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {
+    local $_;  # keep the caller's $_ intact
+    my ($bindir, @binrefs) = @_;
+    check_call("mkdir -p $bindir");
+    die "couldn't make bindir $bindir" unless -d $bindir;
+    foreach my $ref (@binrefs) {  # each $ref points at a variable holding a path
+        my $orig = $$ref;
+        $$ref = "$bindir/" . basename($orig);  # repoint the caller's variable at the copy
+        check_call("cp -p $orig $$ref");
+    }
+}
+sub dirsize {  # number of entries in directory $_[0] ("." and ".." included) minus one; -1 if it cannot be opened
+    opendir(ISEMPTY, $_[0]) or return -1;
+    my @entries = readdir(ISEMPTY); closedir(ISEMPTY); return scalar(@entries)-1;  # list context reads every entry; scalar context returned one name
+}
+my @allweights;
+if ($dryrun){
+	write_config(*STDERR);
+	exit 0;
+} else {
+	if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
+	  die "ERROR: working dir $dir already exists\n\n";
+	} else {
+		-e $dir || mkdir $dir;
+		mkdir "$dir/hgs";
+        modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+    mkdir "$dir/scripts";
+        my $cmdfile="$dir/rerun-pro.sh";
+        open CMD,'>',$cmdfile;
+        print CMD "cd ",&getcwd,"\n";
+#        print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+        my $cline=&cmdline."\n";
+        print CMD $cline;
+        close CMD;
+        print STDERR $cline;
+        chmod(0755,$cmdfile);
+	check_call("cp $initial_weights $dir/weights.0");
+	die "Can't find weights.0" unless (-e "$dir/weights.0");
+	}
+	write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+my $kbest = "$dir/kbest";
+if ($dont_accum) {
+  $kbest = '';
+} else {
+  check_call("mkdir -p $kbest");
+  $kbest = "--kbest_repository $kbest";
+}
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+	print STDERR "\n\nITERATION $iteration\n==========\n";
+
+	if ($iteration > $max_iterations){
+		print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+		last;
+	}
+	# iteration-specific files
+	my $runFile="$dir/run.raw.$iteration";
+	my $onebestFile="$dir/1best.$iteration";
+	my $logdir="$dir/logs.$iteration";
+	my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+	my $scorerLog="$logdir/scorer.log.$iteration";
+	check_call("mkdir -p $logdir");
+
+
+	#decode
+	print STDERR "RUNNING DECODER AT ";
+	print STDERR unchecked_output("date");
+	my $im1 = $iteration - 1;
+	my $weightsFile="$dir/weights.$im1";
+        push @allweights, "-w $dir/weights.$im1";
+        `rm -f $dir/hgs/*.gz`;
+	my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+	my $pcmd;
+	if ($use_make) {
+		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+	} else {
+		$pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+	}
+	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+        my $num_hgs;
+        my $num_topbest;
+        my $retries = 0;
+	while($retries < 5) {  # re-count up to five times before giving up
+	    $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+	    $num_topbest = check_output("wc -l < $runFile");
+	    print STDERR "NUMBER OF HGs: $num_hgs\n";
+	    print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+	    if($devSize == $num_hgs && $devSize == $num_topbest) {
+		last;
+	    } else {
+		print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+		sleep(3);
+	    }
+	    $retries++;
+	}
+	die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+	my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+	chomp $dec_score;
+	print STDERR "DECODER SCORE: $dec_score\n";
+
+	# save space
+	check_call("gzip -f $runFile");
+	check_call("gzip -f $decoderLog");
+
+	# run optimizer
+	print STDERR "RUNNING OPTIMIZER AT ";
+	print STDERR unchecked_output("date");
+	print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+	my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+	my $score = 0;
+	my $icc = 0;
+	my $inweights="$dir/weights.$im1";
+	my $outweights="$dir/weights.$iteration";
+	$cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";  # agenda: one "<hypergraph file> <id>" line per sentence
+	print STDERR "COMMAND:\n$cmd\n";
+	check_call($cmd);
+	$cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights";
+	check_call($cmd);
+	$lastWeightsFile = $outweights;
+	$iteration++;
+	`rm -f $dir/hgs/*.gz`;  # hypergraphs live under $dir/hgs, not ./hgs relative to the cwd
+	print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {  # returns the number of lines in the file named by the first argument
+  my ($fn) = @_;
+  open my $fh, '<', $fn or die "Couldn't read $fn: $!";
+  my $lc = 0;
+  $lc++ while <$fh>;
+  return $lc;  # the lexical handle is closed when it goes out of scope (the bareword FL used to stay open)
+}
+
+sub get_comma_sep_refs {  # despite the name, yields "-$r f1 -$r f2 ..." (space separated)
+  my ($flag, $pattern) = @_;
+  my $expanded = check_output("echo $pattern");  # the shell expands the pattern
+  chomp $expanded;
+  my @opts = map { "-$flag $_" } split /\s+/, $expanded;
+  return @opts ? join(' ', @opts) : "-$flag ";
+}
+
+sub read_weights_file {  # returns the weights found in $file, joined by single spaces
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @values;
+  my $prev = -1;  # never advanced, so the order check only compares against -1
+  while(<F>) {
+    next if /^#/ or /^\s*$/;  # skip comment and blank lines
+    chomp;
+    # A usable line has the form "<name> <weight>".
+    my ($name, $value) = /^(.+)\s+(.+)$/;
+    # $value is defined exactly when the pattern matched.
+    if (defined $value) {
+      die "Weights out of order: $name <= $prev" unless $name > $prev;
+      push @values, $value;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @values;
+}
+
+# subs
+sub write_config {
+	my ($fh) = @_;
+	my @settings = (  # label => value pairs, printed to $fh in this order
+		'DECODER:'         => $decoder,
+		'INI FILE:'        => $iniFile,
+		'WORKING DIR:'     => $dir,
+		'SOURCE (DEV):'    => $srcFile,
+		'REFS (DEV):'      => $refFiles,
+		'EVAL METRIC:'     => $metric,
+		'MAX ITERATIONS:'  => $max_iterations,
+		'JOBS:'            => $jobs,
+		'HEAD NODE:'       => $host,
+		'PMEM (DECODING):' => $pmem,
+		'CLEANUP:'         => ($disable_clean ? "no" : "yes"),
+	);
+	print $fh "\n";
+	printf $fh "%-18s%s\n", splice(@settings, 0, 2) while @settings;  # labels are padded to 18 columns
+}
+
+sub update_weights_file {
+  my ($neww, $rfn, $rpts) = @_;
+  # $rfn and $rpts are array references (feature names and their values);
+  # both must have the same length.
+  my $num_feats = scalar @$rfn;
+  my $num_pts = scalar @$rpts;
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open G, ">$neww" or die;
+  # One "<feature> <value>" line per index.
+  foreach my $idx (0 .. $num_feats - 1) {
+    my ($feature, $weight) = ($rfn->[$idx], $rpts->[$idx]);
+    print G "$feature $weight\n";
+  }
+  close G;
+}
+
+# Copy $src to $newsrc, one <seg> element per line.  Lines that already start
+# with a <seg tag are kept verbatim (they must carry a numeric id attribute);
+# all other lines are wrapped in <seg id="N">, N being the zero-based line number.
+sub enseg {
+	my ($src, $newsrc) = @_;
+	open(SRC, '<', $src) or die "Can't read $src: $!";
+	open(NEWSRC, '>', $newsrc) or die "Can't write $newsrc: $!";
+	my $i=0;
+	while (my $line=<SRC>){
+		chomp $line;
+		if ($line !~ /^\s*<seg/i) {
+			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+		} elsif ($line =~ /id="[0-9]+"/) {
+			print NEWSRC "$line\n";
+		} else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		}
+		$i++;
+	}
+	close SRC;
+	close(NEWSRC) or die "Can't finish writing $newsrc: $!";
+	die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+	# Prints the usage text below; the defaults shown are interpolated from the script's variables.
+	my $executable = basename($0); chomp $executable;
+	print << "Help";
+
+Usage: $executable [options] <ini file>
+
+	$executable [options] <ini file>
+		Runs a complete Rampion optimization using the ini file specified.
+
+Required:
+
+	--ref-files <files>
+		Dev set ref files.  This option takes only a single string argument.
+		To use multiple files (including file globbing), this argument should
+		be quoted.
+
+	--source-file <file>
+		Dev set source file.
+
+	--weights <file>
+		Initial weights file (use empty file to start from 0)
+
+General options:
+
+	--help
+		Print this message and exit.
+
+	--dont-accumulate
+		Don't accumulate k-best lists from multiple iterations.
+
+	--max-iterations <M>
+		Maximum number of iterations to run.  If not specified, defaults
+		to $default_max_iter.
+
+	--metric <method>
+		Metric to optimize.
+		Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+	--pass-suffix <S>
+		If the decoder is doing multi-pass decoding, the pass suffix "2",
+		"3", etc., is used to control what iteration of weights is set.
+
+	--workdir <dir>
+		Directory for intermediate and output files.  If not specified,
+		a directory named "rampion" is used.  A relative path is
+		resolved against the current working directory.  Per-iteration
+		weights (weights.N), logs (logs.N), and decoder hypergraphs
+		(hgs) are written underneath it, along with copies of the
+		binaries and scripts used (bin) unless --nocpbin is given.
+
+Regularization options:
+
+	--reg <F>
+		l2 regularization strength [default=500]. Parsed, but not
+		currently passed on to rampion_cccp.
+
+Job control options:
+
+	--jobs <I>
+		Number of decoder processes to run in parallel. [default=$default_jobs]
+
+	--qsub
+		Use qsub to run jobs in parallel (qsub must be configured in
+		environment/LocalEnvironment.pm)
+
+	--pmem <N>
+		Amount of physical memory requested for parallel decoding jobs
+		(used with qsub requests only)
+
+Help
+}
+
+# Parse "k1=v1;k2=v2" into a hash, returned as a flat key/value list.
+sub convert {
+  my ($str) = @_;
+  my %pairs;
+  foreach my $field (split /;/, $str) {
+    my ($key, $val) = split /=/, $field;
+    $pairs{$key} = $val;
+  }
+  return %pairs;
+}
+
+
+sub cmdline {
+    return join ' ',($0,@ORIG_ARGV);
+}
+
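+# FIXME: these two assignments sit below the script's "exit 0", so they never run and both patterns stay undef; "$!" is also interpolated inside qr{}.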
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {  # returns $arg, double-quoted and backslash-escaped if it contains shell metacharacters
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /[ \t\n\\><|&;"'`~*?{}\$!()]/) {  # literal class: the file-level qr{} is assigned after "exit 0" and stays undef
+        $arg =~ s/([\\"\$`!])/\\$1/g;  # characters that stay special inside double quotes
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+sub escaped_shell_args {
+    return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+    return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/training/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc
new file mode 100644
index 00000000..1e36dc51
--- /dev/null
+++ b/training/rampion/rampion_cccp.cc
@@ -0,0 +1,168 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "candidate_set.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // parses argv into *conf; exits on --help or a missing -r/-w
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+        ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+        ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+        ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)")
+        ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
+        ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')")
+        ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')")
+        ("eta", po::value<double>()->default_value(1e-4), "Step size")
+        ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength")
+        ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  bool flag = false;  // set when a required option is missing
+  if (!conf->count("reference")) {
+    cerr << "Please specify one or more references using -r <REF.TXT>\n";
+    flag = true;
+  }
+  if (!conf->count("weights")) {
+    cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+    flag = true;
+  }
+  if (flag || conf->count("help")) {  // print the option summary and stop
+    cerr << dcmdline_options << endl;
+    exit(1);  // note: --help also exits with status 1
+  }
+}
+
+struct GainFunction {
+  explicit GainFunction(const EvaluationMetric* m) : metric(m) {}
+  // Returns the metric score as-is for error metrics, and 1 - score for all other metrics.
+  float operator()(const SufficientStats& eval_feats) const {
+    const float score = metric->ComputeScore(eval_feats);
+    return metric->IsErrorMetric() ? score : 1 - score;
+  }
+  const EvaluationMetric* metric;
+};
+
+// Stores in *fmap the features of the candidate in cs that maximizes fmap.dot(w) + alpha * gain(eval_feats).
+template <typename GainFunc>
+void CostAugmentedSearch(const GainFunc& gain,
+                         const training::CandidateSet& cs,
+                         const SparseVector<double>& w,
+                         double alpha,
+                         SparseVector<double>* fmap) {
+  // An empty candidate set has no argmax; report an empty feature vector instead of reading cs[0].
+  if (cs.size() == 0) { *fmap = SparseVector<double>(); return; }
+  unsigned best_i = 0;
+  double best = -numeric_limits<double>::infinity();
+  for (unsigned i = 0; i < cs.size(); ++i) {
+    const double s = cs[i].fmap.dot(w) + alpha * gain(cs[i].eval_feats);
+    if (s > best) { best = s; best_i = i; }  // first maximum wins on ties
+  }
+  *fmap = cs[best_i].fmap;
+}
+
+
+
+// runs lines 4--15 of rampion algorithm
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+  double goodsign = -1;
+  double badsign = -goodsign;
+
+  Hypergraph hg;
+  ReadFile in_read(conf["input"].as<string>());
+  string kbest_repo;
+  if (conf.count("kbest_repository")) {
+    kbest_repo = conf["kbest_repository"].as<string>();
+    MkDirP(kbest_repo);
+  }
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  const unsigned tp = conf["cccp_iterations"].as<unsigned>();
+  const unsigned tpp = conf["ssd_iterations"].as<unsigned>();
+  const double eta = conf["eta"].as<double>();
+  const double reg = conf["regularization_strength"].as<double>();
+  const double alpha = conf["alpha"].as<double>();
+  SparseVector<weight_t> weights;
+  {
+    vector<weight_t> vweights;
+    const string weightsf = conf["weights"].as<string>();
+    Weights::InitFromFile(weightsf, &vweights);
+    Weights::InitSparseVector(vweights, &weights);
+  }
+  string line, file;
+  vector<training::CandidateSet> kis;
+  cerr << "Loading hypergraphs...\n";
+  while(getline(in, line)) {
+    istringstream is(line);
+    int sent_id = -1;
+    // Each input line is "<hypergraph file> <sentence id>"; parse it first, since sent_id names the k-best file below.
+    if (!(is >> file >> sent_id)) { cerr << "Malformed input line: " << line << endl; return 1; }
+    kis.resize(kis.size() + 1);
+    training::CandidateSet& curkbest = kis.back();
+    string kbest_file;
+    if (kbest_repo.size()) {
+      ostringstream os;
+      os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+      kbest_file = os.str();
+      if (FileExists(kbest_file))
+        curkbest.ReadFromFile(kbest_file);
+    }
+    ReadFile rf(file);
+    if (kis.size() % 5 == 0) { cerr << '.'; }
+    if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
+    HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+    hg.Reweight(weights);
+    curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+    if (kbest_file.size())
+      curkbest.WriteToFile(kbest_file);
+  }
+  cerr << "\nHypergraphs loaded.\n";
+
+  vector<SparseVector<weight_t> > goals(kis.size());  // f(x_i,y+,h+)
+  SparseVector<weight_t> fear;  // f(x,y-,h-)
+  const GainFunction gain(metric);
+  for (unsigned iterp = 1; iterp <= tp; ++iterp) {
+    cerr << "CCCP Iteration " << iterp << endl;
+    for (unsigned i = 0; i < goals.size(); ++i)
+      CostAugmentedSearch(gain, kis[i], weights, goodsign * alpha, &goals[i]);
+    for (unsigned iterpp = 1; iterpp <= tpp; ++iterpp) {
+      cerr << "  SSD Iteration " << iterpp << endl;
+      for (unsigned i = 0; i < goals.size(); ++i) {
+        CostAugmentedSearch(gain, kis[i], weights, badsign * alpha, &fear);
+        weights -= weights * (eta * reg / goals.size());
+        weights += (goals[i] - fear) * eta;
+      }
+    }
+  }
+  vector<weight_t> w;
+  weights.init_vector(&w);
+  Weights::WriteToFile("-", w);
+  return 0;
+}
+
diff --git a/training/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/training/rampion/rampion_generate_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" if @ARGV != 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+# Collect the .gz files found directly inside HG_DIR.
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+# Emit one "<path> <id>" line per file; the id is the file name with its first
+# ".gz" (plus a directly preceding ".json", if any) stripped.
+foreach my $file (@hgs) {
+  (my $id = $file) =~ s/(\.json)?\.gz//;
+  print "$d/$file $id\n";
+}
+
diff --git a/training/risk.cc b/training/risk.cc
deleted file mode 100644
index d5a12cfd..00000000
--- a/training/risk.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-#include "risk.h"
-
-#include "prob.h"
-#include "candidate_set.h"
-#include "ns.h"
-
-using namespace std;
-
-namespace training {
-
-// g = \sum_e p(e|f) * loss(e) * (phi(e,f) - E[phi(e,f)])
-double CandidateSetRisk::operator()(const vector<double>& params,
-                                    SparseVector<double>* g) const {
-  prob_t z;
-  for (unsigned i = 0; i < cands_.size(); ++i) {
-    const prob_t u(cands_[i].fmap.dot(params), init_lnx());
-    z += u;
-  }
-  const double log_z = log(z);
-
-  SparseVector<double> exp_feats;
-  if (g) {
-    for (unsigned i = 0; i < cands_.size(); ++i) {
-      const double log_prob = cands_[i].fmap.dot(params) - log_z;
-      const double prob = exp(log_prob);
-      exp_feats += cands_[i].fmap * prob;
-    }
-  }
-
-  double risk = 0;
-  for (unsigned i = 0; i < cands_.size(); ++i) {
-    const double log_prob = cands_[i].fmap.dot(params) - log_z;
-    const double prob = exp(log_prob);
-    const double cost = metric_.IsErrorMetric() ? metric_.ComputeScore(cands_[i].eval_feats)
-                                                : 1.0 - metric_.ComputeScore(cands_[i].eval_feats);
-    const double r = prob * cost;
-    risk += r;
-    if (g) (*g) += (cands_[i].fmap - exp_feats) * r;
-  }
-  return risk;
-}
-
-}
-
-
diff --git a/training/risk.h b/training/risk.h
deleted file mode 100644
index 2e8db0fb..00000000
--- a/training/risk.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _RISK_H_
-#define _RISK_H_
-
-#include <vector>
-#include "sparse_vector.h"
-class EvaluationMetric;
-
-namespace training {
-  class CandidateSet;
-
-  class CandidateSetRisk {
-   public:
-    explicit CandidateSetRisk(const CandidateSet& cs, const EvaluationMetric& metric) :
-       cands_(cs),
-       metric_(metric) {}
-    // compute the risk (expected loss) of a CandidateSet
-    // (optional) the gradient of the risk with respect to params
-    double operator()(const std::vector<double>& params,
-                      SparseVector<double>* g = NULL) const;
-   private:
-    const CandidateSet& cands_;
-    const EvaluationMetric& metric_;
-  };
-};
-
-#endif
diff --git a/training/ttables.cc b/training/ttables.cc
deleted file mode 100644
index 45bf14c5..00000000
--- a/training/ttables.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "ttables.h"
-
-#include <cassert>
-
-#include "dict.h"
-
-using namespace std;
-using namespace std::tr1;
-
-void TTable::DeserializeProbsFromText(std::istream* in) {
-  int c = 0;
-  while(*in) {
-    string e;
-    string f;
-    double p;
-    (*in) >> e >> f >> p;
-    if (e.empty()) break;
-    ++c;
-    ttable[TD::Convert(e)][TD::Convert(f)] = p;
-  }
-  cerr << "Loaded " << c << " translation parameters.\n";
-}
-
-void TTable::SerializeHelper(string* out, const Word2Word2Double& o) {
-  assert(!"not implemented");
-}
-
-void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) {
-  assert(!"not implemented");
-}
-
diff --git a/training/ttables.h b/training/ttables.h
deleted file mode 100644
index 9baa13ca..00000000
--- a/training/ttables.h
+++ /dev/null
@@ -1,101 +0,0 @@
-#ifndef _TTABLES_H_
-#define _TTABLES_H_
-
-#include <iostream>
-#include <tr1/unordered_map>
-
-#include "sparse_vector.h"
-#include "m.h"
-#include "wordid.h"
-#include "tdict.h"
-
-class TTable {
- public:
-  TTable() {}
-  typedef std::tr1::unordered_map<WordID, double> Word2Double;
-  typedef std::tr1::unordered_map<WordID, Word2Double> Word2Word2Double;
-  inline double prob(const int& e, const int& f) const {
-    const Word2Word2Double::const_iterator cit = ttable.find(e);
-    if (cit != ttable.end()) {
-      const Word2Double& cpd = cit->second;
-      const Word2Double::const_iterator it = cpd.find(f);
-      if (it == cpd.end()) return 1e-9;
-      return it->second;
-    } else {
-      return 1e-9;
-    }
-  }
-  inline void Increment(const int& e, const int& f) {
-    counts[e][f] += 1.0;
-  }
-  inline void Increment(const int& e, const int& f, double x) {
-    counts[e][f] += x;
-  }
-  void NormalizeVB(const double alpha) {
-    ttable.swap(counts);
-    for (Word2Word2Double::iterator cit = ttable.begin();
-         cit != ttable.end(); ++cit) {
-      double tot = 0;
-      Word2Double& cpd = cit->second;
-      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
-        tot += it->second + alpha;
-      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
-        it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot));
-    }
-    counts.clear();
-  }
-  void Normalize() {
-    ttable.swap(counts);
-    for (Word2Word2Double::iterator cit = ttable.begin();
-         cit != ttable.end(); ++cit) {
-      double tot = 0;
-      Word2Double& cpd = cit->second;
-      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
-        tot += it->second;
-      for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
-        it->second /= tot;
-    }
-    counts.clear();
-  }
-  // adds counts from another TTable - probabilities remain unchanged
-  TTable& operator+=(const TTable& rhs) {
-    for (Word2Word2Double::const_iterator it = rhs.counts.begin();
-         it != rhs.counts.end(); ++it) {
-      const Word2Double& cpd = it->second;
-      Word2Double& tgt = counts[it->first];
-      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
-        tgt[j->first] += j->second;
-      }
-    }
-    return *this;
-  }
-  void ShowTTable() const {
-    for (Word2Word2Double::const_iterator it = ttable.begin(); it != ttable.end(); ++it) {
-      const Word2Double& cpd = it->second;
-      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
-        std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
-      }
-    }
-  }
-  void ShowCounts() const {
-    for (Word2Word2Double::const_iterator it = counts.begin(); it != counts.end(); ++it) {
-      const Word2Double& cpd = it->second;
-      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
-        std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
-      }
-    }
-  }
-  void DeserializeProbsFromText(std::istream* in);
-  void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); }
-  void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); }
-  void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); }
-  void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); }
- private:
-  static void SerializeHelper(std::string*, const Word2Word2Double& o);
-  static void DeserializeHelper(const std::string&, Word2Word2Double* o);
- public:
-  Word2Word2Double ttable;
-  Word2Word2Double counts;
-};
-
-#endif
diff --git a/training/utils/candidate_set.cc b/training/utils/candidate_set.cc
new file mode 100644
index 00000000..087efec3
--- /dev/null
+++ b/training/utils/candidate_set.cc
@@ -0,0 +1,169 @@
+#include "candidate_set.h"
+
+#include <tr1/unordered_set>
+
+#include <boost/functional/hash.hpp>
+
+#include "verbose.h"
+#include "ns.h"
+#include "filelib.h"
+#include "wordid.h"
+#include "tdict.h"
+#include "hg.h"
+#include "kbest.h"
+#include "viterbi.h"
+
+using namespace std;
+
+namespace training {
+
+struct ApproxVectorHasher {  // approximate hash/rounding: only the upper 32 bits of each double's bit pattern are kept
+  static const size_t MASK = 0xFFFFFFFFull;  // low 32 bits of the bit pattern (assumes a 64-bit size_t)
+  union UType {
+    double f;   // leave as double
+    size_t i;
+  };
+  static inline double round(const double x) {  // round x to the nearest value whose low 32 pattern bits are zero
+    UType t;
+    t.f = x;
+    size_t r = t.i & MASK;
+    if ((r << 1) > MASK)
+      t.i += MASK - r + 1;  // round up: zero the low bits and carry into the high word
+    else
+      t.i &= ~MASK;         // round down: clear ALL low bits (1ull - MASK left bit 1 set)
+    return t.f;
+  }
+  size_t operator()(const SparseVector<double>& x) const {
+    size_t h = 0x573915839;
+    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+      UType t;
+      t.f = round(it->second);  // hash the rounded value so vectors equal under ApproxVectorEquals hash alike
+      if (t.f) {
+        size_t z = (t.i >> 32);
+        boost::hash_combine(h, it->first);
+        boost::hash_combine(h, z);
+      }
+    }
+    return h;
+  }
+};
+
+struct ApproxVectorEquals {
+  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
+    SparseVector<double>::const_iterator bit = b.begin();
+    for (SparseVector<double>::const_iterator ait = a.begin(); ait != a.end(); ++ait) {
+      if (bit == b.end() ||
+          ait->first != bit->first ||
+          ApproxVectorHasher::round(ait->second) != ApproxVectorHasher::round(bit->second))
+        return false;
+      ++bit;
+    }
+    if (bit != b.end()) return false;
+    return true;
+  }
+};
+
+struct CandidateCompare {
+  bool operator()(const Candidate& a, const Candidate& b) const {
+    ApproxVectorEquals eq;
+    return (a.ewords == b.ewords && eq(a.fmap,b.fmap));
+  }
+};
+
+struct CandidateHasher {
+  size_t operator()(const Candidate& x) const {
+    boost::hash<vector<WordID> > hhasher;
+    ApproxVectorHasher vhasher;
+    size_t ha = hhasher(x.ewords);
+    boost::hash_combine(ha, vhasher(x.fmap));
+    return ha;
+  }
+};
+
+static void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {  // parses "name=value name=value ..." starting at offset cur
+  SparseVector<double>& x = *out;
+  size_t last_start = cur;            // start of the current token
+  size_t last_comma = string::npos;   // position of the last '=' seen in the current token
+  while(cur <= line.size()) {         // <= so the final token is flushed at end of line
+    if (cur == line.size() || line[cur] == ' ') {  // test the bound first: never index at size() before checking
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;  // NUL-terminate the value in place for strtod (mutates line)
+      const double val = strtod(&line[last_comma + 1], NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur+1;
+    } else {
+      if (line[cur] == '=')
+        last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
+void CandidateSet::WriteToFile(const string& file) const {
+  WriteFile wf(file);
+  ostream& out = *wf.stream();
+  out.precision(10);
+  string ss;
+  for (unsigned i = 0; i < cs.size(); ++i) {
+    out << TD::GetString(cs[i].ewords) << endl;
+    out << cs[i].fmap << endl;
+    cs[i].eval_feats.Encode(&ss);
+    out << ss << endl;
+  }
+}
+
+void CandidateSet::ReadFromFile(const string& file) {  // appends candidates read as 3-line records: words, features, eval stats
+  if(!SILENT) cerr << "Reading candidates from " << file << endl;
+  ReadFile rf(file);
+  istream& in = *rf.stream();
+  string cand;
+  string feats;
+  string ss;
+  while(getline(in, cand)) {
+    getline(in, feats);
+    getline(in, ss);
+    if (!in) { cerr << "[ERROR] truncated candidate record in " << file << endl; exit(1); }  // enforced even with NDEBUG
+    cs.push_back(Candidate());
+    TD::ConvertSentence(cand, &cs.back().ewords);
+    ParseSparseVector(feats, 0, &cs.back().fmap);
+    cs.back().eval_feats = SufficientStats(ss);
+  }
+  if(!SILENT) cerr << "  read " << cs.size() << " candidates\n";
+}
+
+void CandidateSet::Dedup() {
+  if(!SILENT) cerr << "Dedup in=" << cs.size();
+  tr1::unordered_set<Candidate, CandidateHasher, CandidateCompare> u;
+  while(cs.size() > 0) {
+    u.insert(cs.back());
+    cs.pop_back();
+  }
+  tr1::unordered_set<Candidate, CandidateHasher, CandidateCompare>::iterator it = u.begin();
+  while (it != u.end()) {
+    cs.push_back(*it);
+    it = u.erase(it);
+  }
+  if(!SILENT) cerr << "  out=" << cs.size() << endl;
+}
+
+void CandidateSet::AddKBestCandidates(const Hypergraph& hg, size_t kbest_size, const SegmentEvaluator* scorer) {
+  KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+  for (unsigned i = 0; i < kbest_size; ++i) {
+    const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+      kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+    if (!d) break;
+    cs.push_back(Candidate(d->yield, d->feature_values));
+    if (scorer)
+      scorer->Evaluate(d->yield, &cs.back().eval_feats);
+  }
+  Dedup();
+}
+
+}
diff --git a/training/utils/candidate_set.h b/training/utils/candidate_set.h
new file mode 100644
index 00000000..9d326ed0
--- /dev/null
+++ b/training/utils/candidate_set.h
@@ -0,0 +1,60 @@
+#ifndef _CANDIDATE_SET_H_
+#define _CANDIDATE_SET_H_
+
+#include <vector>
+#include <algorithm>
+
+#include "ns.h"
+#include "wordid.h"
+#include "sparse_vector.h"
+
+class Hypergraph;
+
+namespace training {
+
+struct Candidate {
+  Candidate() {}
+  Candidate(const std::vector<WordID>& e, const SparseVector<double>& fm) :
+      ewords(e),
+      fmap(fm) {}
+  Candidate(const std::vector<WordID>& e,
+            const SparseVector<double>& fm,
+            const SegmentEvaluator& se) :
+      ewords(e),
+      fmap(fm) {
+    se.Evaluate(ewords, &eval_feats);
+  }
+
+  void swap(Candidate& other) {
+    eval_feats.swap(other.eval_feats);
+    ewords.swap(other.ewords);
+    fmap.swap(other.fmap);
+  }
+
+  std::vector<WordID> ewords;
+  SparseVector<double> fmap;
+  SufficientStats eval_feats;
+};
+
+// represents some kind of collection of translation candidates, e.g.
+// aggregated k-best lists, sample lists, etc.
+class CandidateSet {
+ public:
+  CandidateSet() {}
+  inline size_t size() const { return cs.size(); }
+  const Candidate& operator[](size_t i) const { return cs[i]; }
+
+  void ReadFromFile(const std::string& file);
+  void WriteToFile(const std::string& file) const;
+  void AddKBestCandidates(const Hypergraph& hg, size_t kbest_size, const SegmentEvaluator* scorer = NULL);
+  // TODO add code to do unique k-best
+  // TODO add code to draw k samples
+
+ private:
+  void Dedup();
+  std::vector<Candidate> cs;
+};
+
+}
+
+#endif
diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl
new file mode 100755
index 00000000..1a332c08
--- /dev/null
+++ b/training/utils/decode-and-evaluate.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use File::Basename qw(basename);
+my $QSUB_CMD = qsub_args(mert_memory());
+
+require "libcall.pl";
+
+# Default settings
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $parallelize = "$bin_dir/parallelize.pl";
+my $libcall = "$bin_dir/libcall.pl";
+my $sentserver = "$bin_dir/sentserver";
+my $sentclient = "$bin_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "9g";
+my $help = 0;
+my $config;
+my $test_set;
+my $weights;
+my $use_make = 1;
+my $useqsub;
+my $cpbin=1;
+# Process command-line options
+if (GetOptions(
+	"jobs=i" => \$jobs,
+	"help" => \$help,
+	"qsub" => \$useqsub, "pmem=s" => \$pmem,   # --pmem is documented in the help text and passed to parallelize.pl
+	"input=s" => \$test_set,
+        "config=s" => \$config,
+	"weights=s" => \$weights,
+) == 0 || @ARGV!=0 || $help) {   # positional arguments are not accepted
+	print_help();
+	exit;
+}
+
+if ($useqsub) {
+  $use_make = 0;   # qsub mode: do not pass --use-fork to parallelize.pl
+  die "LocalConfig.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+
+if (!defined $test_set) { push @missing_args, "--input"; }
+if (!defined $config) { push @missing_args, "--config"; }
+if (!defined $weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
+
+my @tf = localtime(time);
+my $tname = basename($test_set);
+$tname =~ s/\.(sgm|sgml|xml)$//i;
+my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4]+1, $tf[3], $tf[2], $tf[1], $tf[0]);   # localtime months are 0-based
+
+my $time = unchecked_output("date");
+
+check_call("mkdir -p $dir");
+
+split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs");
+my $refs = "-r $dir/test.refs";
+my $newsrc = "$dir/test.input";
+enseg("$dir/test.input.raw", $newsrc);
+my $src_file = $newsrc;
+open F, "<$src_file" or die "Can't read $src_file: $!"; close F;
+
+my $test_trans="$dir/test.trans";
+my $logdir="$dir/logs";
+my $decoderLog="$logdir/decoder.sentserver.log";
+check_call("mkdir -p $logdir");
+
+#decode
+print STDERR "RUNNING DECODER AT ";
+print STDERR unchecked_output("date");
+my $decoder_cmd = "$decoder -c $config --weights $weights";
+my $pcmd;
+if ($use_make) {
+	$pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+} else {
+	$pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+}
+my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans";
+check_bash_call($cmd);
+print STDERR "DECODER COMPLETED AT ";
+print STDERR unchecked_output("date");
+print STDERR "\nOUTPUT: $test_trans\n\n";
+my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu");
+chomp $bleu;
+print STDERR "BLEU: $bleu\n";
+my $ter = check_output("cat $test_trans | $SCORER $refs -m ter");
+chomp $ter;
+print STDERR " TER: $ter\n";
+open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!";
+print TR <<EOT;
+### SCORE REPORT #############################################################
+        OUTPUT=$test_trans
+  SCRIPT INPUT=$test_set
+ DECODER INPUT=$src_file
+    REFERENCES=$dir/test.refs
+------------------------------------------------------------------------------
+          BLEU=$bleu
+           TER=$ter
+##############################################################################
+EOT
+close TR;
+my $sr = unchecked_output("cat $dir/test.scores");
+print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n";
+exit 0;
+
+sub enseg {   # copy $src to $newsrc, wrapping each untagged line in <seg id="N"> (N = 0-based line index)
+	my $src = shift;
+	my $newsrc = shift;
+	open(SRC, "<$src") or die "Can't read $src: $!";
+	open(NEWSRC, ">$newsrc") or die "Can't write $newsrc: $!";
+	my $i=0;
+	while (my $line=<SRC>){
+		chomp $line;
+		if ($line =~ /^\s*<seg/i) {
+		    if($line =~ /id="[0-9]+"/) {
+			print NEWSRC "$line\n";
+		    } else {
+			die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+		    }
+		} else {
+			print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+		}
+		$i++;
+	}
+	close SRC;
+	close NEWSRC;
+}
+
+sub print_help {   # print the usage message to STDOUT
+	my $executable = basename($0); chomp $executable;
+	print << "Help";
+
+Usage: $executable [options]
+
+	$executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] --input <testset.in-ref>
+
+Options:
+
+	--help
+		Print this message and exit.
+
+	--config <file>
+		A path to the cdec.ini file.
+
+	--weights <file>
+		A file specifying feature weights.
+
+	--input <file>
+		Test set to decode; each line is "source ||| reference [||| reference ...]".
+
+Job control options:
+
+	--jobs <I>
+		Number of decoder processes to run in parallel. [default=$default_jobs]
+
+	--qsub
+		Use qsub to run jobs in parallel (qsub must be configured in
+		environment/LocalConfig.pm)
+
+	--pmem <N>
+		Amount of physical memory requested for parallel decoding jobs
+		(used with qsub requests only)
+
+Help
+}
+
+sub convert {
+  my ($str) = @_;
+  my @ps = split /;/, $str;
+  my %dict = ();
+  for my $p (@ps) {
+    my ($k, $v) = split /=/, $p;
+    $dict{$k} = $v;
+  }
+  return %dict;
+}
+
+
+
+sub cmdline {
+    return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+sub escaped_shell_args {
+    return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+    return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+    return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {
+  my ($infile, $outsrc, $outref) = @_;
+  open F, "<$infile" or die "Can't read $infile: $!";
+  open S, ">$outsrc" or die "Can't write $outsrc: $!";
+  open R, ">$outref" or die "Can't write $outref: $!";
+  while(<F>) {
+    chomp;
+    my ($src, @refs) = split /\s*\|\|\|\s*/;
+    die "Malformed devset line: $_\n" unless scalar @refs > 0;
+    print S "$src\n";
+    print R join(' ||| ', @refs) . "\n";
+  }
+  close R;
+  close S;
+  close F;
+}
+
diff --git a/training/utils/entropy.cc b/training/utils/entropy.cc
new file mode 100644
index 00000000..4fdbe2be
--- /dev/null
+++ b/training/utils/entropy.cc
@@ -0,0 +1,41 @@
+#include "entropy.h"
+
+#include "prob.h"
+#include "candidate_set.h"
+
+using namespace std;
+
+namespace training {
+
+// Returns the entropy H(y|x) of the distribution over candidates induced by params; see
+// Mann and McCallum "Efficient Computation of Entropy Gradient ..." for a mostly clear derivation of:
+//   g = E[ F(x,y) * log p(y|x) ] + H(y | x) * E[ F(x,y) ]
+double CandidateSetEntropy::operator()(const vector<double>& params,
+                                       SparseVector<double>* g) const {
+  prob_t z;
+  vector<double> dps(cands_.size());  // params . fmap for each candidate
+  for (unsigned i = 0; i < cands_.size(); ++i) {
+    dps[i] = cands_[i].fmap.dot(params);
+    const prob_t u(dps[i], init_lnx());
+    z += u;
+  }
+  const double log_z = log(z);
+
+  SparseVector<double> exp_feats;
+  double entropy = 0;
+  for (unsigned i = 0; i < cands_.size(); ++i) {
+    const double log_prob = dps[i] - log_z;  // reuse the cached dot product instead of recomputing it
+    const double prob = exp(log_prob);
+    const double e_logprob = prob * log_prob;
+    entropy -= e_logprob;
+    if (g) {
+      (*g) += cands_[i].fmap * e_logprob;
+      exp_feats += cands_[i].fmap * prob;
+    }
+  }
+  if (g) (*g) += exp_feats * entropy;  // NOTE(review): the sum added to *g looks like d(-H)/dparams, not dH/dparams -- confirm the sign callers expect
+  return entropy;
+}
+
+}
+
diff --git a/training/utils/entropy.h b/training/utils/entropy.h
new file mode 100644
index 00000000..796589ca
--- /dev/null
+++ b/training/utils/entropy.h
@@ -0,0 +1,22 @@
+#ifndef _CSENTROPY_H_
+#define _CSENTROPY_H_
+
+#include <vector>
+#include "sparse_vector.h"
+
+namespace training {
+  class CandidateSet;
+
+  class CandidateSetEntropy {
+   public:
+    explicit CandidateSetEntropy(const CandidateSet& cs) : cands_(cs) {}
+    // compute the entropy (negative expected log probability) of a CandidateSet
+    // (optional) if g is non-NULL, a gradient term is accumulated into *g (see entropy.cc)
+    double operator()(const std::vector<double>& params,
+                      SparseVector<double>* g = NULL) const;
+   private:
+    const CandidateSet& cands_;
+  };
+};
+
+#endif
diff --git a/training/utils/grammar_convert.cc b/training/utils/grammar_convert.cc
new file mode 100644
index 00000000..607a7cb9
--- /dev/null
+++ b/training/utils/grammar_convert.cc
@@ -0,0 +1,348 @@
+/*
+  this program modifies cfg hypergraphs (forests) and extracts kbests?
+  what are: json, split ?
+ */
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/program_options.hpp>
+
+#include "inside_outside.h"
+#include "tdict.h"
+#include "filelib.h"
+#include "hg.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "weights.h"
+
+namespace po = boost::program_options;
+using namespace std;
+
+WordID kSTART;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("input,i", po::value<string>()->default_value("-"), "Input file")
+        ("format,f", po::value<string>()->default_value("cfg"), "Input format. Values: cfg, json, split")
+        ("output,o", po::value<string>()->default_value("json"), "Output command. Values: json, 1best")
+        ("reorder,r", "Add Yamada & Knight (2002) reorderings")
+        ("weights,w", po::value<string>(), "Feature weights for k-best derivations [optional]")
+        ("collapse_weights,C", "Collapse order features into a single feature whose value is all of the locally applying feature weights")
+        ("k_derivations,k", po::value<int>(), "Show k derivations and their features")
+        ("max_reorder,m", po::value<int>()->default_value(999), "Move a constituent at most this far")
+        ("help,h", "Print this help message and exit");
+  po::options_description clo("Command line options");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  po::notify(*conf);
+
+  if (conf->count("help") || conf->count("input") == 0) {
+    cerr << "\nUsage: grammar_convert [-options]\n\nConverts a grammar file (in Hiero format) into JSON hypergraph.\n";
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+int GetOrCreateNode(const WordID& lhs, map<WordID, int>* lhs2node, Hypergraph* hg) {
+  int& node_id = (*lhs2node)[lhs];
+  if (!node_id)
+    node_id = hg->AddNode(lhs)->id_ + 1;
+  return node_id - 1;
+}
+
+void FilterAndCheckCorrectness(int goal, Hypergraph* hg) {  // validates the goal node, sorts hg, prunes edges with underivable tails
+  if (goal < 0) {
+    cerr << "Error! [S] not found in grammar!\n";
+    exit(1);
+  }
+  if (hg->nodes_[goal].in_edges_.size() != 1) {  // zero rewrites is rejected as well as several
+    cerr << "Error! [S] must have exactly one rewrite!\n";
+    exit(1);
+  }
+  int old_size = hg->nodes_.size();
+  hg->TopologicallySortNodesAndEdges(goal);
+  if (hg->nodes_.size() != old_size) {
+    cerr << "Warning! During sorting " << (old_size - hg->nodes_.size()) << " disappeared!\n";
+  }
+  vector<double> inside; // inside score at each node
+  double p = Inside<double, TransitionCountWeightFunction>(*hg, &inside);
+  if (!p) {
+    cerr << "Warning! Grammar defines the empty language!\n";
+    hg->clear();
+    return;
+  }
+  vector<bool> prune(hg->edges_.size(), false);
+  int bad_edges = 0;
+  for (unsigned i = 0; i < hg->edges_.size(); ++i) {
+    const Hypergraph::Edge& edge = hg->edges_[i];
+    // an edge is bad if any of its tail nodes has a zero inside score
+    for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) {
+      if (!inside[edge.tail_nodes_[j]]) {
+        prune[i] = true;
+        break;  // one bad tail is enough; count the edge only once
+      }
+    }
+    if (prune[i]) ++bad_edges;
+  }
+  cerr << "Removing " << bad_edges << " bad edges from the grammar.\n";
+  for (unsigned i = 0; i < hg->edges_.size(); ++i) {
+    if (prune[i])
+      cerr << "   " << hg->edges_[i].rule_->AsString() << endl;
+  }
+  hg->PruneEdges(prune);
+}
+
+void CreateEdge(const TRulePtr& r, const Hypergraph::TailNodeVector& tail, Hypergraph::Node* head_node, Hypergraph* hg) {
+  Hypergraph::Edge* new_edge = hg->AddEdge(r, tail);
+  hg->ConnectEdgeToHeadNode(new_edge, head_node);
+  new_edge->feature_values_ = r->scores_;
+}
+
+// from a category label like "NP_2", return "NP"
+string PureCategory(WordID cat) {
+  assert(cat < 0);
+  string c = TD::Convert(cat*-1);
+  size_t p = c.find("_");
+  if (p == string::npos) return c;
+  return c.substr(0, p);
+};
+
+string ConstituentOrderFeature(const TRule& rule, const vector<int>& pi) {
+  const static string kTERM_VAR = "x";
+  const vector<WordID>& f = rule.f();
+  map<string, int> used;
+  vector<string> terms(f.size());
+  for (int i = 0; i < f.size(); ++i) {
+    const string term = (f[i] < 0 ? PureCategory(f[i]) : kTERM_VAR);
+    int& count = used[term];
+    if (!count) {
+      terms[i] = term;
+    } else {
+      ostringstream os;
+      os << term << count;
+      terms[i] = os.str();
+    }
+    ++count;
+  }
+  ostringstream os;
+  os << PureCategory(rule.GetLHS()) << ':';
+  for (int i = 0; i < f.size(); ++i) {
+    if (i > 0) os << '_';
+    os << terms[pi[i]];
+  }
+  return os.str();
+}
+
+bool CheckPermutationMask(const vector<int>& mask, const vector<int>& pi) {
+  assert(mask.size() == pi.size());
+
+  int req_min = -1;
+  int cur_max = 0;
+  int cur_mask = -1;
+  for (int i = 0; i < mask.size(); ++i) {
+    if (mask[i] != cur_mask) {
+      cur_mask = mask[i];
+      req_min = cur_max - 1;
+    }
+    if (pi[i] > req_min) {
+      if (pi[i] > cur_max) cur_max = pi[i];
+    } else {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void PermuteYKRecursive(int nodeid, const WordID& parent, const int max_reorder, Hypergraph* hg) {
+  // Hypergraph tmp = *hg;
+  Hypergraph::Node* node = &hg->nodes_[nodeid];
+  if (node->in_edges_.size() != 1) {
+    cerr << "Multiple rewrites of [" << TD::Convert(node->cat_ * -1) << "] (parent is [" << TD::Convert(parent*-1) << "])\n";
+    cerr << "  not recursing!\n";
+    return;
+  }
+//  for (int eii = 0; eii < node->in_edges_.size(); ++eii) {
+    const int oe_index = node->in_edges_.front();
+    const TRule& rule = *hg->edges_[oe_index].rule_;
+    const Hypergraph::TailNodeVector orig_tail = hg->edges_[oe_index].tail_nodes_;
+    const int tail_size = orig_tail.size();
+    for (int i = 0; i < tail_size; ++i) {
+      PermuteYKRecursive(hg->edges_[oe_index].tail_nodes_[i], node->cat_, max_reorder, hg);
+    }
+    const vector<WordID>& of = rule.f_;
+    if (of.size() == 1) return;
+  //  cerr << "Permuting [" << TD::Convert(node->cat_ * -1) << "]\n";
+  //  cerr << "ORIG: " << rule.AsString() << endl;
+    vector<WordID> pi(of.size(), 0);
+    for (int i = 0; i < pi.size(); ++i) pi[i] = i;
+
+    vector<int> permutation_mask(of.size(), 0);
+    const bool dont_reorder_across_PU = true;  // TODO add configuration
+    if (dont_reorder_across_PU) {
+      int cur = 0;
+      for (int i = 0; i < pi.size(); ++i) {
+        if (of[i] >= 0) continue;
+        const string cat = PureCategory(of[i]);
+        if (cat == "PU" || cat == "PU!H" || cat == "PUNC" || cat == "PUNC!H" || cat == "CC") {
+          ++cur;
+          permutation_mask[i] = cur;
+          ++cur;
+        } else {
+          permutation_mask[i] = cur;
+        }
+      }
+    }
+    int fid = FD::Convert(ConstituentOrderFeature(rule, pi));
+    hg->edges_[oe_index].feature_values_.set_value(fid, 1.0);
+    while (next_permutation(pi.begin(), pi.end())) {
+      if (!CheckPermutationMask(permutation_mask, pi))
+        continue;
+      vector<WordID> nf(pi.size(), 0);
+      Hypergraph::TailNodeVector tail(pi.size(), 0);
+      bool skip = false;
+      for (int i = 0; i < pi.size(); ++i) {
+        int dist = pi[i] - i; if (dist < 0) dist *= -1;
+        if (dist > max_reorder) { skip = true; break; }
+        nf[i] = of[pi[i]];
+        tail[i] = orig_tail[pi[i]];
+      }
+      if (skip) continue;
+      TRulePtr nr(new TRule(rule));
+      nr->f_ = nf;
+      int fid = FD::Convert(ConstituentOrderFeature(rule, pi));
+      nr->scores_.set_value(fid, 1.0);
+  //    cerr << "PERM: " << nr->AsString() << endl;
+      CreateEdge(nr, tail, node, hg);
+    }
+ // }
+}
+
+void PermuteYamadaAndKnight(Hypergraph* hg, int max_reorder) {
+  assert(hg->nodes_.back().cat_ == kSTART);
+  assert(hg->nodes_.back().in_edges_.size() == 1);
+  PermuteYKRecursive(hg->nodes_.size() - 1, kSTART, max_reorder, hg);
+}
+
+void CollapseWeights(Hypergraph* hg) {
+  int fid = FD::Convert("Reordering");
+  for (int i = 0; i < hg->edges_.size(); ++i) {
+    Hypergraph::Edge& edge = hg->edges_[i];
+    edge.feature_values_.clear();
+    if (edge.edge_prob_ != prob_t::Zero()) {
+      edge.feature_values_.set_value(fid, log(edge.edge_prob_));
+    }
+  }
+}
+
+void ProcessHypergraph(const vector<double>& w, const po::variables_map& conf, const string& ref, Hypergraph* hg) {
+  if (conf.count("reorder"))
+    PermuteYamadaAndKnight(hg, conf["max_reorder"].as<int>());
+  if (w.size() > 0) { hg->Reweight(w); }
+  if (conf.count("collapse_weights")) CollapseWeights(hg);
+  if (conf["output"].as<string>() == "json") {
+    HypergraphIO::WriteToJSON(*hg, false, &cout);
+    if (!ref.empty()) { cerr << "REF: " << ref << endl; }
+  } else {
+    vector<WordID> onebest;
+    ViterbiESentence(*hg, &onebest);
+    if (ref.empty()) {
+      cout << TD::GetString(onebest) << endl;
+    } else {
+      cout << TD::GetString(onebest) << " ||| " << ref << endl;
+    }
+  }
+  if (conf.count("k_derivations")) {
+    const int k = conf["k_derivations"].as<int>();
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(*hg, k);
+    for (int i = 0; i < k; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest(hg->nodes_.size() - 1, i);
+      if (!d) break;
+      cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+    }
+  }
+}
+
+int main(int argc, char **argv) {  // reads grammars (cfg) or forests (json/split) from --input and processes each one
+  kSTART = TD::Convert("S") * -1;
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  string infile = conf["input"].as<string>();
+  const bool is_split_input = (conf["format"].as<string>() == "split");
+  const bool is_json_input = is_split_input || (conf["format"].as<string>() == "json");
+  const bool collapse_weights = conf.count("collapse_weights");
+  vector<double> w;
+  if (conf.count("weights"))
+    Weights::InitFromFile(conf["weights"].as<string>(), &w);
+
+  if (collapse_weights && !w.size()) {
+    cerr << "--collapse_weights requires a weights file to be specified!\n";
+    exit(1);
+  }
+  ReadFile rf(infile);
+  istream* in = rf.stream();
+  assert(*in);
+  int lc = 0;
+  Hypergraph hg;
+  map<WordID, int> lhs2node;  // category -> node id + 1 (0 means "not created yet", see GetOrCreateNode)
+  while(*in) {
+    string line;
+    ++lc;
+    getline(*in, line);  // on the failed read at EOF, line is empty and is handled below
+    if (is_json_input) {
+      if (line.empty() || line[0] == '#') continue;
+      string ref;
+      if (is_split_input) {  // split format: "<json>}} ||| <reference>"
+        size_t pos = line.rfind("}}");
+        assert(pos != string::npos);
+        size_t rstart = line.find("||| ", pos);
+        assert(rstart != string::npos);
+        ref = line.substr(rstart + 4);
+        line = line.substr(0, pos + 2);
+      }
+      istringstream is(line);
+      if (HypergraphIO::ReadFromJSON(&is, &hg)) {
+        ProcessHypergraph(w, conf, ref, &hg);
+        hg.clear();
+      } else {
+        cerr << "Error reading grammar from JSON: line " << lc << endl;
+        exit(1);
+      }
+    } else {
+      if (line.empty()) {  // a blank line (or EOF) ends the current grammar
+        if (lhs2node.empty()) continue;  // no pending rules: extra blank line or the empty read at EOF
+        int goal = lhs2node[kSTART] - 1;
+        FilterAndCheckCorrectness(goal, &hg);
+        ProcessHypergraph(w, conf, "", &hg);
+        hg.clear(); lhs2node.clear();
+        continue;
+      }
+      if (line[0] == '#') continue;
+      if (line[0] != '[') {
+        cerr << "Line " << lc << ": bad format\n";
+        exit(1);
+      }
+      TRulePtr tr(TRule::CreateRuleMonolingual(line));
+      Hypergraph::TailNodeVector tail;
+      for (int i = 0; i < tr->f_.size(); ++i) {
+        WordID var_cat = tr->f_[i];
+        if (var_cat < 0)  // negative ids are treated as nonterminal categories
+          tail.push_back(GetOrCreateNode(var_cat, &lhs2node, &hg));
+      }
+      const WordID lhs = tr->GetLHS();
+      int head = GetOrCreateNode(lhs, &lhs2node, &hg);
+      Hypergraph::Edge* edge = hg.AddEdge(tr, tail);
+      edge->feature_values_ = tr->scores_;
+      Hypergraph::Node* node = &hg.nodes_[head];
+      hg.ConnectEdgeToHeadNode(edge, node);
+    }
+  }
+}
+
diff --git a/training/utils/lbfgs.h b/training/utils/lbfgs.h
new file mode 100644
index 00000000..e8baecab
--- /dev/null
+++ b/training/utils/lbfgs.h
@@ -0,0 +1,1459 @@
+#ifndef SCITBX_LBFGS_H
+#define SCITBX_LBFGS_H
+
+#include <cstdio>
+#include <cstddef>
+#include <cmath>
+#include <stdexcept>
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+
+namespace scitbx {
+
+//! Limited-memory Broyden-Fletcher-Goldfarb-Shanno (LBFGS) %minimizer.
+/*! Implementation of the
+    Limited-memory Broyden-Fletcher-Goldfarb-Shanno (LBFGS)
+    algorithm for large-scale multidimensional minimization
+    problems.
+
+    This code was manually derived from Java code which was
+    in turn derived from the Fortran program
+    <code>lbfgs.f</code>.  The Java translation was
+    effected mostly mechanically, with some manual
+    clean-up; in particular, array indices start at 0
+    instead of 1.  Most of the comments from the Fortran
+    code have been pasted in.
+
+    Information on the original LBFGS Fortran source code is
+    available at
+    http://www.netlib.org/opt/lbfgs_um.shar . The following
+    information is taken verbatim from the Netlib documentation
+    for the Fortran source.
+
+    <pre>
+    file    opt/lbfgs_um.shar
+    for     unconstrained optimization problems
+    alg     limited memory BFGS method
+    by      J. Nocedal
+    contact nocedal@eecs.nwu.edu
+    ref     D. C. Liu and J. Nocedal, ``On the limited memory BFGS method for
+    ,       large scale optimization methods'' Mathematical Programming 45
+    ,       (1989), pp. 503-528.
+    ,       (Postscript file of this paper is available via anonymous ftp
+    ,       to eecs.nwu.edu in the directory pub/%lbfgs/lbfgs_um.)
+    </pre>
+
+    @author Jorge Nocedal: original Fortran version, including comments
+    (July 1990).<br>
+    Robert Dodier: Java translation, August 1997.<br>
+    Ralf W. Grosse-Kunstleve: C++ port, March 2002.<br>
+    Chris Dyer: serialize/deserialize functionality
+ */
+namespace lbfgs {
+
+  //! Generic exception class for %lbfgs %error messages.
+  /*! All exceptions thrown by the minimizer are derived from this class.
+   */
+  class error : public std::exception {
+    public:
+      //! Stores msg behind the fixed prefix "lbfgs error: ".
+      error(std::string const& msg) throw()
+        : msg_(std::string("lbfgs error: ") + msg)
+      {}
+      //! Returns the stored (prefixed) message as a C string.
+      virtual const char* what() const throw() { return msg_.c_str(); }
+      //! Formats an unsigned integer as text; used to build messages.
+      static std::string itoa(unsigned long i) {
+        std::ostringstream text;
+        text << i;
+        return text.str();
+      }
+    protected:
+      virtual ~error() throw() {}
+      std::string msg_;
+  };
+
+  //! Reports an internal error, identified by a source file and line.
+  class error_internal_error : public error {
+    public:
+      //! Builds the message "Internal Error: <file>(<line>)".
+      error_internal_error(const char* file, unsigned long line) throw()
+        : error(
+            std::string("Internal Error: ") + file + "(" + itoa(line) + ")")
+      {}
+  };
+
+  //! Reports an improper input parameter.
+  class error_improper_input_parameter : public error {
+    public:
+      //! Prefixes msg with "Improper input parameter: ".
+      error_improper_input_parameter(std::string const& msg) throw()
+        : error(std::string("Improper input parameter: ") + msg)
+      {}
+  };
+
+  //! Reports improper input data.
+  class error_improper_input_data : public error {
+    public:
+      //! Prefixes msg with "Improper input data: ".
+      error_improper_input_data(std::string const& msg) throw()
+        : error(std::string("Improper input data: ") + msg)
+      {}
+  };
+
+  //! Reports that the search direction is not a descent direction.
+  class error_search_direction_not_descent : public error {
+    public:
+      //! Takes no arguments; the message text is fixed.
+      error_search_direction_not_descent() throw()
+        : error(std::string("The search direction is not a descent direction."))
+      {}
+  };
+
+  //! Reports a failed line search.
+  class error_line_search_failed : public error {
+    public:
+      //! Prefixes msg with "Line search failed: ".
+      error_line_search_failed(std::string const& msg) throw()
+        : error(std::string("Line search failed: ") + msg)
+      {}
+  };
+
+  //! Line-search failure attributed to rounding errors.
+  /*! Derived from error_line_search_failed, so it is caught by that type too. */
+  class error_line_search_failed_rounding_errors
+    : public error_line_search_failed {
+    public:
+      //! Forwards msg unchanged to error_line_search_failed.
+      error_line_search_failed_rounding_errors(std::string const& msg) throw()
+        : error_line_search_failed(msg) {}
+  };
+
+  namespace detail {
+
+    // Returns x multiplied by itself.
+    template <typename NumType>
+    inline NumType
+    pow2(NumType const& x) { NumType const squared = x * x; return squared; }
+
+    // Magnitude of x: -x when x compares below zero, otherwise x itself.
+    template <typename NumType>
+    inline NumType
+    abs(NumType const& x) {
+      NumType const zero(0);
+      return (x < zero) ? -x : x;
+    }
+
+    // This class implements an algorithm for multi-dimensional line search.
+    template <typename FloatType, typename SizeType = std::size_t>
+    class mcsrch
+    {
+      protected:
+        int infoc;
+        FloatType dginit;
+        bool brackt;
+        bool stage1;
+        FloatType finit;
+        FloatType dgtest;
+        FloatType width;
+        FloatType width1;
+        FloatType stx;
+        FloatType fx;
+        FloatType dgx;
+        FloatType sty;
+        FloatType fy;
+        FloatType dgy;
+        FloatType stmin;
+        FloatType stmax;
+
+        static FloatType const& max3(
+          FloatType const& x,
+          FloatType const& y,
+          FloatType const& z)
+        {
+          return x < y ? (y < z ? z : y ) : (x < z ? z : x );
+        }
+
+      public:
+        /* Minimize a function along a search direction. This code is
+           a Java translation of the function <code>MCSRCH</code> from
+           <code>lbfgs.f</code>, which in turn is a slight modification
+           of the subroutine <code>CSRCH</code> of More' and Thuente.
+           The changes are to allow reverse communication, and do not
+           affect the performance of the routine. This function, in turn,
+           calls <code>mcstep</code>.<p>
+
+           The Java translation was effected mostly mechanically, with
+           some manual clean-up; in particular, array indices start at 0
+           instead of 1.  Most of the comments from the Fortran code have
+           been pasted in here as well.<p>
+
+           The purpose of <code>mcsrch</code> is to find a step which
+           satisfies a sufficient decrease condition and a curvature
+           condition.<p>
+
+           At each stage this function updates an interval of uncertainty
+           with endpoints <code>stx</code> and <code>sty</code>. The
+           interval of uncertainty is initially chosen so that it
+           contains a minimizer of the modified function
+           <pre>
+                f(x+stp*s) - f(x) - ftol*stp*(gradf(x)'s).
+           </pre>
+           If a step is obtained for which the modified function has a
+           nonpositive function value and nonnegative derivative, then
+           the interval of uncertainty is chosen so that it contains a
+           minimizer of <code>f(x+stp*s)</code>.<p>
+
+           The algorithm is designed to find a step which satisfies
+           the sufficient decrease condition
+           <pre>
+                 f(x+stp*s) &lt;= f(x) + ftol*stp*(gradf(x)'s),
+           </pre>
+           and the curvature condition
+           <pre>
+                 abs(gradf(x+stp*s)'s) &lt;= gtol*abs(gradf(x)'s).
+           </pre>
+           If <code>ftol</code> is less than <code>gtol</code> and if,
+           for example, the function is bounded below, then there is
+           always a step which satisfies both conditions. If no step can
+           be found which satisfies both conditions, then the algorithm
+           usually stops when rounding errors prevent further progress.
+           In this case <code>stp</code> only satisfies the sufficient
+           decrease condition.<p>
+
+           @author Original Fortran version by Jorge J. More' and
+             David J. Thuente as part of the Minpack project, June 1983,
+             Argonne National Laboratory. Java translation by Robert
+             Dodier, August 1997.
+
+           @param gtol Tolerance for the curvature condition.
+           @param stpmin Lower bound for the step <code>stp</code>.
+           @param stpmax Upper bound for the step <code>stp</code>.
+           @param n The number of variables.
+
+           @param x On entry this contains the base point for the line
+             search. On exit it contains <code>x + stp*s</code>.
+
+           @param f The value of the objective function at the current
+             <code>x</code>. Passed by value, so it is never modified.
+
+           @param g The gradient of the objective function at the current
+             <code>x</code>. It is only read, never written.
+
+           @param s Array that holds the search direction.
+           @param is0 Offset into <code>s</code>; entry j is <code>s[is0+j]</code>.
+
+           @param stp On entry this contains an initial estimate of a
+             satisfactory step length. On exit <code>stp</code> contains
+             the final estimate.
+
+           @param ftol Tolerance for the sufficient decrease condition.
+
+           @param xtol Termination occurs when the relative width of the
+             interval of uncertainty is at most <code>xtol</code>.
+
+           @param maxfev Termination occurs when the number of evaluations
+             of the objective function is at least <code>maxfev</code> by
+             the end of an iteration.
+
+           @param info On entry, <code>-1</code> resumes a search after the
+             caller has recomputed f and g. On exit it is one of:
+             <ul>
+             <li><code>info = -1</code> A return is made to compute
+                 the function and gradient.
+             <li><code>info = 1</code> The sufficient decrease condition
+                 and the directional derivative condition hold.
+             </ul>
+
+           @param nfev Count of function evaluations; reset by a new search.
+           @param wa Work array of length <code>n</code>; keeps the base point.
+         */
+        void run(
+          FloatType const& gtol,
+          FloatType const& stpmin,
+          FloatType const& stpmax,
+          SizeType n,
+          FloatType* x,
+          FloatType f,
+          const FloatType* g,
+          FloatType* s,
+          SizeType is0,
+          FloatType& stp,
+          FloatType ftol,
+          FloatType xtol,
+          SizeType maxfev,
+          int& info,
+          SizeType& nfev,
+          FloatType* wa);
+
+        /* The purpose of this function is to compute a safeguarded step
+           for a linesearch and to update an interval of uncertainty for
+           a minimizer of the function.<p>
+
+           The parameter <code>stx</code> contains the step with the
+           least function value. The parameter <code>stp</code> contains
+           the current step. It is assumed that the derivative at
+           <code>stx</code> is negative in the direction of the step. If
+           <code>brackt</code> is <code>true</code> when
+           <code>mcstep</code> returns then a minimizer has been
+           bracketed in an interval of uncertainty with endpoints
+           <code>stx</code> and <code>sty</code>.<p>
+
+           Variables that <code>mcstep</code> modifies are passed by reference.
+
+           @param stx Step at the best step obtained so far.
+             This variable is modified by <code>mcstep</code>.
+           @param fx Function value at the best step obtained so far.
+             This variable is modified by <code>mcstep</code>.
+           @param dx Derivative at the best step obtained so far.
+             The derivative must be negative in the direction of the
+             step, that is, <code>dx</code> and <code>stp-stx</code> must
+             have opposite signs.  This variable is modified by
+             <code>mcstep</code>.
+
+           @param sty Step at the other endpoint of the interval of
+             uncertainty. This variable is modified by <code>mcstep</code>.
+           @param fy Function value at the other endpoint of the interval of
+             uncertainty. This variable is modified by <code>mcstep</code>.
+
+           @param dy Derivative at the other endpoint of the interval of
+             uncertainty. This variable is modified by <code>mcstep</code>.
+
+           @param stp Step at the current step. If <code>brackt</code> is set
+             then on input <code>stp</code> must be between <code>stx</code>
+             and <code>sty</code>. On output <code>stp</code> is set to the
+             new step.
+           @param fp Function value at the current step.
+           @param dp Derivative at the current step.
+
+           @param brackt Tells whether a minimizer has been bracketed.
+             If the minimizer has not been bracketed, then on input this
+             variable must be set <code>false</code>. If the minimizer has
+             been bracketed, then on output this variable is
+             <code>true</code>.
+
+           @param stpmin Lower bound for the step.
+           @param stpmax Upper bound for the step.
+
+           If the return value is 1, 2, 3, or 4, then the step has
+           been computed successfully. A return value of 0 indicates
+           improper input parameters.
+
+           @author Jorge J. More, David J. Thuente: original Fortran version,
+             as part of Minpack project. Argonne Nat'l Laboratory, June 1983.
+             Robert Dodier: Java translation, August 1997.
+         */
+        static int mcstep(
+          FloatType& stx,
+          FloatType& fx,
+          FloatType& dx,
+          FloatType& sty,
+          FloatType& fy,
+          FloatType& dy,
+          FloatType& stp,
+          FloatType fp,
+          FloatType dp,
+          bool& brackt,
+          FloatType stpmin,
+          FloatType stpmax);
+
+        // Writes every state member to *out as raw in-memory bytes.
+        void serialize(std::ostream* out) const {
+          out->write((const char*)&infoc,sizeof(infoc));
+          out->write((const char*)&dginit,sizeof(dginit));
+          out->write((const char*)&brackt,sizeof(brackt));
+          out->write((const char*)&stage1,sizeof(stage1));
+          out->write((const char*)&finit,sizeof(finit));
+          out->write((const char*)&dgtest,sizeof(dgtest));
+          out->write((const char*)&width,sizeof(width));
+          out->write((const char*)&width1,sizeof(width1));
+          out->write((const char*)&stx,sizeof(stx));
+          out->write((const char*)&fx,sizeof(fx));
+          out->write((const char*)&dgx,sizeof(dgx));
+          out->write((const char*)&sty,sizeof(sty));
+          out->write((const char*)&fy,sizeof(fy));
+          out->write((const char*)&dgy,sizeof(dgy));
+          out->write((const char*)&stmin,sizeof(stmin));
+          out->write((const char*)&stmax,sizeof(stmax));
+        }
+
+        // Reads back what serialize() wrote (non-const: overwrites members).
+        void deserialize(std::istream* in) {
+          in->read((char*)&infoc, sizeof(infoc));
+          in->read((char*)&dginit, sizeof(dginit));
+          in->read((char*)&brackt, sizeof(brackt));
+          in->read((char*)&stage1, sizeof(stage1));
+          in->read((char*)&finit, sizeof(finit));
+          in->read((char*)&dgtest, sizeof(dgtest));
+          in->read((char*)&width, sizeof(width));
+          in->read((char*)&width1, sizeof(width1));
+          in->read((char*)&stx, sizeof(stx));
+          in->read((char*)&fx, sizeof(fx));
+          in->read((char*)&dgx, sizeof(dgx));
+          in->read((char*)&sty, sizeof(sty));
+          in->read((char*)&fy, sizeof(fy));
+          in->read((char*)&dgy, sizeof(dgy));
+          in->read((char*)&stmin, sizeof(stmin));
+          in->read((char*)&stmax, sizeof(stmax));
+        }
+    };
+    // Out-of-class definition of mcsrch::run (documented at its declaration).
+    template <typename FloatType, typename SizeType>
+    void mcsrch<FloatType, SizeType>::run(
+      FloatType const& gtol,
+      FloatType const& stpmin,
+      FloatType const& stpmax,
+      SizeType n,
+      FloatType* x,
+      FloatType f,
+      const FloatType* g,
+      FloatType* s,
+      SizeType is0,
+      FloatType& stp,
+      FloatType ftol,
+      FloatType xtol,
+      SizeType maxfev,
+      int& info,
+      SizeType& nfev,
+      FloatType* wa)
+    {
+      if (info != -1) {  // new search: check the inputs and set up the state
+        infoc = 1;
+        if (   n == 0
+            || maxfev == 0
+            || gtol < FloatType(0)
+            || xtol < FloatType(0)
+            || stpmin < FloatType(0)
+            || stpmax < stpmin) {
+          throw error_internal_error(__FILE__, __LINE__);
+        }
+        if (stp <= FloatType(0) || ftol < FloatType(0)) {
+          throw error_internal_error(__FILE__, __LINE__);
+        }
+        // Compute the initial gradient in the search direction
+        // and check that s is a descent direction.
+        dginit = FloatType(0);
+        for (SizeType j = 0; j < n; j++) {
+          dginit += g[j] * s[is0+j];
+        }
+        if (dginit >= FloatType(0)) {
+          throw error_search_direction_not_descent();
+        }
+        brackt = false;
+        stage1 = true;
+        nfev = 0;
+        finit = f;
+        dgtest = ftol*dginit;
+        width = stpmax - stpmin;
+        width1 = FloatType(2) * width;
+        std::copy(x, x+n, wa);
+        // The variables stx, fx, dgx contain the values of the step,
+        // function, and directional derivative at the best step.
+        // The variables sty, fy, dgy contain the value of the step, function,
+        // and derivative at the other endpoint of the interval of uncertainty.
+        // The variables stp, f, dg contain the values of the step,
+        // function, and derivative at the current step.
+        stx = FloatType(0);
+        fx = finit;
+        dgx = dginit;
+        sty = FloatType(0);
+        fy = finit;
+        dgy = dginit;
+      }
+      for (;;) {
+        if (info != -1) {
+          // Set the minimum and maximum steps to correspond
+          // to the present interval of uncertainty.
+          if (brackt) {
+            stmin = std::min(stx, sty);
+            stmax = std::max(stx, sty);
+          }
+          else {
+            stmin = stx;
+            stmax = stp + FloatType(4) * (stp - stx);
+          }
+          // Force the step to be within the bounds stpmax and stpmin.
+          stp = std::max(stp, stpmin);
+          stp = std::min(stp, stpmax);
+          // If an unusual termination is to occur then let
+          // stp be the lowest point obtained so far.
+          if (   (brackt && (stp <= stmin || stp >= stmax))
+              || nfev >= maxfev - 1 || infoc == 0
+              || (brackt && stmax - stmin <= xtol * stmax)) {
+            stp = stx;
+          }
+          // Evaluate the function and gradient at stp
+          // and compute the directional derivative.
+          // We return to main program to obtain F and G.
+          for (SizeType j = 0; j < n; j++) {
+            x[j] = wa[j] + stp * s[is0+j];
+          }
+          info=-1;
+          break;
+        }
+        // Re-entry (info was -1): f and g are expected to be for the updated x.
+        info = 0;
+        nfev++;
+        FloatType dg(0);
+        for (SizeType j = 0; j < n; j++) {
+          dg += g[j] * s[is0+j];
+        }
+        FloatType ftest1 = finit + stp*dgtest;
+        // Test for convergence.
+        if ((brackt && (stp <= stmin || stp >= stmax)) || infoc == 0) {
+          throw error_line_search_failed_rounding_errors(
+            "Rounding errors prevent further progress."
+            " There may not be a step which satisfies the"
+            " sufficient decrease and curvature conditions."
+            " Tolerances may be too small.");
+        }
+        if (stp == stpmax && f <= ftest1 && dg <= dgtest) {
+          throw error_line_search_failed(
+            "The step is at the upper bound stpmax().");
+        }
+        if (stp == stpmin && (f > ftest1 || dg >= dgtest)) {
+          throw error_line_search_failed(
+            "The step is at the lower bound stpmin().");
+        }
+        if (nfev >= maxfev) {
+          throw error_line_search_failed(
+            "Number of function evaluations has reached maxfev().");
+        }
+        if (brackt && stmax - stmin <= xtol * stmax) {
+          throw error_line_search_failed(
+            "Relative width of the interval of uncertainty"
+            " is at most xtol().");
+        }
+        // Check for termination.
+        if (f <= ftest1 && abs(dg) <= gtol * (-dginit)) {
+          info = 1;
+          break;
+        }
+        // In the first stage we seek a step for which the modified
+        // function has a nonpositive value and nonnegative derivative.
+        if (   stage1 && f <= ftest1
+            && dg >= std::min(ftol, gtol) * dginit) {
+          stage1 = false;
+        }
+        // A modified function is used to predict the step only if
+        // we have not obtained a step for which the modified
+        // function has a nonpositive function value and nonnegative
+        // derivative, and if a lower function value has been
+        // obtained but the decrease is not sufficient.
+        if (stage1 && f <= fx && f > ftest1) {
+          // Define the modified function and derivative values.
+          FloatType fm = f - stp*dgtest;
+          FloatType fxm = fx - stx*dgtest;
+          FloatType fym = fy - sty*dgtest;
+          FloatType dgm = dg - dgtest;
+          FloatType dgxm = dgx - dgtest;
+          FloatType dgym = dgy - dgtest;
+          // Call mcstep on the modified values to update the interval
+          // of uncertainty and to compute the new step.
+          infoc = mcstep(stx, fxm, dgxm, sty, fym, dgym, stp, fm, dgm,
+                         brackt, stmin, stmax);
+          // Reset the function and gradient values for f.
+          fx = fxm + stx*dgtest;
+          fy = fym + sty*dgtest;
+          dgx = dgxm + dgtest;
+          dgy = dgym + dgtest;
+        }
+        else {
+          // Call mcstep to update the interval of uncertainty
+          // and to compute the new step.
+          infoc = mcstep(stx, fx, dgx, sty, fy, dgy, stp, f, dg,
+                         brackt, stmin, stmax);
+        }
+        // Force a sufficient decrease in the size of the
+        // interval of uncertainty.
+        if (brackt) {
+          if (abs(sty - stx) >= FloatType(0.66) * width1) {
+            stp = stx + FloatType(0.5) * (sty - stx);
+          }
+          width1 = width;
+          width = abs(sty - stx);
+        }
+      }
+    }
+    // Out-of-class definition of mcsrch::mcstep (documented at its declaration).
+    template <typename FloatType, typename SizeType>
+    int mcsrch<FloatType, SizeType>::mcstep(
+      FloatType& stx,
+      FloatType& fx,
+      FloatType& dx,
+      FloatType& sty,
+      FloatType& fy,
+      FloatType& dy,
+      FloatType& stp,
+      FloatType fp,
+      FloatType dp,
+      bool& brackt,
+      FloatType stpmin,
+      FloatType stpmax)
+    {
+      bool bound;  // assigned in every case below; gates the final 0.66 clamp
+      FloatType gamma, p, q, r, s, sgnd, stpc, stpf, stpq, theta;
+      int info = 0;
+      if (   (   brackt && (stp <= std::min(stx, sty)
+              || stp >= std::max(stx, sty)))
+          || dx * (stp - stx) >= FloatType(0) || stpmax < stpmin) {
+        return 0;  // improper input: nothing is modified
+      }
+      // Determine if the derivatives have opposite sign.
+      sgnd = dp * (dx / abs(dx));
+      if (fp > fx) {
+        // First case. A higher function value.
+        // The minimum is bracketed. If the cubic step is closer
+        // to stx than the quadratic step, the cubic step is taken,
+        // else the average of the cubic and quadratic steps is taken.
+        info = 1;
+        bound = true;
+        theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp;
+        s = max3(abs(theta), abs(dx), abs(dp));
+        gamma = s * std::sqrt(pow2(theta / s) - (dx / s) * (dp / s));
+        if (stp < stx) gamma = - gamma;
+        p = (gamma - dx) + theta;
+        q = ((gamma - dx) + gamma) + dp;
+        r = p/q;
+        stpc = stx + r * (stp - stx);
+        stpq = stx
+          + ((dx / ((fx - fp) / (stp - stx) + dx)) / FloatType(2))
+            * (stp - stx);
+        if (abs(stpc - stx) < abs(stpq - stx)) {
+          stpf = stpc;
+        }
+        else {
+          stpf = stpc + (stpq - stpc) / FloatType(2);
+        }
+        brackt = true;
+      }
+      else if (sgnd < FloatType(0)) {
+        // Second case. A lower function value and derivatives of
+        // opposite sign. The minimum is bracketed. If the cubic
+        // step is closer to stx than the quadratic (secant) step,
+        // the cubic step is taken, else the quadratic step is taken.
+        info = 2;
+        bound = false;
+        theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp;
+        s = max3(abs(theta), abs(dx), abs(dp));
+        gamma = s * std::sqrt(pow2(theta / s) - (dx / s) * (dp / s));
+        if (stp > stx) gamma = - gamma;
+        p = (gamma - dp) + theta;
+        q = ((gamma - dp) + gamma) + dx;
+        r = p/q;
+        stpc = stp + r * (stx - stp);
+        stpq = stp + (dp / (dp - dx)) * (stx - stp);
+        if (abs(stpc - stp) > abs(stpq - stp)) {
+          stpf = stpc;
+        }
+        else {
+          stpf = stpq;
+        }
+        brackt = true;
+      }
+      else if (abs(dp) < abs(dx)) {
+        // Third case. A lower function value, derivatives of the
+        // same sign, and the magnitude of the derivative decreases.
+        // The cubic step is only used if the cubic tends to infinity
+        // in the direction of the step or if the minimum of the cubic
+        // is beyond stp. Otherwise the cubic step is defined to be
+        // either stpmin or stpmax. The quadratic (secant) step is also
+        // computed and if the minimum is bracketed then the step
+        // closest to stx is taken, else the step farthest away is taken.
+        info = 3;
+        bound = true;
+        theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp;
+        s = max3(abs(theta), abs(dx), abs(dp));
+        gamma = s * std::sqrt(
+          std::max(FloatType(0), pow2(theta / s) - (dx / s) * (dp / s)));
+        if (stp > stx) gamma = -gamma;
+        p = (gamma - dp) + theta;
+        q = (gamma + (dx - dp)) + gamma;
+        r = p/q;
+        if (r < FloatType(0) && gamma != FloatType(0)) {
+          stpc = stp + r * (stx - stp);
+        }
+        else if (stp > stx) {
+          stpc = stpmax;
+        }
+        else {
+          stpc = stpmin;
+        }
+        stpq = stp + (dp / (dp - dx)) * (stx - stp);
+        if (brackt) {
+          if (abs(stp - stpc) < abs(stp - stpq)) {
+            stpf = stpc;
+          }
+          else {
+            stpf = stpq;
+          }
+        }
+        else {
+          if (abs(stp - stpc) > abs(stp - stpq)) {
+            stpf = stpc;
+          }
+          else {
+            stpf = stpq;
+          }
+        }
+      }
+      else {
+        // Fourth case. A lower function value, derivatives of the
+        // same sign, and the magnitude of the derivative does
+        // not decrease. If the minimum is not bracketed, the step
+        // is either stpmin or stpmax, else the cubic step is taken.
+        info = 4;
+        bound = false;
+        if (brackt) {
+          theta = FloatType(3) * (fp - fy) / (sty - stp) + dy + dp;
+          s = max3(abs(theta), abs(dy), abs(dp));
+          gamma = s * std::sqrt(pow2(theta / s) - (dy / s) * (dp / s));
+          if (stp > sty) gamma = -gamma;
+          p = (gamma - dp) + theta;
+          q = ((gamma - dp) + gamma) + dy;
+          r = p/q;
+          stpc = stp + r * (sty - stp);
+          stpf = stpc;
+        }
+        else if (stp > stx) {
+          stpf = stpmax;
+        }
+        else {
+          stpf = stpmin;
+        }
+      }
+      // Update the interval of uncertainty. This update does not
+      // depend on the new step or the case analysis above.
+      if (fp > fx) {
+        sty = stp;
+        fy = fp;
+        dy = dp;
+      }
+      else {
+        if (sgnd < FloatType(0)) {
+          sty = stx;
+          fy = fx;
+          dy = dx;
+        }
+        stx = stp;
+        fx = fp;
+        dx = dp;
+      }
+      // Compute the new step and safeguard it.
+      stpf = std::min(stpmax, stpf);
+      stpf = std::max(stpmin, stpf);
+      stp = stpf;
+      if (brackt && bound) {
+        if (sty > stx) {
+          stp = std::min(stx + FloatType(0.66) * (sty - stx), stp);
+        }
+        else {
+          stp = std::max(stx + FloatType(0.66) * (sty - stx), stp);
+        }
+      }
+      return info;
+    }
+
+    /* In-place update dy := dy + da * dx over n elements; element k of dx is
+       dx[ix0 + k*incx] and element k of dy is dy[iy0 + k*incy].  A zero n or
+       da leaves dy untouched.  Adapted from <code>daxpy</code> in lbfgs.f.
+     */
+    template <typename FloatType, typename SizeType>
+    void daxpy(
+      SizeType n,
+      FloatType da,
+      const FloatType* dx,
+      SizeType ix0,
+      SizeType incx,
+      FloatType* dy,
+      SizeType iy0,
+      SizeType incy)
+    {
+      if (n == 0 || da == FloatType(0)) return;
+      if (incx != 1 || incy != 1) {
+        // General strides: advance both offsets once per element.
+        SizeType xoff = 0, yoff = 0;
+        for (SizeType k = 0; k < n; ++k, xoff += incx, yoff += incy) {
+          dy[iy0+yoff] += da * dx[ix0+xoff];
+        }
+        return;
+      }
+      // Unit strides: handle the n % 4 leftover elements first, then
+      // update the remaining elements four per pass.
+      const SizeType leftover = n % 4;
+      SizeType k = 0;
+      while (k < leftover) {
+        dy[iy0+k] += da * dx[ix0+k];
+        ++k;
+      }
+      for (; k < n; k += 4) {
+        dy[iy0+k]   += da * dx[ix0+k];
+        dy[iy0+k+1] += da * dx[ix0+k+1];
+        dy[iy0+k+2] += da * dx[ix0+k+2];
+        dy[iy0+k+3] += da * dx[ix0+k+3];
+      }
+    }
+
+    // Contiguous form: dy[k] += da * dx[ix0+k] for k in [0, n).
+    template <typename FloatType, typename SizeType>
+    inline void daxpy(
+      SizeType n, FloatType da,
+      const FloatType* dx, SizeType ix0,
+      FloatType* dy)
+    {
+      const SizeType unit_stride(1);
+      const SizeType no_offset(0);
+      daxpy(n, da, dx, ix0, unit_stride, dy, no_offset, unit_stride);
+    }
+
+    /* Dot product of two n-element vectors: the sum over k of
+       dx[ix0 + k*incx] * dy[iy0 + k*incy], accumulated in index order
+       (0 when n is 0).  Adapted from <code>ddot</code> in lbfgs.f.
+     */
+    template <typename FloatType, typename SizeType>
+    FloatType ddot(
+      SizeType n,
+      const FloatType* dx,
+      SizeType ix0,
+      SizeType incx,
+      const FloatType* dy,
+      SizeType iy0,
+      SizeType incy)
+    {
+      FloatType acc(0);
+      if (n == 0) return acc;
+      if (incx != 1 || incy != 1) {
+        // General strides: advance both offsets once per element.
+        SizeType xoff = 0, yoff = 0;
+        for (SizeType k = 0; k < n; ++k, xoff += incx, yoff += incy) {
+          acc += dx[ix0+xoff] * dy[iy0+yoff];
+        }
+        return acc;
+      }
+      // Unit strides: the n % 5 leftover terms first, then five per pass.
+      const SizeType leftover = n % 5;
+      SizeType k = 0;
+      while (k < leftover) {
+        acc += dx[ix0+k] * dy[iy0+k];
+        ++k;
+      }
+      for (; k < n; k += 5) {
+        acc += dx[ix0+k]   * dy[iy0+k];
+        acc += dx[ix0+k+1] * dy[iy0+k+1];
+        acc += dx[ix0+k+2] * dy[iy0+k+2];
+        acc += dx[ix0+k+3] * dy[iy0+k+3];
+        acc += dx[ix0+k+4] * dy[iy0+k+4];
+      }
+      return acc;
+    }
+
+    // Contiguous form: the sum of dx[k] * dy[k] for k in [0, n).
+    template <typename FloatType, typename SizeType>
+    inline FloatType ddot(
+      SizeType n, const FloatType* dx, const FloatType* dy)
+    {
+      const SizeType no_offset(0);
+      const SizeType unit_stride(1);
+      return ddot(
+        n, dx, no_offset, unit_stride, dy, no_offset, unit_stride);
+    }
+
+  } // namespace detail
+
+  //! Interface to the LBFGS %minimizer.
+  /*! This class solves the unconstrained minimization problem
+      <pre>
+          min f(x),  x = (x1,x2,...,x_n),
+      </pre>
+      using the limited-memory BFGS method. The routine is
+      especially effective on problems involving a large number of
+      variables. In a typical iteration of this method an
+      approximation Hk to the inverse of the Hessian
+      is obtained by applying <code>m</code> BFGS updates to a
+      diagonal matrix Hk0, using information from the
+      previous <code>m</code> steps.  The user specifies the number
+      <code>m</code>, which determines the amount of storage
+      required by the routine. The user may also provide the
+      diagonal matrices Hk0 (parameter <code>diag</code> in the run()
+      function) if not satisfied with the default choice. The
+      algorithm is described in "On the limited memory BFGS method for
+      large scale optimization", by D. Liu and J. Nocedal, Mathematical
+      Programming B 45 (1989) 503-528.
+
+      The user is required to calculate the function value
+      <code>f</code> and its gradient <code>g</code>. In order to
+      allow the user complete control over these computations,
+      reverse communication is used. The routine must be called
+      repeatedly under the control of the member functions
+      <code>requests_f_and_g()</code>,
+      <code>requests_diag()</code>.
+      If neither requests_f_and_g() nor requests_diag() is
+      <code>true</code> the user should check for convergence
+      (using class traditional_convergence_test or any
+      other custom test). If the convergence test is negative,
+      the minimizer may be called again for the next iteration.
+
+      The steplength (stp()) is determined at each iteration
+      by means of the line search routine <code>mcsrch</code>, which is
+      a slight modification of the routine <code>CSRCH</code> written
+      by More' and Thuente.
+
+      The only variables that are machine-dependent are
+      <code>xtol</code>,
+      <code>stpmin</code> and
+      <code>stpmax</code>.
+
+      Fatal errors cause <code>error</code> exceptions to be thrown.
+      The generic class <code>error</code> is sub-classed (e.g.
+      class <code>error_line_search_failed</code>) to facilitate
+      granular %error handling.
+
+      A note on performance: Using Compaq Fortran V5.4 and
+      Compaq C++ V6.5, the C++ implementation is about 15% slower
+      than the Fortran implementation.
+   */
+  template <typename FloatType, typename SizeType = std::size_t>
+  class minimizer
+  {
+    public:
+      //! Default constructor. Leaves no member indeterminate; n() and m() are 0.
+      minimizer()
+      : n_(0), m_(0), maxfev_(0), gtol_(0), xtol_(0), stpmin_(0), stpmax_(0),
+        iflag_(0), requests_f_and_g_(false), requests_diag_(false),
+        iter_(0), nfun_(0), stp_(0), stp1(0), ftol(0.0001), ys(0),
+        point(0), npt(0), ispt(0), iypt(0), info(0), bound(0), nfev(0)
+      {}
+
+      //! Constructor.
+      /*! @param n The number of variables in the minimization problem.
+             Restriction: <code>n &gt; 0</code>.
+
+          @param m The number of corrections used in the BFGS update.
+             Values of <code>m</code> less than 3 are not recommended;
+             large values of <code>m</code> will result in excessive
+             computing time. <code>3 &lt;= m &lt;= 7</code> is
+             recommended.
+             Restriction: <code>m &gt; 0</code>.
+
+          @param maxfev Maximum number of function evaluations
+             <b>per line search</b>.
+             Termination occurs when the number of evaluations
+             of the objective function is at least <code>maxfev</code> by
+             the end of an iteration.
+
+          @param gtol Controls the accuracy of the line search.
+            If the function and gradient evaluations are inexpensive with
+            respect to the cost of the iteration (which is sometimes the
+            case when solving very large problems) it may be advantageous
+            to set <code>gtol</code> to a small value. A typical small
+            value is 0.1.
+            Restriction: <code>gtol</code> should be greater than 1e-4.
+
+          @param xtol An estimate of the machine precision (e.g. 10e-16
+            on a SUN station 3/60). The line search routine will
+            terminate if the relative width of the interval of
+            uncertainty is less than <code>xtol</code>.
+
+          @param stpmin Specifies the lower bound for the step
+            in the line search.
+            The default value is 1e-20. This value need not be modified
+            unless the exponent is too large for the machine being used,
+            or unless the problem is extremely badly scaled (in which
+            case the exponent should be increased).
+
+          @param stpmax specifies the upper bound for the step
+            in the line search.
+            The default value is 1e20. This value need not be modified
+            unless the exponent is too large for the machine being used,
+            or unless the problem is extremely badly scaled (in which
+            case the exponent should be increased).
+       */
+      explicit
+      minimizer(
+        SizeType n,
+        SizeType m = 5,
+        SizeType maxfev = 20,
+        FloatType gtol = FloatType(0.9),
+        FloatType xtol = FloatType(1.e-16),
+        FloatType stpmin = FloatType(1.e-20),
+        FloatType stpmax = FloatType(1.e20))
+        : n_(n), m_(m), maxfev_(maxfev),
+          gtol_(gtol), xtol_(xtol),
+          stpmin_(stpmin), stpmax_(stpmax),
+          iflag_(0), requests_f_and_g_(false), requests_diag_(false),
+          iter_(0), nfun_(0), stp_(0),
+          stp1(0), ftol(0.0001), ys(0), point(0), npt(0),
+          ispt(n+2*m), iypt((n+2*m)+n*m),
+          info(0), bound(0), nfev(0)
+      {
+        // Validate the arguments in declaration order; the first violated
+        // restriction determines which message is thrown.
+        if (n_ == 0) throw error_improper_input_parameter("n = 0.");
+        if (m_ == 0) throw error_improper_input_parameter("m = 0.");
+        if (maxfev_ == 0) throw error_improper_input_parameter("maxfev = 0.");
+        if (gtol_ <= FloatType(1.e-4)) {
+          throw error_improper_input_parameter("gtol <= 1.e-4.");
+        }
+        if (xtol_ < FloatType(0)) {
+          throw error_improper_input_parameter("xtol < 0.");
+        }
+        if (stpmin_ < FloatType(0)) {
+          throw error_improper_input_parameter("stpmin < 0.");
+        }
+        if (stpmax_ < stpmin_) {
+          throw error_improper_input_parameter("stpmax < stpmin");
+        }
+        // Work array layout (n*(2m+1)+2m elements): [0,n) scratch vector,
+        // [n,n+m) 1/ys per stored correction, [n+m,n+2m) per-correction
+        // coefficients, then m step vectors starting at ispt and m
+        // gradient-difference vectors starting at iypt.
+        w_.resize(n_*(2*m_+1)+2*m_);
+        scratch_array_.resize(n_);
+      }
+
+      //! Number of free parameters (as passed to the constructor).
+      SizeType n() const { return n_; }
+
+      //! Number of corrections kept (as passed to the constructor).
+      SizeType m() const { return m_; }
+
+      /*! \brief Maximum number of evaluations of the objective function
+          per line search (as passed to the constructor).
+       */
+      SizeType maxfev() const { return maxfev_; }
+
+      /*! \brief Control of the accuracy of the line search.
+          (as passed to the constructor).
+       */
+      FloatType gtol() const { return gtol_; }
+
+      //! Estimate of the machine precision (as passed to the constructor).
+      FloatType xtol() const { return xtol_; }
+
+      /*! \brief Lower bound for the step in the line search.
+          (as passed to the constructor).
+       */
+      FloatType stpmin() const { return stpmin_; }
+
+      /*! \brief Upper bound for the step in the line search.
+          (as passed to the constructor).
+       */
+      FloatType stpmax() const { return stpmax_; }
+
+      //! Status indicator for reverse communication.
+      /*! <code>true</code> if the run() function returns to request
+          evaluation of the objective function (<code>f</code>) and
+          gradients (<code>g</code>) for the current point
+          (<code>x</code>). To continue the minimization the
+          run() function is called again with the updated values for
+          <code>f</code> and <code>g</code>.
+          <p>
+          See also: requests_diag()
+       */
+      bool requests_f_and_g() const { return requests_f_and_g_; }
+
+      //! Status indicator for reverse communication.
+      /*! <code>true</code> if the run() function returns to request
+          evaluation of the diagonal matrix (<code>diag</code>)
+          for the current point (<code>x</code>).
+          To continue the minimization the run() function is called
+          again with the updated values for <code>diag</code>.
+          <p>
+          See also: requests_f_and_g()
+       */
+      bool requests_diag() const { return requests_diag_; }
+
+      //! Number of iterations so far.
+      /*! Note that one iteration may involve multiple evaluations
+          of the objective function.
+          <p>
+          See also: nfun()
+       */
+      SizeType iter() const { return iter_; }
+
+      //! Total number of evaluations of the objective function so far.
+      /*! The total number of function evaluations increases by the
+          number of evaluations required for the line search. The total
+          is only increased after a successful line search.
+          <p>
+          See also: iter()
+       */
+      SizeType nfun() const { return nfun_; }
+
+      //! Norm of gradient given gradient array of length n().
+      FloatType euclidean_norm(const FloatType* a) const {
+        return std::sqrt(detail::ddot(n_, a, a));
+      }
+
+      //! Current stepsize.
+      FloatType stp() const { return stp_; }
+
+      //! Execution of one step of the minimization.
+      /*! @param x On initial entry this must be set by the user to
+             the values of the initial estimate of the solution vector.
+
+          @param f Before initial entry or on re-entry under the
+             control of requests_f_and_g(), <code>f</code> must be set
+             by the user to contain the value of the objective function
+             at the current point <code>x</code>.
+
+          @param g Before initial entry or on re-entry under the
+             control of requests_f_and_g(), <code>g</code> must be set
+             by the user to contain the components of the gradient at
+             the current point <code>x</code>.
+
+          The return value is <code>true</code> if either
+          requests_f_and_g() or requests_diag() is <code>true</code>.
+          Otherwise the user should check for convergence
+          (e.g. using class traditional_convergence_test) and
+          call the run() function again to continue the minimization.
+          If the return value is <code>false</code> the user
+          should <b>not</b> update <code>f</code>, <code>g</code> or
+          <code>diag</code> (other overload) before calling
+          the run() function again.
+
+          Note that <code>x</code> is always modified by the run()
+          function. Depending on the situation it can therefore be
+          necessary to evaluate the objective function one more time
+          after the minimization is terminated.
+       */
+      bool run(
+        FloatType* x,
+        FloatType f,
+        const FloatType* g)
+      {
+        return generic_run(x, f, g, false, 0);
+      }
+
+      //! Execution of one step of the minimization.
+      /*! @param x See other overload.
+
+          @param f See other overload.
+
+          @param g See other overload.
+
+          @param diag On initial entry or on re-entry under the
+             control of requests_diag(), <code>diag</code> must be set by
+             the user to contain the values of the diagonal matrix Hk0.
+             The routine will return at each iteration of the algorithm
+             with requests_diag() set to <code>true</code>.
+             <p>
+             Restriction: all elements of <code>diag</code> must be
+             positive.
+       */
+      bool run(
+        FloatType* x,
+        FloatType f,
+        const FloatType* g,
+        const FloatType* diag)
+      {
+        return generic_run(x, f, g, true, diag);
+      }
+
+      void serialize(std::ostream* out) const { // writes the raw object bytes (host byte order and type sizes)
+        out->write((const char*)&n_, sizeof(n_)); // sanity check: compared with n_ by deserialize()
+        out->write((const char*)&m_, sizeof(m_)); // sanity check: compared with m_ by deserialize()
+        SizeType fs = sizeof(FloatType);
+        out->write((const char*)&fs, sizeof(fs)); // sanity check: compared with sizeof(FloatType) by deserialize()
+
+        mcsrch_instance.serialize(out); // line-search state first, then this object's mutable members
+        out->write((const char*)&iflag_, sizeof(iflag_));
+        out->write((const char*)&requests_f_and_g_, sizeof(requests_f_and_g_));
+        out->write((const char*)&requests_diag_, sizeof(requests_diag_));
+        out->write((const char*)&iter_, sizeof(iter_));
+        out->write((const char*)&nfun_, sizeof(nfun_));
+        out->write((const char*)&stp_, sizeof(stp_));
+        out->write((const char*)&stp1, sizeof(stp1));
+        out->write((const char*)&ftol, sizeof(ftol));
+        out->write((const char*)&ys, sizeof(ys));
+        out->write((const char*)&point, sizeof(point));
+        out->write((const char*)&npt, sizeof(npt));
+        out->write((const char*)&info, sizeof(info));
+        out->write((const char*)&bound, sizeof(bound));
+        out->write((const char*)&nfev, sizeof(nfev)); // const members maxfev_, gtol_, xtol_, stpmin_, stpmax_, ispt, iypt are not written
+        out->write((const char*)&w_[0], sizeof(FloatType) * w_.size()); // entire work array
+        out->write((const char*)&scratch_array_[0], sizeof(FloatType) * scratch_array_.size());
+      }
+
+      void deserialize(std::istream* in) { // reads the fields in exactly the order serialize() writes them
+        SizeType n, m, fs;
+        in->read((char*)&n, sizeof(n));
+        in->read((char*)&m, sizeof(m));
+        in->read((char*)&fs, sizeof(fs));
+        assert(n == n_); // the saved state must come from a minimizer with the same n,
+        assert(m == m_); // the same m,
+        assert(fs == sizeof(FloatType)); // and the same FloatType size; these checks vanish under NDEBUG
+
+        mcsrch_instance.deserialize(in);
+        in->read((char*)&iflag_, sizeof(iflag_));
+        in->read((char*)&requests_f_and_g_, sizeof(requests_f_and_g_));
+        in->read((char*)&requests_diag_, sizeof(requests_diag_));
+        in->read((char*)&iter_, sizeof(iter_));
+        in->read((char*)&nfun_, sizeof(nfun_));
+        in->read((char*)&stp_, sizeof(stp_));
+        in->read((char*)&stp1, sizeof(stp1));
+        in->read((char*)&ftol, sizeof(ftol));
+        in->read((char*)&ys, sizeof(ys));
+        in->read((char*)&point, sizeof(point));
+        in->read((char*)&npt, sizeof(npt));
+        in->read((char*)&info, sizeof(info));
+        in->read((char*)&bound, sizeof(bound));
+        in->read((char*)&nfev, sizeof(nfev));
+        in->read((char*)&w_[0], sizeof(FloatType) * w_.size()); // w_ keeps the size it already has; only its contents are replaced
+        in->read((char*)&scratch_array_[0], sizeof(FloatType) * scratch_array_.size());
+      } // NOTE(review): no read is checked for failure, so a truncated stream goes unnoticed
+
+    protected:
+      static void throw_diagonal_element_not_positive(SizeType i) {
+        throw error_improper_input_data(
+          "The " + error::itoa(i) + ". diagonal element of the"
+          " inverse Hessian approximation is not positive.");
+      }
+
+      bool generic_run(
+        FloatType* x,
+        FloatType f,
+        const FloatType* g,
+        bool diagco,
+        const FloatType* diag);
+
+      detail::mcsrch<FloatType, SizeType> mcsrch_instance;
+      const SizeType n_;
+      const SizeType m_;
+      const SizeType maxfev_;
+      const FloatType gtol_;
+      const FloatType xtol_;
+      const FloatType stpmin_;
+      const FloatType stpmax_;
+      int iflag_;
+      bool requests_f_and_g_;
+      bool requests_diag_;
+      SizeType iter_;
+      SizeType nfun_;
+      FloatType stp_;
+      FloatType stp1;
+      FloatType ftol;
+      FloatType ys;
+      SizeType point;
+      SizeType npt;
+      const SizeType ispt;
+      const SizeType iypt;
+      int info;
+      SizeType bound;
+      SizeType nfev;
+      std::vector<FloatType> w_;
+      std::vector<FloatType> scratch_array_;
+  };
+
+  template <typename FloatType, typename SizeType>
+  bool minimizer<FloatType, SizeType>::generic_run( // one reverse-communication step shared by both run() overloads
+    FloatType* x,
+    FloatType f,
+    const FloatType* g,
+    bool diagco, // true when the caller supplies diag (four-argument run())
+    const FloatType* diag)
+  {
+    bool execute_entire_while_loop = false;
+    if (!(requests_f_and_g_ || requests_diag_)) { // no request pending from the previous call: start a new iteration
+      execute_entire_while_loop = true;
+    }
+    requests_f_and_g_ = false;
+    requests_diag_ = false;
+    FloatType* w = &(*(w_.begin()));
+    if (iflag_ == 0) { // Initialize.
+      nfun_ = 1;
+      if (diagco) {
+        for (SizeType i = 0; i < n_; i++) {
+          if (diag[i] <= FloatType(0)) {
+            throw_diagonal_element_not_positive(i);
+          }
+        }
+      }
+      else {
+        std::fill_n(scratch_array_.begin(), n_, FloatType(1)); // default diagonal: all ones
+        diag = &(*(scratch_array_.begin()));
+      }
+      for (SizeType i = 0; i < n_; i++) {
+        w[ispt + i] = -g[i] * diag[i]; // first search direction: negative gradient scaled by diag
+      }
+      FloatType gnorm = std::sqrt(detail::ddot(n_, g, g));
+      if (gnorm == FloatType(0)) return false; // zero gradient: return without taking a step
+      stp1 = FloatType(1) / gnorm; // step length used for the first iteration
+      execute_entire_while_loop = true;
+    }
+    if (execute_entire_while_loop) {
+      bound = iter_;
+      iter_++;
+      info = 0;
+      if (iter_ != 1) {
+        if (iter_ > m_) bound = m_; // use at most m_ stored correction pairs
+        ys = detail::ddot( // dot product of the pair stored at offset npt: w[iypt+npt..] . w[ispt+npt..]
+          n_, w, iypt + npt, SizeType(1), w, ispt + npt, SizeType(1));
+        if (!diagco) {
+          FloatType yy = detail::ddot(
+            n_, w, iypt + npt, SizeType(1), w, iypt + npt, SizeType(1));
+          std::fill_n(scratch_array_.begin(), n_, ys / yy); // default diagonal: the constant ys/yy
+          diag = &(*(scratch_array_.begin()));
+        }
+        else {
+          iflag_ = 2; // hand control back for a fresh diag; the next call resumes in the block below
+          requests_diag_ = true;
+          return true;
+        }
+      }
+    }
+    if (execute_entire_while_loop || iflag_ == 2) {
+      if (iter_ != 1) {
+        if (diag == 0) {
+          throw error_internal_error(__FILE__, __LINE__);
+        }
+        if (diagco) {
+          for (SizeType i = 0; i < n_; i++) {
+            if (diag[i] <= FloatType(0)) {
+              throw_diagonal_element_not_positive(i);
+            }
+          }
+        }
+        SizeType cp = point;
+        if (point == 0) cp = m_;
+        w[n_ + cp -1] = 1 / ys; // entry for the slot just before `point` (wrapping around m_)
+        SizeType i;
+        for (i = 0; i < n_; i++) {
+          w[i] = -g[i];
+        }
+        cp = point;
+        for (i = 0; i < bound; i++) { // first pass: walk the stored pairs backwards from slot point-1
+          if (cp == 0) cp = m_;
+          cp--;
+          FloatType sq = detail::ddot(
+            n_, w, ispt + cp * n_, SizeType(1), w, SizeType(0), SizeType(1));
+          SizeType inmc=n_+m_+cp;
+          SizeType iycn=iypt+cp*n_;
+          w[inmc] = w[n_ + cp] * sq; // kept for the second pass
+          detail::daxpy(n_, -w[inmc], w, iycn, w);
+        }
+        for (i = 0; i < n_; i++) {
+          w[i] *= diag[i]; // apply the diagonal between the two passes
+        }
+        for (i = 0; i < bound; i++) { // second pass: same slots in the opposite order
+          FloatType yr = detail::ddot(
+            n_, w, iypt + cp * n_, SizeType(1), w, SizeType(0), SizeType(1));
+          FloatType beta = w[n_ + cp] * yr;
+          SizeType inmc=n_+m_+cp;
+          beta = w[inmc] - beta;
+          SizeType iscn=ispt+cp*n_;
+          detail::daxpy(n_, beta, w, iscn, w);
+          cp++;
+          if (cp == m_) cp = 0;
+        }
+        std::copy(w, w+n_, w+(ispt + point * n_)); // store the new search direction in slot `point`
+      }
+      stp_ = FloatType(1);
+      if (iter_ == 1) stp_ = stp1;
+      std::copy(g, g+n_, w); // keep the gradient from the start of the line search in w[0..n_)
+    }
+    mcsrch_instance.run(
+      gtol_, stpmin_, stpmax_, n_, x, f, g, w, ispt + point * n_,
+      stp_, ftol, xtol_, maxfev_, info, nfev, &(*(scratch_array_.begin())));
+    if (info == -1) { // the line search asks for another evaluation of f and g
+      iflag_ = 1;
+      requests_f_and_g_ = true;
+      return true;
+    }
+    if (info != 1) { // any status other than 1 (or -1 above) is reported as an internal error
+      throw error_internal_error(__FILE__, __LINE__);
+    }
+    nfun_ += nfev;
+    npt = point*n_;
+    for (SizeType i = 0; i < n_; i++) {
+      w[ispt + npt + i] = stp_ * w[ispt + npt + i]; // search direction scaled by the accepted step length
+      w[iypt + npt + i] = g[i] - w[i]; // gradient now minus gradient saved before the line search
+    }
+    point++;
+    if (point == m_) point = 0; // slots are reused cyclically
+    return false;
+  }
+
+  //! Traditional LBFGS convergence test.
+  /*! This convergence test is equivalent to the test embedded
+      in the <code>lbfgs.f</code> Fortran code. The test assumes that
+      there is a meaningful relation between the Euclidean norm of the
+      parameter vector <code>x</code> and the norm of the gradient
+      vector <code>g</code>. Therefore this test should not be used if
+      this assumption is not correct for a given problem.
+   */
+  template <typename FloatType, typename SizeType = std::size_t>
+  class traditional_convergence_test
+  {
+    public:
+      //! Default constructor: n() == 0 and eps() == 0.
+      traditional_convergence_test()
+      : n_(0), eps_(0) {}
+
+      //! Constructor.
+      /*! @param n Number of variables in the minimization problem,
+             i.e. the length of the arrays passed to operator().
+             Restriction: <code>n &gt; 0</code>.
+
+          @param eps Accuracy with which the solution is to be found.
+             Restriction: <code>eps &gt;= 0</code>.
+
+          Throws error_improper_input_parameter if a restriction is
+          violated.
+       */
+      explicit
+      traditional_convergence_test(
+        SizeType n,
+        FloatType eps = FloatType(1.e-5))
+      : n_(n), eps_(eps)
+      {
+        if (n_ == 0) throw error_improper_input_parameter("n = 0.");
+        if (eps_ < FloatType(0)) {
+          throw error_improper_input_parameter("eps < 0.");
+        }
+      }
+
+      //! Number of free parameters (as passed to the constructor).
+      SizeType n() const { return n_; }
+
+      //! Accuracy of the solution (as passed to the constructor).
+      FloatType eps() const { return eps_; }
+
+      //! Execution of the convergence test for the given parameters.
+      /*! Returns <code>true</code> if
+          <pre>
+            ||g|| &lt;= eps * max(1,||x||),
+          </pre>
+          where <code>||.||</code> denotes the Euclidean norm.
+
+          @param x Current solution vector (n() elements).
+
+          @param g Components of the gradient at the current
+            point <code>x</code> (n() elements).
+       */
+      bool
+      operator()(const FloatType* x, const FloatType* g) const
+      {
+        const FloatType x_norm = std::sqrt(detail::ddot(n_, x, x));
+        const FloatType g_norm = std::sqrt(detail::ddot(n_, g, g));
+        // The threshold scales with ||x|| but is never smaller than eps.
+        return g_norm <= eps_ * std::max(FloatType(1), x_norm);
+      }
+
+    protected:
+      const SizeType n_;
+      const FloatType eps_;
+  };
+
+}} // namespace scitbx::lbfgs
+
+// Prints a one-line, tab-separated status summary of a minimizer with any SizeType.
+template <typename T, typename S>
+std::ostream& operator<<(std::ostream& os, const scitbx::lbfgs::minimizer<T, S>& min) {
+  return os << "ITER=" << min.iter() << "\tNFUN=" << min.nfun() << "\tSTP=" << min.stp() << "\tDIAG=" << min.requests_diag() << "\tF&G=" << min.requests_f_and_g();
+}
+
+#endif // SCITBX_LBFGS_H
diff --git a/training/utils/lbfgs_test.cc b/training/utils/lbfgs_test.cc
new file mode 100644
index 00000000..9678e788
--- /dev/null
+++ b/training/utils/lbfgs_test.cc
@@ -0,0 +1,117 @@
+#include <cassert>
+#include <iostream>
+#include <sstream>
+#include <cmath>
+#include "lbfgs.h"
+#include "sparse_vector.h"
+#include "fdict.h"
+
+using namespace std;
+
+double TestOptimizer() {
+  cerr << "TESTING NON-PERSISTENT OPTIMIZER\n";
+
+  // Minimizes f(x1,x2,x3) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5, with
+  //   df/dx1 = 8*x1 + x2,  df/dx2 = 2*x2 + x1,  df/dx3 = 2*x3 + 6,
+  // starting from (8, 8, 8), and returns the last objective value evaluated.
+  const int kDim = 3;
+  double x[kDim] = {8, 8, 8};
+  double g[kDim];
+  scitbx::lbfgs::minimizer<double> opt(kDim);
+  scitbx::lbfgs::traditional_convergence_test<double> converged(kDim);
+
+  double obj = 0;
+  for (;;) {
+    // Evaluate the objective and its gradient at the current point.
+    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
+    g[0] = 8 * x[0] + x[1];
+    g[1] = 2 * x[1] + x[0];
+    g[2] = 2 * x[2] + 6;
+    opt.run(x, obj, g);
+    // No pending request means an iteration finished: test for convergence
+    // and, if not converged, start the next iteration with the same f and g.
+    const bool iteration_done = !opt.requests_f_and_g();
+    if (iteration_done && converged(x, g)) break;
+    if (iteration_done) opt.run(x, obj, g);
+    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
+    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
+    cerr << opt << endl;
+  }
+  return obj;
+}
+
+double TestPersistentOptimizer() {
+  cerr << "\nTESTING PERSISTENT OPTIMIZER\n";
+  // Minimizes f(x1,x2,x3) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5 from
+  // (8, 8, 8), but builds a fresh minimizer for every step and carries its
+  // state across steps only through serialize()/deserialize().
+  const int kDim = 3;
+  double x[kDim] = {8, 8, 8};
+  double g[kDim];
+  scitbx::lbfgs::traditional_convergence_test<double> converged(kDim);
+  double obj = 0;
+  string state;  // serialized minimizer; empty before the first step
+  do {
+    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
+    g[0] = 8 * x[0] + x[1];
+    g[1] = 2 * x[1] + x[0];
+    g[2] = 2 * x[2] + 6;
+
+    {
+      // Restore the previous step's state, if there is one, before running.
+      scitbx::lbfgs::minimizer<double> opt(kDim);
+      if (!state.empty()) {
+        istringstream is(state, ios::binary);
+        opt.deserialize(&is);
+      }
+      opt.run(x, obj, g);
+      ostringstream os(ios::binary);
+      opt.serialize(&os);
+      state = os.str();
+    }
+
+    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
+    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
+  } while (!converged(x, g));
+  return obj;
+}
+
+void TestSparseVector() {
+  // Round-trips an objective value and a sparse vector through B64.
+  cerr << "Testing SparseVector<double> serialization.\n";
+  int f1 = FD::Convert("Feature_1");
+  int f2 = FD::Convert("Feature_2");
+  FD::Convert("LanguageModel");  // converted but never given a value
+  int f4 = FD::Convert("SomeFeature");
+  int f5 = FD::Convert("SomeOtherFeature");
+  SparseVector<double> g;
+  g.set_value(f2, log(0.5));
+  g.set_value(f4, log(0.125));
+  g.set_value(f1, 0);
+  g.set_value(f5, 23.777);
+  ostringstream os;
+  double iobj = 1.5;
+  B64::Encode(iobj, g, &os);
+  cerr << iobj << "\t" << g << endl;
+  string data = os.str();
+  cout << data << endl;
+  SparseVector<double> v;
+  double obj = 0;  // initialized so a failed decode never prints an indeterminate value
+  bool decode_b64 = B64::Decode(&obj, &v, &data[0], data.size());
+  assert(decode_b64);  // check the decode before its outputs are used
+  cerr << obj << "\t" << v << endl;
+  assert(obj == iobj && g.size() == v.size());
+}
+
+int main() {  // exit status 0 on success, 1 if the two optimizer drivers disagree
+  const double plain_obj = TestOptimizer();
+  const double persistent_obj = TestPersistentOptimizer();
+  if (fabs(plain_obj - persistent_obj) > 1e-5) {
+    cerr << "OPTIMIZERS PERFORMED DIFFERENTLY!\n" << plain_obj << " vs. " << persistent_obj << endl;
+    return 1;
+  }
+  TestSparseVector();
+  cerr << "SUCCESS\n";
+  return 0;
+}
+
diff --git a/training/utils/libcall.pl b/training/utils/libcall.pl
new file mode 100644
index 00000000..c7d0f128
--- /dev/null
+++ b/training/utils/libcall.pl
@@ -0,0 +1,71 @@
+use IPC::Open3;
+use Symbol qw(gensym);
+
+$DUMMY_STDERR = gensym();
+$DUMMY_STDIN = gensym();
+
+# Run the command and ignore failures
+sub unchecked_call {
+    system("@_")
+}
+
+# Run the command and return its output (if any), ignoring failures
+sub unchecked_output {
+    return `@_`
+}
+
+# WARNING: Do not use this for commands that will return large amounts
+# of stdout or stderr -- they might block indefinitely
+sub check_output {
+    print STDERR "Executing and gathering output: @_\n";
+
+    my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_);
+    my $proc_output = "";
+    while( <PH> ) {
+        $proc_output .= $_;
+    }
+    waitpid($pid, 0);
+    # $? holds the exit code in its high byte and the terminating signal in
+    # its low bits, so only a status of exactly 0 means the command succeeded.
+    if($? == 0) {
+        return $proc_output;
+    } else {
+        print STDERR "ERROR: Execution of @_ failed.\n";
+        exit(1);
+    }
+}
+
+# Based on Moses' safesystem sub
+sub check_call {
+    print STDERR "Executing: @_\n";
+    system(@_);
+    # A signal death leaves the exit-code byte ($? >> 8) at 0, so test all of $?.
+    if($? == 0) {
+        return 0;
+    } elsif ($? == -1) {
+        print STDERR "ERROR: Failed to execute: @_\n  $!\n";
+        exit(1);
+    } elsif ($? & 127) {
+        # The command is passed through %s so a '%' in it is not read as a format.
+        printf STDERR "ERROR: Execution of: %s\n  died with signal %d, %s coredump\n",
+            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
+        exit(1);
+    } else {
+        my $exitcode = $? >> 8;
+        print STDERR "Failed with exit code: $exitcode\n";
+        exit($exitcode);
+    }
+}
+
+sub check_bash_call {
+    my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
+    check_call(@args);
+}
+
+sub check_bash_output {
+    my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
+    return check_output(@args);
+}
+
+# perl module weirdness...
+return 1;
diff --git a/training/utils/online_optimizer.cc b/training/utils/online_optimizer.cc
new file mode 100644
index 00000000..3ed95452
--- /dev/null
+++ b/training/utils/online_optimizer.cc
@@ -0,0 +1,16 @@
+#include "online_optimizer.h"
+
+LearningRateSchedule::~LearningRateSchedule() {}
+
+double StandardLearningRate::eta(int k) const {
+  return eta_0_ / (1.0 + k / N_);
+}
+
+double ExponentialDecayLearningRate::eta(int k) const {
+  return eta_0_ * pow(alpha_, k / N_);
+}
+
+OnlineOptimizer::~OnlineOptimizer() {}
+
+void OnlineOptimizer::ResetEpochImpl() {}
+
diff --git a/training/utils/online_optimizer.h b/training/utils/online_optimizer.h
new file mode 100644
index 00000000..28d89344
--- /dev/null
+++ b/training/utils/online_optimizer.h
@@ -0,0 +1,129 @@
+#ifndef _ONL_OPTIMIZE_H_
+#define _ONL_OPTIMIZE_H_
+
+#include <tr1/memory>
+#include <set>
+#include <string>
+#include <cmath>
+#include "sparse_vector.h"
+
+struct LearningRateSchedule {
+  virtual ~LearningRateSchedule();
+  // returns the learning rate for the kth iteration
+  virtual double eta(int k) const = 0;
+};
+
+// TODO in the Tsuruoka et al. (ACL 2009) paper, they use N
+// to mean the batch size in most places, but it doesn't completely
+// make sense to me in the learning rate schedules-- this needs
+// to be worked out to make sure they didn't mean corpus size
+// in some places and batch size in others (since in the paper they
+// only ever work with batch sizes of 1)
+struct StandardLearningRate : public LearningRateSchedule {
+  StandardLearningRate(
+      size_t batch_size,        // batch size, not corpus size!
+      double eta_0 = 0.2) :
+    eta_0_(eta_0),
+    N_(static_cast<double>(batch_size)) {}
+
+  virtual double eta(int k) const;
+
+ private:
+  const double eta_0_;
+  const double N_;
+};
+
+struct ExponentialDecayLearningRate : public LearningRateSchedule {
+  ExponentialDecayLearningRate(
+      size_t batch_size,        // batch size, not corpus size!
+      double eta_0 = 0.2,
+      double alpha = 0.85       // recommended by Tsuruoka et al. (ACL 2009)
+    ) : eta_0_(eta_0),
+        N_(static_cast<double>(batch_size)),
+        alpha_(alpha) {
+    assert(alpha > 0);
+    assert(alpha < 1.0);
+  }
+
+  virtual double eta(int k) const;
+
+ private:
+  const double eta_0_;
+  const double N_;
+  const double alpha_;
+};
+
+class OnlineOptimizer {
+ public:
+  virtual ~OnlineOptimizer();
+  // s: learning rate schedule; batch_size: training instances per batch;
+  // frozen_feats: ids of the frozen (non-optimizing) features.
+  OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
+                  size_t batch_size,
+                  const std::vector<int>& frozen_feats = std::vector<int>())
+      : N_(batch_size), frozen_(frozen_feats.begin(), frozen_feats.end()),
+        schedule_(s), k_() {}
+  void ResetEpoch() { k_ = 0; ResetEpochImpl(); }
+  void UpdateWeights(const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
+    ++k_;
+    const double eta = schedule_->eta(k_);
+    UpdateWeightsImpl(eta, approx_g, max_feat, weights);
+  }
+
+ protected:
+  virtual void ResetEpochImpl();
+  virtual void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) = 0;
+  const size_t N_; // number of training instances per batch
+  std::set<int> frozen_;  // frozen (non-optimizing) features
+
+ private:
+  std::tr1::shared_ptr<LearningRateSchedule> schedule_;
+  int k_;  // iteration count
+};
+
+class CumulativeL1OnlineOptimizer : public OnlineOptimizer {
+ public:
+  CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s,
+                              size_t training_instances, double C,
+                              const std::vector<int>& frozen) :
+    OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {}
+
+ protected:
+  // A new epoch restarts the accumulated penalty u_.
+  void ResetEpochImpl() { u_ = 0; }
+  // Adds eta * approx_g to every non-frozen weight that has a gradient entry,
+  // then applies the penalty to the non-frozen features 1 .. max_feat-1.
+  void UpdateWeightsImpl(const double& eta, const SparseVector<double>& approx_g, int max_feat, SparseVector<double>* weights) {
+    u_ += eta * C_ / N_;
+    typedef SparseVector<double>::const_iterator GradIter;
+    for (GradIter g = approx_g.begin(); g != approx_g.end(); ++g) {
+      if (frozen_.count(g->first) != 0) continue;
+      weights->add_value(g->first, eta * g->second);
+    }
+    for (int feat = 1; feat < max_feat; ++feat) {
+      if (frozen_.count(feat) == 0) ApplyPenalty(feat, weights);
+    }
+  }
+
+ private:
+  // Moves weight i toward zero without crossing it; zero entries are erased.
+  void ApplyPenalty(int i, SparseVector<double>* w) {
+    const double before = w->value(i);
+    const double q_old = q_.value(i);
+    double after = before;
+    if (before > 0.0) {
+      after = std::max(0.0, before - (u_ + q_old));
+    } else if (before < 0.0) {
+      after = std::min(0.0, before + (u_ - q_old));
+    }
+    const double q_new = q_old + (after - before);
+    if (q_new == 0.0) q_.erase(i); else q_.set_value(i, q_new);
+    if (after == 0.0) w->erase(i); else w->set_value(i, after);
+  }
+
+  const double C_;  // regularization strength
+  double u_;        // running sum of eta * C_ / N_ since the last epoch reset
+  SparseVector<double> q_;  // per-feature running sum of the changes made by ApplyPenalty
+};
+
+#endif
diff --git a/training/utils/optimize.cc b/training/utils/optimize.cc
new file mode 100644
index 00000000..41ac90d8
--- /dev/null
+++ b/training/utils/optimize.cc
@@ -0,0 +1,102 @@
+#include "optimize.h"
+
+#include <iostream>
+#include <cassert>
+
+#include "lbfgs.h"
+
+using namespace std;
+
+BatchOptimizer::~BatchOptimizer() {}
+
+void BatchOptimizer::Save(ostream* out) const {  // raw-byte dump: eval_, has_converged_, subclass state, magic trailer
+  out->write((const char*)&eval_, sizeof(eval_));
+  out->write((const char*)&has_converged_, sizeof(has_converged_));
+  SaveImpl(out);  // subclass-specific state goes between the header fields and the trailer
+  unsigned int magic = 0xABCDDCBA;  // should be uint32_t
+  out->write((const char*)&magic, sizeof(magic));  // Load() asserts that it reads this value back
+}
+
+void BatchOptimizer::Load(istream* in) {  // restore state in the same order Save() writes it
+  in->read((char*)&eval_, sizeof(eval_));
+  in->read((char*)&has_converged_, sizeof(has_converged_));
+  LoadImpl(in);  // subclass-specific state
+  unsigned int magic = 0;           // should be uint32_t
+  in->read((char*)&magic, sizeof(magic));
+  assert(magic == 0xABCDDCBA);  // trailer check; it is an assert, so it disappears when NDEBUG is defined
+  cerr << Name() << " EVALUATION #" << eval_ << endl;
+}
+
+void BatchOptimizer::SaveImpl(ostream* out) const {
+  (void)out;
+}
+
+void BatchOptimizer::LoadImpl(istream* in) {
+  (void)in;
+}
+
+string RPropOptimizer::Name() const {
+  return "RPropOptimizer";
+}
+
+void RPropOptimizer::OptimizeImpl(const double& obj,  // one RProp step over every coordinate; obj is not used
+                              const vector<double>& g,
+                              vector<double>* x) {
+  for (size_t i = 0; i < g.size(); ++i) {  // size_t matches vector::size(): no signed/unsigned comparison
+    const double g_i = g[i];
+    const double sign_i = (signbit(g_i) ? -1.0 : 1.0);
+    const double prod = g_i * prev_g_[i];
+    if (prod > 0.0) {  // gradient kept its sign: grow the step (capped at delta_max_) and move
+      const double dij = min(delta_ij_[i] * eta_plus_, delta_max_);
+      (*x)[i] -= dij * sign_i;
+      delta_ij_[i] = dij;
+      prev_g_[i] = g_i;
+    } else if (prod < 0.0) {  // sign flipped: shrink the step (floored at delta_min_), leave x[i] untouched
+      delta_ij_[i] = max(delta_ij_[i] * eta_minus_, delta_min_);
+      prev_g_[i] = 0.0;  // makes prod zero on the next call, so the last branch runs
+    } else {  // prod is not strictly positive or negative (e.g. prev_g_[i] == 0): move by the current step
+      (*x)[i] -= delta_ij_[i] * sign_i;
+      prev_g_[i] = g_i;
+    }
+  }
+}
+
+void RPropOptimizer::SaveImpl(ostream* out) const {
+  const size_t n = prev_g_.size();
+  out->write((const char*)&n, sizeof(n));
+  out->write((const char*)&prev_g_[0], sizeof(double) * n);
+  out->write((const char*)&delta_ij_[0], sizeof(double) * n);
+}
+
+void RPropOptimizer::LoadImpl(istream* in) {  // read back what SaveImpl wrote: element count, prev_g_, delta_ij_
+  size_t n;
+  in->read((char*)&n, sizeof(n));
+  assert(n == prev_g_.size());  // stored count must match the size this optimizer was constructed with
+  assert(n == delta_ij_.size());  // NOTE(review): both checks vanish under NDEBUG, leaving n unvalidated
+  in->read((char*)&prev_g_[0], sizeof(double) * n);
+  in->read((char*)&delta_ij_[0], sizeof(double) * n);
+}
+
+string LBFGSOptimizer::Name() const {
+  return "LBFGSOptimizer";
+}
+
+LBFGSOptimizer::LBFGSOptimizer(int num_feats, int memory_buffers) :
+  opt_(num_feats, memory_buffers) {}
+
+void LBFGSOptimizer::SaveImpl(ostream* out) const {
+  opt_.serialize(out);
+}
+
+void LBFGSOptimizer::LoadImpl(istream* in) {
+  opt_.deserialize(in);
+}
+
+void LBFGSOptimizer::OptimizeImpl(const double& obj,  // hand objective, gradient and x to the scitbx minimizer
+                                  const vector<double>& g,
+                                  vector<double>* x) {
+  opt_.run(&(*x)[0], obj, &g[0]);
+  if (!opt_.requests_f_and_g()) opt_.run(&(*x)[0], obj, &g[0]);  // second run with the same inputs when the minimizer is not asking for f and g; NOTE(review): presumably advances it to the next request, confirm in lbfgs.h
+  // cerr << opt_ << endl;
+}
+
diff --git a/training/utils/optimize.h b/training/utils/optimize.h
new file mode 100644
index 00000000..07943b44
--- /dev/null
+++ b/training/utils/optimize.h
@@ -0,0 +1,92 @@
+#ifndef _OPTIMIZE_H_
+#define _OPTIMIZE_H_
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <cassert>
+
+#include "lbfgs.h"
+
+// abstract base class for first order optimizers
+// order of invocation: new, Load(), Optimize(), Save(), delete
+class BatchOptimizer {
+ public:
+  BatchOptimizer() : eval_(1), has_converged_(false) {}  // the evaluation counter starts at 1
+  virtual ~BatchOptimizer();
+  virtual std::string Name() const = 0;  // optimizer name; Load() prints it to stderr
+  int EvaluationCount() const { return eval_; }
+  bool HasConverged() const { return has_converged_; }  // outcome of the test run by the most recent Optimize()
+
+  void Optimize(const double& obj,  // one step: obj and g are forwarded to OptimizeImpl together with x
+                const std::vector<double>& g,
+                std::vector<double>* x) {
+    assert(g.size() == x->size());
+    ++eval_;
+    OptimizeImpl(obj, g, x);
+    scitbx::lbfgs::traditional_convergence_test<double> converged(g.size());  // a fresh test object on every call
+    has_converged_ = converged(&(*x)[0], &g[0]);  // evaluated on x after OptimizeImpl and on the g passed in
+  }
+
+  void Save(std::ostream* out) const;
+  void Load(std::istream* in);
+ protected:
+  virtual void SaveImpl(std::ostream* out) const;  // hooks for subclass state; the base versions do nothing
+  virtual void LoadImpl(std::istream* in);
+  virtual void OptimizeImpl(const double& obj,
+                            const std::vector<double>& g,
+                            std::vector<double>* x) = 0;
+
+  int eval_;  // incremented once per Optimize() call; written and read by Save()/Load()
+ private:
+  bool has_converged_;
+};
+
+class RPropOptimizer : public BatchOptimizer {
+ public:
+  explicit RPropOptimizer(int num_vars,
+                          double eta_plus = 1.2,
+                          double eta_minus = 0.5,
+                          double delta_0 = 0.1,
+                          double delta_max = 50.0,
+                          double delta_min = 1e-6) :
+      prev_g_(num_vars, 0.0),
+      delta_ij_(num_vars, delta_0),
+      eta_plus_(eta_plus),
+      eta_minus_(eta_minus),
+      delta_max_(delta_max),
+      delta_min_(delta_min) {
+    assert(eta_plus > 1.0);
+    assert(eta_minus > 0.0 && eta_minus < 1.0);
+    assert(delta_max > 0.0);
+    assert(delta_min > 0.0);
+  }
+  std::string Name() const;
+  void OptimizeImpl(const double& obj,
+                    const std::vector<double>& g,
+                    std::vector<double>* x);
+  void SaveImpl(std::ostream* out) const;
+  void LoadImpl(std::istream* in);
+ private:
+  std::vector<double> prev_g_;
+  std::vector<double> delta_ij_;
+  const double eta_plus_;
+  const double eta_minus_;
+  const double delta_max_;
+  const double delta_min_;
+};
+
+class LBFGSOptimizer : public BatchOptimizer {
+ public:
+  explicit LBFGSOptimizer(int num_vars, int memory_buffers = 10);
+  std::string Name() const;
+  void SaveImpl(std::ostream* out) const;
+  void LoadImpl(std::istream* in);
+  void OptimizeImpl(const double& obj,
+                    const std::vector<double>& g,
+                    std::vector<double>* x);
+ private:
+  scitbx::lbfgs::minimizer<double> opt_;
+};
+
+#endif
diff --git a/training/utils/optimize_test.cc b/training/utils/optimize_test.cc
new file mode 100644
index 00000000..bff2ca03
--- /dev/null
+++ b/training/utils/optimize_test.cc
@@ -0,0 +1,118 @@
+#include <cassert>
+#include <iostream>
+#include <sstream>
+#include <boost/program_options/variables_map.hpp>
+#include "optimize.h"
+#include "online_optimizer.h"
+#include "sparse_vector.h"
+#include "fdict.h"
+
+using namespace std;
+
+double TestOptimizer(BatchOptimizer* opt) {
+  cerr << "TESTING NON-PERSISTENT OPTIMIZER\n";
+
+  // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
+  // df/dx1 = 8*x1 + x2
+  // df/dx2 = 2*x2 + x1
+  // df/dx3 = 2*x3 + 6
+  vector<double> x(3);
+  vector<double> g(3);
+  x[0] = 8;
+  x[1] = 8;
+  x[2] = 8;
+  double obj = 0;
+  do {
+    g[0] = 8 * x[0] + x[1];
+    g[1] = 2 * x[1] + x[0];
+    g[2] = 2 * x[2] + 6;
+    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
+    opt->Optimize(obj, g, &x);
+
+    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
+    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
+  } while (!opt->HasConverged());
+  return obj;
+}
+
+double TestPersistentOptimizer(BatchOptimizer* opt) {
+  cerr << "\nTESTING PERSISTENT OPTIMIZER\n";
+  // f(x,y) = 4x1^2 + x1*x2 + x2^2 + x3^2 + 6x3 + 5
+  // df/dx1 = 8*x1 + x2
+  // df/dx2 = 2*x2 + x1
+  // df/dx3 = 2*x3 + 6
+  vector<double> x(3);
+  vector<double> g(3);
+  x[0] = 8;
+  x[1] = 8;
+  x[2] = 8;
+  double obj = 0;
+  string state;
+  bool converged = false;
+  while (!converged) {
+    g[0] = 8 * x[0] + x[1];
+    g[1] = 2 * x[1] + x[0];
+    g[2] = 2 * x[2] + 6;
+    obj = 4 * x[0]*x[0] + x[0] * x[1] + x[1]*x[1] + x[2]*x[2] + 6 * x[2] + 5;
+
+    {
+      if (state.size() > 0) {
+        istringstream is(state, ios::binary);
+        opt->Load(&is);
+      }
+      opt->Optimize(obj, g, &x);
+      ostringstream os(ios::binary); opt->Save(&os); state = os.str();
+
+    }
+
+    cerr << x[0] << " " << x[1] << " " << x[2] << endl;
+    cerr << "   obj=" << obj << "\td/dx1=" << g[0] << " d/dx2=" << g[1] << " d/dx3=" << g[2] << endl;
+    converged = opt->HasConverged();
+    if (!converged) {
+      // now screw up the state (should be undone by Load)
+      obj += 2.0;
+      g[1] = -g[2];
+      vector<double> x2 = x;
+      try {
+        opt->Optimize(obj, g, &x2);
+      } catch (...) { }
+    }
+  }
+  return obj;
+}
+
+template <class O>
+void TestOptimizerVariants(int num_vars) {
+  O oa(num_vars);
+  cerr << "-------------------------------------------------------------------------\n";
+  cerr << "TESTING: " << oa.Name() << endl;
+  double o1 = TestOptimizer(&oa);
+  O ob(num_vars);
+  double o2 = TestPersistentOptimizer(&ob);
+  if (o1 != o2) {
+    cerr << oa.Name() << " VARIANTS PERFORMED DIFFERENTLY!\n" << o1 << " vs. " << o2 << endl;
+    exit(1);
+  }
+  cerr << oa.Name() << " SUCCESS\n";
+}
+
+using namespace std::tr1;
+
+void TestOnline() {
+  size_t N = 20;
+  double C = 1.0;
+  double eta0 = 0.2;
+  std::tr1::shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85));
+  //shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0));
+  CumulativeL1OnlineOptimizer opt(r, N, C, std::vector<int>());
+  assert(r->eta(10) < r->eta(1));
+}
+
+int main() {
+  int n = 3;
+  TestOptimizerVariants<LBFGSOptimizer>(n);
+  TestOptimizerVariants<RPropOptimizer>(n);
+  TestOnline();
+  return 0;
+}
+
diff --git a/training/utils/parallelize.pl b/training/utils/parallelize.pl
new file mode 100755
index 00000000..4197e0e5
--- /dev/null
+++ b/training/utils/parallelize.pl
@@ -0,0 +1,423 @@
+#!/usr/bin/env perl
+
+# Author: Adam Lopez
+#
+# This script takes a command that processes input
+# from stdin one-line-at-time, and parallelizes it
+# on the cluster using David Chiang's sentserver/
+# sentclient architecture.
+#
+# Prerequisites: the command *must* read each line
+# without waiting for subsequent lines of input
+# (for instance, a command which must read all lines
+# of input before processing will not work) and
+# return it to the output *without* buffering
+# multiple lines.
+
+#TODO: if -j 1, run immediately, not via sentserver?  possible differences in environment might make debugging harder
+
+#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps.  time cut down to 15s from 60s
+
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+use LocalConfig;
+
+use Cwd qw/ abs_path cwd getcwd /; 
+use File::Temp qw/ tempfile /;
+use Getopt::Long;
+use IPC::Open2;
+use strict;
+use POSIX ":sys_wait_h";
+
+use File::Basename;
+my $myDir = dirname(__FILE__);
+print STDERR __FILE__." -> $myDir\n";
+push(@INC, $myDir);
+require "libcall.pl";
+
+my $tailn=5; # +0 = concatenate all the client logs.  5 = last 5 lines
+my $recycle_clients;    # spawn new clients when previous ones terminate
+my $stay_alive;      # dont let server die when having zero clients
+my $joblist = "";
+my $errordir="";
+my $multiline;
+my $workdir = '.';
+my $numnodes = 8;
+my $user = $ENV{"USER"};
+my $pmem = "9g";
+my $basep=50300;
+my $randp=300;
+my $tryp=50;
+my $no_which;
+my $no_cd;
+
+my $DEBUG=$ENV{DEBUG};
+print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG;
+my $verbose = 1;
+sub verbose {
+    if ($verbose) {
+        print STDERR @_,"\n";
+    }
+}
+sub debug {
+    if ($DEBUG) {
+        my ($package, $filename, $line) = caller;
+        print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n";
+    }
+}
+my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].;
+my $shell_escape_in_quote=qr.[\\"\$`!].;
+sub escape_shell {  # quote one argument for the shell; undef is passed through as undef
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    return '""' unless length $arg;  # only the empty string becomes ""; a literal "0" must survive
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;  # backslash the characters that stay special inside "..."
+        return "\"$arg\"";
+    }
+    return $arg;  # nothing special: return unchanged
+}
+sub preview_files {
+    my ($l,$skipempty,$footer,$n)=@_;
+    $n=$tailn unless defined $n;
+    my @f=grep { ! ($skipempty && -z $_) } @$l;
+    my $fn=join(' ',map {escape_shell($_)} @f);
+    my $cmd="tail -n $n $fn";
+    unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
+}
+sub prefix_dirname($) {
+    #like `dirname but if ends in / then return the whole thing
+    local ($_)=@_;
+    if (/\/$/) {
+        $_;
+    } else {
+        s#/[^/]*$##;  # strip the whole last path component (the old pattern matched only a one-character one)
+        $_ ? $_ : '';
+    }
+}
+sub ensure_final_slash($) {
+    local ($_)=@_;
+    m#/$# ? $_ : ($_."/");
+}
+sub extend_path($$;$$) {
+    my ($base,$ext,$mkdir,$baseisdir)=@_;
+    if (-d $base) {
+        $base.="/";
+    } else {
+        my $dir;
+        if ($baseisdir) {
+            $dir=$base;
+            $base.='/' unless $base =~ /\/$/;
+        } else {
+            $dir=prefix_dirname($base);
+        }
+        my @cmd=("/bin/mkdir","-p",$dir);
+        check_call(@cmd) if $mkdir;
+    }
+    return $base.$ext;
+}
+
+my $abscwd=abs_path(&getcwd);
+sub print_help;
+
+my $use_fork;
+my @pids;
+
+# Process command-line options
+unless (GetOptions(
+      "stay-alive" => \$stay_alive,
+      "recycle-clients" => \$recycle_clients,
+      "error-dir=s" => \$errordir,
+      "multi-line" => \$multiline,
+      "workdir=s" => \$workdir,
+      "use-fork" => \$use_fork,
+      "verbose" => \$verbose,
+      "jobs=i" => \$numnodes,
+      "pmem=s" => \$pmem,
+        "baseport=i" => \$basep,
+#       "iport=i" => \$randp, #for short name -i
+        "no-which!" => \$no_which,
+            "no-cd!" => \$no_cd,
+            "tailn=s" => \$tailn,
+) && scalar @ARGV){
+  print_help();
+    die "bad options.";
+}
+
+my $cmd = "";
+my $prog=shift;
+if ($no_which) {
+    $cmd=$prog;
+} else {
+    $cmd=check_output("which $prog");
+    chomp $cmd;
+    die "$prog not found - $cmd" unless $cmd;
+}
+#$cmd=abs_path($cmd);
+for my $arg (@ARGV) {
+    $cmd .= " ".escape_shell($arg);
+}
+die "Please specify a command to parallelize\n" if $cmd eq '';
+
+my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n");
+
+my $executable = $cmd;
+$executable =~ s/^\s*(\S+)($|\s.*)/$1/;
+$executable=check_output("basename $executable");
+chomp $executable;
+
+
+print STDERR "Parallelizing ($numnodes ways): $cmd\n\n";
+
+# create -e dir and save .sh
+use File::Temp qw/tempdir/;
+unless ($errordir) {
+    $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1);
+}
+if ($errordir) {  # save the command line as an executable script inside the error dir
+    my $scriptfile=extend_path("$errordir/","$executable.sh",1,1);
+    -d $errordir || die "should have created -e dir $errordir";
+    open(SF,">",$scriptfile) or die "cannot write $scriptfile: $!";  # 'or' binds to open(); '||' bound to $scriptfile and never fired
+    print SF "$cdcmd$cmd\n";
+    close SF;
+    chmod 0755,$scriptfile;
+    $errordir=abs_path($errordir);
+    &verbose("-e dir: $errordir");
+}
+
+# set cleanup handler
+my @cleanup_cmds;
+sub cleanup;
+sub cleanup_and_die;
+$SIG{INT} = "cleanup_and_die";
+$SIG{TERM} = "cleanup_and_die";
+$SIG{HUP} = "cleanup_and_die";
+
+# other subs:
+sub numof_live_jobs;
+sub launch_job_on_node;
+
+
+# vars
+my $mydir = check_output("dirname $0"); chomp $mydir;
+my $sentserver = "$mydir/sentserver";
+my $sentclient = "$mydir/sentclient";
+my $host = check_output("hostname");
+chomp $host;
+
+
+# find open port
+srand;
+my $port = $basep+int(rand($randp));  # honour --baseport (default 50300) instead of a hard-coded base
+my $endp=$port+$tryp;  # the search below gives up once $port passes this value
+sub listening_port_lines {
+    my $quiet=$verbose?'':'2>/dev/null';
+    return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp");
+}
+my $netstat=&listening_port_lines;
+
+if ($verbose){ print STDERR "Testing port $port...";}
+
+while ($netstat=~/$port/ || &listening_port_lines=~/$port/){
+  if ($verbose){ print STDERR "port is busy\n";}
+  $port++;
+  if ($port > $endp){
+    die "Unable to find open port\n";
+  }
+  if ($verbose){ print STDERR "Testing port $port... "; }
+}
+if ($verbose){
+  print STDERR "port $port is available\n";
+}
+
+my $key = int(rand()*1000000);
+
+my $multiflag = "";
+if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; }
+my $stay_alive_flag = "";
+if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; }
+
+my $node_count = 0;
+my $script = "";
+# fork == one thread runs the sentserver, while the
+# other spawns the sentclient commands.
+my $pid = fork;
+if ($pid == 0) { # child
+  sleep 8; # give other thread time to start sentserver
+  $script = "$cdcmd$sentclient $host:$port:$key $cmd";
+
+  if ($verbose){
+    print STDERR "Client script:\n====\n";
+    print STDERR $script;
+    print STDERR "====\n";
+  }
+  for (my $jobn=0; $jobn<$numnodes; $jobn++){
+    launch_job();
+  }
+  if ($recycle_clients) {
+    my $ret;
+    my $livejobs;
+    while (1) {
+      $ret = waitpid($pid, WNOHANG);
+      #print STDERR "waitpid $pid ret = $ret \n";
+      last if ($ret != 0);
+      $livejobs = numof_live_jobs();
+      if ($numnodes >= $livejobs ) {  # a client terminated, OR # lines of input was less than -j
+        print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n";
+        launch_job();
+      } else {
+        sleep 15;
+      }
+    }
+  }
+  print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n";
+  for my $p (@pids) {
+    waitpid($p, 0);
+  }
+} else {
+#  my $todo = "$sentserver -k $key $multiflag $port ";
+  my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag ";
+  if ($verbose){ print STDERR "Running: $todo\n"; }
+  check_call($todo);
+  print STDERR "Call to $sentserver returned.\n";
+  cleanup();
+  exit(0);
+}
+
+sub numof_live_jobs {
+  if ($use_fork) {
+    die "not implemented";
+  } else {
+    # We can probably continue decoding if the qstat error is only temporary
+    my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
+    return ($#livejobs + 1);
+  }
+}
+my (@errors,@outs,@cmds);
+
+sub launch_job {  # submit one client via the qsub command line (or fork when --use-fork) and remember its job id
+    if ($use_fork) { return launch_job_fork(); }
+    my $errorfile = "/dev/null";
+    my $outfile = "/dev/null";
+    $node_count++;
+    my $clientname = $executable;
+    $clientname =~ s/^(.{4}).*$/$1/;  # keep the first four characters of the executable name
+    $clientname = "$clientname.$node_count";
+    if ($errordir){
+      $errorfile = "$errordir/$clientname.ER";
+      $outfile = "$errordir/$clientname.OU";
+      push @errors,$errorfile;
+      push @outs,$outfile;
+    }
+    my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile";
+    push @cmds,$todo;
+
+    print STDERR "Running: $todo\n";
+    local(*QOUT, *QIN);
+    open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!";
+    print QIN $script;  # the client script is fed to the submit command on its stdin
+    close QIN;
+    while (my $jobid=<QOUT>){
+      chomp $jobid;
+      if ($verbose){ print STDERR "Launched client job: $jobid"; }
+      $jobid =~ s/^(\d+)(.*?)$/$1/g;  # "<digits><anything>" -> digits
+            $jobid =~ s/^Your job (\d+) .*$/$1/;  # "Your job <digits> ..." -> digits
+      print STDERR " short job id $jobid\n";
+            if ($verbose){
+                print STDERR "cd: $abscwd\n";
+                print STDERR "cmd: $cmd\n";
+            }
+      if ($joblist eq "") { $joblist = $jobid; }  # string test: '==' compared numerically and treated any non-numeric list as empty
+      else {$joblist = $joblist . "\|" . $jobid; }  # numof_live_jobs uses $joblist as a regex alternation
+      my $cleanfn="qdel $jobid 2> /dev/null";
+      push(@cleanup_cmds, $cleanfn);
+    }
+    close QOUT;
+}
+
+sub launch_job_fork {
+  my $errorfile = "/dev/null";
+  my $outfile = "/dev/null";
+  $node_count++;
+  my $clientname = $executable;
+  $clientname =~ s/^(.{4}).*$/$1/;
+  $clientname = "$clientname.$node_count";
+  if ($errordir){
+    $errorfile = "$errordir/$clientname.ER";
+    $outfile = "$errordir/$clientname.OU";
+    push @errors,$errorfile;
+    push @outs,$outfile;
+  }
+  my $pid = fork;
+  if ($pid == 0) {
+    my ($fh, $scr_name) = get_temp_script();
+    print $fh $script;
+    close $fh;
+    my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
+    print STDERR "EXEC: $todo\n";
+    my $out = check_output("$todo");
+    unlink $scr_name or warn "Failed to remove $scr_name";
+    exit 0;
+  } else {
+    push @pids, $pid;
+  }
+}
+
+sub get_temp_script {
+  my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh');
+  return ($fh, $filename);
+}
+
+sub cleanup_and_die {
+  cleanup();
+  die "\n";
+}
+
+sub cleanup {  # run the recorded cleanup commands, then print previews of the client logs
+  print STDERR "Cleaning up...\n";
+  for $cmd (@cleanup_cmds){
+    print STDERR "  Cleanup command: $cmd\n";
+    system($cmd);  # these are shell commands (qdel ...); eval would parse them as Perl and fail silently. Best effort: exit status ignored
+  }
+  print STDERR "outputs:\n",preview_files(\@outs,1),"\n";
+  print STDERR "errors:\n",preview_files(\@errors,1),"\n";
+  print STDERR "cmd:\n",$cmd,"\n";
+  print STDERR " cat $errordir/*.ER\nfor logs.\n";
+  print STDERR "Cleanup finished.\n";
+}
+
+sub print_help
+{
+  my $name = check_output("basename $0"); chomp $name;
+  print << "Help";
+
+usage: $name [options]
+
+  Automatic black-box parallelization of commands.
+
+options:
+
+  --use-fork
+    Instead of using qsub, use fork.
+
+  -e, --error-dir <dir>
+    Retain output files from jobs in <dir>, rather
+    than silently deleting them.
+
+  -m, --multi-line
+    Expect that command may produce multiple output
+    lines for a single input line.  $name makes a
+    reasonable attempt to obtain all output before
+    processing additional inputs.  However, use of this
+    option is inherently unsafe.
+
+  -v, --verbose
+    Print diagnostic informatoin on stderr.
+
+  -j, --jobs
+    Number of jobs to use.
+
+  -p, --pmem
+    pmem setting for each job.
+
+Help
+}
diff --git a/training/utils/risk.cc b/training/utils/risk.cc
new file mode 100644
index 00000000..d5a12cfd
--- /dev/null
+++ b/training/utils/risk.cc
@@ -0,0 +1,45 @@
+#include "risk.h"
+
+#include "prob.h"
+#include "candidate_set.h"
+#include "ns.h"
+
+using namespace std;
+
+namespace training {
+
+// g = \sum_e p(e|f) * loss(e) * (phi(e,f) - E[phi(e,f)])
+double CandidateSetRisk::operator()(const vector<double>& params,  // returns the risk; fills *g with its gradient when g is non-NULL
+                                    SparseVector<double>* g) const {
+  prob_t z;  // normalizer: sum over all candidates
+  for (unsigned i = 0; i < cands_.size(); ++i) {
+    const prob_t u(cands_[i].fmap.dot(params), init_lnx());  // NOTE(review): presumably builds u from a log-domain score; confirm in prob.h
+    z += u;
+  }
+  const double log_z = log(z);
+
+  SparseVector<double> exp_feats;  // sum of fmap * prob over candidates; only computed when a gradient is requested
+  if (g) {
+    for (unsigned i = 0; i < cands_.size(); ++i) {
+      const double log_prob = cands_[i].fmap.dot(params) - log_z;
+      const double prob = exp(log_prob);
+      exp_feats += cands_[i].fmap * prob;
+    }
+  }
+
+  double risk = 0;
+  for (unsigned i = 0; i < cands_.size(); ++i) {
+    const double log_prob = cands_[i].fmap.dot(params) - log_z;
+    const double prob = exp(log_prob);
+    const double cost = metric_.IsErrorMetric() ? metric_.ComputeScore(cands_[i].eval_feats)  // error metric: the score is the cost
+                                                : 1.0 - metric_.ComputeScore(cands_[i].eval_feats);  // otherwise: cost = 1 - score
+    const double r = prob * cost;
+    risk += r;
+    if (g) (*g) += (cands_[i].fmap - exp_feats) * r;  // accumulated into *g; the caller's existing contents are not cleared
+  }
+  return risk;
+}
+
+}
+
+
diff --git a/training/utils/risk.h b/training/utils/risk.h
new file mode 100644
index 00000000..2e8db0fb
--- /dev/null
+++ b/training/utils/risk.h
@@ -0,0 +1,26 @@
+#ifndef _RISK_H_
+#define _RISK_H_
+
+#include <vector>
+#include "sparse_vector.h"
+class EvaluationMetric;
+
+namespace training {
+  class CandidateSet;
+
+  class CandidateSetRisk {
+   public:
+    explicit CandidateSetRisk(const CandidateSet& cs, const EvaluationMetric& metric) :
+       cands_(cs),
+       metric_(metric) {}
+    // compute the risk (expected loss) of a CandidateSet
+    // (optional) the gradient of the risk with respect to params
+    double operator()(const std::vector<double>& params,
+                      SparseVector<double>* g = NULL) const;
+   private:
+    const CandidateSet& cands_;
+    const EvaluationMetric& metric_;
+  };
+};
+
+#endif
diff --git a/training/utils/sentclient.c b/training/utils/sentclient.c
new file mode 100644
index 00000000..91d994ab
--- /dev/null
+++ b/training/utils/sentclient.c
@@ -0,0 +1,76 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "sentserver.h"
+
+int main (int argc, char *argv[]) {  /* connect to host[:port[:key]], then exec the command with stdin/stdout on the socket */
+  int sock, port;
+  char *s, *key;
+  struct hostent *hp;
+  struct sockaddr_in server;
+  int errors = 0;
+
+  if (argc < 3) {
+    fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n");
+    exit(1);
+  }
+
+  s = strchr(argv[1], ':');
+  key = NULL;
+
+  if (s == NULL) {
+    port = DEFAULT_PORT;
+  } else {
+    *s = '\0';  /* cut argv[1] down to the host name */
+    s+=1;
+    /* dumb hack */
+    key = strchr(s, ':');
+    if (key != NULL){
+      *key = '\0';  /* cut the port text off from the key */
+      key += 1;
+    }
+    port = atoi(s);
+  }
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  if (sock < 0) { perror("socket()"); exit(1); }  /* without a socket, connect() below could only fail */
+  hp = gethostbyname(argv[1]);
+  if (hp == NULL) {
+    fprintf(stderr, "unknown host %s\n", argv[1]);
+    exit(1);
+  }
+
+  bzero((char *)&server, sizeof(server));
+  bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+  server.sin_family = hp->h_addrtype;
+  server.sin_port = htons(port);
+
+  while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
+    perror("connect()");
+    sleep(1);
+    errors++;
+    if (errors > 5)  /* give up after six failed attempts */
+      exit(1);
+  }
+
+  close(0); close(1);
+  dup2(sock, 0);  /* the command reads its input from the server ... */
+  dup2(sock, 1);  /* ... and writes its output back to it */
+
+  if (key != NULL){
+    write(1, key, strlen(key));  /* the key line goes to the server before any command output */
+    write(1, "\n", 1);
+  }
+
+  execvp(argv[2], argv+2);
+  perror("execvp()");  /* execvp only returns when it failed */
+  return 1;            /* report the failure instead of exiting 0 */
+}
diff --git a/training/utils/sentserver.c b/training/utils/sentserver.c
new file mode 100644
index 00000000..c20b4fa6
--- /dev/null
+++ b/training/utils/sentserver.c
@@ -0,0 +1,515 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
+
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "sentserver.h"
+
+#define MAX_CLIENTS 64
+
+struct clientinfo {
+  int s;
+  struct sockaddr_in sin;
+};
+
+struct line {
+  int id;
+  char *s;
+  int status;
+  struct line *next;
+} *head, **ptail;
+
+int n_sent = 0, n_received=0, n_flushed=0;
+
+#define STATUS_RUNNING 0
+#define STATUS_ABORTED 1
+#define STATUS_FINISHED 2
+
+pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int n_clients = 0;
+int s;
+int expect_multiline_output = 0;
+int log_mutex = 0;
+int stay_alive = 0;		/* dont panic and die with zero clients */
+
+void queue_finish(struct line *node, char *s, int fid);
+char * read_line(int fd, int multiline);
+void done (int code);
+
+struct line * queue_get(int fid) {  /* next unit of work: an aborted line if any, else a new input line; NULL when input is exhausted */
+	struct line *cur;
+	char *s, *synch;
+
+	if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid);
+	if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+	pthread_mutex_lock(&queue_mutex);
+
+	/* First, check for aborted sentences. */
+
+	if (log_mutex) fprintf(stderr, "  Checking queue for aborted jobs (fid %d)\n", fid);
+	for (cur = head; cur != NULL; cur = cur->next) {
+		if (cur->status == STATUS_ABORTED) {
+			cur->status = STATUS_RUNNING;
+
+			if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+			pthread_mutex_unlock(&queue_mutex);
+
+			return cur;
+		}
+	}
+	if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+	pthread_mutex_unlock(&queue_mutex);
+
+	/* Otherwise, read a new one. */
+	if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+	if (log_mutex) fprintf(stderr, "  Reading input for new data (fid %d)\n", fid);
+	pthread_mutex_lock(&input_mutex);
+	s = read_line(0,0);
+
+	while (s) {
+		if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+		pthread_mutex_lock(&queue_mutex);
+		if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+		pthread_mutex_unlock(&input_mutex);
+
+		cur = malloc(sizeof (struct line));
+		cur->id = n_sent;
+		cur->s = s;
+		cur->next = NULL;
+		cur->status = STATUS_RUNNING; /* set before the node is linked in: other threads scan status under queue_mutex */
+
+		*ptail = cur;
+		ptail = &cur->next;
+
+		n_sent++;
+
+		if (strcmp(s,"===SYNCH===\n")==0){
+			fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid);
+			// Note: queue_finish calls free(cur->s).
+			// Therefore we need to create a new string here.
+			synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char));
+			synch = strcpy(synch, s);
+
+			if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+			pthread_mutex_unlock(&queue_mutex);
+			queue_finish(cur, synch, fid); /* handles its own lock */
+
+			if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+			if (log_mutex) fprintf(stderr, "  Reading input for new data (fid %d)\n", fid);
+			pthread_mutex_lock(&input_mutex);
+
+			s = read_line(0,0);
+		} else {
+			if (log_mutex) fprintf(stderr, "  Received new data %d (fid %d)\n", cur->id, fid);
+			if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+			pthread_mutex_unlock(&queue_mutex);
+			return cur;
+		}
+	}
+
+	if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+	pthread_mutex_unlock(&input_mutex);
+	/* Only way to reach this point: no more input */
+
+	if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+	pthread_mutex_lock(&queue_mutex);
+	if (head == NULL) {
+		fprintf(stderr, "Reached end of file. Exiting.\n");
+		done(0);
+	} else
+		ptail = NULL; /* This serves as a signal that there is no more input */
+	if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+	pthread_mutex_unlock(&queue_mutex);
+
+	return NULL;
+}
+
+void queue_panic() {  /* flush the leading run of finished lines, close stdout, then done(1) */
+	struct line *next;
+	while (head && head->status == STATUS_FINISHED) {
+		/* Write out finished sentences */
+		if (head->status == STATUS_FINISHED) {
+			fputs(head->s, stdout);
+			fflush(stdout);
+		}
+		/* Write out blank line for unfinished sentences. NOTE(review): the loop condition only admits FINISHED nodes, so this branch cannot run as written. */
+		if (head->status == STATUS_ABORTED) {
+			fputs("\n", stdout);
+			fflush(stdout);
+		}
+		/* By definition, there cannot be any RUNNING sentences, since
+		function is only called when n_clients == 0 */
+		free(head->s);
+		next = head->next;
+		free(head);
+		head = next;
+		n_flushed++;
+	}
+	fclose(stdout);
+	fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n");
+	done(1);
+}
+
+void queue_abort(struct line *node, int fid) {  /* mark node ABORTED so that a later queue_get() hands it out again */
+	if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+	pthread_mutex_lock(&queue_mutex);
+	node->status = STATUS_ABORTED;
+	if (n_clients == 0) {  /* no client is left to pick the line up */
+		if (stay_alive) {
+			fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n");
+		} else {
+			queue_panic();  /* called with queue_mutex still held; it ends in done(1) */
+		}
+	}
+	if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+	pthread_mutex_unlock(&queue_mutex);
+}
+
+
+void queue_print() {
+  struct line *cur;
+
+  fprintf(stderr, "  Queue\n");
+
+  for (cur = head; cur != NULL; cur = cur->next) {
+    switch(cur->status) {
+    case STATUS_RUNNING:
+      fprintf(stderr, "    %d running  ", cur->id); break;
+    case STATUS_ABORTED:
+      fprintf(stderr, "    %d aborted  ", cur->id); break;
+    case STATUS_FINISHED:
+      fprintf(stderr, "    %d finished ", cur->id); break;
+
+    }
+	fprintf(stderr, "\n");
+    //fprintf(stderr, cur->s);
+  }
+}
+
+void queue_finish(struct line *node, char *s, int fid) {
+  struct line *next;
+  if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+  pthread_mutex_lock(&queue_mutex);
+  /* Store the result: node takes ownership of s, its previous string is freed. */
+  free(node->s);
+  node->s = s;
+  node->status = STATUS_FINISHED;
+  n_received++;
+
+  /* Flush out finished nodes */
+  while (head && head->status == STATUS_FINISHED) {
+    /* Only the leading run of finished nodes is written, so output keeps input order. */
+    if (log_mutex) fprintf(stderr, "  Flushing finished node %d\n", head->id);
+
+    fputs(head->s, stdout);
+    fflush(stdout);
+    if (log_mutex) fprintf(stderr, "  Flushed node %d\n", head->id);
+    free(head->s);
+
+    next = head->next;
+    free(head);
+
+    head = next;
+
+    n_flushed++;
+
+    if (head == NULL) { /* empty queue */
+      if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */
+        fprintf(stderr, "All sentences finished. Exiting.\n");
+        done(0);
+      } else /* ptail pointed at something which was just popped off the stack -- reset to head*/
+        ptail = &head;
+    }
+  }
+
+  if (log_mutex && head) fprintf(stderr, "  Flushing output %d\n", head->id); /* head is NULL when the loop emptied the queue */
+  fflush(stdout);
+  fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed);
+
+  if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+  pthread_mutex_unlock(&queue_mutex);
+
+}
+
+char * read_line(int fd, int multiline) { /* returns a malloc'd, '\n'-terminated string (caller frees), or NULL on EOF with no data, repeated read errors, or out of memory */
+  int size = 80;
+  char errorbuf[100];
+  char *s = malloc(size+2); /* +2 leaves room for the '\n' and '\0' appended at the end */
+  int result, errors=0;
+  int i = 0;
+  if (s == NULL) return NULL; /* out of memory */
+  result = read(fd, s+i, 1);
+
+  while (1) {
+    if (result < 0) {
+      perror("read()");
+      snprintf(errorbuf, sizeof errorbuf, "Error code: %d\n", errno);
+      fputs(errorbuf, stderr); /* a buffer must never be used as the format string */
+      errors++;
+      if (errors > 5) {
+	free(s);
+	return NULL;
+      } else {
+	sleep(1); /* retry after delay */
+      }
+    } else if (result == 0) {
+      break;
+    } else if (multiline==0 && s[i] == '\n') {
+      break;
+    } else {
+      if (s[i] == '\n'){
+	/* if we've reached this point,
+	   then multiline must be 1, and we're
+	   going to poll the fd for an additional
+	   line of data.  The basic design is to
+	   run a select on the filedescriptor fd.
+	   Select will return under two conditions:
+	   if there is data on the fd, or if a
+	   timeout is reached.  We'll select on this
+	   fd.  If select returns because there's data
+	   ready, keep going; else assume there's no
+	   more and return the data we already have.
+	*/
+
+	fd_set set;
+	FD_ZERO(&set);
+	FD_SET(fd, &set);
+
+	struct timeval timeout;
+	timeout.tv_sec = 3; // number of seconds for timeout
+	timeout.tv_usec = 0;
+
+	int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+	if (ready<1){
+	  break; // no more data, stop looping
+	}
+      }
+      i++;
+
+      if (i == size) {
+	char *grown = realloc(s, size*2+2); /* keep s until the resize is known to have worked */
+	if (grown == NULL) { free(s); return NULL; } else { s = grown; size = size*2; }
+      }
+    }
+
+    result = read(fd, s+i, 1);
+  }
+
+  if (result == 0 && i == 0) { /* end of file */
+    free(s);
+    return NULL;
+  }
+
+  s[i] = '\n'; /* the result always ends in exactly one newline, even if input ended without one */
+  s[i+1] = '\0';
+
+  return s;
+}
+
+void * new_client(void *arg) { /* per-client thread body; arg is a heap-allocated struct clientinfo that this thread frees */
+  struct clientinfo *client = (struct clientinfo *)arg;
+  struct line *cur;
+  int result;
+  char *s;
+  char errorbuf[100];
+
+  pthread_mutex_lock(&clients_mutex);
+  n_clients++;
+  pthread_mutex_unlock(&clients_mutex);
+
+  fprintf(stderr, "Client connected (%d connected)\n", n_clients);
+
+  for (;;) {
+
+    cur = queue_get(client->s);
+
+    if (cur) {
+      fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s);
+      result = write(client->s, cur->s, strlen(cur->s));
+      if (result < 0 || (size_t)result < strlen(cur->s)){ /* test the sign first: -1 converted to size_t is never "less than" the length */
+        perror("write()");
+        snprintf(errorbuf, sizeof errorbuf, "Error code: %d\n", errno);
+        fputs(errorbuf, stderr); /* a buffer must never be used as the format string */
+
+        pthread_mutex_lock(&clients_mutex);
+        n_clients--;
+        pthread_mutex_unlock(&clients_mutex);
+
+        fprintf(stderr, "Client died (%d connected)\n", n_clients);
+        queue_abort(cur, client->s);
+
+        close(client->s);
+        free(client);
+
+        pthread_exit(NULL);
+      }
+    } else { /* queue_get returned no line: dismiss this client */
+      close(client->s);
+      free(client); /* this path used to leak the clientinfo */
+      pthread_mutex_lock(&clients_mutex);
+      n_clients--;
+      pthread_mutex_unlock(&clients_mutex);
+      fprintf(stderr, "Client dismissed (%d connected)\n", n_clients);
+      pthread_exit(NULL);
+    }
+
+    s = read_line(client->s,expect_multiline_output);
+    if (s) {
+      /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */
+      fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id);
+//      queue_print();
+      queue_finish(cur, s, client->s);
+    } else { /* no reply could be read: treat the client as dead and abort its line */
+      pthread_mutex_lock(&clients_mutex);
+      n_clients--;
+      pthread_mutex_unlock(&clients_mutex);
+
+      fprintf(stderr, "Client died (%d connected)\n", n_clients);
+      queue_abort(cur, client->s);
+
+      close(client->s);
+      free(client);
+
+      pthread_exit(NULL);
+    }
+
+  }
+  return 0;
+}
+
+void done (int code) { /* close the global socket s (assigned from socket() in main) and exit with the given status */
+  close(s);
+  exit(code);
+}
+
+
+
+int main (int argc, char *argv[]) { /* usage: [-m] [-k key] [--stay-alive] [port]; accepts clients forever, one thread each */
+  struct sockaddr_in sin, from;
+  int g;
+  socklen_t len;
+  struct clientinfo *client;
+  int port;
+  int opt;
+  int errors = 0;
+  int argi;
+  char *key = NULL, *client_key;
+  int use_key = 0;
+  /* the key stuff here doesn't provide any
+  real measure of security, it's mainly to keep
+  jobs from bumping into each other.  */
+
+  pthread_t tid;
+  port = DEFAULT_PORT;
+
+  for (argi=1; argi < argc; argi++){
+    if (strcmp(argv[argi], "-m")==0){
+      expect_multiline_output = 1;
+    } else if (strcmp(argv[argi], "-k")==0){
+      argi++;
+      if (argi == argc){
+      	fprintf(stderr, "Key must be specified after -k\n");
+      	exit(1);
+      }
+      key = argv[argi];
+      use_key = 1;
+    } else if (strcmp(argv[argi], "--stay-alive")==0){
+      stay_alive = 1;    /* dont panic and die with zero clients */
+    } else { /* any other argument is taken as the port number */
+      port = atoi(argv[argi]);
+    }
+  }
+
+  /* Initialize data structures */
+  head = NULL;
+  ptail = &head;
+
+  /* Set up listener */
+  s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+  opt = 1;
+  setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+  sin.sin_family = AF_INET;
+  sin.sin_addr.s_addr = htonl(INADDR_ANY);
+  sin.sin_port = htons(port);
+  while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { /* retry once per second, give up after 100 failures */
+	perror("bind()");
+	sleep(1);
+	errors++;
+	if (errors > 100)
+	  exit(1);
+  }
+
+  len = sizeof(sin);
+  getsockname(s, (struct sockaddr *) &sin, &len);
+
+  fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port));
+
+  while (listen(s, MAX_CLIENTS) < 0) { /* shares the errors counter with the bind loop above */
+	perror("listen()");
+	sleep(1);
+	errors++;
+	if (errors > 100)
+	  exit(1);
+  }
+
+  for (;;) {
+    len = sizeof(from);
+    g = accept(s, (struct sockaddr *)&from, &len);
+    if (g < 0) {
+      perror("accept()");
+      sleep(1);
+      continue;
+    }
+    client = malloc(sizeof(struct clientinfo)); /* handed to new_client, which frees it; freed here if the key check fails */
+    client->s = g;
+    bcopy(&from, &client->sin, len);
+
+	if (use_key){
+		fd_set set;
+		FD_ZERO(&set);
+		FD_SET(client->s, &set);
+
+		struct timeval timeout;
+		timeout.tv_sec = 3; // number of seconds for timeout
+		timeout.tv_usec = 0;
+
+		int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+		if (ready<1){
+			fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+			close(client->s);
+			free(client);
+		} else {
+			client_key = read_line(client->s,0); /* NULL if the peer closed without sending anything or reads kept failing */
+			if (client_key) client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */
+			if (client_key && strcmp(key, client_key)==0){
+				pthread_create(&tid, NULL, new_client, client);
+			} else { /* wrong key, or no key could be read at all */
+				fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+				close(client->s);
+				free(client);
+			}
+			free(client_key); /* free(NULL) is a no-op */
+		}
+	} else {
+		pthread_create(&tid, NULL, new_client, client); /* NOTE(review): tid is never joined or detached in this function */
+	}
+  }
+
+}
+
+
+
diff --git a/training/utils/sentserver.h b/training/utils/sentserver.h
new file mode 100644
index 00000000..cd17a546
--- /dev/null
+++ b/training/utils/sentserver.h
@@ -0,0 +1,6 @@
+#ifndef SENTSERVER_H
+#define SENTSERVER_H
+
+#define DEFAULT_PORT 50000
+
+#endif
-- 
cgit v1.2.3


From fbe3415e35f7c16177314422a4f4e146bb4d808f Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Sun, 18 Nov 2012 13:44:44 -0500
Subject: forgotten makefile

---
 .gitignore                 |  1 -
 training/utils/Makefile.am | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 training/utils/Makefile.am

(limited to 'training')

diff --git a/.gitignore b/.gitignore
index c6023822..56372ad4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -197,7 +197,6 @@ training/mira/kbest_mira
 training/pro/mr_pro_map
 training/pro/mr_pro_reduce
 training/rampion/rampion_cccp
-training/utils/Makefile.am
 training/utils/lbfgs_test
 training/utils/optimize_test
 training/utils/sentclient
diff --git a/training/utils/Makefile.am b/training/utils/Makefile.am
new file mode 100644
index 00000000..189d9a76
--- /dev/null
+++ b/training/utils/Makefile.am
@@ -0,0 +1,33 @@
+noinst_LIBRARIES = libtraining_utils.a
+
+bin_PROGRAMS = \
+  sentserver \
+  sentclient
+
+noinst_PROGRAMS = \
+  lbfgs_test \
+  optimize_test
+
+sentserver_SOURCES = sentserver.c
+sentserver_LDFLAGS = -pthread
+
+sentclient_SOURCES = sentclient.c
+sentclient_LDFLAGS = -pthread
+
+TESTS = lbfgs_test optimize_test
+
+libtraining_utils_a_SOURCES = \
+  candidate_set.cc \
+  entropy.cc \
+  optimize.cc \
+  online_optimizer.cc \
+  risk.cc
+
+optimize_test_SOURCES = optimize_test.cc
+optimize_test_LDADD = libtraining_utils.a $(top_srcdir)/utils/libutils.a -lz
+
+lbfgs_test_SOURCES = lbfgs_test.cc
+lbfgs_test_LDADD = $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I$(top_srcdir)/klm
+
-- 
cgit v1.2.3


From 128563db1ed56e2b94a28d0a6670cc7faf6dce7b Mon Sep 17 00:00:00 2001
From: Patrick Simianer <simianer@cl.uni-heidelberg.de>
Date: Mon, 19 Nov 2012 11:38:45 +0100
Subject: merge upstream/master

---
 training/dtrain/parallelize/test/cdec.ini        | 22 ----------------------
 training/dtrain/parallelize/test/dtrain.ini      | 15 ---------------
 training/dtrain/parallelize/test/in              | 10 ----------
 training/dtrain/parallelize/test/refs            | 10 ----------
 training/dtrain/test/parallelize/test/cdec.ini   | 22 ++++++++++++++++++++++
 training/dtrain/test/parallelize/test/dtrain.ini | 15 +++++++++++++++
 training/dtrain/test/parallelize/test/in         | 10 ++++++++++
 training/dtrain/test/parallelize/test/refs       | 10 ++++++++++
 8 files changed, 57 insertions(+), 57 deletions(-)
 delete mode 100644 training/dtrain/parallelize/test/cdec.ini
 delete mode 100644 training/dtrain/parallelize/test/dtrain.ini
 delete mode 100644 training/dtrain/parallelize/test/in
 delete mode 100644 training/dtrain/parallelize/test/refs
 create mode 100644 training/dtrain/test/parallelize/test/cdec.ini
 create mode 100644 training/dtrain/test/parallelize/test/dtrain.ini
 create mode 100644 training/dtrain/test/parallelize/test/in
 create mode 100644 training/dtrain/test/parallelize/test/refs

(limited to 'training')

diff --git a/training/dtrain/parallelize/test/cdec.ini b/training/dtrain/parallelize/test/cdec.ini
deleted file mode 100644
index 72e99dc5..00000000
--- a/training/dtrain/parallelize/test/cdec.ini
+++ /dev/null
@@ -1,22 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-intersection_strategy=cube_pruning
-cubepruning_pop_limit=200
-scfg_max_span_limit=15
-feature_function=WordPenalty
-feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
-#feature_function=ArityPenalty
-#feature_function=CMR2008ReorderingFeatures
-#feature_function=Dwarf
-#feature_function=InputIndicator
-#feature_function=LexNullJump
-#feature_function=NewJump
-#feature_function=NgramFeatures
-#feature_function=NonLatinCount
-#feature_function=OutputIndicator
-#feature_function=RuleIdentityFeatures
-#feature_function=RuleNgramFeatures
-#feature_function=RuleShape
-#feature_function=SourceSpanSizeFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SpanFeatures
diff --git a/training/dtrain/parallelize/test/dtrain.ini b/training/dtrain/parallelize/test/dtrain.ini
deleted file mode 100644
index 03f9d240..00000000
--- a/training/dtrain/parallelize/test/dtrain.ini
+++ /dev/null
@@ -1,15 +0,0 @@
-k=100
-N=4
-learning_rate=0.0001
-gamma=0
-loss_margin=0
-epochs=1
-scorer=stupid_bleu
-sample_from=kbest
-filter=uniq
-pair_sampling=XYX
-hi_lo=0.1
-select_weights=last
-print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
-tmp=/tmp
-decoder_config=cdec.ini
diff --git a/training/dtrain/parallelize/test/in b/training/dtrain/parallelize/test/in
deleted file mode 100644
index a312809f..00000000
--- a/training/dtrain/parallelize/test/in
+++ /dev/null
@@ -1,10 +0,0 @@
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/training/dtrain/parallelize/test/refs b/training/dtrain/parallelize/test/refs
deleted file mode 100644
index 4d3128cb..00000000
--- a/training/dtrain/parallelize/test/refs
+++ /dev/null
@@ -1,10 +0,0 @@
-barack obama becomes the fourth american president to receive the nobel peace prize
-the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
-he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
-the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
-first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
-the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
-then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
-he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
-the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
-the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
diff --git a/training/dtrain/test/parallelize/test/cdec.ini b/training/dtrain/test/parallelize/test/cdec.ini
new file mode 100644
index 00000000..72e99dc5
--- /dev/null
+++ b/training/dtrain/test/parallelize/test/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/test/parallelize/test/dtrain.ini b/training/dtrain/test/parallelize/test/dtrain.ini
new file mode 100644
index 00000000..03f9d240
--- /dev/null
+++ b/training/dtrain/test/parallelize/test/dtrain.ini
@@ -0,0 +1,15 @@
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+loss_margin=0
+epochs=1
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=last
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+decoder_config=cdec.ini
diff --git a/training/dtrain/test/parallelize/test/in b/training/dtrain/test/parallelize/test/in
new file mode 100644
index 00000000..a312809f
--- /dev/null
+++ b/training/dtrain/test/parallelize/test/in
@@ -0,0 +1,10 @@
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
+<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/training/dtrain/test/parallelize/test/refs b/training/dtrain/test/parallelize/test/refs
new file mode 100644
index 00000000..4d3128cb
--- /dev/null
+++ b/training/dtrain/test/parallelize/test/refs
@@ -0,0 +1,10 @@
+barack obama becomes the fourth american president to receive the nobel peace prize
+the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
+he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
+the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
+first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
+the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
+he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
+the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
+the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
-- 
cgit v1.2.3


From 655aac48c50d6c1da4a7b2f15034ec9f9bbbc0d3 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <simianer@cl.uni-heidelberg.de>
Date: Mon, 19 Nov 2012 11:38:58 +0100
Subject: merge upstream/master

---
 training/dtrain/parallelize.rb | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'training')

diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 1d277ff6..eb4148f5 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -7,9 +7,10 @@ if ARGV.size != 5
   exit
 end
 
-dtrain_bin = '/home/pks/bin/dtrain_local'
+cdec_dir   = '/path/to/cdec_dir'
+dtrain_bin = "#{cdec_dir}/training/dtrain/dtrain_local"
 ruby       = '/usr/bin/ruby'
-lplp_rb    = '/home/pks/mt/cdec-dtrain/dtrain/hstreaming/lplp.rb'
+lplp_rb    = "#{cdec_dir}/training/dtrain/hstreaming/lplp.rb"
 lplp_args  = 'l2 select_k 100000'
 gzip       = '/bin/gzip'
 
-- 
cgit v1.2.3


From 6802ac200ef614b4935d597ed4cfc3857c1f6c06 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Tue, 20 Nov 2012 13:56:08 -0500
Subject: fixes for 2011 optimizer

---
 training/crf/mpi_online_optimize.cc       | 12 +++++++++++-
 word-aligner/aligner.pl                   | 19 ++++++++++++++++++-
 word-aligner/makefiles/makefile.grammars  |  2 +-
 word-aligner/makefiles/makefile.model.f-e | 14 ++++++++++++++
 4 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 word-aligner/makefiles/makefile.model.f-e

(limited to 'training')

diff --git a/training/crf/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc
index d6968848..9e1ae34c 100644
--- a/training/crf/mpi_online_optimize.cc
+++ b/training/crf/mpi_online_optimize.cc
@@ -5,6 +5,7 @@
 #include <cassert>
 #include <cmath>
 #include <tr1/memory>
+#include <ctime>
 
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
@@ -41,6 +42,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
         ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
         ("optimization_method,m", po::value<string>()->default_value("sgd"), "Optimization method (sgd)")
+        ("max_walltime", po::value<unsigned>(), "Maximum walltime to run (in minutes)")
         ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
         ("eta_0,e", po::value<double>()->default_value(0.2), "Initial learning rate for SGD (eta_0)")
         ("L1,1","Use L1 regularization")
@@ -304,6 +306,9 @@ int main(int argc, char** argv) {
   int write_weights_every_ith = 100; // TODO configure
   int titer = -1;
 
+  unsigned timeout = 0;
+  if (conf.count("max_walltime")) timeout = 60 * conf["max_walltime"].as<unsigned>();
+  const time_t start_time = time(NULL);
   for (int ai = 0; ai < agenda.size(); ++ai) {
     const string& cur_config = agenda[ai].first;
     const unsigned max_iteration = agenda[ai].second;
@@ -336,9 +341,14 @@ int main(int argc, char** argv) {
           ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
           fname = o.str();
         }
+        const time_t cur_time = time(NULL);
+        if (timeout) {
+          if ((cur_time - start_time) > timeout) converged = true;
+        }
         if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
         ostringstream vv;
-        vv << "total iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
+        double minutes = (cur_time - start_time) / 60.0;
+        vv << "total walltime=" << minutes << "min iter=" << titer << " (of current config iter=" << iter << ")  minibatch=" << size_per_proc << " sentences/proc x " << size << " procs.   num_feats=" << x.size() << '/' << FD::NumFeats() << "   passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << "   eta=" << lr->eta(titer);
         const string svv = vv.str();
         cerr << svv << endl;
         Weights::WriteToFile(fname, lambdas, true, &svv);
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index c5078645..cbccb94a 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -51,6 +51,8 @@ while(<IN>) {
   chomp;
   my ($f, $e) = split / \|\|\| /;
   die "Bad format, excepted ||| separated line" unless defined $f && defined $e;
+  $f =~ s/\[/(/g;
+  $e =~ s/\]/)/g;
   print F "$f\n";
   print E "$e\n";
 }
@@ -80,6 +82,11 @@ NCLASSES = $num_classes
 TARGETS = @targets
 PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
 PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
+#MPIJOBS = 4
+#MPIRUN = mpirun -np $(MPIJOBS)
+MPIRUN=
+
+WALLTIME=90
 
 export
 
@@ -99,7 +106,15 @@ clean:
 EOT
 close TOPLEVEL;
 
-print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
+print STDERR <<EOT;
+Created alignment task. To start, run:
+cd talign/
+make
+
+To specify the walltime *in minutes* used by the optimizer, use
+make WALLTIME=120
+
+EOT
 exit 0;
 
 sub make_stage {
@@ -142,6 +157,8 @@ EOT
   open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!";
   print AGENDA "cdec.ini $TRAINING_ITERATIONS\n";
   close AGENDA;
+  `cp $SCRIPT_DIR/makefiles/makefile.model.$direction $stage_dir/Makefile`;
+  die unless $? == 0;
 }
 
 sub usage {
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index ce3e1638..8d3ea8cb 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -4,7 +4,7 @@ clean:
 	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs*
 
 SUPPORT_DIR = $(SCRIPT_DIR)/support
-GZIP = /usr/bin/gzip
+GZIP = gzip
 ZCAT = zcat
 EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
 EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
diff --git a/word-aligner/makefiles/makefile.model.f-e b/word-aligner/makefiles/makefile.model.f-e
new file mode 100644
index 00000000..404f5b30
--- /dev/null
+++ b/word-aligner/makefiles/makefile.model.f-e
@@ -0,0 +1,14 @@
+all: output.f-e.aligned
+
+clean:
+	$(RM) output.f-e.aligned weights.cur.gz
+
+CDEC = $(SCRIPT_DIR)/../decoder/cdec
+OPTIMIZE = $(SCRIPT_DIR)/../training/crf/mpi_online_optimize
+WALLTIME ?= 90
+
+weights.cur.gz: ../grammars/wordpairs.f-e.features.gz # optimizer run, limited to WALLTIME minutes (override with make WALLTIME=n)
+	$(MPIRUN) $(OPTIMIZE) -a agenda.txt -1 -C 1.0 -t ../grammars/corpus.f-e --max_walltime $(WALLTIME)
+
+output.f-e.aligned: weights.cur.gz # decoder run over the corpus using the weights file
+	$(CDEC) -c cdec.ini -w $< --lextrans_align_only -i ../grammars/corpus.f-e -a > $@
-- 
cgit v1.2.3


From 29a47a94bfc09450802484e5cd3f835d39c9f66c Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Sat, 15 Dec 2012 02:53:56 -0500
Subject: enable kenlm compression

---
 configure.ac                 | 26 ++++++++++++++++++--------
 decoder/Makefile.am          | 11 +++++------
 example_extff/Makefile.am    |  2 +-
 klm/util/have.hh             |  3 +--
 mteval/Makefile.am           |  6 +++---
 python/setup.py.in           |  2 +-
 training/dpmert/Makefile.am  | 10 +++++-----
 training/dtrain/Makefile.am  |  2 +-
 training/minrisk/Makefile.am |  2 +-
 training/mira/Makefile.am    |  2 +-
 training/pro/Makefile.am     |  4 ++--
 training/rampion/Makefile.am |  2 +-
 training/utils/Makefile.am   |  4 ++--
 utils/Makefile.am            | 18 +++++++++---------
 word-aligner/Makefile.am     |  2 +-
 15 files changed, 52 insertions(+), 44 deletions(-)

(limited to 'training')

diff --git a/configure.ac b/configure.ac
index f4650ca4..eabb8645 100644
--- a/configure.ac
+++ b/configure.ac
@@ -18,6 +18,23 @@ BOOST_TEST
 AM_PATH_PYTHON
 AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H))
 AC_CHECK_LIB(dl, dlopen)
+AC_CHECK_HEADERS(zlib.h,
+    AC_CHECK_LIB(z, gzread,[
+        AC_DEFINE(HAVE_ZLIB,[],[Do we have zlib])
+        ZLIBS="$ZLIBS -lz"
+    ]))
+
+AC_CHECK_HEADERS(bzlib.h,
+    AC_CHECK_LIB(bz2, BZ2_bzReadOpen,[
+        AC_DEFINE(HAVE_BZLIB,[],[Do we have bzlib])
+        ZLIBS="$ZLIBS -lbz2"
+    ]))
+
+AC_CHECK_HEADERS(lzma.h,
+    AC_CHECK_LIB(lzma, lzma_code,[
+        AC_DEFINE(HAVE_XZLIB,[],[Do we have lzma])
+        ZLIBS="$ZLIBS -llzma"
+    ]))
 
 AC_ARG_ENABLE(mpi,
  [ --enable-mpi  Build MPI binaries, assumes mpi.h is present ],
@@ -72,19 +89,12 @@ fi
 CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
 LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS $BOOST_SERIALIZATION_LDFLAGS $BOOST_SYSTEM_LDFLAGS"
 # $BOOST_THREAD_LDFLAGS"
-LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_SERIALIZATION_LIBS $BOOST_SYSTEM_LIBS"
+LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS $BOOST_SERIALIZATION_LIBS $BOOST_SYSTEM_LIBS $ZLIBS"
 # $BOOST_THREAD_LIBS"
 
 AC_CHECK_HEADER(google/dense_hash_map,
                [AC_DEFINE([HAVE_SPARSEHASH], [1], [flag for google::dense_hash_map])])
 
-AC_CHECK_HEADER(zlib.h,
-               [AC_DEFINE([HAVE_ZLIB], [1], [zlib])])
-AC_CHECK_HEADER(bzlib.h,
-               [AC_DEFINE([HAVE_BZLIB], [1], [bzlib])])
-AC_CHECK_HEADER(lzma.h,
-               [AC_DEFINE([HAVE_XZLIB], [1], [xzlib])])
-
 AC_PROG_INSTALL
 
 CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6"
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 6914fa0f..88a6116c 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -8,16 +8,16 @@ noinst_PROGRAMS = \
  
 TESTS = trule_test parser_test grammar_test hg_test
 parser_test_SOURCES = parser_test.cc
-parser_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+parser_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
 grammar_test_SOURCES = grammar_test.cc
-grammar_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+grammar_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
 hg_test_SOURCES = hg_test.cc
-hg_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+hg_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
 trule_test_SOURCES = trule_test.cc
-trule_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a -lz
+trule_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) libcdec.a ../mteval/libmteval.a ../utils/libutils.a
 
 cdec_SOURCES = cdec.cc
-cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+cdec_LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a
 
 AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm
 
@@ -82,4 +82,3 @@ libcdec_a_SOURCES = \
   JSON_parser.c \
   json_parse.cc \
   grammar.cc
-
diff --git a/example_extff/Makefile.am b/example_extff/Makefile.am
index ac2694ca..7b7c34b5 100644
--- a/example_extff/Makefile.am
+++ b/example_extff/Makefile.am
@@ -1,4 +1,4 @@
-AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils -I../klm -I../decoder
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I.. -I../mteval -I../utils -I../klm -I../decoder
 
 lib_LTLIBRARIES = libff_example.la
 libff_example_la_SOURCES = ff_example.cc
diff --git a/klm/util/have.hh b/klm/util/have.hh
index b86ba11e..85b838e4 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -11,8 +11,7 @@
 #endif
 
 #ifdef HAVE_CONFIG_H
-// Chris; uncomment this line.  
-//#include "config.h"
+#include "config.h"
 #endif
 
 #endif // UTIL_HAVE__
diff --git a/mteval/Makefile.am b/mteval/Makefile.am
index 5e9bba91..4444285f 100644
--- a/mteval/Makefile.am
+++ b/mteval/Makefile.am
@@ -23,12 +23,12 @@ libmteval_a_SOURCES = \
   ter.cc
 
 fast_score_SOURCES = fast_score.cc
-fast_score_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a -lz
+fast_score_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a
 
 mbr_kbest_SOURCES = mbr_kbest.cc
-mbr_kbest_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mbr_kbest_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a
 
 scorer_test_SOURCES = scorer_test.cc
-scorer_test_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+scorer_test_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 
 AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils
diff --git a/python/setup.py.in b/python/setup.py.in
index dac72903..fa8a9f5e 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -17,7 +17,7 @@ ext_modules = [
         sources=['src/_cdec.cpp'],
         include_dirs=INC,
         library_dirs=LIB,
-        libraries=LIBS + ['z', 'cdec', 'utils', 'mteval', 'training_utils', 'klm', 'klm_util', 'ksearch'],
+        libraries=['cdec', 'utils', 'mteval', 'training_utils', 'klm', 'klm_util', 'ksearch'] + LIBS,
         extra_compile_args=CPPFLAGS,
         extra_link_args=LDFLAGS),
     Extension(name='cdec.sa._sa',
diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am
index ff318bef..3dbdfa69 100644
--- a/training/dpmert/Makefile.am
+++ b/training/dpmert/Makefile.am
@@ -8,18 +8,18 @@ noinst_PROGRAMS = \
 TESTS = lo_test
 
 mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc
-mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
 
 # nbest2hg_SOURCES = nbest2hg.cc
-# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz
+# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst
 
 mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc
-mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
 
 mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc
-mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
 
 lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc
-lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
 
 AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am
index 5b48e756..4f51b0c8 100644
--- a/training/dtrain/Makefile.am
+++ b/training/dtrain/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = dtrain
 
 dtrain_SOURCES = dtrain.cc score.cc
-dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
 
diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am
index a15e821e..821730c2 100644
--- a/training/minrisk/Makefile.am
+++ b/training/minrisk/Makefile.am
@@ -1,6 +1,6 @@
 bin_PROGRAMS = minrisk_optimize
 
 minrisk_optimize_SOURCES = minrisk_optimize.cc
-minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz
+minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a
 
 AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils
diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am
index ae609ede..c8f404fb 100644
--- a/training/mira/Makefile.am
+++ b/training/mira/Makefile.am
@@ -1,6 +1,6 @@
 bin_PROGRAMS = kbest_mira
 
 kbest_mira_SOURCES = kbest_mira.cc
-kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/pro/Makefile.am b/training/pro/Makefile.am
index 1916b6b2..e0a45a33 100644
--- a/training/pro/Makefile.am
+++ b/training/pro/Makefile.am
@@ -3,9 +3,9 @@ bin_PROGRAMS = \
   mr_pro_reduce
 
 mr_pro_map_SOURCES = mr_pro_map.cc
-mr_pro_map_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_map_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
 
 mr_pro_reduce_SOURCES = mr_pro_reduce.cc
-mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training
diff --git a/training/rampion/Makefile.am b/training/rampion/Makefile.am
index 1633d0f7..ef0ca147 100644
--- a/training/rampion/Makefile.am
+++ b/training/rampion/Makefile.am
@@ -1,6 +1,6 @@
 bin_PROGRAMS = rampion_cccp
 
 rampion_cccp_SOURCES = rampion_cccp.cc
-rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
 
 AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils
diff --git a/training/utils/Makefile.am b/training/utils/Makefile.am
index 189d9a76..c9405d4e 100644
--- a/training/utils/Makefile.am
+++ b/training/utils/Makefile.am
@@ -24,10 +24,10 @@ libtraining_utils_a_SOURCES = \
   risk.cc
 
 optimize_test_SOURCES = optimize_test.cc
-optimize_test_LDADD = libtraining_utils.a $(top_srcdir)/utils/libutils.a -lz
+optimize_test_LDADD = libtraining_utils.a $(top_srcdir)/utils/libutils.a
 
 lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/utils/libutils.a -lz
+lbfgs_test_LDADD = $(top_srcdir)/utils/libutils.a
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I$(top_srcdir)/klm
 
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 3ad9d69e..639c30b8 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -33,24 +33,24 @@ if HAVE_CMPH
 endif
 
 reconstruct_weights_SOURCES = reconstruct_weights.cc
-reconstruct_weights_LDADD = libutils.a -lz
+reconstruct_weights_LDADD = libutils.a
 atools_SOURCES = atools.cc
-atools_LDADD = libutils.a -lz
+atools_LDADD = libutils.a
 
 phmt_SOURCES = phmt.cc
-phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 ts_SOURCES = ts.cc
-ts_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+ts_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 m_test_SOURCES = m_test.cc
-m_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+m_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 dict_test_SOURCES = dict_test.cc
-dict_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+dict_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 weights_test_SOURCES = weights_test.cc
-weights_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+weights_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 logval_test_SOURCES = logval_test.cc
-logval_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+logval_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 small_vector_test_SOURCES = small_vector_test.cc
-small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -lz
+small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 
 ################################################################
 # do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am
index 280d3ae7..2dcb688e 100644
--- a/word-aligner/Makefile.am
+++ b/word-aligner/Makefile.am
@@ -1,6 +1,6 @@
 bin_PROGRAMS = fast_align
 
 fast_align_SOURCES = fast_align.cc ttables.cc
-fast_align_LDADD = $(top_srcdir)/utils/libutils.a -lz
+fast_align_LDADD = $(top_srcdir)/utils/libutils.a
 
 AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/training
-- 
cgit v1.2.3


From 201af2acd394415a05072fbd53d42584875aa4b4 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sun, 16 Dec 2012 21:19:36 -0500
Subject: add grammar convert back in

---
 training/utils/Makefile.am | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'training')

diff --git a/training/utils/Makefile.am b/training/utils/Makefile.am
index c9405d4e..d708a9f5 100644
--- a/training/utils/Makefile.am
+++ b/training/utils/Makefile.am
@@ -2,7 +2,8 @@ noinst_LIBRARIES = libtraining_utils.a
 
 bin_PROGRAMS = \
   sentserver \
-  sentclient
+  sentclient \
+  grammar_convert
 
 noinst_PROGRAMS = \
   lbfgs_test \
@@ -26,6 +27,9 @@ libtraining_utils_a_SOURCES = \
 optimize_test_SOURCES = optimize_test.cc
 optimize_test_LDADD = libtraining_utils.a $(top_srcdir)/utils/libutils.a
 
+grammar_convert_SOURCES = grammar_convert.cc
+grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a
+
 lbfgs_test_SOURCES = lbfgs_test.cc
 lbfgs_test_LDADD = $(top_srcdir)/utils/libutils.a
 
-- 
cgit v1.2.3