Merge branch 'master' of https://github.com/redpony/cdec

author: Paul Baltescu <pauldb89@gmail.com> 2013-04-24 17:18:10 +0100
committer: Paul Baltescu <pauldb89@gmail.com> 2013-04-24 17:18:10 +0100
commit: e8b412577b9d3fe2090b9f48443f919cd268c809 (patch)
tree: b46a7b51d365519dfb5170d71bac33be6d3e29b9 /training
parent: d189426a7ea56b71eb6e25ed02a7b0993cfb56a8 (diff)
parent: 5aee54869aa19cfe9be965e67a472e94449d16da (diff)
73 files changed, 2800 insertions, 571 deletions
diff --git a/training/Makefile.am b/training/Makefile.am
index e95e045f..8ef3c939 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -6,6 +6,7 @@ SUBDIRS = \
   dpmert \
   pro \
   dtrain \
+  latent_svm \
   mira \
   rampion
 
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
index 7edabbf1..2ab2f232 100644
--- a/training/dtrain/README.md
+++ b/training/dtrain/README.md
@@ -13,36 +13,18 @@ Builds when building cdec, see ../BUILDING .
 To build only parts needed for dtrain do
 ```
   autoreconf -ifv
-  ./configure [--disable-gtest]
-  cd dtrain/; make
+  ./configure
+  cd training/dtrain/; make
 ```
 
 Running
 -------
-To run this on a dev set locally:
-```
-    #define DTRAIN_LOCAL
-```
-otherwise remove that line or undef, then recompile. You need a single
-grammar file or input annotated with per-sentence grammars (psg) as you
-would use with cdec. Additionally you need to give dtrain a file with
-references (--refs) when running locally.
-
-The input for use with hadoop streaming looks like this:
-```
-    <sid>\t<source>\t<ref>\t<grammar rules separated by \t>
-```
-To convert a psg to this format you need to replace all "\n"
-by "\t". Make sure there are no tabs in your data.
-
-For an example of local usage (with the 'distributed' format)
-the see test/example/ . This expects dtrain to be built without
-DTRAIN_LOCAL.
+See directories under test/ .
 
 Legal
 -----
-Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
+Copyright (c) 2012-2013 by Patrick Simianer <p@simianer.de>
 
-See the file ../LICENSE.txt for the licensing terms that this software is
+See the file LICENSE.txt in the root folder for the licensing terms that this software is
 released under.
 
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 18286668..149f87d4 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -6,15 +6,14 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("input",             po::value<string>()->default_value("-"),                                                   "input file")
+    ("input",             po::value<string>()->default_value("-"),                                             "input file (src)")
+    ("refs,r",            po::value<string>(),                                                                       "references")
     ("output",            po::value<string>()->default_value("-"),                          "output weights file, '-' for STDOUT")
     ("input_weights",     po::value<string>(),                                "input weights file (e.g. from previous iteration)")
     ("decoder_config",    po::value<string>(),                                                      "configuration file for cdec")
     ("print_weights",     po::value<string>(),                                               "weights to print on each iteration")
     ("stop_after",        po::value<unsigned>()->default_value(0),                                 "stop after X input sentences")
-    ("tmp",               po::value<string>()->default_value("/tmp"),                                           "temp dir to use")
     ("keep",              po::value<bool>()->zero_tokens(),                               "keep weights files for each iteration")
-    ("hstreaming",        po::value<string>(),                                   "run in hadoop streaming mode, arg is a task id")
     ("epochs",            po::value<unsigned>()->default_value(10),                               "# of iterations T (per shard)")
     ("k",                 po::value<unsigned>()->default_value(100),                            "how many translations to sample")
     ("sample_from",       po::value<string>()->default_value("kbest"),     "where to sample translations from: 'kbest', 'forest'")
@@ -28,16 +27,13 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("gamma",             po::value<weight_t>()->default_value(0.),                            "gamma for SVM (0 for perceptron)")
     ("select_weights",    po::value<string>()->default_value("last"),     "output best, last, avg weights ('VOID' to throw away)")
     ("rescale",           po::value<bool>()->zero_tokens(),                              "rescale weight vector after each input")
-    ("l1_reg",            po::value<string>()->default_value("none"),      "apply l1 regularization as in 'Tsuroka et al' (2010)")
+    ("l1_reg",            po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED")
     ("l1_reg_strength",   po::value<weight_t>(),                                                     "l1 regularization strength")
     ("fselect",           po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
     ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                                   "discount for approx. BLEU")
     ("scale_bleu_diff",   po::value<bool>()->zero_tokens(),                      "learning rate <- bleu diff of a misranked pair")
     ("loss_margin",       po::value<weight_t>()->default_value(0.),  "update if no error in pref pair but model scores this near")
     ("max_pairs",         po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
-#ifdef DTRAIN_LOCAL
-    ("refs,r",            po::value<string>(),                                                         "references in local mode")
-#endif
     ("noup",              po::value<bool>()->zero_tokens(),                                               "do not update weights");
   po::options_description cl("Command Line Options");
   cl.add_options()
@@ -55,16 +51,6 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     cerr << cl << endl;
     return false;
   }
-  if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") {
-    cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl;
-    return false;
-  }
-#ifdef DTRAIN_LOCAL
-  if ((*cfg)["input"].as<string>() == "-") {
-    cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl;
-    return false;
-  }
-#endif
   if ((*cfg)["sample_from"].as<string>() != "kbest"
        && (*cfg)["sample_from"].as<string>() != "forest") {
     cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl;
@@ -111,17 +97,8 @@ main(int argc, char** argv)
   if (cfg.count("verbose")) verbose = true;
   bool noup = false;
   if (cfg.count("noup")) noup = true;
-  bool hstreaming = false;
-  string task_id;
-  if (cfg.count("hstreaming")) {
-    hstreaming = true;
-    quiet = true;
-    task_id = cfg["hstreaming"].as<string>();
-    cerr.precision(17);
-  }
   bool rescale = false;
   if (cfg.count("rescale")) rescale = true;
-  HSReporter rep(task_id);
   bool keep = false;
   if (cfg.count("keep")) keep = true;
 
@@ -148,6 +125,7 @@ main(int argc, char** argv)
   if (cfg.count("print_weights"))
     boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
 
+
   // setup decoder
   register_feature_functions();
   SetSilent(true);
@@ -163,6 +141,8 @@ main(int argc, char** argv)
     scorer = dynamic_cast<BleuScorer*>(new BleuScorer);
   } else if (scorer_str == "stupid_bleu") {
     scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
+  } else if (scorer_str == "fixed_stupid_bleu") {
+    scorer = dynamic_cast<FixedStupidBleuScorer*>(new FixedStupidBleuScorer);
   } else if (scorer_str == "smooth_bleu") {
     scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
   } else if (scorer_str == "sum_bleu") {
@@ -201,6 +181,11 @@ main(int argc, char** argv)
   weight_t eta = cfg["learning_rate"].as<weight_t>();
   weight_t gamma = cfg["gamma"].as<weight_t>();
 
+  // faster perceptron: consider only misranked pairs.
+  // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin!
+  bool faster_perceptron = false;
+  if (gamma==0 && loss_margin==0) faster_perceptron = true;
+
   // l1 regularization
   bool l1naive = false;
   bool l1clip = false;
@@ -222,16 +207,8 @@ main(int argc, char** argv)
   // buffer input for t > 0
   vector<string> src_str_buf;          // source strings (decoder takes only strings)
   vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
-  // where temp files go
-  string tmp_path = cfg["tmp"].as<string>();
-#ifdef DTRAIN_LOCAL
   string refs_fn = cfg["refs"].as<string>();
   ReadFile refs(refs_fn);
-#else
-  string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars");
-  ogzstream grammar_buf_out;
-  grammar_buf_out.open(grammar_buf_fn.c_str());
-#endif
 
   unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
   vector<pair<score_t, score_t> > all_scores;
@@ -246,7 +223,7 @@ main(int argc, char** argv)
     cerr << setw(25) << "k " << k << endl;
     cerr << setw(25) << "N " << N << endl;
     cerr << setw(25) << "T " << T << endl;
-    cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
+    cerr << setw(26) << "scorer '" << scorer_str << "'" << endl;
     if (scorer_str == "approx_bleu")
       cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl;
     cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
@@ -256,6 +233,7 @@ main(int argc, char** argv)
     else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
     cerr << setw(25) << "gamma " << gamma << endl;
     cerr << setw(25) << "loss margin " << loss_margin << endl;
+    cerr << setw(25) << "faster perceptron " << faster_perceptron << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     if (pair_sampling == "XYX")
       cerr << setw(25) << "hi lo " << hi_lo << endl;
@@ -268,9 +246,7 @@ main(int argc, char** argv)
     cerr << setw(25) << "max pairs " << max_pairs << endl;
     cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
     cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
-#ifdef DTRAIN_LOCAL
     cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
-#endif
     cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
     if (cfg.count("input_weights"))
       cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
@@ -283,14 +259,8 @@ main(int argc, char** argv)
   for (unsigned t = 0; t < T; t++) // T epochs
   {
 
-  if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
-
   time_t start, end;
   time(&start);
-#ifndef DTRAIN_LOCAL
-  igzstream grammar_buf_in;
-  if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str());
-#endif
   score_t score_sum = 0.;
   score_t model_sum(0);
   unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
@@ -338,52 +308,6 @@ main(int argc, char** argv)
 
     // getting input
     vector<WordID> ref_ids; // reference as vector<WordID>
-#ifndef DTRAIN_LOCAL
-    vector<string> in_split; // input: sid\tsrc\tref\tpsg
-    if (t == 0) {
-      // handling input
-      split_in(in, in_split);
-      if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl;
-      // getting reference
-      vector<string> ref_tok;
-      boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
-      register_and_convert(ref_tok, ref_ids);
-      ref_ids_buf.push_back(ref_ids);
-      // process and set grammar
-      bool broken_grammar = true; // ignore broken grammars
-      for (string::iterator it = in.begin(); it != in.end(); it++) {
-        if (!isspace(*it)) {
-          broken_grammar = false;
-          break;
-        }
-      }
-      if (broken_grammar) {
-        cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
-        continue;
-      }
-      boost::replace_all(in, "\t", "\n");
-      in += "\n";
-      grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
-      decoder.AddSupplementalGrammarFromString(in);
-      src_str_buf.push_back(in_split[1]);
-      // decode
-      observer->SetRef(ref_ids);
-      decoder.Decode(in_split[1], observer);
-    } else {
-      // get buffered grammar
-      string grammar_str;
-      while (true) {
-        string rule;
-        getline(grammar_buf_in, rule);
-        if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break;
-        grammar_str += rule + "\n";
-      }
-      decoder.AddSupplementalGrammarFromString(grammar_str);
-      // decode
-      observer->SetRef(ref_ids_buf[ii]);
-      decoder.Decode(src_str_buf[ii], observer);
-    }
-#else
     if (t == 0) {
       string r_;
       getline(*refs, r_);
@@ -400,7 +324,6 @@ main(int argc, char** argv)
       decoder.Decode(in, observer);
     else
       decoder.Decode(src_str_buf[ii], observer);
-#endif
 
     // get (scored) samples
     vector<ScoredHyp>* samples = observer->GetSamples();
@@ -430,25 +353,26 @@ main(int argc, char** argv)
       // get pairs
       vector<pair<ScoredHyp,ScoredHyp> > pairs;
       if (pair_sampling == "all")
-        all_pairs(samples, pairs, pair_threshold, max_pairs);
+        all_pairs(samples, pairs, pair_threshold, max_pairs, faster_perceptron);
       if (pair_sampling == "XYX")
-        partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo);
+        partXYX(samples, pairs, pair_threshold, max_pairs, faster_perceptron, hi_lo);
       if (pair_sampling == "PRO")
         PROsampling(samples, pairs, pair_threshold, max_pairs);
       npairs += pairs.size();
 
       for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
            it != pairs.end(); it++) {
-#ifdef DTRAIN_FASTER_PERCEPTRON
-        bool rank_error = true; // pair sampling already did this for us
-        rank_errors++;
-        score_t margin = std::numeric_limits<float>::max();
-#else
-        bool rank_error = it->first.model <= it->second.model;
+        bool rank_error;
+        score_t margin;
+        if (faster_perceptron) { // we are only considering misranked pairs
+          rank_error = true; // pair sampling already did this for us
+          margin = std::numeric_limits<float>::max();
+        } else {
+          rank_error = it->first.model <= it->second.model;
+          margin = fabs(fabs(it->first.model) - fabs(it->second.model));
+          if (!rank_error && margin < loss_margin) margin_violations++;
+        }
         if (rank_error) rank_errors++;
-        score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model));
-        if (!rank_error && margin < loss_margin) margin_violations++;
-#endif
         if (scale_bleu_diff) eta = it->first.score - it->second.score;
         if (rank_error || margin < loss_margin) {
           SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
@@ -459,35 +383,40 @@ main(int argc, char** argv)
       }
 
       // l1 regularization
+      // please note that this penalizes _all_ weights
+      // (contrary to only the ones changed by the last update)
+      // after a _sentence_ (not after each example/pair)
       if (l1naive) {
-        for (unsigned d = 0; d < lambdas.size(); d++) {
-          weight_t v = lambdas.get(d);
-          lambdas.set_value(d, v - sign(v) * l1_reg);
+        FastSparseVector<weight_t>::iterator it = lambdas.begin();
+        for (; it != lambdas.end(); ++it) {
+          it->second -= sign(it->second) * l1_reg;
         }
       } else if (l1clip) {
-        for (unsigned d = 0; d < lambdas.size(); d++) {
-          if (lambdas.nonzero(d)) {
-            weight_t v = lambdas.get(d);
+        FastSparseVector<weight_t>::iterator it = lambdas.begin();
+        for (; it != lambdas.end(); ++it) {
+          if (it->second != 0) {
+            weight_t v = it->second;
             if (v > 0) {
-              lambdas.set_value(d, max(0., v - l1_reg));
+              it->second = max(0., v - l1_reg);
             } else {
-              lambdas.set_value(d, min(0., v + l1_reg));
+              it->second = min(0., v + l1_reg);
             }
           }
         }
       } else if (l1cumul) {
         weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input
-        for (unsigned d = 0; d < lambdas.size(); d++) {
-          if (lambdas.nonzero(d)) {
-            weight_t v = lambdas.get(d);
-            weight_t penalty = 0;
+        FastSparseVector<weight_t>::iterator it = lambdas.begin();
+        for (; it != lambdas.end(); ++it) {
+          if (it->second != 0) {
+            weight_t v = it->second;
+            weight_t penalized = 0.;
             if (v > 0) {
-              penalty = max(0., v-(acc_penalty + cumulative_penalties.get(d)));
+              penalized = max(0., v-(acc_penalty + cumulative_penalties.get(it->first)));
             } else {
-              penalty = min(0., v+(acc_penalty - cumulative_penalties.get(d)));
+              penalized = min(0., v+(acc_penalty - cumulative_penalties.get(it->first)));
             }
-            lambdas.set_value(d, penalty);
-            cumulative_penalties.set_value(d, cumulative_penalties.get(d)+penalty);
+            it->second = penalized;
+            cumulative_penalties.set_value(it->first, cumulative_penalties.get(it->first)+penalized);
           }
         }
       }
@@ -498,11 +427,6 @@ main(int argc, char** argv)
 
     ++ii;
 
-    if (hstreaming) {
-      rep.update_counter("Seen #"+boost::lexical_cast<string>(t+1), 1u);
-      rep.update_counter("Seen", 1u);
-    }
-
   } // input loop
 
   if (average) w_average += lambdas;
@@ -511,21 +435,8 @@ main(int argc, char** argv)
 
   if (t == 0) {
     in_sz = ii; // remember size of input (# lines)
-    if (hstreaming) {
-      rep.update_counter("|Input|", ii);
-      rep.update_gcounter("|Input|", ii);
-      rep.update_gcounter("Shards", 1u);
-    }
   }
 
-#ifndef DTRAIN_LOCAL
-  if (t == 0) {
-    grammar_buf_out.close();
-  } else {
-    grammar_buf_in.close();
-  }
-#endif
-
   // print some stats
   score_t score_avg = score_sum/(score_t)in_sz;
   score_t model_avg = model_sum/(score_t)in_sz;
@@ -539,7 +450,7 @@ main(int argc, char** argv)
   }
 
   unsigned nonz = 0;
-  if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero();
+  if (!quiet) nonz = (unsigned)lambdas.num_nonzero();
 
   if (!quiet) {
     cerr << _p5 << _p << "WEIGHTS" << endl;
@@ -552,28 +463,18 @@ main(int argc, char** argv)
     cerr << _np << " 1best avg model score: " << model_avg;
     cerr << _p << " (" << model_diff << ")" << endl;
     cerr << "           avg # pairs: ";
-    cerr << _np << npairs/(float)in_sz << endl;
+    cerr << _np << npairs/(float)in_sz;
+    if (faster_perceptron) cerr << " (meaningless)";
+    cerr << endl;
     cerr << "        avg # rank err: ";
     cerr << rank_errors/(float)in_sz << endl;
-#ifndef DTRAIN_FASTER_PERCEPTRON
     cerr << "     avg # margin viol: ";
     cerr << margin_violations/(float)in_sz << endl;
-#endif
     cerr << "    non0 feature count: " <<  nonz << endl;
     cerr << "           avg list sz: " << list_sz/(float)in_sz << endl;
     cerr << "           avg f count: " << f_count/(float)list_sz << endl;
   }
 
-  if (hstreaming) {
-    rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*DTRAIN_SCALE));
-    rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*DTRAIN_SCALE));
-    rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE));
-    rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE));
-    rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE));
-    rep.update_counter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
-    rep.update_gcounter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz);
-  }
-
   pair<score_t,score_t> remember;
   remember.first = score_avg;
   remember.second = model_avg;
@@ -604,10 +505,6 @@ main(int argc, char** argv)
 
   if (average) w_average /= (weight_t)T;
 
-#ifndef DTRAIN_LOCAL
-  unlink(grammar_buf_fn.c_str());
-#endif
-
   if (!noup) {
     if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl;
     if (select_weights == "last" || average) { // last, average
@@ -644,7 +541,6 @@ main(int argc, char** argv)
         }
       }
     }
-    if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
     if (!quiet) cerr << "done" << endl;
   }
 
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 4b6f415c..eb0b9f17 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -1,14 +1,9 @@
 #ifndef _DTRAIN_H_
 #define _DTRAIN_H_
 
-#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
-                                 // DO NOT USE WITH SVM!
-//#define DTRAIN_LOCAL
 #define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
 #define DTRAIN_SCALE 100000
 
-
 #include <iomanip>
 #include <climits>
 #include <string.h>
diff --git a/training/dtrain/examples/parallelized/README b/training/dtrain/examples/parallelized/README
new file mode 100644
index 00000000..89715105
--- /dev/null
+++ b/training/dtrain/examples/parallelized/README
@@ -0,0 +1,5 @@
+run for example
+  ../../parallelize.rb ./dtrain.ini 4 false 2 2 ./in ./refs
+
+final weights will be in the file work/weights.3
+
diff --git a/training/dtrain/test/parallelize/cdec.ini b/training/dtrain/examples/parallelized/cdec.ini
index 72e99dc5..e43ba1c4 100644
--- a/training/dtrain/test/parallelize/cdec.ini
+++ b/training/dtrain/examples/parallelized/cdec.ini
@@ -4,7 +4,7 @@ intersection_strategy=cube_pruning
 cubepruning_pop_limit=200
 scfg_max_span_limit=15
 feature_function=WordPenalty
-feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
+feature_function=KLanguageModel ../example/nc-wmt11.en.srilm.gz
 #feature_function=ArityPenalty
 #feature_function=CMR2008ReorderingFeatures
 #feature_function=Dwarf
diff --git a/training/dtrain/test/parallelize/dtrain.ini b/training/dtrain/examples/parallelized/dtrain.ini
index 03f9d240..f19ef891 100644
--- a/training/dtrain/test/parallelize/dtrain.ini
+++ b/training/dtrain/examples/parallelized/dtrain.ini
@@ -2,7 +2,7 @@ k=100
 N=4
 learning_rate=0.0001
 gamma=0
-loss_margin=0
+loss_margin=1.0
 epochs=1
 scorer=stupid_bleu
 sample_from=kbest
@@ -11,5 +11,6 @@ pair_sampling=XYX
 hi_lo=0.1
 select_weights=last
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
-tmp=/tmp
+# newer versions of the grammar extractor use different feature names:
+#print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
 decoder_config=cdec.ini
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz
new file mode 100644
index 00000000..1e28a24b
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz
new file mode 100644
index 00000000..372f5675
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz
new file mode 100644
index 00000000..145d0dc0
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz
new file mode 100644
index 00000000..105593ff
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz
new file mode 100644
index 00000000..30781f48
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz
new file mode 100644
index 00000000..834ee759
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz
new file mode 100644
index 00000000..2e76f348
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz
new file mode 100644
index 00000000..3741a887
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz
new file mode 100644
index 00000000..ebf6bd0c
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz
new file mode 100644
index 00000000..c1791059
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz
diff --git a/training/dtrain/examples/parallelized/in b/training/dtrain/examples/parallelized/in
new file mode 100644
index 00000000..51d01fe7
--- /dev/null
+++ b/training/dtrain/examples/parallelized/in
@@ -0,0 +1,10 @@
+<seg grammar="grammar/grammar.out.0.gz" id="0">europas nach rassen geteiltes haus</seg>
+<seg grammar="grammar/grammar.out.1.gz" id="1">ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen .</seg>
+<seg grammar="grammar/grammar.out.2.gz" id="2">der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln .</seg>
+<seg grammar="grammar/grammar.out.3.gz" id="3">während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.4.gz" id="4">eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern .</seg>
+<seg grammar="grammar/grammar.out.5.gz" id="5">die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.6.gz" id="6">das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt .</seg>
+<seg grammar="grammar/grammar.out.7.gz" id="7">die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen .</seg>
+<seg grammar="grammar/grammar.out.8.gz" id="8">der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken .</seg>
+<seg grammar="grammar/grammar.out.9.gz" id="9">genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen .</seg>
diff --git a/training/dtrain/examples/parallelized/refs b/training/dtrain/examples/parallelized/refs
new file mode 100644
index 00000000..632e27b0
--- /dev/null
+++ b/training/dtrain/examples/parallelized/refs
@@ -0,0 +1,10 @@
+europe 's divided racial house
+a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge .
+the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them .
+while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon .
+an aging population at home and ever more open borders imply increasing racial fragmentation in european countries .
+mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear .
+it will not , as america 's racial history clearly shows .
+race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes .
+the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths .
+this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us .
diff --git a/training/dtrain/examples/parallelized/work/out.0.0 b/training/dtrain/examples/parallelized/work/out.0.0
new file mode 100644
index 00000000..7a00ed0f
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.0.0
@@ -0,0 +1,61 @@
+                cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 3121929377
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 1
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 0.0001
+                   gamma 0
+             loss margin 1
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'last'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'cdec.ini'
+                   input 'work/shard.0.0.in'
+                    refs 'work/shard.0.0.refs'
+                  output 'work/weights.0.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+  5
+WEIGHTS
+              Glue = +0.2663
+       WordPenalty = -0.0079042
+     LanguageModel = +0.44782
+ LanguageModel_OOV = -0.0401
+     PhraseModel_0 = -0.193
+     PhraseModel_1 = +0.71321
+     PhraseModel_2 = +0.85196
+     PhraseModel_3 = -0.43986
+     PhraseModel_4 = -0.44803
+     PhraseModel_5 = -0.0538
+     PhraseModel_6 = -0.1788
+       PassThrough = -0.1477
+        ---
+       1best avg score: 0.17521 (+0.17521)
+ 1best avg model score: 21.556 (+21.556)
+           avg # pairs: 1671.2
+        avg # rank err: 1118.6
+     avg # margin viol: 552.6
+    non0 feature count: 12
+           avg list sz: 100
+           avg f count: 11.32
+(time 0.37 min, 4.4 s/S)
+
+Writing weights file to 'work/weights.0.0' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.17521].
+This took 0.36667 min.
diff --git a/training/dtrain/examples/parallelized/work/out.0.1 b/training/dtrain/examples/parallelized/work/out.0.1
new file mode 100644
index 00000000..e2bd6649
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.0.1
@@ -0,0 +1,62 @@
+                cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 2767202922
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 1
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 0.0001
+                   gamma 0
+             loss margin 1
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'last'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'cdec.ini'
+                   input 'work/shard.0.0.in'
+                    refs 'work/shard.0.0.refs'
+                  output 'work/weights.0.1'
+              weights in 'work/weights.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+  5
+WEIGHTS
+              Glue = -0.2699
+       WordPenalty = +0.080605
+     LanguageModel = -0.026572
+ LanguageModel_OOV = -0.30025
+     PhraseModel_0 = -0.32076
+     PhraseModel_1 = +0.67451
+     PhraseModel_2 = +0.92
+     PhraseModel_3 = -0.36402
+     PhraseModel_4 = -0.592
+     PhraseModel_5 = -0.0269
+     PhraseModel_6 = -0.28755
+       PassThrough = -0.33285
+        ---
+       1best avg score: 0.26638 (+0.26638)
+ 1best avg model score: 53.197 (+53.197)
+           avg # pairs: 2028.6
+        avg # rank err: 998.2
+     avg # margin viol: 918.8
+    non0 feature count: 12
+           avg list sz: 100
+           avg f count: 10.496
+(time 0.32 min, 3.8 s/S)
+
+Writing weights file to 'work/weights.0.1' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.26638].
+This took 0.31667 min.
diff --git a/training/dtrain/examples/parallelized/work/out.1.0 b/training/dtrain/examples/parallelized/work/out.1.0
new file mode 100644
index 00000000..6e790e38
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.1.0
@@ -0,0 +1,61 @@
+                cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 1432415010
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 1
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 0.0001
+                   gamma 0
+             loss margin 1
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'last'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'cdec.ini'
+                   input 'work/shard.1.0.in'
+                    refs 'work/shard.1.0.refs'
+                  output 'work/weights.1.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+  5
+WEIGHTS
+              Glue = -0.3815
+       WordPenalty = +0.20064
+     LanguageModel = +0.95304
+ LanguageModel_OOV = -0.264
+     PhraseModel_0 = -0.22362
+     PhraseModel_1 = +0.12254
+     PhraseModel_2 = +0.26328
+     PhraseModel_3 = +0.38018
+     PhraseModel_4 = -0.48654
+     PhraseModel_5 = +0
+     PhraseModel_6 = -0.3645
+       PassThrough = -0.2216
+        ---
+       1best avg score: 0.10863 (+0.10863)
+ 1best avg model score: -4.9841 (-4.9841)
+           avg # pairs: 1345.4
+        avg # rank err: 822.4
+     avg # margin viol: 501
+    non0 feature count: 11
+           avg list sz: 100
+           avg f count: 11.814
+(time 0.45 min, 5.4 s/S)
+
+Writing weights file to 'work/weights.1.0' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.10863].
+This took 0.45 min.
diff --git a/training/dtrain/examples/parallelized/work/out.1.1 b/training/dtrain/examples/parallelized/work/out.1.1
new file mode 100644
index 00000000..0b984761
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.1.1
@@ -0,0 +1,62 @@
+                cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 1771918374
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 1
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 0.0001
+                   gamma 0
+             loss margin 1
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'last'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'cdec.ini'
+                   input 'work/shard.1.0.in'
+                    refs 'work/shard.1.0.refs'
+                  output 'work/weights.1.1'
+              weights in 'work/weights.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+  5
+WEIGHTS
+              Glue = -0.3178
+       WordPenalty = +0.11092
+     LanguageModel = +0.17269
+ LanguageModel_OOV = -0.13485
+     PhraseModel_0 = -0.45371
+     PhraseModel_1 = +0.38789
+     PhraseModel_2 = +0.75311
+     PhraseModel_3 = -0.38163
+     PhraseModel_4 = -0.58817
+     PhraseModel_5 = -0.0269
+     PhraseModel_6 = -0.27315
+       PassThrough = -0.16745
+        ---
+       1best avg score: 0.13169 (+0.13169)
+ 1best avg model score: 24.226 (+24.226)
+           avg # pairs: 1951.2
+        avg # rank err: 985.4
+     avg # margin viol: 951
+    non0 feature count: 12
+           avg list sz: 100
+           avg f count: 11.224
+(time 0.42 min, 5 s/S)
+
+Writing weights file to 'work/weights.1.1' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.13169].
+This took 0.41667 min.
diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.in b/training/dtrain/examples/parallelized/work/shard.0.0.in
new file mode 100644
index 00000000..92f9c78e
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.0.0.in
@@ -0,0 +1,5 @@
+<seg grammar="grammar/grammar.out.0.gz" id="0">europas nach rassen geteiltes haus</seg>
+<seg grammar="grammar/grammar.out.1.gz" id="1">ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen .</seg>
+<seg grammar="grammar/grammar.out.2.gz" id="2">der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln .</seg>
+<seg grammar="grammar/grammar.out.3.gz" id="3">während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.4.gz" id="4">eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern .</seg>
diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.refs b/training/dtrain/examples/parallelized/work/shard.0.0.refs
new file mode 100644
index 00000000..bef68fee
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.0.0.refs
@@ -0,0 +1,5 @@
+europe 's divided racial house
+a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge .
+the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them .
+while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon .
+an aging population at home and ever more open borders imply increasing racial fragmentation in european countries .
diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.in b/training/dtrain/examples/parallelized/work/shard.1.0.in
new file mode 100644
index 00000000..b7695ce7
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.1.0.in
@@ -0,0 +1,5 @@
+<seg grammar="grammar/grammar.out.5.gz" id="5">die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.6.gz" id="6">das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt .</seg>
+<seg grammar="grammar/grammar.out.7.gz" id="7">die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen .</seg>
+<seg grammar="grammar/grammar.out.8.gz" id="8">der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken .</seg>
+<seg grammar="grammar/grammar.out.9.gz" id="9">genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen .</seg>
diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.refs b/training/dtrain/examples/parallelized/work/shard.1.0.refs
new file mode 100644
index 00000000..6076f6d5
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.1.0.refs
@@ -0,0 +1,5 @@
+mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear .
+it will not , as america 's racial history clearly shows .
+race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes .
+the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths .
+this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us .
diff --git a/training/dtrain/examples/parallelized/work/weights.0 b/training/dtrain/examples/parallelized/work/weights.0
new file mode 100644
index 00000000..ddd595a8
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.0
@@ -0,0 +1,12 @@
+LanguageModel	0.7004298992212881
+PhraseModel_2	0.5576194336478857
+PhraseModel_1	0.41787318415343155
+PhraseModel_4	-0.46728502545635164
+PhraseModel_3	-0.029839521598455515
+Glue	-0.05760000000000068
+PhraseModel_6	-0.2716499999999978
+PhraseModel_0	-0.20831031065605327
+LanguageModel_OOV	-0.15205000000000077
+PassThrough	-0.1846500000000006
+WordPenalty	0.09636994553433414
+PhraseModel_5	-0.026900000000000257
diff --git a/training/dtrain/examples/parallelized/work/weights.0.0 b/training/dtrain/examples/parallelized/work/weights.0.0
new file mode 100644
index 00000000..c9370b18
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.0.0
@@ -0,0 +1,12 @@
+WordPenalty	-0.0079041595706392243
+LanguageModel	0.44781580828279532
+LanguageModel_OOV	-0.04010000000000042
+Glue	0.26629999999999948
+PhraseModel_0	-0.19299677809125185
+PhraseModel_1	0.71321026861732773
+PhraseModel_2	0.85195540993310537
+PhraseModel_3	-0.43986310822842656
+PhraseModel_4	-0.44802855630415955
+PhraseModel_5	-0.053800000000000514
+PhraseModel_6	-0.17879999999999835
+PassThrough	-0.14770000000000036
diff --git a/training/dtrain/examples/parallelized/work/weights.0.1 b/training/dtrain/examples/parallelized/work/weights.0.1
new file mode 100644
index 00000000..8fad3de8
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.0.1
@@ -0,0 +1,12 @@
+WordPenalty	0.080605055841244472
+LanguageModel	-0.026571720531022844
+LanguageModel_OOV	-0.30024999999999141
+Glue	-0.26989999999999842
+PhraseModel_2	0.92000295209089566
+PhraseModel_1	0.67450748692470841
+PhraseModel_4	-0.5920000014976784
+PhraseModel_3	-0.36402437203127397
+PhraseModel_6	-0.28754999999999603
+PhraseModel_0	-0.32076244202907672
+PassThrough	-0.33284999999999004
+PhraseModel_5	-0.026900000000000257
diff --git a/training/dtrain/examples/parallelized/work/weights.1 b/training/dtrain/examples/parallelized/work/weights.1
new file mode 100644
index 00000000..03058a16
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.1
@@ -0,0 +1,12 @@
+PhraseModel_2	0.8365578543552836
+PhraseModel_4	-0.5900840266009169
+PhraseModel_1	0.5312000609786991
+PhraseModel_0	-0.3872342271319619
+PhraseModel_3	-0.3728279676912084
+Glue	-0.2938500000000036
+PhraseModel_6	-0.2803499999999967
+PassThrough	-0.25014999999999626
+LanguageModel_OOV	-0.21754999999999702
+LanguageModel	0.07306061161169894
+WordPenalty	0.09576193325966899
+PhraseModel_5	-0.026900000000000257
diff --git a/training/dtrain/examples/parallelized/work/weights.1.0 b/training/dtrain/examples/parallelized/work/weights.1.0
new file mode 100644
index 00000000..6a6a65c1
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.1.0
@@ -0,0 +1,11 @@
+WordPenalty	0.20064405063930751
+LanguageModel	0.9530439901597807
+LanguageModel_OOV	-0.26400000000000112
+Glue	-0.38150000000000084
+PhraseModel_0	-0.22362384322085468
+PhraseModel_1	0.12253609968953538
+PhraseModel_2	0.26328345736266612
+PhraseModel_3	0.38018406503151553
+PhraseModel_4	-0.48654149460854373
+PhraseModel_6	-0.36449999999999722
+PassThrough	-0.22160000000000085
diff --git a/training/dtrain/examples/parallelized/work/weights.1.1 b/training/dtrain/examples/parallelized/work/weights.1.1
new file mode 100644
index 00000000..f56ea4a2
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.1.1
@@ -0,0 +1,12 @@
+WordPenalty	0.1109188106780935
+LanguageModel	0.17269294375442074
+LanguageModel_OOV	-0.13485000000000266
+Glue	-0.3178000000000088
+PhraseModel_2	0.75311275661967159
+PhraseModel_1	0.38789263503268989
+PhraseModel_4	-0.58816805170415531
+PhraseModel_3	-0.38163156335114284
+PhraseModel_6	-0.27314999999999739
+PhraseModel_0	-0.45370601223484697
+PassThrough	-0.16745000000000249
+PhraseModel_5	-0.026900000000000257
diff --git a/training/dtrain/examples/standard/README b/training/dtrain/examples/standard/README
new file mode 100644
index 00000000..ce37d31a
--- /dev/null
+++ b/training/dtrain/examples/standard/README
@@ -0,0 +1,2 @@
+Call `dtrain` from this folder with ../../dtrain -c dtrain.ini .
+
diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/examples/standard/cdec.ini
index d5955f0e..e1edc68d 100644
--- a/training/dtrain/test/example/cdec.ini
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -2,9 +2,10 @@ formalism=scfg
 add_pass_through_rules=true
 scfg_max_span_limit=15
 intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
+cubepruning_pop_limit=200
+grammar=nc-wmt11.grammar.gz
 feature_function=WordPenalty
-feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz
 # all currently working feature functions for translation:
 # (with those features active that were used in the ACL paper)
 #feature_function=ArityPenalty
diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index 72d50ca1..e1072d30 100644
--- a/training/dtrain/test/example/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -1,10 +1,12 @@
-input=test/example/nc-wmt11.1k.gz    # use '-' for STDIN
-output=-                             # a weights file (add .gz for gzip compression) or STDOUT '-'
-select_weights=VOID                  # don't output weights
-decoder_config=test/example/cdec.ini # config for cdec
+input=./nc-wmt11.de.gz
+refs=./nc-wmt11.en.gz
+output=-                  # a weights file (add .gz for gzip compression) or STDOUT '-'
+select_weights=VOID       # don't output weights
+decoder_config=./cdec.ini # config for cdec
 # weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
-tmp=/tmp
+# newer versions of the grammar extractor use different feature names:
+#print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV
 stop_after=10 # stop epoch after 10 inputs
 
 # interesting stuff
@@ -16,7 +18,7 @@ learning_rate=1.0       # learning rate, don't care if gamma=0 (perceptron)
 gamma=0                 # use SVM reg
 sample_from=kbest       # use kbest lists (as opposed to forest)
 filter=uniq             # only unique entries in kbest (surface form)
-pair_sampling=XYX
+pair_sampling=XYX       #
 hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
-pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
-loss_margin=0
+pair_threshold=0        # minimum distance in BLEU (here: > 0)
+loss_margin=0           # update if correctly ranked, but within this margin
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
new file mode 100644
index 00000000..7cd09dbf
--- /dev/null
+++ b/training/dtrain/examples/standard/expected-output
@@ -0,0 +1,91 @@
+                cdec cfg './cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ./nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+  Example feature: Shape_S00000_T00000
+Seeding random number sequence to 2679584485
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 2
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 1
+                   gamma 0
+             loss margin 0
+       faster perceptron 1
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'VOID'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg './cdec.ini'
+                   input './nc-wmt11.de.gz'
+                    refs './nc-wmt11.en.gz'
+                  output '-'
+              stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+              Glue = -576
+       WordPenalty = +417.79
+     LanguageModel = +5117.5
+ LanguageModel_OOV = -1307
+     PhraseModel_0 = -1612
+     PhraseModel_1 = -2159.6
+     PhraseModel_2 = -677.36
+     PhraseModel_3 = +2663.8
+     PhraseModel_4 = -1025.9
+     PhraseModel_5 = -8
+     PhraseModel_6 = +70
+       PassThrough = -1455
+        ---
+       1best avg score: 0.27697 (+0.27697)
+ 1best avg model score: -47918 (-47918)
+           avg # pairs: 581.9 (meaningless)
+        avg # rank err: 581.9
+     avg # margin viol: 0
+    non0 feature count: 703
+           avg list sz: 90.9
+           avg f count: 100.09
+(time 0.25 min, 1.5 s/S)
+
+Iteration #2 of 2.
+ . 10
+WEIGHTS
+              Glue = -622
+       WordPenalty = +898.56
+     LanguageModel = +8066.2
+ LanguageModel_OOV = -2590
+     PhraseModel_0 = -4335.8
+     PhraseModel_1 = -5864.4
+     PhraseModel_2 = -1729.8
+     PhraseModel_3 = +2831.9
+     PhraseModel_4 = -5384.8
+     PhraseModel_5 = +1449
+     PhraseModel_6 = +480
+       PassThrough = -2578
+        ---
+       1best avg score: 0.37119 (+0.094226)
+ 1best avg model score: -1.3174e+05 (-83822)
+           avg # pairs: 584.1 (meaningless)
+        avg # rank err: 584.1
+     avg # margin viol: 0
+    non0 feature count: 1115
+           avg list sz: 91.3
+           avg f count: 90.755
+(time 0.3 min, 1.8 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 2 [SCORE 'stupid_bleu'=0.37119].
+This took 0.55 min.
diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz
new file mode 100644
index 00000000..0741fd92
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.de.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz
new file mode 100644
index 00000000..1c0bd401
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.en.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
new file mode 100644
index 00000000..7ce81057
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.grammar.gz b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
new file mode 100644
index 00000000..ce4024a1
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
diff --git a/training/dtrain/test/toy/cdec.ini b/training/dtrain/examples/toy/cdec.ini
index 98b02d44..b14f4819 100644
--- a/training/dtrain/test/toy/cdec.ini
+++ b/training/dtrain/examples/toy/cdec.ini
@@ -1,2 +1,3 @@
 formalism=scfg
 add_pass_through_rules=true
+grammar=grammar.gz
diff --git a/training/dtrain/test/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini
index a091732f..cd715f26 100644
--- a/training/dtrain/test/toy/dtrain.ini
+++ b/training/dtrain/examples/toy/dtrain.ini
@@ -1,5 +1,6 @@
-decoder_config=test/toy/cdec.ini
-input=test/toy/input
+decoder_config=cdec.ini
+input=src
+refs=tgt
 output=-
 print_weights=logp shell_rule house_rule small_rule little_rule PassThrough
 k=4
diff --git a/training/dtrain/examples/toy/expected-output b/training/dtrain/examples/toy/expected-output
new file mode 100644
index 00000000..1da2aadd
--- /dev/null
+++ b/training/dtrain/examples/toy/expected-output
@@ -0,0 +1,77 @@
+Warning: hi_lo only works with pair_sampling XYX.
+                cdec cfg 'cdec.ini'
+Seeding random number sequence to 1664825829
+
+dtrain
+Parameters:
+                       k 4
+                       N 4
+                       T 2
+                  scorer 'bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 1
+                   gamma 0
+             loss margin 0
+                   pairs 'all'
+          pair threshold 0
+          select weights 'last'
+                  l1 reg 0 'none'
+               max pairs 4294967295
+                cdec cfg 'cdec.ini'
+                   input 'src'
+                    refs 'tgt'
+                  output '-'
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+  2
+WEIGHTS
+              logp = +0
+        shell_rule = -1
+        house_rule = +2
+        small_rule = -2
+       little_rule = +3
+       PassThrough = -5
+        ---
+       1best avg score: 0.5 (+0.5)
+ 1best avg model score: 2.5 (+2.5)
+           avg # pairs: 4
+        avg # rank err: 1.5
+     avg # margin viol: 0
+    non0 feature count: 6
+           avg list sz: 4
+           avg f count: 2.875
+(time 0 min, 0 s/S)
+
+Iteration #2 of 2.
+  2
+WEIGHTS
+              logp = +0
+        shell_rule = -1
+        house_rule = +2
+        small_rule = -2
+       little_rule = +3
+       PassThrough = -5
+        ---
+       1best avg score: 1 (+0.5)
+ 1best avg model score: 5 (+2.5)
+           avg # pairs: 5
+        avg # rank err: 0
+     avg # margin viol: 0
+    non0 feature count: 6
+           avg list sz: 4
+           avg f count: 3
+(time 0 min, 0 s/S)
+
+Writing weights file to '-' ...
+house_rule	2
+little_rule	3
+Glue	-4
+PassThrough	-5
+small_rule	-2
+shell_rule	-1
+done
+
+---
+Best iteration: 2 [SCORE 'bleu'=1].
+This took 0 min.
diff --git a/training/dtrain/examples/toy/grammar.gz b/training/dtrain/examples/toy/grammar.gz
new file mode 100644
index 00000000..8eb0d29e
--- /dev/null
+++ b/training/dtrain/examples/toy/grammar.gz
diff --git a/training/dtrain/examples/toy/src b/training/dtrain/examples/toy/src
new file mode 100644
index 00000000..87e39ef2
--- /dev/null
+++ b/training/dtrain/examples/toy/src
@@ -0,0 +1,2 @@
+ich sah ein kleines haus
+ich fand ein kleines haus
diff --git a/training/dtrain/examples/toy/tgt b/training/dtrain/examples/toy/tgt
new file mode 100644
index 00000000..174926b3
--- /dev/null
+++ b/training/dtrain/examples/toy/tgt
@@ -0,0 +1,2 @@
+i saw a little house
+i found a little house
diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb
deleted file mode 100755
index 2599c732..00000000
--- a/training/dtrain/hstreaming/avg.rb
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env ruby
-# first arg may be an int of custom shard count
-
-shard_count_key = "__SHARD_COUNT__"
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-w = {}
-c = {}
-w.default = 0
-c.default = 0
-while line = STDIN.gets
-  key, val = line.split /\s/
-  w[key] += val.to_f
-  c[key] += 1
-end
-
-if ARGV.size == 0
-  shard_count = w["__SHARD_COUNT__"]
-else
-  shard_count = ARGV[0].to_f
-end
-w.each_key { |k|
-  if k == shard_count_key
-    next
-  else
-    puts "#{k}\t#{w[k]/shard_count}"
-    #puts "# #{c[k]}"
-  end
-}
-
diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini
deleted file mode 100644
index d4f5cecd..00000000
--- a/training/dtrain/hstreaming/cdec.ini
+++ /dev/null
@@ -1,22 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-scfg_max_span_limit=15
-intersection_strategy=cube_pruning
-cubepruning_pop_limit=30
-feature_function=WordPenalty
-feature_function=KLanguageModel nc-wmt11.en.srilm.gz
-#feature_function=ArityPenalty
-#feature_function=CMR2008ReorderingFeatures
-#feature_function=Dwarf
-#feature_function=InputIndicator
-#feature_function=LexNullJump
-#feature_function=NewJump
-#feature_function=NgramFeatures
-#feature_function=NonLatinCount
-#feature_function=OutputIndicator
-#feature_function=RuleIdentityFeatures
-#feature_function=RuleNgramFeatures
-#feature_function=RuleShape
-#feature_function=SourceSpanSizeFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SpanFeatures
diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini
deleted file mode 100644
index a2c219a1..00000000
--- a/training/dtrain/hstreaming/dtrain.ini
+++ /dev/null
@@ -1,15 +0,0 @@
-input=-
-output=-
-decoder_config=cdec.ini
-tmp=/var/hadoop/mapred/local/
-epochs=1
-k=100
-N=4
-learning_rate=0.0001
-gamma=0
-scorer=stupid_bleu
-sample_from=kbest
-filter=uniq
-pair_sampling=XYX
-pair_threshold=0
-select_weights=last
diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh
deleted file mode 100755
index 877ff94c..00000000
--- a/training/dtrain/hstreaming/dtrain.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# script to run dtrain with a task id
-
-pushd . &>/dev/null
-cd ..
-ID=$(basename $(pwd)) # attempt_...
-popd &>/dev/null
-./dtrain -c dtrain.ini --hstreaming $ID
-
diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh
deleted file mode 100755
index 92419956..00000000
--- a/training/dtrain/hstreaming/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-EXP=a_simple_test
-
-# change these vars to fit your hadoop installation
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
- IN=input_on_hdfs
-OUT=output_weights_on_hdfs
-
-# you can -reducer to NONE if you want to
-# do feature selection/averaging locally (e.g. to
-# keep weights of all epochs)
-$HSTREAMING \
-    -mapper "dtrain.sh" \
-    -reducer "ruby lplp.rb l2 select_k 100000" \
-    -input $IN \
-    -output $OUT \
-    -file dtrain.sh \
-    -file lplp.rb \
-    -file ../dtrain \
-    -file dtrain.ini \
-    -file cdec.ini \
-    -file ../test/example/nc-wmt11.en.srilm.gz \
-    -jobconf mapred.reduce.tasks=30 \
-    -jobconf mapred.max.map.failures.percent=0 \
-    -jobconf mapred.job.name="dtrain $EXP"
-
diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test
deleted file mode 100644
index 2623d697..00000000
--- a/training/dtrain/hstreaming/red-test
+++ /dev/null
@@ -1,9 +0,0 @@
-a	1
-b	2
-c	3.5
-a	1
-b	2
-c	3.5
-d	1
-e	2
-__SHARD_COUNT__	2
diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/lplp.rb
index f0cd58c5..86e835e8 100755
--- a/training/dtrain/hstreaming/lplp.rb
+++ b/training/dtrain/lplp.rb
@@ -84,34 +84,28 @@ def _test()
 end
 #_test()
 
-# actually do something
+
 def usage()
-  puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> [n] < <input>"
+  puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>"
   puts "   l0...: norms for selection"
   puts "select_k: only output top k (according to the norm of their column vector) features"
   puts "     cut: output features with weight >= threshold"
   puts "       n: if we do not have a shard count use this number for averaging"
-  exit
+  exit 1
 end
 
-if ARGV.size < 3 then usage end
+if ARGV.size < 4 then usage end
 norm_fun = method(ARGV[0].to_sym)
 type = ARGV[1]
 x = ARGV[2].to_f
-
-shard_count_key = "__SHARD_COUNT__"
+shard_count = ARGV[3].to_f
 
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
 w = {}
-shard_count = 0
 while line = STDIN.gets
   key, val = line.split /\s+/
-  if key == shard_count_key
-    shard_count += 1
-    next
-  end
   if w.has_key? key
     w[key].push val.to_f
   else
@@ -119,8 +113,6 @@ while line = STDIN.gets
   end
 end
 
-if ARGV.size == 4 then shard_count = ARGV[3].to_f end
-
 if type == 'cut'
   cut(w, norm_fun, shard_count, x)
 elsif type == 'select_k'
diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h
index 84be1efb..3f67e209 100644
--- a/training/dtrain/pairsampling.h
+++ b/training/dtrain/pairsampling.h
@@ -19,7 +19,7 @@ cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
 }
 
 inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1)
 {
   sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sz = s->size();
@@ -27,6 +27,7 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
   unsigned count = 0;
   for (unsigned i = 0; i < sz-1; i++) {
     for (unsigned j = i+1; j < sz; j++) {
+      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -51,7 +52,7 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
  */
 
 inline void
-partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float hi_lo)
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo)
 {
   unsigned sz = s->size();
   if (sz < 2) return;
@@ -64,9 +65,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
   unsigned count = 0;
   for (unsigned i = 0; i < sep_hi; i++) {
     for (unsigned j = sep_hi; j < sz; j++) {
-#ifdef DTRAIN_FASTER_PERCEPTRON
-      if ((*s)[i].model <= (*s)[j].model) {
-#endif
+      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -78,9 +77,6 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         b = true;
         break;
       }
-#ifdef DTRAIN_FASTER_PERCEPTRON
-      }
-#endif
     }
     if (b) break;
   }
@@ -88,9 +84,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
   while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
   for (unsigned i = sep_hi; i < sz-sep_lo; i++) {
     for (unsigned j = sz-sep_lo; j < sz; j++) {
-#ifdef DTRAIN_FASTER_PERCEPTRON
-      if ((*s)[i].model <= (*s)[j].model) {
-#endif
+      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -99,9 +93,6 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
       if (++count == max) return;
-#ifdef DTRAIN_FASTER_PERCEPTRON
-      }
-#endif
     }
   }
 }
@@ -119,7 +110,7 @@ _PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b
   return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
 }
 inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0)
 {
   unsigned max_count = 5000, count = 0, sz = s->size();
   bool b = false;
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index eb4148f5..e661416e 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -1,80 +1,149 @@
 #!/usr/bin/env ruby
 
+require 'trollop'
 
-if ARGV.size != 5
+def usage
   STDERR.write "Usage: "
-  STDERR.write "ruby parallelize.rb <#shards> <input> <refs> <epochs> <dtrain.ini>\n"
-  exit
+  STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"]\n"
+  exit 1
 end
 
-cdec_dir   = '/path/to/cdec_dir'
-dtrain_bin = "#{cdec_dir}/training/dtrain/dtrain_local"
-ruby       = '/usr/bin/ruby'
-lplp_rb    = "#{cdec_dir}/training/dtrain/hstreaming/lplp.rb"
-lplp_args  = 'l2 select_k 100000'
-gzip       = '/bin/gzip'
+opts = Trollop::options do
+  opt :config, "dtrain config file", :type => :string
+  opt :epochs, "number of epochs", :type => :int, :default => 10
+  opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000"
+  opt :randomize, "randomize shards before each epoch", :type => :bool, :short => '-z', :default => false
+  opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false
+  opt :shards, "number of shards", :type => :int
+  opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999
+  opt :input, "input", :type => :string
+  opt :references, "references", :type => :string
+  opt :qsub, "use qsub", :type => :bool, :default => false
+  opt :dtrain_binary, "path to dtrain binary", :type => :string
+end
+usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references]
+
 
-num_shards = ARGV[0].to_i
-input      = ARGV[1]
-refs       = ARGV[2]
-epochs     = ARGV[3].to_i
-ini        = ARGV[4]
+dtrain_dir = File.expand_path File.dirname(__FILE__)
+if not opts[:dtrain_binary]
+  dtrain_bin = "#{dtrain_dir}/dtrain"
+else
+  dtrain_bin = opts[:dtrain_binary]
+end
+ruby       = '/usr/bin/ruby'
+lplp_rb    = "#{dtrain_dir}/lplp.rb"
+lplp_args  = opts[:lplp_args]
+cat        = '/bin/cat'
 
+ini        = opts[:config]
+epochs     = opts[:epochs]
+rand       = opts[:randomize]
+reshard    = opts[:reshard]
+predefined_shards = false
+if opts[:shards] == 0
+  predefined_shards = true
+  num_shards = 0
+else
+  num_shards = opts[:shards]
+end
+input = opts[:input]
+refs  = opts[:references]
+use_qsub       = opts[:qsub]
+shards_at_once = opts[:processes_at_once]
 
 `mkdir work`
 
-def make_shards(input, refs, num_shards)
+def make_shards(input, refs, num_shards, epoch, rand)
   lc = `wc -l #{input}`.split.first.to_i
+  index = (0..lc-1).to_a
+  index.reverse!
+  index.shuffle! if rand
   shard_sz = lc / num_shards
   leftover = lc % num_shards
   in_f = File.new input, 'r'
+  in_lines = in_f.readlines
   refs_f = File.new refs, 'r'
+  refs_lines = refs_f.readlines
   shard_in_files = []
   shard_refs_files = []
+  in_fns = []
+  refs_fns = []
   0.upto(num_shards-1) { |shard|
-    shard_in = File.new "work/shard.#{shard}.in", 'w+'
-    shard_refs = File.new "work/shard.#{shard}.refs", 'w+'
+    in_fn = "work/shard.#{shard}.#{epoch}.in"
+    shard_in = File.new in_fn, 'w+'
+    in_fns << in_fn
+    refs_fn = "work/shard.#{shard}.#{epoch}.refs"
+    shard_refs = File.new refs_fn, 'w+'
+    refs_fns << refs_fn
     0.upto(shard_sz-1) { |i|
-      shard_in.write in_f.gets
-      shard_refs.write refs_f.gets
+      j = index.pop
+      shard_in.write in_lines[j]
+      shard_refs.write refs_lines[j]
     }
     shard_in_files << shard_in
     shard_refs_files << shard_refs
   }
   while leftover > 0
-    shard_in_files[-1].write in_f.gets
-    shard_refs_files[-1].write refs_f.gets
+    j = index.pop
+    shard_in_files[-1].write in_lines[j]
+    shard_refs_files[-1].write refs_lines[j]
     leftover -= 1
   end
   (shard_in_files + shard_refs_files).each do |f| f.close end
   in_f.close
   refs_f.close
+  return [in_fns, refs_fns]
 end
 
-make_shards input, refs, num_shards
+input_files = []
+refs_files = []
+if predefined_shards
+  input_files = File.new(input).readlines.map {|i| i.strip }
+  refs_files = File.new(refs).readlines.map {|i| i.strip }
+  num_shards = input_files.size
+else
+  input_files, refs_files = make_shards input, refs, num_shards, 0, rand
+end
 
 0.upto(epochs-1) { |epoch|
+  puts "epoch #{epoch+1}"
   pids = []
   input_weights = ''
   if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end
   weights_files = []
-  0.upto(num_shards-1) { |shard|
-    pids << Kernel.fork {
-      `#{dtrain_bin} -c #{ini}\
-        --input work/shard.#{shard}.in\
-        --refs work/shard.#{shard}.refs #{input_weights}\
-        --output work/weights.#{shard}.#{epoch}\
-        &> work/out.#{shard}.#{epoch}`
+  shard = 0
+  remaining_shards = num_shards
+  while remaining_shards > 0
+    shards_at_once.times {
+      break if remaining_shards==0
+      qsub_str_start = qsub_str_end = ''
+      local_end = ''
+      if use_qsub
+        qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \""
+        qsub_str_end = "\""
+        local_end = ''
+      else
+        local_end = "&>work/out.#{shard}.#{epoch}"
+      end
+      pids << Kernel.fork {
+        `#{qsub_str_start}#{dtrain_bin} -c #{ini}\
+          --input #{input_files[shard]}\
+          --refs #{refs_files[shard]} #{input_weights}\
+          --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}`
+      }
+      weights_files << "work/weights.#{shard}.#{epoch}"
+      shard += 1
+      remaining_shards -= 1
     }
-    weights_files << "work/weights.#{shard}.#{epoch}"
-  }
-  pids.each { |pid| Process.wait(pid) }
-  cat = File.new('work/weights_cat', 'w+')
-  weights_files.each { |f| cat.write File.new(f, 'r').read }
-  cat.close
-  `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat &> work/weights.#{epoch}`
+    pids.each { |pid| Process.wait(pid) }
+    pids.clear
+  end
+  `#{cat} work/weights.*.#{epoch} > work/weights_cat`
+  `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}`
+  if rand and reshard and epoch+1!=epochs
+    input_files, refs_files = make_shards input, refs, num_shards, epoch+1, rand
+  end
 }
 
 `rm work/weights_cat`
-`#{gzip} work/*`
 
diff --git a/training/dtrain/score.cc b/training/dtrain/score.cc
index 34fc86a9..96d6e10a 100644
--- a/training/dtrain/score.cc
+++ b/training/dtrain/score.cc
@@ -49,7 +49,7 @@ BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  *        for Machine Translation"
  * (Lin & Och '04)
  *
- * NOTE: 0 iff no 1gram match
+ * NOTE: 0 iff no 1gram match ('grounded')
  */
 score_t
 StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
@@ -74,6 +74,35 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
 }
 
 /*
+ * fixed 'stupid' bleu
+ *
+ * as in "Optimizing for Sentence-Level BLEU+1
+ *        Yields Short Translations"
+ * (Nakov et al. '12)
+ */
+score_t
+FixedStupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  if (hyp_len == 0 || ref_len == 0) return 0.; // empty hypothesis or reference scores 0
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  unsigned M = N_; // number of n-gram orders summed over below
+  vector<score_t> v = w_; // copy of w_; its first M entries are overwritten when the reference is short
+  if (ref_len < N_) { // reference shorter than N_: use only the first ref_len orders, each weighted 1/M
+    M = ref_len;
+    for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+  }
+  score_t sum = 0, add = 0;
+  for (unsigned i = 0; i < M; i++) {
+    if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.; // zero first-order counts: score is 0
+    if (i == 1) add = 1; // from the second order on, 1 is added to both counts in the ratio
+    sum += v[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
+  }
+  return  brevity_penalty(hyp_len, ref_len+1) * exp(sum); // <- fix: brevity penalty is taken against ref_len+1
+}
+
+/*
  * smooth bleu
  *
  * as in "An End-to-End Discriminative Approach
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index f317c903..bddaa071 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -148,6 +148,11 @@ struct StupidBleuScorer : public LocalScorer
   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
+struct FixedStupidBleuScorer : public LocalScorer // 'fixed' stupid bleu scorer; Score() is defined in score.cc
+{
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
 struct SmoothBleuScorer : public LocalScorer
 {
   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README
deleted file mode 100644
index 6937b11b..00000000
--- a/training/dtrain/test/example/README
+++ /dev/null
@@ -1,8 +0,0 @@
-Small example of input format for distributed training.
-Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini .
-
-For this to work, undef 'DTRAIN_LOCAL' in dtrain.h
-and recompile.
-
-Data is here: http://simianer.de/#dtrain
-
diff --git a/training/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output
deleted file mode 100644
index 05326763..00000000
--- a/training/dtrain/test/example/expected-output
+++ /dev/null
@@ -1,89 +0,0 @@
-                cdec cfg 'test/example/cdec.ini'
-Loading the LM will be faster if you build a binary file.
-Reading test/example/nc-wmt11.en.srilm.gz
-----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
-****************************************************************************************************
-  Example feature: Shape_S00000_T00000
-Seeding random number sequence to 2912000813
-
-dtrain
-Parameters:
-                       k 100
-                       N 4
-                       T 2
-                 scorer 'stupid_bleu'
-             sample from 'kbest'
-                  filter 'uniq'
-           learning rate 1
-                   gamma 0
-             loss margin 0
-                   pairs 'XYX'
-                   hi lo 0.1
-          pair threshold 0
-          select weights 'VOID'
-                  l1 reg 0 'none'
-               max pairs 4294967295
-                cdec cfg 'test/example/cdec.ini'
-                   input 'test/example/nc-wmt11.1k.gz'
-                  output '-'
-              stop_after 10
-(a dot represents 10 inputs)
-Iteration #1 of 2.
- . 10
-Stopping after 10 input sentences.
-WEIGHTS
-              Glue = -637
-       WordPenalty = +1064
-     LanguageModel = +1175.3
- LanguageModel_OOV = -1437
-     PhraseModel_0 = +1935.6
-     PhraseModel_1 = +2499.3
-     PhraseModel_2 = +964.96
-     PhraseModel_3 = +1410.8
-     PhraseModel_4 = -5977.9
-     PhraseModel_5 = +522
-     PhraseModel_6 = +1089
-       PassThrough = -1308
-        ---
-       1best avg score: 0.16963 (+0.16963)
- 1best avg model score: 64485 (+64485)
-           avg # pairs: 1494.4
-        avg # rank err: 702.6
-     avg # margin viol: 0
-    non0 feature count: 528
-           avg list sz: 85.7
-           avg f count: 102.75
-(time 0.083 min, 0.5 s/S)
-
-Iteration #2 of 2.
- . 10
-WEIGHTS
-              Glue = -1196
-       WordPenalty = +809.52
-     LanguageModel = +3112.1
- LanguageModel_OOV = -1464
-     PhraseModel_0 = +3895.5
-     PhraseModel_1 = +4683.4
-     PhraseModel_2 = +1092.8
-     PhraseModel_3 = +1079.6
-     PhraseModel_4 = -6827.7
-     PhraseModel_5 = -888
-     PhraseModel_6 = +142
-       PassThrough = -1335
-        ---
-       1best avg score: 0.277 (+0.10736)
- 1best avg model score: -3110.5 (-67595)
-           avg # pairs: 1144.2
-        avg # rank err: 529.1
-     avg # margin viol: 0
-    non0 feature count: 859
-           avg list sz: 74.9
-           avg f count: 112.84
-(time 0.067 min, 0.4 s/S)
-
-Writing weights file to '-' ...
-done
-
----
-Best iteration: 2 [SCORE 'stupid_bleu'=0.277].
-This took 0.15 min.
diff --git a/training/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in
deleted file mode 100644
index a312809f..00000000
--- a/training/dtrain/test/parallelize/in
+++ /dev/null
@@ -1,10 +0,0 @@
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/training/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs
deleted file mode 100644
index 4d3128cb..00000000
--- a/training/dtrain/test/parallelize/refs
+++ /dev/null
@@ -1,10 +0,0 @@
-barack obama becomes the fourth american president to receive the nobel peace prize
-the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
-he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
-the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
-first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
-the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
-then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
-he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
-the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
-the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
diff --git a/training/dtrain/test/parallelize/test/cdec.ini b/training/dtrain/test/parallelize/test/cdec.ini
deleted file mode 100644
index 72e99dc5..00000000
--- a/training/dtrain/test/parallelize/test/cdec.ini
+++ /dev/null
@@ -1,22 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-intersection_strategy=cube_pruning
-cubepruning_pop_limit=200
-scfg_max_span_limit=15
-feature_function=WordPenalty
-feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5
-#feature_function=ArityPenalty
-#feature_function=CMR2008ReorderingFeatures
-#feature_function=Dwarf
-#feature_function=InputIndicator
-#feature_function=LexNullJump
-#feature_function=NewJump
-#feature_function=NgramFeatures
-#feature_function=NonLatinCount
-#feature_function=OutputIndicator
-#feature_function=RuleIdentityFeatures
-#feature_function=RuleNgramFeatures
-#feature_function=RuleShape
-#feature_function=SourceSpanSizeFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SpanFeatures
diff --git a/training/dtrain/test/parallelize/test/dtrain.ini b/training/dtrain/test/parallelize/test/dtrain.ini
deleted file mode 100644
index 03f9d240..00000000
--- a/training/dtrain/test/parallelize/test/dtrain.ini
+++ /dev/null
@@ -1,15 +0,0 @@
-k=100
-N=4
-learning_rate=0.0001
-gamma=0
-loss_margin=0
-epochs=1
-scorer=stupid_bleu
-sample_from=kbest
-filter=uniq
-pair_sampling=XYX
-hi_lo=0.1
-select_weights=last
-print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
-tmp=/tmp
-decoder_config=cdec.ini
diff --git a/training/dtrain/test/parallelize/test/in b/training/dtrain/test/parallelize/test/in
deleted file mode 100644
index a312809f..00000000
--- a/training/dtrain/test/parallelize/test/in
+++ /dev/null
@@ -1,10 +0,0 @@
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. und königin sonja eingeplant .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg>
-<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg>
diff --git a/training/dtrain/test/parallelize/test/refs b/training/dtrain/test/parallelize/test/refs
deleted file mode 100644
index 4d3128cb..00000000
--- a/training/dtrain/test/parallelize/test/refs
+++ /dev/null
@@ -1,10 +0,0 @@
-barack obama becomes the fourth american president to receive the nobel peace prize
-the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so .
-he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things .
-the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule .
-first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations .
-the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway .
-then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award .
-he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office .
-the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan .
-the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries .
diff --git a/training/dtrain/test/toy/input b/training/dtrain/test/toy/input
deleted file mode 100644
index 4d10a9ea..00000000
--- a/training/dtrain/test/toy/input
+++ /dev/null
@@ -1,2 +0,0 @@
-0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0	[NP] ||| ich ||| i ||| logp=0	[NP] ||| ein [NN,1] ||| a [1] ||| logp=0	[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1	[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1	[JJ] ||| kleines ||| small ||| logp=0 small_rule=1	[JJ] ||| kleines ||| little ||| logp=0 little_rule=1	[JJ] ||| grosses ||| big ||| logp=0	[JJ] ||| grosses ||| large ||| logp=0	[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0	[V] ||| sah ||| saw ||| logp=0	[V] ||| fand ||| found ||| logp=0
-1	ich fand ein kleines haus	i found a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0	[NP] ||| ich ||| i ||| logp=0	[NP] ||| ein [NN,1] ||| a [1] ||| logp=0	[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1	[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1	[JJ] ||| kleines ||| small ||| logp=0 small_rule=1	[JJ] ||| kleines ||| little ||| logp=0 little_rule=1	[JJ] ||| grosses ||| big ||| logp=0	[JJ] ||| grosses ||| large ||| logp=0	[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0	[V] ||| sah ||| saw ||| logp=0	[V] ||| fand ||| found ||| logp=0
diff --git a/training/latent_svm/Makefile.am b/training/latent_svm/Makefile.am
new file mode 100644
index 00000000..65c5e038
--- /dev/null
+++ b/training/latent_svm/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = latent_svm
+
+latent_svm_SOURCES = latent_svm.cc
+latent_svm_LDADD = ../..//decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/latent_svm/latent_svm.cc b/training/latent_svm/latent_svm.cc
new file mode 100644
index 00000000..ab9c1d5d
--- /dev/null
+++ b/training/latent_svm/latent_svm.cc
@@ -0,0 +1,412 @@
+/*
+Points to note regarding variable names:
+total_loss and prev_loss actually refer not to loss, but the metric (usually BLEU)
+*/
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+//boost libraries
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+//cdec libraries
+#include "config.h"
+#include "hg_sampler.h"
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+using boost::shared_ptr;
+namespace po = boost::program_options;
+
+bool invert_score; 
+boost::shared_ptr<MT19937> rng; //random seed ptr
+
+// Fills *p_ids with a random permutation of 0..len-1 (Fisher-Yates shuffle using the global rng).
+void RandomPermutation(int len, vector<int>* p_ids) {
+  vector<int>& ids = *p_ids;
+  ids.resize(len);
+  for (int i = 0; i < len; ++i) ids[i] = i;
+  for (int i = len; i > 0; --i) {
+    int j = rng->next() * i;
+    swap(ids[i-1], ids[j == i ? i - 1 : j]);  // clamp j == i to the last valid slot; ids[i] may lie past the end
+  }
+}
+
+// Parses the command line and the optional --config file into *conf; returns false (after printing a message) on --help, a missing required option or an unreadable config file.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("weights,w",po::value<string>(),"[REQD] Input feature weights file")
+        ("input,i",po::value<string>(),"[REQD] Input source file for development set")
+        ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data")
+        ("weights_write_interval,n", po::value<int>()->default_value(1000), "Number of lines between writing out weights")
+        ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
+        ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
+        ("regularizer_strength,C", po::value<double>()->default_value(0.01), "regularization strength")
+        ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Cost function is -mt_metric_scale*BLEU")
+        ("costaug_log_bleu,l", "Flag converts BLEU to log space. Cost function is thus -mt_metric_scale*log(BLEU). Not on by default")
+        ("average,A", "Average the weights (this is a weighted average due to the scaling factor)")
+        ("mu,u", po::value<double>()->default_value(0.0), "weight (between 0 and 1) to scale model score by for oracle selection")
+        ("stepsize_param,a", po::value<double>()->default_value(0.01), "Stepsize parameter, during optimization")
+        ("stepsize_reduce,t", "Divide step size by sqrt(number of examples seen so far), as per Ratliff et al., 2007")
+        ("metric_threshold,T", po::value<double>()->default_value(0.0), "Threshold for diff between oracle BLEU and cost-aug BLEU for updating the weights")
+        ("check_positive,P", "Check that the loss is positive before updating")
+        ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+        ("best_ever,b", "Keep track of the best hypothesis we've ever seen (metric score), and use that as the reference")
+        ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+        ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // stored first, so command line values take precedence
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    if (!config) { cerr << "Cannot read configuration file " << (*conf)["config"].as<string>() << endl; return false; }
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  if (conf->count("help") || !conf->count("weights") || !conf->count("input") || !conf->count("decoder_config") || !conf->count("reference")) {
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+double scaling_trick = 1; // see http://blog.smola.org/post/940672544/fast-quadratic-regularization-for-online-learning
+/* Cost-augmented score used to pick the negative example: the model score plus a cost of
+ * -mt_metric_scale * metric, or -mt_metric_scale * log(metric) when logbleu is set. */
+double cost_augmented_score(const LogVal<double> model_score, const double mt_metric_score, const double mt_metric_scale, const bool logbleu) {
+  // a zero metric has no finite log; such hypotheses get a fixed, very low score
+  if (logbleu && mt_metric_score == 0)
+    return -1000000;
+  // NOTE: log(model_score) is just the model score feature weights * features;
+  // it is multiplied by the global scaling_trick factor
+  const double scaled_model_score = log(model_score) * scaling_trick;
+  const double metric_term = logbleu ? log(mt_metric_score) : mt_metric_score;
+  return scaled_model_score + (- mt_metric_scale * metric_term);
+}
+
+/* Mixed score for oracle selection: mu * model score + (1 - mu) * metric (log(metric) when logbleu). */
+double muscore(const vector<weight_t>& feature_weights, const SparseVector<double>& feature_values, const double mt_metric_score, const double mu, const bool logbleu) {
+  const double weighted_model_score = feature_values.dot(feature_weights) * mu;
+  double metric_term = mt_metric_score;
+  if (logbleu) {
+    // log(0) is -inf, so a zero metric is replaced by a large negative constant
+    metric_term = (mt_metric_score != 0) ? log(mt_metric_score) : -1000000;
+  }
+  return weighted_model_score + (1 - mu) * metric_term;
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct HypothesisInfo {
+  SparseVector<double> features;  // feature values of the derivation this hypothesis was built from
+  double mt_metric_score;         // metric score of the hypothesis, as computed by the DocScorer
+  // The model score changes when the feature weights change, so it is not stored here
+  // It must be recomputed every time
+};
+
+struct GoodOracle {
+  shared_ptr<HypothesisInfo> good;  // oracle ("hope") hypothesis kept for one sentence; null until one is stored
+};
+
+// Decoder observer: for each decoded sentence it scans the k-best derivations of the translation
+// forest and records the model-best hypothesis, the cost-augmented best ("fear") hypothesis and
+// the oracle ("hope") hypothesis that main() uses for the weight update.
+struct TrainingObserver : public DecoderObserver {
+  TrainingObserver(const int k,
+                   const DocScorer& d,
+                   vector<GoodOracle>* o,
+                   const vector<weight_t>& feat_weights,
+                   const double metric_scale,
+                   const double Mu,
+                   const bool bestever,
+                   const bool LogBleu) : ds(d), feature_weights(feat_weights), oracles(*o), kbest_size(k), mt_metric_scale(metric_scale), mu(Mu), best_ever(bestever), log_bleu(LogBleu) {}
+  const DocScorer& ds;                          // scorers, indexed by sentence id
+  const vector<weight_t>& feature_weights;      // weights used for the mu-mixed oracle score
+  vector<GoodOracle>& oracles;                  // one oracle slot per sentence, indexed by sentence id
+  shared_ptr<HypothesisInfo> cur_best;          // first entry of the k-best list (model best)
+  shared_ptr<HypothesisInfo> cur_costaug_best;  // highest cost-augmented score
+  shared_ptr<HypothesisInfo> cur_ref;           // oracle for the current sentence
+  const int kbest_size;                         // number of derivations inspected per sentence
+  const double mt_metric_scale;                 // scale of the metric term in the cost-augmented score
+  const double mu;                              // if > 0, oracle selection mixes in the model score
+  const bool best_ever;                         // keep the best oracle seen so far for every sentence
+  const bool log_bleu;                          // use log(metric) instead of the metric in both scores
+
+  // Accessors for the hypotheses chosen by the latest UpdateOracles() call.
+  const HypothesisInfo& GetCurrentBestHypothesis() const { return *cur_best; }
+  const HypothesisInfo& GetCurrentCostAugmentedHypothesis() const { return *cur_costaug_best; }
+  const HypothesisInfo& GetCurrentReference() const { return *cur_ref; }
+
+  // DecoderObserver hook: re-selects the hypotheses from the forest passed in by the decoder.
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    UpdateOracles(smeta.GetSentenceID(), *hg);
+  }
+
+  shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double metric) {
+    shared_ptr<HypothesisInfo> h(new HypothesisInfo);
+    h->features = feats;
+    h->mt_metric_score = metric;
+    return h;
+  }
+
+  // Re-selects cur_best, cur_costaug_best and cur_ref from the k-best derivations of `forest`.
+  // With best_ever the oracle stored for this sentence competes with the new candidates and the
+  // winner is stored back in oracles[sent_id]; otherwise only the current list is considered.
+  void UpdateOracles(int sent_id, const Hypergraph& forest) {
+    cur_ref = oracles[sent_id].good; 
+    if(!best_ever)
+      cur_ref.reset();
+
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
+    double costaug_best_score = 0;
+
+    for (int i = 0; i < kbest_size; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+        kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+      if (!d) break;
+      double mt_metric_score = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); //this might need to change!!
+      const SparseVector<double>& feature_vals = d->feature_values; 
+      double costaugmented_score = cost_augmented_score(d->score, mt_metric_score, mt_metric_scale, log_bleu); //note that d->score, i.e., model score, is passed in
+      if (i == 0) { //i.e., setting up cur_best to be model score highest, and initializing costaug_best
+        cur_best = MakeHypothesisInfo(feature_vals, mt_metric_score);
+        cur_costaug_best = cur_best;
+        costaug_best_score = costaugmented_score; 
+      }
+      if (costaugmented_score > costaug_best_score) {   // kbest_mira's cur_bad, i.e., "fear" derivation
+        cur_costaug_best = MakeHypothesisInfo(feature_vals, mt_metric_score);
+        costaug_best_score = costaugmented_score;
+      }
+      double cur_muscore = mt_metric_score;
+      if (!cur_ref)   // kbest_mira's cur_good, i.e., "hope" derivation
+        cur_ref =  MakeHypothesisInfo(feature_vals, cur_muscore);
+      else {
+          double cur_ref_muscore = cur_ref->mt_metric_score;
+          if(mu > 0) { //select oracle with mixture of model score and BLEU
+              cur_ref_muscore =  muscore(feature_weights, cur_ref->features, cur_ref->mt_metric_score, mu, log_bleu);
+              cur_muscore = muscore(feature_weights, d->feature_values, mt_metric_score, mu, log_bleu);
+          }
+          if (cur_muscore > cur_ref_muscore) //replace oracle
+            cur_ref = MakeHypothesisInfo(feature_vals, mt_metric_score);
+      }
+    }
+    if (best_ever) oracles[sent_id].good = cur_ref;  // cur_ref is a copy: store it back or best_ever has no effect
+  }
+};
+
+// Appends every line of the file `fname` to *c.
+void ReadTrainingCorpus(const string& fname, vector<string>* c) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  // getline() fails only when nothing could be extracted, so a final line
+  // without a trailing newline is still appended
+  for (string line; getline(in, line); ) {
+    c->push_back(line);
+  }
+}
+
+bool ApproxEqual(double a, double b) {
+  const double kRelTolerance = 0.000001;  // largest accepted difference, relative to |b|
+  return a == b || (fabs(a-b)/fabs(b)) < kRelTolerance;
+}
+
+// Online training of the feature weights: every sentence is decoded with the current weights, an
+// oracle and a cost-augmented hypothesis are taken from its k-best list (see TrainingObserver),
+// and the weights are moved towards the oracle features and away from the cost-augmented ones.
+int main(int argc, char** argv) {
+  register_feature_functions();
+  SetSilent(true);  // turn off verbose decoder output
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+
+  const bool best_ever = conf.count("best_ever") > 0;
+  vector<string> corpus;
+  ReadTrainingCorpus(conf["input"].as<string>(), &corpus);
+
+  const string metric_name = conf["mt_metric"].as<string>(); //set up scoring; this may need to be changed!!
+  ScoreType type = ScoreTypeFromString(metric_name);
+  invert_score = (type == TER);
+  DocScorer ds(type, conf["reference"].as<vector<string> >(), "");
+  cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl;
+  if (ds.size() != corpus.size()) {
+    cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
+    return 1;
+  }
+
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+
+  // load initial weights
+  vector<weight_t>& decoder_weights = decoder.CurrentWeightVector(); //equivalent to "dense_weights" vector in kbest_mira.cc
+  SparseVector<weight_t> sparse_weights; //equivalent to kbest_mira.cc "lambdas"
+  Weights::InitFromFile(conf["weights"].as<string>(), &decoder_weights);
+  Weights::InitSparseVector(decoder_weights, &sparse_weights);
+
+  //initializing other algorithm and output parameters
+  const double c = conf["regularizer_strength"].as<double>();
+  const int weights_write_interval = conf["weights_write_interval"].as<int>();
+  const double mt_metric_scale = conf["mt_metric_scale"].as<double>();
+  const double mu = conf["mu"].as<double>();
+  const double metric_threshold = conf["metric_threshold"].as<double>();
+  const double stepsize_param = conf["stepsize_param"].as<double>(); //step size in structured SGD optimization step
+  const bool stepsize_reduce = conf.count("stepsize_reduce") > 0; 
+  const bool costaug_log_bleu = conf.count("costaug_log_bleu") > 0;
+  const bool average = conf.count("average") > 0;
+  const bool checkpositive = conf.count("check_positive") > 0;
+
+  assert(corpus.size() > 0);
+  vector<GoodOracle> oracles(corpus.size());
+  TrainingObserver observer(conf["k_best_size"].as<int>(),  // kbest size
+                            ds,                             // doc scorer
+                            &oracles,
+                            decoder_weights,
+                            mt_metric_scale,
+                            mu,
+                            best_ever,
+                            costaug_log_bleu);
+  int cur_sent = 0;
+  int line_count = 0;
+  int normalizer = 0; 
+  double total_loss = 0;
+  double prev_loss = 0;
+  int dots = 0;             // progress bar
+  int cur_pass = 0;
+  SparseVector<double> tot;
+  tot += sparse_weights; //add initial weights to total
+  normalizer++; //add 1 to normalizer
+  int max_iteration = conf["passes"].as<int>();
+  string msg = "# LatentSVM tuned weights";
+  vector<int> order;
+  int interval_counter = 0;
+  RandomPermutation(corpus.size(), &order); //shuffle corpus
+  // one iteration beyond passes * |corpus| is entered so that the end-of-pass block also runs for the final pass
+  while (line_count <= max_iteration * corpus.size()) { //loop over all (passes * num sentences) examples
+    if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.';}
+    if (interval_counter == weights_write_interval) { //i.e., we need to write out weights
+      // fold the accumulated regularization factor into the stored vectors before writing them
+      sparse_weights *= scaling_trick;
+      tot *= scaling_trick;
+      scaling_trick = 1;
+      cerr << " [SENTENCE NUMBER= " << cur_sent << "\n";
+      cerr << " [AVG METRIC LAST INTERVAL =" << ((total_loss - prev_loss) / weights_write_interval) << "]\n";
+      cerr << " [AVG METRIC THIS PASS THUS FAR =" << (total_loss / cur_sent) << "]\n";
+      cerr << " [TOTAL LOSS: =" << total_loss << "\n";
+      Weights::ShowLargestFeatures(decoder_weights);
+      interval_counter = 0;
+      prev_loss = total_loss;
+      if (average){
+        SparseVector<double> x = tot;
+        x /= normalizer;
+        ostringstream sa;
+        sa << "weights.latentsvm-" << line_count/weights_write_interval << "-avg.gz";
+        x.init_vector(&decoder_weights);
+        Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); 
+      }
+      else {
+        ostringstream os;
+        os << "weights.latentsvm-" << line_count/weights_write_interval << ".gz";
+        sparse_weights.init_vector(&decoder_weights);
+        Weights::WriteToFile(os.str(), decoder_weights, true, &msg);
+      }
+    }
+    if (corpus.size() == cur_sent) { //i.e., finished a pass
+      cerr << " [AVG METRIC LAST PASS=" << (total_loss / corpus.size()) << "]\n";
+      cerr << " TOTAL LOSS: " << total_loss << "\n";
+      Weights::ShowLargestFeatures(decoder_weights);
+      cur_sent = 0;
+      total_loss = 0;
+      dots = 0;
+      if(average) {
+        SparseVector<double> x = tot; 
+        x /= normalizer;
+        ostringstream sa;
+        sa << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz";
+        x.init_vector(&decoder_weights);
+        Weights::WriteToFile(sa.str(), decoder_weights, true, &msg);
+      } else {
+        // decoder_weights still holds the unscaled weights from before the last update, so the
+        // current weights (stored vector times the accumulated scaling factor) are written instead
+        SparseVector<weight_t> x = sparse_weights;
+        x *= scaling_trick;
+        ostringstream os;
+        os << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz";
+        x.init_vector(&decoder_weights);
+        Weights::WriteToFile(os.str(), decoder_weights, true, &msg);
+      }
+      cur_pass++;
+      RandomPermutation(corpus.size(), &order);
+    }
+    if (line_count == max_iteration * corpus.size()) break;  // all requested passes are done: do not start another one
+    if (cur_sent == 0) { //i.e., starting a new pass
+      cerr << "PASS " << (line_count / corpus.size() + 1) << endl;
+    }
+    sparse_weights.init_vector(&decoder_weights);   // copy sparse_weights to the decoder weights
+    decoder.SetId(order[cur_sent]); //assign current sentence
+    decoder.Decode(corpus[order[cur_sent]], &observer);  // decode/update oracles
+
+    const HypothesisInfo& cur_best = observer.GetCurrentBestHypothesis(); //model score best
+    const HypothesisInfo& cur_costaug = observer.GetCurrentCostAugmentedHypothesis(); //(model + cost) best; cost = -metric_scale*log(BLEU) or -metric_scale*BLEU
+    const HypothesisInfo& cur_ref = observer.GetCurrentReference();  //if mu > 0, this mu-mixed oracle will be picked; otherwise, only on BLEU
+    total_loss += cur_best.mt_metric_score; 
+
+    double step_size = stepsize_param;
+    if (stepsize_reduce) step_size /= (sqrt(cur_sent+1.0));       // w_{t+1} = w_t - stepsize_t * grad(Loss)
+    //actual update step - compute gradient, and modify sparse_weights
+    if(cur_ref.mt_metric_score - cur_costaug.mt_metric_score > metric_threshold) {
+      const double loss = (cur_costaug.features.dot(decoder_weights) - cur_ref.features.dot(decoder_weights)) * scaling_trick + mt_metric_scale * (cur_ref.mt_metric_score - cur_costaug.mt_metric_score);
+      if (!checkpositive || loss > 0.0) { //can update either all the time if check positive is off, or only when loss > 0 if it's on
+        sparse_weights -= cur_costaug.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick);    // cost augmented hyp orig -
+        sparse_weights += cur_ref.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick);        // ref orig +
+      }
+    }
+    scaling_trick *= (1.0 - 2.0 * step_size * c);
+
+    tot += sparse_weights; //for averaging purposes
+    normalizer++; //for averaging purposes
+    line_count++;
+    interval_counter++;
+    cur_sent++;
+  }
+  cerr << endl;
+  if(average) {
+    tot /= normalizer;
+    tot.init_vector(decoder_weights);
+    msg = "# Latent SSVM tuned weights (averaged vector)";
+    Weights::WriteToFile("weights.latentsvm-final-avg.gz", decoder_weights, true, &msg); 
+    cerr << "Optimization complete.\n" << "AVERAGED WEIGHTS: weights.latentsvm-final-avg.gz\n";
+  } else {
+    // decoder_weights holds the weights written when the last pass ended (the initial weights if no pass ran)
+    Weights::WriteToFile("weights.latentsvm-final.gz", decoder_weights, true, &msg);    
+    cerr << "Optimization complete.\n";
+  }
+  return 0;
+}
+
diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am
index fa4fb22d..8cddc2d7 100644
--- a/training/mira/Makefile.am
+++ b/training/mira/Makefile.am
@@ -1,6 +1,11 @@
-bin_PROGRAMS = kbest_mira
+bin_PROGRAMS = kbest_mira \
+		kbest_cut_mira 
 
 kbest_mira_SOURCES = kbest_mira.cc
 kbest_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
 
+
+kbest_cut_mira_SOURCES = kbest_cut_mira.cc
+kbest_cut_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
+
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc
new file mode 100644
index 00000000..7df9a18f
--- /dev/null
+++ b/training/mira/kbest_cut_mira.cc
@@ -0,0 +1,954 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <algorithm>
+
+#include "config.h"
+
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "time.h"
+#include "sampler.h"
+
+#include "weights.h"
+#include "sparse_vector.h"
+
+using namespace std;
+using boost::shared_ptr;
+namespace po = boost::program_options;
+
+bool invert_score;
+boost::shared_ptr<MT19937> rng;
+bool approx_score;
+bool no_reweight;
+bool no_select;
+bool unique_kbest;
+int update_list_size;
+vector<weight_t> dense_w_local;
+double mt_metric_scale;
+int optimizer;
+int fear_select;
+int hope_select;
+bool pseudo_doc;
+bool sent_approx;
+bool checkloss;
+
+void SanityCheck(const vector<double>& w) {  // asserts that every weight is a finite number
+  for (vector<double>::const_iterator weight = w.begin(); weight != w.end(); ++weight) {
+    assert(!isnan(*weight));
+    assert(!isinf(*weight));
+  }
+}
+
+struct FComp {  // orders feature ids by decreasing absolute weight
+  const vector<double>& w_;
+  FComp(const vector<double>& w) : w_(w) {}
+  bool operator()(int a, int b) const {
+    return fabs(w_[b]) < fabs(w_[a]);
+  }
+};
+
+void ShowLargestFeatures(const vector<double>& w) {
+  // ids 0..|w|-1; only the (at most) ten weights of largest magnitude are sorted to the front
+  vector<int> fnums;
+  fnums.reserve(w.size());
+  for (int id = 0; id < w.size(); ++id) fnums.push_back(id);
+  const size_t num_shown = (w.size() > 10 ? 10 : w.size());
+  vector<int>::iterator mid = fnums.begin() + num_shown;
+  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
+  cerr << "TOP FEATURES:";
+  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i)
+    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
+  cerr << endl;
+}
+
+// Parses the command line and the optional --config file into *conf; returns false (after printing a message) on --help, a missing required option (input_weights, decoder_config, reference) or an unreadable config file.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+    ("input_weights,w",po::value<string>(),"Input feature weights file")
+    ("source,i",po::value<string>(),"Source file for development set")
+    ("pass,p", po::value<int>()->default_value(15), "Current pass through the training data")
+    ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
+    ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
+    ("optimizer,o",po::value<int>()->default_value(1), "Optimizer (SGD=1, PA MIRA w/Delta=2, Cutting Plane MIRA=3, PA MIRA=4, Triple nbest list MIRA=5)")
+    ("fear,f",po::value<int>()->default_value(1), "Fear selection (model-cost=1, maxcost=2, maxscore=3)")
+    ("hope,h",po::value<int>()->default_value(1), "Hope selection (model+cost=1, mincost=2)")
+    ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)")
+    ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+    ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by")
+    ("sent_approx,a", "Use smoothed sentence-level BLEU score for approximate scoring")
+    ("pseudo_doc,e", "Use pseudo-document BLEU score for approximate scoring")
+    ("no_reweight,d","Do not reweight forest for cutting plane")
+    ("no_select,n", "Do not use selection heuristic")
+    ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+    ("update_k_best,b", po::value<int>()->default_value(1), "Size of good, bad lists to perform update with")
+    ("unique_k_best,u", "Unique k-best translation list")
+    ("weights_output,O",po::value<string>(),"Directory to write weights to")
+    ("output_dir,D",po::value<string>(),"Directory to place output in")
+    ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+    ("config", po::value<string>(), "Configuration file")
+    ("help,H", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);  // stored first, so command line values take precedence
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    if (!config) { cerr << "Cannot read configuration file " << (*conf)["config"].as<string>() << endl; return false; }
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) {
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score
+
+
+static const double kMINUS_EPSILON = -1e-6;
+static const double EPSILON = 0.000001;
+static const double SMO_EPSILON = 0.0001;
+static const double PSEUDO_SCALE = 0.95;
+static const int MAX_SMO = 10;
+int cur_pass;
+
+struct HypothesisInfo {
+  SparseVector<double> features;          // feature values of the hypothesis
+  vector<WordID> hyp;                     // the hypothesis itself, as word ids
+  double mt_metric;                       // metric score; the comparators below sort on it
+  double hope;                            // CuttingPlane sets this to mt_metric + model score
+  double fear;                            // CuttingPlane sets this to the loss relative to the hope hypothesis
+  double alpha;                           // bounds the step in ComputeDelta; presumably the constraint's dual variable - TODO confirm
+  double oracle_loss;                     // loss term; ComputeDelta uses the difference of two of them
+  SparseVector<double> oracle_feat_diff;  // not used in the visible code; presumably oracle minus hypothesis features - TODO confirm
+  shared_ptr<HypothesisInfo> oracleN;     // oracle hypothesis whose features enter the margin in ComputeDelta
+};
+
+bool ApproxEqual(double a, double b) {
+  // identical values first; otherwise the difference relative to |b| must stay below EPSILON
+  return a == b || (fabs(a-b)/fabs(b)) < EPSILON;
+}
+
+typedef shared_ptr<HypothesisInfo> HI;
+
+// Sort predicate: higher metric score first.
+bool HypothesisCompareB(const HI& h1, const HI& h2) {
+  return h2->mt_metric < h1->mt_metric;
+}
+
+// Sort predicate: higher hope score first.
+bool HopeCompareB(const HI& h1, const HI& h2) {
+  return h2->hope < h1->hope;
+}
+
+// Sort predicate: higher fear score first.
+bool FearCompareB(const HI& h1, const HI& h2) {
+  return h2->fear < h1->fear;
+}
+
+// Sort predicate: higher model score under the global dense_w_local weights first.
+bool FearComparePred(const HI& h1, const HI& h2) {
+  return h2->features.dot(dense_w_local) < h1->features.dot(dense_w_local);
+}
+
+// Sort predicate: lower metric score first.
+bool HypothesisCompareG(const HI& h1, const HI& h2) {
+  return h1->mt_metric < h2->mt_metric;
+}
+
+
+// Takes the maximum-fear hypothesis of all_hyp and appends it to the constraint set *cur_c,
+// setting *again, if its fear exceeds that of every constraint already in the set by more than
+// SMO_EPSILON. With no_reweight, hope and fear are first recomputed under dense_weights and
+// all_hyp is re-sorted; otherwise all_hyp[0] is taken as the maximum-fear hypothesis as is.
+void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vector<shared_ptr<HypothesisInfo> >& all_hyp, vector<weight_t> dense_weights)
+{
+  bool DEBUG_CUT = false;
+  shared_ptr<HypothesisInfo> max_fear, max_fear_in_set;
+  vector<shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c;
+  if (all_hyp.empty()) return;  // no hypotheses, hence no constraint to add
+
+  if (no_reweight) {
+    //find new hope hypothesis
+    for (int u = 0; u != all_hyp.size(); u++) {
+      double t_score = all_hyp[u]->features.dot(dense_weights);
+      all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score;
+    }
+
+    //sort hyps by hope score
+    sort(all_hyp.begin(),all_hyp.end(),HopeCompareB);
+
+    double hope_score = all_hyp[0]->features.dot(dense_weights);
+    if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl;
+
+    for (int u = 0; u != all_hyp.size(); u++) {
+      double t_score = all_hyp[u]->features.dot(dense_weights);
+      all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss
+    }
+
+    sort(all_hyp.begin(),all_hyp.end(),FearCompareB);
+  }
+  //assign maximum fear derivation from all derivations
+  max_fear = all_hyp[0];
+
+  if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<<max_fear->fear ;
+  for (int i = 0; i < cur_constraint.size(); i++) { //select maximal violator already in constraint set
+    if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear)
+      max_fear_in_set = cur_constraint[i];
+  }
+  if(DEBUG_CUT && max_fear_in_set) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl;
+
+  // an empty constraint set has no violator to compare against, so max_fear is added outright
+  if (!max_fear_in_set || max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) {
+    cur_constraint.push_back(max_fear);
+    *again = true;
+    if(DEBUG_CUT) cerr << "Optimize Again " << *again << endl;
+  }
+}
+
+
+// Step for the hypothesis pair (*cur_p)[0], (*cur_p)[1]: (margin + loss) divided by
+// (squared feature distance * max_step_size), clipped to the range [-alpha of [0], alpha of [1]].
+double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_size,vector<weight_t> dense_weights )
+{
+  const HypothesisInfo& first = *(*cur_p)[0];
+  const HypothesisInfo& second = *(*cur_p)[1];
+  const double first_score = first.features.dot(dense_weights);
+  const double second_score = second.features.dot(dense_weights);
+  const double loss = first.oracle_loss - second.oracle_loss;
+  const double margin = -(first.oracleN->features.dot(dense_weights) - first_score) + (second.oracleN->features.dot(dense_weights) - second_score);
+  const double num = margin + loss;
+  cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << second_score << " " << first_score <<endl;
+
+  SparseVector<double> diff = first.features;
+  diff -= second.features;
+  const double diffsqnorm = diff.l2norm_sq();
+  // identical feature vectors give no direction to step along
+  double delta = (diffsqnorm > 0) ? num / (diffsqnorm * max_step_size) : 0;
+  cerr << " D1:" << delta;
+  //clip delta (enforce margin constraints)
+  delta = max(-first.alpha, min(delta, second.alpha));
+  cerr << " D2:" << delta;
+  return delta;
+}
+
+
+// Picks the next pair of constraints to optimize jointly. Returns an empty vector when no
+// constraint meets the selection conditions any more (or when only one constraint exists).
+// With no_select or optimizer == 2 the first two constraints are returned unconditionally.
+vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> >* cur_c)
+{
+  bool DEBUG_SELECT= false;
+  vector<shared_ptr<HypothesisInfo> >& constraints = *cur_c;
+  vector<shared_ptr<HypothesisInfo> > pair;
+
+  //skip heuristic search and return oracle and fear for pa-mira
+  if (no_select || optimizer == 2) {
+    pair.push_back(constraints[0]);
+    pair.push_back(constraints[1]);
+    return pair;
+  }
+
+  const int num_constraints = constraints.size();
+  for (int u = 0; u != num_constraints; ++u) {
+    const shared_ptr<HypothesisInfo>& candidate = constraints[u];
+    if(DEBUG_SELECT) cerr<< "cur alpha " << u  << " " << candidate->alpha;
+
+    // maximal violator among all the other constraints (the first one wins ties)
+    shared_ptr<HypothesisInfo> max_fear;
+    for (int i = 0; i < num_constraints; ++i) {
+      if (i == u) continue;
+      if (!max_fear || constraints[i]->fear > max_fear->fear)
+        max_fear = constraints[i];
+    }
+    if (!max_fear) return pair;  // the set holds a single constraint: nothing to pair it with
+
+
+    int partner = -1;  // index of the constraint to pair with candidate, -1 while none is found
+
+    // candidate has alpha == 0 but violates more than every other constraint:
+    // pair it with the first other constraint whose alpha is positive
+    if (candidate->alpha == 0 && candidate->fear > max_fear->fear + SMO_EPSILON) {
+      for (int i = 0; i < num_constraints && partner < 0; ++i) {
+        if (i != u && constraints[i]->alpha > 0)
+          partner = i;
+      }
+    }
+
+    // candidate has alpha > 0 but violates less than the maximal violator:
+    // pair it with the first other constraint of strictly larger fear
+    if (partner < 0 && candidate->alpha > 0 && candidate->fear < max_fear->fear - SMO_EPSILON) {
+      for (int i = 0; i < num_constraints && partner < 0; ++i) {
+        if (i != u && constraints[i]->fear > candidate->fear)
+          partner = i;
+      }
+    }
+
+    if (partner >= 0) {
+      pair.push_back(candidate);
+      pair.push_back(constraints[partner]);
+      return pair;
+    }
+  }
+
+  return pair; //no more constraints to optimize, we're done here
+}
+
+struct GoodBadOracle {
+  vector<shared_ptr<HypothesisInfo> > good;  // "good" hypotheses of one sentence; cleared at the start of UpdateOracles
+  vector<shared_ptr<HypothesisInfo> > bad;   // "bad" hypotheses of one sentence; cleared at the start of UpdateOracles
+};
+
+// Decoder observer that, for every decoded sentence, pulls a k-best list from
+// the translation forest, scores each candidate against the references and
+// fills the per-sentence hope ("good") and fear ("bad") lists used by the
+// optimizers in main().
+struct TrainingObserver : public DecoderObserver {
+  // k:   number of derivations requested from each forest
+  // d:   reference scorer, indexed by sentence id
+  // o:   per-sentence hope/fear lists, written by UpdateOracles
+  // cbs: per-sentence statistics of the previous pass' 1-best translations
+  //
+  // The initializer list follows declaration order and gives every scalar
+  // member a defined start value; corpus_src_length in particular is read by
+  // the pseudo-document branch of UpdateOracles before anything assigns it.
+  TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o, vector<ScoreP>* cbs)
+    : ds(d),
+      corpus_bleu_sent_stats(*cbs),
+      oracles(*o),
+      kbest_size(k),
+      cur_sent(0),
+      corpus_bleu_score(0),
+      corpus_src_length(0),
+      curr_src_length(0) {
+    //calculate corpus bleu score from previous iterations 1-best for BLEU gain
+    if (!pseudo_doc && !sent_approx && cur_pass > 0) {
+      ScoreP acc;
+      for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) {
+        if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); }
+        acc->PlusEquals(*corpus_bleu_sent_stats[ii]);
+      }
+      corpus_bleu_stats = acc;
+      // acc stays null when no previous-pass statistics were supplied
+      if (acc) corpus_bleu_score = acc->ComputeScore();
+    }
+  }
+
+  const DocScorer& ds;
+  vector<ScoreP>& corpus_bleu_sent_stats;
+  vector<GoodBadOracle>& oracles;
+  vector<shared_ptr<HypothesisInfo> > cur_best;  // first update_list_size entries of the k-best list
+  shared_ptr<HypothesisInfo> cur_oracle;         // NOTE(review): never assigned inside this struct
+  const int kbest_size;
+  Hypergraph forest;                             // copy of the most recently decoded forest
+  int cur_sent;
+  ScoreP corpus_bleu_stats;
+  float corpus_bleu_score;
+
+  float corpus_src_length;  // decayed running source length (pseudo-document mode)
+  float curr_src_length;    // source length of the sentence being processed
+
+  const int GetCurrentSent() const {
+    return cur_sent;
+  }
+
+  // First entry of the current k-best list; requires a non-empty cur_best.
+  const HypothesisInfo& GetCurrentBestHypothesis() const {
+    return *cur_best[0];
+  }
+
+  const vector<shared_ptr<HypothesisInfo> > GetCurrentBest() const {
+    return cur_best;
+  }
+
+  const HypothesisInfo& GetCurrentOracle() const {
+    return *cur_oracle;
+  }
+
+  const Hypergraph& GetCurrentForest() const {
+    return forest;
+  }
+
+  // Decoder callback: record the sentence id and source length, rebuild the
+  // hope/fear lists from the new forest, then keep a copy of the forest.
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    cur_sent = smeta.GetSentenceID();
+    curr_src_length = (float) smeta.GetSourceLength();
+
+    if (unique_kbest)
+      UpdateOracles<KBest::FilterUnique>(smeta.GetSentenceID(), *hg);
+    else
+      UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(smeta.GetSentenceID(), *hg);
+    forest = *hg;
+  }
+
+  // Bundle features, metric score and yield of one candidate.
+  shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score, const vector<WordID>& hyp) {
+    shared_ptr<HypothesisInfo> h(new HypothesisInfo);
+    h->features = feats;
+    h->mt_metric = score;
+    h->hyp = hyp;
+    return h;
+  }
+
+  // Rebuild cur_best and the hope/fear lists of sentence sent_id from the
+  // k-best derivations of forest. Filter is the k-best duplicate filter.
+  template <class Filter>
+  void UpdateOracles(int sent_id, const Hypergraph& forest) {
+    bool PRINT_LIST = false;
+    vector<shared_ptr<HypothesisInfo> >& cur_good = oracles[sent_id].good;
+    vector<shared_ptr<HypothesisInfo> >& cur_bad = oracles[sent_id].bad;
+    //TODO: look at keeping previous iterations hypothesis lists around
+    cur_best.clear();
+    cur_good.clear();
+    cur_bad.clear();
+
+    vector<shared_ptr<HypothesisInfo> > all_hyp;
+
+    typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,Filter> K;
+    K kbest(forest,kbest_size);
+
+    for (int i = 0; i < kbest_size; ++i) {
+      typename K::Derivation *d =
+        kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+      if (!d) break;
+
+      float sentscore;
+      if (cur_pass > 0 && !pseudo_doc && !sent_approx) {
+        ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield);
+        ScoreP corpus_no_best = corpus_bleu_stats->GetZero();
+
+        corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best);
+        sent_stats->PlusEquals(*corpus_no_best, 0.5);
+
+        //compute gain from new sentence in 1-best corpus
+        sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score);
+      } else if (pseudo_doc) {  //pseudo-corpus smoothing
+        float src_scale = corpus_src_length + curr_src_length;
+        ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield);
+        if (!corpus_bleu_stats) { corpus_bleu_stats = sent_stats->GetZero(); }
+
+        sent_stats->PlusEquals(*corpus_bleu_stats);
+        sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore();
+      } else {  //use sentence-level smoothing ( used when cur_pass=0 if not pseudo_doc)
+        sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore());
+      }
+
+      if (invert_score) sentscore *= -1.0;
+
+      if (i < update_list_size) {
+        if (PRINT_LIST) cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
+        cur_best.push_back(MakeHypothesisInfo(d->feature_values, sentscore, d->yield));
+      }
+
+      //store all hyp to extract hope and fear
+      all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore, d->yield));
+    }
+
+    if (pseudo_doc) {
+      //update pseudo-doc stats
+      string details, details2;
+      corpus_bleu_stats->ScoreDetails(&details2);
+      ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp);
+      corpus_bleu_stats->PlusEquals(*sent_stats);
+
+      sent_stats->ScoreDetails(&details);
+      // replace the running statistics by a copy scaled with PSEUDO_SCALE
+      sent_stats = corpus_bleu_stats;
+      corpus_bleu_stats = sent_stats->GetZero();
+      corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE);
+
+      corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length);
+      cerr << "ps corpus size: " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl;
+    }
+
+    //figure out how many hyps we can keep maximum
+    int temp_update_size = update_list_size;
+    if (all_hyp.size() < update_list_size) { temp_update_size = all_hyp.size(); }
+
+    //sort all hyps by sentscore (eg. bleu)
+    sort(all_hyp.begin(), all_hyp.end(), HypothesisCompareB);
+
+    if (PRINT_LIST) {
+      cerr << "Sorting " << endl;
+      for (int u = 0; u != all_hyp.size(); u++)
+        cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_w_local) << endl;
+    }
+
+    if (hope_select == 1) {
+      //find hope hypothesis using model + bleu
+      if (PRINT_LIST) cerr << "HOPE " << endl;
+      for (int u = 0; u != all_hyp.size(); u++) {
+        double t_score = all_hyp[u]->features.dot(dense_w_local);
+        all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score;
+        if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl;
+      }
+
+      //sort hyps by hope score
+      sort(all_hyp.begin(), all_hyp.end(), HopeCompareB);
+    }
+
+    //assign cur_good the sorted list
+    cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
+    if (PRINT_LIST) {
+      cerr << "GOOD" << endl;
+      for (int u = 0; u != cur_good.size(); u++)
+        cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;
+    }
+
+    //use hope for fear selection
+    shared_ptr<HypothesisInfo>& oracleN = cur_good[0];
+
+    if (fear_select == 1) {  //compute fear hyps with model - bleu
+      if (PRINT_LIST) cerr << "FEAR " << endl;
+      double hope_score = oracleN->features.dot(dense_w_local);
+
+      if (PRINT_LIST) cerr << "hope score " << hope_score << endl;
+      for (int u = 0; u != all_hyp.size(); u++) {
+        double t_score = all_hyp[u]->features.dot(dense_w_local);
+
+        all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss
+        all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric;
+        all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features;
+        all_hyp[u]->oracleN = oracleN;
+        if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl;
+      }
+
+      sort(all_hyp.begin(), all_hyp.end(), FearCompareB);
+    } else if (fear_select == 2) {  //select fear based on cost
+      sort(all_hyp.begin(), all_hyp.end(), HypothesisCompareG);
+    } else {  //max model score, also known as prediction-based
+      sort(all_hyp.begin(), all_hyp.end(), FearComparePred);
+    }
+    cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
+
+    if (PRINT_LIST) {
+      cerr<< "BAD"<<endl;
+      for (int u = 0; u != cur_bad.size(); u++)
+        cerr << cur_bad[u]->mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;
+    }
+
+    cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl;
+    cerr << " CUR: " << cur_best[0]->mt_metric << endl;
+    cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl;
+  }
+};
+
+// Append every line of the file fname to *c, in file order.
+void ReadTrainingCorpus(const string& fname, vector<string>* c) {
+  ReadFile corpus_file(fname);
+  istream& input = *corpus_file.stream();
+  string sentence;
+  // getline() leaves the stream in a failed state once nothing more can be read
+  while (getline(input, sentence))
+    c->push_back(sentence);
+}
+
+// Score the translations written by the previous pass and append the
+// per-sentence statistics to *c.
+//
+// cur_pass: current pass number; pass 0 reads <od>/run.raw.init, later passes
+//           read <od>/run.raw.<cur_pass-1>.B
+// c:        receives one ScoreP per line of that file
+// ds:       reference scorer; line number lc is scored against ds[lc]
+// od:       output directory holding the run.raw.* files
+//
+// The accumulated corpus-level score is only reported on stderr.
+void ReadPastTranslationForScore(const int cur_pass, vector<ScoreP>* c, DocScorer& ds, const string& od)
+{
+  cerr << "Reading BLEU gain file ";
+  string fname;
+  if (cur_pass == 0) {
+    fname = od + "/run.raw.init";
+  } else {
+    int last_pass = cur_pass - 1;
+    fname = od + "/run.raw."  +  boost::lexical_cast<std::string>(last_pass) + ".B";
+  }
+  cerr << fname << "\n";
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  ScoreP acc;
+  string line;
+  int lc = 0;
+  // Loop on the result of getline itself. A getline that fails because the
+  // stream is already at end-of-file does not clear `line`, so testing
+  // line.empty() afterwards would score the last sentence a second time
+  // whenever the file lacks a trailing newline.
+  while (getline(in, line)) {
+    vector<WordID> sent;
+    TD::ConvertSentence(line, &sent);
+    ScoreP sentscore = ds[lc]->ScoreCandidate(sent);
+    c->push_back(sentscore);
+    if (!acc) { acc = sentscore->GetZero(); }
+    acc->PlusEquals(*sentscore);
+    ++lc;
+  }
+
+  assert(lc > 0);
+  float score = acc->ComputeScore();
+  string details;
+  acc->ScoreDetails(&details);
+  cerr << "Previous run: " << details << score << endl;
+}
+
+
+// Entry point of one online large-margin training pass.
+//
+// Reads the options, loads references and initial weights, then decodes the
+// input sentence by sentence. After each sentence the hope/fear lists built
+// by TrainingObserver drive one weight update (selected by --optimizer).
+// At the end the final and the averaged weight vectors are written to the
+// weights output directory. Returns 0 on success, 1 on bad command line.
+int main(int argc, char** argv) {
+  register_feature_functions();
+  SetSilent(true);  // turn off verbose decoder output
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+
+  vector<string> corpus;
+
+  const string metric_name = conf["mt_metric"].as<string>();
+  optimizer = conf["optimizer"].as<int>();
+  fear_select = conf["fear"].as<int>();
+  hope_select = conf["hope"].as<int>();
+  mt_metric_scale = conf["mt_metric_scale"].as<double>();
+  approx_score = conf.count("approx_score");
+  no_reweight = conf.count("no_reweight");
+  no_select = conf.count("no_select");
+  update_list_size = conf["update_k_best"].as<int>();
+  unique_kbest = conf.count("unique_k_best");
+  pseudo_doc = conf.count("pseudo_doc");
+  sent_approx = conf.count("sent_approx");
+  cerr << "Using pseudo-doc:" << pseudo_doc << " Sent:" << sent_approx << endl;
+  if (pseudo_doc)
+    mt_metric_scale = 1;
+
+  const string weights_dir = conf["weights_output"].as<string>();
+  const string output_dir = conf["output_dir"].as<string>();
+  ScoreType type = ScoreTypeFromString(metric_name);
+
+  //establish metric used for tuning
+  if (type == TER) {
+    invert_score = true;
+  } else {
+    invert_score = false;
+  }
+
+  //load references
+  DocScorer ds(type, conf["reference"].as<vector<string> >(), "");
+  cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl;
+  vector<ScoreP> corpus_bleu_sent_stats;
+
+  //check training pass,if >0, then use previous iterations corpus bleu stats
+  cur_pass = conf["pass"].as<int>();
+  if (cur_pass > 0) {
+    ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir);
+  }
+
+  cerr << "Using optimizer:" << optimizer << endl;
+
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+
+  vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+
+  SparseVector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights);
+  Weights::InitSparseVector(dense_weights, &lambdas);
+
+  const string input = decoder.GetConf()["input"].as<string>();
+  if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl;
+  ReadFile in_read(input);
+  istream *in = in_read.stream();
+  assert(*in);
+  string buf;
+
+  const double max_step_size = conf["max_step_size"].as<double>();
+
+  vector<GoodBadOracle> oracles(ds.size());
+
+  TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles, &corpus_bleu_sent_stats);
+
+  int cur_sent = 0;
+  int lcount = 0;
+  double objective = 0;
+  double tot_loss = 0;
+  int dots = 0;
+  SparseVector<double> tot;
+  SparseVector<double> final_tot;
+
+  SparseVector<double> old_lambdas = lambdas;
+  tot.clear();
+  tot += lambdas;
+  cerr << "PASS " << cur_pass << " " << endl << lambdas << endl;
+  ScoreP acc, acc_h, acc_f;
+
+  // Loop on the result of getline itself: a getline that fails at end-of-file
+  // leaves buf untouched, so checking only buf.empty() afterwards would decode
+  // (and update on) the last sentence twice when the input has no trailing
+  // newline.
+  while (getline(*in, buf)) {
+    if (buf.empty()) continue;
+    //TODO: allow batch updating
+    lambdas.init_vector(&dense_weights);
+    dense_w_local = dense_weights;
+    decoder.SetId(cur_sent);
+    decoder.Decode(buf, &observer);  // decode the sentence, calling Notify to get the hope,fear, and model best hyps.
+
+    cur_sent = observer.GetCurrentSent();
+    cerr << "SENT: " << cur_sent << endl;
+    const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis();
+    const HypothesisInfo& cur_good = *oracles[cur_sent].good[0];
+    const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0];
+
+    vector<shared_ptr<HypothesisInfo> >& cur_good_v = oracles[cur_sent].good;
+    vector<shared_ptr<HypothesisInfo> >& cur_bad_v = oracles[cur_sent].bad;
+    vector<shared_ptr<HypothesisInfo> > cur_best_v = observer.GetCurrentBest();
+
+    tot_loss += cur_hyp.mt_metric;
+
+    //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus
+    ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp);
+    if (!acc) { acc = sentscore->GetZero(); }
+    acc->PlusEquals(*sentscore);
+
+    ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp);
+    if (!acc_h) { acc_h = hope_sentscore->GetZero(); }
+    acc_h->PlusEquals(*hope_sentscore);
+
+    ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp);
+    if (!acc_f) { acc_f = fear_sentscore->GetZero(); }
+    acc_f->PlusEquals(*fear_sentscore);
+
+    if (optimizer == 4) {  //passive-aggressive update (single dual coordinate step)
+      double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights);
+      double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric);
+      const double loss = margin +  mt_loss;
+      cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) <<endl;
+      if (loss > 0.0 || !checkloss) {
+        SparseVector<double> diff = cur_good.features;
+        diff -= cur_bad.features;
+
+        double diffsqnorm = diff.l2norm_sq();
+        double delta;
+        if (diffsqnorm > 0)
+          delta = loss / (diffsqnorm);
+        else
+          delta = 0;
+
+        if (delta > max_step_size) delta = max_step_size;
+        lambdas += (cur_good.features * delta);
+        lambdas -= (cur_bad.features * delta);
+      }
+    }
+    else if (optimizer == 1) {  //sgd - nonadapted step size
+      lambdas += (cur_good.features) * max_step_size;
+      lambdas -= (cur_bad.features) * max_step_size;
+    }
+    else if (optimizer == 5) {  //full mira with n-best list of constraints from hope, fear, model best
+      // each insert goes to the front, so the final order is hope, model best, fear
+      vector<shared_ptr<HypothesisInfo> > cur_constraint;
+      cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end());
+      cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end());
+      cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end());
+
+      bool optimize_again;
+      vector<shared_ptr<HypothesisInfo> > cur_pair;
+      //SMO
+      for (int u = 0; u != cur_constraint.size(); u++)
+        cur_constraint[u]->alpha = 0;
+
+      cur_constraint[0]->alpha = 1; //set oracle to alpha=1
+
+      cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl;
+      int smo_iter = MAX_SMO, smo_iter2 = MAX_SMO;
+      int iter = 0, iter2 = 0;
+      bool DEBUG_SMO = false;
+      while (iter2 < smo_iter2) {
+        iter = 0;
+        while (iter < smo_iter) {
+          optimize_again = true;
+          for (int i = 0; i < cur_constraint.size(); i++)
+            for (int j = i+1; j < cur_constraint.size(); j++) {
+              if (DEBUG_SMO) cerr << "start " << i << " " << j <<  endl;
+              cur_pair.clear();
+              cur_pair.push_back(cur_constraint[j]);
+              cur_pair.push_back(cur_constraint[i]);
+              double delta = ComputeDelta(&cur_pair, max_step_size, dense_weights);
+
+              if (delta == 0) optimize_again = false;
+              cur_constraint[j]->alpha += delta;
+              cur_constraint[i]->alpha -= delta;
+              double step_size = delta * max_step_size;
+
+              lambdas += (cur_constraint[i]->features) * step_size;
+              lambdas -= (cur_constraint[j]->features) * step_size;
+              if (DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " <<  delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha <<  endl;
+            }
+          iter++;
+
+          if (!optimize_again) {
+            iter = MAX_SMO;
+            cerr << "Optimization stopped, delta =0" << endl;
+          }
+        }
+        iter2++;
+      }
+    }
+    else if (optimizer == 2 || optimizer == 3) {  //PA and Cutting Plane MIRA update
+      bool DEBUG_SMO = true;
+      vector<shared_ptr<HypothesisInfo> > cur_constraint;
+      cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set
+      bool optimize_again = true;
+      int cut_plane_calls = 0;
+      while (optimize_again) {
+        if (DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl;
+        if (optimizer == 2) {  //PA
+          cur_constraint.push_back(cur_bad_v[0]);
+
+          //check if we have a violation
+          if (!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) {
+            optimize_again = false;
+            cerr << "Constraint not violated" << endl;
+          }
+        } else {  //cutting plane to add constraints
+          if (DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl;
+          optimize_again = false;
+          cut_plane_calls++;
+          CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights);
+          if (cut_plane_calls >= MAX_SMO) optimize_again = false;
+        }
+
+        if (optimize_again) {
+          //SMO
+          for (int u = 0; u != cur_constraint.size(); u++) {
+            cur_constraint[u]->alpha = 0;
+          }
+          cur_constraint[0]->alpha = 1;
+          cerr <<" Optimizing with " << cur_constraint.size() << " constraints" << endl;
+          int smo_iter = MAX_SMO;
+          int iter = 0;
+          while (iter < smo_iter) {
+            //select pair to optimize from constraint set
+            vector<shared_ptr<HypothesisInfo> > cur_pair = SelectPair(&cur_constraint);
+
+            if (cur_pair.empty()) {
+              iter = MAX_SMO;
+              cerr << "Undefined pair " << endl;
+              continue;
+            } //pair is undefined so we are done with this smo
+
+            double delta = ComputeDelta(&cur_pair, max_step_size, dense_weights);
+
+            cur_pair[0]->alpha += delta;
+            cur_pair[1]->alpha -= delta;
+            double step_size = delta * max_step_size;
+            cerr << "step " << step_size << endl;
+
+            lambdas += (cur_pair[1]->features) * step_size;
+            lambdas -= (cur_pair[0]->features) * step_size;
+            cerr << " Lambdas " << lambdas << endl;
+
+            //reload weights based on update
+            dense_weights.clear();
+            lambdas.init_vector(&dense_weights);
+            dense_w_local = dense_weights;
+            iter++;
+
+            if (DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha <<  endl;
+            //don't use selection heuristic to determine when to stop SMO, rather just when delta =0
+            if (no_select && delta == 0) iter = MAX_SMO;
+
+            //only perform one dual coordinate ascent step
+            if (optimizer == 2) {
+              optimize_again = false;
+              iter = MAX_SMO;
+            }
+          }
+          if (optimizer == 3 && !no_reweight) {  //reweight the forest and select a new k-best
+            if (DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl;
+            Hypergraph hg = observer.GetCurrentForest();
+            hg.Reweight(dense_weights);
+            if (unique_kbest)
+              observer.UpdateOracles<KBest::FilterUnique>(cur_sent, hg);
+            else
+              observer.UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(cur_sent, hg);
+          }
+        }
+      }
+
+      //print objective after this sentence
+      double lambda_change = (lambdas - old_lambdas).l2norm_sq();
+      double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear;
+
+      for (int u = 0; u != cur_constraint.size(); u++) {
+        cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl;
+        temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear;
+      }
+      objective += temp_objective;
+
+      cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl;
+    }
+
+    if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; }
+    tot += lambdas;
+    ++lcount;
+    cur_sent++;
+
+    // one output line per sentence: hope ||| model best ||| fear
+    cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl;
+  }
+
+  cerr << "FINAL OBJECTIVE: "<< objective << endl;
+  final_tot += tot;
+  cerr << "Translated " << lcount << " sentences " << endl;
+  cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n";
+  tot_loss = 0;
+
+  // random tag that becomes part of both weight file names
+  int node_id = rng->next() * 100000;
+  cerr << " Writing weights to " << node_id << endl;
+  Weights::ShowLargestFeatures(dense_weights);
+  dots = 0;
+  ostringstream os;
+  os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz";
+  string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount);
+  lambdas.init_vector(&dense_weights);
+  Weights::WriteToFile(os.str(), dense_weights, true, &msg);
+
+  // tot holds the initial weights plus one copy per processed sentence,
+  // hence the divisor lcount+1
+  SparseVector<double> x = tot;
+  x /= lcount+1;
+  ostringstream sa;
+  string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount);
+  sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz";
+  x.init_vector(&dense_weights);
+  Weights::WriteToFile(sa.str(), dense_weights, true, &msga);
+
+  cerr << "Optimization complete.\n";
+  return 0;
+}
+
diff --git a/training/mira/kbest_mira.cc b/training/mira/kbest_mira.cc
index 8b7993dd..d59b4224 100644
--- a/training/mira/kbest_mira.cc
+++ b/training/mira/kbest_mira.cc
@@ -8,9 +8,11 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "stringlib.h"
 #include "hg_sampler.h"
 #include "sentence_metadata.h"
-#include "scorer.h"
+#include "ns.h"
+#include "ns_docscorer.h"
 #include "verbose.h"
 #include "viterbi.h"
 #include "hg.h"
@@ -91,8 +93,9 @@ struct GoodBadOracle {
 };
 
 struct TrainingObserver : public DecoderObserver {
-  TrainingObserver(const int k, const DocScorer& d, bool sf, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k), sample_forest(sf) {}
-  const DocScorer& ds;
+  TrainingObserver(const int k, const DocumentScorer& d, const EvaluationMetric& m, bool sf, vector<GoodBadOracle>* o) : ds(d), metric(m), oracles(*o), kbest_size(k), sample_forest(sf) {}
+  const DocumentScorer& ds;
+  const EvaluationMetric& metric;
   vector<GoodBadOracle>& oracles;
   std::tr1::shared_ptr<HypothesisInfo> cur_best;
   const int kbest_size;
@@ -121,13 +124,16 @@ struct TrainingObserver : public DecoderObserver {
     if (sample_forest) {
       vector<WordID> cur_prediction;
       ViterbiESentence(forest, &cur_prediction);
-      float sentscore = ds[sent_id]->ScoreCandidate(cur_prediction)->ComputeScore();
+      SufficientStats sstats;
+      ds[sent_id]->Evaluate(cur_prediction, &sstats);
+      float sentscore = metric.ComputeScore(sstats);
       cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore);
 
       vector<HypergraphSampler::Hypothesis> samples;
       HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples);
       for (unsigned i = 0; i < samples.size(); ++i) {
-        sentscore = ds[sent_id]->ScoreCandidate(samples[i].words)->ComputeScore();
+        ds[sent_id]->Evaluate(samples[i].words, &sstats);
+        float sentscore = metric.ComputeScore(sstats);
         if (invert_score) sentscore *= -1.0;
         if (!cur_good || sentscore > cur_good->mt_metric)
           cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore);
@@ -136,11 +142,13 @@ struct TrainingObserver : public DecoderObserver {
       }
     } else {
       KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
+      SufficientStats sstats;
       for (int i = 0; i < kbest_size; ++i) {
         const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
           kbest.LazyKthBest(forest.nodes_.size() - 1, i);
         if (!d) break;
-        float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore();
+        ds[sent_id]->Evaluate(d->yield, &sstats);
+        float sentscore = metric.ComputeScore(sstats);
         if (invert_score) sentscore *= -1.0;
         // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
         if (i == 0)
@@ -192,15 +200,20 @@ int main(int argc, char** argv) {
   }
   vector<string> corpus;
   ReadTrainingCorpus(conf["source"].as<string>(), &corpus);
-  const string metric_name = conf["mt_metric"].as<string>();
-  ScoreType type = ScoreTypeFromString(metric_name);
-  if (type == TER) {
-    invert_score = true;
-  } else {
-    invert_score = false;
+
+  string metric_name = UppercaseString(conf["mt_metric"].as<string>());
+  if (metric_name == "COMBI") {
+    cerr << "WARNING: 'combi' metric is no longer supported, switching to 'COMB:TER=-0.5;IBM_BLEU=0.5'\n";
+    metric_name = "COMB:TER=-0.5;IBM_BLEU=0.5";
+  } else if (metric_name == "BLEU") {
+    cerr << "WARNING: 'BLEU' is ambiguous, assuming 'IBM_BLEU'\n";
+    metric_name = "IBM_BLEU";
   }
-  DocScorer ds(type, conf["reference"].as<vector<string> >(), "");
+  EvaluationMetric* metric = EvaluationMetric::Instance(metric_name);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
   cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl;
+  invert_score = metric->IsErrorMetric();
+
   if (ds.size() != corpus.size()) {
     cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
     return 1;
@@ -221,7 +234,7 @@ int main(int argc, char** argv) {
   assert(corpus.size() > 0);
   vector<GoodBadOracle> oracles(corpus.size());
 
-  TrainingObserver observer(conf["k_best_size"].as<int>(), ds, sample_forest, &oracles);
+  TrainingObserver observer(conf["k_best_size"].as<int>(), ds, *metric, sample_forest, &oracles);
   int cur_sent = 0;
   int lcount = 0;
   int normalizer = 0;
diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl
new file mode 100755
index 00000000..d71590ba
--- /dev/null
+++ b/training/mira/run_mira.pl
@@ -0,0 +1,630 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0));
+push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+
+my $iteration = 0.0;
+my $max_iterations = 10;
+my $metric = "ibm_bleu";
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $jobs = $default_jobs;   # number of decode nodes
+my $pmem = "1g";
+my $dir;
+
+my $SCORER = $FAST_SCORE;
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+
+my $run_local = 0;
+my $pass_suffix = '';
+
+my $cdec ="$bin_dir/kbest_cut_mira"; 
+
+die "Can't find decoder in $cdec" unless -x $cdec;
+my $decoder = $cdec;
+my $decoderOpt;
+my $update_size;
+my $approx_score;
+my $kbest_size=250;
+my $metric_scale=1;
+my $optimizer=2;
+my $disable_clean = 0;
+my $use_make=0;  
+my $density_prune;
+my $cpbin=1;
+my $help = 0;
+my $epsilon = 0.0001;
+my $step_size = 0.01;
+my $gpref;
+my $unique_kbest;
+my $freeze;
+my $hopes=1;
+my $fears=1;
+my $sent_approx=0;
+my $pseudo_doc=0;
+
+my $range = 35000;
+my $minimum = 15000;
+my $portn = int(rand($range)) + $minimum;
+
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+        "decoder=s" => \$decoderOpt,
+        "jobs=i" => \$jobs,
+        "density-prune=f" => \$density_prune,
+        "dont-clean" => \$disable_clean,
+        "pass-suffix=s" => \$pass_suffix,
+        "epsilon=s" => \$epsilon,
+        "help" => \$help,
+        "local" => \$run_local,
+        "use-make=i" => \$use_make,
+        "max-iterations=i" => \$max_iterations,
+        "pmem=s" => \$pmem,
+        "cpbin!" => \$cpbin,
+        "ref-files=s" => \$refFiles,
+        "metric=s" => \$metric,
+        "source-file=s" => \$srcFile,
+        "weights=s" => \$initialWeights,
+	"optimizer=i" => \$optimizer,
+	"metric-scale=i" => \$metric_scale,
+	"kbest-size=i" => \$kbest_size,
+	"update-size=i" => \$update_size,
+	"step-size=f" => \$step_size,
+	"hope-select=i" => \$hopes,
+	"fear-select=i" => \$fears,
+	"sent-approx" => \$sent_approx,
+        "pseudo-doc" => \$pseudo_doc,
+	"unique-kbest" => \$unique_kbest,
+        "grammar-prefix=s" => \$gpref,
+	"freeze" => \$freeze,
+        "workdir=s" => \$dir,
+	) == 0 || @ARGV!=1 || $help) {
+        print_help();
+        exit;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+
+#my $refs_comma_sep = get_comma_sep_refs($refFiles);
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+#my $refs_comma_sep_4cdec = get_comma_sep_refs_4cdec($refFiles);
+
+unless ($dir){
+        $dir = "mira";
+}
+unless ($dir =~ /^\//){  # convert relative path to absolute path
+        my $basedir = check_output("pwd");
+        chomp $basedir;
+        $dir = "$basedir/$dir";
+}
+
+if ($decoderOpt){ $decoder = $decoderOpt; }
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+# Kill every recorded child process, run the registered cleanup commands,
+# then leave with exit status 1.
+sub cleanup {
+        print STDERR "Cleanup...\n";
+        foreach my $child (@childpids) {
+                unchecked_call("kill $child");
+        }
+        foreach my $command (@cleanupcmds) {
+                unchecked_call("$command");
+        }
+        exit 1;
+};
+
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+    sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+# Copy binaries into a directory and repoint the caller's variables at the copies.
+# Arguments: the target directory, followed by references to scalars that hold
+# the paths of the binaries.
+sub modbin {
+    my ($target_dir, @path_refs) = @_;
+    check_call("mkdir -p $target_dir");
+    die "couldn't make bindir $target_dir" unless -d $target_dir;
+    foreach my $path_ref (@path_refs) {
+        my $original = ${$path_ref};
+        my $copy = "$target_dir/" . basename($original);
+        # update the caller's variable first, then copy (mode and times kept by cp -p)
+        ${$path_ref} = $copy;
+        check_call("cp -p $original $copy");
+    }
+}
+# Return the number of entries of directory $_[0] minus one (the count
+# includes "." and ".."), or -1 when the directory cannot be opened.
+sub dirsize {
+    opendir(my $dh, $_[0]) or return -1;
+    # readdir has to run in list context: in scalar context it returns just the
+    # next entry name, not a count.
+    my @entries = readdir($dh);
+    closedir($dh);
+    return scalar(@entries) - 1;
+}
+
+
+
+
+# Refuse to reuse a non-empty working directory that already contains a
+# "weights" entry; otherwise create it, record how to rerun this command,
+# and install the initial weights as weights.0.
+if (-e $dir && dirsize($dir)>1 && -e "$dir/weights" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+    die "ERROR: working dir $dir already exists\n\n";
+} else {
+    -e $dir || mkdir $dir;
+    mkdir "$dir/scripts";
+    my $cmdfile="$dir/rerun-mira.sh";
+    open(my $cmd_fh, '>', $cmdfile) or die "Can't write $cmdfile: $!";
+    print $cmd_fh "cd ",getcwd(),"\n";
+    my $cline=cmdline()."\n";
+    print $cmd_fh $cline;
+    close($cmd_fh) or die "Can't close $cmdfile: $!";
+    print STDERR $cline;
+    chmod(0755,$cmdfile);
+    # the option that sets $initialWeights is --weights (see GetOptions above)
+    unless (defined $initialWeights && -e $initialWeights) {
+        print STDERR "Please specify an initial weights file with --weights\n";
+        print_help();
+        exit;
+    }
+    check_call("cp $initialWeights $dir/weights.0");
+    die "Can't find weights.0" unless (-e "$dir/weights.0");
+}
+write_config(*STDERR);
+
+# Stage the decoder configuration inside the working directory and use that
+# copy from here on.
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+# Pass the dev source through enseg() and switch to the generated file.
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc, $gpref);
+$srcFile = $newsrc;
+
+# Number of lines in the generated dev input.
+my $devSize = 0;
+open(F, "<$srcFile") or die "Can't read $srcFile: $!";
+$devSize++ while <F>;
+close(F);
+
+# Bookkeeping shared with the optimization loop below.
+my ($bestScoreIter, $bestScore) = (-1, -1);
+my $lastPScore = 0;
+my $lastWeightsFile;
+
+# Fall back to the k-best size when no update size was supplied.
+$update_size = $kbest_size unless $update_size;
+# main optimization loop
+# Each pass decodes the dev input with weights.<iter>, splits the decoder
+# output into hope/best/fear files, scores them, and averages the per-pass
+# weight files into weights.<iter+1>, which the next pass reads.
+#while (1){
+for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) {
+
+	print STDERR "\n\nITERATION $opt_iter\n==========\n";
+	print STDERR "Using port $portn\n";
+
+	# iteration-specific files
+	my $runFile="$dir/run.raw.$opt_iter";
+	my $onebestFile="$dir/1best.$opt_iter";
+	my $logdir="$dir/logs.$opt_iter";
+	my $decoderLog="$logdir/decoder.sentserver.log.$opt_iter";
+	my $scorerLog="$logdir/scorer.log.$opt_iter";
+	my $weightdir="$dir/weights.pass$opt_iter/";
+	check_call("mkdir -p $logdir");
+	check_call("mkdir -p $weightdir");
+
+	#decode
+	print STDERR "RUNNING DECODER AT ";
+	print STDERR unchecked_output("date");
+#	my $im1 = $opt_iter - 1;
+	my $weightsFile="$dir/weights.$opt_iter";
+	# NOTE(review): this progress text goes to STDOUT, which also carries the
+	# final weights path printed after the loop -- confirm that consumers of
+	# STDOUT tolerate it.
+	print "ITER $iteration " ;
+	# The pass number is handed to the decoder with a literal "0" prefix.
+	my $cur_pass = "-p 0$opt_iter";
+	my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir  -h $hopes -f $fears -C $step_size";
+	# Optional decoder switches, appended only when the matching option is set.
+	if($unique_kbest){
+		$decoder_cmd .= " -u";
+	}
+	if($sent_approx){
+		$decoder_cmd .= " -a";
+	}
+	if($pseudo_doc){
+                $decoder_cmd .= " -e";
+        }
+	if ($density_prune) {
+		$decoder_cmd .= " --density_prune $density_prune";
+	}
+	# Choose how the dev input is fed to the decoder: directly (local run),
+	# through $parallelize with --use-fork, or through $parallelize with
+	# $jobs jobs and a base port.
+	my $pcmd;
+	if ($run_local) {
+		$pcmd = "cat $srcFile |";
+	} elsif ($use_make) {
+	    # TODO: Throw error when jobs is specified with use_make
+		$pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+	} 
+	else {
+	    $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --baseport $portn --";
+	}
+	# Decoder stderr goes to the decoder log, stdout to the raw run file.
+	my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+	print STDERR "COMMAND:\n$cmd\n";
+	check_bash_call($cmd);
+
+	# The run file must have one line per dev sentence. Check up to six
+	# times, sleeping ten seconds after each mismatch, before giving up.
+	my $retries = 0;
+        my $num_topbest;
+        while($retries < 6) {
+            $num_topbest = check_output("wc -l < $runFile");
+            print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+            if($devSize == $num_topbest) {
+                last;
+            } else {
+                print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n";
+                sleep(10);
+            }
+            $retries++;
+        }
+	 die "Dev set contains $devSize sentences, but we don't have topbest for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_topbest);
+
+
+	#score the output from this iteration
+	# Every run line is split on " ||| " into hope, best and fear fields,
+	# which are written to the .H, .B and .F files respectively.
+	open RUN, "<$runFile" or die "Can't read $runFile: $!";
+	open H, ">$runFile.H" or die;
+	open F, ">$runFile.F" or die;
+	open B, ">$runFile.B" or die;
+	while(<RUN>) {
+	    chomp();
+	    (my $hope,my $best,my $fear) = split(/ \|\|\| /);
+	    print H "$hope \n"; 	    
+	    print B "$best \n";
+ 	    print F "$fear \n";
+	}
+	close RUN;
+	close F; close B; close H;
+	
+	# Score each of the three files with $SCORER against the references.
+	my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -m $metric");
+	my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -m $metric");
+	my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -m $metric");
+	chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f;
+	print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n";
+	# Remember the iteration with the highest "best" score so far.
+	if ($dec_score> $bestScore){
+		$bestScoreIter=$opt_iter; 
+		$bestScore=$dec_score;
+	}
+	# save space
+	check_call("gzip -f $runFile");
+	check_call("gzip -f $decoderLog");
+	# NOTE(review): $iter_filler is assigned here but not read anywhere in
+	# the visible code.
+		my $iter_filler="";
+	if($opt_iter < 10)
+	{$iter_filler="0";}
+
+	my $nextIter = $opt_iter + 1;
+	my $newWeightsFile = "$dir/weights.$nextIter";
+	$lastWeightsFile = "$dir/weights.$opt_iter";
+
+	# Combine this pass's gzipped weight files into the next weights file.
+	average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir);
+	# Best effort: the exit status of this compression is not checked.
+	system("gzip -f $logdir/kbes*");
+	print STDERR "\n==========\n";
+	$iteration++;
+}
+print STDERR "\nBEST ITER: $bestScoreIter :: $bestScore\n\n\n";
+
+# The weights file that the final pass decoded with is reported on STDOUT.
+print STDOUT "$lastWeightsFile\n";
+
+# Count the lines of a file.
+#   $fn - path of the file to read
+# Returns the number of lines; dies when the file cannot be opened.
+sub get_lines {
+  my ($fn) = @_;
+  # A lexical handle is closed reliably and cannot collide with handles
+  # used elsewhere in the script.
+  open(my $fh, '<', $fn) or die "Couldn't read $fn: $!";
+  my $lc = 0;
+  while (defined(my $line = <$fh>)) {
+    $lc++;
+  }
+  close($fh);
+  return $lc;
+}
+
+# Build a repeated-flag argument string from a shell file pattern.
+#   first argument  - flag letter/name, emitted as "-<flag>"
+#   second argument - pattern handed to the shell's "echo" for expansion
+# Returns "-<flag> file1 -<flag> file2 ..." for the whitespace-separated
+# words that echo printed.
+sub get_comma_sep_refs {
+  my ($flag, $pattern) = @_;
+  my $expanded = check_output("echo $pattern");
+  chomp $expanded;
+  my $joined = join(" -$flag ", split(/\s+/, $expanded));
+  return "-$flag " . $joined;
+}
+
+
+# Read the weight values from a weights file.
+#   $file - path of the file; lines starting with "#" and blank lines are
+#           skipped, every other line is expected to be "<name> <weight>"
+# Returns the weights, in file order, joined by single spaces.
+# Dies when the file cannot be opened; lines without two fields only warn.
+sub read_weights_file {
+  my ($file) = @_;
+  # Use a lexical handle and a named line variable: the main script keeps
+  # its own data in the bareword handle F and in $_.
+  open(my $in, '<', $file) or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;
+  while (my $line = <$in>) {
+    next if $line =~ /^#/;
+    next if $line =~ /^\s*$/;
+    chomp $line;
+    if ($line =~ /^(.+)\s+(.+)$/) {
+      my ($m, $w) = ($1, $2);
+      # NOTE(review): $pm is never advanced, so this only rejects names that
+      # compare numerically <= -1 -- confirm whether an ordering check is
+      # still wanted.
+      die "Weights out of order: $m <= $pm" unless $m > $pm;
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $line";
+    }
+  }
+  close($in);
+  return join ' ', @r;
+}
+
+# Print the run configuration, one "LABEL: value" line per setting, to the
+# filehandle passed as the only argument. The values come from the script's
+# option variables.
+sub write_config {
+	my ($out) = @_;
+	my $cleanup = $disable_clean ? "no" : "yes";
+
+	my @rows = (
+		[ "DECODER:          ", $decoder ],
+		[ "INI FILE:         ", $iniFile ],
+		[ "WORKING DIR:      ", $dir ],
+		[ "SOURCE (DEV):     ", $srcFile ],
+		[ "REFS (DEV):       ", $refFiles ],
+		[ "EVAL METRIC:      ", $metric ],
+		[ "START ITERATION:  ", $iteration ],
+		[ "MAX ITERATIONS:   ", $max_iterations ],
+		[ "DECODE NODES:     ", $jobs ],
+		[ "HEAD NODE:        ", $host ],
+		[ "PMEM (DECODING):  ", $pmem ],
+		[ "CLEANUP:          ", $cleanup ],
+		[ "INITIAL WEIGHTS:  ", $initialWeights ],
+		[ "GRAMMAR PREFIX:   ", $gpref ],
+	);
+
+	print $out "\n";
+	foreach my $row (@rows) {
+		my ($label, $value) = @{$row};
+		print $out "$label$value\n";
+	}
+}
+
+# Write a weights file with one "<feature> <weight>" line per feature.
+#   $neww - path of the file to create or overwrite
+#   $rfn  - array reference of feature names
+#   $rpts - array reference of weights, parallel to $rfn
+# Dies when the two arrays differ in length or the file cannot be written.
+sub update_weights_file {
+  my ($neww, $rfn, $rpts) = @_;
+  my @feats = @$rfn;
+  my @pts = @$rpts;
+  my $num_feats = scalar @feats;
+  my $num_pts = scalar @pts;
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open(my $out, '>', $neww) or die "Couldn't write $neww: $!";
+  for my $i (0 .. $#feats) {
+    print {$out} "$feats[$i] $pts[$i]\n";
+  }
+  # Buffered write errors only surface at close, so check it.
+  close($out) or die "Couldn't finish writing $neww: $!";
+}
+
+# Wrap every line of a source file in a <seg> element.
+#   $src         - input file, one sentence per line
+#   $newsrc      - output file to create
+#   $grammarpref - optional prefix; when defined, every generated <seg> gets
+#                  grammar="<prefix>.<line index>.gz"
+# Lines that already start with <seg are copied unchanged, but must carry a
+# numeric id attribute. Line indices are zero-based and also advance for
+# pre-tagged lines.
+# Dies when either file cannot be opened or a pre-tagged line has no id.
+sub enseg {
+    my ($src, $newsrc, $grammarpref) = @_;
+
+    # A missing source must not silently produce an empty dev input.
+    open(my $in, '<', $src) or die "Can't read $src: $!";
+    open(my $out, '>', $newsrc) or die "Can't write $newsrc: $!";
+    my $i=0;
+    while (my $line=<$in>){
+        chomp $line;
+        if ($line =~ /^\s*<seg/i) {
+            if($line =~ /id="[0-9]+"/) {
+                print {$out} "$line\n";
+            } else {
+                die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+            }
+        }
+        elsif (defined $grammarpref) {
+            print {$out} "<seg id=\"$i\" grammar=\"$grammarpref.$i.gz\">$line</seg>\n";
+        }
+        else {
+            print {$out} "<seg id=\"$i\">$line</seg>\n";
+        }
+        $i++;
+    }
+    close($in);
+    close($out) or die "Can't finish writing $newsrc: $!";
+}
+
+# Print the usage message to the currently selected output handle.
+# The text interpolates the script name, $max_iterations and $default_jobs.
+sub print_help {
+    # File::Basename avoids a shell round trip and copes with odd paths.
+    my $executable = basename($0);
+    # The heredoc interpolates, so each literal backslash in the example
+    # invocation has to be written as "\\" to survive into the output.
+    print <<"Help";
+
+Usage: $executable [options] <ini file>
+        Runs a complete MIRA optimization using the ini file specified.
+	Example invocation:
+	run_mira.pl \\
+        --pmem 3g \\
+        --max-iterations 20 \\
+        --optimizer 2 \\
+        --unique-kbest \\
+        --jobs 15 \\
+        --kbest-size 500 \\
+        --hope-select 1 \\
+        --fear-select 1  \\
+        --ref-files "ref.0.soseos ref.1.soseos" \\
+        --source-file src.soseos \\
+        --weights weights.init \\
+        --workdir workdir \\
+        --grammar-prefix grammars/grammar \\
+        --step-size 0.01 \\
+        --metric-scale 10000
+
+Required:
+
+        --ref-files <files>
+                Dev set ref files.  This option takes only a single string argument.
+                To use multiple files (including file globbing), this argument should
+                be quoted.
+        --source-file <file>
+                Dev set source file.
+        --weights <file>
+                Initial weights file
+
+General options:
+
+        --help
+                Print this message and exit.
+
+       --max-iterations <M>
+                Maximum number of iterations to run.  If not specified, defaults
+                to $max_iterations.
+
+        --metric <method>
+                Metric to optimize.
+                Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+        --workdir <dir>
+                Directory for intermediate and output files.  If not specified, the
+                name is derived from the ini filename.  Assuming that the ini
+                filename begins with the decoder name and ends with ini, the default
+                name of the working directory is inferred from the middle part of
+                the filename.  E.g. an ini file named decoder.foo.ini would have
+                a default working directory name foo.
+	--optimizer <I>
+		Learning method to use for weight update. Choices are 1) SGD, 2) PA MIRA with Selection from Cutting Plane, 3) Cutting Plane MIRA, 4) PA MIRA,5) nbest MIRA with hope, fear, and model constraints
+	--metric-scale <I>
+		Scale MT loss by this amount when computing hope/fear candidates
+	--kbest-size <I>
+		Size of k-best list to extract from forest
+	--update-size <I>
+		Size of k-best list to use for update (applies to optimizer 5)
+	--step-size <F>
+		Controls aggressiveness of update (C)
+	--hope-select <I>
+		How to select hope candidate. Choices are 1) model score - cost, 2) min cost
+	--fear-select <I>
+		How to select fear candidate. Choices are 1) model score + cost, 2) max cost, 3) max score
+	--sent-approx
+		Use smoothed sentence-level MT metric
+	--pseudo-doc
+		Use pseudo document to approximate MT metric
+	--unique-kbest
+		Extract unique k-best from forest
+	--grammar-prefix <path>
+		Path to sentence-specific grammar files
+
+Job control options:
+
+        --jobs <I>
+                Number of decoder processes to run in parallel. [default=$default_jobs]
+
+        --pmem <N>
+                Amount of physical memory requested for parallel decoding jobs
+                (used with qsub requests only)
+
+	--local 
+		Run single learner
+	--use-make <I>
+		Run parallel learners on a single machine through fork.
+
+
+Help
+}
+
+
+# Return the script name followed by the saved original arguments,
+# separated by single spaces (no shell quoting is applied).
+sub cmdline {
+    my @words = ($0, @ORIG_ARGV);
+    return join(q{ }, @words);
+}
+
+# Characters that make an argument unsafe to pass to the shell unquoted.
+# The "$" must be written as "\$": an unescaped "$!" inside qr{} is
+# interpolated as the errno variable, which drops "$" and "!" from the class
+# and adds the letters of whatever error text $! holds at that moment.
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}\$!()]};
+# Characters that get a backslash in front of them inside double quotes.
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+# Quote one argument for the shell when it needs quoting.
+#   argument - the string to protect (may be undef)
+# Returns undef for an undefined argument, the argument unchanged when it
+# contains no shell-special character, and otherwise the argument wrapped in
+# double quotes with \ " $ ` ! backslash-escaped.
+sub escape_shell {
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+# Apply escape_shell to a chomped copy of every argument and return the
+# resulting list; the caller's values are left untouched.
+sub escaped_shell_args {
+    my @escaped;
+    foreach my $raw (@_) {
+        my $copy = $raw;
+        chomp $copy;
+        push @escaped, escape_shell($copy);
+    }
+    return @escaped;
+}
+
+# Escape every argument with escaped_shell_args and join the results with
+# single spaces.
+sub escaped_shell_args_str {
+    my @quoted = escaped_shell_args(@_);
+    return join(' ', @quoted);
+}
+
+# Return the script name followed by the shell-escaped original arguments.
+sub escaped_cmdline {
+    my $args = escaped_shell_args_str(@ORIG_ARGV);
+    return $0 . ' ' . $args;
+}
+
+# Average the weight files of one pass into a single weights file.
+#   $path    - glob pattern matching the gzipped per-learner weight files
+#   $out     - path of the averaged weights file to write
+#   $logpath - accepted for the caller's convenience; not used here
+# Each input file is unpacked, read, and packed again. A line starting with
+# "#" is split on " ||| " and its third field becomes the multiplier for the
+# feature lines that follow it; feature lines read before any such header
+# count with multiplier 0. Every feature's multiplier-weighted sum is
+# divided by the sum of the per-file multipliers.
+# Progress messages go to STDOUT. Dies when the output cannot be written or
+# when there are features but the multipliers sum to zero.
+sub average_weights {
+    my ($path, $out, $logpath) = @_;
+    print "AVERAGE $path $out\n";
+    my %feature_weights = ();
+    my $total_mult = 0;
+    # NOTE(review): presumably this pause lets a shared filesystem settle
+    # before globbing -- confirm it is still required.
+    sleep(10);
+    foreach my $file (glob "$path") {
+        check_bash_call("gzip -d $file");
+        # Strip only the trailing extension, not a ".gz" earlier in the path.
+        $file =~ s/\.gz$//;
+        my $mult = 0;
+        print "FILE $file \n";
+        open(my $score, '<', $file) or next;
+        while (my $line = <$score>) {
+            if ($line !~ m/^\#/) {
+                my @s = split(" ", $line);
+                # Skip blank or truncated lines instead of recording an
+                # empty feature name.
+                next unless @s >= 2;
+                $feature_weights{$s[0]} += $mult * $s[1];
+            }
+            else {
+                (my $msg, my $ran, $mult) = split(/ \|\|\| /, $line);
+                print "Processing $ran $mult\n";
+            }
+        }
+        $total_mult += $mult;
+
+        close($score);
+        check_bash_call("gzip $file");
+    }
+
+    # print out new averaged weights
+    # "next" is not valid here (this is not inside a loop), so a failed open
+    # has to be fatal.
+    open(my $avg, '>', $out) or die "Can't write averaged weights to $out: $!";
+    for my $f ( keys %feature_weights ) {
+        print "$f $feature_weights{$f} $total_mult\n";
+        die "Cannot average weights for $path: multipliers sum to zero\n"
+            unless $total_mult;
+        my $ave = $feature_weights{$f} / $total_mult;
+
+        print "Printing $f $ave ||| ";
+        print {$avg} "$f $ave\n";
+    }
+    # Close explicitly so the next decoder pass sees the complete file.
+    close($avg) or die "Can't finish writing $out: $!";
+}
author	Paul Baltescu <pauldb89@gmail.com>	2013-04-24 17:18:10 +0100
committer	Paul Baltescu <pauldb89@gmail.com>	2013-04-24 17:18:10 +0100
commit	e8b412577b9d3fe2090b9f48443f919cd268c809 (patch)
tree	b46a7b51d365519dfb5170d71bac33be6d3e29b9 /training
parent	d189426a7ea56b71eb6e25ed02a7b0993cfb56a8 (diff)
parent	5aee54869aa19cfe9be965e67a472e94449d16da (diff)