-rw-r--r--  training/dtrain/Makefile.am                   |  2
-rw-r--r--  training/dtrain/dtrain.cc                     | 99
-rw-r--r--  training/dtrain/dtrain.h                      | 48
-rw-r--r--  training/dtrain/examples/standard/cdec.ini    |  2
-rw-r--r--  training/dtrain/examples/standard/dtrain.ini  |  4
-rwxr-xr-x  training/dtrain/lplp.rb                       |  7
-rwxr-xr-x  training/dtrain/parallelize.rb                | 45
-rw-r--r--  training/dtrain/sample.h                      | 31
-rw-r--r--  training/dtrain/score.h                       | 51
-rw-r--r--  training/dtrain/update.h                      | 34
10 files changed, 155 insertions(+), 168 deletions(-)
diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am
index aadd376d..a6c65b1e 100644
--- a/training/dtrain/Makefile.am
+++ b/training/dtrain/Makefile.am
@@ -1,6 +1,6 @@
bin_PROGRAMS = dtrain
-dtrain_SOURCES = dtrain.cc dtrain.h sample.h update.h score.h
+dtrain_SOURCES = dtrain.cc dtrain.h sample.h score.h update.h
dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 1b7047b0..63b154b4 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -1,6 +1,6 @@
#include "dtrain.h"
-#include "score.h"
#include "sample.h"
+#include "score.h"
#include "update.h"
using namespace dtrain;
@@ -16,21 +16,20 @@ main(int argc, char** argv)
const size_t N = conf["N"].as<size_t>();
const size_t T = conf["iterations"].as<size_t>();
const weight_t eta = conf["learning_rate"].as<weight_t>();
- const weight_t error_margin = conf["error_margin"].as<weight_t>();
+ const weight_t margin = conf["margin"].as<weight_t>();
const bool average = conf["average"].as<bool>();
- const bool keep = conf["keep"].as<bool>();
const weight_t l1_reg = conf["l1_reg"].as<weight_t>();
+ const bool keep = conf["keep"].as<bool>();
const string output_fn = conf["output"].as<string>();
vector<string> print_weights;
- boost::split(print_weights, conf["print_weights"].as<string>(), boost::is_any_of(" "));
+ boost::split(print_weights, conf["print_weights"].as<string>(),
+ boost::is_any_of(" "));
// setup decoder
register_feature_functions();
SetSilent(true);
- ReadFile f(conf["decoder_config"].as<string>());
+ ReadFile f(conf["decoder_conf"].as<string>());
Decoder decoder(f.stream());
-
- // setup decoder observer
ScoredKbest* observer = new ScoredKbest(k, new PerSentenceBleuScorer(N));
// weights
@@ -44,25 +43,29 @@ main(int argc, char** argv)
// input
string input_fn = conf["bitext"].as<string>();
ReadFile input(input_fn);
- vector<string> buf; // source strings (decoder takes only strings)
- vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references
- vector<vector<size_t> > buf_ls; // just once
+ vector<string> buf; // decoder only accepts strings as input
+ vector<vector<Ngrams> > buf_ngs; // compute ngrams and lengths of references
+ vector<vector<size_t> > buf_ls; // just once
size_t input_sz = 0;
+ cerr << _p4;
// output configuration
- cerr << _p5 << "dtrain" << endl << "Parameters:" << endl;
+ cerr << "dtrain" << endl << "Parameters:" << endl;
cerr << setw(25) << "k " << k << endl;
cerr << setw(25) << "N " << N << endl;
cerr << setw(25) << "T " << T << endl;
cerr << setw(25) << "learning rate " << eta << endl;
- cerr << setw(25) << "error margin " << error_margin << endl;
+ cerr << setw(25) << "margin " << margin << endl;
cerr << setw(25) << "l1 reg " << l1_reg << endl;
- cerr << setw(25) << "decoder conf " << "'" << conf["decoder_config"].as<string>() << "'" << endl;
+ cerr << setw(25) << "decoder conf " << "'"
+ << conf["decoder_conf"].as<string>() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
- if (conf.count("input_weights"))
- cerr << setw(25) << "weights in " << "'" << conf["input_weights"].as<string>() << "'" << endl;
- cerr << "(a dot per input)" << endl;
+ if (conf.count("input_weights")) {
+ cerr << setw(25) << "weights in " << "'"
+ << conf["input_weights"].as<string>() << "'" << endl;
+ }
+ cerr << "(1 dot per processed input)" << endl;
// meta
weight_t best=0., gold_prev=0.;
@@ -75,7 +78,7 @@ main(int argc, char** argv)
time_t start, end;
time(&start);
weight_t gold_sum=0., model_sum=0.;
- size_t i = 0, num_pairs = 0, feature_count = 0, list_sz = 0;
+ size_t i=0, num_up=0, feature_count=0, list_sz=0;
cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
@@ -97,9 +100,10 @@ main(int argc, char** argv)
buf_ls.push_back({});
for (auto s: parts) {
vector<WordID> r;
- vector<string> tok;
- boost::split(tok, s, boost::is_any_of(" "));
- RegisterAndConvert(tok, r);
+ vector<string> toks;
+ boost::split(toks, s, boost::is_any_of(" "));
+ for (auto tok: toks)
+ r.push_back(TD::Convert(tok));
buf_ngs.back().emplace_back(MakeNgrams(r, N));
buf_ls.back().push_back(r.size());
}
@@ -109,12 +113,16 @@ main(int argc, char** argv)
}
// produce some pretty output
- if (i == 0 || (i+1)%20==0)
- cerr << " ";
- cerr << ".";
+ if (next) {
+ if (i%20==0)
+ cerr << " ";
+ cerr << ".";
+ if ((i+1)%20==0)
+ cerr << " " << i+1 << endl;
+ } else {
+ cerr << " " << i << endl;
+ }
cerr.flush();
- if (!next)
- if (i%20 != 0) cerr << " " << i << endl;
// stop iterating
if (!next) break;
@@ -133,9 +141,8 @@ main(int argc, char** argv)
list_sz += observer->GetSize();
// get pairs and update
- vector<pair<ScoredHyp,ScoredHyp> > pairs;
SparseVector<weight_t> updates;
- num_pairs += CollectUpdates(samples, updates, error_margin);
+ num_up += CollectUpdates(samples, updates, margin);
SparseVector<weight_t> lambdas_copy;
if (l1_reg)
lambdas_copy = lambdas;
@@ -147,11 +154,12 @@ main(int argc, char** argv)
if (l1_reg) {
SparseVector<weight_t>::iterator it = lambdas.begin();
for (; it != lambdas.end(); ++it) {
- if (it->second == 0) continue;
- if (!lambdas_copy.get(it->first) // new or..
- || lambdas_copy.get(it->first)!=it->second) // updated feature
+ weight_t v = it->second;
+ if (!v)
+ continue;
+ if (!lambdas_copy.get(it->first) // new or..
+ || lambdas_copy.get(it->first)!=v) // updated feature
{
- weight_t v = it->second;
if (v > 0) {
it->second = max(0., v - l1_reg);
} else {
@@ -174,19 +182,19 @@ main(int argc, char** argv)
// stats
weight_t gold_avg = gold_sum/(weight_t)input_sz;
- size_t non_zero = (size_t)lambdas.num_nonzero();
- cerr << _p5 << _p << "WEIGHTS" << endl;
+ cerr << _p << "WEIGHTS" << endl;
for (auto name: print_weights)
cerr << setw(18) << name << " = " << lambdas.get(FD::Convert(name)) << endl;
cerr << " ---" << endl;
- cerr << _np << " 1best avg score: " << gold_avg;
- cerr << _p << " (" << gold_avg-gold_prev << ")" << endl;
- cerr << _np << " 1best avg model score: " << model_sum/(weight_t)input_sz << endl;
- cerr << " avg # pairs: ";
- cerr << _np << num_pairs/(float)input_sz << endl;
- cerr << " non-0 feature count: " << non_zero << endl;
- cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
+ cerr << _np << " 1best avg score: " << gold_avg*100;
+ cerr << _p << " (" << (gold_avg-gold_prev)*100 << ")" << endl;
+ cerr << " 1best avg model score: "
+ << model_sum/(weight_t)input_sz << endl;
+ cerr << " avg # updates: ";
+ cerr << _np << num_up/(float)input_sz << endl;
+ cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl;
cerr << " avg f count: " << feature_count/(float)list_sz << endl;
+ cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
if (gold_avg > best) {
best = gold_avg;
@@ -197,7 +205,7 @@ main(int argc, char** argv)
time (&end);
time_t time_diff = difftime(end, start);
total_time += time_diff;
- cerr << _p2 << _np << "(time " << time_diff/60. << " min, ";
+ cerr << "(time " << time_diff/60. << " min, ";
cerr << time_diff/input_sz << " s/S)" << endl;
if (t+1 != T) cerr << endl;
@@ -211,15 +219,16 @@ main(int argc, char** argv)
// final weights
if (average) {
- w_average /= (weight_t)T;
+ w_average /= T;
w_average.init_vector(decoder_weights);
} else if (!keep) {
lambdas.init_vector(decoder_weights);
}
- Weights::WriteToFile(output_fn, decoder_weights, true);
+ if (average || !keep)
+ Weights::WriteToFile(output_fn, decoder_weights, true);
- cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
- cerr << best_iteration+1 << " [GOLD = " << best << "]." << endl;
+ cerr << endl << "---" << endl << "Best iteration: ";
+ cerr << best_iteration+1 << " [GOLD = " << best*100 << "]." << endl;
cerr << "This took " << total_time/60. << " min." << endl;
return 0;
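
The reworked l1 block above applies the regularizer lazily: only weights that are new or changed relative to the pre-update copy are shrunk toward zero and clipped. A minimal standalone sketch of that rule, assuming a plain std::map in place of cdec's SparseVector (names here are hypothetical):

#include <algorithm>
#include <map>
#include <string>

using weight_t = double;
using WeightMap = std::map<std::string, weight_t>;

// Shrink every new or updated weight toward zero by l1_reg, clipping at
// zero, mirroring the loop over lambdas vs. lambdas_copy above.
void ClipL1(WeightMap& w, const WeightMap& before, weight_t l1_reg)
{
  for (auto& kv : w) {
    weight_t v = kv.second;
    if (!v) continue;
    auto it = before.find(kv.first);
    if (it == before.end() || it->second != v) { // new or updated feature
      if (v > 0) kv.second = std::max(0., v - l1_reg);
      else       kv.second = std::min(0., v + l1_reg);
    }
  }
}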
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 728b0698..8b1a00eb 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -27,17 +27,10 @@ struct ScoredHyp
vector<WordID> w;
SparseVector<weight_t> f;
weight_t model, gold;
- size_t rank;
+ size_t rank;
};
inline void
-RegisterAndConvert(const vector<string>& strs, vector<WordID>& ids)
-{
- for (auto s: strs)
- ids.push_back(TD::Convert(s));
-}
-
-inline void
PrintWordIDVec(vector<WordID>& v, ostream& os=cerr)
{
for (size_t i = 0; i < v.size(); i++) {
@@ -48,44 +41,43 @@ PrintWordIDVec(vector<WordID>& v, ostream& os=cerr)
inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
-inline ostream& _p2(ostream& out) { return out << setprecision(2); }
-inline ostream& _p5(ostream& out) { return out << setprecision(5); }
+inline ostream& _p4(ostream& out) { return out << setprecision(4); }
bool
dtrain_init(int argc, char** argv, po::variables_map* conf)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("bitext,b", po::value<string>(), "bitext")
- ("decoder_config,C", po::value<string>(), "configuration file for decoder")
- ("iterations,T", po::value<size_t>()->default_value(10), "number of iterations T (per shard)")
- ("k", po::value<size_t>()->default_value(100), "size of kbest list")
- ("learning_rate,l", po::value<weight_t>()->default_value(1.0), "learning rate")
- ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength")
- ("error_margin,m", po::value<weight_t>()->default_value(0.), "margin for margin perceptron")
- ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation")
- ("input_weights,w", po::value<string>(), "input weights file")
- ("average,a", po::value<bool>()->default_value(false), "output average weights")
- ("keep,K", po::value<bool>()->default_value(false), "output a weight file per iteration")
- ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+ ("bitext,b", po::value<string>(), "bitext")
+ ("decoder_conf,C", po::value<string>(), "configuration file for decoder")
+ ("iterations,T", po::value<size_t>()->default_value(10), "number of iterations T (per shard)")
+ ("k", po::value<size_t>()->default_value(100), "size of kbest list")
+ ("learning_rate,l", po::value<weight_t>()->default_value(1.0), "learning rate")
+ ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength")
+    ("margin,m", po::value<weight_t>()->default_value(0.), "margin for margin perceptron")
+ ("N", po::value<size_t>()->default_value(4), "N for BLEU approximation")
+ ("input_weights,w", po::value<string>(), "input weights file")
+ ("average,a", po::value<bool>()->default_value(false), "output average weights")
+ ("keep,K", po::value<bool>()->default_value(false), "output a weight file per iteration")
+ ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
("print_weights,P", po::value<string>()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"),
- "list of weights to print after each iteration");
+ "list of weights to print after each iteration");
po::options_description cl("Command Line Options");
cl.add_options()
- ("config,c", po::value<string>(), "dtrain config file");
+ ("conf,c", po::value<string>(), "dtrain configuration file");
cl.add(ini);
po::store(parse_command_line(argc, argv, cl), *conf);
- if (conf->count("config")) {
- ifstream f((*conf)["config"].as<string>().c_str());
+ if (conf->count("conf")) {
+ ifstream f((*conf)["conf"].as<string>().c_str());
po::store(po::parse_config_file(f, ini), *conf);
}
po::notify(*conf);
- if (!conf->count("decoder_config")) {
+ if (!conf->count("decoder_conf")) {
cerr << "Missing decoder configuration." << endl;
return false;
}
if (!conf->count("bitext")) {
- cerr << "No training data given." << endl;
+ cerr << "No input given." << endl;
return false;
}
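
The option handling above follows the stock Boost.Program_options pattern: one options_description for the ini file, one for the command line, with the former added to the latter so every ini key is also accepted as a flag; values stored first (from the command line) take precedence. A minimal sketch of that pattern with a reduced option set:

#include <boost/program_options.hpp>
#include <fstream>
#include <string>

namespace po = boost::program_options;

int main(int argc, char** argv)
{
  po::options_description ini("Configuration File Options");
  ini.add_options()
    ("decoder_conf,C", po::value<std::string>(), "decoder configuration");
  po::options_description cl("Command Line Options");
  cl.add_options()
    ("conf,c", po::value<std::string>(), "configuration file");
  cl.add(ini); // ini options work on the command line, too

  po::variables_map conf;
  po::store(po::parse_command_line(argc, argv, cl), conf);
  if (conf.count("conf")) { // command-line values win over file values
    std::ifstream f(conf["conf"].as<std::string>().c_str());
    po::store(po::parse_config_file(f, ini), conf);
  }
  po::notify(conf);
  return conf.count("decoder_conf") ? 0 : 1;
}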
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini
index 3330dd71..36368d44 100644
--- a/training/dtrain/examples/standard/cdec.ini
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -21,7 +21,7 @@ feature_function=RuleIdentityFeatures
feature_function=RuleSourceBigramFeatures
feature_function=RuleTargetBigramFeatures
feature_function=RuleShape
-feature_function=LexicalFeatures 1 1 1
+#feature_function=LexicalFeatures 1 1 1
#feature_function=SourceSpanSizeFeatures
#feature_function=SourceWordPenalty
#feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
index f2698007..610d41d7 100644
--- a/training/dtrain/examples/standard/dtrain.ini
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -1,6 +1,6 @@
-bitext=./nc-wmt11.gz # input bitext
+bitext=nc-wmt11.100.gz # input bitext
output=- # a weights file (add .gz for gzip compression) or STDOUT '-'
-decoder_config=./cdec.ini # config for cdec
+decoder_conf=./cdec.ini # config for cdec
iterations=3 # run over input 3 times
k=100 # use 100best lists
N=4 # optimize (approx.) BLEU4
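With the renamed keys the example is run as before, e.g. ./dtrain -c dtrain.ini (a usage sketch; the path to the dtrain binary depends on the build layout).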
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb
index a1fcd1a3..62c80489 100755
--- a/training/dtrain/lplp.rb
+++ b/training/dtrain/lplp.rb
@@ -1,4 +1,4 @@
-# lplp.rb
+#!/usr/bin/env ruby
# norms
def l0(feature_column, n)
@@ -19,7 +19,7 @@ end
# stats
def median(feature_column, n)
- return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})\
+ return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0})
.sort[feature_column.size/2]
end
@@ -85,7 +85,6 @@ def _test()
end
#_test()
-
def usage()
puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>"
puts " l0...: norms for selection"
@@ -95,7 +94,7 @@ def usage()
exit 1
end
-if ARGV.size < 4 then usage end
+usage if ARGV.size<4
norm_fun = method(ARGV[0].to_sym)
type = ARGV[1]
x = ARGV[2].to_f
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 29f3e609..563145b6 100755
--- a/training/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
@@ -4,19 +4,19 @@ require 'trollop'
require 'zipf'
conf = Trollop::options do
- opt :config, "dtrain configuration", :type => :string
- opt :input, "input as bitext (f ||| e)", :type => :string
- opt :epochs, "number of epochs", :type => :int, :default => 10
- opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000"
- opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z'
- opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y'
- opt :shards, "number of shards", :type => :int
- opt :weights, "input weights for first epoch", :type => :string, :default => ''
- opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o'
- opt :processes_at_once, "jobs to run at oce", :type => :int, :default => 9999
- opt :qsub, "use qsub", :type => :bool, :default => false
- opt :qsub_args, "extra args for qsub", :type => :string, :default => "-l h_vmem=5G"
- opt :dtrain_binary, "path to dtrain binary", :type => :string
+ opt :conf, "dtrain configuration", :type => :string, :short => '-c'
+ opt :input, "input as bitext (f ||| e)", :type => :string, :short => '-i'
+ opt :epochs, "number of epochs", :type => :int, :default => 10
+ opt :randomize, "randomize shards once", :type => :bool, :default => false, :short => '-z'
+ opt :reshard, "randomize after each epoch", :type => :bool, :default => false, :short => '-y'
+  opt :shards, "number of shards", :type => :int, :short => '-s'
+ opt :weights, "input weights for first epoch", :type => :string, :default => '', :short => '-w'
+ opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000", :short => '-l'
+ opt :per_shard_decoder_configs, "give custom decoder config per shard", :type => :string, :short => '-o'
+  opt :processes_at_once, "jobs to run at once", :type => :int, :default => 9999, :short => '-p'
+ opt :qsub, "use qsub", :type => :bool, :default => false, :short => '-q'
+  opt :qsub_args, "extra args for qsub", :type => :string, :default => "-l h_vmem=5G", :short => '-r'
+ opt :dtrain_binary, "path to dtrain binary", :type => :string, :short => '-d'
end
dtrain_dir = File.expand_path File.dirname(__FILE__)
@@ -55,16 +55,16 @@ def make_shards input, num_shards, epoch, rand
index.shuffle! if rand
shard_sz = (lc / num_shards.to_f).round 0
leftover = lc - (num_shards*shard_sz)
- leftover = 0 if leftover < 0
+    leftover = [0, leftover].max
in_f = File.new input, 'r'
in_lines = in_f.readlines
shard_in_files = []
in_fns = []
- new_num_shards = 0
+ real_num_shards = 0
0.upto(num_shards-1) { |shard|
break if index.size==0
- new_num_shards += 1
- in_fn = "work/shard.#{shard}.#{epoch}.in"
+ real_num_shards += 1
+ in_fn = "work/shard.#{shard}.#{epoch}"
shard_in = File.new in_fn, 'w+'
in_fns << in_fn
0.upto(shard_sz-1) { |i|
@@ -81,12 +81,12 @@ def make_shards input, num_shards, epoch, rand
end
shard_in_files.each do |f| f.close end
in_f.close
- return in_fns, new_num_shards
+ return in_fns, real_num_shards
end
input_files = []
if predefined_shards
- input_files = File.new(input).readlines.map {|i| i.strip }
+ input_files = File.new(input).readlines.map { |i| i.strip }
if per_shard_decoder_configs
decoder_configs = ReadFile.readlines_strip(conf[:per_shard_decoder_configs]
).map { |i| i.strip }
@@ -100,15 +100,14 @@ end
puts "epoch #{epoch+1}"
pids = []
input_weights = ''
- if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end
+ input_weights = "--input_weights work/weights.#{epoch-1}" if epoch>0
weights_files = []
shard = 0
remaining_shards = num_shards
while remaining_shards > 0
shards_at_once.times {
break if remaining_shards==0
- qsub_str_start = qsub_str_end = ''
- local_end = ''
+ qsub_str_start = qsub_str_end = local_end = ''
if use_qsub
qsub_str_start = "qsub #{conf[:qsub_args]} -cwd -sync y -b y -j y\
-o work/out.#{shard}.#{epoch}\
@@ -123,7 +122,7 @@ end
else
cdec_conf = ""
end
- if first_input_weights!='' && epoch == 0
+ if first_input_weights != '' && epoch == 0
input_weights = "--input_weights #{first_input_weights}"
end
pids << Kernel.fork {
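
The sharding logic in make_shards above rounds lc/num_shards, keeps track of the real number of shards actually written, and drops the trailing .in from shard file names. A standalone sketch of the same split, in C++ for consistency with the rest of this commit (leftover handling simplified: the last shard absorbs the remainder):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Split lc line indices into at most num_shards shards of roughly equal
// size; returns the per-shard index lists actually produced.
std::vector<std::vector<size_t>>
MakeShards(size_t lc, size_t num_shards, bool shuffle)
{
  std::vector<size_t> index(lc);
  std::iota(index.begin(), index.end(), 0);
  if (shuffle) {
    std::mt19937 g(42); // fixed seed; reshuffling per epoch is the caller's choice
    std::shuffle(index.begin(), index.end(), g);
  }
  size_t shard_sz = (size_t)std::round((double)lc / num_shards);
  if (shard_sz == 0) shard_sz = 1;
  std::vector<std::vector<size_t>> shards;
  size_t pos = 0;
  for (size_t s = 0; s < num_shards && pos < lc; ++s) {
    size_t end = (s + 1 == num_shards) ? lc : std::min(lc, pos + shard_sz);
    shards.emplace_back(index.begin() + pos, index.begin() + end);
    pos = end;
  }
  return shards;
}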
diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h
index c3586c58..03cc82c3 100644
--- a/training/dtrain/sample.h
+++ b/training/dtrain/sample.h
@@ -3,20 +3,19 @@
#include "kbest.h"
+#include "score.h"
+
namespace dtrain
{
-
struct ScoredKbest : public DecoderObserver
{
const size_t k_;
- vector<ScoredHyp> s_;
- size_t src_len_;
+ size_t feature_count_, effective_sz_;
+ vector<ScoredHyp> samples_;
PerSentenceBleuScorer* scorer_;
- vector<vector<WordID> >* refs_;
vector<Ngrams>* ref_ngs_;
vector<size_t>* ref_ls_;
- size_t f_count_, sz_;
ScoredKbest(const size_t k, PerSentenceBleuScorer* scorer) :
k_(k), scorer_(scorer) {}
@@ -24,14 +23,13 @@ struct ScoredKbest : public DecoderObserver
virtual void
NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
{
- src_len_ = smeta.GetSourceLength();
- s_.clear(); sz_ = f_count_ = 0;
+ samples_.clear(); effective_sz_ = feature_count_ = 0;
KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
KBest::FilterUnique, prob_t, EdgeProb> kbest(*hg, k_);
for (size_t i = 0; i < k_; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
- prob_t, EdgeProb>::Derivation* d =
- kbest.LazyKthBest(hg->nodes_.size() - 1, i);
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+ KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
+ kbest.LazyKthBest(hg->nodes_.size() - 1, i);
if (!d) break;
ScoredHyp h;
h.w = d->yield;
@@ -39,23 +37,22 @@ struct ScoredKbest : public DecoderObserver
h.model = log(d->score);
h.rank = i;
h.gold = scorer_->Score(h.w, *ref_ngs_, *ref_ls_);
- s_.push_back(h);
- sz_++;
- f_count_ += h.f.size();
+ samples_.push_back(h);
+ effective_sz_++;
+ feature_count_ += h.f.size();
}
}
- vector<ScoredHyp>* GetSamples() { return &s_; }
+ vector<ScoredHyp>* GetSamples() { return &samples_; }
inline void SetReference(vector<Ngrams>& ngs, vector<size_t>& ls)
{
ref_ngs_ = &ngs;
ref_ls_ = &ls;
}
- inline size_t GetFeatureCount() { return f_count_; }
- inline size_t GetSize() { return sz_; }
+ inline size_t GetFeatureCount() { return feature_count_; }
+ inline size_t GetSize() { return effective_sz_; }
};
-
} // namespace
#endif
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index d51aef82..06dbc5a4 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -34,15 +34,6 @@ struct NgramCounts
}
inline void
- operator*=(const weight_t rhs)
- {
- for (size_t i = 0; i < N_; i++) {
- this->clipped_[i] *= rhs;
- this->sum_[i] *= rhs;
- }
- }
-
- inline void
Add(const size_t count, const size_t ref_count, const size_t i)
{
assert(i < N_);
@@ -64,15 +55,7 @@ struct NgramCounts
}
inline void
- Print(ostream& os=cerr)
- {
- for (size_t i = 0; i < N_; i++) {
- os << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
- os << i+1 << "grams:\t\t\t" << sum_[i] << endl;
- }
- }
-
- inline void Resize(size_t N)
+ Resize(size_t N)
{
if (N == N_) return;
else if (N > N_) {
@@ -158,16 +141,13 @@ struct PerSentenceBleuScorer
return exp(1 - (weight_t)rl/hl);
}
- weight_t
- Score(const vector<WordID>& hyp,
- const vector<Ngrams>& ref_ngs,
- const vector<size_t>& ref_ls)
+ inline size_t
+ BestMatchLength(const size_t hl,
+ const vector<size_t>& ref_ls)
{
- size_t hl = hyp.size(), rl = 0;
- if (hl == 0) return 0.;
- // best match reference length
+ size_t m;
if (ref_ls.size() == 1) {
- rl = ref_ls.front();
+ m = ref_ls.front();
} else {
size_t i = 0, best_idx = 0;
size_t best = numeric_limits<size_t>::max();
@@ -179,8 +159,20 @@ struct PerSentenceBleuScorer
}
i += 1;
}
- rl = ref_ls[best_idx];
+ m = ref_ls[best_idx];
}
+
+ return m;
+ }
+
+ weight_t
+ Score(const vector<WordID>& hyp,
+ const vector<Ngrams>& ref_ngs,
+ const vector<size_t>& ref_ls)
+ {
+ size_t hl = hyp.size(), rl = 0;
+ if (hl == 0) return 0.;
+ rl = BestMatchLength(hl, ref_ls);
if (rl == 0) return 0.;
NgramCounts counts = MakeNgramCounts(hyp, ref_ngs, N_);
size_t M = N_;
@@ -192,8 +184,9 @@ struct PerSentenceBleuScorer
weight_t sum = 0, add = 0;
for (size_t i = 0; i < M; i++) {
if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
- if (i == 1) add = 1;
- sum += v[i] * log(((weight_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
+ if (i > 0) add = 1;
+ sum += v[i] * log(((weight_t)counts.clipped_[i] + add)
+ / ((counts.sum_[i] + add)));
}
return BrevityPenalty(hl, rl+1) * exp(sum);
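
The smoothing change above (i == 1 becoming i > 0) extends add-one smoothing from bigrams only to every n-gram order above unigrams. A standalone sketch of the resulting per-sentence score, assuming clipped/total counts per order are already computed (this helper is not part of score.h):

#include <cmath>
#include <cstddef>
#include <vector>

// Smoothed per-sentence BLEU: uniform weights, add-one smoothing for all
// orders > 1, zero if there is no unigram match.
double SmoothedBleu(const std::vector<size_t>& clipped,
                    const std::vector<size_t>& total,
                    size_t hyp_len, size_t ref_len)
{
  const size_t M = clipped.size();
  if (hyp_len == 0 || M == 0 || total[0] == 0 || clipped[0] == 0) return 0.;
  double sum = 0., add = 0.;
  for (size_t i = 0; i < M; ++i) {
    if (i > 0) add = 1.; // smooth every order above unigrams
    sum += (1. / M) * std::log((clipped[i] + add) / (total[i] + add));
  }
  const double bp = hyp_len < ref_len
                  ? std::exp(1. - (double)ref_len / hyp_len) : 1.;
  return bp * std::exp(sum);
}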
diff --git a/training/dtrain/update.h b/training/dtrain/update.h
index 57671ce1..72d369c4 100644
--- a/training/dtrain/update.h
+++ b/training/dtrain/update.h
@@ -5,7 +5,7 @@ namespace dtrain
{
bool
-CmpHypsByGold(ScoredHyp a, ScoredHyp b)
+_cmp(ScoredHyp a, ScoredHyp b)
{
return a.gold > b.gold;
}
@@ -19,44 +19,42 @@ CmpHypsByGold(ScoredHyp a, ScoredHyp b)
inline size_t
CollectUpdates(vector<ScoredHyp>* s,
SparseVector<weight_t>& updates,
- float margin=1.0)
+ float margin=0.)
{
- size_t num_pairs = 0;
+ size_t num_up = 0;
size_t sz = s->size();
if (sz < 2) return 0;
- sort(s->begin(), s->end(), CmpHypsByGold);
+ sort(s->begin(), s->end(), _cmp);
size_t sep = round(sz*0.1);
size_t sep_hi = sep;
if (sz > 4) {
- while
- (sep_hi < sz && (*s)[sep_hi-1].gold == (*s)[sep_hi].gold) ++sep_hi;
+ while (sep_hi<sz && (*s)[sep_hi-1].gold==(*s)[sep_hi].gold)
+ ++sep_hi;
}
else sep_hi = 1;
for (size_t i = 0; i < sep_hi; i++) {
for (size_t j = sep_hi; j < sz; j++) {
- if (((*s)[i].model-(*s)[j].model) > margin)
+ if (((*s)[i].model-(*s)[j].model) > margin
+ || (*s)[i].gold == (*s)[j].gold)
continue;
- if ((*s)[i].gold != (*s)[j].gold) {
- updates += (*s)[i].f-(*s)[j].f;
- num_pairs++;
- }
+ updates += (*s)[i].f-(*s)[j].f;
+ num_up++;
}
}
size_t sep_lo = sz-sep;
- while (sep_lo > 0 && (*s)[sep_lo-1].gold == (*s)[sep_lo].gold)
+ while (sep_lo>=sep_hi && (*s)[sep_lo].gold==(*s)[sep_lo+1].gold)
--sep_lo;
for (size_t i = sep_hi; i < sep_lo; i++) {
for (size_t j = sep_lo; j < sz; j++) {
- if (((*s)[i].model-(*s)[j].model) > margin)
+ if (((*s)[i].model-(*s)[j].model) > margin
+ || (*s)[i].gold == (*s)[j].gold)
continue;
- if ((*s)[i].gold != (*s)[j].gold) {
- updates += (*s)[i].f-(*s)[j].f;
- num_pairs++;
- }
+ updates += (*s)[i].f-(*s)[j].f;
+ num_up++;
}
}
- return num_pairs;
+ return num_up;
}
} // namespace
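
Both loops in CollectUpdates now share a single early-continue: a pair is skipped if the model already separates it by more than margin, or if the two hypotheses tie on gold score; every remaining pair contributes one update. The filter in isolation, as a hypothetical standalone predicate:

// hi is ranked better than lo by gold score (the list is sorted by _cmp).
struct Hyp { double model, gold; };

inline bool ProducesUpdate(const Hyp& hi, const Hyp& lo, double margin)
{
  if (hi.model - lo.model > margin) return false; // model already separates
  if (hi.gold == lo.gold) return false;           // no preference to learn
  return true;                                    // update += f(hi) - f(lo)
}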