11 files changed, 222 insertions, 19 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index f39d161e..64fef489 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
 dtrain_SOURCES = dtrain.cc score.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
 
diff --git a/dtrain/README.md b/dtrain/README.md
index 9580df6d..350c7423 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -41,6 +41,8 @@ DTRAIN_LOCAL.
 
 Next
 ----
++ approx. BLEU?
++ turn off inclusion
 + (dtrain|decoder) more meta-parameters testing
 + feature selection directly in dtrain
 + feature template: target side rule ngrams
@@ -48,6 +50,13 @@ Next
 + make svm doable; no subgradient?
 + reranking while sgd?
 + try PRO, mira emulations
++ sampling (MBR)
++ forest (on train)?
++ best-BLEU translation (Sokolov)?
++ entire regularization path
++ resharding [n-fold cross-validation]
++ bigger LM, more features (target-side n-grams, word alignments, etc.)
++ merge kbest lists
 
 Legal
 -----
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 8b1fc953..717d47a2 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -33,6 +33,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("fselect",           po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch")
     ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                                "discount for approx. BLEU")
     ("scale_bleu_diff",   po::value<bool>()->zero_tokens(),                   "learning rate <- bleu diff of a misranked pair")
+    ("loss_margin",       po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
 #ifdef DTRAIN_LOCAL
     ("refs,r",            po::value<string>(),                                                      "references in local mode")
 #endif
@@ -134,6 +135,8 @@ main(int argc, char** argv)
   const string select_weights = cfg["select_weights"].as<string>();
   const float hi_lo = cfg["hi_lo"].as<float>();
   const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+  weight_t loss_margin = cfg["loss_margin"].as<weight_t>(); // correctly ranked pairs whose model scores are closer than this still trigger an update
+  if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max(); // any setting above 9998 is widened to the largest float
   bool scale_bleu_diff = false;
   if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
   bool average = false;
@@ -160,6 +163,8 @@ main(int argc, char** argv)
     scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
   } else if (scorer_str == "smooth_bleu") {
     scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+  } else if (scorer_str == "smooth_single_bleu") {
+    scorer = dynamic_cast<SmoothSingleBleuScorer*>(new SmoothSingleBleuScorer);
   } else if (scorer_str == "approx_bleu") {
     scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
   } else {
@@ -220,7 +225,7 @@ main(int argc, char** argv)
   grammar_buf_out.open(grammar_buf_fn.c_str());
 #endif
 
-  unsigned in_sz = UINT_MAX; // input index, input size
+  unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
   vector<pair<score_t, score_t> > all_scores;
   score_t max_score = 0.;
   unsigned best_it = 0;
@@ -242,6 +247,7 @@ main(int argc, char** argv)
     if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
     else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
     cerr << setw(25) << "gamma " << gamma << endl;
+    cerr << setw(25) << "loss margin " << loss_margin << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     if (pair_sampling == "XYX")
       cerr << setw(25) << "hi lo " << hi_lo << endl;
@@ -424,12 +430,18 @@ main(int argc, char** argv)
 
       for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
            it != pairs.end(); it++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+        bool rank_error = true; // pair filtering already did this for us
+        rank_errors++;
+        score_t margin = std::numeric_limits<float>::max(); // never below loss_margin, so only rank_error drives the update
+#else
         bool rank_error = it->first.model <= it->second.model;
         if (rank_error) rank_errors++;
-        score_t margin = fabs(it->first.model - it->second.model);
-        if (!rank_error && margin < 1) margin_violations++;
+        score_t margin = fabs(it->first.model - it->second.model); // distance between the two model scores; must be the signed difference, scores can have opposite signs
+        if (!rank_error && margin < loss_margin) margin_violations++; // correctly ranked, but closer than the configured margin
+#endif
         if (scale_bleu_diff) eta = it->first.score - it->second.score;
-        if (rank_error || (gamma && margin<1)) {
+        if (rank_error || margin < loss_margin) {
           SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
           lambdas.plus_eq_v_times_s(diff_vec, eta);
           if (gamma)
@@ -534,8 +546,10 @@ main(int argc, char** argv)
     cerr << _np << npairs/(float)in_sz << endl;
     cerr << "        avg # rank err: ";
     cerr << rank_errors/(float)in_sz << endl;
+#ifndef DTRAIN_FASTER_PERCEPTRON
     cerr << "     avg # margin viol: ";
     cerr << margin_violations/(float)in_sz << endl;
+#endif
     cerr << "    non0 feature count: " <<  nonz << endl;
     cerr << "           avg list sz: " << list_sz/(float)in_sz << endl;
     cerr << "           avg f count: " << f_count/(float)list_sz << endl;
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 94d149ce..d8dc14b6 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -1,6 +1,14 @@
 #ifndef _DTRAIN_H_
 #define _DTRAIN_H_
 
+#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
+                                 // DO NOT USE WITH SVM!
+#undef DTRAIN_LOCAL
+#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
+#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+#define DTRAIN_SCALE 100000
+
+
 #include <iomanip>
 #include <climits>
 #include <string.h>
@@ -13,11 +21,7 @@
 
 #include "filelib.h"
 
-#undef DTRAIN_LOCAL
 
-#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
-#define DTRAIN_SCALE 100000
 
 using namespace std;
 using namespace dtrain;
@@ -32,7 +36,7 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
 
 inline string gettmpf(const string path, const string infix)
 {
-  char fn[1024];
+  char fn[path.size() + infix.size() + 8]; // NOTE(review): runtime-sized array is a compiler extension in C++; the 8 spare bytes must cover "/", anything appended after infix, and the terminating NUL -- confirm against the rest of this function
   strcpy(fn, path.c_str());
   strcat(fn, "/");
   strcat(fn, infix.c_str());
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index f52fb649..bc2f56cd 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -8,6 +8,11 @@
 namespace dtrain
 {
 
+inline bool // sort() comparator: descending model score; inline because the definition lives in a header
+cmp_hyp_by_model_d(const ScoredHyp& a, const ScoredHyp& b) // by const reference: no hypothesis copies per comparison
+{
+  return a.model > b.model; // 'greater than' puts the highest model score first
+}
 
 struct KSampler : public HypSampler
 {
@@ -44,6 +49,8 @@ struct KSampler : public HypSampler
       sz_++;
       f_count_ += h.f.size();
     }
+    sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
+    for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
   }
 };
 
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index bac132c6..32006a41 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -46,11 +46,18 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
 inline void
 partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
 {
-  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sz = s->size();
+  if (sz < 2) return;
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sep = round(sz*hi_lo);
-  for (unsigned i = 0; i < sep; i++) {
-    for (unsigned j = sep; j < sz; j++) {
+  unsigned sep_hi = sep; // end (exclusive) of the 'hi' partition; widened below so equal scores are not split across it
+  if (sz > 4) while (sep_hi > 0 && sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; // sep_hi > 0: sep may round to 0, never index sep_hi-1 then
+  else sep_hi = 1; // tiny lists: best hypothesis against all others
+  for (unsigned i = 0; i < sep_hi; i++) {
+    for (unsigned j = sep_hi; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      if ((*s)[i].model <= (*s)[j].model) {
+#endif
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -58,10 +65,18 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      }
+#endif
     }
   }
-  for (unsigned i = sep; i < sz-sep; i++) {
-    for (unsigned j = sz-sep; j < sz; j++) {
+  unsigned sep_lo = sz-sep; // index where the 'lo' partition starts; moved down below so equal scores are not split across it
+  while (sep_lo > 0 && sep_lo < sz && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; // sep_lo < sz: with sep == 0 there is no element at sep_lo
+  for (unsigned i = sep_hi; i < sep_lo; i++) { // middle partition ...
+    for (unsigned j = sep_lo; j < sz; j++) { // ... against the low partition (sep_lo is already an index)
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      if ((*s)[i].model <= (*s)[j].model) {
+#endif
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -69,6 +84,9 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      }
+#endif
     }
   }
 }
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 7b1f6be4..b331dc4f 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -103,7 +103,27 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
         i_bleu[j] += (1/((score_t)j+1)) * i_ng;
       }
     }
-    sum += exp(i_bleu[i])/(pow(2.0, static_cast<double>(N_-i)));
+    sum += exp(i_bleu[i])/(pow(2.0, N_-i));
+  }
+  return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+// Variant of smooth_bleu: order i adds only its own clipped precision, weighted by 1/2^(N_-i).
+score_t
+SmoothSingleBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  // an empty hypothesis or an empty reference scores zero
+  unsigned hyp_len = hyp.size();
+  unsigned ref_len = ref.size();
+  if (hyp_len == 0 || ref_len == 0) return 0.;
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  const unsigned M = (ref_len < N_) ? ref_len : N_; // orders considered: capped by the reference length
+  score_t sum = 0.;
+  // accumulate from the lowest order up, stopping at the first order without counts
+  for (unsigned i = 0; i < M; i++) {
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+    sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-i);
   }
   return brevity_penalty(hyp_len, ref_len) * sum;
 }
diff --git a/dtrain/score.h b/dtrain/score.h
index eb8ad912..d4fba22c 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -128,6 +128,11 @@ struct SmoothBleuScorer : public LocalScorer
   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
+struct SmoothSingleBleuScorer : public LocalScorer // scorer selected by the name "smooth_single_bleu"
+{
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
 struct ApproxBleuScorer : public BleuScorer
 {
   NgramCounts glob_onebest_counts_;
diff --git a/dtrain/test/example/README b/dtrain/test/example/README
index b3ea5f06..6937b11b 100644
--- a/dtrain/test/example/README
+++ b/dtrain/test/example/README
@@ -1,8 +1,8 @@
 Small example of input format for distributed training.
 Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini .
 
-For this to work, disable '#define DTRAIN_LOCAL' from dtrain.h
+For this to work, undef 'DTRAIN_LOCAL' in dtrain.h
 and recompile.
 
-Data is here: http://simianer.de/dtrain
+Data is here: http://simianer.de/#dtrain
 
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index f87ee9cf..c8ac7c3f 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec
 # weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
 tmp=/tmp
-stop_after=10 # stop epoch after 20 inputs
+stop_after=10 # stop epoch after 10 inputs
 
 # interesting stuff
 epochs=3                # run over input 3 times
@@ -19,3 +19,4 @@ filter=uniq             # only unique entries in kbest (surface form)
 pair_sampling=XYX
 hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
 pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
+loss_margin=0
diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output
new file mode 100644
index 00000000..25d2c069
--- /dev/null
+++ b/dtrain/test/example/expected-output
@@ -0,0 +1,125 @@
+                cdec cfg 'test/example/cdec.ini'
+feature: WordPenalty (no config parameters)
+State is 0 bytes for feature WordPenalty
+feature: KLanguageModel (with config parameters 'test/example/nc-wmt11.en.srilm.gz')
+Loading the LM will be faster if you build a binary file.
+Reading test/example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Loaded 5-gram KLM from test/example/nc-wmt11.en.srilm.gz (MapSize=49581)
+State is 98 bytes for feature KLanguageModel test/example/nc-wmt11.en.srilm.gz
+feature: RuleIdentityFeatures (no config parameters)
+State is 0 bytes for feature RuleIdentityFeatures
+feature: RuleNgramFeatures (no config parameters)
+State is 0 bytes for feature RuleNgramFeatures
+feature: RuleShape (no config parameters)
+  Example feature: Shape_S00000_T00000
+State is 0 bytes for feature RuleShape
+Seeding random number sequence to 1072059181
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 3
+                 scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 0.0001
+                   gamma 0
+             loss margin 0
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'VOID'
+                  l1 reg 0 'none'
+                cdec cfg 'test/example/cdec.ini'
+                   input 'test/example/nc-wmt11.1k.gz'
+                  output '-'
+              stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 3.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+              Glue = -0.0293
+       WordPenalty = +0.049075
+     LanguageModel = +0.24345
+ LanguageModel_OOV = -0.2029
+     PhraseModel_0 = +0.0084102
+     PhraseModel_1 = +0.021729
+     PhraseModel_2 = +0.014922
+     PhraseModel_3 = +0.104
+     PhraseModel_4 = -0.14308
+     PhraseModel_5 = +0.0247
+     PhraseModel_6 = -0.012
+       PassThrough = -0.2161
+        ---
+       1best avg score: 0.16872 (+0.16872)
+ 1best avg model score: -1.8276 (-1.8276)
+           avg # pairs: 1121.1
+        avg # rank err: 555.6
+     avg # margin viol: 0
+    non0 feature count: 277
+           avg list sz: 77.2
+           avg f count: 90.96
+(time 0.1 min, 0.6 s/S)
+
+Iteration #2 of 3.
+ . 10
+WEIGHTS
+              Glue = -0.3526
+       WordPenalty = +0.067576
+     LanguageModel = +1.155
+ LanguageModel_OOV = -0.2728
+     PhraseModel_0 = -0.025529
+     PhraseModel_1 = +0.095869
+     PhraseModel_2 = +0.094567
+     PhraseModel_3 = +0.12482
+     PhraseModel_4 = -0.36533
+     PhraseModel_5 = +0.1068
+     PhraseModel_6 = -0.1517
+       PassThrough = -0.286
+        ---
+       1best avg score: 0.18394 (+0.015221)
+ 1best avg model score: 3.205 (+5.0326)
+           avg # pairs: 1168.3
+        avg # rank err: 594.8
+     avg # margin viol: 0
+    non0 feature count: 543
+           avg list sz: 77.5
+           avg f count: 85.916
+(time 0.083 min, 0.5 s/S)
+
+Iteration #3 of 3.
+ . 10
+WEIGHTS
+              Glue = -0.392
+       WordPenalty = +0.071963
+     LanguageModel = +0.81266
+ LanguageModel_OOV = -0.4177
+     PhraseModel_0 = -0.2649
+     PhraseModel_1 = -0.17931
+     PhraseModel_2 = +0.038261
+     PhraseModel_3 = +0.20261
+     PhraseModel_4 = -0.42621
+     PhraseModel_5 = +0.3198
+     PhraseModel_6 = -0.1437
+       PassThrough = -0.4309
+        ---
+       1best avg score: 0.2962 (+0.11225)
+ 1best avg model score: -36.274 (-39.479)
+           avg # pairs: 1109.6
+        avg # rank err: 515.9
+     avg # margin viol: 0
+    non0 feature count: 741
+           avg list sz: 77
+           avg f count: 88.982
+(time 0.083 min, 0.5 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 3 [SCORE 'stupid_bleu'=0.2962].
+This took 0.26667 min.