Diffstat (limited to 'dtrain')
 dtrain/Makefile.am             |  2
 dtrain/README.md               |  3
 dtrain/dtrain.cc               | 11
 dtrain/hstreaming/dtrain.ini   |  2
 dtrain/kbestget.h              | 12
 dtrain/ksampler.h              |  6
 dtrain/pairsampling.h          |  2
 dtrain/score.cc                | 58
 dtrain/score.h                 | 52
 dtrain/test/example/dtrain.ini |  6
 10 files changed, 87 insertions, 67 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 64fef489..f39d161e 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
 dtrain_SOURCES = dtrain.cc score.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README.md b/dtrain/README.md
index 2a24ec22..92d6ba0d 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -3,7 +3,8 @@ which is able to train the weights of very many (sparse) features.
 It was used here:
   "Joint Feature Selection in Distributed Stochastic
    Learning for Large-Scale Discriminative Training in
-   SMT" Simianer, Riezler, Dyer; ACL 2012
+   SMT"
+(Simianer, Riezler, Dyer; ACL 2012)
 
 
 Building
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index ea5b8835..3dee10f2 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -32,7 +32,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("l1_reg_strength",   po::value<weight_t>(),                                                  "l1 regularization strength")
     ("inc_correct",       po::value<bool>()->zero_tokens(),                      "include correctly ranked pairs into updates")
     ("fselect",           po::value<weight_t>()->default_value(-1),   "TODO select top x percent of features after each epoch")
-    ("approx_bleu_scale", po::value<score_t>()->default_value(0.9),                                 "scaling for approx. BLEU")
+    ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                                "discount for approx. BLEU")
 #ifdef DTRAIN_LOCAL
     ("refs,r",            po::value<string>(),                                                      "references in local mode")
 #endif
@@ -136,6 +136,7 @@ main(int argc, char** argv)
   const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();
   const string select_weights = cfg["select_weights"].as<string>();
   const float hi_lo = cfg["hi_lo"].as<float>();
+  const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
   bool average = false;
   if (select_weights == "avg")
     average = true;
@@ -161,7 +162,7 @@ main(int argc, char** argv)
   } else if (scorer_str == "smooth_bleu") {
     scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
   } else if (scorer_str == "approx_bleu") {
-    scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N));
+    scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
   } else {
     cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
     exit(1);
@@ -235,6 +236,8 @@ main(int argc, char** argv)
     cerr << setw(25) << "N " << N << endl;
     cerr << setw(25) << "T " << T << endl;
     cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
+    if (scorer_str == "approx_bleu")
+      cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl;
     cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
     if (sample_from == "kbest")
       cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
@@ -242,7 +245,7 @@ main(int argc, char** argv)
     cerr << setw(25) << "gamma " << gamma << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     if (pair_sampling == "XYX")
-      cerr << setw(25) << "hi lo " << "'" << hi_lo << "'" << endl;
+      cerr << setw(25) << "hi lo " << hi_lo << endl;
     cerr << setw(25) << "pair threshold " << pair_threshold << endl;
     cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
     if (cfg.count("l1_reg"))
@@ -261,7 +264,7 @@ main(int argc, char** argv)
       cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
     if (cfg.count("stop-after"))
       cerr << setw(25) << "stop_after " << stop_after << endl;
-    if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
+    if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
   }
 
 
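The dtrain.cc hunks above turn the previously hard-coded 0.9 scaling into a configurable discount, approx_bleu_d, parsed with boost::program_options and handed to the ApproxBleuScorer constructor. The following is a compilable miniature of that plumbing, not dtrain's actual surrounding code; the score_t typedef is a stand-in for dtrain's own:

  #include <boost/program_options.hpp>
  #include <iostream>

  namespace po = boost::program_options;
  typedef double score_t; // stand-in for dtrain's typedef

  int main(int argc, char** argv)
  {
    po::options_description opts("dtrain options (excerpt)");
    opts.add_options()
      ("approx_bleu_d", po::value<score_t>()->default_value(0.9),
       "discount for approx. BLEU");
    po::variables_map cfg;
    po::store(po::parse_command_line(argc, argv, opts), cfg);
    po::notify(cfg);
    // same access pattern as in main() above
    const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
    std::cout << "approx. B discount " << approx_bleu_d << std::endl;
    // ... would then construct: new ApproxBleuScorer(N, approx_bleu_d)
    return 0;
  }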
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 05535299..a2c219a1 100644
--- a/dtrain/hstreaming/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
@@ -10,6 +10,6 @@ gamma=0
 scorer=stupid_bleu
 sample_from=kbest
 filter=uniq
-pair_sampling=108010
+pair_sampling=XYX
 pair_threshold=0
 select_weights=last
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index bcd82610..77d4a139 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -2,6 +2,8 @@
 #define _DTRAIN_KBESTGET_H_
 
 #include "kbest.h" // cdec
+#include "sentence_metadata.h"
+
 #include "verbose.h"
 #include "viterbi.h"
 #include "ff_register.h"
@@ -32,7 +34,7 @@ struct LocalScorer
   vector<score_t> w_;
 
   virtual score_t
-  Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank)=0;
+  Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
 
   void Reset() {} // only for approx bleu
 
@@ -71,13 +73,15 @@ struct KBestGetter : public HypSampler
   const unsigned k_;
   const string filter_type_;
   vector<ScoredHyp> s_;
+  unsigned src_len_;
 
   KBestGetter(const unsigned k, const string filter_type) :
     k_(k), filter_type_(filter_type) {}
 
   virtual void
-  NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
   {
+    src_len_ = smeta.GetSourceLength();
     KBestScored(*hg);
   }
 
@@ -109,7 +113,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i);
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
     }
   }
@@ -128,7 +132,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i);
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
     }
   }
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index eb4813ab..0783f98b 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -15,13 +15,15 @@ struct KSampler : public HypSampler
   vector<ScoredHyp> s_;
   MT19937* prng_;
   score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+  unsigned src_len_;
 
   explicit KSampler(const unsigned k, MT19937* prng) :
     k_(k), prng_(prng) {}
 
   virtual void
-  NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
   {
+    src_len_ = smeta.GetSourceLength();
     ScoredSamples(*hg);
   }
 
@@ -37,7 +39,7 @@ struct KSampler : public HypSampler
       h.f = samples[i].fmap;
       h.model = log(samples[i].model_score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i);
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
     }
   }
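Both samplers above now follow the same pattern: NotifyTranslationForest stops ignoring its SentenceMetadata argument, caches the source-sentence length once per input, and forwards it through the widened Score() signature. A minimal stand-alone sketch of that flow, with stub types in place of cdec's real interfaces:

  #include <vector>

  typedef int WordID;   // stand-ins for cdec's types
  typedef double score_t;

  struct StubSentenceMetadata { unsigned GetSourceLength() const { return 17; } };

  struct StubScorer {
    score_t Score(std::vector<WordID>& /*hyp*/, std::vector<WordID>& /*ref*/,
                  const unsigned /*rank*/, const unsigned src_len)
    { return src_len ? 1. : 0.; } // only approx. BLEU actually uses src_len
  };

  struct StubSampler {
    unsigned src_len_;
    StubScorer* scorer_;
    void NotifyTranslationForest(const StubSentenceMetadata& smeta)
    { src_len_ = smeta.GetSourceLength(); } // cached once per input
    score_t ScoreHyp(std::vector<WordID>& hyp, std::vector<WordID>& ref, unsigned rank)
    { return scorer_->Score(hyp, ref, rank, src_len_); } // forwarded to the metric
  };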
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 66ca1706..56702b86 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -44,7 +44,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
 {
   sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score);
   unsigned sz = s->size();
-  unsigned sep = sz * hi_lo;
+  unsigned sep = round(sz*hi_lo);
   for (unsigned i = 0; i < sep; i++) {
     for (unsigned j = sep; j < sz; j++) {
       if ((*s)[i].rank < (*s)[j].rank) {
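The one-line partXYX fix above replaces truncation with rounding when the sorted k-best list is split into 'hi' and 'lo' partitions. The difference shows up for short lists, where sz*hi_lo falls below 1 and truncation empties the top partition. A toy check (hi_lo=0.1 as in the example config further down; std::round mirrors the diff's round()):

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const float hi_lo = 0.1f;
    const unsigned sizes[] = {5, 15, 100};
    for (unsigned t = 0; t < 3; t++) {
      unsigned sz = sizes[t];
      unsigned sep_trunc = sz * hi_lo;             // old behavior: implicit floor
      unsigned sep_round = std::round(sz * hi_lo); // new behavior
      std::printf("sz=%3u  floor=%2u  round=%2u\n", sz, sep_trunc, sep_round);
    }
    // sz=5 and sz=15 differ (floor gives 0 and 1, round gives 1 and 2);
    // with sep=0 the shown loop over the 'hi' partition would produce no pairs.
    return 0;
  }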
diff --git a/dtrain/score.cc b/dtrain/score.cc
index d964b4da..d0f9e8a0 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -16,23 +16,23 @@ namespace dtrain
 score_t
 BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
 {
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0;
   for (unsigned i = 0; i < M; i++) {
-    if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.;
+    sum += w_[i] * log((score_t)counts.clipped_[i]/counts.sum_[i]);
   }
   return brevity_penalty(hyp_len, ref_len) * exp(sum);
 }
 
 score_t
 BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                  const unsigned /*rank*/)
+                  const unsigned /*rank*/, const unsigned /*src_len*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
   return Bleu(counts, hyp_len, ref_len);
 }
@@ -49,18 +49,18 @@ BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  */
 score_t
 StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned /*rank*/)
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0, add = 0;
   for (unsigned i = 0; i < M; i++) {
-    if (i == 0 && (counts.clipped[i] == 0 || counts.sum[i] == 0)) return 0;
+    if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
     if (i == 1) add = 1;
-    sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));
+    sum += w_[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
   }
   return  brevity_penalty(hyp_len, ref_len) * exp(sum);
 }
@@ -76,10 +76,10 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  */
 score_t
 SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned /*rank*/)
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
@@ -87,10 +87,10 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
   vector<score_t> i_bleu;
   for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.);
   for (unsigned i = 0; i < M; i++) {
-    if (counts.clipped[i] == 0 || counts.sum[i] == 0) {
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) {
       break;
     } else {
-      score_t i_ng = log((score_t)counts.clipped[i]/counts.sum[i]);
+      score_t i_ng = log((score_t)counts.clipped_[i]/counts.sum_[i]);
       for (unsigned j = i; j < M; j++) {
         i_bleu[j] += (1/((score_t)j+1)) * i_ng;
       }
@@ -107,29 +107,29 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  *        and Structural Translation Features"
  * (Chiang et al. '08)
  *
- * NOTE: needs some code in dtrain.cc
+ * NOTE: needs some more code in dtrain.cc
  */
 score_t
 ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned rank)
+                        const unsigned rank, const unsigned src_len)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
-  NgramCounts tmp(N_);
+  if (ref_len == 0) return 0.;
+  score_t score = 0.;
+  NgramCounts counts(N_);
+  if (hyp_len > 0) {
+    counts = make_ngram_counts(hyp, ref, N_);
+    NgramCounts tmp = glob_onebest_counts_ + counts;
+    score = Bleu(tmp, hyp_len, ref_len);
+  }
   if (rank == 0) { // 'context of 1best translations'
-    glob_onebest_counts += counts;
-    glob_hyp_len += hyp_len;
-    glob_ref_len += ref_len;
-    hyp_len = glob_hyp_len;
-    ref_len = glob_ref_len;
-    tmp = glob_onebest_counts;
-  } else {
-    hyp_len = hyp.size();
-    ref_len = ref.size();
-    tmp = glob_onebest_counts + counts;
+    glob_onebest_counts_ += counts;
+    glob_onebest_counts_ *= discount_;
+    glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_len);
+    glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len);
+    glob_src_len_ = discount_ * (glob_src_len_ + src_len);
   }
-  return 0.9 * Bleu(tmp, hyp_len, ref_len); // TODO param
+  return (score_t)glob_src_len_ * score;
 }
 
 } // namespace
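ApproxBleuScorer now maintains an exponentially discounted 'context of 1-best translations': on every 1-best hypothesis the accumulated counts and lengths are updated as history = discount * (history + current), so each additional update multiplies an old sentence's contribution by another factor of the discount, and the returned score is scaled by the discounted accumulated source length (glob_src_len_). A toy trace of that recurrence (lengths purely illustrative; the diff stores these in unsigned members, so the discounted values are truncated there, while this sketch keeps double to make the arithmetic visible):

  #include <cstdio>

  int main()
  {
    const double d = 0.9;       // approx_bleu_d
    double glob_hyp_len = 0.;   // plays the role of glob_hyp_len_
    const unsigned hyp_lens[] = {20, 25, 18}; // hypothetical 1-best lengths
    for (unsigned i = 0; i < 3; i++) {
      glob_hyp_len = d * (glob_hyp_len + hyp_lens[i]); // same update as above
      std::printf("after 1-best %u: %.3f\n", i + 1, glob_hyp_len);
    }
    // prints 18.000, 38.700, 51.030
    return 0;
  }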
diff --git a/dtrain/score.h b/dtrain/score.h
index 5aceb81f..d0e79f65 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -12,8 +12,8 @@ namespace dtrain
 struct NgramCounts
 {
   unsigned N_;
-  map<unsigned, unsigned> clipped;
-  map<unsigned, unsigned> sum;
+  map<unsigned, score_t> clipped_;
+  map<unsigned, score_t> sum_;
 
   NgramCounts(const unsigned N) : N_(N) { Zero(); }
 
@@ -22,8 +22,8 @@ struct NgramCounts
   {
     assert(N_ == rhs.N_);
     for (unsigned i = 0; i < N_; i++) {
-      this->clipped[i] += rhs.clipped.find(i)->second;
-      this->sum[i] += rhs.sum.find(i)->second;
+      this->clipped_[i] += rhs.clipped_.find(i)->second;
+      this->sum_[i] += rhs.sum_.find(i)->second;
     }
   }
 
@@ -36,15 +36,24 @@ struct NgramCounts
   }
 
   inline void
+  operator*=(const score_t rhs)
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      this->clipped_[i] *= rhs;
+      this->sum_[i] *= rhs;
+    }
+  }
+
+  inline void
   Add(const unsigned count, const unsigned ref_count, const unsigned i)
   {
     assert(i < N_);
     if (count > ref_count) {
-      clipped[i] += ref_count;
+      clipped_[i] += ref_count;
     } else {
-      clipped[i] += count;
+      clipped_[i] += count;
     }
-    sum[i] += count;
+    sum_[i] += count;
   }
 
   inline void
@@ -52,8 +61,8 @@ struct NgramCounts
   {
     unsigned i;
     for (i = 0; i < N_; i++) {
-      clipped[i] = 0;
-      sum[i] = 0;
+      clipped_[i] = 0;
+      sum_[i] = 0;
     }
   }
 
@@ -61,8 +70,8 @@ struct NgramCounts
   Print()
   {
     for (unsigned i = 0; i < N_; i++) {
-      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
-      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+      cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
     }
   }
 };
@@ -106,35 +115,36 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
 struct BleuScorer : public LocalScorer
 {
   score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
 struct StupidBleuScorer : public LocalScorer
 {
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
 struct SmoothBleuScorer : public LocalScorer
 {
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
 struct ApproxBleuScorer : public BleuScorer
 {
-  NgramCounts glob_onebest_counts;
-  unsigned glob_hyp_len, glob_ref_len;
+  NgramCounts glob_onebest_counts_;
+  unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_;
+  score_t discount_;
 
-  ApproxBleuScorer(unsigned N) : glob_onebest_counts(NgramCounts(N))
+  ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d)
   {
-    glob_hyp_len = glob_ref_len = 0;
+    glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0;
   }
 
   inline void Reset() {
-    glob_onebest_counts.Zero();
-    glob_hyp_len = glob_ref_len = 0;
+    glob_onebest_counts_.Zero();
+    glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
   }
 
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
 };
 
 } // namespace
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index cd2c75e7..2ad44688 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -4,18 +4,18 @@ decoder_config=test/example/cdec.ini # config for cdec
 # weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
 tmp=/tmp
-stop_after=100 # stop epoch after 100 inputs
+stop_after=20 # stop epoch after 20 inputs
 # interesting stuff
 epochs=3                # run over input 3 times
 k=100                   # use 100best lists
 N=4                     # optimize (approx) BLEU4
-scorer=approx_bleu      # use 'stupid' BLEU+1
+scorer=stupid_bleu      # use 'stupid' BLEU+1
 learning_rate=0.0001    # learning rate
 gamma=0                 # use SVM reg
 sample_from=kbest       # use kbest lists (as opposed to forest)
 filter=uniq             # only unique entries in kbest (surface form)
 pair_sampling=XYX
-hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10
+hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
 pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
 select_weights=VOID     # don't output weights
 
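With clipped_ and sum_ now holding score_t, NgramCounts supports all the arithmetic the discounted scorer needs: operator+= to fold in 1-best counts, the new operator*= to decay them, and operator+ (defined outside the shown hunks, as used in ApproxBleuScorer::Score above) to merge the history with a hypothesis' counts. A usage sketch, assuming dtrain's score.h is on the include path; ngram_counts_demo is illustrative only:

  #include "score.h" // dtrain
  using namespace dtrain;

  void ngram_counts_demo()
  {
    NgramCounts hist(4), cur(4);     // zero-initialized 4-gram statistics
    cur.Add(3, 5, 0);                // 3 matching unigrams, 5 in the reference:
                                     // clipped_[0] += 3, sum_[0] += 3
    NgramCounts merged = hist + cur; // history + current hypothesis counts
    hist += cur;                     // accumulate the 1-best...
    hist *= 0.9;                     // ...then discount the whole history
    merged.Print();                  // dumps per-order clipped/total counts
  }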
