From e16b311246f9f2c309b257debd5f50a28b04802b Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Mon, 26 Sep 2011 18:24:58 +0200
Subject: score refactoring #1

---
 dtrain/dtrain.cc               |  41 ++++---------
 dtrain/kbestget.h              |  20 ++++++-
 dtrain/ksampler.h              |   8 +++
 dtrain/pairsampling.h          |   4 ++
 dtrain/score.cc                | 132 +++++++++++++++++++++--------------------
 dtrain/score.h                 | 120 ++++++++++++++++++++++++++++++-------
 dtrain/test/example/cdec.ini   |   2 +-
 dtrain/test/example/dtrain.ini |   4 +-
 dtrain/test/example/weights.gz | Bin 248 -> 12001 bytes
 9 files changed, 213 insertions(+), 118 deletions(-)
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 44090242..35e6cc46 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -106,7 +106,7 @@ main(int argc, char** argv)
 
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
-  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+  /*score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
   if (scorer_str == "bleu") {
     scorer = &bleu;
   } else if (scorer_str == "stupid_bleu") {
@@ -122,9 +122,11 @@ main(int argc, char** argv)
   NgramCounts global_counts(N); // counts for 1 best translations
   unsigned global_hyp_len = 0;    // sum hypothesis lengths
   unsigned global_ref_len = 0;    // sum reference lengths
-  // ^^^ global_* for approx_bleu
+  // ^^^ global_* for approx_bleu*/
   vector<score_t> bleu_weights;   // we leave this empty -> 1/N 
-  if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
+  if (scorer_str != "stupid_bleu") cerr << "WARNING: scorer '" << scorer_str << "' cannot be selected yet, using 'stupid_bleu'" << endl; // the dispatch on scorer_str above is commented out
+  StupidBleuScorer scorer; // FIXME: hard-wired until the configured scorer is dispatched again
+  scorer.Init(N, bleu_weights); // bleu_weights is empty, so Init() falls back to uniform 1/N weights
 
   // init weights
   Weights weights;
@@ -240,7 +242,6 @@ main(int argc, char** argv)
       // handling input
       strsplit(in, in_split, '\t', 4);
       // getting reference
-      ref_ids.clear();
       vector<string> ref_tok;
       strsplit(in_split[2], ref_tok, ' ');
       register_and_convert(ref_tok, ref_ids);
@@ -279,43 +280,23 @@ main(int argc, char** argv)
 
     // (local) scoring
     if (t > 0) ref_ids = ref_ids_buf[ii];
-    score_t score = 0.;
     for (unsigned i = 0; i < samples->size(); i++) {
-      NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N);
-      if (scorer_str == "approx_bleu") {
-        unsigned hyp_len = 0;
-        if (i == 0) { // 'context of 1best translations'
-          global_counts  += counts;
-          global_hyp_len += (*samples)[i].w.size();
-          global_ref_len += ref_ids.size();
-          counts.reset();
-        } else {
-            hyp_len = (*samples)[i].w.size();
-        }
-        NgramCounts _c = global_counts + counts;
-        score = .9 * scorer(_c,
-                            global_ref_len,
-                            global_hyp_len + hyp_len, N, bleu_weights);
-      } else {
-        score = scorer(counts,
-                       ref_ids.size(),
-                       (*samples)[i].w.size(), N, bleu_weights);
-      }
-
-      (*samples)[i].score = (score);
+        // score sample i; ii (the ref_ids_buf index used above) is handed to Score() as its 'id' argument
 
+        cout << _p9; // NOTE(review): _p9 is defined outside this patch and nothing else in this loop writes to cout -- presumably a debugging leftover, confirm
+      (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii);
       if (i == 0) {
-        score_sum += score;
+        score_sum += (*samples)[i].score;
         model_sum += (*samples)[i].model;
       }
 
       if (verbose) {
         if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl;
         cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'";
-        cerr << " [SCORE=" << score << ",model="<< (*samples)[i].model << "]" << endl;
+        cerr << " [SCORE=" << (*samples)[i].score << ",model="<< (*samples)[i].model << "]" << endl;
         cerr << (*samples)[i].f << endl;
       }
-    } // sample/scoring loop
+    }
 
     if (verbose) cerr << endl;
 
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 935998a0..2a2c6073 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -1,11 +1,24 @@
 #ifndef _DTRAIN_KBESTGET_H_
 #define _DTRAIN_KBESTGET_H_
 
-#include "kbest.h"
+
+#include <vector>
+#include <string>
+
+using namespace std;
+
+#include "kbest.h" // cdec
+#include "verbose.h"
+#include "viterbi.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
 
 namespace dtrain
 {
 
+typedef double score_t; // float
+
 
 struct ScoredHyp
 {
@@ -13,11 +26,12 @@ struct ScoredHyp
   SparseVector<double> f;
   score_t model;
   score_t score;
+  unsigned rank;
 };
 
 struct HypSampler : public DecoderObserver
 {
-  virtual vector<ScoredHyp>* GetSamples() {}
+  virtual vector<ScoredHyp>* GetSamples()=0; // pure virtual: derived samplers must implement it (the old empty body fell off the end without returning a value)
 };
 
 struct KBestGetter : public HypSampler
@@ -62,6 +76,7 @@ struct KBestGetter : public HypSampler
       h.w = d->yield;
       h.f = d->feature_values;
       h.model = log(d->score);
+      h.rank = i;
       s_.push_back(h);
     }
   }
@@ -79,6 +94,7 @@ struct KBestGetter : public HypSampler
       h.w = d->yield;
       h.f = d->feature_values;
       h.model = log(d->score);
+      h.rank = i;
       s_.push_back(h);
     }
   }
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 17b0ba56..767dc42e 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -1,7 +1,13 @@
 #ifndef _DTRAIN_KSAMPLER_H_
 #define _DTRAIN_KSAMPLER_H_
 
+#include "kbestget.h"
 #include "hgsampler.h"
+#include <vector>
+#include <string>
+
+using namespace std;
+
 #include "kbest.h" // cdec
 #include "sampler.h"
 
@@ -14,6 +20,7 @@ struct KSampler : public HypSampler
   const unsigned k_;
   vector<ScoredHyp> s_;
   MT19937* prng_;
+  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); // NOTE(review): not assigned or called in the lines this patch shows, and NgramCounts comes from score.h, which is not included directly here -- confirm this member is needed
 
   explicit KSampler(const unsigned k, MT19937* prng) :
     k_(k), prng_(prng) {}
@@ -35,6 +42,7 @@ struct KSampler : public HypSampler
       h.w = samples[i].words;
       h.f = samples[i].fmap;
       h.model = log(samples[i].model_score); 
+      h.rank = i;
       s_.push_back(h);
     }
   }
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 9546a945..4a6d93d1 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -2,6 +2,10 @@
 #define _DTRAIN_PAIRSAMPLING_H_
 
 #include "kbestget.h"
+#include "score.h"
+#include <vector>
+#include <string>
+using namespace std;
 #include "sampler.h" // cdec, MT19937
 
 namespace dtrain
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 52644250..9b22508b 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -4,40 +4,6 @@ namespace dtrain
 {
 
 
-Ngrams
-make_ngrams(vector<WordID>& s, unsigned N)
-{
-  Ngrams ngrams;
-  vector<WordID> ng;
-  for (size_t i = 0; i < s.size(); i++) {
-    ng.clear();
-    for (unsigned j = i; j < min(i+N, s.size()); j++) {
-      ng.push_back(s[j]);
-      ngrams[ng]++;
-    }
-  }
-  return ngrams;
-}
-
-NgramCounts
-make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
-{
-  Ngrams hyp_ngrams = make_ngrams(hyp, N);
-  Ngrams ref_ngrams = make_ngrams(ref, N);
-  NgramCounts counts(N);
-  Ngrams::iterator it;
-  Ngrams::iterator ti;
-  for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
-    ti = ref_ngrams.find(it->first);
-    if (ti != ref_ngrams.end()) {
-      counts.add(it->second, ti->second, it->first.size() - 1);
-    } else {
-      counts.add(it->second, 0, it->first.size() - 1);
-    }
-  }
-  return counts;
-}
-
 /*
  * bleu
  *
@@ -48,26 +14,28 @@ make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
  * NOTE: 0 if one n in {1..N} has 0 count
  */
 score_t
-brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
-{
-  if (hyp_len > ref_len) return 1;
-  return exp(1 - (score_t)ref_len/hyp_len);
-}
-score_t
-bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-      unsigned N, vector<score_t> weights )
+BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len) // BLEU from precomputed n-gram counts; returns 0 for empty input or an order without matches
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
-  if (ref_len < N) N = ref_len;
-  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  unsigned M = N_; // number of n-gram orders that are actually scored
+  if (ref_len < N_) M = ref_len; // a reference shorter than N_ has no n-grams of order > ref_len
   score_t sum = 0;
-  for (unsigned i = 0; i < N; i++) {
+  for (unsigned i = 0; i < M; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += weights[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
+    sum += (M < N_ ? 1./M : w_[i]) * log((score_t)counts.clipped[i] / counts.sum[i]); // short reference: uniform 1/M as in the pre-refactoring bleu(), not the 1/N_ stored in w_ (this also overrides custom weights in that case)
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
 
+score_t // sentence-level BLEU of hyp.w against ref_ids; 'id' is not used by this scorer
+BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+{
+  const unsigned n_hyp = hyp.w.size(), n_ref = ref_ids.size();
+  if (!n_hyp || !n_ref) return 0; // nothing to compare: score 0 without counting n-grams
+  NgramCounts stats = make_ngram_counts(hyp.w, ref_ids, N_);
+  return Bleu(stats, n_hyp, n_ref);
+}
+
 /*
  * 'stupid' bleu
  *
@@ -79,18 +47,31 @@ bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
  * NOTE: 0 iff no 1gram match
  */
 score_t
-stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-             unsigned N, vector<score_t> weights )
+StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
 {
+  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
   if (hyp_len == 0 || ref_len == 0) return 0;
-  if (ref_len < N) N = ref_len;
-  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
   score_t sum = 0, add = 0;
-  for (unsigned i = 0; i < N; i++) {
+  for (unsigned i = 0; i < M; i++) {
     if (i == 1) add = 1;
-    sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
+    //cout << ((score_t)counts.clipped[i] + add) << "/" << counts.sum[i] +add << "." << endl;
+    //cout << "w_[i] " << w_[i] << endl;
+    sum += w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
+    //cout << "sum += "<< w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))) << endl;
   }
-  return brevity_penaly(hyp_len, ref_len) * exp(sum);
+  /*cout << ref_ids << endl;
+  cout << hyp.w << endl;
+  cout << "ref_len " << ref_len << endl;
+  cout << "hyp_len " << hyp_len << endl;
+  cout << "bp " << brevity_penaly(hyp_len, ref_len) << endl;
+  cout << "exp(sum) " << exp(sum) << endl;
+  counts.Print();
+  cout << brevity_penaly(hyp_len, ref_len) * exp(sum) << endl;
+  cout << "---" << endl;*/
+  return  brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
 
 /*
@@ -103,16 +84,16 @@ stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
  * NOTE: max is 0.9375
  */
 score_t
-smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-            const unsigned N, vector<score_t> weights )
+SmoothBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) // smoothed sentence-level BLEU of hyp.w against ref_ids; 'id' is not used by this scorer
 {
+  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
   if (hyp_len == 0 || ref_len == 0) return 0;
-  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); // clipped and total n-gram counts for orders 1..N_
   score_t sum = 0;
   unsigned j = 1;
-  for (unsigned i = 0; i < N; i++) {
+  for (unsigned i = 0; i < N_; i++) { // always runs over all N_ orders; orders without matches are skipped below
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N-j+1);
+    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N_-j+1); // the divisor is 2^(N_-j+1), where j counts only the orders that were not skipped so far
     j++;
   }
   return brevity_penaly(hyp_len, ref_len) * sum;
@@ -125,14 +106,39 @@ smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
  *        and Structural Translation Features"
  * (Chiang et al. '08)
  */
-score_t
-approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-            const unsigned N, vector<score_t> weights)
+/*void
+ApproxBleuScorer::Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
+{
+  glob_onebest_counts += counts;
+  glob_hyp_len += hyp_len;
+  glob_ref_len += ref_len;
+}
+
+void
+ApproxBleuScorer::Reset()
 {
-  return brevity_penaly(hyp_len, ref_len) 
-           * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
+  glob_onebest_counts.Zero();
+  glob_hyp_len = 0;
+  glob_ref_len = 0;
 }
 
+score_t
+ApproxBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+{
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  if (id == 0) Reset();
+  unsigned hyp_len = 0, ref_len = 0;
+  if (hyp.rank == 0) { // 'context of 1best translations'
+    Prep(counts, hyp.w.size(), ref_ids.size());
+    counts.Zero();
+  } else {
+    hyp_len = hyp.w.size();
+    ref_len = ref_ids.size();
+  }
+  return 0.9 * BleuScorer::Bleu(glob_onebest_counts + counts,
+                                glob_hyp_len + hyp_len, glob_ref_len + ref_len);
+}*/
+
 
 } // namespace
 
diff --git a/dtrain/score.h b/dtrain/score.h
index 3e5d82a9..f87d708c 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -7,6 +7,8 @@
 #include <cassert>
 #include <cmath>
 
+#include "kbestget.h"
+
 #include "wordid.h" // cdec
 
 using namespace std;
@@ -15,15 +17,13 @@ namespace dtrain
 {
 
 
-typedef double score_t; // float
-
 struct NgramCounts
 {
   unsigned N_;
   map<unsigned, unsigned> clipped;
   map<unsigned, unsigned> sum;
 
-  NgramCounts(const unsigned N) : N_(N) { reset(); } 
+  NgramCounts(const unsigned N) : N_(N) { Zero(); } 
 
   void
   operator+=(const NgramCounts& rhs)
@@ -44,20 +44,19 @@ struct NgramCounts
   }
 
   void
-  add(unsigned count, unsigned ref_count, unsigned i)
+  Add(unsigned count, unsigned ref_count, unsigned i) // accumulates one n-gram at index i: clipped[i] grows by min(count, ref_count), sum[i] by count
   {
     assert(i < N_);
     if (count > ref_count) {
       clipped[i] += ref_count;
-      sum[i] += count;
     } else {
       clipped[i] += count;
-      sum[i] += count;
     }
+    sum[i] += count; // the total is added in both branches, so it is done once here
   }
 
   void
-  reset()
+  Zero()
   {
     unsigned i;
     for (i = 0; i < N_; i++) {
@@ -67,7 +66,7 @@ struct NgramCounts
   }
 
   void
-  print()
+  Print()
   {
     for (unsigned i = 0; i < N_; i++) {
       cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
@@ -78,18 +77,99 @@ struct NgramCounts
 
 typedef map<vector<WordID>, unsigned> Ngrams;
 
-Ngrams make_ngrams(vector<WordID>& s, unsigned N);
-NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N);
-
-score_t brevity_penaly(const unsigned hyp_len, const unsigned ref_len);
-score_t bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
-             vector<score_t> weights = vector<score_t>());
-score_t stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, unsigned N,
-                    vector<score_t> weights = vector<score_t>());
-score_t smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
-                    vector<score_t> weights = vector<score_t>());
-score_t approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
-                    vector<score_t> weights = vector<score_t>());
+inline Ngrams // collects every n-gram of order 1..N in s, mapped to its number of occurrences
+make_ngrams(const vector<WordID>& s, const unsigned N)
+{
+  Ngrams found;
+  vector<WordID> gram; // buffer reused for every start position
+  for (size_t start = 0; start < s.size(); ++start) {
+    gram.clear();
+    for (unsigned len = 0; len < N && start+len < s.size(); ++len) {
+      gram.push_back(s[start+len]); // gram now holds s[start..start+len]
+      ++found[gram];
+    }
+  }
+  return found;
+}
+
+// Matches the n-grams (orders 1..N) of hyp against those of ref.
+// For every distinct hypothesis n-gram, its number of occurrences in the
+// hypothesis and in the reference (0 if the reference does not contain it)
+// are handed to NgramCounts::Add under the index order-1.
+// Reference n-grams that do not occur in the hypothesis are not counted.
+inline NgramCounts
+make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N)
+{
+  const Ngrams in_hyp = make_ngrams(hyp, N);
+  const Ngrams in_ref = make_ngrams(ref, N);
+  NgramCounts result(N);
+  for (Ngrams::const_iterator h = in_hyp.begin(); h != in_hyp.end(); ++h) {
+    const Ngrams::const_iterator r = in_ref.find(h->first);
+    const unsigned ref_count = (r == in_ref.end()) ? 0 : r->second;
+    result.Add(h->second, ref_count, h->first.size() - 1);
+  }
+  return result;
+}
+
+struct LocalScorer // abstract base of the sentence-level scorers: maximum n-gram order N_ and per-order weights w_
+{
+  unsigned N_;
+  vector<score_t> w_;
+
+  virtual ~LocalScorer() {} // polymorphic base: keeps destruction through a LocalScorer* well-defined
+  virtual score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)=0; // score of hyp against the reference ref_ids, implemented by each concrete scorer
+
+  void // sets N_ and w_; N must be > 0, empty weights select uniform 1/N
+  Init(unsigned N, vector<score_t> weights)
+  {
+    assert(N > 0);
+    N_ = N;
+    if (weights.empty()) w_.assign(N_, 1./N_); // replaces w_ instead of appending, so a repeated Init() leaves no stale weights
+    else w_ = weights;
+  }
+
+  score_t // 1 if the hypothesis is longer than the reference, else exp(1 - ref_len/hyp_len); hyp_len must be > 0 (it is the divisor)
+  brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
+  {
+    if (hyp_len > ref_len) return 1;
+    return exp(1 - (score_t)ref_len/hyp_len);
+  }
+};
+
+struct BleuScorer : public LocalScorer // sentence-level BLEU without smoothing
+{
+  score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); // BLEU from precomputed counts; 0 if a scored n-gram order has no match
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); // counts the n-grams of hyp.w against ref_ids, then calls Bleu()
+};
+
+struct StupidBleuScorer : public LocalScorer // BLEU variant that adds 1 to the counts of every n-gram order above unigrams
+{
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); // returns 0 if hyp.w or ref_ids is empty
+};
+
+struct SmoothBleuScorer : public LocalScorer // BLEU variant that sums down-weighted per-order precisions and skips orders without matches
+{
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); // returns 0 if hyp.w or ref_ids is empty
+};
+
+// FIXME
+/*struct ApproxBleuScorer : public LocalScorer
+{
+  NgramCounts glob_onebest_counts;
+  unsigned glob_hyp_len, glob_ref_len;
+
+  void Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
+  void Reset();
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+
+  ApproxBleuScorer() 
+  {
+    glob_onebest_counts.Zero();
+    glob_hyp_len = 0;
+    glob_ref_len = 0;
+  }
+};*/
+
 
 
 } // namespace
diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini
index 50379afe..31a205c7 100644
--- a/dtrain/test/example/cdec.ini
+++ b/dtrain/test/example/cdec.ini
@@ -4,4 +4,4 @@ cubepruning_pop_limit=30
 scfg_max_span_limit=15
 feature_function=WordPenalty
 feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
-#feature_function=RuleIdentityFeatures
+feature_function=RuleIdentityFeatures
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index fbddb915..df746e51 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,7 +1,7 @@
 decoder_config=test/example/cdec.ini
 k=100
-N=3
-epochs=1000
+N=4
+epochs=10
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
 output=test/example/weights.gz
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index e2e1ecce..e7baa367 100644
Binary files a/dtrain/test/example/weights.gz and b/dtrain/test/example/weights.gz differ
-- 
cgit v1.2.3