9 files changed, 98 insertions, 156 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 35e6cc46..622cd01e 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -95,38 +95,32 @@ main(int argc, char** argv)
     cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
   Decoder decoder(ini_rf.stream());
 
-  MT19937 rng; // random number generator
-  // setup decoder observer
-  HypSampler* observer;
-  if (sample_from == "kbest") {
-    observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
-  } else {
-    observer = dynamic_cast<KSampler*>(new KSampler(k, &rng));
-  }
-
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
-  /*score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+  LocalScorer* scorer; // assigned by exactly one branch below; unknown names exit(1)
   if (scorer_str == "bleu") {
-    scorer = &bleu;
+    scorer = dynamic_cast<BleuScorer*>(new BleuScorer); // branch used to assign nothing, leaving scorer uninitialized for scorer->Init()
   } else if (scorer_str == "stupid_bleu") {
-    scorer = &stupid_bleu;
+    scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
   } else if (scorer_str == "smooth_bleu") {
-    scorer = &smooth_bleu;
+    scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
   } else if (scorer_str == "approx_bleu") {
-    scorer = &approx_bleu;
+    scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); // FIXME
   } else {
     cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
     exit(1);
   }
-  NgramCounts global_counts(N); // counts for 1 best translations
-  unsigned global_hyp_len = 0;    // sum hypothesis lengths
-  unsigned global_ref_len = 0;    // sum reference lengths
-  // ^^^ global_* for approx_bleu*/
-  vector<score_t> bleu_weights;   // we leave this empty -> 1/N 
-  //if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
-  StupidBleuScorer scorer;
-  scorer.Init(N, bleu_weights);
+  vector<score_t> bleu_weights; // left empty so LocalScorer::Init() falls back to uniform 1/N weights
+  scorer->Init(N, bleu_weights);
+  if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
+  // setup decoder observer
+  MT19937 rng; // random number generator
+  HypSampler* observer;
+  if (sample_from == "kbest")
+    observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
+  else
+    observer = dynamic_cast<KSampler*>(new KSampler(k, &rng));
+  observer->SetScorer(scorer); // the observer scores every hypothesis it collects with this scorer
 
   // init weights
   Weights weights;
@@ -240,10 +234,10 @@ main(int argc, char** argv)
     vector<WordID> ref_ids;  // reference as vector<WordID>
     if (t == 0) {
       // handling input
-      strsplit(in, in_split, '\t', 4);
+      boost::split(in_split, in, boost::is_any_of("\t"));
       // getting reference
       vector<string> ref_tok;
-      strsplit(in_split[2], ref_tok, ' ');
+      boost::split(ref_tok, in_split[2], boost::is_any_of(" "));
       register_and_convert(ref_tok, ref_ids);
       ref_ids_buf.push_back(ref_ids);
       // process and set grammar
@@ -259,8 +253,9 @@ main(int argc, char** argv)
       in_split[3] += "\n";
       grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
       decoder.SetSentenceGrammarFromString(in_split[3]);
-      // decode
       src_str_buf.push_back(in_split[1]);
+      // decode
+      observer->SetRef(ref_ids);
       decoder.Decode(in_split[1], observer);
     } else {
       // get buffered grammar
@@ -273,32 +268,24 @@ main(int argc, char** argv)
       }
       decoder.SetSentenceGrammarFromString(grammar_str);
       // decode
+      observer->SetRef(ref_ids_buf[ii]);
       decoder.Decode(src_str_buf[ii], observer);
     }
 
+    // get (scored) samples 
     vector<ScoredHyp>* samples = observer->GetSamples();
 
-    // (local) scoring
-    if (t > 0) ref_ids = ref_ids_buf[ii];
-    for (unsigned i = 0; i < samples->size(); i++) {
-        //cout << ii << " " << i << endl;
-
-        cout << _p9;
-      (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii);
-      if (i == 0) {
-        score_sum += (*samples)[i].score;
-        model_sum += (*samples)[i].model;
-      }
-
-      if (verbose) {
-        if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl;
-        cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'";
-        cerr << " [SCORE=" << (*samples)[i].score << ",model="<< (*samples)[i].model << "]" << endl;
-        cerr << (*samples)[i].f << endl;
-      }
+    if (verbose) { // print the reference and the first sample (labelled 1best) with its scores and features
+      cout << "[ref: '";
+      if (t > 0) cout << ref_ids_buf[ii]; // later epochs read the buffered reference
+      else cout << ref_ids;
+      cout << "'" << endl; // close the quote opened by "[ref: '" (it was left dangling)
+      cout << _p5 << _np << "1best: " << "'" << (*samples)[0].w << "'" << endl;
+      cout << "SCORE=" << (*samples)[0].score << ",model="<< (*samples)[0].model << endl;
+      cout << "F{" << (*samples)[0].f << "} ]" << endl << endl;
     }
-
-    if (verbose) cerr << endl;
+    score_sum += (*samples)[0].score;
+    model_sum += (*samples)[0].model;
 
 //////////////////////////////////////////////////////////
     // UPDATE WEIGHTS
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index ed75a297..0c27167d 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -7,14 +7,6 @@
 #include <boost/algorithm/string.hpp>
 #include <boost/program_options.hpp>
 
-#include "verbose.h"
-#include "viterbi.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "weights.h"
-
-#include "score.h"
-#include "kbestget.h"
 #include "ksampler.h"
 #include "pairsampling.h"
 
@@ -31,27 +23,12 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
   for (it = strs.begin(); it < strs.end(); it++)
     ids.push_back(TD::Convert(*it));
 }
+
 inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
 inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }
 inline ostream& _p2(ostream& out) { return out << setprecision(2); }
 inline ostream& _p5(ostream& out) { return out << setprecision(5); }
 inline ostream& _p9(ostream& out) { return out << setprecision(9); }
-inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) { 
-  stringstream ss(s);
-  string t;
-  unsigned i = 0;
-  while(true)
-  {
-    if (parts > 0 && i == parts-1) {
-      getline(ss, t);
-      v.push_back(t);
-      break;
-    }
-    if (!getline(ss, t, d)) break;
-    v.push_back(t);
-    i++;
-  }
-}
 
 #endif
 
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 2a2c6073..c0fd3f47 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -1,12 +1,6 @@
 #ifndef _DTRAIN_KBESTGET_H_
 #define _DTRAIN_KBESTGET_H_
 
-
-#include <vector>
-#include <string>
-
-using namespace std;
-
 #include "kbest.h" // cdec
 #include "verbose.h"
 #include "viterbi.h"
@@ -14,11 +8,13 @@ using namespace std;
 #include "decoder.h"
 #include "weights.h"
 
+using namespace std;
+
 namespace dtrain
 {
 
-typedef double score_t; // float
 
+typedef double score_t; // float
 
 struct ScoredHyp
 {
@@ -29,10 +25,44 @@ struct ScoredHyp
   unsigned rank;
 };
 
+struct LocalScorer // abstract base for per-sentence scorers; concrete scorers implement Score()
+{
+  unsigned N_;        // N given to Init(); the scorers pass it to make_ngram_counts()
+  vector<score_t> w_; // one weight per n-gram order, filled by Init()
+  virtual ~LocalScorer() {} // polymorphic base: keeps destruction through a LocalScorer* well-defined
+  virtual score_t // score of hypothesis `hyp` against reference `ref`
+  Score(vector<WordID>& hyp, vector<WordID>& ref)=0;
+
+  void // stores N and the weights; an empty `weights` selects uniform 1/N
+  Init(unsigned N, vector<score_t> weights)
+  {
+    assert(N > 0);
+    N_ = N;
+    if (weights.empty()) w_.assign(N_, 1./N_); // assign() replaces the contents; push_back() kept appending on a repeated Init()
+    else w_ = weights;
+  }
+
+  score_t // 1 when hyp_len > ref_len, otherwise exp(1 - ref_len/hyp_len)
+  brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
+  {
+    if (hyp_len > ref_len) return 1;
+    return exp(1 - (score_t)ref_len/hyp_len);
+  }
+};
+
 struct HypSampler : public DecoderObserver
 {
+  LocalScorer* scorer_; // scorer the samplers call on each hypothesis; set through SetScorer()
+  vector<WordID>* ref_; // reference handed to scorer_->Score(); set through SetRef()
   virtual vector<ScoredHyp>* GetSamples()=0;
+  void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } // stores the pointer only
+  void SetRef(vector<WordID>& ref) { ref_ = &ref; } // keeps the address of the caller's vector, not a copy
 };
+/////////////////////////////////////////////////////////////////////
+// wtf
+
+
+
 
 struct KBestGetter : public HypSampler
 {
@@ -77,6 +107,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
+      h.score = scorer_->Score(h.w, *ref_);
       s_.push_back(h);
     }
   }
@@ -95,6 +126,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
+      h.score = scorer_->Score(h.w, *ref_);
       s_.push_back(h);
     }
   }
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 767dc42e..7567f43a 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -1,15 +1,9 @@
 #ifndef _DTRAIN_KSAMPLER_H_
 #define _DTRAIN_KSAMPLER_H_
 
-#include "kbestget.h"
 #include "hgsampler.h"
-#include <vector>
-#include <string>
-
-using namespace std;
-
-#include "kbest.h" // cdec
-#include "sampler.h"
+#include "kbestget.h"
+#include "score.h"
 
 namespace dtrain
 {
@@ -43,6 +37,7 @@ struct KSampler : public HypSampler
       h.f = samples[i].fmap;
       h.model = log(samples[i].model_score); 
       h.rank = i;
+      h.score = scorer_->Score(h.w, *ref_);
       s_.push_back(h);
     }
   }
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 4a6d93d1..6db0c045 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -1,12 +1,6 @@
 #ifndef _DTRAIN_PAIRSAMPLING_H_
 #define _DTRAIN_PAIRSAMPLING_H_
 
-#include "kbestget.h"
-#include "score.h"
-#include <vector>
-#include <string>
-using namespace std;
-#include "sampler.h" // cdec, MT19937
 
 namespace dtrain
 {
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 9b22508b..93c4e80b 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -22,17 +22,17 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref
   score_t sum = 0;
   for (unsigned i = 0; i < M; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += w_[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
+    sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
 
 score_t
-BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref) // scores hypothesis word IDs against reference word IDs
 {
-  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+  unsigned hyp_len = hyp.size(), ref_len = ref.size(); // an empty hypothesis or reference scores 0 (next line)
   if (hyp_len == 0 || ref_len == 0) return 0;
-  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_); // counts and both lengths are handed to Bleu()
   return Bleu(counts, hyp_len, ref_len);
 }
 
@@ -47,30 +47,18 @@ BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
  * NOTE: 0 iff no 1gram match
  */
 score_t
-StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref) // scores hypothesis word IDs against reference word IDs
 {
-  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+  unsigned hyp_len = hyp.size(), ref_len = ref.size(); // an empty hypothesis or reference scores 0 (next line)
   if (hyp_len == 0 || ref_len == 0) return 0;
-  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_); // the loop below uses orders 0..M-1, M = min(N_, ref_len)
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0, add = 0;
   for (unsigned i = 0; i < M; i++) {
     if (i == 1) add = 1;
-    //cout << ((score_t)counts.clipped[i] + add) << "/" << counts.sum[i] +add << "." << endl;
-    //cout << "w_[i] " << w_[i] << endl;
-    sum += w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
-    //cout << "sum += "<< w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))) << endl;
+    sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add))); // add is 0 for i == 0 and 1 for every later order
   }
-  /*cout << ref_ids << endl;
-  cout << hyp.w << endl;
-  cout << "ref_len " << ref_len << endl;
-  cout << "hyp_len " << hyp_len << endl;
-  cout << "bp " << brevity_penaly(hyp_len, ref_len) << endl;
-  cout << "exp(sum) " << exp(sum) << endl;
-  counts.Print();
-  cout << brevity_penaly(hyp_len, ref_len) * exp(sum) << endl;
-  cout << "---" << endl;*/
   return  brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
 
@@ -84,21 +72,22 @@ StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
  * NOTE: max is 0.9375
  */
 score_t
-SmoothBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref) // scores hypothesis word IDs against reference word IDs
 {
-  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+  unsigned hyp_len = hyp.size(), ref_len = ref.size(); // an empty hypothesis or reference scores 0 (next line)
   if (hyp_len == 0 || ref_len == 0) return 0;
-  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_); // the loop below visits all N_ orders
   score_t sum = 0;
   unsigned j = 1;
   for (unsigned i = 0; i < N_; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N_-j+1);
+    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1); // j advances only for orders that were not skipped
     j++;
   }
   return brevity_penaly(hyp_len, ref_len) * sum;
 }
 
+// FIXME
 /*
  * approx. bleu
  *
diff --git a/dtrain/score.h b/dtrain/score.h
index f87d708c..9af56ef9 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -1,16 +1,8 @@
 #ifndef _DTRAIN_SCORE_H_
 #define _DTRAIN_SCORE_H_
 
-#include <iostream>
-#include <vector>
-#include <map>
-#include <cassert>
-#include <cmath>
-
 #include "kbestget.h"
 
-#include "wordid.h" // cdec
-
 using namespace std;
 
 namespace dtrain
@@ -111,51 +103,28 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
   return counts;
 }
 
-struct LocalScorer
-{
-  unsigned N_;
-  vector<score_t> w_;
-
-  virtual score_t
-  Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)=0;
-
-  void
-  Init(unsigned N, vector<score_t> weights)
-  {
-    assert(N > 0);
-    N_ = N;
-    if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_);
-    else w_ = weights;
-  }
-
-  score_t
-  brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
-  {
-    if (hyp_len > ref_len) return 1;
-    return exp(1 - (score_t)ref_len/hyp_len);
-  }
-};
-
 struct BleuScorer : public LocalScorer
 {
   score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
-  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref); // implements LocalScorer::Score; parameter named `ref` as in score.cc and the sibling scorers
 };
 
 struct StupidBleuScorer : public LocalScorer
 {
-  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref);
 };
 
 struct SmoothBleuScorer : public LocalScorer
 {
-  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref);
 };
 
 // FIXME
 /*struct ApproxBleuScorer : public LocalScorer
 {
-  NgramCounts glob_onebest_counts;
+  bool prepped;
+
+  NgramCounts* glob_onebest_counts;
   unsigned glob_hyp_len, glob_ref_len;
 
   void Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
@@ -171,7 +140,6 @@ struct SmoothBleuScorer : public LocalScorer
 };*/
 
 
-
 } // namespace
 
 #endif
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index df746e51..fd3a3841 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,11 +1,11 @@
 decoder_config=test/example/cdec.ini
 k=100
 N=4
-epochs=10
+epochs=100
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
 output=test/example/weights.gz
-stop_after=10
-sample_from=kbest
-pair_sampling=all
+stop_after=0
+sample_from=forest
+pair_sampling=rand
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index e7baa367..7960a05a 100644
--- a/dtrain/test/example/weights.gz
+++ b/dtrain/test/example/weights.gz