8 files changed, 59 insertions, 36 deletions
diff --git a/dtrain/README.md b/dtrain/README.md
index 8cf99800..077fbc58 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -41,13 +41,13 @@ DTRAIN_LOCAL.
 
 Next
 ----
-+ (dtrain|decoder) meta-parameters testing
-+ target side rule ngrams
++ (dtrain|decoder) more meta-parameters testing
++ feature selection directly in dtrain
++ feature template: target side rule ngrams
 + sa-extract -> leave-one-out for grammar of training set?
 + make svm doable; no subgradient?
 + reranking while sgd?
 + try PRO, mira emulations
-+ avg feature count
 
 Legal
 -----
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index e817e7ab..b662cd26 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -21,17 +21,18 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("filter",            po::value<string>()->default_value("uniq"),                       "filter kbest list: 'not', 'uniq'")
     ("pair_sampling",     po::value<string>()->default_value("XYX"),              "how to sample pairs: 'all', 'XYX' or 'PRO'")
     ("hi_lo",             po::value<float>()->default_value(0.1),                "hi and lo (X) for XYX (default 0.1), <= 0.5")
-    ("pair_threshold",    po::value<score_t>()->default_value(0),                       "bleu [0,1] threshold to filter pairs")
+    ("pair_threshold",    po::value<score_t>()->default_value(0.),                      "bleu [0,1] threshold to filter pairs")
     ("N",                 po::value<unsigned>()->default_value(4),                                       "N for Ngrams (BLEU)")
     ("scorer",            po::value<string>()->default_value("stupid_bleu"),        "scoring: bleu, stupid_, smooth_, approx_")
     ("learning_rate",     po::value<weight_t>()->default_value(0.0001),                                        "learning rate")
-    ("gamma",             po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)")
+    ("gamma",             po::value<weight_t>()->default_value(0.),                         "gamma for SVM (0 for perceptron)")
     ("select_weights",    po::value<string>()->default_value("last"),  "output best, last, avg weights ('VOID' to throw away)")
     ("rescale",           po::value<bool>()->zero_tokens(),                           "rescale weight vector after each input")
     ("l1_reg",            po::value<string>()->default_value("none"),   "apply l1 regularization as in 'Tsuroka et al' (2010)")
     ("l1_reg_strength",   po::value<weight_t>(),                                                  "l1 regularization strength")
-    ("fselect",           po::value<weight_t>()->default_value(-1),   "TODO select top x percent of features after each epoch")
+    ("fselect",           po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch")
     ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                                "discount for approx. BLEU")
+    ("scale_bleu_diff",   po::value<bool>()->zero_tokens(),                   "learning rate <- bleu diff of a misranked pair")
 #ifdef DTRAIN_LOCAL
     ("refs,r",            po::value<string>(),                                                      "references in local mode")
 #endif
@@ -133,6 +134,8 @@ main(int argc, char** argv)
   const string select_weights = cfg["select_weights"].as<string>();
   const float hi_lo = cfg["hi_lo"].as<float>();
   const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+  bool scale_bleu_diff = false;
+  if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
   bool average = false;
   if (select_weights == "avg")
     average = true;
@@ -236,7 +239,8 @@ main(int argc, char** argv)
     cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
     if (sample_from == "kbest")
       cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
-    cerr << setw(25) << "learning rate " << eta << endl;
+    if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
+    else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
     cerr << setw(25) << "gamma " << gamma << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     if (pair_sampling == "XYX")
@@ -255,7 +259,7 @@ main(int argc, char** argv)
     cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
     if (cfg.count("input_weights"))
       cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
-    if (cfg.count("stop-after"))
+    if (stop_after > 0)
       cerr << setw(25) << "stop_after " << stop_after << endl;
     if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
   }
@@ -274,7 +278,7 @@ main(int argc, char** argv)
 #endif
   score_t score_sum = 0.;
   score_t model_sum(0);
-  unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0;
+  unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
   if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
 
   while(true)
@@ -392,7 +396,7 @@ main(int argc, char** argv)
       else printWordIDVec(ref_ids);
       cerr << endl;
       for (unsigned u = 0; u < samples->size(); u++) {
-        cerr << _p5 << _np << "[" << u << ". '";
+        cerr << _p2 << _np << "[" << u << ". '";
         printWordIDVec((*samples)[u].w);
         cerr << "'" << endl;
         cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl;
@@ -403,8 +407,12 @@ main(int argc, char** argv)
     score_sum += (*samples)[0].score; // stats for 1best
     model_sum += (*samples)[0].model;
 
+    f_count += observer->get_f_count();
+    list_sz += observer->get_sz();
+
     // weight updates
     if (!noup) {
+      // get pairs
       vector<pair<ScoredHyp,ScoredHyp> > pairs;
       if (pair_sampling == "all")
         all_pairs(samples, pairs, pair_threshold);
@@ -420,6 +428,7 @@ main(int argc, char** argv)
         if (rank_error) rank_errors++;
         score_t margin = fabs(it->first.model - it->second.model);
         if (!rank_error && margin < 1) margin_violations++;
+        if (scale_bleu_diff) eta = it->first.score - it->second.score;
         if (rank_error || (gamma && margin<1)) {
           SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
           lambdas.plus_eq_v_times_s(diff_vec, eta);
@@ -512,7 +521,7 @@ main(int argc, char** argv)
   if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero();
 
   if (!quiet) {
-    cerr << _p9 << _p << "WEIGHTS" << endl;
+    cerr << _p5 << _p << "WEIGHTS" << endl;
     for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
       cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
     }
@@ -528,6 +537,8 @@ main(int argc, char** argv)
     cerr << "     avg # margin viol: ";
     cerr << margin_violations/(float)in_sz << endl;
     cerr << "    non0 feature count: " <<  nonz << endl;
+    cerr << "           avg list sz: " << list_sz/(float)in_sz << endl;
+    cerr << "           avg f count: " << f_count/(float)list_sz << endl;
   }
 
   if (hstreaming) {
@@ -617,7 +628,7 @@ main(int argc, char** argv)
   if (!quiet) {
     cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
     cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl;
-    cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
+    cerr << "This took " << overall_time/60. << " min." << endl;
   }
 }
 
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 15d32e36..94d149ce 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -13,7 +13,7 @@
 
 #include "filelib.h"
 
-#define DTRAIN_LOCAL
+#undef DTRAIN_LOCAL
 
 #define DTRAIN_DOTS 10 // after how many inputs to display a '.'
 #define DTRAIN_GRAMMAR_DELIM "########EOS########"
@@ -74,7 +74,6 @@ inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
 inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }
 inline ostream& _p2(ostream& out) { return out << setprecision(2); }
 inline ostream& _p5(ostream& out) { return out << setprecision(5); }
-inline ostream& _p9(ostream& out) { return out << setprecision(9); }
 
 inline void printWordIDVec(vector<WordID>& v)
 {
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 77d4a139..dd8882e1 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -59,9 +59,12 @@ struct HypSampler : public DecoderObserver
 {
   LocalScorer* scorer_;
   vector<WordID>* ref_;
+  unsigned f_count_, sz_;
   virtual vector<ScoredHyp>* GetSamples()=0;
   inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
   inline void SetRef(vector<WordID>& ref) { ref_ = &ref; }
+  inline unsigned get_f_count() const { return f_count_; } // read-only accessor for f_count_
+  inline unsigned get_sz() const { return sz_; }           // read-only accessor for sz_
 };
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -100,7 +103,7 @@ struct KBestGetter : public HypSampler
   void
   KBestUnique(const Hypergraph& forest)
   {
-    s_.clear();
+    s_.clear(); sz_ = f_count_ = 0;
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
       KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
     for (unsigned i = 0; i < k_; ++i) {
@@ -115,13 +118,15 @@ struct KBestGetter : public HypSampler
       h.rank = i;
       h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
+      sz_++;
+      f_count_ += h.f.size();
     }
   }
 
   void
   KBestNoFilter(const Hypergraph& forest)
   {
-    s_.clear();
+    s_.clear(); sz_ = f_count_ = 0;
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
     for (unsigned i = 0; i < k_; ++i) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
@@ -134,6 +139,8 @@ struct KBestGetter : public HypSampler
       h.rank = i;
       h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
+      sz_++;
+      f_count_ += h.f.size();
     }
   }
 };
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 0783f98b..f52fb649 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -30,7 +30,7 @@ struct KSampler : public HypSampler
   vector<ScoredHyp>* GetSamples() { return &s_; }
 
   void ScoredSamples(const Hypergraph& forest) {
-    s_.clear();
+    s_.clear(); sz_ = f_count_ = 0;
     std::vector<HypergraphSampler::Hypothesis> samples;
     HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
     for (unsigned i = 0; i < k_; ++i) {
@@ -41,6 +41,8 @@ struct KSampler : public HypSampler
       h.rank = i;
       h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
+      sz_++;
+      f_count_ += h.f.size();
     }
   }
 };
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index bb01cf4f..bac132c6 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -12,9 +12,16 @@ accept_pair(score_t a, score_t b, score_t threshold)
   return true;
 }
 
+inline bool // header-defined: inline keeps one definition across translation units
+cmp_hyp_by_score_d(const ScoredHyp& a, const ScoredHyp& b) // const refs: no hypothesis copies per comparison
+{
+  return a.score > b.score; // strict '>' sorts by descending score
+}
+
 inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
 {
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sz = s->size();
   for (unsigned i = 0; i < sz-1; i++) {
     for (unsigned j = i+1; j < sz; j++) {
@@ -22,7 +29,8 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
       } else {
-        training.push_back(make_pair((*s)[i], (*s)[j]));
+        if ((*s)[i].score != (*s)[j].score)
+          training.push_back(make_pair((*s)[i], (*s)[j]));
       }
     }
   }
@@ -34,15 +42,11 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
  *  compare top X to middle Y and low X
  *  cmp middle Y to low X
  */
-bool
-_XYX_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)
-{
-  return a.score > b.score;
-}
+
 inline void
 partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
 {
-  sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score);
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sz = s->size();
   unsigned sep = round(sz*hi_lo);
   for (unsigned i = 0; i < sep; i++) {
@@ -51,7 +55,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
       } else {
-        if((*s)[i].score != (*s)[j].score)
+        if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
     }
@@ -62,7 +66,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
       } else {
-        if((*s)[i].score != (*s)[j].score)
+        if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
     }
@@ -77,17 +81,17 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
  *       cut = top 50
  */
 bool
-_PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
+_PRO_cmp_pair_by_diff_d(const pair<ScoredHyp,ScoredHyp>& a, const pair<ScoredHyp,ScoredHyp>& b) // orders by descending |score diff|; const refs avoid copying both hyps per comparison
 {
   return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
 }
 inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1)
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
 {
-  unsigned max_count = 5000, count = 0;
+  unsigned max_count = 5000, count = 0, sz = s->size();
   bool b = false;
-  for (unsigned i = 0; i < s->size()-1; i++) {
-    for (unsigned j = i+1; j < s->size(); j++) {
+  for (unsigned i = 0; i < sz-1; i++) {
+    for (unsigned j = i+1; j < sz; j++) {
       if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
         training.push_back(make_pair((*s)[i], (*s)[j]));
         if (++count == max_count) {
@@ -99,7 +103,7 @@ PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training,
     if (b) break;
   }
   if (training.size() > 50) {
-    sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff);
+    sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
     training.erase(training.begin()+50, training.end());
   }
   return;
diff --git a/dtrain/score.cc b/dtrain/score.cc
index b09d32ba..71d3e5de 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -21,7 +21,7 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref
   vector<score_t> v = w_;
   if (ref_len < N_) {
     M = ref_len;
-    for (unsigned i = 0; i < M; i++) v[i] = 1./((score_t)M);
+    for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
   }
   score_t sum = 0;
   for (unsigned i = 0; i < M; i++) {
@@ -62,7 +62,7 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
   vector<score_t> v = w_;
   if (ref_len < N_) {
     M = ref_len;
-    for (unsigned i = 0; i < M; i++) v[i] = 1./((score_t)M);
+    for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
   }
   score_t sum = 0, add = 0;
   for (unsigned i = 0; i < M; i++) {
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 2ad44688..e43d6b34 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,5 +1,6 @@
 input=test/example/nc-wmt11.1k.gz    # use '-' for STDIN
 output=-                             # a weights file (add .gz for gzip compression) or STDOUT '-'
+select_weights=VOID     # don't output weights
 decoder_config=test/example/cdec.ini # config for cdec
 # weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
@@ -18,4 +19,3 @@ filter=uniq             # only unique entries in kbest (surface form)
 pair_sampling=XYX
 hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
 pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
-select_weights=VOID     # don't output weights