From 220d2ff82d8d7f02b1b93711fe418015026ae1a6 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Sun, 1 Feb 2015 20:14:37 +0100
Subject: dtrain: rm 'check' and ksampler

---
 training/dtrain/kbestget.h     |  88 -------------------------
 training/dtrain/ksampler.h     |  60 ------------------
 training/dtrain/pairs.h        | 141 +++++++++++++++++++++++++++++++++++++++++
 training/dtrain/pairsampling.h | 141 -----------------------------------------
 training/dtrain/sample.h       |  88 +++++++++++++++++++++++++
 5 files changed, 229 insertions(+), 289 deletions(-)
 delete mode 100644 training/dtrain/kbestget.h
 delete mode 100644 training/dtrain/ksampler.h
 create mode 100644 training/dtrain/pairs.h
 delete mode 100644 training/dtrain/pairsampling.h
 create mode 100644 training/dtrain/sample.h
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h
deleted file mode 100644
index 25f02273..00000000
--- a/training/dtrain/kbestget.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef _DTRAIN_KBESTGET_H_
-#define _DTRAIN_KBESTGET_H_
-
-#include "kbest.h"
-
-namespace dtrain
-{
-
-
-struct KBestGetter : public HypSampler
-{
-  const unsigned k_;
-  const string filter_type_;
-  vector<ScoredHyp> s_;
-  unsigned src_len_;
-
-  KBestGetter(const unsigned k, const string filter_type) :
-    k_(k), filter_type_(filter_type) {}
-
-  virtual void
-  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
-  {
-    src_len_ = smeta.GetSourceLength();
-    KBestScored(*hg);
-  }
-
-  vector<ScoredHyp>* GetSamples() { return &s_; }
-
-  void
-  KBestScored(const Hypergraph& forest)
-  {
-    if (filter_type_ == "uniq") {
-      KBestUnique(forest);
-    } else if (filter_type_ == "not") {
-      KBestNoFilter(forest);
-    }
-  }
-
-  void
-  KBestUnique(const Hypergraph& forest)
-  {
-    s_.clear(); sz_ = f_count_ = 0;
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
-      KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
-    for (unsigned i = 0; i < k_; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
-              prob_t, EdgeProb>::Derivation* d =
-            kbest.LazyKthBest(forest.nodes_.size() - 1, i);
-      if (!d) break;
-      ScoredHyp h;
-      h.w = d->yield;
-      h.f = d->feature_values;
-      h.model = log(d->score);
-      h.rank = i;
-      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
-      s_.push_back(h);
-      sz_++;
-      f_count_ += h.f.size();
-    }
-  }
-
-  void
-  KBestNoFilter(const Hypergraph& forest)
-  {
-    s_.clear(); sz_ = f_count_ = 0;
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
-    for (unsigned i = 0; i < k_; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-            kbest.LazyKthBest(forest.nodes_.size() - 1, i);
-      if (!d) break;
-      ScoredHyp h;
-      h.w = d->yield;
-      h.f = d->feature_values;
-      h.model = log(d->score);
-      h.rank = i;
-      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
-      s_.push_back(h);
-      sz_++;
-      f_count_ += h.f.size();
-    }
-  }
-};
-
-
-} // namespace
-
-#endif
-
diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h
deleted file mode 100644
index 9eedc74f..00000000
--- a/training/dtrain/ksampler.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _DTRAIN_KSAMPLER_H_
-#define _DTRAIN_KSAMPLER_H_
-
-#include "hg_sampler.h"
-
-namespace dtrain
-{
-
-
-bool
-cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b)
-{
-  return a.model > b.model;
-}
-
-struct KSampler : public HypSampler
-{
-  const unsigned k_;
-  vector<ScoredHyp> s_;
-  MT19937* prng_;
-  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
-  unsigned src_len_;
-
-  explicit KSampler(const unsigned k, MT19937* prng) :
-    k_(k), prng_(prng) {}
-
-  virtual void
-  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
-  {
-    src_len_ = smeta.GetSourceLength();
-    ScoredSamples(*hg);
-  }
-
-  vector<ScoredHyp>* GetSamples() { return &s_; }
-
-  void ScoredSamples(const Hypergraph& forest) {
-    s_.clear(); sz_ = f_count_ = 0;
-    std::vector<HypergraphSampler::Hypothesis> samples;
-    HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
-    for (unsigned i = 0; i < k_; ++i) {
-      ScoredHyp h;
-      h.w = samples[i].words;
-      h.f = samples[i].fmap;
-      h.model = log(samples[i].model_score);
-      h.rank = i;
-      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
-      s_.push_back(h);
-      sz_++;
-      f_count_ += h.f.size();
-    }
-    sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
-    for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
-  }
-};
-
-
-} // namespace
-
-#endif
-
diff --git a/training/dtrain/pairs.h b/training/dtrain/pairs.h
new file mode 100644
index 00000000..fd08be8c
--- /dev/null
+++ b/training/dtrain/pairs.h
@@ -0,0 +1,141 @@
+#ifndef _DTRAIN_PAIRSAMPLING_H_
+#define _DTRAIN_PAIRSAMPLING_H_
+
+namespace dtrain
+{
+
+
+// Returns true unless |a - b| < threshold (i.e. the pair differs enough).
+// `inline`: defined in a header, so a plain definition breaks the ODR/link
+// as soon as two translation units include this file.
+inline bool
+accept_pair(score_t a, score_t b, score_t threshold)
+{ return !(fabs(a - b) < threshold); }
+
+// Descending-by-score comparator for sort(); inline (header-defined) and
+// takes const references so a comparison no longer copies two ScoredHyps.
+inline bool
+cmp_hyp_by_score_d(const ScoredHyp& a, const ScoredHyp& b)
+{ return a.score > b.score; }
+
+/*
+ * Sort *s by descending score and pair each hypothesis with all later ones.
+ * A pair is appended to `training` if its score gap is >= threshold
+ * (threshold > 0) or its scores merely differ (otherwise); misranked_only
+ * skips pairs the model already orders correctly. Stops once `max` non-skipped
+ * pairs were examined. Fewer than two hypotheses yield nothing: the former
+ * `i < sz-1` bound wrapped around for an empty vector since sz is unsigned.
+ */
+inline void
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1)
+{
+  const unsigned sz = s->size();
+  if (sz < 2) return;
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+  unsigned count = 0;
+  for (unsigned i = 0; i+1 < sz; i++) {
+    for (unsigned j = i+1; j < sz; j++) {
+      const ScoredHyp& hi = (*s)[i]; const ScoredHyp& lo = (*s)[j];
+      if (misranked_only && !(hi.model <= lo.model)) continue;
+      if (threshold > 0 ? accept_pair(hi.score, lo.score, threshold) : hi.score != lo.score)
+        training.push_back(make_pair(hi, lo));
+      if (++count == max) return;
+    }
+  }
+}
+
+/*
+ * multipartite ranking
+ *  sort (descending) by bleu
+ *  compare top X to middle Y and low X
+ *  cmp middle Y to low X
+ * X = round(sz*hi_lo) clamped to [0, sz]; cut points are moved past score ties.
+ * A pair is kept if its score gap is >= threshold (or, for threshold <= 0,
+ * nonzero); max caps the examined pairs of both stages together (formerly a
+ * cap hit in the first stage left the second stage uncapped).
+ */
+inline void
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo)
+{
+  const unsigned sz = s->size();
+  if (sz < 2) return;
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+  const double cut = round(sz*hi_lo);
+  const unsigned sep = !(cut > 0) ? 0 : (cut < sz ? static_cast<unsigned>(cut) : sz);
+  unsigned sep_hi = sep;
+  // sep may be 0: the index guards here and below keep sep_hi-1 and sep_lo in range.
+  if (sz > 4) while (sep_hi > 0 && sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
+  else sep_hi = 1;
+  unsigned count = 0;
+  for (unsigned i = 0; i < sep_hi; i++) {
+    for (unsigned j = sep_hi; j < sz; j++) {
+      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+      if (threshold > 0) {
+        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      } else {
+        if ((*s)[i].score != (*s)[j].score)
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      }
+      if (++count == max) return;
+    }
+  }
+  unsigned sep_lo = sz-sep;
+  while (sep_lo > 0 && sep_lo < sz && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
+  for (unsigned i = sep_hi; i < sep_lo; i++) {
+    for (unsigned j = sep_lo; j < sz; j++) {
+      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+      if (threshold > 0) {
+        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      } else {
+        if ((*s)[i].score != (*s)[j].score)
+          training.push_back(make_pair((*s)[i], (*s)[j]));
+      }
+      if (++count == max) return;
+    }
+  }
+}
+
+/*
+ * pair sampling as in
+ * 'Tuning as Ranking' (Hopkins & May, 2011)
+ *     count = max (5000)
+ * threshold = 5% BLEU (0.05 for param 3)
+ *       cut = top 10%
+ */
+// Orders pairs by descending |score difference|; inline + const references
+// (header-defined, and by-value pairs copied four ScoredHyps per compare).
+inline bool
+_PRO_cmp_pair_by_diff_d(const pair<ScoredHyp,ScoredHyp>& a, const pair<ScoredHyp,ScoredHyp>& b)
+{ return fabs(a.first.score - a.second.score) > fabs(b.first.score - b.second.score); }
+// Collects up to `max` pairs whose score gap is >= threshold (scanning *s in
+// descending-score order), then keeps only the max/10 pairs with the largest
+// gap. The last two parameters are ignored. Fewer than two hypotheses give no
+// pairs; the former `i < sz-1` bound wrapped around for an empty vector.
+inline void
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0)
+{
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+  const unsigned sz = s->size(), keep = max/10;
+  unsigned count = 0;
+  bool full = false;
+  for (unsigned i = 0; !full && i+1 < sz; i++) {
+    for (unsigned j = i+1; !full && j < sz; j++) {
+      if (!accept_pair((*s)[i].score, (*s)[j].score, threshold)) continue;
+      training.push_back(make_pair((*s)[i], (*s)[j]));
+      full = (++count == max);
+    }
+  }
+  // cut: only the top tenth (by score difference) of the cap survives.
+  if (training.size() > keep) {
+    sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
+    training.erase(training.begin()+keep, training.end());
+  }
+}
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h
deleted file mode 100644
index fd08be8c..00000000
--- a/training/dtrain/pairsampling.h
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef _DTRAIN_PAIRSAMPLING_H_
-#define _DTRAIN_PAIRSAMPLING_H_
-
-namespace dtrain
-{
-
-
-bool
-accept_pair(score_t a, score_t b, score_t threshold)
-{
-  if (fabs(a - b) < threshold) return false;
-  return true;
-}
-
-bool
-cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
-{
-  return a.score > b.score;
-}
-
-inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1)
-{
-  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
-  unsigned sz = s->size();
-  bool b = false;
-  unsigned count = 0;
-  for (unsigned i = 0; i < sz-1; i++) {
-    for (unsigned j = i+1; j < sz; j++) {
-      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
-      if (threshold > 0) {
-        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
-          training.push_back(make_pair((*s)[i], (*s)[j]));
-      } else {
-        if ((*s)[i].score != (*s)[j].score)
-          training.push_back(make_pair((*s)[i], (*s)[j]));
-      }
-      if (++count == max) {
-        b = true;
-        break;
-      }
-    }
-    if (b) break;
-  }
-}
-
-/*
- * multipartite ranking
- *  sort (descending) by bleu
- *  compare top X to middle Y and low X
- *  cmp middle Y to low X
- */
-
-inline void
-partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo)
-{
-  unsigned sz = s->size();
-  if (sz < 2) return;
-  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
-  unsigned sep = round(sz*hi_lo);
-  unsigned sep_hi = sep;
-  if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
-  else sep_hi = 1;
-  bool b = false;
-  unsigned count = 0;
-  for (unsigned i = 0; i < sep_hi; i++) {
-    for (unsigned j = sep_hi; j < sz; j++) {
-      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
-      if (threshold > 0) {
-        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
-          training.push_back(make_pair((*s)[i], (*s)[j]));
-      } else {
-        if ((*s)[i].score != (*s)[j].score)
-          training.push_back(make_pair((*s)[i], (*s)[j]));
-      }
-      if (++count == max) {
-        b = true;
-        break;
-      }
-    }
-    if (b) break;
-  }
-  unsigned sep_lo = sz-sep;
-  while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
-  for (unsigned i = sep_hi; i < sep_lo; i++) {
-    for (unsigned j = sep_lo; j < sz; j++) {
-      if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
-      if (threshold > 0) {
-        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
-          training.push_back(make_pair((*s)[i], (*s)[j]));
-      } else {
-        if ((*s)[i].score != (*s)[j].score)
-          training.push_back(make_pair((*s)[i], (*s)[j]));
-      }
-      if (++count == max) return;
-    }
-  }
-}
-
-/*
- * pair sampling as in
- * 'Tuning as Ranking' (Hopkins & May, 2011)
- *     count = max (5000)
- * threshold = 5% BLEU (0.05 for param 3)
- *       cut = top 10%
- */
-bool
-_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
-{
-  return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
-}
-inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0)
-{
-  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
-  unsigned max_count = max, count = 0, sz = s->size();
-  bool b = false;
-  for (unsigned i = 0; i < sz-1; i++) {
-    for (unsigned j = i+1; j < sz; j++) {
-      if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
-        training.push_back(make_pair((*s)[i], (*s)[j]));
-        if (++count == max_count) {
-          b = true;
-          break;
-        }
-      }
-    }
-    if (b) break;
-  }
-  if (training.size() > max/10) {
-    sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
-    training.erase(training.begin()+(max/10), training.end());
-  }
-  return;
-}
-
-
-} // namespace
-
-#endif
-
diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h
new file mode 100644
index 00000000..25f02273
--- /dev/null
+++ b/training/dtrain/sample.h
@@ -0,0 +1,88 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+#include "kbest.h"
+
+namespace dtrain
+{
+
+
+/*
+ * Hypothesis sampler: on every NotifyTranslationForest() call s_ is refilled
+ * with up to k_ best derivations of the forest, each scored by scorer_.
+ *  filter_type_ "uniq": k-best list filtered with KBest::FilterUnique
+ *               "not":  unfiltered k-best list
+ *  anything else: nothing is extracted and s_ is left as it was
+ * scorer_, refs_, sz_, f_count_ are presumably HypSampler members (TODO confirm).
+ */
+struct KBestGetter : public HypSampler
+{
+  const unsigned k_;
+  const string filter_type_;
+  vector<ScoredHyp> s_;
+  unsigned src_len_;
+
+  KBestGetter(const unsigned k, const string filter_type) :
+    k_(k), filter_type_(filter_type), src_len_(0) {}
+
+  virtual void
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+  {
+    src_len_ = smeta.GetSourceLength();
+    KBestScored(*hg);
+  }
+
+  vector<ScoredHyp>* GetSamples() { return &s_; }
+
+  void
+  KBestScored(const Hypergraph& forest)
+  {
+    if (filter_type_ == "uniq") {
+      KBestUnique(forest);
+    } else if (filter_type_ == "not") {
+      KBestNoFilter(forest);
+    }
+  }
+
+  void
+  KBestUnique(const Hypergraph& forest)
+  {
+    Extract<KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+      KBest::FilterUnique, prob_t, EdgeProb> >(forest);
+  }
+
+  void
+  KBestNoFilter(const Hypergraph& forest)
+  {
+    Extract<KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> >(forest);
+  }
+
+private:
+  // Shared body of KBestUnique/KBestNoFilter; KBestT is the k-best extractor type.
+  template <class KBestT>
+  void
+  Extract(const Hypergraph& forest)
+  {
+    s_.clear(); sz_ = f_count_ = 0;
+    KBestT kbest(forest, k_);
+    for (unsigned i = 0; i < k_; ++i) {
+      const typename KBestT::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+      if (!d) break;  // fewer than k_ derivations available
+      ScoredHyp h;
+      h.w = d->yield;
+      h.f = d->feature_values;
+      h.model = log(d->score);
+      h.rank = i;
+      h.score = scorer_->Score(h.w, *refs_, i, src_len_);
+      s_.push_back(h);
+      sz_++;
+      f_count_ += h.f.size();
+    }
+  }
+};
+
+
+} // namespace
+
+#endif
+
-- 
cgit v1.2.3