From 220d2ff82d8d7f02b1b93711fe418015026ae1a6 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Sun, 1 Feb 2015 20:14:37 +0100
Subject: dtrain: rm 'check' and ksampler
---
training/dtrain/kbestget.h | 88 -------------------------
training/dtrain/ksampler.h | 60 ------------------
training/dtrain/pairs.h | 141 +++++++++++++++++++++++++++++++++++++++++
training/dtrain/pairsampling.h | 141 -----------------------------------------
training/dtrain/sample.h | 88 +++++++++++++++++++++++++
5 files changed, 229 insertions(+), 289 deletions(-)
delete mode 100644 training/dtrain/kbestget.h
delete mode 100644 training/dtrain/ksampler.h
create mode 100644 training/dtrain/pairs.h
delete mode 100644 training/dtrain/pairsampling.h
create mode 100644 training/dtrain/sample.h
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h
deleted file mode 100644
index 25f02273..00000000
--- a/training/dtrain/kbestget.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef _DTRAIN_KBESTGET_H_
-#define _DTRAIN_KBESTGET_H_
-
-#include "kbest.h"
-
-namespace dtrain
-{
-
-
-struct KBestGetter : public HypSampler
-{
- const unsigned k_;
- const string filter_type_;
- vector<ScoredHyp> s_;
- unsigned src_len_;
-
- KBestGetter(const unsigned k, const string filter_type) :
- k_(k), filter_type_(filter_type) {}
-
- virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
- {
- src_len_ = smeta.GetSourceLength();
- KBestScored(*hg);
- }
-
- vector<ScoredHyp>* GetSamples() { return &s_; }
-
- void
- KBestScored(const Hypergraph& forest)
- {
- if (filter_type_ == "uniq") {
- KBestUnique(forest);
- } else if (filter_type_ == "not") {
- KBestNoFilter(forest);
- }
- }
-
- void
- KBestUnique(const Hypergraph& forest)
- {
- s_.clear(); sz_ = f_count_ = 0;
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
- KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
- for (unsigned i = 0; i < k_; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
- prob_t, EdgeProb>::Derivation* d =
- kbest.LazyKthBest(forest.nodes_.size() - 1, i);
- if (!d) break;
- ScoredHyp h;
- h.w = d->yield;
- h.f = d->feature_values;
- h.model = log(d->score);
- h.rank = i;
- h.score = scorer_->Score(h.w, *refs_, i, src_len_);
- s_.push_back(h);
- sz_++;
- f_count_ += h.f.size();
- }
- }
-
- void
- KBestNoFilter(const Hypergraph& forest)
- {
- s_.clear(); sz_ = f_count_ = 0;
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
- for (unsigned i = 0; i < k_; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest(forest.nodes_.size() - 1, i);
- if (!d) break;
- ScoredHyp h;
- h.w = d->yield;
- h.f = d->feature_values;
- h.model = log(d->score);
- h.rank = i;
- h.score = scorer_->Score(h.w, *refs_, i, src_len_);
- s_.push_back(h);
- sz_++;
- f_count_ += h.f.size();
- }
- }
-};
-
-
-} // namespace
-
-#endif
-
diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h
deleted file mode 100644
index 9eedc74f..00000000
--- a/training/dtrain/ksampler.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _DTRAIN_KSAMPLER_H_
-#define _DTRAIN_KSAMPLER_H_
-
-#include "hg_sampler.h"
-
-namespace dtrain
-{
-
-
-bool
-cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b)
-{
- return a.model > b.model;
-}
-
-struct KSampler : public HypSampler
-{
- const unsigned k_;
- vector<ScoredHyp> s_;
- MT19937* prng_;
- score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
- unsigned src_len_;
-
- explicit KSampler(const unsigned k, MT19937* prng) :
- k_(k), prng_(prng) {}
-
- virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
- {
- src_len_ = smeta.GetSourceLength();
- ScoredSamples(*hg);
- }
-
- vector<ScoredHyp>* GetSamples() { return &s_; }
-
- void ScoredSamples(const Hypergraph& forest) {
- s_.clear(); sz_ = f_count_ = 0;
- std::vector<HypergraphSampler::Hypothesis> samples;
- HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
- for (unsigned i = 0; i < k_; ++i) {
- ScoredHyp h;
- h.w = samples[i].words;
- h.f = samples[i].fmap;
- h.model = log(samples[i].model_score);
- h.rank = i;
- h.score = scorer_->Score(h.w, *refs_, i, src_len_);
- s_.push_back(h);
- sz_++;
- f_count_ += h.f.size();
- }
- sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
- for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
- }
-};
-
-
-} // namespace
-
-#endif
-
diff --git a/training/dtrain/pairs.h b/training/dtrain/pairs.h
new file mode 100644
index 00000000..fd08be8c
--- /dev/null
+++ b/training/dtrain/pairs.h
@@ -0,0 +1,141 @@
+#ifndef _DTRAIN_PAIRSAMPLING_H_
+#define _DTRAIN_PAIRSAMPLING_H_
+
+namespace dtrain
+{
+
+
+bool
+accept_pair(score_t a, score_t b, score_t threshold)
+{
+ if (fabs(a - b) < threshold) return false;
+ return true;
+}
+
+bool
+cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
+{
+ return a.score > b.score;
+}
+
+inline void
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1)
+{
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+ unsigned sz = s->size();
+ bool b = false;
+ unsigned count = 0;
+ for (unsigned i = 0; i < sz-1; i++) {
+ for (unsigned j = i+1; j < sz; j++) {
+ if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ } else {
+ if ((*s)[i].score != (*s)[j].score)
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ if (++count == max) {
+ b = true;
+ break;
+ }
+ }
+ if (b) break;
+ }
+}
+
+/*
+ * multipartite ranking
+ * sort (descending) by bleu
+ * compare top X to middle Y and low X
+ * cmp middle Y to low X
+ */
+
+inline void
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo)
+{
+ unsigned sz = s->size();
+ if (sz < 2) return;
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+ unsigned sep = round(sz*hi_lo);
+ unsigned sep_hi = sep;
+ if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
+ else sep_hi = 1;
+ bool b = false;
+ unsigned count = 0;
+ for (unsigned i = 0; i < sep_hi; i++) {
+ for (unsigned j = sep_hi; j < sz; j++) {
+ if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ } else {
+ if ((*s)[i].score != (*s)[j].score)
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ if (++count == max) {
+ b = true;
+ break;
+ }
+ }
+ if (b) break;
+ }
+ unsigned sep_lo = sz-sep;
+ while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
+ for (unsigned i = sep_hi; i < sep_lo; i++) {
+ for (unsigned j = sep_lo; j < sz; j++) {
+ if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ } else {
+ if ((*s)[i].score != (*s)[j].score)
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ if (++count == max) return;
+ }
+ }
+}
+
+/*
+ * pair sampling as in
+ * 'Tuning as Ranking' (Hopkins & May, 2011)
+ * count = max (5000)
+ * threshold = 5% BLEU (0.05 for param 3)
+ * cut = top 10%
+ */
+bool
+_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
+{
+ return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
+}
+inline void
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0)
+{
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+ unsigned max_count = max, count = 0, sz = s->size();
+ bool b = false;
+ for (unsigned i = 0; i < sz-1; i++) {
+ for (unsigned j = i+1; j < sz; j++) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ if (++count == max_count) {
+ b = true;
+ break;
+ }
+ }
+ }
+ if (b) break;
+ }
+ if (training.size() > max/10) {
+ sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
+ training.erase(training.begin()+(max/10), training.end());
+ }
+ return;
+}
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h
deleted file mode 100644
index fd08be8c..00000000
--- a/training/dtrain/pairsampling.h
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef _DTRAIN_PAIRSAMPLING_H_
-#define _DTRAIN_PAIRSAMPLING_H_
-
-namespace dtrain
-{
-
-
-bool
-accept_pair(score_t a, score_t b, score_t threshold)
-{
- if (fabs(a - b) < threshold) return false;
- return true;
-}
-
-bool
-cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
-{
- return a.score > b.score;
-}
-
-inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1)
-{
- sort(s->begin(), s->end(), cmp_hyp_by_score_d);
- unsigned sz = s->size();
- bool b = false;
- unsigned count = 0;
- for (unsigned i = 0; i < sz-1; i++) {
- for (unsigned j = i+1; j < sz; j++) {
- if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
- if (threshold > 0) {
- if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
- training.push_back(make_pair((*s)[i], (*s)[j]));
- } else {
- if ((*s)[i].score != (*s)[j].score)
- training.push_back(make_pair((*s)[i], (*s)[j]));
- }
- if (++count == max) {
- b = true;
- break;
- }
- }
- if (b) break;
- }
-}
-
-/*
- * multipartite ranking
- * sort (descending) by bleu
- * compare top X to middle Y and low X
- * cmp middle Y to low X
- */
-
-inline void
-partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo)
-{
- unsigned sz = s->size();
- if (sz < 2) return;
- sort(s->begin(), s->end(), cmp_hyp_by_score_d);
- unsigned sep = round(sz*hi_lo);
- unsigned sep_hi = sep;
- if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
- else sep_hi = 1;
- bool b = false;
- unsigned count = 0;
- for (unsigned i = 0; i < sep_hi; i++) {
- for (unsigned j = sep_hi; j < sz; j++) {
- if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
- if (threshold > 0) {
- if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
- training.push_back(make_pair((*s)[i], (*s)[j]));
- } else {
- if ((*s)[i].score != (*s)[j].score)
- training.push_back(make_pair((*s)[i], (*s)[j]));
- }
- if (++count == max) {
- b = true;
- break;
- }
- }
- if (b) break;
- }
- unsigned sep_lo = sz-sep;
- while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
- for (unsigned i = sep_hi; i < sep_lo; i++) {
- for (unsigned j = sep_lo; j < sz; j++) {
- if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
- if (threshold > 0) {
- if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
- training.push_back(make_pair((*s)[i], (*s)[j]));
- } else {
- if ((*s)[i].score != (*s)[j].score)
- training.push_back(make_pair((*s)[i], (*s)[j]));
- }
- if (++count == max) return;
- }
- }
-}
-
-/*
- * pair sampling as in
- * 'Tuning as Ranking' (Hopkins & May, 2011)
- * count = max (5000)
- * threshold = 5% BLEU (0.05 for param 3)
- * cut = top 10%
- */
-bool
-_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
-{
- return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
-}
-inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0)
-{
- sort(s->begin(), s->end(), cmp_hyp_by_score_d);
- unsigned max_count = max, count = 0, sz = s->size();
- bool b = false;
- for (unsigned i = 0; i < sz-1; i++) {
- for (unsigned j = i+1; j < sz; j++) {
- if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
- training.push_back(make_pair((*s)[i], (*s)[j]));
- if (++count == max_count) {
- b = true;
- break;
- }
- }
- }
- if (b) break;
- }
- if (training.size() > max/10) {
- sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
- training.erase(training.begin()+(max/10), training.end());
- }
- return;
-}
-
-
-} // namespace
-
-#endif
-
diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h
new file mode 100644
index 00000000..25f02273
--- /dev/null
+++ b/training/dtrain/sample.h
@@ -0,0 +1,88 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+#include "kbest.h"
+
+namespace dtrain
+{
+
+
+struct KBestGetter : public HypSampler
+{
+ const unsigned k_;
+ const string filter_type_;
+ vector<ScoredHyp> s_;
+ unsigned src_len_;
+
+ KBestGetter(const unsigned k, const string filter_type) :
+ k_(k), filter_type_(filter_type) {}
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ {
+ src_len_ = smeta.GetSourceLength();
+ KBestScored(*hg);
+ }
+
+ vector<ScoredHyp>* GetSamples() { return &s_; }
+
+ void
+ KBestScored(const Hypergraph& forest)
+ {
+ if (filter_type_ == "uniq") {
+ KBestUnique(forest);
+ } else if (filter_type_ == "not") {
+ KBestNoFilter(forest);
+ }
+ }
+
+ void
+ KBestUnique(const Hypergraph& forest)
+ {
+ s_.clear(); sz_ = f_count_ = 0;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+ KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
+ for (unsigned i = 0; i < k_; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
+ prob_t, EdgeProb>::Derivation* d =
+ kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+ if (!d) break;
+ ScoredHyp h;
+ h.w = d->yield;
+ h.f = d->feature_values;
+ h.model = log(d->score);
+ h.rank = i;
+ h.score = scorer_->Score(h.w, *refs_, i, src_len_);
+ s_.push_back(h);
+ sz_++;
+ f_count_ += h.f.size();
+ }
+ }
+
+ void
+ KBestNoFilter(const Hypergraph& forest)
+ {
+ s_.clear(); sz_ = f_count_ = 0;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
+ for (unsigned i = 0; i < k_; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+ if (!d) break;
+ ScoredHyp h;
+ h.w = d->yield;
+ h.f = d->feature_values;
+ h.model = log(d->score);
+ h.rank = i;
+ h.score = scorer_->Score(h.w, *refs_, i, src_len_);
+ s_.push_back(h);
+ sz_++;
+ f_count_ += h.f.size();
+ }
+ }
+};
+
+
+} // namespace
+
+#endif
+
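
For reference, a minimal sketch of how the KBestGetter moved into sample.h above is typically driven from a cdec Decoder. The wrapper function and the 100/"uniq" arguments are illustrative; scorer_ and refs_ are the HypSampler members referenced in the patched code and are assumed to be set up beforehand, as the dtrain driver does.

#include <string>
#include <vector>
#include "decoder.h"  // cdec's Decoder
#include "sample.h"   // KBestGetter from this patch

using namespace dtrain;

// Illustrative only: collect a scored, uniq-filtered 100-best list for one sentence.
void kbest_example(Decoder& decoder, const std::string& src)
{
  KBestGetter observer(100, "uniq");
  // The observer's scorer_ and refs_ members must already point at the metric
  // scorer and the reference for this sentence (configured elsewhere).
  decoder.Decode(src, &observer);
  std::vector<ScoredHyp>* samples = observer.GetSamples();
  // samples holds up to 100 hypotheses, each with words (w), features (f),
  // model score (model) and metric score (score).
  (void)samples;
}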
--
cgit v1.2.3
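
Similarly, a rough sketch of how the pair-generation helpers now living in pairs.h might be called on such a scored hypothesis list. make_pairs_example is hypothetical; the 0.05/5000 values mirror the Hopkins & May defaults noted in the comment above PROsampling, and dtrain.h is assumed to provide score_t and ScoredHyp.

#include <utility>
#include <vector>
#include "dtrain.h"  // assumed: defines score_t and ScoredHyp
#include "pairs.h"   // added by this patch

using namespace dtrain;

// Illustrative only: turn an already-scored hypothesis list into training pairs.
void make_pairs_example(std::vector<ScoredHyp>& hyps)
{
  std::vector<std::pair<ScoredHyp,ScoredHyp> > training;
  // PRO-style sampling: accept pairs whose metric difference exceeds 0.05,
  // stop after 5000 pairs, then keep only the top 10% by score difference.
  PROsampling(&hyps, training, 0.05, 5000);
  // Exhaustive alternative with no threshold and no misranking filter:
  // all_pairs(&hyps, training, 0, 5000, false);
}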