From 220d2ff82d8d7f02b1b93711fe418015026ae1a6 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 1 Feb 2015 20:14:37 +0100 Subject: dtrain: rm 'check' and ksampler --- training/dtrain/kbestget.h | 88 ------------------------- training/dtrain/ksampler.h | 60 ------------------ training/dtrain/pairs.h | 141 +++++++++++++++++++++++++++++++++++++++++ training/dtrain/pairsampling.h | 141 ----------------------------------------- training/dtrain/sample.h | 88 +++++++++++++++++++++++++ 5 files changed, 229 insertions(+), 289 deletions(-) delete mode 100644 training/dtrain/kbestget.h delete mode 100644 training/dtrain/ksampler.h create mode 100644 training/dtrain/pairs.h delete mode 100644 training/dtrain/pairsampling.h create mode 100644 training/dtrain/sample.h diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h deleted file mode 100644 index 25f02273..00000000 --- a/training/dtrain/kbestget.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef _DTRAIN_KBESTGET_H_ -#define _DTRAIN_KBESTGET_H_ - -#include "kbest.h" - -namespace dtrain -{ - - -struct KBestGetter : public HypSampler -{ - const unsigned k_; - const string filter_type_; - vector s_; - unsigned src_len_; - - KBestGetter(const unsigned k, const string filter_type) : - k_(k), filter_type_(filter_type) {} - - virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) - { - src_len_ = smeta.GetSourceLength(); - KBestScored(*hg); - } - - vector* GetSamples() { return &s_; } - - void - KBestScored(const Hypergraph& forest) - { - if (filter_type_ == "uniq") { - KBestUnique(forest); - } else if (filter_type_ == "not") { - KBestNoFilter(forest); - } - } - - void - KBestUnique(const Hypergraph& forest) - { - s_.clear(); sz_ = f_count_ = 0; - KBest::KBestDerivations, ESentenceTraversal, - KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); - for (unsigned i = 0; i < k_; ++i) { - const KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique, - prob_t, EdgeProb>::Derivation* d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - ScoredHyp h; - h.w = d->yield; - h.f = d->feature_values; - h.model = log(d->score); - h.rank = i; - h.score = scorer_->Score(h.w, *refs_, i, src_len_); - s_.push_back(h); - sz_++; - f_count_ += h.f.size(); - } - } - - void - KBestNoFilter(const Hypergraph& forest) - { - s_.clear(); sz_ = f_count_ = 0; - KBest::KBestDerivations, ESentenceTraversal> kbest(forest, k_); - for (unsigned i = 0; i < k_; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - ScoredHyp h; - h.w = d->yield; - h.f = d->feature_values; - h.model = log(d->score); - h.rank = i; - h.score = scorer_->Score(h.w, *refs_, i, src_len_); - s_.push_back(h); - sz_++; - f_count_ += h.f.size(); - } - } -}; - - -} // namespace - -#endif - diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h deleted file mode 100644 index 9eedc74f..00000000 --- a/training/dtrain/ksampler.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef _DTRAIN_KSAMPLER_H_ -#define _DTRAIN_KSAMPLER_H_ - -#include "hg_sampler.h" - -namespace dtrain -{ - - -bool -cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b) -{ - return a.model > b.model; -} - -struct KSampler : public HypSampler -{ - const unsigned k_; - vector s_; - MT19937* prng_; - score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector); - unsigned src_len_; - - explicit KSampler(const unsigned k, MT19937* prng) : - k_(k), prng_(prng) {} - - virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) - { - src_len_ = smeta.GetSourceLength(); - ScoredSamples(*hg); - } - - vector* GetSamples() { return &s_; } - - void ScoredSamples(const Hypergraph& forest) { - s_.clear(); sz_ = f_count_ = 0; - std::vector samples; - HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples); - for (unsigned i = 0; i < k_; ++i) { - ScoredHyp h; - h.w = samples[i].words; - h.f = samples[i].fmap; - h.model = log(samples[i].model_score); - h.rank = i; - h.score = scorer_->Score(h.w, *refs_, i, src_len_); - s_.push_back(h); - sz_++; - f_count_ += h.f.size(); - } - sort(s_.begin(), s_.end(), cmp_hyp_by_model_d); - for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i; - } -}; - - -} // namespace - -#endif - diff --git a/training/dtrain/pairs.h b/training/dtrain/pairs.h new file mode 100644 index 00000000..fd08be8c --- /dev/null +++ b/training/dtrain/pairs.h @@ -0,0 +1,141 @@ +#ifndef _DTRAIN_PAIRSAMPLING_H_ +#define _DTRAIN_PAIRSAMPLING_H_ + +namespace dtrain +{ + + +bool +accept_pair(score_t a, score_t b, score_t threshold) +{ + if (fabs(a - b) < threshold) return false; + return true; +} + +bool +cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) +{ + return a.score > b.score; +} + +inline void +all_pairs(vector* s, vector >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1) +{ + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned sz = s->size(); + bool b = false; + unsigned count = 0; + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { + if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) { + b = true; + break; + } + } + if (b) break; + } +} + +/* + * multipartite ranking + * sort (descending) by bleu + * compare top X to middle Y and low X + * cmp middle Y to low X + */ + +inline void +partXYX(vector* s, vector >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo) +{ + unsigned sz = s->size(); + if (sz < 2) return; + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned sep = round(sz*hi_lo); + unsigned sep_hi = sep; + if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; + else sep_hi = 1; + bool b = false; + unsigned count = 0; + for (unsigned i = 0; i < sep_hi; i++) { + for (unsigned j = sep_hi; j < sz; j++) { + if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) { + b = true; + break; + } + } + if (b) break; + } + unsigned sep_lo = sz-sep; + while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; + for (unsigned i = sep_hi; i < sep_lo; i++) { + for (unsigned j = sep_lo; j < sz; j++) { + if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) return; + } + } +} + +/* + * pair sampling as in + * 'Tuning as Ranking' (Hopkins & May, 2011) + * count = max (5000) + * threshold = 5% BLEU (0.05 for param 3) + * cut = top 10% + */ +bool +_PRO_cmp_pair_by_diff_d(pair a, pair b) +{ + return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); +} +inline void +PROsampling(vector* s, vector >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0) +{ + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned max_count = max, count = 0, sz = s->size(); + bool b = false; + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + if (++count == max_count) { + b = true; + break; + } + } + } + if (b) break; + } + if (training.size() > max/10) { + sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d); + training.erase(training.begin()+(max/10), training.end()); + } + return; +} + + +} // namespace + +#endif + diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h deleted file mode 100644 index fd08be8c..00000000 --- a/training/dtrain/pairsampling.h +++ /dev/null @@ -1,141 +0,0 @@ -#ifndef _DTRAIN_PAIRSAMPLING_H_ -#define _DTRAIN_PAIRSAMPLING_H_ - -namespace dtrain -{ - - -bool -accept_pair(score_t a, score_t b, score_t threshold) -{ - if (fabs(a - b) < threshold) return false; - return true; -} - -bool -cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) -{ - return a.score > b.score; -} - -inline void -all_pairs(vector* s, vector >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1) -{ - sort(s->begin(), s->end(), cmp_hyp_by_score_d); - unsigned sz = s->size(); - bool b = false; - unsigned count = 0; - for (unsigned i = 0; i < sz-1; i++) { - for (unsigned j = i+1; j < sz; j++) { - if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; - if (threshold > 0) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) - training.push_back(make_pair((*s)[i], (*s)[j])); - } else { - if ((*s)[i].score != (*s)[j].score) - training.push_back(make_pair((*s)[i], (*s)[j])); - } - if (++count == max) { - b = true; - break; - } - } - if (b) break; - } -} - -/* - * multipartite ranking - * sort (descending) by bleu - * compare top X to middle Y and low X - * cmp middle Y to low X - */ - -inline void -partXYX(vector* s, vector >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo) -{ - unsigned sz = s->size(); - if (sz < 2) return; - sort(s->begin(), s->end(), cmp_hyp_by_score_d); - unsigned sep = round(sz*hi_lo); - unsigned sep_hi = sep; - if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; - else sep_hi = 1; - bool b = false; - unsigned count = 0; - for (unsigned i = 0; i < sep_hi; i++) { - for (unsigned j = sep_hi; j < sz; j++) { - if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; - if (threshold > 0) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) - training.push_back(make_pair((*s)[i], (*s)[j])); - } else { - if ((*s)[i].score != (*s)[j].score) - training.push_back(make_pair((*s)[i], (*s)[j])); - } - if (++count == max) { - b = true; - break; - } - } - if (b) break; - } - unsigned sep_lo = sz-sep; - while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; - for (unsigned i = sep_hi; i < sep_lo; i++) { - for (unsigned j = sep_lo; j < sz; j++) { - if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; - if (threshold > 0) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) - training.push_back(make_pair((*s)[i], (*s)[j])); - } else { - if ((*s)[i].score != (*s)[j].score) - training.push_back(make_pair((*s)[i], (*s)[j])); - } - if (++count == max) return; - } - } -} - -/* - * pair sampling as in - * 'Tuning as Ranking' (Hopkins & May, 2011) - * count = max (5000) - * threshold = 5% BLEU (0.05 for param 3) - * cut = top 10% - */ -bool -_PRO_cmp_pair_by_diff_d(pair a, pair b) -{ - return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); -} -inline void -PROsampling(vector* s, vector >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0) -{ - sort(s->begin(), s->end(), cmp_hyp_by_score_d); - unsigned max_count = max, count = 0, sz = s->size(); - bool b = false; - for (unsigned i = 0; i < sz-1; i++) { - for (unsigned j = i+1; j < sz; j++) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { - training.push_back(make_pair((*s)[i], (*s)[j])); - if (++count == max_count) { - b = true; - break; - } - } - } - if (b) break; - } - if (training.size() > max/10) { - sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d); - training.erase(training.begin()+(max/10), training.end()); - } - return; -} - - -} // namespace - -#endif - diff --git a/training/dtrain/sample.h b/training/dtrain/sample.h new file mode 100644 index 00000000..25f02273 --- /dev/null +++ b/training/dtrain/sample.h @@ -0,0 +1,88 @@ +#ifndef _DTRAIN_KBESTGET_H_ +#define _DTRAIN_KBESTGET_H_ + +#include "kbest.h" + +namespace dtrain +{ + + +struct KBestGetter : public HypSampler +{ + const unsigned k_; + const string filter_type_; + vector s_; + unsigned src_len_; + + KBestGetter(const unsigned k, const string filter_type) : + k_(k), filter_type_(filter_type) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + src_len_ = smeta.GetSourceLength(); + KBestScored(*hg); + } + + vector* GetSamples() { return &s_; } + + void + KBestScored(const Hypergraph& forest) + { + if (filter_type_ == "uniq") { + KBestUnique(forest); + } else if (filter_type_ == "not") { + KBestNoFilter(forest); + } + } + + void + KBestUnique(const Hypergraph& forest) + { + s_.clear(); sz_ = f_count_ = 0; + KBest::KBestDerivations, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique, + prob_t, EdgeProb>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *refs_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + } + + void + KBestNoFilter(const Hypergraph& forest) + { + s_.clear(); sz_ = f_count_ = 0; + KBest::KBestDerivations, ESentenceTraversal> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *refs_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + } +}; + + +} // namespace + +#endif + -- cgit v1.2.3