From 160dbdfa96ae57df82bc33475578904e2cd23317 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 1 Feb 2015 22:32:40 +0100 Subject: dtrain: simplified pair generation --- training/dtrain/pairs.h | 114 ++++++------------------------------------------ 1 file changed, 14 insertions(+), 100 deletions(-) (limited to 'training/dtrain/pairs.h') diff --git a/training/dtrain/pairs.h b/training/dtrain/pairs.h index fd08be8c..dea0dabc 100644 --- a/training/dtrain/pairs.h +++ b/training/dtrain/pairs.h @@ -1,140 +1,54 @@ -#ifndef _DTRAIN_PAIRSAMPLING_H_ -#define _DTRAIN_PAIRSAMPLING_H_ +#ifndef _DTRAIN_PAIRS_H_ +#define _DTRAIN_PAIRS_H_ namespace dtrain { - bool -accept_pair(score_t a, score_t b, score_t threshold) -{ - if (fabs(a - b) < threshold) return false; - return true; -} - -bool -cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) +CmpHypsByScore(ScoredHyp a, ScoredHyp b) { return a.score > b.score; } -inline void -all_pairs(vector* s, vector >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1) -{ - sort(s->begin(), s->end(), cmp_hyp_by_score_d); - unsigned sz = s->size(); - bool b = false; - unsigned count = 0; - for (unsigned i = 0; i < sz-1; i++) { - for (unsigned j = i+1; j < sz; j++) { - if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; - if (threshold > 0) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) - training.push_back(make_pair((*s)[i], (*s)[j])); - } else { - if ((*s)[i].score != (*s)[j].score) - training.push_back(make_pair((*s)[i], (*s)[j])); - } - if (++count == max) { - b = true; - break; - } - } - if (b) break; - } -} - /* * multipartite ranking * sort (descending) by bleu - * compare top X to middle Y and low X + * compare top X (hi) to middle Y (med) and low X (lo) * cmp middle Y to low X */ - inline void -partXYX(vector* s, vector >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo) +MakePairs(vector* s, + vector >& training, + bool misranked_only, + float hi_lo) { unsigned sz = s->size(); if (sz < 2) return; - sort(s->begin(), s->end(), cmp_hyp_by_score_d); + sort(s->begin(), s->end(), CmpHypsByScore); unsigned sep = round(sz*hi_lo); + // hi vs. med vs. low unsigned sep_hi = sep; if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; else sep_hi = 1; - bool b = false; - unsigned count = 0; for (unsigned i = 0; i < sep_hi; i++) { for (unsigned j = sep_hi; j < sz; j++) { if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; - if (threshold > 0) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) - training.push_back(make_pair((*s)[i], (*s)[j])); - } else { - if ((*s)[i].score != (*s)[j].score) - training.push_back(make_pair((*s)[i], (*s)[j])); - } - if (++count == max) { - b = true; - break; - } + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); } - if (b) break; } + // med vs. low unsigned sep_lo = sz-sep; while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; for (unsigned i = sep_hi; i < sep_lo; i++) { for (unsigned j = sep_lo; j < sz; j++) { if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; - if (threshold > 0) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) - training.push_back(make_pair((*s)[i], (*s)[j])); - } else { - if ((*s)[i].score != (*s)[j].score) - training.push_back(make_pair((*s)[i], (*s)[j])); - } - if (++count == max) return; - } - } -} - -/* - * pair sampling as in - * 'Tuning as Ranking' (Hopkins & May, 2011) - * count = max (5000) - * threshold = 5% BLEU (0.05 for param 3) - * cut = top 10% - */ -bool -_PRO_cmp_pair_by_diff_d(pair a, pair b) -{ - return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); -} -inline void -PROsampling(vector* s, vector >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0) -{ - sort(s->begin(), s->end(), cmp_hyp_by_score_d); - unsigned max_count = max, count = 0, sz = s->size(); - bool b = false; - for (unsigned i = 0; i < sz-1; i++) { - for (unsigned j = i+1; j < sz; j++) { - if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); - if (++count == max_count) { - b = true; - break; - } - } } - if (b) break; - } - if (training.size() > max/10) { - sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d); - training.erase(training.begin()+(max/10), training.end()); } - return; } - } // namespace #endif -- cgit v1.2.3