diff options
-rw-r--r-- | training/dtrain/update.h | 292 |
1 files changed, 187 insertions, 105 deletions
diff --git a/training/dtrain/update.h b/training/dtrain/update.h index 83dc3186..1c51c4f1 100644 --- a/training/dtrain/update.h +++ b/training/dtrain/update.h @@ -4,143 +4,225 @@ namespace dtrain { -bool -_cmp(ScoredHyp a, ScoredHyp b) -{ - return a.gold > b.gold; -} - -bool -_cmpHope(ScoredHyp a, ScoredHyp b) -{ - return (a.model+a.gold) > (b.model+b.gold); -} - -bool -_cmpFear(ScoredHyp a, ScoredHyp b) -{ - return (a.model-a.gold) > (b.model-b.gold); -} - -inline bool -_good(ScoredHyp& a, ScoredHyp& b, weight_t margin) -{ - if ((a.model-b.model)>margin - || a.gold==b.gold) - return true; - - return false; -} - -inline bool -_goodS(ScoredHyp& a, ScoredHyp& b) -{ - if (a.gold==b.gold) - return true; - - return false; -} - /* - * multipartite ranking - * sort (descending) by bleu - * compare top X (hi) to middle Y (med) and low X (lo) - * cmp middle Y to low X + * multipartite [multi=3] ranking + * partitions are determined by the 'cut' parameter + * 0. sort sample (descending) by bleu + * 1. compare top X(=sz*cut) to middle Y(=sz-2*(sz*cut)) and bottom X + * -"- middle Y to bottom X + * */ inline size_t -CollectUpdates(vector<ScoredHyp>* s, - SparseVector<weight_t>& updates, - weight_t margin=0.) +updates_multipartite(vector<Hyp>* sample, + SparseVector<weight_t>& updates, + weight_t cut, + weight_t margin, + size_t max_up, + weight_t threshold, + bool adjust, + bool output=false, + ostream& os=cout) { - size_t num_up = 0; - size_t sz = s->size(); - sort(s->begin(), s->end(), _cmp); - size_t sep = round(sz*0.1); - for (size_t i = 0; i < sep; i++) { - for (size_t j = sep; j < sz; j++) { - if (_good((*s)[i], (*s)[j], margin)) + size_t up = 0; + size_t sz = sample->size(); + if (sz < 2) return 0; + sort(sample->begin(), sample->end(), [](Hyp& first, Hyp& second) + { + return first.gold > second.gold; + }); + size_t sep = round(sz*cut); + + size_t sep_hi = sep; + if (adjust) { + if (sz > 4) { + while (sep_hi<sz && (*sample)[sep_hi-1].gold==(*sample)[sep_hi].gold) + ++sep_hi; + } else { + sep_hi = 1; + } + } + for (size_t i = 0; i < sep_hi; i++) { + for (size_t j = sep_hi; j < sz; j++) { + Hyp& first=(*sample)[i], second=(*sample)[j]; + if ((first.model-second.model)>margin + || (!adjust && first.gold==second.gold) + || (threshold && (first.gold-second.gold < threshold))) continue; - updates += (*s)[i].f-(*s)[j].f; - num_up++; + if (output) + os << first.f-second.f << endl; + updates += first.f-second.f; + if (++up==max_up) + return up; } } + size_t sep_lo = sz-sep; - for (size_t i = sep; i < sep_lo; i++) { + if (adjust) { + while (sep_lo>0 && (*sample)[sep_lo-1].gold==(*sample)[sep_lo].gold) + --sep_lo; + } + for (size_t i = sep_hi; i < sep_lo; i++) { for (size_t j = sep_lo; j < sz; j++) { - if (_good((*s)[i], (*s)[j], margin)) + Hyp& first=(*sample)[i], second=(*sample)[j]; + if ((first.model-second.model)>margin + || (!adjust && first.gold==second.gold) + || (threshold && (first.gold-second.gold < threshold))) continue; - updates += (*s)[i].f-(*s)[j].f; - num_up++; + if (output) + os << first.f-second.f << endl; + updates += first.f-second.f; + if (++up==max_up) + break; } } - return num_up; + return up; } +/* + * all pairs + * only ignore a pair if gold scores are + * identical + * + */ inline size_t -CollectUpdatesStruct(vector<ScoredHyp>* s, - SparseVector<weight_t>& updates, - weight_t unused=-1) +updates_all(vector<Hyp>* sample, + SparseVector<weight_t>& updates, + size_t max_up, + weight_t threshold, + bool output=false, + ostream& os=cout) +{ + size_t up = 0; + size_t sz = sample->size(); + sort(sample->begin(), sample->end(), [](Hyp& first, Hyp& second) + { + return first.gold > second.gold; + }); + for (size_t i = 0; i < sz-1; i++) { + for (size_t j = i+1; j < sz; j++) { + Hyp& first=(*sample)[i], second=(*sample)[j]; + if (first.gold == second.gold + || (threshold && (first.gold-second.gold < threshold))) + continue; + if (output) + os << first.f-second.f << endl; + updates += first.f-second.f; + if (++up==max_up) + break; + } + } + + return up; +} + +/* + * hope/fear + * just one pair: hope - fear + * + */ +inline size_t +update_structured(vector<Hyp>* sample, + SparseVector<weight_t>& updates, + weight_t margin, + bool output=false, + ostream& os=cout) { // hope - sort(s->begin(), s->end(), _cmpHope); - ScoredHyp hope = (*s)[0]; + sort(sample->begin(), sample->end(), [](Hyp& first, Hyp& second) + { + return (first.model+first.gold) > (second.model+second.gold); + }); + Hyp hope = (*sample)[0]; // fear - sort(s->begin(), s->end(), _cmpFear); - ScoredHyp fear = (*s)[0]; - if (!_goodS(hope, fear)) + sort(sample->begin(), sample->end(), [](Hyp& first, Hyp& second) + { + return (first.model-first.gold) > (second.model-second.gold); + }); + Hyp fear = (*sample)[0]; + + if (hope.gold != fear.gold) { updates += hope.f - fear.f; + if (output) + os << hope.f << "\t" << fear.f << endl; + + return 1; + } - return updates.size(); + if (output) + os << endl; + + return 0; } -inline void -OutputKbest(vector<ScoredHyp>* s) + +/* + * pair sampling as in + * 'Tuning as Ranking' (Hopkins & May, 2011) + * count = 5000 [maxs] + * threshold = 5% BLEU [threshold=0.05] + * cut = top 50 [max_up] + */ +inline size_t +updates_pro(vector<Hyp>* sample, + SparseVector<weight_t>& updates, + size_t maxs, + size_t max_up, + weight_t threshold, + bool output=false, + ostream& os=cout) { - sort(s->begin(), s->end(), _cmp); - size_t i = 0; - for (auto k: *s) { - cout << i << "\t" << k.gold << "\t" << k.model << " \t" << k.f << endl; - i++; + + size_t sz = sample->size(), s; + vector<pair<Hyp*,Hyp*> > g; + while (s < maxs) { + size_t i=rand()%sz, j=rand()%sz; + Hyp& first=(*sample)[i], second=(*sample)[j]; + if (i==j || fabs(first.gold-second.gold)<threshold) + continue; + if (first.gold > second.gold) + g.emplace_back(make_pair(&first,&second)); + else + g.emplace_back(make_pair(&second,&first)); + s++; } -} -inline void -OutputMultipartitePairs(vector<ScoredHyp>* s, - weight_t margin=0., - bool all=true) -{ - size_t sz = s->size(); - sort(s->begin(), s->end(), _cmp); - size_t sep = round(sz*0.1); - for (size_t i = 0; i < sep; i++) { - for (size_t j = sep; j < sz; j++) { - if (!all && _good((*s)[i], (*s)[j], margin)) - continue; - cout << (*s)[i].f-(*s)[j].f << endl; - } + if (g.size() > max_up) { + sort(g.begin(), g.end(), [](pair<Hyp*,Hyp*> a, pair<Hyp*,Hyp*> b) + { + return fabs(a.first->gold-a.second->gold) + > fabs(b.first->gold-b.second->gold); + }); + g.erase(g.begin()+max_up, g.end()); } - size_t sep_lo = sz-sep; - for (size_t i = sep; i < sep_lo; i++) { - for (size_t j = sep_lo; j < sz; j++) { - if (!all && _good((*s)[i], (*s)[j], margin)) - continue; - cout << (*s)[i].f-(*s)[j].f << endl; - } + + for (auto i: g) { + if (output) + os << i.first->f-i.second->f << endl; + updates += i.first->f-i.second->f; } + + return g.size(); } +/* + * output (sorted) items in sample (k-best list) + * + */ inline void -OutputAllPairs(vector<ScoredHyp>* s) +output_sample(vector<Hyp>* sample, + ostream& os=cout, + bool sorted=true) { - size_t sz = s->size(); - sort(s->begin(), s->end(), _cmp); - for (size_t i = 0; i < sz-1; i++) { - for (size_t j = i+1; j < sz; j++) { - if ((*s)[i].gold == (*s)[j].gold) - continue; - cout << (*s)[i].f-(*s)[j].f << endl; - } + if (sorted) + sort(sample->begin(), sample->end(), [](Hyp first, Hyp second) + { + return first.gold > second.gold; + }); + size_t j = 0; + for (auto i: *sample) { + os << j << "\t" << i.gold << "\t" << i.model << "\t" << i.f << endl; + j++; } } |