summary | refs | log | tree | commit | diff
path: root/training/dtrain
diff options
context:
space:
mode:
Diffstat (limited to 'training/dtrain')
-rw-r--r-- training/dtrain/update.h 292
1 files changed, 187 insertions, 105 deletions
diff --git a/training/dtrain/update.h b/training/dtrain/update.h
index 83dc3186..1c51c4f1 100644
--- a/training/dtrain/update.h
+++ b/training/dtrain/update.h
@@ -4,143 +4,225 @@
namespace dtrain
{
-bool
-_cmp(ScoredHyp a, ScoredHyp b)
-{
- return a.gold > b.gold;
-}
-
-bool
-_cmpHope(ScoredHyp a, ScoredHyp b)
-{
- return (a.model+a.gold) > (b.model+b.gold);
-}
-
-bool
-_cmpFear(ScoredHyp a, ScoredHyp b)
-{
- return (a.model-a.gold) > (b.model-b.gold);
-}
-
-inline bool
-_good(ScoredHyp& a, ScoredHyp& b, weight_t margin)
-{
- if ((a.model-b.model)>margin
- || a.gold==b.gold)
- return true;
-
- return false;
-}
-
-inline bool
-_goodS(ScoredHyp& a, ScoredHyp& b)
-{
- if (a.gold==b.gold)
- return true;
-
- return false;
-}
-
/*
- * multipartite ranking
- * sort (descending) by bleu
- * compare top X (hi) to middle Y (med) and low X (lo)
- * cmp middle Y to low X
+ * multipartite [multi=3] ranking
+ * partitions are determined by the 'cut' parameter
+ *  0. sort sample (descending) by gold score (bleu)
+ *  1. compare top X(=sz*cut) to middle Y(=sz-2*(sz*cut)) and bottom X
+ *  2. compare middle Y to bottom X
+ *
+ * sample:    hypotheses; re-ordered in place (descending by gold)
+ * updates:   accumulates first.f-second.f for every selected pair
+ * cut:       fraction of the sample that forms the top resp. the bottom
+ *            partition
+ * margin:    a pair is skipped if its model score difference exceeds this
+ * max_up:    stop as soon as this many pairs were collected
+ *            (0 never triggers the stop)
+ * threshold: if non-zero, a pair is skipped if its gold difference is
+ *            smaller than this
+ * adjust:    move the partition borders so that items with equal gold
+ *            scores do not end up on different sides of a border;
+ *            if false, pairs with equal gold scores are skipped instead
+ * output:    additionally write each selected difference vector to os
+ *
+ * returns the number of pairs added to updates
+ *
*/
inline size_t
-CollectUpdates(vector<ScoredHyp>* s,
- SparseVector<weight_t>& updates,
- weight_t margin=0.)
+updates_multipartite(vector<Hyp>* sample,
+                     SparseVector<weight_t>& updates,
+                     weight_t cut,
+                     weight_t margin,
+                     size_t max_up,
+                     weight_t threshold,
+                     bool adjust,
+                     bool output=false,
+                     ostream& os=cout)
{
- size_t num_up = 0;
- size_t sz = s->size();
- sort(s->begin(), s->end(), _cmp);
- size_t sep = round(sz*0.1);
- for (size_t i = 0; i < sep; i++) {
- for (size_t j = sep; j < sz; j++) {
- if (_good((*s)[i], (*s)[j], margin))
+  size_t up = 0;
+  size_t sz = sample->size();
+  if (sz < 2) return 0;
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+  {
+    return first.gold > second.gold;
+  });
+  size_t sep = round(sz*cut);
+  // cut>1 must not push a border past the end (sz-sep below would wrap)
+  if (sep > sz) sep = sz;
+
+  size_t sep_hi = sep;
+  if (adjust) {
+    if (sz > 4) {
+      // sep_hi>0: an empty top partition has no border item at sep_hi-1
+      while (sep_hi>0 && sep_hi<sz
+             && (*sample)[sep_hi-1].gold==(*sample)[sep_hi].gold)
+        ++sep_hi;
+    } else {
+      sep_hi = 1;
+    }
+  }
+  for (size_t i = 0; i < sep_hi; i++) {
+    for (size_t j = sep_hi; j < sz; j++) {
+      // two declarations on purpose: in 'Hyp& a=.., b=..' only a is a
+      // reference, b would be a full copy of the hypothesis
+      Hyp& first = (*sample)[i];
+      Hyp& second = (*sample)[j];
+      if ((first.model-second.model)>margin
+          || (!adjust && first.gold==second.gold)
+          || (threshold && (first.gold-second.gold < threshold)))
continue;
- updates += (*s)[i].f-(*s)[j].f;
- num_up++;
+      if (output)
+        os << first.f-second.f << endl;
+      updates += first.f-second.f;
+      if (++up==max_up)
+        return up;
}
}
+
size_t sep_lo = sz-sep;
- for (size_t i = sep; i < sep_lo; i++) {
+  if (adjust) {
+    // sep_lo<sz: with an empty bottom partition sample[sep_lo] does not exist
+    while (sep_lo>0 && sep_lo<sz
+           && (*sample)[sep_lo-1].gold==(*sample)[sep_lo].gold)
+      --sep_lo;
+  }
+  for (size_t i = sep_hi; i < sep_lo; i++) {
for (size_t j = sep_lo; j < sz; j++) {
- if (_good((*s)[i], (*s)[j], margin))
+      Hyp& first = (*sample)[i];
+      Hyp& second = (*sample)[j];
+      if ((first.model-second.model)>margin
+          || (!adjust && first.gold==second.gold)
+          || (threshold && (first.gold-second.gold < threshold)))
continue;
- updates += (*s)[i].f-(*s)[j].f;
- num_up++;
+      if (output)
+        os << first.f-second.f << endl;
+      updates += first.f-second.f;
+      // return, not break: leaving only the inner loop would let the
+      // outer loop keep adding pairs beyond max_up
+      if (++up==max_up)
+        return up;
}
}
- return num_up;
+  return up;
}
+/*
+ * all pairs
+ * every pair (i,j), i<j, of the gold-sorted sample is used unless
+ *  - both gold scores are identical, or
+ *  - threshold is non-zero and the gold difference is smaller than it
+ *
+ * sample:    hypotheses; re-ordered in place (descending by gold)
+ * updates:   accumulates first.f-second.f for every selected pair
+ * max_up:    stop as soon as this many pairs were collected
+ *            (0 never triggers the stop)
+ * threshold: minimum gold difference, 0 disables the check
+ * output:    additionally write each selected difference vector to os
+ *
+ * returns the number of pairs added to updates
+ *
+ */
inline size_t
-CollectUpdatesStruct(vector<ScoredHyp>* s,
- SparseVector<weight_t>& updates,
- weight_t unused=-1)
+updates_all(vector<Hyp>* sample,
+            SparseVector<weight_t>& updates,
+            size_t max_up,
+            weight_t threshold,
+            bool output=false,
+            ostream& os=cout)
+{
+  size_t up = 0;
+  size_t sz = sample->size();
+  // no pair can be formed; also keeps the unsigned sz-1 below from
+  // wrapping around for an empty sample
+  if (sz < 2) return 0;
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+  {
+    return first.gold > second.gold;
+  });
+  for (size_t i = 0; i < sz-1; i++) {
+    for (size_t j = i+1; j < sz; j++) {
+      // two declarations on purpose: in 'Hyp& a=.., b=..' only a is a
+      // reference, b would be a full copy of the hypothesis
+      Hyp& first = (*sample)[i];
+      Hyp& second = (*sample)[j];
+      if (first.gold == second.gold
+          || (threshold && (first.gold-second.gold < threshold)))
+        continue;
+      if (output)
+        os << first.f-second.f << endl;
+      updates += first.f-second.f;
+      // return, not break: leaving only the inner loop would let the
+      // outer loop keep adding pairs beyond max_up
+      if (++up==max_up)
+        return up;
+    }
+  }
+
+  return up;
+}
+
+/*
+ * hope/fear
+ * just one pair: hope - fear
+ *  hope = item with the highest model+gold
+ *  fear = item with the highest model-gold
+ *
+ * sample:  hypotheses; re-ordered in place (left sorted descending by
+ *          model-gold)
+ * updates: hope.f-fear.f is added if hope and fear differ in gold score
+ * margin:  accepted but not used by this function
+ * output:  write "hope.f<TAB>fear.f" to os if an update was made,
+ *          an empty line otherwise
+ *
+ * returns 1 if an update was made, 0 otherwise
+ *
+ */
+inline size_t
+update_structured(vector<Hyp>* sample,
+                  SparseVector<weight_t>& updates,
+                  weight_t margin,
+                  bool output=false,
+                  ostream& os=cout)
{
+  // an empty sample has neither hope nor fear ((*sample)[0] does not
+  // exist); report it like the "no update" case at the end
+  if (sample->empty()) {
+    if (output)
+      os << endl;
+
+    return 0;
+  }
// hope
- sort(s->begin(), s->end(), _cmpHope);
- ScoredHyp hope = (*s)[0];
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+  {
+    return (first.model+first.gold) > (second.model+second.gold);
+  });
+  // must be a copy: the second sort below moves the items around
+  Hyp hope = (*sample)[0];
// fear
- sort(s->begin(), s->end(), _cmpFear);
- ScoredHyp fear = (*s)[0];
- if (!_goodS(hope, fear))
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+  {
+    return (first.model-first.gold) > (second.model-second.gold);
+  });
+  // sample is not modified anymore, a reference is sufficient
+  Hyp& fear = (*sample)[0];
+
+  if (hope.gold != fear.gold) {
updates += hope.f - fear.f;
+    if (output)
+      os << hope.f << "\t" << fear.f << endl;
+
+    return 1;
+  }
- return updates.size();
+  if (output)
+    os << endl;
+
+  return 0;
}
-inline void
-OutputKbest(vector<ScoredHyp>* s)
+
+/*
+ * pair sampling as in
+ * 'Tuning as Ranking' (Hopkins & May, 2011)
+ *  count = 5000 [maxs]
+ *  threshold = 5% BLEU [threshold=0.05]
+ *  cut = top 50 [max_up]
+ *
+ * sample:    hypotheses to draw random pairs (rand()) from
+ * updates:   accumulates better.f-worse.f for every kept pair
+ * maxs:      number of pairs to accept; pairs of an item with itself or
+ *            with a gold difference below threshold are re-drawn
+ * max_up:    of the accepted pairs only the max_up ones with the largest
+ *            gold difference are kept
+ * threshold: minimum absolute gold difference of an accepted pair
+ * output:    additionally write each kept difference vector to os
+ *
+ * returns the number of pairs added to updates
+ */
+inline size_t
+updates_pro(vector<Hyp>* sample,
+            SparseVector<weight_t>& updates,
+            size_t maxs,
+            size_t max_up,
+            weight_t threshold,
+            bool output=false,
+            ostream& os=cout)
{
- sort(s->begin(), s->end(), _cmp);
- size_t i = 0;
- for (auto k: *s) {
- cout << i << "\t" << k.gold << "\t" << k.model << " \t" << k.f << endl;
- i++;
+
+  size_t sz = sample->size();
+  // fewer than two items cannot form a pair (and rand()%0 is undefined)
+  if (sz < 2) return 0;
+  // if even the largest gold difference is below the threshold no pair
+  // can ever be accepted and the sampling loop would never terminate
+  auto lo = (*sample)[0].gold;
+  auto hi = lo;
+  for (size_t k = 1; k < sz; k++) {
+    if ((*sample)[k].gold < lo) lo = (*sample)[k].gold;
+    if ((*sample)[k].gold > hi) hi = (*sample)[k].gold;
+  }
+  if (fabs(hi-lo) < threshold) return 0;
+
+  size_t s = 0; // accepted pairs so far; the loop condition reads it
+  vector<pair<Hyp*,Hyp*> > g;
+  while (s < maxs) {
+    size_t i=rand()%sz, j=rand()%sz;
+    // both have to be references into *sample: g keeps their addresses,
+    // the address of a by-value copy would dangle after this iteration
+    // (in 'Hyp& a=.., b=..' only a is a reference)
+    Hyp& first = (*sample)[i];
+    Hyp& second = (*sample)[j];
+    if (i==j || fabs(first.gold-second.gold)<threshold)
+      continue;
+    // store as (better, worse)
+    if (first.gold > second.gold)
+      g.emplace_back(&first, &second);
+    else
+      g.emplace_back(&second, &first);
+    s++;
}
-}
-inline void
-OutputMultipartitePairs(vector<ScoredHyp>* s,
- weight_t margin=0.,
- bool all=true)
-{
- size_t sz = s->size();
- sort(s->begin(), s->end(), _cmp);
- size_t sep = round(sz*0.1);
- for (size_t i = 0; i < sep; i++) {
- for (size_t j = sep; j < sz; j++) {
- if (!all && _good((*s)[i], (*s)[j], margin))
- continue;
- cout << (*s)[i].f-(*s)[j].f << endl;
- }
+  if (g.size() > max_up) {
+    // largest gold difference first, then cut off everything after max_up
+    sort(g.begin(), g.end(), [](pair<Hyp*,Hyp*> a, pair<Hyp*,Hyp*> b)
+    {
+      return fabs(a.first->gold-a.second->gold)
+               > fabs(b.first->gold-b.second->gold);
+    });
+    g.erase(g.begin()+max_up, g.end());
}
- size_t sep_lo = sz-sep;
- for (size_t i = sep; i < sep_lo; i++) {
- for (size_t j = sep_lo; j < sz; j++) {
- if (!all && _good((*s)[i], (*s)[j], margin))
- continue;
- cout << (*s)[i].f-(*s)[j].f << endl;
- }
+
+  for (auto& p: g) {
+    if (output)
+      os << p.first->f-p.second->f << endl;
+    updates += p.first->f-p.second->f;
}
+
+  return g.size();
}
+/*
+ * output (sorted) items in sample (k-best list)
+ * one line per item: index<TAB>gold<TAB>model<TAB>f
+ *
+ * sample: hypotheses; re-ordered in place if sorted is true
+ * os:     stream to write to
+ * sorted: sort sample (descending) by gold score before writing
+ *
+ */
inline void
-OutputAllPairs(vector<ScoredHyp>* s)
+output_sample(vector<Hyp>* sample,
+              ostream& os=cout,
+              bool sorted=true)
{
- size_t sz = s->size();
- sort(s->begin(), s->end(), _cmp);
- for (size_t i = 0; i < sz-1; i++) {
- for (size_t j = i+1; j < sz; j++) {
- if ((*s)[i].gold == (*s)[j].gold)
- continue;
- cout << (*s)[i].f-(*s)[j].f << endl;
- }
+  if (sorted)
+    // by reference: a by-value comparator copies two hypotheses per
+    // comparison
+    sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+    {
+      return first.gold > second.gold;
+    });
+  size_t j = 0;
+  // by reference: no need to copy each item just to print it
+  for (auto& i: *sample) {
+    os << j << "\t" << i.gold << "\t" << i.model << "\t" << i.f << endl;
+    j++;
}
}