dtrain update.h: implement adjustment of cut, all pairs, new pro sampling, output options

author: Patrick Simianer <p@simianer.de> 2015-10-16 10:23:11 +0200
committer: Patrick Simianer <p@simianer.de> 2015-10-16 10:23:11 +0200
commit: e3c3bb3759239853e87e6d564fb36d9868d20cb9 (patch)
tree: ebc9791462b440004e881b419e44e86aa6c9ffa9
parent: 686df8adb559a85e2f63fd807068cfca6f2b7824 (diff)
1 files changed, 187 insertions, 105 deletions
diff --git a/training/dtrain/update.h b/training/dtrain/update.h
index 83dc3186..1c51c4f1 100644
--- a/training/dtrain/update.h
+++ b/training/dtrain/update.h
@@ -4,143 +4,225 @@
 namespace dtrain
 {
 
-bool
-_cmp(ScoredHyp a, ScoredHyp b)
-{
-  return a.gold > b.gold;
-}
-
-bool
-_cmpHope(ScoredHyp a, ScoredHyp b)
-{
-  return (a.model+a.gold) > (b.model+b.gold);
-}
-
-bool
-_cmpFear(ScoredHyp a, ScoredHyp b)
-{
-  return (a.model-a.gold) > (b.model-b.gold);
-}
-
-inline bool
-_good(ScoredHyp& a, ScoredHyp& b, weight_t margin)
-{
-  if ((a.model-b.model)>margin
-      || a.gold==b.gold)
-    return true;
-
-  return false;
-}
-
-inline bool
-_goodS(ScoredHyp& a, ScoredHyp& b)
-{
-  if (a.gold==b.gold)
-    return true;
-
-  return false;
-}
-
 /*
- * multipartite ranking
- *  sort (descending) by bleu
- *  compare top X (hi) to middle Y (med) and low X (lo)
- *  cmp middle Y to low X
+ * multipartite [multi=3] ranking; partitions are determined by 'cut'
+ *   0. sort sample (descending) by gold score
+ *   1. compare top X(=sz*cut) to middle Y(=sz-2*(sz*cut)) and bottom X
+ *      -"- middle Y to bottom X
+ *  'adjust' shifts the partition borders past ties in gold score
+ *  returns the number of pairs added to 'updates'; stops at max_up pairs
  */
 inline size_t
-CollectUpdates(vector<ScoredHyp>* s,
-               SparseVector<weight_t>& updates,
-               weight_t margin=0.)
+updates_multipartite(vector<Hyp>* sample,
+                     SparseVector<weight_t>& updates,
+                     weight_t cut,
+                     weight_t margin,
+                     size_t max_up,
+                     weight_t threshold,
+                     bool adjust,
+                     bool output=false,
+                     ostream& os=cout)
 {
-  size_t num_up = 0;
-  size_t sz = s->size();
-  sort(s->begin(), s->end(), _cmp);
-  size_t sep = round(sz*0.1);
-  for (size_t i = 0; i < sep; i++) {
-    for (size_t j = sep; j < sz; j++) {
-      if (_good((*s)[i], (*s)[j], margin))
+  size_t up = 0;
+  size_t sz = sample->size();
+  if (sz < 2) return 0;
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+    {
+      return first.gold > second.gold;
+    });
+  size_t sep = round(sz*cut);
+
+  size_t sep_hi = sep;
+  if (adjust) {
+    if (sz > 4) {
+      while (sep_hi>0 && sep_hi<sz && (*sample)[sep_hi-1].gold==(*sample)[sep_hi].gold)
+        ++sep_hi;
+    } else {
+      sep_hi = 1;
+    }
+  }
+  for (size_t i = 0; i < sep_hi; i++) {
+    for (size_t j = sep_hi; j < sz; j++) {
+      Hyp &first=(*sample)[i], &second=(*sample)[j]; // references, no copies
+      if ((first.model-second.model)>margin
+           || (!adjust && first.gold==second.gold)
+           || (threshold && (first.gold-second.gold < threshold)))
         continue;
-      updates += (*s)[i].f-(*s)[j].f;
-      num_up++;
+      if (output)
+        os << first.f-second.f << endl;
+      updates += first.f-second.f;
+      if (++up==max_up)
+        return up;
     }
   }
+
   size_t sep_lo = sz-sep;
-  for (size_t i = sep; i < sep_lo; i++) {
+  if (adjust) {
+    while (sep_lo>0 && sep_lo<sz && (*sample)[sep_lo-1].gold==(*sample)[sep_lo].gold)
+        --sep_lo;
+  }
+  for (size_t i = sep_hi; i < sep_lo; i++) {
     for (size_t j = sep_lo; j < sz; j++) {
-      if (_good((*s)[i], (*s)[j], margin))
+      Hyp &first=(*sample)[i], &second=(*sample)[j]; // references, no copies
+      if ((first.model-second.model)>margin
+           || (!adjust && first.gold==second.gold)
+           || (threshold && (first.gold-second.gold < threshold)))
         continue;
-      updates += (*s)[i].f-(*s)[j].f;
-      num_up++;
+      if (output)
+        os << first.f-second.f << endl;
+      updates += first.f-second.f;
+      if (++up==max_up)
+        return up; // cap reached: leave both loops, not only the inner one
     }
   }
 
-  return num_up;
+  return up;
 }
 
+/*
+ * all pairs
+ *  a pair is ignored if its gold scores are identical or (threshold set)
+ *  differ by less than threshold; stops once max_up pairs were added
+ *  returns the number of pairs added to 'updates'
+ */
 inline size_t
-CollectUpdatesStruct(vector<ScoredHyp>* s,
-                     SparseVector<weight_t>& updates,
-                     weight_t unused=-1)
+updates_all(vector<Hyp>* sample,
+            SparseVector<weight_t>& updates,
+            size_t max_up,
+            weight_t threshold,
+            bool output=false,
+            ostream& os=cout)
+{
+  size_t up = 0;
+  size_t sz = sample->size();
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+    {
+      return first.gold > second.gold;
+    });
+  for (size_t i = 0; i+1 < sz; i++) { // i+1<sz: sz-1 would wrap for sz==0
+    for (size_t j = i+1; j < sz; j++) {
+      Hyp &first=(*sample)[i], &second=(*sample)[j]; // references, no copies
+      if (first.gold == second.gold
+           || (threshold && (first.gold-second.gold < threshold)))
+        continue;
+      if (output)
+        os << first.f-second.f << endl;
+      updates += first.f-second.f;
+      if (++up==max_up)
+        return up; // cap reached: leave both loops, not only the inner one
+    }
+  }
+
+  return up;
+}
+
+/*
+ * hope/fear: just one pair, hope - fear
+ *  hope = argmax(model+gold), fear = argmax(model-gold)
+ *  returns 1 if their gold scores differ (update added), else 0
+ */
+inline size_t
+update_structured(vector<Hyp>* sample,
+                  SparseVector<weight_t>& updates,
+                  weight_t margin, // not used by this function
+                  bool output=false,
+                  ostream& os=cout)
 {
   // hope
-  sort(s->begin(), s->end(), _cmpHope);
-  ScoredHyp hope = (*s)[0];
+  // (empty sample: no pair to build, handled like the no-update case)
+  if (sample->empty()) { if (output) os << endl; return 0; }
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+    { return (first.model+first.gold) > (second.model+second.gold); });
+  Hyp hope = (*sample)[0]; // copy: the sort below reorders the sample
   // fear
-  sort(s->begin(), s->end(), _cmpFear);
-  ScoredHyp fear = (*s)[0];
-  if (!_goodS(hope, fear))
+  sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+    {
+      return (first.model-first.gold) > (second.model-second.gold);
+    });
+  Hyp& fear = (*sample)[0]; // no reordering after this, a reference suffices
+
+  if (hope.gold != fear.gold) {
     updates += hope.f - fear.f;
+    if (output)
+      os << hope.f << "\t" << fear.f << endl;
+
+    return 1;
+  }
 
-  return updates.size();
+  if (output)
+    os << endl;
+
+  return 0;
 }
 
-inline void
-OutputKbest(vector<ScoredHyp>* s)
+
+/*
+ * pair sampling as in 'Tuning as Ranking' (Hopkins & May, 2011)
+ *     count = 5000    [maxs]
+ * threshold = 5% BLEU [threshold=0.05]
+ *       cut = top 50  [max_up]
+ * returns the number of pairs added to 'updates' (0 if none can be sampled)
+ */
+inline size_t
+updates_pro(vector<Hyp>* sample,
+           SparseVector<weight_t>& updates,
+           size_t maxs,
+           size_t max_up,
+           weight_t threshold,
+           bool output=false,
+           ostream& os=cout)
 {
-  sort(s->begin(), s->end(), _cmp);
-  size_t i = 0;
-  for (auto k: *s) {
-    cout << i << "\t" << k.gold << "\t" << k.model << " \t" << k.f << endl;
-    i++;
+  size_t sz = sample->size();
+  if (sz < 2) return 0; // a pair needs two items; also avoids rand()%0
+  auto mm = minmax_element(sample->begin(), sample->end(),
+    [](const Hyp& a, const Hyp& b) { return a.gold < b.gold; });
+  if (mm.second->gold-mm.first->gold < threshold) return 0; // else endless loop
+  vector<pair<Hyp*,Hyp*> > g;
+  while (g.size() < maxs) {
+    size_t i=rand()%sz, j=rand()%sz;
+    Hyp &first=(*sample)[i], &second=(*sample)[j]; // refs: g keeps addresses
+    if (i==j || fabs(first.gold-second.gold)<threshold)
+      continue;
+    if (first.gold > second.gold) g.emplace_back(&first,&second);
+    else                          g.emplace_back(&second,&first);
   }
-}
 
-inline void
-OutputMultipartitePairs(vector<ScoredHyp>* s,
-                               weight_t margin=0.,
-                               bool all=true)
-{
-  size_t sz = s->size();
-  sort(s->begin(), s->end(), _cmp);
-  size_t sep = round(sz*0.1);
-  for (size_t i = 0; i < sep; i++) {
-    for (size_t j = sep; j < sz; j++) {
-      if (!all && _good((*s)[i], (*s)[j], margin))
-        continue;
-      cout << (*s)[i].f-(*s)[j].f << endl;
-    }
+  if (g.size() > max_up) {
+    sort(g.begin(), g.end(), [](pair<Hyp*,Hyp*> a, pair<Hyp*,Hyp*> b)
+    {
+      return fabs(a.first->gold-a.second->gold)
+              > fabs(b.first->gold-b.second->gold);
+    });
+    g.erase(g.begin()+max_up, g.end());
   }
-  size_t sep_lo = sz-sep;
-  for (size_t i = sep; i < sep_lo; i++) {
-    for (size_t j = sep_lo; j < sz; j++) {
-      if (!all && _good((*s)[i], (*s)[j], margin))
-        continue;
-      cout << (*s)[i].f-(*s)[j].f << endl;
-    }
+
+  for (auto i: g) {
+    if (output)
+      os << i.first->f-i.second->f << endl;
+    updates += i.first->f-i.second->f;
   }
+
+  return g.size();
 }
 
+/*
+ * output items in sample (k-best list): index, gold, model, features
+ *  if 'sorted', the sample is first sorted in place by gold (descending)
+ */
 inline void
-OutputAllPairs(vector<ScoredHyp>* s)
+output_sample(vector<Hyp>* sample,
+              ostream& os=cout,
+              bool sorted=true)
 {
-  size_t sz = s->size();
-  sort(s->begin(), s->end(), _cmp);
-  for (size_t i = 0; i < sz-1; i++) {
-    for (size_t j = i+1; j < sz; j++) {
-      if ((*s)[i].gold == (*s)[j].gold)
-        continue;
-      cout << (*s)[i].f-(*s)[j].f << endl;
-    }
+  if (sorted)
+    sort(sample->begin(), sample->end(), [](const Hyp& first, const Hyp& second)
+      {
+        return first.gold > second.gold;
+      });
+  size_t j = 0;
+  for (const auto& i: *sample) { // by reference: no per-item copy
+    os << j << "\t" << i.gold << "\t" << i.model << "\t" << i.f << endl;
+    j++;
   }
 }
author	Patrick Simianer <p@simianer.de>	2015-10-16 10:23:11 +0200
committer	Patrick Simianer <p@simianer.de>	2015-10-16 10:23:11 +0200
commit	e3c3bb3759239853e87e6d564fb36d9868d20cb9 (patch)
tree	ebc9791462b440004e881b419e44e86aa6c9ffa9
parent	686df8adb559a85e2f63fd807068cfca6f2b7824 (diff)