summary | refs | log | tree | commit | diff
path: root/dtrain/pairsampling.h
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2011-11-28 00:48:30 +0100
committer: Patrick Simianer <p@simianer.de> 2011-11-28 00:48:30 +0100
commit: 8aa98e9aa2a786151aab1e9398bb03a6ba1ca383 (patch)
tree: dbdd82da141981905eb64fac58b0a0c3085e81a0 /dtrain/pairsampling.h
parent: 0982cdb6098ef010ac92ce07a9d93141ddbca900 (diff)
clean up
Diffstat (limited to 'dtrain/pairsampling.h')
-rw-r--r-- dtrain/pairsampling.h 174
1 files changed, 52 insertions, 122 deletions
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 0951f8e9..e866c8a0 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -5,72 +5,83 @@ namespace dtrain
{
-inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+bool
+accept_pair(score_t a, score_t b, score_t threshold)
{
- for (unsigned i = 0; i < s->size()-1; i++) {
- for (unsigned j = i+1; j < s->size(); j++) {
- pair<ScoredHyp,ScoredHyp> p;
- p.first = (*s)[i];
- p.second = (*s)[j];
- training.push_back(p);
- }
- }
+ if (fabs(a - b) < threshold) return false;
+ return true;
}
inline void
-rand_pairs_5050(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training,
- MT19937* prng)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold)
{
for (unsigned i = 0; i < s->size()-1; i++) {
for (unsigned j = i+1; j < s->size(); j++) {
- if (prng->next() < .5) {
- pair<ScoredHyp,ScoredHyp> p;
- p.first = (*s)[i];
- p.second = (*s)[j];
- training.push_back(p);
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ } else {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
}
}
}
}
+/*
+ * multipartite ranking
+ * sort by bleu
+ * compare top 10% to middle 80% and low 10%
+ * 80% to low 10%
+ */
bool
-_multpart_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)
+_108010_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)
{
return a.score < b.score;
}
inline void
-multpart108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
+part108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold)
{
- sort(s->begin(), s->end(), _multpart_cmp_hyp_by_score);
- pair<ScoredHyp,ScoredHyp> p;
+ sort(s->begin(), s->end(), _108010_cmp_hyp_by_score);
unsigned sz = s->size();
unsigned slice = 10;
unsigned sep = sz%slice;
if (sep == 0) sep = sz/slice;
for (unsigned i = 0; i < sep; i++) {
for (unsigned j = sep; j < sz; j++) {
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (p.first.rank < p.second.rank) training.push_back(p);
+ if ((*s)[i].rank < (*s)[j].rank) {
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ } else {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ }
}
}
for (unsigned i = sep; i < sz-sep; i++) {
for (unsigned j = sz-sep; j < sz; j++) {
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (p.first.rank < p.second.rank) training.push_back(p);
+ if ((*s)[i].rank < (*s)[j].rank) {
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ } else {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ }
}
}
}
-
-inline bool
-_PRO_accept_pair(pair<ScoredHyp,ScoredHyp> &p)
-{
- if (fabs(p.first.score - p.second.score) < 0.05) return false;
- return true;
-}
+/*
+ * pair sampling as in
+ * 'Tuning as Ranking' (Hopkins & May, 2011)
+ * count = 5000
+ * threshold = 5% BLEU
+ * cut = top 50
+ */
bool
_PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
{
@@ -78,19 +89,15 @@ _PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
}
inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training) // ugly
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold=0.05)
{
unsigned max_count = 5000, count = 0;
bool b = false;
for (unsigned i = 0; i < s->size()-1; i++) {
for (unsigned j = i+1; j < s->size(); j++) {
- pair<ScoredHyp,ScoredHyp> p;
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (_PRO_accept_pair(p)) {
- training.push_back(p);
- count++;
- if (count == max_count) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ if (++count == max_count) {
b = true;
break;
}
@@ -98,88 +105,11 @@ PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
}
if (b) break;
}
- sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff);
- if (training.size() > 50)
+ if (training.size() > 50) {
+ sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff);
training.erase(training.begin()+50, training.end());
- return;
-}
-
-inline void
-all_pairs_discard(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
-{
- for (unsigned i = 0; i < s->size()-1; i++) {
- for (unsigned j = i+1; j < s->size(); j++) {
- pair<ScoredHyp,ScoredHyp> p;
- p.first = (*s)[i];
- p.second = (*s)[j];
- if(_PRO_accept_pair(p))
- training.push_back(p);
- }
}
-}
-
-inline void
-multpart108010_discard(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
-{
- sort(s->begin(), s->end(), _multpart_cmp_hyp_by_score);
- pair<ScoredHyp,ScoredHyp> p;
- unsigned sz = s->size();
- unsigned slice = 10;
- unsigned sep = sz%slice;
- if (sep == 0) sep = sz/slice;
- for (unsigned i = 0; i < sep; i++) {
- for (unsigned j = sep; j < sz; j++) {
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (p.first.rank < p.second.rank) {
- if (_PRO_accept_pair(p)) training.push_back(p);
- }
- }
- }
- for (unsigned i = sep; i < sz-sep; i++) {
- for (unsigned j = sz-sep; j < sz; j++) {
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (p.first.rank < p.second.rank) {
- if (_PRO_accept_pair(p)) training.push_back(p);
- }
- }
- }
- sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff);
- if (training.size() > 50)
- training.erase(training.begin()+50, training.end());
-}
-
-inline void
-multpart108010_discard1(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training)
-{
- sort(s->begin(), s->end(), _multpart_cmp_hyp_by_score);
- pair<ScoredHyp,ScoredHyp> p;
- unsigned sz = s->size();
- unsigned slice = 10;
- unsigned sep = sz%slice;
- if (sep == 0) sep = sz/slice;
- for (unsigned i = 0; i < sep; i++) {
- for (unsigned j = sep; j < sz; j++) {
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (p.first.rank < p.second.rank) {
- if (_PRO_accept_pair(p)) training.push_back(p);
- }
- }
- }
- for (unsigned i = sep; i < sz-sep; i++) {
- for (unsigned j = sz-sep; j < sz; j++) {
- p.first = (*s)[i];
- p.second = (*s)[j];
- if (p.first.rank < p.second.rank) {
- if (_PRO_accept_pair(p)) training.push_back(p);
- }
- }
- }
- sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff);
- if (training.size() > 50)
- training.erase(training.begin()+50, training.end());
+ return;
}