summaryrefslogtreecommitdiff
path: root/dtrain/pairsampling.h
blob: 52eeedd6610e69e28546bbeca939d50ed852e2f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#ifndef _DTRAIN_PAIRSAMPLING_H_
#define _DTRAIN_PAIRSAMPLING_H_

#define DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
                                 // DO NOT USE WITH SVM!

namespace dtrain
{


/*
 * accept_pair
 *  decides whether two scores differ enough to be used as a training pair
 *          a, b: the two scores to compare
 *     threshold: minimum absolute difference
 *  returns false iff |a - b| < threshold, true otherwise
 *
 *  declared inline: the definition lives in a header, so without it
 *  every translation unit including this file would emit its own
 *  external definition (multiple definition error at link time)
 */
inline bool
accept_pair(score_t a, score_t b, score_t threshold)
{
  // negated '<' rather than '>=': keeps the result 'true' if the
  // difference is NaN, exactly as the if/return form did
  return !(fabs(a - b) < threshold);
}

bool
cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
{
  return a.score > b.score;
}

/*
 * all_pairs
 *  sorts *s by score (descending, in place) and appends every pair
 *  (i, j), i before j in sorted order, to 'training' if
 *    threshold > 0:  accept_pair() accepts the two scores
 *    otherwise:      the two scores are not equal
 *          s: hypotheses, reordered by this call
 *   training: output, pairs are appended (existing content is kept)
 *    _unused: ignored, presumably kept so that all samplers share one
 *             signature -- confirm against callers
 */
inline void
all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
{
  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
  unsigned sz = s->size();
  // 'i+1 < sz' instead of 'i < sz-1': sz is unsigned, so sz-1 wraps
  // around to a huge value when the list is empty
  for (unsigned i = 0; i+1 < sz; i++) {
    const ScoredHyp& first = (*s)[i];
    for (unsigned j = i+1; j < sz; j++) {
      const ScoredHyp& second = (*s)[j];
      if (threshold > 0) {
        if (accept_pair(first.score, second.score, threshold))
          training.push_back(make_pair(first, second));
      } else {
        if (first.score != second.score)
          training.push_back(make_pair(first, second));
      }
    }
  }
}

/*
 * multipartite ranking
 *  sort (descending) by bleu
 *  compare top X to middle Y and low X
 *  cmp middle Y to low X
 */

inline void
partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
{
  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
  unsigned sz = s->size();
  unsigned sep = round(sz*hi_lo);
  for (unsigned i = 0; i < sep; i++) {
    for (unsigned j = sep; j < sz; j++) {
#ifdef DTRAIN_FASTER_PERCEPTRON
      if ((*s)[i].model <= (*s)[j].model) {
#endif
      if (threshold > 0) {
        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
          training.push_back(make_pair((*s)[i], (*s)[j]));
      } else {
        if ((*s)[i].score != (*s)[j].score)
          training.push_back(make_pair((*s)[i], (*s)[j]));
      }
#ifdef DTRAIN_FASTER_PERCEPTRON
      }
#endif
    }
  }
  for (unsigned i = sep; i < sz-sep; i++) {
    for (unsigned j = sz-sep; j < sz; j++) {
#ifdef DTRAIN_FASTER_PERCEPTRON
      if ((*s)[i].model <= (*s)[j].model) {
#endif
      if (threshold > 0) {
        if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
          training.push_back(make_pair((*s)[i], (*s)[j]));
      } else {
        if ((*s)[i].score != (*s)[j].score)
          training.push_back(make_pair((*s)[i], (*s)[j]));
      }
#ifdef DTRAIN_FASTER_PERCEPTRON
      }
#endif
    }
  }
}

/*
 * pair sampling as in
 * 'Tuning as Ranking' (Hopkins & May, 2011)
 *     count = 5000
 * threshold = 5% BLEU (0.05 for param 3)
 *       cut = top 50
 */
bool
_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
{
  return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
}
/*
 * PROsampling
 *  walks all pairs (i, j), i < j, of *s in their given order (no
 *  sorting of *s) and appends those accepted by accept_pair() to
 *  'training', stopping after max_count accepted pairs; if 'training'
 *  then holds more than 'cut' pairs, only the 'cut' pairs with the
 *  largest absolute score difference are kept
 *          s: hypotheses, not modified
 *   training: output; the cut is applied to the whole vector, i.e.
 *             including pairs that were in it before the call
 *    _unused: ignored, presumably kept so that all samplers share one
 *             signature -- confirm against callers
 */
inline void
PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
{
  const unsigned max_count = 5000; // max. number of accepted pairs
  const unsigned cut = 50;         // number of pairs finally kept
  unsigned count = 0, sz = s->size();
  // 'i+1 < sz' instead of 'i < sz-1': sz is unsigned, so sz-1 wraps
  // around to a huge value when the list is empty
  for (unsigned i = 0; i+1 < sz && count < max_count; i++) {
    for (unsigned j = i+1; j < sz; j++) {
      if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
        training.push_back(make_pair((*s)[i], (*s)[j]));
        if (++count == max_count) break; // outer loop stops via its condition
      }
    }
  }
  if (training.size() > cut) {
    // only the first 'cut' pairs survive, so ordering the rest is wasted work
    partial_sort(training.begin(), training.begin()+cut, training.end(), _PRO_cmp_pair_by_diff_d);
    training.erase(training.begin()+cut, training.end());
  }
}


} // namespace

#endif