summary | refs | log | tree | commit | diff
path: root/dtrain/test/mira_update/sample.h
blob: 5c331bba8e9e0bc4c797dc1d0bc311fef850bf34 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#ifndef _DTRAIN_SAMPLE_H_
#define _DTRAIN_SAMPLE_H_


#include "kbestget.h"


namespace dtrain
{


// One pair of k-best entries, as produced by the sampling functions in this
// file.
struct TPair
{
  // Feature vectors of the two entries (copied from kb->feats[i] and
  // kb->feats[j], with i < j).
  SparseVector<double> first;
  SparseVector<double> second;

  // Positions i and j of the two entries in the k-best list.
  size_t first_rank;
  size_t second_rank;

  // Values of kb->scores[i] and kb->scores[j].
  double first_score;
  double second_score;

  // Difference of kb->model_scores between the two entries, as computed by
  // sample_all().
  double model_score_diff;

  // Difference of kb->scores between the two entries, as computed by
  // sample_all(); used there to rank candidate pairs.
  double loss_diff;
};

// List of sampled pairs; the sampling functions in this file append to it.
typedef vector<TPair> TrainingInstances;


void
  sample_all( KBestList* kb, TrainingInstances &training, size_t n_pairs )
{
  std::vector<double> loss_diffs;
  TrainingInstances training_tmp;
  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
      TPair p;
      p.first = kb->feats[i];
      p.second = kb->feats[j];
      p.first_rank = i;
      p.second_rank = j;
      p.first_score = kb->scores[i];
      p.second_score = kb->scores[j];

      bool conservative = 1;
      if ( kb->scores[i] - kb->scores[j] < 0 ) {
	// j=hope, i=fear                                                                                                                         
	p.model_score_diff = kb->model_scores[j] - kb->model_scores[i];
        p.loss_diff = kb->scores[j] - kb->scores[i];
        training_tmp.push_back(p);
        loss_diffs.push_back(p.loss_diff);
      }
      else if (!conservative) {
	// i=hope, j=fear
	p.model_score_diff = kb->model_scores[i] - kb->model_scores[j];
        p.loss_diff = kb->scores[i] - kb->scores[j];
        training_tmp.push_back(p);
        loss_diffs.push_back(p.loss_diff);
      }
    }
  }
  
  if (training_tmp.size() > 0) {
    double threshold;
    std::sort(loss_diffs.begin(), loss_diffs.end());
    std::reverse(loss_diffs.begin(), loss_diffs.end());
    threshold = loss_diffs.size() >= n_pairs ? loss_diffs[n_pairs-1] : loss_diffs[loss_diffs.size()-1];
    cerr << "threshold: " << threshold << endl;
    size_t constraints = 0;
    for (size_t i = 0; (i < training_tmp.size() && constraints < n_pairs); ++i) {
      if (training_tmp[i].loss_diff >= threshold) {
	training.push_back(training_tmp[i]);
	constraints++;
      }
    }
  }
  else {
    cerr << "No pairs selected." << endl;
  }
}

// Append a random subset of all pairs (i, j), i < j, of the k-best list kb
// to training.  Each pair is kept when rand() % 2 is non-zero.
//
// kb        k-best list to sample from; must not be NULL.
// training  output; selected pairs are appended, existing content is kept.
//
// model_score_diff and loss_diff are not computed here; they are set to 0 so
// that the stored pairs never carry indeterminate values.
//
// Defined inline because this header contains the definition; without it,
// including the header from two translation units breaks the link.
inline void
sample_rand( KBestList* kb, TrainingInstances &training )
{
  // Seed the generator once.  Re-seeding with time(NULL) on every call makes
  // all calls within the same second replay the same random sequence.
  static bool seeded = false;
  if ( !seeded ) {
    srand( time(NULL) );
    seeded = true;
  }

  const size_t k = kb->GetSize();
  // "i + 1 < k" instead of "i < k - 1": the latter wraps around for an empty
  // list when the size is unsigned.
  for ( size_t i = 0; i + 1 < k; ++i ) {
    for ( size_t j = i + 1; j < k; ++j ) {
      if ( rand() % 2 ) {
        TPair p;
        p.first = kb->feats[i];
        p.second = kb->feats[j];
        p.first_rank = i;
        p.second_rank = j;
        p.first_score = kb->scores[i];
        p.second_score = kb->scores[j];
        p.model_score_diff = 0.0;
        p.loss_diff = 0.0;
        training.push_back( p );
      }
    }
  }
}


} // namespace


#endif