summaryrefslogtreecommitdiff
path: root/extractor/backoff_sampler.cc
blob: 28b12909d312b3a3349e056eb0fdf4d2375d9e6c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#include "backoff_sampler.h"

#include <utility>

#include "data_array.h"
#include "phrase_location.h"

namespace extractor {

// Builds a sampler that looks up sentence ids in source_data_array and
// returns at most max_samples samples per Sample() call.
//
// source_data_array is taken by value and moved into the member, so handing
// over ownership costs no extra reference-count increment/decrement.
BackoffSampler::BackoffSampler(
    shared_ptr<DataArray> source_data_array, int max_samples) :
    source_data_array(std::move(source_data_array)),
    max_samples(max_samples) {}

// Default constructor: source_data_array is left default-constructed.
// max_samples is zeroed so it never holds an indeterminate value; with a
// budget of 0 the sampling loop in Sample() does not execute.
BackoffSampler::BackoffSampler() : max_samples(0) {}

// Picks at most max_samples entries from the index range
// [GetRangeLow(location), GetRangeHigh(location)).
//
// The range is visited at evenly spaced (possibly fractional) indexes. A
// visited index is accepted when its position is greater than the last
// accepted position and its sentence id is not in blacklisted_sentence_ids.
// Otherwise the neighbouring indexes at distance 1, 2, ... (strictly less than
// the step) are probed, first backwards and then forwards, and the first
// acceptable one is used instead. If no neighbour qualifies, that slot yields
// no sample.
//
// Returns a PhraseLocation built from the collected samples and
// GetNumSubpatterns(location).
PhraseLocation BackoffSampler::Sample(
    const PhraseLocation& location,
    const unordered_set<int>& blacklisted_sentence_ids) const {
  vector<int> samples;
  const int low = GetRangeLow(location);
  const int high = GetRangeHigh(location);
  // NOTE(review): this starts from an index (low - 1) but is later compared
  // with GetPosition() results - confirm both live on the same scale.
  int last_position = low - 1;
  // Never step by less than one index, even when the range is smaller than
  // the sample budget.
  const double step =
      max(1.0, static_cast<double>(high - low) / max_samples);
  for (double num_samples = 0, i = low;
       num_samples < max_samples && i < high;
       ++num_samples, i += step) {
    int position = GetPosition(location, static_cast<int>(round(i)));
    int sentence_id = source_data_array->GetSentenceId(position);
    bool found = false;
    if (position > last_position &&
        !blacklisted_sentence_ids.count(sentence_id)) {
      found = true;
    } else {
      for (double backoff_step = 1; backoff_step < step; ++backoff_step) {
        // Backward probe. It must stay inside the sampled range: bounding it
        // by 0 instead of low allowed the first samples to pick entries that
        // precede the range whenever low > 0.
        const int before = static_cast<int>(round(i - backoff_step));
        if (before >= low) {
          position = GetPosition(location, before);
          sentence_id = source_data_array->GetSentenceId(position);
          if (position > last_position &&
              !blacklisted_sentence_ids.count(sentence_id)) {
            found = true;
            break;
          }
        }

        // Forward probe.
        // NOTE(review): unlike the backward probe, this one is not compared
        // against last_position - presumably positions grow with the index;
        // confirm.
        const int after = static_cast<int>(round(i + backoff_step));
        if (after < high) {
          position = GetPosition(location, after);
          sentence_id = source_data_array->GetSentenceId(position);
          if (!blacklisted_sentence_ids.count(sentence_id)) {
            found = true;
            break;
          }
        }
      }
    }

    if (found) {
      // position still holds the accepted candidate at this point.
      last_position = position;
      AppendMatching(samples, position, location);
    }
  }

  return PhraseLocation(samples, GetNumSubpatterns(location));
}

} // namespace extractor