diff options
Diffstat (limited to 'extractor/sampler.cc')
| -rw-r--r-- | extractor/sampler.cc | 26 | 
1 file changed, 15 insertions, 11 deletions
| diff --git a/extractor/sampler.cc b/extractor/sampler.cc index 2f7738db..cb470962 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -20,35 +20,39 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int      num_subpatterns = 1;      int low = location.sa_low, high = location.sa_high;      double step = Round(max(1.0, (double) (high - low) / max_samples)); -    int i = location.sa_low; -    bool found = false; -    while (sample.size() < max_samples && i <= location.sa_high) { +    int i = low, last = i; +    bool found; +    while (sample.size() < max_samples && i < high) {        int x = suffix_array->GetSuffix(i);        int id = source_data_array->GetSentenceId(x);        if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { +        found = false;          int backoff_step = 1;          while (true) { +          if ((double)backoff_step >= step) break;            int j = i - backoff_step;            x = suffix_array->GetSuffix(j);            id = source_data_array->GetSentenceId(x); -          if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) -              && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } +          if (j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { +            found = true; last = i; break; +          }            int k = i + backoff_step;            x = suffix_array->GetSuffix(k);            id = source_data_array->GetSentenceId(x); -          if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) -              && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } -          if (j <= location.sa_low && k >= 
location.sa_high) break; +          if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { +            found = true; last = k; break; +          } +          if (j <= last && k >= high) break;            backoff_step++;          }        } else {          found = true; +        last = i;        } -      if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x); +      if (found) sample.push_back(x);        i += step; -      found = false;      } -  } else { // when do we get here? +  } else {      // Sample vector of occurrences.      num_subpatterns = location.num_subpatterns;      int num_matchings = location.matchings->size() / num_subpatterns; | 
