diff options
| author | Chris Dyer <redpony@gmail.com> | 2013-11-13 11:22:24 -0800 | 
|---|---|---|
| committer | Chris Dyer <redpony@gmail.com> | 2013-11-13 11:22:24 -0800 | 
| commit | 1b39e848903743990ca16e2323235b31db20178c (patch) | |
| tree | 19fceda02d09c9d4990aba692ed97193020d1e86 /extractor/sampler.cc | |
| parent | f83186887c94b2ff8b17aefcd0b395f116c09eb6 (diff) | |
| parent | 4c7d24c9357f500839f04c7c8a8cfa0472801e18 (diff) | |
Merge pull request #27 from pks/master
Tidying (soft) syntax features; loo for C++ extractor; updates for dtrain 
Diffstat (limited to 'extractor/sampler.cc')
| -rw-r--r-- | extractor/sampler.cc | 35 | 
1 files changed, 32 insertions, 3 deletions
| diff --git a/extractor/sampler.cc b/extractor/sampler.cc index d81956b5..d332dd90 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -12,7 +12,7 @@ Sampler::Sampler() {}  Sampler::~Sampler() {} -PhraseLocation Sampler::Sample(const PhraseLocation& location) const { +PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const {    vector<int> sample;    int num_subpatterns;    if (location.matchings == NULL) { @@ -20,8 +20,37 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location) const {      num_subpatterns = 1;      int low = location.sa_low, high = location.sa_high;      double step = max(1.0, (double) (high - low) / max_samples); -    for (double i = low; i < high && sample.size() < max_samples; i += step) { -      sample.push_back(suffix_array->GetSuffix(Round(i))); +    double i = low, last = i; +    bool found; +    while (sample.size() < max_samples && i < high) { +      int x = suffix_array->GetSuffix(Round(i)); +      int id = source_data_array->GetSentenceId(x); +      if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { +        found = false; +        double backoff_step = 1; +        while (true) { +          if ((double)backoff_step >= step) break; +          double j = i - backoff_step; +          x = suffix_array->GetSuffix(Round(j)); +          id = source_data_array->GetSentenceId(x); +          if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { +            found = true; last = i; break; +          } +          double k = i + backoff_step; +          x = suffix_array->GetSuffix(Round(k)); +          id = source_data_array->GetSentenceId(x); +          if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { +            found = true; last = k; break; +          } +          if (j <= last && k >= high) break; +          backoff_step++; +        } +      } else { +        found = true; +        last = i; +      } +      if (found) sample.push_back(x); +      i += step;      }    } else {      // Sample vector of occurrences. | 
