diff options
author | Patrick Simianer <p@simianer.de> | 2013-09-20 20:01:03 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-09-20 20:01:03 +0200 |
commit | 2e746d6ad25aaf4d85f9c8f277ff109e45bfd93e (patch) | |
tree | d2277986db27f5e4acec62e651f962359c3cbd03 /extractor/sampler.cc | |
parent | 8ea6bd821bf2a71a4a55e137f2f3c7d24200362a (diff) |
loo
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- | extractor/sampler.cc | 35 |
1 files changed, 30 insertions, 5 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc index d81956b5..2f7738db 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -12,18 +12,43 @@ Sampler::Sampler() {} Sampler::~Sampler() {} -PhraseLocation Sampler::Sample(const PhraseLocation& location) const { +PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const { vector<int> sample; int num_subpatterns; if (location.matchings == NULL) { // Sample suffix array range. num_subpatterns = 1; int low = location.sa_low, high = location.sa_high; - double step = max(1.0, (double) (high - low) / max_samples); - for (double i = low; i < high && sample.size() < max_samples; i += step) { - sample.push_back(suffix_array->GetSuffix(Round(i))); + double step = Round(max(1.0, (double) (high - low) / max_samples)); + int i = location.sa_low; + bool found = false; + while (sample.size() < max_samples && i <= location.sa_high) { + int x = suffix_array->GetSuffix(i); + int id = source_data_array->GetSentenceId(x); + if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { + int backoff_step = 1; + while (true) { + int j = i - backoff_step; + x = suffix_array->GetSuffix(j); + id = source_data_array->GetSentenceId(x); + if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) + && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } + int k = i + backoff_step; + x = suffix_array->GetSuffix(k); + id = source_data_array->GetSentenceId(x); + if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) + && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; } + if (j <= location.sa_low && k >= location.sa_high) break; + backoff_step++; + } + } else { + found = true; + } + if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x); + i += step; + found = false; } - } else { + } else { // when do we get here? // Sample vector of occurrences. num_subpatterns = location.num_subpatterns; int num_matchings = location.matchings->size() / num_subpatterns; |