summary | refs | log | tree | commit | diff
path: root/extractor/sampler.cc
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2013-09-20 20:01:03 +0200
committer Patrick Simianer <p@simianer.de> 2013-09-20 20:01:03 +0200
commit 2e746d6ad25aaf4d85f9c8f277ff109e45bfd93e (patch)
tree d2277986db27f5e4acec62e651f962359c3cbd03 /extractor/sampler.cc
parent 8ea6bd821bf2a71a4a55e137f2f3c7d24200362a (diff)
loo
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- extractor/sampler.cc 35
1 files changed, 30 insertions, 5 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index d81956b5..2f7738db 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -12,18 +12,43 @@ Sampler::Sampler() {}
Sampler::~Sampler() {}
-PhraseLocation Sampler::Sample(const PhraseLocation& location) const {
+PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const {
vector<int> sample;
int num_subpatterns;
if (location.matchings == NULL) {
// Sample suffix array range.
num_subpatterns = 1;
int low = location.sa_low, high = location.sa_high;
- double step = max(1.0, (double) (high - low) / max_samples);
- for (double i = low; i < high && sample.size() < max_samples; i += step) {
- sample.push_back(suffix_array->GetSuffix(Round(i)));
+ double step = Round(max(1.0, (double) (high - low) / max_samples));
+ int i = location.sa_low;
+ bool found = false;
+ while (sample.size() < max_samples && i <= location.sa_high) {
+ int x = suffix_array->GetSuffix(i);
+ int id = source_data_array->GetSentenceId(x);
+ if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
+ int backoff_step = 1;
+ while (true) {
+ int j = i - backoff_step;
+ x = suffix_array->GetSuffix(j);
+ id = source_data_array->GetSentenceId(x);
+ if ((j >= location.sa_low) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end())
+ && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; }
+ int k = i + backoff_step;
+ x = suffix_array->GetSuffix(k);
+ id = source_data_array->GetSentenceId(x);
+ if ((k <= location.sa_high) && (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end())
+ && (find(sample.begin(), sample.end(), x) == sample.end())) { found = true; break; }
+ if (j <= location.sa_low && k >= location.sa_high) break;
+ backoff_step++;
+ }
+ } else {
+ found = true;
+ }
+ if (found && (find(sample.begin(), sample.end(), x) == sample.end())) sample.push_back(x);
+ i += step;
+ found = false;
}
- } else {
+ } else { // when do we get here?
// Sample vector of occurrences.
num_subpatterns = location.num_subpatterns;
int num_matchings = location.matchings->size() / num_subpatterns;