summary | refs | log | tree | commit | diff
path: root/extractor/sampler.cc
diff options
context:
space:
mode:
author: Paul Baltescu <pauldb89@gmail.com> 2013-11-23 17:33:47 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-11-23 17:33:47 +0000
commit: cc6313b23cac25eb05976b6cf64f96faf1ed4163 (patch)
tree: 3dc28060ad25b43773e875bea7388ab1cefcd927 /extractor/sampler.cc
parent: 7990c750829af93f0a1e0fc14534582f52ee9e8c (diff)
parent: f2fb69b10a897e8beb4e6e6d6cbb4327096235ef (diff)
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- extractor/sampler.cc 35
1 files changed, 32 insertions, 3 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index d81956b5..963afa7a 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -12,7 +12,7 @@ Sampler::Sampler() {}
Sampler::~Sampler() {}
-PhraseLocation Sampler::Sample(const PhraseLocation& location) const {
+PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const {
vector<int> sample;
int num_subpatterns;
if (location.matchings == NULL) {
@@ -20,8 +20,37 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location) const {
num_subpatterns = 1;
int low = location.sa_low, high = location.sa_high;
double step = max(1.0, (double) (high - low) / max_samples);
- for (double i = low; i < high && sample.size() < max_samples; i += step) {
- sample.push_back(suffix_array->GetSuffix(Round(i)));
+ double i = low, last = i;
+ bool found;
+ while (sample.size() < max_samples && i < high) {
+ int x = suffix_array->GetSuffix(Round(i));
+ int id = source_data_array->GetSentenceId(x);
+ if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
+ found = false;
+ double backoff_step = 1;
+ while (true) {
+ if ((double)backoff_step >= step) break;
+ double j = i - backoff_step;
+ x = suffix_array->GetSuffix(Round(j));
+ id = source_data_array->GetSentenceId(x);
+ if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ found = true; last = i; break;
+ }
+ double k = i + backoff_step;
+ x = suffix_array->GetSuffix(Round(k));
+ id = source_data_array->GetSentenceId(x);
+ if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ found = true; last = k; break;
+ }
+ if (j <= last && k >= high) break;
+ backoff_step++;
+ }
+ } else {
+ found = true;
+ last = i;
+ }
+ if (found) sample.push_back(x);
+ i += step;
}
} else {
// Sample vector of occurrences.