summary refs log tree commit diff
path: root/extractor/sampler.cc
diff options
context:
space:
mode:
author Paul Baltescu <pauldb89@gmail.com> 2013-11-25 23:56:31 +0000
committer Paul Baltescu <pauldb89@gmail.com> 2013-11-25 23:56:31 +0000
commit 3c73e472444ff0cd436b12f3679440a6969cbf2d (patch)
tree 9ceee03648ea671d7f05215826dc0d0a5890e36b /extractor/sampler.cc
parent 2b95390f08d9f556e6207ecff03b4b0fd5ede993 (diff)
Clean up leave-one-out sampling.
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- extractor/sampler.cc 12
1 files changed, 8 insertions, 4 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index 963afa7a..fc386ed1 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -12,7 +12,9 @@ Sampler::Sampler() {}
Sampler::~Sampler() {}
-PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const {
+PhraseLocation Sampler::Sample(
+ const PhraseLocation& location,
+ const unordered_set<int>& blacklisted_sentence_ids) const {
vector<int> sample;
int num_subpatterns;
if (location.matchings == NULL) {
@@ -22,10 +24,11 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_s
double step = max(1.0, (double) (high - low) / max_samples);
double i = low, last = i;
bool found;
+ shared_ptr<DataArray> source_data_array = suffix_array->GetData();
while (sample.size() < max_samples && i < high) {
int x = suffix_array->GetSuffix(Round(i));
int id = source_data_array->GetSentenceId(x);
- if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
+ if (blacklisted_sentence_ids.count(id)) {
found = false;
double backoff_step = 1;
while (true) {
@@ -33,13 +36,14 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_s
double j = i - backoff_step;
x = suffix_array->GetSuffix(Round(j));
id = source_data_array->GetSentenceId(x);
- if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ if (x >= 0 && j > last && !blacklisted_sentence_ids.count(id)) {
found = true; last = i; break;
}
double k = i + backoff_step;
x = suffix_array->GetSuffix(Round(k));
id = source_data_array->GetSentenceId(x);
- if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ if (k < min(i+step, (double)high) &&
+ !blacklisted_sentence_ids.count(id)) {
found = true; last = k; break;
}
if (j <= last && k >= high) break;