diff options
author | Patrick Simianer <p@simianer.de> | 2013-12-04 20:13:07 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-12-04 20:13:07 +0100 |
commit | 9ff43d7c8e076aaa8790bacbd4b2cfe636a55a97 (patch) | |
tree | e1e0265b18ffc854f24209cb36b2c836100f099b /extractor/sampler.cc | |
parent | e59cdac5253df7ab57296d347245d1a8f4d8b287 (diff) | |
parent | 407b100cd3e4ae987504b53101151fba287ad999 (diff) |
fix merge conflict
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- | extractor/sampler.cc | 75 |
1 files changed, 0 insertions, 75 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc deleted file mode 100644 index 963afa7a..00000000 --- a/extractor/sampler.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include "sampler.h" - -#include "phrase_location.h" -#include "suffix_array.h" - -namespace extractor { - -Sampler::Sampler(shared_ptr<SuffixArray> suffix_array, int max_samples) : - suffix_array(suffix_array), max_samples(max_samples) {} - -Sampler::Sampler() {} - -Sampler::~Sampler() {} - -PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const { - vector<int> sample; - int num_subpatterns; - if (location.matchings == NULL) { - // Sample suffix array range. - num_subpatterns = 1; - int low = location.sa_low, high = location.sa_high; - double step = max(1.0, (double) (high - low) / max_samples); - double i = low, last = i; - bool found; - while (sample.size() < max_samples && i < high) { - int x = suffix_array->GetSuffix(Round(i)); - int id = source_data_array->GetSentenceId(x); - if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { - found = false; - double backoff_step = 1; - while (true) { - if ((double)backoff_step >= step) break; - double j = i - backoff_step; - x = suffix_array->GetSuffix(Round(j)); - id = source_data_array->GetSentenceId(x); - if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { - found = true; last = i; break; - } - double k = i + backoff_step; - x = suffix_array->GetSuffix(Round(k)); - id = source_data_array->GetSentenceId(x); - if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { - found = true; last = k; break; - } - if (j <= last && k >= high) break; - backoff_step++; - } - } else { - found = true; - last = i; - } - if (found) sample.push_back(x); - i += step; - } - } else { - // Sample vector of occurrences. - num_subpatterns = location.num_subpatterns; - int num_matchings = location.matchings->size() / num_subpatterns; - double step = max(1.0, (double) num_matchings / max_samples); - for (double i = 0, num_samples = 0; - i < num_matchings && num_samples < max_samples; - i += step, ++num_samples) { - int start = Round(i) * num_subpatterns; - sample.insert(sample.end(), location.matchings->begin() + start, - location.matchings->begin() + start + num_subpatterns); - } - } - return PhraseLocation(sample, num_subpatterns); -} - -int Sampler::Round(double x) const { - return x + 0.5; -} - -} // namespace extractor |