diff options
author | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 |
commit | 9e50f0237413180fba11b500c9dce5c600e3c157 (patch) | |
tree | 556fc31d231353c853a864afffddd43dc525549a /extractor/sampler.cc | |
parent | d18024a41cbc1b54db88d499571349a6234b6db8 (diff) | |
parent | 14ed53426726202813a8e82d706b44266f015fe1 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- | extractor/sampler.cc | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc new file mode 100644 index 00000000..d81956b5 --- /dev/null +++ b/extractor/sampler.cc @@ -0,0 +1,46 @@ +#include "sampler.h" + +#include "phrase_location.h" +#include "suffix_array.h" + +namespace extractor { + +Sampler::Sampler(shared_ptr<SuffixArray> suffix_array, int max_samples) : + suffix_array(suffix_array), max_samples(max_samples) {} + +Sampler::Sampler() {} + +Sampler::~Sampler() {} + +PhraseLocation Sampler::Sample(const PhraseLocation& location) const { + vector<int> sample; + int num_subpatterns; + if (location.matchings == NULL) { + // Sample suffix array range. + num_subpatterns = 1; + int low = location.sa_low, high = location.sa_high; + double step = max(1.0, (double) (high - low) / max_samples); + for (double i = low; i < high && sample.size() < max_samples; i += step) { + sample.push_back(suffix_array->GetSuffix(Round(i))); + } + } else { + // Sample vector of occurrences. + num_subpatterns = location.num_subpatterns; + int num_matchings = location.matchings->size() / num_subpatterns; + double step = max(1.0, (double) num_matchings / max_samples); + for (double i = 0, num_samples = 0; + i < num_matchings && num_samples < max_samples; + i += step, ++num_samples) { + int start = Round(i) * num_subpatterns; + sample.insert(sample.end(), location.matchings->begin() + start, + location.matchings->begin() + start + num_subpatterns); + } + } + return PhraseLocation(sample, num_subpatterns); +} + +int Sampler::Round(double x) const { + return x + 0.5; +} + +} // namespace extractor |