diff options
author | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 |
commit | 0ce66778da6079506896739e9d97dc7dff83cd72 (patch) | |
tree | f435457bb23dab0c566c9896f9d38cece9d15885 /extractor/sampler.cc | |
parent | b6754386f1109b960b05cdf2eabbc97bdd38e8df (diff) | |
parent | b7ea2615bc9bb69031ff714ddce1539c9f1bda2d (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- | extractor/sampler.cc | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc new file mode 100644 index 00000000..d81956b5 --- /dev/null +++ b/extractor/sampler.cc @@ -0,0 +1,46 @@ +#include "sampler.h" + +#include "phrase_location.h" +#include "suffix_array.h" + +namespace extractor { + +Sampler::Sampler(shared_ptr<SuffixArray> suffix_array, int max_samples) : + suffix_array(suffix_array), max_samples(max_samples) {} + +Sampler::Sampler() {} + +Sampler::~Sampler() {} + +PhraseLocation Sampler::Sample(const PhraseLocation& location) const { + vector<int> sample; + int num_subpatterns; + if (location.matchings == NULL) { + // Sample suffix array range. + num_subpatterns = 1; + int low = location.sa_low, high = location.sa_high; + double step = max(1.0, (double) (high - low) / max_samples); + for (double i = low; i < high && sample.size() < max_samples; i += step) { + sample.push_back(suffix_array->GetSuffix(Round(i))); + } + } else { + // Sample vector of occurrences. + num_subpatterns = location.num_subpatterns; + int num_matchings = location.matchings->size() / num_subpatterns; + double step = max(1.0, (double) num_matchings / max_samples); + for (double i = 0, num_samples = 0; + i < num_matchings && num_samples < max_samples; + i += step, ++num_samples) { + int start = Round(i) * num_subpatterns; + sample.insert(sample.end(), location.matchings->begin() + start, + location.matchings->begin() + start + num_subpatterns); + } + } + return PhraseLocation(sample, num_subpatterns); +} + +int Sampler::Round(double x) const { + return x + 0.5; +} + +} // namespace extractor |