diff options
Diffstat (limited to 'extractor/sampler.cc')
-rw-r--r-- | extractor/sampler.cc | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc new file mode 100644 index 00000000..d8e0f49e --- /dev/null +++ b/extractor/sampler.cc @@ -0,0 +1,36 @@ +#include "sampler.h" + +#include "phrase_location.h" +#include "suffix_array.h" + +Sampler::Sampler(shared_ptr<SuffixArray> suffix_array, int max_samples) : + suffix_array(suffix_array), max_samples(max_samples) {} + +PhraseLocation Sampler::Sample(const PhraseLocation& location) const { + vector<int> sample; + int num_subpatterns; + if (location.matchings == NULL) { + num_subpatterns = 1; + int low = location.sa_low, high = location.sa_high; + double step = max(1.0, (double) (high - low) / max_samples); + for (double i = low; i < high && sample.size() < max_samples; i += step) { + sample.push_back(suffix_array->GetSuffix(Round(i))); + } + } else { + num_subpatterns = location.num_subpatterns; + int num_matchings = location.matchings->size() / num_subpatterns; + double step = max(1.0, (double) num_matchings / max_samples); + for (double i = 0, num_samples = 0; + i < num_matchings && num_samples < max_samples; + i += step, ++num_samples) { + int start = Round(i) * num_subpatterns; + sample.insert(sample.end(), location.matchings->begin() + start, + location.matchings->begin() + start + num_subpatterns); + } + } + return PhraseLocation(sample, num_subpatterns); +} + +int Sampler::Round(double x) const { + return x + 0.5; +} |