Merge pull request #27 from pks/master

Tidying (soft) syntax features; loo for C++ extractor; updates for dtrain
author: Chris Dyer <redpony@gmail.com> 2013-11-13 11:22:24 -0800
committer: Chris Dyer <redpony@gmail.com> 2013-11-13 11:22:24 -0800
commit: 1b39e848903743990ca16e2323235b31db20178c (patch)
tree: 19fceda02d09c9d4990aba692ed97193020d1e86 /extractor/sampler.cc
parent: f83186887c94b2ff8b17aefcd0b395f116c09eb6 (diff)
parent: 4c7d24c9357f500839f04c7c8a8cfa0472801e18 (diff)
1 files changed, 32 insertions, 3 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index d81956b5..d332dd90 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -12,7 +12,7 @@ Sampler::Sampler() {}
 
 Sampler::~Sampler() {}
 
-PhraseLocation Sampler::Sample(const PhraseLocation& location) const {
+PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const {
   vector<int> sample;
   int num_subpatterns;
   if (location.matchings == NULL) {
@@ -20,8 +20,37 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location) const {
     num_subpatterns = 1;
     int low = location.sa_low, high = location.sa_high;
     double step = max(1.0, (double) (high - low) / max_samples);
-    for (double i = low; i < high && sample.size() < max_samples; i += step) {
-      sample.push_back(suffix_array->GetSuffix(Round(i)));
+    double i = low, last = i;
+    bool found;
+    while (sample.size() < max_samples && i < high) {
+      int x = suffix_array->GetSuffix(Round(i));
+      int id = source_data_array->GetSentenceId(x);
+      if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
+        found = false;
+        double backoff_step = 1;
+        while (true) {
+          if ((double)backoff_step >= step) break;
+          double j = i - backoff_step;
+          x = suffix_array->GetSuffix(Round(j));
+          id = source_data_array->GetSentenceId(x);
+          if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+            found = true; last = i; break;
+          }
+          double k = i + backoff_step;
+          x = suffix_array->GetSuffix(Round(k));
+          id = source_data_array->GetSentenceId(x);
+          if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+            found = true; last = k; break;
+          }
+          if (j <= last && k >= high) break;
+          backoff_step++;
+        }
+      } else {
+        found = true;
+        last = i;
+      }
+      if (found) sample.push_back(x);
+      i += step;
     }
   } else {
     // Sample vector of occurrences.
author	Chris Dyer <redpony@gmail.com>	2013-11-13 11:22:24 -0800
committer	Chris Dyer <redpony@gmail.com>	2013-11-13 11:22:24 -0800
commit	1b39e848903743990ca16e2323235b31db20178c (patch)
tree	19fceda02d09c9d4990aba692ed97193020d1e86 /extractor/sampler.cc
parent	f83186887c94b2ff8b17aefcd0b395f116c09eb6 (diff)
parent	4c7d24c9357f500839f04c7c8a8cfa0472801e18 (diff)