Script for grammar extraction only.

author: Paul Baltescu <pauldb89@gmail.com> 2013-11-26 15:01:14 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-11-26 15:01:14 +0000
commit: a3826db61847a55f59bb9666f61fd1bb88888085 (patch)
tree: 022475bafbf71ba6aaeb98efdbafcde24f7e60a5 /extractor/sampler.cc
parent: 1cd86c44e1799c441cdcda2a022be0ee6e52d38c (diff)
1 file changed, 11 insertions, 12 deletions
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index fc386ed1..887aaec1 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -15,6 +15,7 @@ Sampler::~Sampler() {}
 PhraseLocation Sampler::Sample(
     const PhraseLocation& location,
     const unordered_set<int>& blacklisted_sentence_ids) const {
+  shared_ptr<DataArray> source_data_array = suffix_array->GetData();
   vector<int> sample;
   int num_subpatterns;
   if (location.matchings == NULL) {
@@ -22,32 +23,30 @@ PhraseLocation Sampler::Sample(
     num_subpatterns = 1;
     int low = location.sa_low, high = location.sa_high;
     double step = max(1.0, (double) (high - low) / max_samples);
-    double i = low, last = i;
-    bool found;
-    shared_ptr<DataArray> source_data_array = suffix_array->GetData();
+    double i = low, last = i - 1;
     while (sample.size() < max_samples && i < high) {
       int x = suffix_array->GetSuffix(Round(i));
       int id = source_data_array->GetSentenceId(x);
+      bool found = false;
       if (blacklisted_sentence_ids.count(id)) {
-        found = false;
-        double backoff_step = 1;
-        while (true) {
-          if ((double)backoff_step >= step) break;
+        for (int backoff_step = 1; backoff_step <= step; ++backoff_step) {
           double j = i - backoff_step;
           x = suffix_array->GetSuffix(Round(j));
           id = source_data_array->GetSentenceId(x);
           if (x >= 0 && j > last && !blacklisted_sentence_ids.count(id)) {
-            found = true; last = i; break;
+            found = true;
+            last = i;
+            break;
           }
           double k = i + backoff_step;
           x = suffix_array->GetSuffix(Round(k));
           id = source_data_array->GetSentenceId(x);
-          if (k < min(i+step, (double)high) &&
+          if (k < min(i+step, (double) high) &&
               !blacklisted_sentence_ids.count(id)) {
-            found = true; last = k; break;
+            found = true;
+            last = k;
+            break;
           }
-          if (j <= last && k >= high) break;
-          backoff_step++;
         }
       } else {
         found = true;
author	Paul Baltescu <pauldb89@gmail.com>	2013-11-26 15:01:14 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-11-26 15:01:14 +0000
commit	a3826db61847a55f59bb9666f61fd1bb88888085 (patch)
tree	022475bafbf71ba6aaeb98efdbafcde24f7e60a5 /extractor/sampler.cc
parent	1cd86c44e1799c441cdcda2a022be0ee6e52d38c (diff)