Working version of the grammar extractor.

author: Paul Baltescu <pauldb89@gmail.com> 2013-02-14 23:17:15 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-02-14 23:17:15 +0000
commit: 63b30ed9c8510da8c8e2f6a456576424fddacc0e (patch)
tree: 1b5278fb5a4480b7f7a965bb6de8f6f9e9c4d333 /extractor/intersector.cc
parent: 0a53f7eca74c165b5ce1c238f1999ddf1febea55 (diff)
1 file changed, 30 insertions, 1 deletion
diff --git a/extractor/intersector.cc b/extractor/intersector.cc
index b53479af..cf42f630 100644
--- a/extractor/intersector.cc
+++ b/extractor/intersector.cc
@@ -1,5 +1,7 @@
 #include "intersector.h"
 
+#include <chrono>
+
 #include "data_array.h"
 #include "matching_comparator.h"
 #include "phrase.h"
@@ -9,6 +11,10 @@
 #include "veb.h"
 #include "vocabulary.h"
 
+using namespace std::chrono;
+
+typedef high_resolution_clock Clock;
+
 Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
                          shared_ptr<Precomputation> precomputation,
                          shared_ptr<SuffixArray> suffix_array,
@@ -38,12 +44,22 @@ Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
   ConvertIndexes(precomputation, suffix_array->GetData());
 }
 
+Intersector::Intersector() {}
+
+Intersector::~Intersector() {}
+
 void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation,
                                  shared_ptr<DataArray> data_array) {
   const Index& precomputed_index = precomputation->GetInvertedIndex();
   for (pair<vector<int>, vector<int> > entry: precomputed_index) {
     vector<int> phrase = ConvertPhrase(entry.first, data_array);
     inverted_index[phrase] = entry.second;
+
+    phrase.push_back(vocabulary->GetNonterminalIndex(1));
+    inverted_index[phrase] = entry.second;
+    phrase.pop_back();
+    phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1));
+    inverted_index[phrase] = entry.second;
   }
 
   const Index& precomputed_collocations = precomputation->GetCollocations();
@@ -76,6 +92,9 @@ PhraseLocation Intersector::Intersect(
     const Phrase& prefix, PhraseLocation& prefix_location,
     const Phrase& suffix, PhraseLocation& suffix_location,
     const Phrase& phrase) {
+  if (linear_merge_time == 0) {
+    linear_merger->linear_merge_time = 0;
+  }
   vector<int> symbols = phrase.Get();
 
   // We should never attempt to do an intersect query for a pattern starting or
@@ -95,17 +114,23 @@ PhraseLocation Intersector::Intersect(
   shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings;
   shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings;
   int prefix_subpatterns = prefix_location.num_subpatterns;
-  int suffix_subpatterns = prefix_location.num_subpatterns;
+  int suffix_subpatterns = suffix_location.num_subpatterns;
   if (use_baeza_yates) {
+    double prev_linear_merge_time = linear_merger->linear_merge_time;
+    Clock::time_point start = Clock::now();
     binary_search_merger->Merge(locations, phrase, suffix,
         prefix_matchings->begin(), prefix_matchings->end(),
         suffix_matchings->begin(), suffix_matchings->end(),
         prefix_subpatterns, suffix_subpatterns);
+    Clock::time_point stop = Clock::now();
+    binary_merge_time += duration_cast<milliseconds>(stop - start).count() -
+        (linear_merger->linear_merge_time - prev_linear_merge_time);
   } else {
     linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(),
         prefix_matchings->end(), suffix_matchings->begin(),
         suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns);
   }
+  linear_merge_time = linear_merger->linear_merge_time;
   return PhraseLocation(locations, phrase.Arity() + 1);
 }
 
@@ -116,6 +141,8 @@ void Intersector::ExtendPhraseLocation(
     return;
   }
 
+  Clock::time_point sort_start = Clock::now();
+
   phrase_location.num_subpatterns = 1;
   phrase_location.sa_low = phrase_location.sa_high = 0;
 
@@ -140,4 +167,6 @@ void Intersector::ExtendPhraseLocation(
   }
 
   phrase_location.matchings = make_shared<vector<int> >(matchings);
+  Clock::time_point sort_stop = Clock::now();
+  sort_time += duration_cast<milliseconds>(sort_stop - sort_start).count();
 }
author	Paul Baltescu <pauldb89@gmail.com>	2013-02-14 23:17:15 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-02-14 23:17:15 +0000
commit	63b30ed9c8510da8c8e2f6a456576424fddacc0e (patch)
tree	1b5278fb5a4480b7f7a965bb6de8f6f9e9c4d333 /extractor/intersector.cc
parent	0a53f7eca74c165b5ce1c238f1999ddf1febea55 (diff)