summary | refs | log | tree | commit | diff
path: root/extractor/intersector.cc
diff options
context:
space:
mode:
author Paul Baltescu <pauldb89@gmail.com> 2013-02-14 23:17:15 +0000
committer Paul Baltescu <pauldb89@gmail.com> 2013-02-14 23:17:15 +0000
commit 63b30ed9c8510da8c8e2f6a456576424fddacc0e (patch)
tree 1b5278fb5a4480b7f7a965bb6de8f6f9e9c4d333 /extractor/intersector.cc
parent 0a53f7eca74c165b5ce1c238f1999ddf1febea55 (diff)
Working version of the grammar extractor.
Diffstat (limited to 'extractor/intersector.cc')
-rw-r--r-- extractor/intersector.cc 31
1 files changed, 30 insertions, 1 deletions
diff --git a/extractor/intersector.cc b/extractor/intersector.cc
index b53479af..cf42f630 100644
--- a/extractor/intersector.cc
+++ b/extractor/intersector.cc
@@ -1,5 +1,7 @@
#include "intersector.h"
+#include <chrono>
+
#include "data_array.h"
#include "matching_comparator.h"
#include "phrase.h"
@@ -9,6 +11,10 @@
#include "veb.h"
#include "vocabulary.h"
+using namespace std::chrono;
+
+typedef high_resolution_clock Clock;
+
Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
shared_ptr<Precomputation> precomputation,
shared_ptr<SuffixArray> suffix_array,
@@ -38,12 +44,22 @@ Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
ConvertIndexes(precomputation, suffix_array->GetData());
}
+Intersector::Intersector() {}
+
+Intersector::~Intersector() {}
+
void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation,
shared_ptr<DataArray> data_array) {
const Index& precomputed_index = precomputation->GetInvertedIndex();
for (pair<vector<int>, vector<int> > entry: precomputed_index) {
vector<int> phrase = ConvertPhrase(entry.first, data_array);
inverted_index[phrase] = entry.second;
+
+ phrase.push_back(vocabulary->GetNonterminalIndex(1));
+ inverted_index[phrase] = entry.second;
+ phrase.pop_back();
+ phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1));
+ inverted_index[phrase] = entry.second;
}
const Index& precomputed_collocations = precomputation->GetCollocations();
@@ -76,6 +92,9 @@ PhraseLocation Intersector::Intersect(
const Phrase& prefix, PhraseLocation& prefix_location,
const Phrase& suffix, PhraseLocation& suffix_location,
const Phrase& phrase) {
+ if (linear_merge_time == 0) {
+ linear_merger->linear_merge_time = 0;
+ }
vector<int> symbols = phrase.Get();
// We should never attempt to do an intersect query for a pattern starting or
@@ -95,17 +114,23 @@ PhraseLocation Intersector::Intersect(
shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings;
shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings;
int prefix_subpatterns = prefix_location.num_subpatterns;
- int suffix_subpatterns = prefix_location.num_subpatterns;
+ int suffix_subpatterns = suffix_location.num_subpatterns;
if (use_baeza_yates) {
+ double prev_linear_merge_time = linear_merger->linear_merge_time;
+ Clock::time_point start = Clock::now();
binary_search_merger->Merge(locations, phrase, suffix,
prefix_matchings->begin(), prefix_matchings->end(),
suffix_matchings->begin(), suffix_matchings->end(),
prefix_subpatterns, suffix_subpatterns);
+ Clock::time_point stop = Clock::now();
+ binary_merge_time += duration_cast<milliseconds>(stop - start).count() -
+ (linear_merger->linear_merge_time - prev_linear_merge_time);
} else {
linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(),
prefix_matchings->end(), suffix_matchings->begin(),
suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns);
}
+ linear_merge_time = linear_merger->linear_merge_time;
return PhraseLocation(locations, phrase.Arity() + 1);
}
@@ -116,6 +141,8 @@ void Intersector::ExtendPhraseLocation(
return;
}
+ Clock::time_point sort_start = Clock::now();
+
phrase_location.num_subpatterns = 1;
phrase_location.sa_low = phrase_location.sa_high = 0;
@@ -140,4 +167,6 @@ void Intersector::ExtendPhraseLocation(
}
phrase_location.matchings = make_shared<vector<int> >(matchings);
+ Clock::time_point sort_stop = Clock::now();
+ sort_time += duration_cast<milliseconds>(sort_stop - sort_start).count();
}