diff options
Diffstat (limited to 'extractor/intersector.cc')
-rw-r--r-- | extractor/intersector.cc | 31 |
1 files changed, 30 insertions, 1 deletions
diff --git a/extractor/intersector.cc b/extractor/intersector.cc index b53479af..cf42f630 100644 --- a/extractor/intersector.cc +++ b/extractor/intersector.cc @@ -1,5 +1,7 @@ #include "intersector.h" +#include <chrono> + #include "data_array.h" #include "matching_comparator.h" #include "phrase.h" @@ -9,6 +11,10 @@ #include "veb.h" #include "vocabulary.h" +using namespace std::chrono; + +typedef high_resolution_clock Clock; + Intersector::Intersector(shared_ptr<Vocabulary> vocabulary, shared_ptr<Precomputation> precomputation, shared_ptr<SuffixArray> suffix_array, @@ -38,12 +44,22 @@ Intersector::Intersector(shared_ptr<Vocabulary> vocabulary, ConvertIndexes(precomputation, suffix_array->GetData()); } +Intersector::Intersector() {} + +Intersector::~Intersector() {} + void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation, shared_ptr<DataArray> data_array) { const Index& precomputed_index = precomputation->GetInvertedIndex(); for (pair<vector<int>, vector<int> > entry: precomputed_index) { vector<int> phrase = ConvertPhrase(entry.first, data_array); inverted_index[phrase] = entry.second; + + phrase.push_back(vocabulary->GetNonterminalIndex(1)); + inverted_index[phrase] = entry.second; + phrase.pop_back(); + phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1)); + inverted_index[phrase] = entry.second; } const Index& precomputed_collocations = precomputation->GetCollocations(); @@ -76,6 +92,9 @@ PhraseLocation Intersector::Intersect( const Phrase& prefix, PhraseLocation& prefix_location, const Phrase& suffix, PhraseLocation& suffix_location, const Phrase& phrase) { + if (linear_merge_time == 0) { + linear_merger->linear_merge_time = 0; + } vector<int> symbols = phrase.Get(); // We should never attempt to do an intersect query for a pattern starting or @@ -95,17 +114,23 @@ PhraseLocation Intersector::Intersect( shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings; shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings; int prefix_subpatterns = prefix_location.num_subpatterns; - int suffix_subpatterns = prefix_location.num_subpatterns; + int suffix_subpatterns = suffix_location.num_subpatterns; if (use_baeza_yates) { + double prev_linear_merge_time = linear_merger->linear_merge_time; + Clock::time_point start = Clock::now(); binary_search_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(), prefix_matchings->end(), suffix_matchings->begin(), suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns); + Clock::time_point stop = Clock::now(); + binary_merge_time += duration_cast<milliseconds>(stop - start).count() - + (linear_merger->linear_merge_time - prev_linear_merge_time); } else { linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(), prefix_matchings->end(), suffix_matchings->begin(), suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns); } + linear_merge_time = linear_merger->linear_merge_time; return PhraseLocation(locations, phrase.Arity() + 1); } @@ -116,6 +141,8 @@ void Intersector::ExtendPhraseLocation( return; } + Clock::time_point sort_start = Clock::now(); + phrase_location.num_subpatterns = 1; phrase_location.sa_low = phrase_location.sa_high = 0; @@ -140,4 +167,6 @@ void Intersector::ExtendPhraseLocation( } phrase_location.matchings = make_shared<vector<int> >(matchings); + Clock::time_point sort_stop = Clock::now(); + sort_time += duration_cast<milliseconds>(sort_stop - sort_start).count(); } |