From 252fb164c208ec8f3005f8a652eb3b48c0644e3d Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Fri, 1 Feb 2013 16:11:10 +0000 Subject: Second working commit. --- extractor/intersector.cc | 50 +++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'extractor/intersector.cc') diff --git a/extractor/intersector.cc b/extractor/intersector.cc index 9d9b54c0..b53479af 100644 --- a/extractor/intersector.cc +++ b/extractor/intersector.cc @@ -10,35 +10,51 @@ #include "vocabulary.h" Intersector::Intersector(shared_ptr vocabulary, - const Precomputation& precomputation, + shared_ptr precomputation, shared_ptr suffix_array, shared_ptr comparator, bool use_baeza_yates) : vocabulary(vocabulary), suffix_array(suffix_array), use_baeza_yates(use_baeza_yates) { - linear_merger = make_shared( - vocabulary, suffix_array->GetData(), comparator); + shared_ptr data_array = suffix_array->GetData(); + linear_merger = make_shared(vocabulary, data_array, comparator); binary_search_merger = make_shared( - vocabulary, linear_merger, suffix_array->GetData(), comparator); + vocabulary, linear_merger, data_array, comparator); + ConvertIndexes(precomputation, data_array); +} - shared_ptr source_data_array = suffix_array->GetData(); +Intersector::Intersector(shared_ptr vocabulary, + shared_ptr precomputation, + shared_ptr suffix_array, + shared_ptr linear_merger, + shared_ptr binary_search_merger, + bool use_baeza_yates) : + vocabulary(vocabulary), + suffix_array(suffix_array), + linear_merger(linear_merger), + binary_search_merger(binary_search_merger), + use_baeza_yates(use_baeza_yates) { + ConvertIndexes(precomputation, suffix_array->GetData()); +} - const Index& precomputed_index = precomputation.GetInvertedIndex(); +void Intersector::ConvertIndexes(shared_ptr precomputation, + shared_ptr data_array) { + const Index& precomputed_index = precomputation->GetInvertedIndex(); for (pair, vector > entry: precomputed_index) { - vector phrase = Convert(entry.first, source_data_array); + vector phrase = ConvertPhrase(entry.first, data_array); inverted_index[phrase] = entry.second; } - const Index& precomputed_collocations = precomputation.GetCollocations(); + const Index& precomputed_collocations = precomputation->GetCollocations(); for (pair, vector > entry: precomputed_collocations) { - vector phrase = Convert(entry.first, source_data_array); + vector phrase = ConvertPhrase(entry.first, data_array); collocations[phrase] = entry.second; } } -vector Intersector::Convert( - const vector& old_phrase, shared_ptr source_data_array) { +vector Intersector::ConvertPhrase(const vector& old_phrase, + shared_ptr data_array) { vector new_phrase; new_phrase.reserve(old_phrase.size()); @@ -49,7 +65,7 @@ vector Intersector::Convert( new_phrase.push_back(vocabulary->GetNonterminalIndex(arity)); } else { new_phrase.push_back( - vocabulary->GetTerminalIndex(source_data_array->GetWord(word_id))); + vocabulary->GetTerminalIndex(data_array->GetWord(word_id))); } } @@ -70,8 +86,7 @@ PhraseLocation Intersector::Intersect( && vocabulary->IsTerminal(symbols.back())); if (collocations.count(symbols)) { - return PhraseLocation(make_shared >(collocations[symbols]), - phrase.Arity()); + return PhraseLocation(collocations[symbols], phrase.Arity() + 1); } vector locations; @@ -91,19 +106,18 @@ PhraseLocation Intersector::Intersect( prefix_matchings->end(), suffix_matchings->begin(), suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns); } - return PhraseLocation(shared_ptr >(new vector(locations)), - phrase.Arity() + 1); + return PhraseLocation(locations, phrase.Arity() + 1); } void Intersector::ExtendPhraseLocation( const Phrase& phrase, PhraseLocation& phrase_location) { int low = phrase_location.sa_low, high = phrase_location.sa_high; - if (phrase.Arity() || phrase_location.num_subpatterns || - phrase_location.IsEmpty()) { + if (phrase_location.matchings != NULL) { return; } phrase_location.num_subpatterns = 1; + phrase_location.sa_low = phrase_location.sa_high = 0; vector symbols = phrase.Get(); if (inverted_index.count(symbols)) { -- cgit v1.2.3