summary | refs | log | tree | commit | diff
path: root/extractor/intersector.cc
diff options
context:
space:
mode:
author: Paul Baltescu <pauldb89@gmail.com> 2013-02-01 16:11:10 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-02-01 16:11:10 +0000
commit: 252fb164c208ec8f3005f8a652eb3b48c0644e3d (patch)
tree: 7199cb668e77ef89c7bcccb37d70554e3b52c2a5 /extractor/intersector.cc
parent: 4ab84a0be28fdb6c0c421fe5ba5e09cfa298f2d1 (diff)
Second working commit.
Diffstat (limited to 'extractor/intersector.cc')
-rw-r--r-- extractor/intersector.cc 50
1 file changed, 32 insertions, 18 deletions
diff --git a/extractor/intersector.cc b/extractor/intersector.cc
index 9d9b54c0..b53479af 100644
--- a/extractor/intersector.cc
+++ b/extractor/intersector.cc
@@ -10,35 +10,51 @@
#include "vocabulary.h"
Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
- const Precomputation& precomputation,
+ shared_ptr<Precomputation> precomputation,
shared_ptr<SuffixArray> suffix_array,
shared_ptr<MatchingComparator> comparator,
bool use_baeza_yates) :
vocabulary(vocabulary),
suffix_array(suffix_array),
use_baeza_yates(use_baeza_yates) {
- linear_merger = make_shared<LinearMerger>(
- vocabulary, suffix_array->GetData(), comparator);
+ shared_ptr<DataArray> data_array = suffix_array->GetData();
+ linear_merger = make_shared<LinearMerger>(vocabulary, data_array, comparator);
binary_search_merger = make_shared<BinarySearchMerger>(
- vocabulary, linear_merger, suffix_array->GetData(), comparator);
+ vocabulary, linear_merger, data_array, comparator);
+ ConvertIndexes(precomputation, data_array);
+}
- shared_ptr<DataArray> source_data_array = suffix_array->GetData();
+Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
+ shared_ptr<Precomputation> precomputation,
+ shared_ptr<SuffixArray> suffix_array,
+ shared_ptr<LinearMerger> linear_merger,
+ shared_ptr<BinarySearchMerger> binary_search_merger,
+ bool use_baeza_yates) :
+ vocabulary(vocabulary),
+ suffix_array(suffix_array),
+ linear_merger(linear_merger),
+ binary_search_merger(binary_search_merger),
+ use_baeza_yates(use_baeza_yates) {
+ ConvertIndexes(precomputation, suffix_array->GetData());
+}
- const Index& precomputed_index = precomputation.GetInvertedIndex();
+void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation,
+ shared_ptr<DataArray> data_array) {
+ const Index& precomputed_index = precomputation->GetInvertedIndex();
for (pair<vector<int>, vector<int> > entry: precomputed_index) {
- vector<int> phrase = Convert(entry.first, source_data_array);
+ vector<int> phrase = ConvertPhrase(entry.first, data_array);
inverted_index[phrase] = entry.second;
}
- const Index& precomputed_collocations = precomputation.GetCollocations();
+ const Index& precomputed_collocations = precomputation->GetCollocations();
for (pair<vector<int>, vector<int> > entry: precomputed_collocations) {
- vector<int> phrase = Convert(entry.first, source_data_array);
+ vector<int> phrase = ConvertPhrase(entry.first, data_array);
collocations[phrase] = entry.second;
}
}
-vector<int> Intersector::Convert(
- const vector<int>& old_phrase, shared_ptr<DataArray> source_data_array) {
+vector<int> Intersector::ConvertPhrase(const vector<int>& old_phrase,
+ shared_ptr<DataArray> data_array) {
vector<int> new_phrase;
new_phrase.reserve(old_phrase.size());
@@ -49,7 +65,7 @@ vector<int> Intersector::Convert(
new_phrase.push_back(vocabulary->GetNonterminalIndex(arity));
} else {
new_phrase.push_back(
- vocabulary->GetTerminalIndex(source_data_array->GetWord(word_id)));
+ vocabulary->GetTerminalIndex(data_array->GetWord(word_id)));
}
}
@@ -70,8 +86,7 @@ PhraseLocation Intersector::Intersect(
&& vocabulary->IsTerminal(symbols.back()));
if (collocations.count(symbols)) {
- return PhraseLocation(make_shared<vector<int> >(collocations[symbols]),
- phrase.Arity());
+ return PhraseLocation(collocations[symbols], phrase.Arity() + 1);
}
vector<int> locations;
@@ -91,19 +106,18 @@ PhraseLocation Intersector::Intersect(
prefix_matchings->end(), suffix_matchings->begin(),
suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns);
}
- return PhraseLocation(shared_ptr<vector<int> >(new vector<int>(locations)),
- phrase.Arity() + 1);
+ return PhraseLocation(locations, phrase.Arity() + 1);
}
void Intersector::ExtendPhraseLocation(
const Phrase& phrase, PhraseLocation& phrase_location) {
int low = phrase_location.sa_low, high = phrase_location.sa_high;
- if (phrase.Arity() || phrase_location.num_subpatterns ||
- phrase_location.IsEmpty()) {
+ if (phrase_location.matchings != NULL) {
return;
}
phrase_location.num_subpatterns = 1;
+ phrase_location.sa_low = phrase_location.sa_high = 0;
vector<int> symbols = phrase.Get();
if (inverted_index.count(symbols)) {