summaryrefslogtreecommitdiff
path: root/extractor/intersector.cc
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-02-22 16:52:25 +0000
committerPaul Baltescu <pauldb89@gmail.com>2013-02-22 16:52:25 +0000
commit6717daa034bfcfa429275f64d84607ae1568d488 (patch)
treec04c1546eed0e9acaa21907a5f8ab5aedc1a08fc /extractor/intersector.cc
parentbb79fdc140a5593124d3943769a644c6cc3d87f6 (diff)
parentd69c289e172562039bcbe987657280332ab6315e (diff)
Merge branch 'master' into experiment
Diffstat (limited to 'extractor/intersector.cc')
-rw-r--r--extractor/intersector.cc154
1 files changed, 0 insertions, 154 deletions
diff --git a/extractor/intersector.cc b/extractor/intersector.cc
deleted file mode 100644
index 39a7648d..00000000
--- a/extractor/intersector.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "intersector.h"
-
-#include "data_array.h"
-#include "matching_comparator.h"
-#include "phrase.h"
-#include "phrase_location.h"
-#include "precomputation.h"
-#include "suffix_array.h"
-#include "veb.h"
-#include "vocabulary.h"
-
-Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<SuffixArray> suffix_array,
- shared_ptr<MatchingComparator> comparator,
- bool use_baeza_yates) :
- vocabulary(vocabulary),
- suffix_array(suffix_array),
- use_baeza_yates(use_baeza_yates) {
- shared_ptr<DataArray> data_array = suffix_array->GetData();
- linear_merger = make_shared<LinearMerger>(vocabulary, data_array, comparator);
- binary_search_merger = make_shared<BinarySearchMerger>(
- vocabulary, linear_merger, data_array, comparator);
- ConvertIndexes(precomputation, data_array);
-}
-
-Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<SuffixArray> suffix_array,
- shared_ptr<LinearMerger> linear_merger,
- shared_ptr<BinarySearchMerger> binary_search_merger,
- bool use_baeza_yates) :
- vocabulary(vocabulary),
- suffix_array(suffix_array),
- linear_merger(linear_merger),
- binary_search_merger(binary_search_merger),
- use_baeza_yates(use_baeza_yates) {
- ConvertIndexes(precomputation, suffix_array->GetData());
-}
-
-Intersector::Intersector() {}
-
-Intersector::~Intersector() {}
-
-void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation,
- shared_ptr<DataArray> data_array) {
- const Index& precomputed_index = precomputation->GetInvertedIndex();
- for (pair<vector<int>, vector<int> > entry: precomputed_index) {
- vector<int> phrase = ConvertPhrase(entry.first, data_array);
- inverted_index[phrase] = entry.second;
-
- phrase.push_back(vocabulary->GetNonterminalIndex(1));
- inverted_index[phrase] = entry.second;
- phrase.pop_back();
- phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1));
- inverted_index[phrase] = entry.second;
- }
-
- const Index& precomputed_collocations = precomputation->GetCollocations();
- for (pair<vector<int>, vector<int> > entry: precomputed_collocations) {
- vector<int> phrase = ConvertPhrase(entry.first, data_array);
- collocations[phrase] = entry.second;
- }
-}
-
-vector<int> Intersector::ConvertPhrase(const vector<int>& old_phrase,
- shared_ptr<DataArray> data_array) {
- vector<int> new_phrase;
- new_phrase.reserve(old_phrase.size());
-
- int arity = 0;
- for (int word_id: old_phrase) {
- if (word_id == Precomputation::NON_TERMINAL) {
- ++arity;
- new_phrase.push_back(vocabulary->GetNonterminalIndex(arity));
- } else {
- new_phrase.push_back(
- vocabulary->GetTerminalIndex(data_array->GetWord(word_id)));
- }
- }
-
- return new_phrase;
-}
-
-PhraseLocation Intersector::Intersect(
- const Phrase& prefix, PhraseLocation& prefix_location,
- const Phrase& suffix, PhraseLocation& suffix_location,
- const Phrase& phrase) {
- vector<int> symbols = phrase.Get();
-
- // We should never attempt to do an intersect query for a pattern starting or
- // ending with a non terminal. The RuleFactory should handle these cases,
- // initializing the matchings list with the one for the pattern without the
- // starting or ending terminal.
- assert(vocabulary->IsTerminal(symbols.front())
- && vocabulary->IsTerminal(symbols.back()));
-
- if (collocations.count(symbols)) {
- return PhraseLocation(collocations[symbols], phrase.Arity() + 1);
- }
-
- vector<int> locations;
- ExtendPhraseLocation(prefix, prefix_location);
- ExtendPhraseLocation(suffix, suffix_location);
- shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings;
- shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings;
- int prefix_subpatterns = prefix_location.num_subpatterns;
- int suffix_subpatterns = suffix_location.num_subpatterns;
- if (use_baeza_yates) {
- binary_search_merger->Merge(locations, phrase, suffix,
- prefix_matchings->begin(), prefix_matchings->end(),
- suffix_matchings->begin(), suffix_matchings->end(),
- prefix_subpatterns, suffix_subpatterns);
- } else {
- linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(),
- prefix_matchings->end(), suffix_matchings->begin(),
- suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns);
- }
- return PhraseLocation(locations, phrase.Arity() + 1);
-}
-
-void Intersector::ExtendPhraseLocation(
- const Phrase& phrase, PhraseLocation& phrase_location) {
- int low = phrase_location.sa_low, high = phrase_location.sa_high;
- if (phrase_location.matchings != NULL) {
- return;
- }
-
-
- phrase_location.num_subpatterns = 1;
- phrase_location.sa_low = phrase_location.sa_high = 0;
-
- vector<int> symbols = phrase.Get();
- if (inverted_index.count(symbols)) {
- phrase_location.matchings =
- make_shared<vector<int> >(inverted_index[symbols]);
- return;
- }
-
- vector<int> matchings;
- matchings.reserve(high - low + 1);
- shared_ptr<VEB> veb = VEB::Create(suffix_array->GetSize());
- for (int i = low; i < high; ++i) {
- veb->Insert(suffix_array->GetSuffix(i));
- }
-
- int value = veb->GetMinimum();
- while (value != -1) {
- matchings.push_back(value);
- value = veb->GetSuccessor(value);
- }
-
- phrase_location.matchings = make_shared<vector<int> >(matchings);
-}