author     Paul Baltescu <pauldb89@gmail.com>    2013-01-28 11:56:31 +0000
committer  Paul Baltescu <pauldb89@gmail.com>    2013-01-28 11:56:31 +0000
commit     5530575ae0ad939e17f08d6bd49978acea388ab7 (patch)
tree       4620a276c1c827d824e285148f4f4a5bf781ebfe /extractor/intersector.cc
parent     ce6937f136a38af93d9a5cd9628acc712da95543 (diff)
Initial working commit.
Diffstat (limited to 'extractor/intersector.cc')
-rw-r--r--  extractor/intersector.cc  129
1 file changed, 129 insertions(+), 0 deletions(-)
diff --git a/extractor/intersector.cc b/extractor/intersector.cc
new file mode 100644
index 00000000..9d9b54c0
--- /dev/null
+++ b/extractor/intersector.cc
@@ -0,0 +1,129 @@
+#include "intersector.h"
+
+#include "data_array.h"
+#include "matching_comparator.h"
+#include "phrase.h"
+#include "phrase_location.h"
+#include "precomputation.h"
+#include "suffix_array.h"
+#include "veb.h"
+#include "vocabulary.h"
+
+Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
+                         const Precomputation& precomputation,
+                         shared_ptr<SuffixArray> suffix_array,
+                         shared_ptr<MatchingComparator> comparator,
+                         bool use_baeza_yates) :
+    vocabulary(vocabulary),
+    suffix_array(suffix_array),
+    use_baeza_yates(use_baeza_yates) {
+  linear_merger = make_shared<LinearMerger>(
+      vocabulary, suffix_array->GetData(), comparator);
+  binary_search_merger = make_shared<BinarySearchMerger>(
+      vocabulary, linear_merger, suffix_array->GetData(), comparator);
+
+  shared_ptr<DataArray> source_data_array = suffix_array->GetData();
+
+  const Index& precomputed_index = precomputation.GetInvertedIndex();
+  for (pair<vector<int>, vector<int> > entry: precomputed_index) {
+    vector<int> phrase = Convert(entry.first, source_data_array);
+    inverted_index[phrase] = entry.second;
+  }
+
+  const Index& precomputed_collocations = precomputation.GetCollocations();
+  for (pair<vector<int>, vector<int> > entry: precomputed_collocations) {
+    vector<int> phrase = Convert(entry.first, source_data_array);
+    collocations[phrase] = entry.second;
+  }
+}
+
+vector<int> Intersector::Convert(
+    const vector<int>& old_phrase, shared_ptr<DataArray> source_data_array) {
+  vector<int> new_phrase;
+  new_phrase.reserve(old_phrase.size());
+
+  int arity = 0;
+  for (int word_id: old_phrase) {
+    if (word_id == Precomputation::NON_TERMINAL) {
+      ++arity;
+      new_phrase.push_back(vocabulary->GetNonterminalIndex(arity));
+    } else {
+      new_phrase.push_back(
+          vocabulary->GetTerminalIndex(source_data_array->GetWord(word_id)));
+    }
+  }
+
+  return new_phrase;
+}
+
+PhraseLocation Intersector::Intersect(
+    const Phrase& prefix, PhraseLocation& prefix_location,
+    const Phrase& suffix, PhraseLocation& suffix_location,
+    const Phrase& phrase) {
+  vector<int> symbols = phrase.Get();
+
+  // We should never attempt to do an intersect query for a pattern starting
+  // or ending with a nonterminal. The RuleFactory should handle these cases,
+  // initializing the matchings list with the one for the pattern without the
+  // starting or ending nonterminal.
+  assert(vocabulary->IsTerminal(symbols.front())
+      && vocabulary->IsTerminal(symbols.back()));
+
+  if (collocations.count(symbols)) {
+    return PhraseLocation(make_shared<vector<int> >(collocations[symbols]),
+                          phrase.Arity());
+  }
+
+  vector<int> locations;
+  ExtendPhraseLocation(prefix, prefix_location);
+  ExtendPhraseLocation(suffix, suffix_location);
+  shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings;
+  shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings;
+  int prefix_subpatterns = prefix_location.num_subpatterns;
+  int suffix_subpatterns = suffix_location.num_subpatterns;
+  if (use_baeza_yates) {
+    binary_search_merger->Merge(locations, phrase, suffix,
+        prefix_matchings->begin(), prefix_matchings->end(),
+        suffix_matchings->begin(), suffix_matchings->end(),
+        prefix_subpatterns, suffix_subpatterns);
+  } else {
+    linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(),
+        prefix_matchings->end(), suffix_matchings->begin(),
+        suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns);
+  }
+  return PhraseLocation(shared_ptr<vector<int> >(new vector<int>(locations)),
+                        phrase.Arity() + 1);
+}
+
+void Intersector::ExtendPhraseLocation(
+    const Phrase& phrase, PhraseLocation& phrase_location) {
+  int low = phrase_location.sa_low, high = phrase_location.sa_high;
+  if (phrase.Arity() || phrase_location.num_subpatterns ||
+      phrase_location.IsEmpty()) {
+    return;
+  }
+
+  phrase_location.num_subpatterns = 1;
+
+  vector<int> symbols = phrase.Get();
+  if (inverted_index.count(symbols)) {
+    phrase_location.matchings =
+        make_shared<vector<int> >(inverted_index[symbols]);
+    return;
+  }
+
+  vector<int> matchings;
+  matchings.reserve(high - low + 1);
+  shared_ptr<VEB> veb = VEB::Create(suffix_array->GetSize());
+  for (int i = low; i < high; ++i) {
+    veb->Insert(suffix_array->GetSuffix(i));
+  }
+
+  int value = veb->GetMinimum();
+  while (value != -1) {
+    matchings.push_back(value);
+    value = veb->GetSuccessor(value);
+  }
+
+  phrase_location.matchings = make_shared<vector<int> >(matchings);
+}
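
The heart of the new class is the merge in Intersect: the occurrence lists of the prefix and the suffix are both sorted by corpus position, and the full pattern's occurrences are exactly the pairs that line up. As a minimal sketch of the linear-merge idea (a hypothetical helper, not the actual LinearMerger::Merge signature; it assumes the simplest case of two contiguous subpatterns where the suffix starts one word after the prefix), a two-pointer walk over both sorted lists suffices:

```cpp
#include <cstddef>
#include <vector>

using namespace std;

// Hypothetical sketch of a linear merge over two sorted matchings lists.
// prefix_matchings / suffix_matchings hold corpus start positions; a full
// phrase occurrence exists wherever the suffix starts one word after the
// prefix. The real LinearMerger also handles gaps from nonterminals.
vector<int> LinearIntersect(const vector<int>& prefix_matchings,
                            const vector<int>& suffix_matchings) {
  vector<int> result;
  size_t i = 0, j = 0;
  while (i < prefix_matchings.size() && j < suffix_matchings.size()) {
    int wanted = prefix_matchings[i] + 1;  // where the suffix must start
    if (suffix_matchings[j] < wanted) {
      ++j;  // suffix occurrence too early; advance it
    } else if (suffix_matchings[j] > wanted) {
      ++i;  // nothing lines up with this prefix occurrence
    } else {
      result.push_back(prefix_matchings[i]);  // full phrase occurs here
      ++i;
      ++j;
    }
  }
  return result;
}
```

The linear walk is O(m + n) in the two list lengths. The Baeza-Yates-style BinarySearchMerger selected by use_baeza_yates instead binary-searches the longer list for each candidate from the shorter one, which pays off when the two lists differ greatly in size.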

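ExtendPhraseLocation materializes a suffix-array range [sa_low, sa_high) into an explicit matchings list sorted by corpus position; the van Emde Boas tree acts purely as a fast sorter here, since the suffix array orders occurrences lexicographically rather than positionally. For phrases already covered by the precomputed inverted index, the function skips this work and reuses the stored matchings. A std::set stand-in (illustration only; the diff's VEB class supports the same Insert/GetMinimum/GetSuccessor traversal in O(log log u) per operation for a corpus of size u) computes the same result:

```cpp
#include <set>
#include <vector>

using namespace std;

// Illustration only: std::set stands in for the VEB class in this diff.
// Input: suffix array entries in [sa_low, sa_high), i.e. corpus positions
// ordered lexicographically by the suffixes that start there.
// Output: the same positions in increasing corpus order, which is the
// order the mergers in Intersect expect.
vector<int> ExtractSortedMatchings(const vector<int>& suffixes_in_range) {
  set<int> ordered(suffixes_in_range.begin(), suffixes_in_range.end());
  // Iterating the set in order mirrors the GetMinimum/GetSuccessor loop.
  return vector<int>(ordered.begin(), ordered.end());
}
```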