diff options
Diffstat (limited to 'extractor/precomputation.cc')
-rw-r--r-- | extractor/precomputation.cc | 110 |
1 files changed, 66 insertions, 44 deletions
diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index 38d8f489..b79daae3 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -5,60 +5,67 @@ #include "data_array.h" #include "suffix_array.h" +#include "time_util.h" #include "vocabulary.h" using namespace std; namespace extractor { -int Precomputation::NONTERMINAL = -1; - Precomputation::Precomputation( shared_ptr<Vocabulary> vocabulary, shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, int num_super_frequent_patterns, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency) { + Clock::time_point start_time = Clock::now(); + shared_ptr<DataArray> data_array = suffix_array->GetData(); + vector<int> data = data_array->GetData(); vector<vector<int>> frequent_patterns = FindMostFrequentPatterns( - suffix_array, num_frequent_patterns, max_frequent_phrase_len, + suffix_array, data, num_frequent_patterns, max_frequent_phrase_len, min_frequency); + Clock::time_point end_time = Clock::now(); + cerr << "Finding most frequent patterns took " + << GetDuration(start_time, end_time) << " seconds..." << endl; - // Construct sets containing the frequent and superfrequent contiguous - // collocations. - unordered_set<vector<int>, VectorHash> frequent_patterns_set; - unordered_set<vector<int>, VectorHash> super_frequent_patterns_set; + vector<vector<int>> pattern_annotations(frequent_patterns.size()); + unordered_map<vector<int>, int, VectorHash> frequent_patterns_index; for (size_t i = 0; i < frequent_patterns.size(); ++i) { - frequent_patterns_set.insert(frequent_patterns[i]); - if (i < num_super_frequent_patterns) { - super_frequent_patterns_set.insert(frequent_patterns[i]); - } + frequent_patterns_index[frequent_patterns[i]] = i; + pattern_annotations[i] = AnnotatePattern(vocabulary, data_array, + frequent_patterns[i]); } - shared_ptr<DataArray> data_array = suffix_array->GetData(); + start_time = Clock::now(); vector<tuple<int, int, int>> matchings; - for (size_t i = 0; i < data_array->GetSize(); ++i) { + vector<vector<int>> annotations; + for (size_t i = 0; i < data.size(); ++i) { // If the sentence is over, add all the discontiguous frequent patterns to // the index. - if (data_array->AtIndex(i) == DataArray::END_OF_LINE) { - UpdateIndex(data_array, vocabulary, matchings, max_rule_span, - min_gap_size, max_rule_symbols); + if (data[i] == DataArray::END_OF_LINE) { + UpdateIndex(matchings, annotations, max_rule_span, min_gap_size, + max_rule_symbols); matchings.clear(); + annotations.clear(); continue; } // Find all the contiguous frequent patterns starting at position i. vector<int> pattern; - for (int j = 1; - j <= max_frequent_phrase_len && i + j <= data_array->GetSize(); - ++j) { - pattern.push_back(data_array->AtIndex(i + j - 1)); - if (!frequent_patterns_set.count(pattern)) { + for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) { + pattern.push_back(data[i + j - 1]); + auto it = frequent_patterns_index.find(pattern); + if (it == frequent_patterns_index.end()) { // If the current pattern is not frequent, any longer pattern having the // current pattern as prefix will not be frequent. break; } - int is_super_frequent = super_frequent_patterns_set.count(pattern); + int is_super_frequent = it->second < num_super_frequent_patterns; matchings.push_back(make_tuple(i, j, is_super_frequent)); + annotations.push_back(pattern_annotations[it->second]); } } + end_time = Clock::now(); + cerr << "Constructing collocations index took " + << GetDuration(start_time, end_time) << " seconds..." << endl; } Precomputation::Precomputation() {} @@ -66,8 +73,8 @@ Precomputation::Precomputation() {} Precomputation::~Precomputation() {} vector<vector<int>> Precomputation::FindMostFrequentPatterns( - shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, - int max_frequent_phrase_len, int min_frequency) { + shared_ptr<SuffixArray> suffix_array, const vector<int>& data, + int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) { vector<int> lcp = suffix_array->BuildLCPArray(); vector<int> run_start(max_frequent_phrase_len); @@ -76,9 +83,9 @@ vector<vector<int>> Precomputation::FindMostFrequentPatterns( for (size_t i = 1; i < lcp.size(); ++i) { for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) { int frequency = i - run_start[len]; - if (frequency >= min_frequency) { - heap.push(make_pair(frequency, - make_pair(suffix_array->GetSuffix(run_start[len]), len + 1))); + int start = suffix_array->GetSuffix(run_start[len]); + if (frequency >= min_frequency && start + len <= data.size()) { + heap.push(make_pair(frequency, make_pair(start, len + 1))); } run_start[len] = i; } @@ -101,9 +108,20 @@ vector<vector<int>> Precomputation::FindMostFrequentPatterns( return frequent_patterns; } +vector<int> Precomputation::AnnotatePattern( + shared_ptr<Vocabulary> vocabulary, shared_ptr<DataArray> data_array, + const vector<int>& pattern) const { + vector<int> annotation; + for (int word_id: pattern) { + annotation.push_back(vocabulary->GetTerminalIndex( + data_array->GetWord(word_id))); + } + return annotation; +} + void Precomputation::UpdateIndex( - shared_ptr<DataArray> data_array, shared_ptr<Vocabulary> vocabulary, const vector<tuple<int, int, int>>& matchings, + const vector<vector<int>>& annotations, int max_rule_span, int min_gap_size, int max_rule_symbols) { // Select the leftmost subpattern. for (size_t i = 0; i < matchings.size(); ++i) { @@ -121,15 +139,14 @@ void Precomputation::UpdateIndex( if (start2 - start1 - size1 >= min_gap_size && start2 + size2 - start1 <= max_rule_span && size1 + size2 + 1 <= max_rule_symbols) { - vector<int> pattern; - AppendSubpattern(pattern, data_array, vocabulary, start1, size1); - pattern.push_back(Precomputation::NONTERMINAL); - AppendSubpattern(pattern, data_array, vocabulary, start2, size2); - AppendCollocation(index[pattern], {start1, start2}); + vector<int> pattern = annotations[i]; + pattern.push_back(-1); + AppendSubpattern(pattern, annotations[j]); + AppendCollocation(index[pattern], start1, start2); // Try extending the binary collocation to a ternary collocation. if (is_super2) { - pattern.push_back(Precomputation::NONTERMINAL); + pattern.push_back(-2); // Select the rightmost subpattern. for (size_t k = j + 1; k < matchings.size(); ++k) { int start3, size3, is_super3; @@ -142,8 +159,8 @@ void Precomputation::UpdateIndex( && start3 + size3 - start1 <= max_rule_span && size1 + size2 + size3 + 2 <= max_rule_symbols && (is_super1 || is_super3)) { - AppendSubpattern(pattern, data_array, vocabulary, start3, size3); - AppendCollocation(index[pattern], {start1, start2, start3}); + AppendSubpattern(pattern, annotations[k]); + AppendCollocation(index[pattern], start1, start2, start3); pattern.erase(pattern.end() - size3); } } @@ -154,17 +171,22 @@ void Precomputation::UpdateIndex( } void Precomputation::AppendSubpattern( - vector<int>& pattern, shared_ptr<DataArray> data_array, - shared_ptr<Vocabulary> vocabulary, int start, int size) { - vector<string> words = data_array->GetWords(start, size); - for (const string& word: words) { - pattern.push_back(vocabulary->GetTerminalIndex(word)); - } + vector<int>& pattern, + const vector<int>& subpattern) { + copy(subpattern.begin(), subpattern.end(), back_inserter(pattern)); +} + +void Precomputation::AppendCollocation( + vector<int>& collocations, int pos1, int pos2) { + collocations.push_back(pos1); + collocations.push_back(pos2); } void Precomputation::AppendCollocation( - vector<int>& collocations, const vector<int>& collocation) { - copy(collocation.begin(), collocation.end(), back_inserter(collocations)); + vector<int>& collocations, int pos1, int pos2, int pos3) { + collocations.push_back(pos1); + collocations.push_back(pos2); + collocations.push_back(pos3); } bool Precomputation::Contains(const vector<int>& pattern) const { |