summaryrefslogtreecommitdiff
path: root/extractor/precomputation.cc
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-06-25 15:13:30 +0100
committerPaul Baltescu <pauldb89@gmail.com>2013-06-25 15:13:30 +0100
commit9a0a9582d38315fd83628112144077b35b5f1367 (patch)
tree27267f38981291742665f08e64204eb9b42671ef /extractor/precomputation.cc
parent23e89686849d290e8b64875a0bdf77cbdb70d2df (diff)
Reduce memory used by precomputation.
Diffstat (limited to 'extractor/precomputation.cc')
-rw-r--r--extractor/precomputation.cc138
1 files changed, 76 insertions, 62 deletions
diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc
index 3b8aed69..37dbf7b7 100644
--- a/extractor/precomputation.cc
+++ b/extractor/precomputation.cc
@@ -14,63 +14,65 @@ int Precomputation::FIRST_NONTERMINAL = -1;
int Precomputation::SECOND_NONTERMINAL = -2;
Precomputation::Precomputation(
- shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
- int num_super_frequent_patterns, int max_rule_span,
+ shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases,
+ int num_super_frequent_phrases, int max_rule_span,
int max_rule_symbols, int min_gap_size,
int max_frequent_phrase_len, int min_frequency) {
vector<int> data = suffix_array->GetData()->GetData();
- vector<vector<int>> frequent_patterns = FindMostFrequentPatterns(
- suffix_array, data, num_frequent_patterns, max_frequent_phrase_len,
+ vector<vector<int>> frequent_phrases = FindMostFrequentPhrases(
+ suffix_array, data, num_frequent_phrases, max_frequent_phrase_len,
min_frequency);
// Construct sets containing the frequent and superfrequent contiguous
// collocations.
- unordered_set<vector<int>, VectorHash> frequent_patterns_set;
- unordered_set<vector<int>, VectorHash> super_frequent_patterns_set;
- for (size_t i = 0; i < frequent_patterns.size(); ++i) {
- frequent_patterns_set.insert(frequent_patterns[i]);
- if (i < num_super_frequent_patterns) {
- super_frequent_patterns_set.insert(frequent_patterns[i]);
+ unordered_set<vector<int>, VectorHash> frequent_phrases_set;
+ unordered_set<vector<int>, VectorHash> super_frequent_phrases_set;
+ for (size_t i = 0; i < frequent_phrases.size(); ++i) {
+ frequent_phrases_set.insert(frequent_phrases[i]);
+ if (i < num_super_frequent_phrases) {
+ super_frequent_phrases_set.insert(frequent_phrases[i]);
}
}
- vector<tuple<int, int, int>> matchings;
+ vector<tuple<int, int, int>> locations;
for (size_t i = 0; i < data.size(); ++i) {
- // If the sentence is over, add all the discontiguous frequent patterns to
- // the index.
+ // If the sentence is over, add all the discontiguous frequent phrases to
+ // the list.
if (data[i] == DataArray::END_OF_LINE) {
- AddCollocations(matchings, data, max_rule_span, min_gap_size,
+ AddCollocations(locations, data, max_rule_span, min_gap_size,
max_rule_symbols);
- matchings.clear();
+ locations.clear();
continue;
}
- vector<int> pattern;
- // Find all the contiguous frequent patterns starting at position i.
+ vector<int> phrase;
+ // Find all the contiguous frequent phrases starting at position i.
for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) {
- pattern.push_back(data[i + j - 1]);
- if (frequent_patterns_set.count(pattern)) {
- int is_super_frequent = super_frequent_patterns_set.count(pattern);
- matchings.push_back(make_tuple(i, j, is_super_frequent));
+ phrase.push_back(data[i + j - 1]);
+ if (frequent_phrases_set.count(phrase)) {
+ int is_super_frequent = super_frequent_phrases_set.count(phrase);
+ locations.push_back(make_tuple(i, j, is_super_frequent));
} else {
- // If the current pattern is not frequent, any longer pattern having the
- // current pattern as prefix will not be frequent.
+ // If the current phrase is not frequent, any longer phrase having the
+ // current phrase as prefix will not be frequent.
break;
}
}
}
+
+ collocations.shrink_to_fit();
}
Precomputation::Precomputation() {}
Precomputation::~Precomputation() {}
-vector<vector<int>> Precomputation::FindMostFrequentPatterns(
+vector<vector<int>> Precomputation::FindMostFrequentPhrases(
shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
- int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) {
+ int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency) {
vector<int> lcp = suffix_array->BuildLCPArray();
vector<int> run_start(max_frequent_phrase_len);
- // Find all the patterns occurring at least min_frequency times.
+ // Find all the phrases occurring at least min_frequency times.
priority_queue<pair<int, pair<int, int>>> heap;
for (size_t i = 1; i < lcp.size(); ++i) {
for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) {
@@ -83,34 +85,34 @@ vector<vector<int>> Precomputation::FindMostFrequentPatterns(
}
}
- // Extract the most frequent patterns.
- vector<vector<int>> frequent_patterns;
- while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) {
+ // Extract the most frequent phrases.
+ vector<vector<int>> frequent_phrases;
+ while (frequent_phrases.size() < num_frequent_phrases && !heap.empty()) {
int start = heap.top().second.first;
int len = heap.top().second.second;
heap.pop();
- vector<int> pattern(data.begin() + start, data.begin() + start + len);
- if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) ==
- pattern.end()) {
- frequent_patterns.push_back(pattern);
+ vector<int> phrase(data.begin() + start, data.begin() + start + len);
+ if (find(phrase.begin(), phrase.end(), DataArray::END_OF_LINE) ==
+ phrase.end()) {
+ frequent_phrases.push_back(phrase);
}
}
- return frequent_patterns;
+ return frequent_phrases;
}
void Precomputation::AddCollocations(
- const vector<tuple<int, int, int>>& matchings, const vector<int>& data,
+ const vector<tuple<int, int, int>>& locations, const vector<int>& data,
int max_rule_span, int min_gap_size, int max_rule_symbols) {
- // Select the leftmost subpattern.
- for (size_t i = 0; i < matchings.size(); ++i) {
+ // Select the leftmost subphrase.
+ for (size_t i = 0; i < locations.size(); ++i) {
int start1, size1, is_super1;
- tie(start1, size1, is_super1) = matchings[i];
+ tie(start1, size1, is_super1) = locations[i];
- // Select the second (middle) subpattern
- for (size_t j = i + 1; j < matchings.size(); ++j) {
+ // Select the second (middle) subphrase
+ for (size_t j = i + 1; j < locations.size(); ++j) {
int start2, size2, is_super2;
- tie(start2, size2, is_super2) = matchings[j];
+ tie(start2, size2, is_super2) = locations[j];
if (start2 - start1 >= max_rule_span) {
break;
}
@@ -118,20 +120,21 @@ void Precomputation::AddCollocations(
if (start2 - start1 - size1 >= min_gap_size
&& start2 + size2 - start1 <= max_rule_span
&& size1 + size2 + 1 <= max_rule_symbols) {
- vector<int> pattern(data.begin() + start1,
+ vector<int> collocation(data.begin() + start1,
data.begin() + start1 + size1);
- pattern.push_back(Precomputation::FIRST_NONTERMINAL);
- pattern.insert(pattern.end(), data.begin() + start2,
+ collocation.push_back(Precomputation::FIRST_NONTERMINAL);
+ collocation.insert(collocation.end(), data.begin() + start2,
data.begin() + start2 + size2);
- AddStartPositions(collocations[pattern], start1, start2);
+
+ AddCollocation(collocation, GetLocation(start1, start2));
// Try extending the binary collocation to a ternary collocation.
if (is_super2) {
- pattern.push_back(Precomputation::SECOND_NONTERMINAL);
- // Select the rightmost subpattern.
- for (size_t k = j + 1; k < matchings.size(); ++k) {
+ collocation.push_back(Precomputation::SECOND_NONTERMINAL);
+ // Select the rightmost subphrase.
+ for (size_t k = j + 1; k < locations.size(); ++k) {
int start3, size3, is_super3;
- tie(start3, size3, is_super3) = matchings[k];
+ tie(start3, size3, is_super3) = locations[k];
if (start3 - start1 >= max_rule_span) {
break;
}
@@ -140,10 +143,12 @@ void Precomputation::AddCollocations(
&& start3 + size3 - start1 <= max_rule_span
&& size1 + size2 + size3 + 2 <= max_rule_symbols
&& (is_super1 || is_super3)) {
- pattern.insert(pattern.end(), data.begin() + start3,
+ collocation.insert(collocation.end(), data.begin() + start3,
data.begin() + start3 + size3);
- AddStartPositions(collocations[pattern], start1, start2, start3);
- pattern.erase(pattern.end() - size3);
+
+ AddCollocation(collocation, GetLocation(start1, start2, start3));
+
+ collocation.erase(collocation.end() - size3);
}
}
}
@@ -152,20 +157,29 @@ void Precomputation::AddCollocations(
}
}
-void Precomputation::AddStartPositions(
- vector<int>& positions, int pos1, int pos2) {
- positions.push_back(pos1);
- positions.push_back(pos2);
+vector<int> Precomputation::GetLocation(int pos1, int pos2) {
+ vector<int> location;
+ location.push_back(pos1);
+ location.push_back(pos2);
+ return location;
+}
+
+vector<int> Precomputation::GetLocation(int pos1, int pos2, int pos3) {
+ vector<int> location;
+ location.push_back(pos1);
+ location.push_back(pos2);
+ location.push_back(pos3);
+ return location;
}
-void Precomputation::AddStartPositions(
- vector<int>& positions, int pos1, int pos2, int pos3) {
- positions.push_back(pos1);
- positions.push_back(pos2);
- positions.push_back(pos3);
+void Precomputation::AddCollocation(vector<int> collocation,
+ vector<int> location) {
+ collocation.shrink_to_fit();
+ location.shrink_to_fit();
+ collocations.push_back(make_pair(collocation, location));
}
-const Index& Precomputation::GetCollocations() const {
+Collocations Precomputation::GetCollocations() const {
return collocations;
}