summary | refs | log | tree | commit | diff
path: root/extractor/precomputation.h
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/precomputation.h')
-rw-r--r-- extractor/precomputation.h 49
1 files changed, 20 insertions, 29 deletions
diff --git a/extractor/precomputation.h b/extractor/precomputation.h
index 0a06349b..9f0c9424 100644
--- a/extractor/precomputation.h
+++ b/extractor/precomputation.h
@@ -19,18 +19,16 @@ using namespace std;
namespace extractor {
typedef boost::hash<vector<int>> VectorHash;
-typedef vector<pair<vector<int>, vector<int>>> Collocations;
+typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
class SuffixArray;
/**
- * Data structure containing all the data needed for constructing an index with
- * all the occurrences of the most frequent discontiguous collocations in the
- * source data.
+ * Data structure wrapping an index with all the occurrences of the most
+ * frequent discontiguous collocations in the source data.
*
- * Let a, b, c be contiguous phrases. The data structure will contain the
- * locations in the source data where every collocation of the following forms
- * occurs:
+ * Let a, b, c be contiguous collocations. The index will contain an entry for
+ * every collocation of the form:
* - aXb, where a and b are frequent
* - aXbXc, where a and b are super-frequent and c is frequent or
* b and c are super-frequent and a is frequent.
@@ -39,8 +37,8 @@ class Precomputation {
public:
// Constructs the index using the suffix array.
Precomputation(
- shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases,
- int num_super_frequent_phrases, int max_rule_span,
+ shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
+ int num_super_frequent_patterns, int max_rule_span,
int max_rule_symbols, int min_gap_size,
int max_frequent_phrase_len, int min_frequency);
@@ -49,9 +47,8 @@ class Precomputation {
virtual ~Precomputation();
- // Returns the list of the locations of the most frequent collocations in the
- // source data.
- virtual Collocations GetCollocations() const;
+ // Returns a reference to the index.
+ virtual const Index& GetCollocations() const;
bool operator==(const Precomputation& other) const;
@@ -60,29 +57,23 @@ class Precomputation {
private:
// Finds the most frequent contiguous collocations.
- vector<vector<int>> FindMostFrequentPhrases(
+ vector<vector<int>> FindMostFrequentPatterns(
shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
- int num_frequent_phrases, int max_frequent_phrase_len,
+ int num_frequent_patterns, int max_frequent_phrase_len,
int min_frequency);
// Given the locations of the frequent contiguous collocations in a sentence,
// it adds new entries to the index for each discontiguous collocation
// matching the criteria specified in the class description.
- void AddCollocations(const vector<std::tuple<int, int, int>>& locations,
- const vector<int>& data, int max_rule_span,
- int min_gap_size, int max_rule_symbols);
+ void AddCollocations(
+ const vector<std::tuple<int, int, int>>& matchings, const vector<int>& data,
+ int max_rule_span, int min_gap_size, int max_rule_symbols);
- // Creates a vector representation for the location of a binary collocation
- // containing the starting points of each subpattern.
- vector<int> GetLocation(int pos1, int pos2);
+ // Adds an occurrence of a binary collocation.
+ void AddStartPositions(vector<int>& positions, int pos1, int pos2);
- // Creates a vector representation for the location of a ternary collocation
- // containing the starting points of each subpattern.
- vector<int> GetLocation(int pos1, int pos2, int pos3);
-
- // Appends a collocation to the list of collocations after shrinking the
- // vectors to avoid unnecessary memory usage.
- void AddCollocation(vector<int> collocation, vector<int> location);
+ // Adds an occurrence of a ternary collocation.
+ void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3);
friend class boost::serialization::access;
@@ -100,13 +91,13 @@ class Precomputation {
for (size_t i = 0; i < num_entries; ++i) {
pair<vector<int>, vector<int>> entry;
ar >> entry;
- collocations.push_back(entry);
+ collocations.insert(entry);
}
}
BOOST_SERIALIZATION_SPLIT_MEMBER();
- Collocations collocations;
+ Index collocations;
};
} // namespace extractor