diff options
Diffstat (limited to 'extractor/precomputation.h')
-rw-r--r-- | extractor/precomputation.h | 21 |
1 files changed, 21 insertions, 0 deletions
diff --git a/extractor/precomputation.h b/extractor/precomputation.h index 2c1eccf8..e3c4d26a 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -20,8 +20,19 @@ typedef unordered_map<vector<int>, vector<int>, VectorHash> Index; class SuffixArray; +/** + * Data structure wrapping an index with all the occurrences of the most + * frequent discontiguous collocations in the source data. + * + * Let a, b, c be contiguous collocations. The index will contain an entry for + * every collocation of the form: + * - aXb, where a and b are frequent + * - aXbXc, where a and b are super-frequent and c is frequent or + * b and c are super-frequent and a is frequent. + */ class Precomputation { public: + // Constructs the index using the suffix array. Precomputation( shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, int num_super_frequent_patterns, int max_rule_span, @@ -32,6 +43,7 @@ class Precomputation { void WriteBinary(const fs::path& filepath) const; + // Returns a reference to the index. virtual const Index& GetCollocations() const; static int FIRST_NONTERMINAL; @@ -41,14 +53,23 @@ class Precomputation { Precomputation(); private: + // Finds the most frequent contiguous collocations. vector<vector<int> > FindMostFrequentPatterns( shared_ptr<SuffixArray> suffix_array, const vector<int>& data, int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency); + + // Given the locations of the frequent contiguous collocations in a sentence, + // it adds new entries to the index for each discontiguous collocation + // matching the criteria specified in the class description. void AddCollocations( const vector<std::tuple<int, int, int> >& matchings, const vector<int>& data, int max_rule_span, int min_gap_size, int max_rule_symbols); + + // Adds an occurrence of a binary collocation. void AddStartPositions(vector<int>& positions, int pos1, int pos2); + + // Adds an occurrence of a ternary collocation. void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3); Index collocations; |