diff options
Diffstat (limited to 'extractor/precomputation.h')
-rw-r--r-- | extractor/precomputation.h | 45 |
1 files changed, 25 insertions, 20 deletions
diff --git a/extractor/precomputation.h b/extractor/precomputation.h index e5fa3e37..6ade58df 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,7 +19,9 @@ namespace extractor { typedef boost::hash<vector<int>> VectorHash; typedef unordered_map<vector<int>, vector<int>, VectorHash> Index; +class DataArray; class SuffixArray; +class Vocabulary; /** * Data structure wrapping an index with all the occurrences of the most @@ -35,9 +37,9 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, - int max_rule_symbols, int min_gap_size, + shared_ptr<Vocabulary> vocabulary, shared_ptr<SuffixArray> suffix_array, + int num_frequent_patterns, int num_super_frequent_patterns, + int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); // Creates empty precomputation data structure. @@ -45,40 +47,43 @@ class Precomputation { virtual ~Precomputation(); - // Returns a reference to the index. - virtual const Index& GetCollocations() const; + // Returns whether a pattern is contained in the index of collocations. + virtual bool Contains(const vector<int>& pattern) const; + + // Returns the list of collocations for a given pattern. + virtual vector<int> GetCollocations(const vector<int>& pattern) const; bool operator==(const Precomputation& other) const; - static int FIRST_NONTERMINAL; - static int SECOND_NONTERMINAL; + static int NONTERMINAL; private: // Finds the most frequent contiguous collocations. vector<vector<int>> FindMostFrequentPatterns( - shared_ptr<SuffixArray> suffix_array, const vector<int>& data, - int num_frequent_patterns, int max_frequent_phrase_len, - int min_frequency); + shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, + int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations( - const vector<std::tuple<int, int, int>>& matchings, const vector<int>& data, + void UpdateIndex( + shared_ptr<DataArray> data_array, shared_ptr<Vocabulary> vocabulary, + const vector<tuple<int, int, int>>& matchings, int max_rule_span, int min_gap_size, int max_rule_symbols); - // Adds an occurrence of a binary collocation. - void AddStartPositions(vector<int>& positions, int pos1, int pos2); + void AppendSubpattern( + vector<int>& pattern, shared_ptr<DataArray> data_array, + shared_ptr<Vocabulary> vocabulary, int start, int size); - // Adds an occurrence of a ternary collocation. - void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3); + // Adds an occurrence of a collocation. + void AppendCollocation(vector<int>& collocations, const vector<int>& collocation); friend class boost::serialization::access; template<class Archive> void save(Archive& ar, unsigned int) const { - int num_entries = collocations.size(); + int num_entries = index.size(); ar << num_entries; - for (pair<vector<int>, vector<int>> entry: collocations) { + for (pair<vector<int>, vector<int>> entry: index) { ar << entry; } } @@ -89,13 +94,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair<vector<int>, vector<int>> entry; ar >> entry; - collocations.insert(entry); + index.insert(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Index collocations; + Index index; }; } // namespace extractor |