From f528ac27dab11770f01595b043675dba2947a263 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sun, 24 Nov 2013 13:19:28 +0000 Subject: Reduce memory overhead for constructing the intersector. --- extractor/precomputation.h | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'extractor/precomputation.h') diff --git a/extractor/precomputation.h b/extractor/precomputation.h index e5fa3e37..6ade58df 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,7 +19,9 @@ namespace extractor { typedef boost::hash> VectorHash; typedef unordered_map, vector, VectorHash> Index; +class DataArray; class SuffixArray; +class Vocabulary; /** * Data structure wrapping an index with all the occurrences of the most @@ -35,9 +37,9 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, - int max_rule_symbols, int min_gap_size, + shared_ptr vocabulary, shared_ptr suffix_array, + int num_frequent_patterns, int num_super_frequent_patterns, + int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); // Creates empty precomputation data structure. @@ -45,40 +47,43 @@ class Precomputation { virtual ~Precomputation(); - // Returns a reference to the index. - virtual const Index& GetCollocations() const; + // Returns whether a pattern is contained in the index of collocations. + virtual bool Contains(const vector& pattern) const; + + // Returns the list of collocations for a given pattern. + virtual vector GetCollocations(const vector& pattern) const; bool operator==(const Precomputation& other) const; - static int FIRST_NONTERMINAL; - static int SECOND_NONTERMINAL; + static int NONTERMINAL; private: // Finds the most frequent contiguous collocations. vector> FindMostFrequentPatterns( - shared_ptr suffix_array, const vector& data, - int num_frequent_patterns, int max_frequent_phrase_len, - int min_frequency); + shared_ptr suffix_array, int num_frequent_patterns, + int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations( - const vector>& matchings, const vector& data, + void UpdateIndex( + shared_ptr data_array, shared_ptr vocabulary, + const vector>& matchings, int max_rule_span, int min_gap_size, int max_rule_symbols); - // Adds an occurrence of a binary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2); + void AppendSubpattern( + vector& pattern, shared_ptr data_array, + shared_ptr vocabulary, int start, int size); - // Adds an occurrence of a ternary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2, int pos3); + // Adds an occurrence of a collocation. + void AppendCollocation(vector& collocations, const vector& collocation); friend class boost::serialization::access; template void save(Archive& ar, unsigned int) const { - int num_entries = collocations.size(); + int num_entries = index.size(); ar << num_entries; - for (pair, vector> entry: collocations) { + for (pair, vector> entry: index) { ar << entry; } } @@ -89,13 +94,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair, vector> entry; ar >> entry; - collocations.insert(entry); + index.insert(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Index collocations; + Index index; }; } // namespace extractor -- cgit v1.2.3