diff options
Diffstat (limited to 'extractor/precomputation.h')
-rw-r--r-- | extractor/precomputation.h | 49 |
1 file changed, 20 insertions, 29 deletions
diff --git a/extractor/precomputation.h b/extractor/precomputation.h index 0a06349b..9f0c9424 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,18 +19,16 @@ using namespace std; namespace extractor { typedef boost::hash<vector<int>> VectorHash; -typedef vector<pair<vector<int>, vector<int>>> Collocations; +typedef unordered_map<vector<int>, vector<int>, VectorHash> Index; class SuffixArray; /** - * Data structure containing all the data needed for constructing an index with - * all the occurrences of the most frequent discontiguous collocations in the - * source data. + * Data structure wrapping an index with all the occurrences of the most + * frequent discontiguous collocations in the source data. * - * Let a, b, c be contiguous phrases. The data structure will contain the - * locations in the source data where every collocation of the following forms - * occurs: + * Let a, b, c be contiguous collocations. The index will contain an entry for + * every collocation of the form: * - aXb, where a and b are frequent * - aXbXc, where a and b are super-frequent and c is frequent or * b and c are super-frequent and a is frequent. @@ -39,8 +37,8 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr<SuffixArray> suffix_array, int num_frequent_phrases, - int num_super_frequent_phrases, int max_rule_span, + shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, + int num_super_frequent_patterns, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); @@ -49,9 +47,8 @@ class Precomputation { virtual ~Precomputation(); - // Returns the list of the locations of the most frequent collocations in the - // source data. - virtual Collocations GetCollocations() const; + // Returns a reference to the index. 
+ virtual const Index& GetCollocations() const; bool operator==(const Precomputation& other) const; @@ -60,29 +57,23 @@ class Precomputation { private: // Finds the most frequent contiguous collocations. - vector<vector<int>> FindMostFrequentPhrases( + vector<vector<int>> FindMostFrequentPatterns( shared_ptr<SuffixArray> suffix_array, const vector<int>& data, - int num_frequent_phrases, int max_frequent_phrase_len, + int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations(const vector<std::tuple<int, int, int>>& locations, - const vector<int>& data, int max_rule_span, - int min_gap_size, int max_rule_symbols); + void AddCollocations( + const vector<std::tuple<int, int, int>>& matchings, const vector<int>& data, + int max_rule_span, int min_gap_size, int max_rule_symbols); - // Creates a vector representation for the location of a binary collocation - // containing the starting points of each subpattern. - vector<int> GetLocation(int pos1, int pos2); + // Adds an occurrence of a binary collocation. + void AddStartPositions(vector<int>& positions, int pos1, int pos2); - // Creates a vector representation for the location of a ternary collocation - // containing the starting points of each subpattern. - vector<int> GetLocation(int pos1, int pos2, int pos3); - - // Appends a collocation to the list of collocations after shrinking the - // vectors to avoid unnecessary memory usage. - void AddCollocation(vector<int> collocation, vector<int> location); + // Adds an occurrence of a ternary collocation. 
+ void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3); friend class boost::serialization::access; @@ -100,13 +91,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair<vector<int>, vector<int>> entry; ar >> entry; - collocations.push_back(entry); + collocations.insert(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Collocations collocations; + Index collocations; }; } // namespace extractor |