#ifndef _PRECOMPUTATION_H_ #define _PRECOMPUTATION_H_ #include #include #include #include #include #include #include #include #include #include namespace fs = boost::filesystem; using namespace std; namespace extractor { typedef boost::hash> VectorHash; typedef vector, vector>> Collocations; class SuffixArray; /** * Data structure containing all the data needed for constructing an index with * all the occurrences of the most frequent discontiguous collocations in the * source data. * * Let a, b, c be contiguous phrases. The data structure will contain the * locations in the source data where every collocation of the following forms * occurs: * - aXb, where a and b are frequent * - aXbXc, where a and b are super-frequent and c is frequent or * b and c are super-frequent and a is frequent. */ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( shared_ptr suffix_array, int num_frequent_phrases, int num_super_frequent_phrases, int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); // Creates empty precomputation data structure. Precomputation(); virtual ~Precomputation(); // Returns the list of the locations of the most frequent collocations in the // source data. virtual Collocations GetCollocations() const; bool operator==(const Precomputation& other) const; static int FIRST_NONTERMINAL; static int SECOND_NONTERMINAL; private: // Finds the most frequent contiguous collocations. vector> FindMostFrequentPhrases( shared_ptr suffix_array, const vector& data, int num_frequent_phrases, int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. void AddCollocations(const vector>& locations, const vector& data, int max_rule_span, int min_gap_size, int max_rule_symbols); // Creates a vector representation for the location of a binary collocation // containing the starting points of each subpattern. vector GetLocation(int pos1, int pos2); // Creates a vector representation for the location of a ternary collocation // containing the starting points of each subpattern. vector GetLocation(int pos1, int pos2, int pos3); // Appends a collocation to the list of collocations after shrinking the // vectors to avoid unnecessary memory usage. void AddCollocation(vector collocation, vector location); friend class boost::serialization::access; template void save(Archive& ar, unsigned int) const { int num_entries = collocations.size(); ar << num_entries; for (pair, vector> entry: collocations) { ar << entry; } } template void load(Archive& ar, unsigned int) { int num_entries; ar >> num_entries; for (size_t i = 0; i < num_entries; ++i) { pair, vector> entry; ar >> entry; collocations.push_back(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); Collocations collocations; }; } // namespace extractor #endif