diff options
| author | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2013-05-02 09:09:59 +0200 | 
| commit | 9e50f0237413180fba11b500c9dce5c600e3c157 (patch) | |
| tree | 556fc31d231353c853a864afffddd43dc525549a /extractor/precomputation.h | |
| parent | d18024a41cbc1b54db88d499571349a6234b6db8 (diff) | |
| parent | 14ed53426726202813a8e82d706b44266f015fe1 (diff) | |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'extractor/precomputation.h')
| -rw-r--r-- | extractor/precomputation.h | 80 | 
1 files changed, 80 insertions, 0 deletions
| diff --git a/extractor/precomputation.h b/extractor/precomputation.h new file mode 100644 index 00000000..e3c4d26a --- /dev/null +++ b/extractor/precomputation.h @@ -0,0 +1,80 @@ +#ifndef _PRECOMPUTATION_H_ +#define _PRECOMPUTATION_H_ + +#include <memory> +#include <unordered_map> +#include <unordered_set> +#include <tuple> +#include <vector> + +#include <boost/filesystem.hpp> +#include <boost/functional/hash.hpp> + +namespace fs = boost::filesystem; +using namespace std; + +namespace extractor { + +typedef boost::hash<vector<int> > VectorHash; +typedef unordered_map<vector<int>, vector<int>, VectorHash> Index; + +class SuffixArray; + +/** + * Data structure wrapping an index with all the occurrences of the most + * frequent discontiguous collocations in the source data. + * + * Let a, b, c be contiguous collocations. The index will contain an entry for + * every collocation of the form: + * - aXb, where a and b are frequent + * - aXbXc, where a and b are super-frequent and c is frequent or + *                b and c are super-frequent and a is frequent. + */ +class Precomputation { + public: +  // Constructs the index using the suffix array. +  Precomputation( +      shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns, +      int num_super_frequent_patterns, int max_rule_span, +      int max_rule_symbols, int min_gap_size, +      int max_frequent_phrase_len, int min_frequency); + +  virtual ~Precomputation(); + +  void WriteBinary(const fs::path& filepath) const; + +  // Returns a reference to the index. +  virtual const Index& GetCollocations() const; + +  static int FIRST_NONTERMINAL; +  static int SECOND_NONTERMINAL; + + protected: +  Precomputation(); + + private: +  // Finds the most frequent contiguous collocations. +  vector<vector<int> > FindMostFrequentPatterns( +      shared_ptr<SuffixArray> suffix_array, const vector<int>& data, +      int num_frequent_patterns, int max_frequent_phrase_len, +      int min_frequency); + +  // Given the locations of the frequent contiguous collocations in a sentence, +  // it adds new entries to the index for each discontiguous collocation +  // matching the criteria specified in the class description. +  void AddCollocations( +      const vector<std::tuple<int, int, int> >& matchings, const vector<int>& data, +      int max_rule_span, int min_gap_size, int max_rule_symbols); + +  // Adds an occurrence of a binary collocation. +  void AddStartPositions(vector<int>& positions, int pos1, int pos2); + +  // Adds an occurrence of a ternary collocation. +  void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3); + +  Index collocations; +}; + +} // namespace extractor + +#endif | 
