From 5530575ae0ad939e17f08d6bd49978acea388ab7 Mon Sep 17 00:00:00 2001
From: Paul Baltescu
Date: Mon, 28 Jan 2013 11:56:31 +0000
Subject: Initial working commit.

---
 extractor/precomputation.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 extractor/precomputation.h

(limited to 'extractor/precomputation.h')

diff --git a/extractor/precomputation.h b/extractor/precomputation.h
new file mode 100644
index 00000000..0d1b269f
--- /dev/null
+++ b/extractor/precomputation.h
@@ -0,0 +1,77 @@
+#ifndef _PRECOMPUTATION_H_
+#define _PRECOMPUTATION_H_
+
+// NOTE(review): this patch was recovered from a web page whose extraction
+// dropped every <...> span, so the include targets and template arguments in
+// this header are reconstructed from how the names are used below. Confirm
+// them against extractor/precomputation.h in the repository before applying.
+#include <memory>
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>  // NOTE(review): guessed; nothing below needs it.
+#include <tuple>
+#include <vector>
+
+#include <boost/filesystem.hpp>
+#include <boost/functional/hash.hpp>
+
+namespace fs = boost::filesystem;
+// NOTE(review): using-directives in a header leak into every includer; they
+// are kept because files including this header may rely on them.
+using namespace std;
+using namespace tr1;
+
+class SuffixArray;
+
+// Hash functor for integer sequences; used as the hasher of Index.
+typedef boost::hash<vector<int> > vector_hash;
+// Hash map from one integer sequence to another, hashed with vector_hash.
+typedef unordered_map<vector<int>, vector<int>, vector_hash> Index;
+
+// Owns two Index tables, an inverted index and a collocations table, and
+// exposes them read-only. Judging by the constructor arguments they are
+// presumably built from the frequent patterns of a suffix array (TODO
+// confirm); the member definitions live outside this header.
+class Precomputation {
+ public:
+  // Parameter meanings are inferred from their names (TODO confirm against the
+  // implementation): suffix_array is presumably the indexed corpus; the other
+  // arguments look like integer limits on pattern counts, rule span, rule
+  // symbols, gap size, phrase length and pattern frequency.
+  Precomputation(
+      shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
+      int num_super_frequent_patterns, int max_rule_span,
+      int max_rule_symbols, int min_gap_size,
+      int max_frequent_phrase_len, int min_frequency);
+
+  // Presumably serializes this object to filepath in a binary format; the
+  // format itself is not visible in this header.
+  void WriteBinary(const fs::path& filepath) const;
+
+  // Read-only access to the two tables. NOTE(review): presumably these return
+  // the members declared below, so the references live as long as *this.
+  const Index& GetInvertedIndex() const;
+  const Index& GetCollocations() const;
+
+  // Non-const static data member, declared here and defined elsewhere;
+  // presumably the symbol id that stands for a gap (TODO confirm).
+  static int NON_TERMINAL;
+
+ private:
+  vector<vector<int> > FindMostFrequentPatterns(
+      shared_ptr<SuffixArray> suffix_array, const vector<int>& data,
+      int num_frequent_patterns, int max_frequent_phrase_len,
+      int min_frequency);
+  // NOTE(review): the element type of matchings was lost in extraction;
+  // tuple<int, int, int> is a reconstruction, not a certainty.
+  void AddCollocations(
+      const vector<tuple<int, int, int> >& matchings, const vector<int>& data,
+      int max_rule_span, int min_gap_size, int max_rule_symbols);
+  // Overloads taking two or three positions; positions is a non-const ref.
+  void AddStartPositions(vector<int>& positions, int pos1, int pos2);
+  void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3);
+
+  Index inverted_index;
+  Index collocations;
+};
+
+#endif
-- 
cgit v1.2.3