diff options
Diffstat (limited to 'extractor/suffix_array.h')
-rw-r--r-- | extractor/suffix_array.h | 17 |
1 files changed, 17 insertions, 0 deletions
diff --git a/extractor/suffix_array.h b/extractor/suffix_array.h index 7a4f1110..bf731d79 100644 --- a/extractor/suffix_array.h +++ b/extractor/suffix_array.h @@ -17,18 +17,26 @@ class PhraseLocation; class SuffixArray { public: + // Creates a suffix array from a data array. SuffixArray(shared_ptr<DataArray> data_array); virtual ~SuffixArray(); + // Returns the size of the suffix array. virtual int GetSize() const; + // Returns the data array on top of which the suffix array is constructed. virtual shared_ptr<DataArray> GetData() const; + // Constructs the longest-common-prefix array using the algorithm of Kasai et + // al. (2001). virtual vector<int> BuildLCPArray() const; + // Returns the i-th suffix. virtual int GetSuffix(int rank) const; + // Given the range in which a phrase is located and the next word, returns the + // range corresponding to the phrase extended with the next word. virtual PhraseLocation Lookup(int low, int high, const string& word, int offset) const; @@ -38,14 +46,23 @@ class SuffixArray { SuffixArray(); private: + // Constructs the suffix array using the algorithm of Larsson and Sadakane + // (1999). void BuildSuffixArray(); + // Bucket sort on the data array (used for initializing the construction of + // the suffix array.) void InitialBucketSort(vector<int>& groups); void TernaryQuicksort(int left, int right, int step, vector<int>& groups); + // Constructs the suffix array in log(n) steps by doubling the length of the + // suffixes at each step. void PrefixDoublingSort(vector<int>& groups); + // Given a [low, high) range in the suffix array in which all elements have + // the first offset-1 values the same, it returns the first position where the + // offset value is greater or equal to word_id. int LookupRangeStart(int low, int high, int word_id, int offset) const; shared_ptr<DataArray> data_array; |