summaryrefslogtreecommitdiff
path: root/extractor/suffix_array.h
diff options
context:
space:
mode:
authorChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
committerChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
commitc164dc0ed8a32e4095ba1b36495e0f743b8cc1ea (patch)
tree78b81e4c63adfa67adb7b8f80c3e6be87b4a2b2a /extractor/suffix_array.h
parent0e46089cafa4e8e2f060e370d7afaceeda6b90a9 (diff)
parentd467e14b28085809c31431be0478eb3d9322fe96 (diff)
merge paul's extractor code
Diffstat (limited to 'extractor/suffix_array.h')
-rw-r--r--extractor/suffix_array.h75
1 files changed, 75 insertions, 0 deletions
diff --git a/extractor/suffix_array.h b/extractor/suffix_array.h
new file mode 100644
index 00000000..bf731d79
--- /dev/null
+++ b/extractor/suffix_array.h
@@ -0,0 +1,75 @@
+#ifndef _SUFFIX_ARRAY_H_
+#define _SUFFIX_ARRAY_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <boost/filesystem.hpp>
+
+namespace fs = boost::filesystem;
+using namespace std;
+
+namespace extractor {
+
+class DataArray;
+class PhraseLocation;
+
+class SuffixArray {  // Suffix array built over a DataArray; most public methods are virtual and can be overridden.
+ public:
+ // Creates a suffix array from a data array. NOTE(review): not `explicit`, so a shared_ptr<DataArray> converts implicitly.
+ SuffixArray(shared_ptr<DataArray> data_array);
+
+ virtual ~SuffixArray();
+
+ // Returns the size of the suffix array.
+ virtual int GetSize() const;
+
+ // Returns the data array on top of which the suffix array is constructed.
+ virtual shared_ptr<DataArray> GetData() const;
+
+ // Constructs the longest-common-prefix (LCP) array using the algorithm of Kasai et
+ // al. (2001) and returns it by value.
+ virtual vector<int> BuildLCPArray() const;
+
+ // Returns the suffix at position `rank` (presumably its start position in the data array; confirm in the .cc).
+ virtual int GetSuffix(int rank) const;
+
+ // Given the range (low, high) in which a phrase is located and the next word, returns the
+ // range corresponding to the phrase extended with the next word.
+ virtual PhraseLocation Lookup(int low, int high, const string& word,
+ int offset) const;  // presumably `offset` is the position at which `word` is matched (cf. LookupRangeStart); confirm
+ // NOTE(review): undocumented and non-virtual; presumably serializes the suffix array to `filepath`; confirm in the .cc.
+ void WriteBinary(const fs::path& filepath) const;
+
+ protected:
+ SuffixArray();  // Default constructor reachable only from subclasses (purpose not shown in this header).
+
+ private:
+ // Constructs the suffix array using the algorithm of Larsson and Sadakane
+ // (1999).
+ void BuildSuffixArray();
+
+ // Bucket sort on the data array (used for initializing the construction of
+ // the suffix array). `groups` is passed by non-const reference, so it may be modified.
+ void InitialBucketSort(vector<int>& groups);
+ // NOTE(review): undocumented; presumably the ternary-split quicksort step of the Larsson-Sadakane construction; confirm.
+ void TernaryQuicksort(int left, int right, int step, vector<int>& groups);
+
+ // Constructs the suffix array in log(n) steps by doubling the length of the
+ // suffixes at each step. `groups` is passed by non-const reference.
+ void PrefixDoublingSort(vector<int>& groups);
+
+ // Given a [low, high) range in the suffix array in which all elements have
+ // the same first offset-1 values, returns the first position where the
+ // value at `offset` is greater than or equal to word_id.
+ int LookupRangeStart(int low, int high, int word_id, int offset) const;
+
+ shared_ptr<DataArray> data_array;  // Shared handle to the underlying data array.
+ vector<int> suffix_array;  // Suffix array entries (presumably what GetSuffix reads; confirm in the .cc).
+ vector<int> word_start;  // NOTE(review): undocumented; presumably the start of each word's range in suffix_array; confirm.
+};
+
+} // namespace extractor
+
+#endif