diff options
Diffstat (limited to 'extractor/suffix_array.cc')
-rw-r--r-- | extractor/suffix_array.cc | 229 |
1 files changed, 229 insertions, 0 deletions
diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc new file mode 100644 index 00000000..23c458a4 --- /dev/null +++ b/extractor/suffix_array.cc @@ -0,0 +1,229 @@ +#include "suffix_array.h" + +#include <chrono> +#include <iostream> +#include <string> +#include <vector> + +#include "data_array.h" +#include "phrase_location.h" +#include "time_util.h" + +namespace fs = boost::filesystem; +using namespace std; +using namespace chrono; + +SuffixArray::SuffixArray(shared_ptr<DataArray> data_array) : + data_array(data_array) { + BuildSuffixArray(); +} + +SuffixArray::SuffixArray() {} + +SuffixArray::~SuffixArray() {} + +void SuffixArray::BuildSuffixArray() { + vector<int> groups = data_array->GetData(); + groups.reserve(groups.size() + 1); + groups.push_back(DataArray::NULL_WORD); + suffix_array.resize(groups.size()); + word_start.resize(data_array->GetVocabularySize() + 1); + + InitialBucketSort(groups); + + int combined_group_size = 0; + for (size_t i = 1; i < word_start.size(); ++i) { + if (word_start[i] - word_start[i - 1] == 1) { + ++combined_group_size; + suffix_array[word_start[i] - combined_group_size] = -combined_group_size; + } else { + combined_group_size = 0; + } + } + + PrefixDoublingSort(groups); + cerr << "\tFinalizing sort..." << endl; + + for (size_t i = 0; i < groups.size(); ++i) { + suffix_array[groups[i]] = i; + } +} + +void SuffixArray::InitialBucketSort(vector<int>& groups) { + Clock::time_point start_time = Clock::now(); + for (size_t i = 0; i < groups.size(); ++i) { + ++word_start[groups[i]]; + } + + for (size_t i = 1; i < word_start.size(); ++i) { + word_start[i] += word_start[i - 1]; + } + + for (size_t i = 0; i < groups.size(); ++i) { + --word_start[groups[i]]; + suffix_array[word_start[groups[i]]] = i; + } + + for (size_t i = 0; i < suffix_array.size(); ++i) { + groups[i] = word_start[groups[i] + 1] - 1; + } + Clock::time_point stop_time = Clock::now(); + cerr << "\tBucket sort took " << GetDuration(start_time, stop_time) + << " seconds" << endl; +} + +void SuffixArray::PrefixDoublingSort(vector<int>& groups) { + int step = 1; + while (suffix_array[0] != -suffix_array.size()) { + int combined_group_size = 0; + int i = 0; + while (i < suffix_array.size()) { + if (suffix_array[i] < 0) { + int skip = -suffix_array[i]; + combined_group_size += skip; + i += skip; + suffix_array[i - combined_group_size] = -combined_group_size; + } else { + combined_group_size = 0; + int j = groups[suffix_array[i]]; + TernaryQuicksort(i, j, step, groups); + i = j + 1; + } + } + step *= 2; + } +} + +void SuffixArray::TernaryQuicksort(int left, int right, int step, + vector<int>& groups) { + if (left > right) { + return; + } + + int pivot = left + rand() % (right - left + 1); + int pivot_value = groups[suffix_array[pivot] + step]; + swap(suffix_array[pivot], suffix_array[left]); + int mid_left = left, mid_right = left; + for (int i = left + 1; i <= right; ++i) { + if (groups[suffix_array[i] + step] < pivot_value) { + ++mid_right; + int temp = suffix_array[i]; + suffix_array[i] = suffix_array[mid_right]; + suffix_array[mid_right] = suffix_array[mid_left]; + suffix_array[mid_left] = temp; + ++mid_left; + } else if (groups[suffix_array[i] + step] == pivot_value) { + ++mid_right; + int temp = suffix_array[i]; + suffix_array[i] = suffix_array[mid_right]; + suffix_array[mid_right] = temp; + } + } + + TernaryQuicksort(left, mid_left - 1, step, groups); + + if (mid_left == mid_right) { + groups[suffix_array[mid_left]] = mid_left; + suffix_array[mid_left] = -1; + } else { + for (int i = mid_left; i <= mid_right; ++i) { + groups[suffix_array[i]] = mid_right; + } + } + + TernaryQuicksort(mid_right + 1, right, step, groups); +} + +vector<int> SuffixArray::BuildLCPArray() const { + Clock::time_point start_time = Clock::now(); + cerr << "Constructing LCP array..." << endl; + + vector<int> lcp(suffix_array.size()); + vector<int> rank(suffix_array.size()); + const vector<int>& data = data_array->GetData(); + + for (size_t i = 0; i < suffix_array.size(); ++i) { + rank[suffix_array[i]] = i; + } + + int prefix_len = 0; + for (size_t i = 0; i < suffix_array.size(); ++i) { + if (rank[i] == 0) { + lcp[rank[i]] = -1; + } else { + int j = suffix_array[rank[i] - 1]; + while (i + prefix_len < data.size() && j + prefix_len < data.size() + && data[i + prefix_len] == data[j + prefix_len]) { + ++prefix_len; + } + lcp[rank[i]] = prefix_len; + } + + if (prefix_len > 0) { + --prefix_len; + } + } + + Clock::time_point stop_time = Clock::now(); + cerr << "Constructing LCP took " + << GetDuration(start_time, stop_time) << " seconds" << endl; + + return lcp; +} + +int SuffixArray::GetSuffix(int rank) const { + return suffix_array[rank]; +} + +int SuffixArray::GetSize() const { + return suffix_array.size(); +} + +shared_ptr<DataArray> SuffixArray::GetData() const { + return data_array; +} + +void SuffixArray::WriteBinary(const fs::path& filepath) const { + FILE* file = fopen(filepath.string().c_str(), "r"); + data_array->WriteBinary(file); + + int size = suffix_array.size(); + fwrite(&size, sizeof(int), 1, file); + fwrite(suffix_array.data(), sizeof(int), size, file); + + size = word_start.size(); + fwrite(&size, sizeof(int), 1, file); + fwrite(word_start.data(), sizeof(int), size, file); +} + +PhraseLocation SuffixArray::Lookup(int low, int high, const string& word, + int offset) const { + if (!data_array->HasWord(word)) { + // Return empty phrase location. + return PhraseLocation(0, 0); + } + + int word_id = data_array->GetWordId(word); + if (offset == 0) { + return PhraseLocation(word_start[word_id], word_start[word_id + 1]); + } + + return PhraseLocation(LookupRangeStart(low, high, word_id, offset), + LookupRangeStart(low, high, word_id + 1, offset)); +} + +int SuffixArray::LookupRangeStart(int low, int high, int word_id, + int offset) const { + int result = high; + while (low < high) { + int middle = low + (high - low) / 2; + if (suffix_array[middle] + offset >= data_array->GetSize() || + data_array->AtIndex(suffix_array[middle] + offset) < word_id) { + low = middle + 1; + } else { + result = middle; + high = middle; + } + } + return result; +} |