diff options
author | Paul Baltescu <pauldb89@gmail.com> | 2013-02-22 16:52:25 +0000 |
---|---|---|
committer | Paul Baltescu <pauldb89@gmail.com> | 2013-02-22 16:52:25 +0000 |
commit | 6717daa034bfcfa429275f64d84607ae1568d488 (patch) | |
tree | c04c1546eed0e9acaa21907a5f8ab5aedc1a08fc /extractor | |
parent | bb79fdc140a5593124d3943769a644c6cc3d87f6 (diff) | |
parent | d69c289e172562039bcbe987657280332ab6315e (diff) |
Merge branch 'master' into experiment
Diffstat (limited to 'extractor')
35 files changed, 50 insertions, 1862 deletions
diff --git a/extractor/Makefile.am b/extractor/Makefile.am index df796cea..721df18b 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -1,7 +1,6 @@ bin_PROGRAMS = compile run_extractor EXTRA_PROGRAMS = alignment_test \ - binary_search_merger_test \ data_array_test \ fast_intersector_test \ feature_count_source_target_test \ @@ -12,10 +11,6 @@ EXTRA_PROGRAMS = alignment_test \ feature_sample_source_count_test \ feature_target_given_source_coherent_test \ grammar_extractor_test \ - intersector_test \ - linear_merger_test \ - matching_comparator_test \ - matching_test \ matchings_finder_test \ phrase_test \ precomputation_test \ @@ -26,12 +21,10 @@ EXTRA_PROGRAMS = alignment_test \ scorer_test \ suffix_array_test \ target_phrase_extractor_test \ - translation_table_test \ - veb_test + translation_table_test if HAVE_GTEST RUNNABLE_TESTS = alignment_test \ - binary_search_merger_test \ data_array_test \ fast_intersector_test \ feature_count_source_target_test \ @@ -42,10 +35,6 @@ if HAVE_GTEST feature_sample_source_count_test \ feature_target_given_source_coherent_test \ grammar_extractor_test \ - intersector_test \ - linear_merger_test \ - matching_comparator_test \ - matching_test \ matchings_finder_test \ phrase_test \ precomputation_test \ @@ -56,8 +45,7 @@ if HAVE_GTEST scorer_test \ suffix_array_test \ target_phrase_extractor_test \ - translation_table_test \ - veb_test + translation_table_test endif noinst_PROGRAMS = $(RUNNABLE_TESTS) @@ -66,8 +54,6 @@ TESTS = $(RUNNABLE_TESTS) alignment_test_SOURCES = alignment_test.cc alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -binary_search_merger_test_SOURCES = binary_search_merger_test.cc -binary_search_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a data_array_test_SOURCES = data_array_test.cc data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a fast_intersector_test_SOURCES = fast_intersector_test.cc @@ -88,14 +74,6 @@ feature_target_given_source_coherent_test_SOURCES = features/target_given_source feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a grammar_extractor_test_SOURCES = grammar_extractor_test.cc grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -intersector_test_SOURCES = intersector_test.cc -intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -linear_merger_test_SOURCES = linear_merger_test.cc -linear_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -matching_comparator_test_SOURCES = matching_comparator_test.cc -matching_comparator_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -matching_test_SOURCES = matching_test.cc -matching_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a matchings_finder_test_SOURCES = matchings_finder_test.cc matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a phrase_test_SOURCES = phrase_test.cc @@ -118,8 +96,6 @@ target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a translation_table_test_SOURCES = translation_table_test.cc translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -veb_test_SOURCES = veb_test.cc -veb_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a noinst_LIBRARIES = libextractor.a libcompile.a @@ -139,7 +115,6 @@ libcompile_a_SOURCES = \ libextractor_a_SOURCES = \ alignment.cc \ - binary_search_merger.cc \ data_array.cc \ fast_intersector.cc \ features/count_source_target.cc \ @@ -152,11 +127,7 @@ libextractor_a_SOURCES = \ features/target_given_source_coherent.cc \ grammar.cc \ grammar_extractor.cc \ - matching.cc \ - matching_comparator.cc \ matchings_finder.cc \ - intersector.cc \ - linear_merger.cc \ matchings_trie.cc \ phrase.cc \ phrase_builder.cc \ @@ -172,9 +143,6 @@ libextractor_a_SOURCES = \ target_phrase_extractor.cc \ time_util.cc \ translation_table.cc \ - veb.cc \ - veb_bitset.cc \ - veb_tree.cc \ vocabulary.cc AM_CPPFLAGS = -W -Wall -Wno-sign-compare -std=c++0x $(GTEST_CPPFLAGS) $(GMOCK_CPPFLAGS) diff --git a/extractor/binary_search_merger.cc b/extractor/binary_search_merger.cc deleted file mode 100644 index c1b86a77..00000000 --- a/extractor/binary_search_merger.cc +++ /dev/null @@ -1,251 +0,0 @@ -#include "binary_search_merger.h" - -#include "data_array.h" -#include "linear_merger.h" -#include "matching.h" -#include "matching_comparator.h" -#include "phrase.h" -#include "vocabulary.h" - -double BinarySearchMerger::BAEZA_YATES_FACTOR = 1.0; - -BinarySearchMerger::BinarySearchMerger( - shared_ptr<Vocabulary> vocabulary, - shared_ptr<LinearMerger> linear_merger, - shared_ptr<DataArray> data_array, - shared_ptr<MatchingComparator> comparator, - bool force_binary_search_merge) : - vocabulary(vocabulary), linear_merger(linear_merger), - data_array(data_array), comparator(comparator), - force_binary_search_merge(force_binary_search_merge) {} - -BinarySearchMerger::BinarySearchMerger() {} - -BinarySearchMerger::~BinarySearchMerger() {} - -void BinarySearchMerger::Merge( - vector<int>& locations, const Phrase& phrase, const Phrase& suffix, - const vector<int>::iterator& prefix_start, - const vector<int>::iterator& prefix_end, - const vector<int>::iterator& suffix_start, - const vector<int>::iterator& suffix_end, - int prefix_subpatterns, int suffix_subpatterns) const { - if (IsIntersectionVoid(prefix_start, prefix_end, suffix_start, suffix_end, - prefix_subpatterns, suffix_subpatterns, suffix)) { - return; - } - - int prefix_set_size = prefix_end - prefix_start; - int suffix_set_size = suffix_end - suffix_start; - if (ShouldUseLinearMerge(prefix_set_size, suffix_set_size)) { - linear_merger->Merge(locations, phrase, suffix, prefix_start, prefix_end, - suffix_start, suffix_end, prefix_subpatterns, suffix_subpatterns); - return; - } - - vector<int>::iterator low, high, prefix_low, prefix_high, suffix_mid; - if (prefix_set_size > suffix_set_size) { - // Binary search on the prefix set. - suffix_mid = GetMiddle(suffix_start, suffix_end, suffix_subpatterns); - low = prefix_start, high = prefix_end; - while (low < high) { - vector<int>::iterator prefix_mid = - GetMiddle(low, high, prefix_subpatterns); - - GetComparableMatchings(prefix_start, prefix_end, prefix_mid, - prefix_subpatterns, prefix_low, prefix_high); - int comparison = CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, - prefix_subpatterns, suffix_subpatterns, suffix); - if (comparison == 0) { - break; - } else if (comparison < 0) { - low = prefix_mid + prefix_subpatterns; - } else { - high = prefix_mid; - } - } - } else { - // Binary search on the suffix set. - vector<int>::iterator prefix_mid = - GetMiddle(prefix_start, prefix_end, prefix_subpatterns); - - GetComparableMatchings(prefix_start, prefix_end, prefix_mid, - prefix_subpatterns, prefix_low, prefix_high); - low = suffix_start, high = suffix_end; - while (low < high) { - suffix_mid = GetMiddle(low, high, suffix_subpatterns); - - int comparison = CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, - prefix_subpatterns, suffix_subpatterns, suffix); - if (comparison == 0) { - break; - } else if (comparison > 0) { - low = suffix_mid + suffix_subpatterns; - } else { - high = suffix_mid; - } - } - } - - vector<int> result; - int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); - bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); - vector<int>::iterator suffix_low, suffix_high; - if (low < high) { - // We found a group of prefixes with the same starting position that give - // different results when compared to the found suffix. - // Find all matching suffixes for the previously found set of prefixes. - suffix_low = suffix_mid; - suffix_high = suffix_mid + suffix_subpatterns; - for (auto i = prefix_low; i != prefix_high; i += prefix_subpatterns) { - Matching left(i, prefix_subpatterns, data_array->GetSentenceId(*i)); - while (suffix_low != suffix_start) { - Matching right(suffix_low - suffix_subpatterns, suffix_subpatterns, - data_array->GetSentenceId(*(suffix_low - suffix_subpatterns))); - if (comparator->Compare(left, right, last_chunk_len, offset) <= 0) { - suffix_low -= suffix_subpatterns; - } else { - break; - } - } - - for (auto j = suffix_low; j != suffix_end; j += suffix_subpatterns) { - Matching right(j, suffix_subpatterns, data_array->GetSentenceId(*j)); - int comparison = comparator->Compare(left, right, last_chunk_len, - offset); - if (comparison == 0) { - vector<int> merged = left.Merge(right, phrase.Arity() + 1); - result.insert(result.end(), merged.begin(), merged.end()); - } else if (comparison < 0) { - break; - } - suffix_high = max(suffix_high, j + suffix_subpatterns); - } - } - - swap(suffix_low, suffix_high); - } else if (prefix_set_size > suffix_set_size) { - // We did the binary search on the prefix set. - suffix_low = suffix_mid; - suffix_high = suffix_mid + suffix_subpatterns; - if (CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, - prefix_subpatterns, suffix_subpatterns, suffix) < 0) { - prefix_low = prefix_high; - } else { - prefix_high = prefix_low; - } - } else { - // We did the binary search on the suffix set. - if (CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, - prefix_subpatterns, suffix_subpatterns, suffix) < 0) { - suffix_low = suffix_mid; - suffix_high = suffix_mid; - } else { - suffix_low = suffix_mid + suffix_subpatterns; - suffix_high = suffix_mid + suffix_subpatterns; - } - } - - Merge(locations, phrase, suffix, prefix_start, prefix_low, suffix_start, - suffix_low, prefix_subpatterns, suffix_subpatterns); - locations.insert(locations.end(), result.begin(), result.end()); - Merge(locations, phrase, suffix, prefix_high, prefix_end, suffix_high, - suffix_end, prefix_subpatterns, suffix_subpatterns); -} - -bool BinarySearchMerger::IsIntersectionVoid( - vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, - vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, - int prefix_subpatterns, int suffix_subpatterns, - const Phrase& suffix) const { - // Is any of the sets empty? - if (prefix_start >= prefix_end || suffix_start >= suffix_end) { - return true; - } - - int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); - bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); - // Is the first value from the first set larger than the last value in the - // second set? - Matching left(prefix_start, prefix_subpatterns, - data_array->GetSentenceId(*prefix_start)); - Matching right(suffix_end - suffix_subpatterns, suffix_subpatterns, - data_array->GetSentenceId(*(suffix_end - suffix_subpatterns))); - if (comparator->Compare(left, right, last_chunk_len, offset) > 0) { - return true; - } - - // Is the last value from the first set smaller than the first value in the - // second set? - left = Matching(prefix_end - prefix_subpatterns, prefix_subpatterns, - data_array->GetSentenceId(*(prefix_end - prefix_subpatterns))); - right = Matching(suffix_start, suffix_subpatterns, - data_array->GetSentenceId(*suffix_start)); - if (comparator->Compare(left, right, last_chunk_len, offset) < 0) { - return true; - } - - return false; -} - -bool BinarySearchMerger::ShouldUseLinearMerge( - int prefix_set_size, int suffix_set_size) const { - if (force_binary_search_merge) { - return false; - } - - int min_size = min(prefix_set_size, suffix_set_size); - int max_size = max(prefix_set_size, suffix_set_size); - - return BAEZA_YATES_FACTOR * min_size * log2(max_size) > max_size; -} - -vector<int>::iterator BinarySearchMerger::GetMiddle( - vector<int>::iterator low, vector<int>::iterator high, - int num_subpatterns) const { - return low + (((high - low) / num_subpatterns) / 2) * num_subpatterns; -} - -void BinarySearchMerger::GetComparableMatchings( - const vector<int>::iterator& prefix_start, - const vector<int>::iterator& prefix_end, - const vector<int>::iterator& prefix_mid, - int num_subpatterns, - vector<int>::iterator& prefix_low, - vector<int>::iterator& prefix_high) const { - prefix_low = prefix_mid; - while (prefix_low != prefix_start - && *prefix_mid == *(prefix_low - num_subpatterns)) { - prefix_low -= num_subpatterns; - } - prefix_high = prefix_mid + num_subpatterns; - while (prefix_high != prefix_end - && *prefix_mid == *prefix_high) { - prefix_high += num_subpatterns; - } -} - -int BinarySearchMerger::CompareMatchingsSet( - const vector<int>::iterator& prefix_start, - const vector<int>::iterator& prefix_end, - const vector<int>::iterator& suffix_mid, - int prefix_subpatterns, - int suffix_subpatterns, - const Phrase& suffix) const { - int result = 0; - int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); - bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); - - Matching right(suffix_mid, suffix_subpatterns, - data_array->GetSentenceId(*suffix_mid)); - for (auto i = prefix_start; i != prefix_end; i += prefix_subpatterns) { - Matching left(i, prefix_subpatterns, data_array->GetSentenceId(*i)); - int comparison = comparator->Compare(left, right, last_chunk_len, offset); - if (i == prefix_start) { - result = comparison; - } else if (comparison != result) { - return 0; - } - } - return result; -} diff --git a/extractor/binary_search_merger.h b/extractor/binary_search_merger.h deleted file mode 100644 index c887e012..00000000 --- a/extractor/binary_search_merger.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef _BINARY_SEARCH_MERGER_H_ -#define _BINARY_SEARCH_MERGER_H_ - -#include <memory> -#include <vector> - -using namespace std; - -class DataArray; -class LinearMerger; -class MatchingComparator; -class Phrase; -class Vocabulary; - -class BinarySearchMerger { - public: - BinarySearchMerger(shared_ptr<Vocabulary> vocabulary, - shared_ptr<LinearMerger> linear_merger, - shared_ptr<DataArray> data_array, - shared_ptr<MatchingComparator> comparator, - bool force_binary_search_merge = false); - - virtual ~BinarySearchMerger(); - - virtual void Merge( - vector<int>& locations, const Phrase& phrase, const Phrase& suffix, - const vector<int>::iterator& prefix_start, - const vector<int>::iterator& prefix_end, - const vector<int>::iterator& suffix_start, - const vector<int>::iterator& suffix_end, - int prefix_subpatterns, int suffix_subpatterns) const; - - static double BAEZA_YATES_FACTOR; - - protected: - BinarySearchMerger(); - - private: - bool IsIntersectionVoid( - vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, - vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, - int prefix_subpatterns, int suffix_subpatterns, - const Phrase& suffix) const; - - bool ShouldUseLinearMerge(int prefix_set_size, int suffix_set_size) const; - - vector<int>::iterator GetMiddle(vector<int>::iterator low, - vector<int>::iterator high, - int num_subpatterns) const; - - void GetComparableMatchings( - const vector<int>::iterator& prefix_start, - const vector<int>::iterator& prefix_end, - const vector<int>::iterator& prefix_mid, - int num_subpatterns, - vector<int>::iterator& prefix_low, - vector<int>::iterator& prefix_high) const; - - int CompareMatchingsSet( - const vector<int>::iterator& prefix_low, - const vector<int>::iterator& prefix_high, - const vector<int>::iterator& suffix_mid, - int prefix_subpatterns, - int suffix_subpatterns, - const Phrase& suffix) const; - - shared_ptr<Vocabulary> vocabulary; - shared_ptr<LinearMerger> linear_merger; - shared_ptr<DataArray> data_array; - shared_ptr<MatchingComparator> comparator; - // Should be true only for testing. - bool force_binary_search_merge; -}; - -#endif diff --git a/extractor/binary_search_merger_test.cc b/extractor/binary_search_merger_test.cc deleted file mode 100644 index b1baa62f..00000000 --- a/extractor/binary_search_merger_test.cc +++ /dev/null @@ -1,157 +0,0 @@ -#include <gtest/gtest.h> - -#include <memory> - -#include "binary_search_merger.h" -#include "matching_comparator.h" -#include "mocks/mock_data_array.h" -#include "mocks/mock_vocabulary.h" -#include "mocks/mock_linear_merger.h" -#include "phrase.h" -#include "phrase_location.h" -#include "phrase_builder.h" - -using namespace std; -using namespace ::testing; - -namespace { - -class BinarySearchMergerTest : public Test { - protected: - virtual void SetUp() { - shared_ptr<MockVocabulary> vocabulary = make_shared<MockVocabulary>(); - EXPECT_CALL(*vocabulary, GetTerminalValue(_)) - .WillRepeatedly(Return("word")); - - shared_ptr<MockDataArray> data_array = make_shared<MockDataArray>(); - EXPECT_CALL(*data_array, GetSentenceId(_)) - .WillRepeatedly(Return(1)); - - shared_ptr<MatchingComparator> comparator = - make_shared<MatchingComparator>(1, 20); - - phrase_builder = make_shared<PhraseBuilder>(vocabulary); - - // We are going to force the binary_search_merger to do all the work, so we - // need to check that the linear_merger never gets called. - shared_ptr<MockLinearMerger> linear_merger = - make_shared<MockLinearMerger>(); - EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - - binary_search_merger = make_shared<BinarySearchMerger>( - vocabulary, linear_merger, data_array, comparator, true); - } - - shared_ptr<BinarySearchMerger> binary_search_merger; - shared_ptr<PhraseBuilder> phrase_builder; -}; - -TEST_F(BinarySearchMergerTest, aXbTest) { - vector<int> locations; - // Encoding for him X it (see Adam's dissertation). - vector<int> symbols{1, -1, 2}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs{2, 6, 10, 15}; - vector<int> suffix_locs{0, 4, 8, 13}; - - binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - - vector<int> expected_locations{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; - EXPECT_EQ(expected_locations, locations); -} - -TEST_F(BinarySearchMergerTest, aXbXcTest) { - vector<int> locations; - // Encoding for it X him X it (see Adam's dissertation). - vector<int> symbols{1, -1, 2, -2, 1}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2, -2, 1}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs{0, 2, 0, 6, 0, 10, 4, 6, 4, 10, 4, 15, 8, 10, 8, 15, - 13, 15}; - vector<int> suffix_locs{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; - - binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 2); - - vector<int> expected_locs{0, 2, 4, 0, 2, 8, 0, 2, 13, 0, 6, 8, 0, 6, 13, 0, - 10, 13, 4, 6, 8, 4, 6, 13, 4, 10, 13, 8, 10, 13}; - EXPECT_EQ(expected_locs, locations); -} - -TEST_F(BinarySearchMergerTest, abXcXdTest) { - // Sentence: Anna has many many nuts and sour apples and juicy apples. - // Phrase: Anna has X and X apples. - vector<int> locations; - vector<int> symbols{1, 2, -1, 3, -2, 4}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{2, -1, 3, -2, 4}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs{1, 6, 1, 9}; - vector<int> suffix_locs{2, 6, 8, 2, 6, 11, 2, 9, 11}; - - binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 3); - - vector<int> expected_locs{1, 6, 8, 1, 6, 11, 1, 9, 11}; - EXPECT_EQ(expected_locs, locations); -} - -TEST_F(BinarySearchMergerTest, LargeTest) { - vector<int> locations; - vector<int> symbols{1, -1, 2}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs; - for (int i = 0; i < 100; ++i) { - prefix_locs.push_back(i * 20 + 1); - } - vector<int> suffix_locs; - for (int i = 0; i < 100; ++i) { - suffix_locs.push_back(i * 20 + 5); - suffix_locs.push_back(i * 20 + 13); - } - - binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - - EXPECT_EQ(400, locations.size()); - for (int i = 0; i < 100; ++i) { - EXPECT_EQ(i * 20 + 1, locations[4 * i]); - EXPECT_EQ(i * 20 + 5, locations[4 * i + 1]); - EXPECT_EQ(i * 20 + 1, locations[4 * i + 2]); - EXPECT_EQ(i * 20 + 13, locations[4 * i + 3]); - } -} - -TEST_F(BinarySearchMergerTest, EmptyResultTest) { - vector<int> locations; - vector<int> symbols{1, -1, 2}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs; - for (int i = 0; i < 100; ++i) { - prefix_locs.push_back(i * 200 + 1); - } - vector<int> suffix_locs; - for (int i = 0; i < 100; ++i) { - suffix_locs.push_back(i * 200 + 101); - } - - binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - - EXPECT_EQ(0, locations.size()); -} - -} // namespace diff --git a/extractor/data_array.h b/extractor/data_array.h index 7c120b3c..96950789 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -15,6 +15,9 @@ enum Side { TARGET }; +// TODO: This class has features for both the source and target data arrays. +// Maybe we can save some memory by having more specific implementations (e.g. +// sentence_id is only needed for the source data array). class DataArray { public: static int NULL_WORD; @@ -48,7 +51,6 @@ class DataArray { virtual int GetSentenceStart(int position) const; - //TODO(pauldb): Add unit tests. virtual int GetSentenceLength(int sentence_id) const; virtual int GetSentenceId(int position) const; @@ -67,8 +69,6 @@ class DataArray { unordered_map<string, int> word2id; vector<string> id2word; vector<int> data; - // TODO(pauldb): We only need sentence_id for the source language. Maybe we - // can save some memory here. vector<int> sentence_id; vector<int> sentence_start; }; diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc index 772ba10e..ba5ce09e 100644 --- a/extractor/data_array_test.cc +++ b/extractor/data_array_test.cc @@ -26,18 +26,28 @@ class DataArrayTest : public Test { }; TEST_F(DataArrayTest, TestGetData) { - vector<int> expected_source_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1}; + vector<int> expected_source_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1}; + vector<string> expected_source_words = { + "ana", "are", "mere", ".", "__END_OF_LINE__", + "ana", "bea", "mult", "lapte", ".", "__END_OF_LINE__" + }; EXPECT_EQ(expected_source_data, source_data->GetData()); EXPECT_EQ(expected_source_data.size(), source_data->GetSize()); for (size_t i = 0; i < expected_source_data.size(); ++i) { EXPECT_EQ(expected_source_data[i], source_data->AtIndex(i)); + EXPECT_EQ(expected_source_words[i], source_data->GetWordAtIndex(i)); } - vector<int> expected_target_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1}; + vector<int> expected_target_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1}; + vector<string> expected_target_words = { + "anna", "has", "apples", ".", "__END_OF_LINE__", + "anna", "drinks", "a", "lot", "of", "milk", ".", "__END_OF_LINE__" + }; EXPECT_EQ(expected_target_data, target_data->GetData()); EXPECT_EQ(expected_target_data.size(), target_data->GetSize()); for (size_t i = 0; i < expected_target_data.size(); ++i) { EXPECT_EQ(expected_target_data[i], target_data->AtIndex(i)); + EXPECT_EQ(expected_target_words[i], target_data->GetWordAtIndex(i)); } } @@ -61,10 +71,26 @@ TEST_F(DataArrayTest, TestSentenceData) { EXPECT_EQ(5, source_data->GetSentenceStart(1)); EXPECT_EQ(11, source_data->GetSentenceStart(2)); + EXPECT_EQ(4, source_data->GetSentenceLength(0)); + EXPECT_EQ(5, source_data->GetSentenceLength(1)); + + vector<int> expected_source_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; + for (size_t i = 0; i < expected_source_ids.size(); ++i) { + EXPECT_EQ(expected_source_ids[i], source_data->GetSentenceId(i)); + } + EXPECT_EQ(2, target_data->GetNumSentences()); EXPECT_EQ(0, target_data->GetSentenceStart(0)); EXPECT_EQ(5, target_data->GetSentenceStart(1)); EXPECT_EQ(13, target_data->GetSentenceStart(2)); + + EXPECT_EQ(4, target_data->GetSentenceLength(0)); + EXPECT_EQ(7, target_data->GetSentenceLength(1)); + + vector<int> expected_target_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; + for (size_t i = 0; i < expected_target_ids.size(); ++i) { + EXPECT_EQ(expected_target_ids[i], target_data->GetSentenceId(i)); + } } } // namespace diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index a03e805f..b8f6f0c7 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -16,13 +16,12 @@ GrammarExtractor::GrammarExtractor( shared_ptr<Alignment> alignment, shared_ptr<Precomputation> precomputation, shared_ptr<Scorer> scorer, int min_gap_size, int max_rule_span, int max_nonterminals, int max_rule_symbols, int max_samples, - bool use_fast_intersect, bool use_baeza_yates, bool require_tight_phrases) : + bool require_tight_phrases) : vocabulary(make_shared<Vocabulary>()), rule_factory(make_shared<HieroCachingRuleFactory>( source_suffix_array, target_data_array, alignment, vocabulary, precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals, - max_rule_symbols, max_samples, use_fast_intersect, use_baeza_yates, - require_tight_phrases)) {} + max_rule_symbols, max_samples, require_tight_phrases)) {} GrammarExtractor::GrammarExtractor( shared_ptr<Vocabulary> vocabulary, diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index a8f2090d..f50a8d14 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -29,8 +29,6 @@ class GrammarExtractor { int max_nonterminals, int max_rule_symbols, int max_samples, - bool use_fast_intersect, - bool use_baeza_yates, bool require_tight_phrases); // For testing only. diff --git a/extractor/intersector.cc b/extractor/intersector.cc deleted file mode 100644 index 39a7648d..00000000 --- a/extractor/intersector.cc +++ /dev/null @@ -1,154 +0,0 @@ -#include "intersector.h" - -#include "data_array.h" -#include "matching_comparator.h" -#include "phrase.h" -#include "phrase_location.h" -#include "precomputation.h" -#include "suffix_array.h" -#include "veb.h" -#include "vocabulary.h" - -Intersector::Intersector(shared_ptr<Vocabulary> vocabulary, - shared_ptr<Precomputation> precomputation, - shared_ptr<SuffixArray> suffix_array, - shared_ptr<MatchingComparator> comparator, - bool use_baeza_yates) : - vocabulary(vocabulary), - suffix_array(suffix_array), - use_baeza_yates(use_baeza_yates) { - shared_ptr<DataArray> data_array = suffix_array->GetData(); - linear_merger = make_shared<LinearMerger>(vocabulary, data_array, comparator); - binary_search_merger = make_shared<BinarySearchMerger>( - vocabulary, linear_merger, data_array, comparator); - ConvertIndexes(precomputation, data_array); -} - -Intersector::Intersector(shared_ptr<Vocabulary> vocabulary, - shared_ptr<Precomputation> precomputation, - shared_ptr<SuffixArray> suffix_array, - shared_ptr<LinearMerger> linear_merger, - shared_ptr<BinarySearchMerger> binary_search_merger, - bool use_baeza_yates) : - vocabulary(vocabulary), - suffix_array(suffix_array), - linear_merger(linear_merger), - binary_search_merger(binary_search_merger), - use_baeza_yates(use_baeza_yates) { - ConvertIndexes(precomputation, suffix_array->GetData()); -} - -Intersector::Intersector() {} - -Intersector::~Intersector() {} - -void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation, - shared_ptr<DataArray> data_array) { - const Index& precomputed_index = precomputation->GetInvertedIndex(); - for (pair<vector<int>, vector<int> > entry: precomputed_index) { - vector<int> phrase = ConvertPhrase(entry.first, data_array); - inverted_index[phrase] = entry.second; - - phrase.push_back(vocabulary->GetNonterminalIndex(1)); - inverted_index[phrase] = entry.second; - phrase.pop_back(); - phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1)); - inverted_index[phrase] = entry.second; - } - - const Index& precomputed_collocations = precomputation->GetCollocations(); - for (pair<vector<int>, vector<int> > entry: precomputed_collocations) { - vector<int> phrase = ConvertPhrase(entry.first, data_array); - collocations[phrase] = entry.second; - } -} - -vector<int> Intersector::ConvertPhrase(const vector<int>& old_phrase, - shared_ptr<DataArray> data_array) { - vector<int> new_phrase; - new_phrase.reserve(old_phrase.size()); - - int arity = 0; - for (int word_id: old_phrase) { - if (word_id == Precomputation::NON_TERMINAL) { - ++arity; - new_phrase.push_back(vocabulary->GetNonterminalIndex(arity)); - } else { - new_phrase.push_back( - vocabulary->GetTerminalIndex(data_array->GetWord(word_id))); - } - } - - return new_phrase; -} - -PhraseLocation Intersector::Intersect( - const Phrase& prefix, PhraseLocation& prefix_location, - const Phrase& suffix, PhraseLocation& suffix_location, - const Phrase& phrase) { - vector<int> symbols = phrase.Get(); - - // We should never attempt to do an intersect query for a pattern starting or - // ending with a non terminal. The RuleFactory should handle these cases, - // initializing the matchings list with the one for the pattern without the - // starting or ending terminal. - assert(vocabulary->IsTerminal(symbols.front()) - && vocabulary->IsTerminal(symbols.back())); - - if (collocations.count(symbols)) { - return PhraseLocation(collocations[symbols], phrase.Arity() + 1); - } - - vector<int> locations; - ExtendPhraseLocation(prefix, prefix_location); - ExtendPhraseLocation(suffix, suffix_location); - shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings; - shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings; - int prefix_subpatterns = prefix_location.num_subpatterns; - int suffix_subpatterns = suffix_location.num_subpatterns; - if (use_baeza_yates) { - binary_search_merger->Merge(locations, phrase, suffix, - prefix_matchings->begin(), prefix_matchings->end(), - suffix_matchings->begin(), suffix_matchings->end(), - prefix_subpatterns, suffix_subpatterns); - } else { - linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(), - prefix_matchings->end(), suffix_matchings->begin(), - suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns); - } - return PhraseLocation(locations, phrase.Arity() + 1); -} - -void Intersector::ExtendPhraseLocation( - const Phrase& phrase, PhraseLocation& phrase_location) { - int low = phrase_location.sa_low, high = phrase_location.sa_high; - if (phrase_location.matchings != NULL) { - return; - } - - - phrase_location.num_subpatterns = 1; - phrase_location.sa_low = phrase_location.sa_high = 0; - - vector<int> symbols = phrase.Get(); - if (inverted_index.count(symbols)) { - phrase_location.matchings = - make_shared<vector<int> >(inverted_index[symbols]); - return; - } - - vector<int> matchings; - matchings.reserve(high - low + 1); - shared_ptr<VEB> veb = VEB::Create(suffix_array->GetSize()); - for (int i = low; i < high; ++i) { - veb->Insert(suffix_array->GetSuffix(i)); - } - - int value = veb->GetMinimum(); - while (value != -1) { - matchings.push_back(value); - value = veb->GetSuccessor(value); - } - - phrase_location.matchings = make_shared<vector<int> >(matchings); -} diff --git a/extractor/intersector.h b/extractor/intersector.h deleted file mode 100644 index 8b159f17..00000000 --- a/extractor/intersector.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef _INTERSECTOR_H_ -#define _INTERSECTOR_H_ - -#include <memory> -#include <unordered_map> -#include <vector> - -#include <boost/functional/hash.hpp> - -#include "binary_search_merger.h" -#include "linear_merger.h" - -using namespace std; - -typedef boost::hash<vector<int> > VectorHash; -typedef unordered_map<vector<int>, vector<int>, VectorHash> Index; - -class DataArray; -class MatchingComparator; -class Phrase; -class PhraseLocation; -class Precomputation; -class SuffixArray; -class Vocabulary; - -class Intersector { - public: - Intersector( - shared_ptr<Vocabulary> vocabulary, - shared_ptr<Precomputation> precomputation, - shared_ptr<SuffixArray> source_suffix_array, - shared_ptr<MatchingComparator> comparator, - bool use_baeza_yates); - - // For testing. - Intersector( - shared_ptr<Vocabulary> vocabulary, - shared_ptr<Precomputation> precomputation, - shared_ptr<SuffixArray> source_suffix_array, - shared_ptr<LinearMerger> linear_merger, - shared_ptr<BinarySearchMerger> binary_search_merger, - bool use_baeza_yates); - - virtual ~Intersector(); - - virtual PhraseLocation Intersect( - const Phrase& prefix, PhraseLocation& prefix_location, - const Phrase& suffix, PhraseLocation& suffix_location, - const Phrase& phrase); - - protected: - Intersector(); - - private: - void ConvertIndexes(shared_ptr<Precomputation> precomputation, - shared_ptr<DataArray> data_array); - - vector<int> ConvertPhrase(const vector<int>& old_phrase, - shared_ptr<DataArray> data_array); - - void ExtendPhraseLocation(const Phrase& phrase, - PhraseLocation& phrase_location); - - shared_ptr<Vocabulary> vocabulary; - shared_ptr<SuffixArray> suffix_array; - shared_ptr<LinearMerger> linear_merger; - shared_ptr<BinarySearchMerger> binary_search_merger; - Index inverted_index; - Index collocations; - bool use_baeza_yates; - - // TODO(pauldb): Don't forget to remove these. - public: - double sort_time; - double linear_merge_time; - double binary_merge_time; -}; - -#endif diff --git a/extractor/intersector_test.cc b/extractor/intersector_test.cc deleted file mode 100644 index ec318362..00000000 --- a/extractor/intersector_test.cc +++ /dev/null @@ -1,193 +0,0 @@ -#include <gtest/gtest.h> - -#include <memory> -#include <vector> - -#include "intersector.h" -#include "mocks/mock_binary_search_merger.h" -#include "mocks/mock_data_array.h" -#include "mocks/mock_linear_merger.h" -#include "mocks/mock_precomputation.h" -#include "mocks/mock_suffix_array.h" -#include "mocks/mock_vocabulary.h" - -using namespace std; -using namespace ::testing; - -namespace { - -class IntersectorTest : public Test { - protected: - virtual void SetUp() { - data = {2, 3, 4, 3, 4, 3}; - vector<string> words = {"a", "b", "c", "b", "c", "b"}; - data_array = make_shared<MockDataArray>(); - EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data)); - - vocabulary = make_shared<MockVocabulary>(); - for (size_t i = 0; i < data.size(); ++i) { - EXPECT_CALL(*data_array, GetWord(data[i])) - .WillRepeatedly(Return(words[i])); - EXPECT_CALL(*vocabulary, GetTerminalIndex(words[i])) - .WillRepeatedly(Return(data[i])); - EXPECT_CALL(*vocabulary, GetTerminalValue(data[i])) - .WillRepeatedly(Return(words[i])); - } - - vector<int> suffixes = {6, 0, 5, 3, 1, 4, 2}; - suffix_array = make_shared<MockSuffixArray>(); - EXPECT_CALL(*suffix_array, GetData()) - .WillRepeatedly(Return(data_array)); - EXPECT_CALL(*suffix_array, GetSize()) - .WillRepeatedly(Return(suffixes.size())); - for (size_t i = 0; i < suffixes.size(); ++i) { - EXPECT_CALL(*suffix_array, GetSuffix(i)) - .WillRepeatedly(Return(suffixes[i])); - } - - vector<int> key = {2, -1, 4}; - vector<int> values = {0, 2}; - collocations[key] = values; - precomputation = make_shared<MockPrecomputation>(); - EXPECT_CALL(*precomputation, GetInvertedIndex()) - .WillRepeatedly(ReturnRef(inverted_index)); - EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); - - linear_merger = make_shared<MockLinearMerger>(); - binary_search_merger = make_shared<MockBinarySearchMerger>(); - - phrase_builder = make_shared<PhraseBuilder>(vocabulary); - } - - Index inverted_index; - Index collocations; - vector<int> data; - shared_ptr<MockVocabulary> vocabulary; - shared_ptr<MockDataArray> data_array; - shared_ptr<MockSuffixArray> suffix_array; - shared_ptr<MockPrecomputation> precomputation; - shared_ptr<MockLinearMerger> linear_merger; - shared_ptr<MockBinarySearchMerger> binary_search_merger; - shared_ptr<PhraseBuilder> phrase_builder; - shared_ptr<Intersector> intersector; -}; - -TEST_F(IntersectorTest, TestCachedCollocation) { - intersector = make_shared<Intersector>(vocabulary, precomputation, - suffix_array, linear_merger, binary_search_merger, false); - - vector<int> prefix_symbols = {2, -1}; - Phrase prefix = phrase_builder->Build(prefix_symbols); - vector<int> suffix_symbols = {-1, 4}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - vector<int> symbols = {2, -1, 4}; - Phrase phrase = phrase_builder->Build(symbols); - PhraseLocation prefix_locs(0, 1), suffix_locs(2, 3); - - PhraseLocation result = intersector->Intersect( - prefix, prefix_locs, suffix, suffix_locs, phrase); - - vector<int> expected_locs = {0, 2}; - PhraseLocation expected_result(expected_locs, 2); - - EXPECT_EQ(expected_result, result); - EXPECT_EQ(PhraseLocation(0, 1), prefix_locs); - EXPECT_EQ(PhraseLocation(2, 3), suffix_locs); -} - -TEST_F(IntersectorTest, TestLinearMergeaXb) { - vector<int> prefix_symbols = {3, -1}; - Phrase prefix = phrase_builder->Build(prefix_symbols); - vector<int> suffix_symbols = {-1, 4}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - vector<int> symbols = {3, -1, 4}; - Phrase phrase = phrase_builder->Build(symbols); - PhraseLocation prefix_locs(2, 5), suffix_locs(5, 7); - - vector<int> ex_prefix_locs = {1, 3, 5}; - PhraseLocation extended_prefix_locs(ex_prefix_locs, 1); - vector<int> ex_suffix_locs = {2, 4}; - PhraseLocation extended_suffix_locs(ex_suffix_locs, 1); - - vector<int> expected_locs = {1, 4}; - EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)) - .Times(1) - .WillOnce(SetArgReferee<0>(expected_locs)); - EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - - intersector = make_shared<Intersector>(vocabulary, precomputation, - suffix_array, linear_merger, binary_search_merger, false); - - PhraseLocation result = intersector->Intersect( - prefix, prefix_locs, suffix, suffix_locs, phrase); - PhraseLocation expected_result(expected_locs, 2); - - EXPECT_EQ(expected_result, result); - EXPECT_EQ(extended_prefix_locs, prefix_locs); - EXPECT_EQ(extended_suffix_locs, suffix_locs); -} - -TEST_F(IntersectorTest, TestBinarySearchMergeaXb) { - vector<int> prefix_symbols = {3, -1}; - Phrase prefix = phrase_builder->Build(prefix_symbols); - vector<int> suffix_symbols = {-1, 4}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - vector<int> symbols = {3, -1, 4}; - Phrase phrase = phrase_builder->Build(symbols); - PhraseLocation prefix_locs(2, 5), suffix_locs(5, 7); - - vector<int> ex_prefix_locs = {1, 3, 5}; - PhraseLocation extended_prefix_locs(ex_prefix_locs, 1); - vector<int> ex_suffix_locs = {2, 4}; - PhraseLocation extended_suffix_locs(ex_suffix_locs, 1); - - vector<int> expected_locs = {1, 4}; - EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)) - .Times(1) - .WillOnce(SetArgReferee<0>(expected_locs)); - EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - - intersector = make_shared<Intersector>(vocabulary, precomputation, - suffix_array, linear_merger, binary_search_merger, true); - - PhraseLocation result = intersector->Intersect( - prefix, prefix_locs, suffix, suffix_locs, phrase); - PhraseLocation expected_result(expected_locs, 2); - - EXPECT_EQ(expected_result, result); - EXPECT_EQ(extended_prefix_locs, prefix_locs); - EXPECT_EQ(extended_suffix_locs, suffix_locs); -} - -TEST_F(IntersectorTest, TestMergeaXbXc) { - vector<int> prefix_symbols = {2, -1, 4, -1}; - Phrase prefix = phrase_builder->Build(prefix_symbols); - vector<int> suffix_symbols = {-1, 4, -1, 4}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - vector<int> symbols = {2, -1, 4, -1, 4}; - Phrase phrase = phrase_builder->Build(symbols); - - vector<int> ex_prefix_locs = {0, 2, 0, 4}; - PhraseLocation extended_prefix_locs(ex_prefix_locs, 2); - vector<int> ex_suffix_locs = {2, 4}; - PhraseLocation extended_suffix_locs(ex_suffix_locs, 2); - vector<int> expected_locs = {0, 2, 4}; - EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)) - .Times(1) - .WillOnce(SetArgReferee<0>(expected_locs)); - EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - - intersector = make_shared<Intersector>(vocabulary, precomputation, - suffix_array, linear_merger, binary_search_merger, false); - - PhraseLocation result = intersector->Intersect( - prefix, extended_prefix_locs, suffix, extended_suffix_locs, phrase); - PhraseLocation expected_result(expected_locs, 3); - - EXPECT_EQ(expected_result, result); - EXPECT_EQ(ex_prefix_locs, *extended_prefix_locs.matchings); - EXPECT_EQ(ex_suffix_locs, *extended_suffix_locs.matchings); -} - -} // namespace diff --git a/extractor/linear_merger.cc b/extractor/linear_merger.cc deleted file mode 100644 index e7a32788..00000000 --- a/extractor/linear_merger.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "linear_merger.h" - -#include <cmath> - -#include "data_array.h" -#include "matching.h" -#include "matching_comparator.h" -#include "phrase.h" -#include "phrase_location.h" -#include "vocabulary.h" - -LinearMerger::LinearMerger(shared_ptr<Vocabulary> vocabulary, - shared_ptr<DataArray> data_array, - shared_ptr<MatchingComparator> comparator) : - vocabulary(vocabulary), data_array(data_array), comparator(comparator) {} - -LinearMerger::LinearMerger() {} - -LinearMerger::~LinearMerger() {} - -void LinearMerger::Merge( - vector<int>& locations, const Phrase& phrase, const Phrase& suffix, - vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, - vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, - int prefix_subpatterns, int suffix_subpatterns) { - int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); - bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); - - while (prefix_start != prefix_end) { - Matching left(prefix_start, prefix_subpatterns, - data_array->GetSentenceId(*prefix_start)); - - while (suffix_start != suffix_end) { - Matching right(suffix_start, suffix_subpatterns, - data_array->GetSentenceId(*suffix_start)); - if (comparator->Compare(left, right, last_chunk_len, offset) > 0) { - suffix_start += suffix_subpatterns; - } else { - break; - } - } - - int start_position = *prefix_start; - vector<int> :: iterator i = suffix_start; - while (prefix_start != prefix_end && *prefix_start == start_position) { - Matching left(prefix_start, prefix_subpatterns, - data_array->GetSentenceId(*prefix_start)); - - while (i != suffix_end) { - Matching right(i, suffix_subpatterns, data_array->GetSentenceId(*i)); - int comparison = comparator->Compare(left, right, last_chunk_len, - offset); - if (comparison == 0) { - vector<int> merged = left.Merge(right, phrase.Arity() + 1); - locations.insert(locations.end(), merged.begin(), merged.end()); - } else if (comparison < 0) { - break; - } - i += suffix_subpatterns; - } - - prefix_start += prefix_subpatterns; - } - } -} diff --git a/extractor/linear_merger.h b/extractor/linear_merger.h deleted file mode 100644 index c3c7111e..00000000 --- a/extractor/linear_merger.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _LINEAR_MERGER_H_ -#define _LINEAR_MERGER_H_ - -#include <memory> -#include <vector> - -using namespace std; - -class MatchingComparator; -class Phrase; -class PhraseLocation; -class DataArray; -class Vocabulary; - -class LinearMerger { - public: - LinearMerger(shared_ptr<Vocabulary> vocabulary, - shared_ptr<DataArray> data_array, - shared_ptr<MatchingComparator> comparator); - - virtual ~LinearMerger(); - - virtual void Merge( - vector<int>& locations, const Phrase& phrase, const Phrase& suffix, - vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, - vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, - int prefix_subpatterns, int suffix_subpatterns); - - protected: - LinearMerger(); - - private: - shared_ptr<Vocabulary> vocabulary; - shared_ptr<DataArray> data_array; - shared_ptr<MatchingComparator> comparator; -}; - -#endif diff --git a/extractor/linear_merger_test.cc b/extractor/linear_merger_test.cc deleted file mode 100644 index a6963430..00000000 --- a/extractor/linear_merger_test.cc +++ /dev/null @@ -1,149 +0,0 @@ -#include <gtest/gtest.h> - -#include <memory> - -#include "linear_merger.h" -#include "matching_comparator.h" -#include "mocks/mock_data_array.h" -#include "mocks/mock_vocabulary.h" -#include "phrase.h" -#include "phrase_location.h" -#include "phrase_builder.h" - -using namespace std; -using namespace ::testing; - -namespace { - -class LinearMergerTest : public Test { - protected: - virtual void SetUp() { - shared_ptr<MockVocabulary> vocabulary = make_shared<MockVocabulary>(); - EXPECT_CALL(*vocabulary, GetTerminalValue(_)) - .WillRepeatedly(Return("word")); - - shared_ptr<MockDataArray> data_array = make_shared<MockDataArray>(); - EXPECT_CALL(*data_array, GetSentenceId(_)) - .WillRepeatedly(Return(1)); - - shared_ptr<MatchingComparator> comparator = - make_shared<MatchingComparator>(1, 20); - - phrase_builder = make_shared<PhraseBuilder>(vocabulary); - linear_merger = make_shared<LinearMerger>(vocabulary, data_array, - comparator); - } - - shared_ptr<LinearMerger> linear_merger; - shared_ptr<PhraseBuilder> phrase_builder; -}; - -TEST_F(LinearMergerTest, aXbTest) { - vector<int> locations; - // Encoding for him X it (see Adam's dissertation). - vector<int> symbols{1, -1, 2}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs{2, 6, 10, 15}; - vector<int> suffix_locs{0, 4, 8, 13}; - - linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - - vector<int> expected_locations{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; - EXPECT_EQ(expected_locations, locations); -} - -TEST_F(LinearMergerTest, aXbXcTest) { - vector<int> locations; - // Encoding for it X him X it (see Adam's dissertation). - vector<int> symbols{1, -1, 2, -2, 1}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2, -2, 1}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs{0, 2, 0, 6, 0, 10, 4, 6, 4, 10, 4, 15, 8, 10, 8, 15, - 13, 15}; - vector<int> suffix_locs{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; - - linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 2); - - vector<int> expected_locs{0, 2, 4, 0, 2, 8, 0, 2, 13, 0, 6, 8, 0, 6, 13, 0, - 10, 13, 4, 6, 8, 4, 6, 13, 4, 10, 13, 8, 10, 13}; - EXPECT_EQ(expected_locs, locations); -} - -TEST_F(LinearMergerTest, abXcXdTest) { - // Sentence: Anna has many many nuts and sour apples and juicy apples. - // Phrase: Anna has X and X apples. - vector<int> locations; - vector<int> symbols{1, 2, -1, 3, -2, 4}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{2, -1, 3, -2, 4}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs{1, 6, 1, 9}; - vector<int> suffix_locs{2, 6, 8, 2, 6, 11, 2, 9, 11}; - - linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 3); - - vector<int> expected_locs{1, 6, 8, 1, 6, 11, 1, 9, 11}; - EXPECT_EQ(expected_locs, locations); -} - -TEST_F(LinearMergerTest, LargeTest) { - vector<int> locations; - vector<int> symbols{1, -1, 2}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs; - for (int i = 0; i < 100; ++i) { - prefix_locs.push_back(i * 20 + 1); - } - vector<int> suffix_locs; - for (int i = 0; i < 100; ++i) { - suffix_locs.push_back(i * 20 + 5); - suffix_locs.push_back(i * 20 + 13); - } - - linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - - EXPECT_EQ(400, locations.size()); - for (int i = 0; i < 100; ++i) { - EXPECT_EQ(i * 20 + 1, locations[4 * i]); - EXPECT_EQ(i * 20 + 5, locations[4 * i + 1]); - EXPECT_EQ(i * 20 + 1, locations[4 * i + 2]); - EXPECT_EQ(i * 20 + 13, locations[4 * i + 3]); - } -} - -TEST_F(LinearMergerTest, EmptyResultTest) { - vector<int> locations; - vector<int> symbols{1, -1, 2}; - Phrase phrase = phrase_builder->Build(symbols); - vector<int> suffix_symbols{-1, 2}; - Phrase suffix = phrase_builder->Build(suffix_symbols); - - vector<int> prefix_locs; - for (int i = 0; i < 100; ++i) { - prefix_locs.push_back(i * 200 + 1); - } - vector<int> suffix_locs; - for (int i = 0; i < 100; ++i) { - suffix_locs.push_back(i * 200 + 101); - } - - linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), - prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - - EXPECT_EQ(0, locations.size()); -} - -} // namespace diff --git a/extractor/matching.cc b/extractor/matching.cc deleted file mode 100644 index 16a3ed6f..00000000 --- a/extractor/matching.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "matching.h" - -Matching::Matching(vector<int>::iterator start, int len, int sentence_id) : - positions(start, start + len), sentence_id(sentence_id) {} - -vector<int> Matching::Merge(const Matching& other, int num_subpatterns) const { - vector<int> result = positions; - if (num_subpatterns > positions.size()) { - result.push_back(other.positions.back()); - } - return result; -} diff --git a/extractor/matching.h b/extractor/matching.h deleted file mode 100644 index 4c46559e..00000000 --- a/extractor/matching.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _MATCHING_H_ -#define _MATCHING_H_ - -#include <memory> -#include <vector> - -using namespace std; - -struct Matching { - Matching(vector<int>::iterator start, int len, int sentence_id); - - vector<int> Merge(const Matching& other, int num_subpatterns) const; - - vector<int> positions; - int sentence_id; -}; - -#endif diff --git a/extractor/matching_comparator.cc b/extractor/matching_comparator.cc deleted file mode 100644 index 03db95c0..00000000 --- a/extractor/matching_comparator.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include "matching_comparator.h" - -#include "matching.h" -#include "vocabulary.h" - -MatchingComparator::MatchingComparator(int min_gap_size, int max_rule_span) : - min_gap_size(min_gap_size), max_rule_span(max_rule_span) {} - -int MatchingComparator::Compare(const Matching& left, - const Matching& right, - int last_chunk_len, - bool offset) const { - if (left.sentence_id != right.sentence_id) { - return left.sentence_id < right.sentence_id ? -1 : 1; - } - - if (left.positions.size() == 1 && right.positions.size() == 1) { - // The prefix and the suffix must be separated by a non-terminal, otherwise - // we would be doing a suffix array lookup. - if (right.positions[0] - left.positions[0] <= min_gap_size) { - return 1; - } - } else if (offset) { - for (size_t i = 1; i < left.positions.size(); ++i) { - if (left.positions[i] != right.positions[i - 1]) { - return left.positions[i] < right.positions[i - 1] ? -1 : 1; - } - } - } else { - if (left.positions[0] + 1 != right.positions[0]) { - return left.positions[0] + 1 < right.positions[0] ? -1 : 1; - } - for (size_t i = 1; i < left.positions.size(); ++i) { - if (left.positions[i] != right.positions[i]) { - return left.positions[i] < right.positions[i] ? -1 : 1; - } - } - } - - if (right.positions.back() + last_chunk_len - left.positions.front() > - max_rule_span) { - return -1; - } - - return 0; -} diff --git a/extractor/matching_comparator.h b/extractor/matching_comparator.h deleted file mode 100644 index 6e1bb487..00000000 --- a/extractor/matching_comparator.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _MATCHING_COMPARATOR_H_ -#define _MATCHING_COMPARATOR_H_ - -#include <memory> - -using namespace std; - -class Vocabulary; -class Matching; - -class MatchingComparator { - public: - MatchingComparator(int min_gap_size, int max_rule_span); - - int Compare(const Matching& left, const Matching& right, - int last_chunk_len, bool offset) const; - - private: - int min_gap_size; - int max_rule_span; -}; - -#endif diff --git a/extractor/matching_comparator_test.cc b/extractor/matching_comparator_test.cc deleted file mode 100644 index b8f898cf..00000000 --- a/extractor/matching_comparator_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -#include <gtest/gtest.h> - -#include "matching.h" -#include "matching_comparator.h" - -using namespace ::testing; - -namespace { - -class MatchingComparatorTest : public Test { - protected: - virtual void SetUp() { - comparator = make_shared<MatchingComparator>(1, 20); - } - - shared_ptr<MatchingComparator> comparator; -}; - -TEST_F(MatchingComparatorTest, SmallerSentenceId) { - vector<int> left_locations{1}; - Matching left(left_locations.begin(), 1, 1); - vector<int> right_locations{100}; - Matching right(right_locations.begin(), 1, 5); - EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, GreaterSentenceId) { - vector<int> left_locations{100}; - Matching left(left_locations.begin(), 1, 5); - vector<int> right_locations{1}; - Matching right(right_locations.begin(), 1, 1); - EXPECT_EQ(1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, SmalleraXb) { - vector<int> left_locations{1}; - Matching left(left_locations.begin(), 1, 1); - vector<int> right_locations{21}; - Matching right(right_locations.begin(), 1, 1); - // The matching exceeds the max rule span. - EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, EqualaXb) { - vector<int> left_locations{1}; - Matching left(left_locations.begin(), 1, 1); - vector<int> lower_right_locations{3}; - Matching right(lower_right_locations.begin(), 1, 1); - EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); - - vector<int> higher_right_locations{20}; - right = Matching(higher_right_locations.begin(), 1, 1); - EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, GreateraXb) { - vector<int> left_locations{1}; - Matching left(left_locations.begin(), 1, 1); - vector<int> right_locations{2}; - Matching right(right_locations.begin(), 1, 1); - // The gap between the prefix and the suffix is of size 0, less than the - // min gap size. - EXPECT_EQ(1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, SmalleraXbXc) { - vector<int> left_locations{1, 3}; - Matching left(left_locations.begin(), 2, 1); - vector<int> right_locations{4, 6}; - // The common part doesn't match. - Matching right(right_locations.begin(), 2, 1); - EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); - - // The common part matches, but the rule exceeds the max span. - vector<int> other_right_locations{3, 21}; - right = Matching(other_right_locations.begin(), 2, 1); - EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, EqualaXbXc) { - vector<int> left_locations{1, 3}; - Matching left(left_locations.begin(), 2, 1); - vector<int> right_locations{3, 5}; - // The leftmost possible match. - Matching right(right_locations.begin(), 2, 1); - EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); - - // The rightmost possible match. - vector<int> other_right_locations{3, 20}; - right = Matching(other_right_locations.begin(), 2, 1); - EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, GreateraXbXc) { - vector<int> left_locations{1, 4}; - Matching left(left_locations.begin(), 2, 1); - vector<int> right_locations{3, 5}; - // The common part doesn't match. - Matching right(right_locations.begin(), 2, 1); - EXPECT_EQ(1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, SmallerabXcXd) { - vector<int> left_locations{9, 13}; - Matching left(left_locations.begin(), 2, 1); - // The suffix doesn't start on the next position. - vector<int> right_locations{11, 13, 15}; - Matching right(right_locations.begin(), 3, 1); - EXPECT_EQ(-1, comparator->Compare(left, right, 1, false)); - - // The common part doesn't match. - vector<int> other_right_locations{10, 16, 18}; - right = Matching(other_right_locations.begin(), 3, 1); - EXPECT_EQ(-1, comparator->Compare(left, right, 1, false)); -} - -TEST_F(MatchingComparatorTest, EqualabXcXd) { - vector<int> left_locations{10, 13}; - Matching left(left_locations.begin(), 2, 1); - vector<int> right_locations{11, 13, 15}; - Matching right(right_locations.begin(), 3, 1); - EXPECT_EQ(0, comparator->Compare(left, right, 1, false)); -} - -TEST_F(MatchingComparatorTest, GreaterabXcXd) { - vector<int> left_locations{9, 15}; - Matching left(left_locations.begin(), 2, 1); - // The suffix doesn't start on the next position. - vector<int> right_locations{7, 15, 17}; - Matching right(right_locations.begin(), 3, 1); - EXPECT_EQ(1, comparator->Compare(left, right, 1, false)); - - // The common part doesn't match. - vector<int> other_right_locations{10, 13, 16}; - right = Matching(other_right_locations.begin(), 3, 1); - EXPECT_EQ(1, comparator->Compare(left, right, 1, false)); -} - -} // namespace diff --git a/extractor/matching_test.cc b/extractor/matching_test.cc deleted file mode 100644 index 9593aa86..00000000 --- a/extractor/matching_test.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include <gtest/gtest.h> - -#include <vector> - -#include "matching.h" - -using namespace std; - -namespace { - -TEST(MatchingTest, SameSize) { - vector<int> positions{1, 2, 3}; - Matching left(positions.begin(), positions.size(), 0); - Matching right(positions.begin(), positions.size(), 0); - EXPECT_EQ(positions, left.Merge(right, positions.size())); -} - -TEST(MatchingTest, DifferentSize) { - vector<int> positions{1, 2, 3}; - Matching left(positions.begin(), positions.size() - 1, 0); - Matching right(positions.begin() + 1, positions.size() - 1, 0); - vector<int> result = left.Merge(right, positions.size()); -} - -} // namespace diff --git a/extractor/mocks/mock_binary_search_merger.h b/extractor/mocks/mock_binary_search_merger.h deleted file mode 100644 index e23386f0..00000000 --- a/extractor/mocks/mock_binary_search_merger.h +++ /dev/null @@ -1,15 +0,0 @@ -#include <gmock/gmock.h> - -#include <vector> - -#include "../binary_search_merger.h" -#include "../phrase.h" - -using namespace std; - -class MockBinarySearchMerger: public BinarySearchMerger { - public: - MOCK_CONST_METHOD9(Merge, void(vector<int>&, const Phrase&, const Phrase&, - const vector<int>::iterator&, const vector<int>::iterator&, - const vector<int>::iterator&, const vector<int>::iterator&, int, int)); -}; diff --git a/extractor/mocks/mock_intersector.h b/extractor/mocks/mock_intersector.h deleted file mode 100644 index 372fa7ea..00000000 --- a/extractor/mocks/mock_intersector.h +++ /dev/null @@ -1,11 +0,0 @@ -#include <gmock/gmock.h> - -#include "../intersector.h" -#include "../phrase.h" -#include "../phrase_location.h" - -class MockIntersector : public Intersector { - public: - MOCK_METHOD5(Intersect, PhraseLocation(const Phrase&, PhraseLocation&, - const Phrase&, PhraseLocation&, const Phrase&)); -}; diff --git a/extractor/mocks/mock_linear_merger.h b/extractor/mocks/mock_linear_merger.h deleted file mode 100644 index 522c1f31..00000000 --- a/extractor/mocks/mock_linear_merger.h +++ /dev/null @@ -1,15 +0,0 @@ -#include <gmock/gmock.h> - -#include <vector> - -#include "../linear_merger.h" -#include "../phrase.h" - -using namespace std; - -class MockLinearMerger: public LinearMerger { - public: - MOCK_METHOD9(Merge, void(vector<int>&, const Phrase&, const Phrase&, - vector<int>::iterator, vector<int>::iterator, vector<int>::iterator, - vector<int>::iterator, int, int)); -}; diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc index fbdc7cce..4908dac0 100644 --- a/extractor/rule_factory.cc +++ b/extractor/rule_factory.cc @@ -7,9 +7,7 @@ #include "grammar.h" #include "fast_intersector.h" -#include "intersector.h" #include "matchings_finder.h" -#include "matching_comparator.h" #include "phrase.h" #include "rule.h" #include "rule_extractor.h" @@ -50,8 +48,6 @@ HieroCachingRuleFactory::HieroCachingRuleFactory( int max_nonterminals, int max_rule_symbols, int max_samples, - bool use_fast_intersect, - bool use_baeza_yates, bool require_tight_phrases) : vocabulary(vocabulary), scorer(scorer), @@ -59,13 +55,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory( max_rule_span(max_rule_span), max_nonterminals(max_nonterminals), max_chunks(max_nonterminals + 1), - max_rule_symbols(max_rule_symbols), - use_fast_intersect(use_fast_intersect) { + max_rule_symbols(max_rule_symbols) { matchings_finder = make_shared<MatchingsFinder>(source_suffix_array); - shared_ptr<MatchingComparator> comparator = - make_shared<MatchingComparator>(min_gap_size, max_rule_span); - intersector = make_shared<Intersector>(vocabulary, precomputation, - source_suffix_array, comparator, use_baeza_yates); fast_intersector = make_shared<FastIntersector>(source_suffix_array, precomputation, vocabulary, max_rule_span, min_gap_size); phrase_builder = make_shared<PhraseBuilder>(vocabulary); @@ -78,7 +69,6 @@ HieroCachingRuleFactory::HieroCachingRuleFactory( HieroCachingRuleFactory::HieroCachingRuleFactory( shared_ptr<MatchingsFinder> finder, - shared_ptr<Intersector> intersector, shared_ptr<FastIntersector> fast_intersector, shared_ptr<PhraseBuilder> phrase_builder, shared_ptr<RuleExtractor> rule_extractor, @@ -89,10 +79,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory( int max_rule_span, int max_nonterminals, int max_chunks, - int max_rule_symbols, - bool use_fast_intersect) : + int max_rule_symbols) : matchings_finder(finder), - intersector(intersector), fast_intersector(fast_intersector), phrase_builder(phrase_builder), rule_extractor(rule_extractor), @@ -103,15 +91,13 @@ HieroCachingRuleFactory::HieroCachingRuleFactory( max_rule_span(max_rule_span), max_nonterminals(max_nonterminals), max_chunks(max_chunks), - max_rule_symbols(max_rule_symbols), - use_fast_intersect(use_fast_intersect) {} + max_rule_symbols(max_rule_symbols) {} HieroCachingRuleFactory::HieroCachingRuleFactory() {} HieroCachingRuleFactory::~HieroCachingRuleFactory() {} Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) { - intersector->sort_time = 0; Clock::time_point start_time = Clock::now(); double total_extract_time = 0; double total_intersect_time = 0; @@ -164,17 +150,8 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) { PhraseLocation phrase_location; if (next_phrase.Arity() > 0) { Clock::time_point intersect_start = Clock::now(); - if (use_fast_intersect) { - phrase_location = fast_intersector->Intersect( - node->matchings, next_suffix_link->matchings, next_phrase); - } else { - phrase_location = intersector->Intersect( - node->phrase, - node->matchings, - next_suffix_link->phrase, - next_suffix_link->matchings, - next_phrase); - } + phrase_location = fast_intersector->Intersect( + node->matchings, next_suffix_link->matchings, next_phrase); Clock::time_point intersect_stop = Clock::now(); total_intersect_time += GetDuration(intersect_start, intersect_stop); } else { diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index a39386a8..2d1c7f5a 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -14,7 +14,6 @@ class DataArray; class Grammar; class MatchingsFinder; class FastIntersector; -class Intersector; class Precomputation; class Rule; class RuleExtractor; @@ -38,14 +37,11 @@ class HieroCachingRuleFactory { int max_nonterminals, int max_rule_symbols, int max_samples, - bool use_fast_intersect, - bool use_beaza_yates, bool require_tight_phrases); // For testing only. HieroCachingRuleFactory( shared_ptr<MatchingsFinder> finder, - shared_ptr<Intersector> intersector, shared_ptr<FastIntersector> fast_intersector, shared_ptr<PhraseBuilder> phrase_builder, shared_ptr<RuleExtractor> rule_extractor, @@ -56,8 +52,7 @@ class HieroCachingRuleFactory { int max_rule_span, int max_nonterminals, int max_chunks, - int max_rule_symbols, - bool use_fast_intersect); + int max_rule_symbols); virtual ~HieroCachingRuleFactory(); @@ -83,7 +78,6 @@ class HieroCachingRuleFactory { const shared_ptr<TrieNode>& node); shared_ptr<MatchingsFinder> matchings_finder; - shared_ptr<Intersector> intersector; shared_ptr<FastIntersector> fast_intersector; MatchingsTrie trie; shared_ptr<PhraseBuilder> phrase_builder; @@ -96,7 +90,6 @@ class HieroCachingRuleFactory { int max_nonterminals; int max_chunks; int max_rule_symbols; - bool use_fast_intersect; }; #endif diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc index d329382a..fc709461 100644 --- a/extractor/rule_factory_test.cc +++ b/extractor/rule_factory_test.cc @@ -6,7 +6,6 @@ #include "grammar.h" #include "mocks/mock_fast_intersector.h" -#include "mocks/mock_intersector.h" #include "mocks/mock_matchings_finder.h" #include "mocks/mock_rule_extractor.h" #include "mocks/mock_sampler.h" @@ -25,7 +24,6 @@ class RuleFactoryTest : public Test { protected: virtual void SetUp() { finder = make_shared<MockMatchingsFinder>(); - intersector = make_shared<MockIntersector>(); fast_intersector = make_shared<MockFastIntersector>(); vocabulary = make_shared<MockVocabulary>(); @@ -55,7 +53,6 @@ class RuleFactoryTest : public Test { vector<string> feature_names; shared_ptr<MockMatchingsFinder> finder; - shared_ptr<MockIntersector> intersector; shared_ptr<MockFastIntersector> fast_intersector; shared_ptr<MockVocabulary> vocabulary; shared_ptr<PhraseBuilder> phrase_builder; @@ -66,81 +63,39 @@ class RuleFactoryTest : public Test { }; TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) { - factory = make_shared<HieroCachingRuleFactory>(finder, intersector, - fast_intersector, phrase_builder, extractor, vocabulary, sampler, - scorer, 1, 10, 2, 3, 5, false); + factory = make_shared<HieroCachingRuleFactory>(finder, fast_intersector, + phrase_builder, extractor, vocabulary, sampler, scorer, 1, 10, 2, 3, 5); EXPECT_CALL(*finder, Find(_, _, _)) .Times(6) .WillRepeatedly(Return(PhraseLocation(0, 1))); - EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)) + EXPECT_CALL(*fast_intersector, Intersect(_, _, _)) .Times(1) .WillRepeatedly(Return(PhraseLocation(0, 1))); - EXPECT_CALL(*fast_intersector, Intersect(_, _, _)).Times(0); vector<int> word_ids = {2, 3, 4}; Grammar grammar = factory->GetGrammar(word_ids); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(7, grammar.GetRules().size()); - - // Test for fast intersector. - factory = make_shared<HieroCachingRuleFactory>(finder, intersector, - fast_intersector, phrase_builder, extractor, vocabulary, sampler, - scorer, 1, 10, 2, 3, 5, true); - - EXPECT_CALL(*finder, Find(_, _, _)) - .Times(6) - .WillRepeatedly(Return(PhraseLocation(0, 1))); - - EXPECT_CALL(*fast_intersector, Intersect(_, _, _)) - .Times(1) - .WillRepeatedly(Return(PhraseLocation(0, 1))); - EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)).Times(0); - - grammar = factory->GetGrammar(word_ids); - EXPECT_EQ(feature_names, grammar.GetFeatureNames()); - EXPECT_EQ(7, grammar.GetRules().size()); } TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) { - factory = make_shared<HieroCachingRuleFactory>(finder, intersector, - fast_intersector, phrase_builder, extractor, vocabulary, sampler, - scorer, 1, 10, 2, 3, 5, false); + factory = make_shared<HieroCachingRuleFactory>(finder, fast_intersector, + phrase_builder, extractor, vocabulary, sampler, scorer, 1, 10, 2, 3, 5); EXPECT_CALL(*finder, Find(_, _, _)) .Times(12) .WillRepeatedly(Return(PhraseLocation(0, 1))); - EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)) + EXPECT_CALL(*fast_intersector, Intersect(_, _, _)) .Times(16) .WillRepeatedly(Return(PhraseLocation(0, 1))); - EXPECT_CALL(*fast_intersector, Intersect(_, _, _)).Times(0); - vector<int> word_ids = {2, 3, 4, 2, 3}; Grammar grammar = factory->GetGrammar(word_ids); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(28, grammar.GetRules().size()); - - // Test for fast intersector. - factory = make_shared<HieroCachingRuleFactory>(finder, intersector, - fast_intersector, phrase_builder, extractor, vocabulary, sampler, - scorer, 1, 10, 2, 3, 5, true); - - EXPECT_CALL(*finder, Find(_, _, _)) - .Times(12) - .WillRepeatedly(Return(PhraseLocation(0, 1))); - - EXPECT_CALL(*fast_intersector, Intersect(_, _, _)) - .Times(16) - .WillRepeatedly(Return(PhraseLocation(0, 1))); - - EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)).Times(0); - - grammar = factory->GetGrammar(word_ids); - EXPECT_EQ(feature_names, grammar.GetFeatureNames()); - EXPECT_EQ(28, grammar.GetRules().size()); } } // namespace diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 2b01e832..36dbd7a0 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -66,13 +66,9 @@ int main(int argc, char** argv) { "Minimum number of occurences for a pharse to be considered frequent") ("max_samples", po::value<int>()->default_value(300), "Maximum number of samples") - ("fast_intersect", po::value<bool>()->default_value(false), - "Enable fast intersect") // TODO(pauldb): Check if this works when set to false. ("tight_phrases", po::value<bool>()->default_value(true), - "False if phrases may be loose (better, but slower)") - ("baeza_yates", po::value<bool>()->default_value(true), - "Use double binary search"); + "False if phrases may be loose (better, but slower)"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -178,8 +174,6 @@ int main(int argc, char** argv) { vm["max_nonterminals"].as<int>(), vm["max_rule_symbols"].as<int>(), vm["max_samples"].as<int>(), - vm["fast_intersect"].as<bool>(), - vm["baeza_yates"].as<bool>(), vm["tight_phrases"].as<bool>()); int grammar_id = 0; diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc index 23c458a4..ab8a0913 100644 --- a/extractor/suffix_array.cc +++ b/extractor/suffix_array.cc @@ -136,7 +136,7 @@ void SuffixArray::TernaryQuicksort(int left, int right, int step, vector<int> SuffixArray::BuildLCPArray() const { Clock::time_point start_time = Clock::now(); - cerr << "Constructing LCP array..." << endl; + cerr << "\tConstructing LCP array..." << endl; vector<int> lcp(suffix_array.size()); vector<int> rank(suffix_array.size()); @@ -165,7 +165,7 @@ vector<int> SuffixArray::BuildLCPArray() const { } Clock::time_point stop_time = Clock::now(); - cerr << "Constructing LCP took " + cerr << "\tConstructing LCP took " << GetDuration(start_time, stop_time) << " seconds" << endl; return lcp; diff --git a/extractor/veb.cc b/extractor/veb.cc deleted file mode 100644 index f38f672e..00000000 --- a/extractor/veb.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "veb.h" - -#include "veb_bitset.h" -#include "veb_tree.h" - -int VEB::MIN_BOTTOM_BITS = 5; -int VEB::MIN_BOTTOM_SIZE = 1 << VEB::MIN_BOTTOM_BITS; - -shared_ptr<VEB> VEB::Create(int size) { - if (size > MIN_BOTTOM_SIZE) { - return shared_ptr<VEB>(new VEBTree(size)); - } else { - return shared_ptr<VEB>(new VEBBitset(size)); - } -} - -int VEB::GetMinimum() { - return min; -} - -int VEB::GetMaximum() { - return max; -} - -VEB::VEB(int min, int max) : min(min), max(max) {} diff --git a/extractor/veb.h b/extractor/veb.h deleted file mode 100644 index c8209cf7..00000000 --- a/extractor/veb.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _VEB_H_ -#define _VEB_H_ - -#include <memory> - -using namespace std; - -class VEB { - public: - static shared_ptr<VEB> Create(int size); - - virtual void Insert(int value) = 0; - - virtual int GetSuccessor(int value) = 0; - - int GetMinimum(); - - int GetMaximum(); - - static int MIN_BOTTOM_BITS; - static int MIN_BOTTOM_SIZE; - - protected: - VEB(int min = -1, int max = -1); - - int min, max; -}; - -#endif diff --git a/extractor/veb_bitset.cc b/extractor/veb_bitset.cc deleted file mode 100644 index 4e364cc5..00000000 --- a/extractor/veb_bitset.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "veb_bitset.h" - -using namespace std; - -VEBBitset::VEBBitset(int size) : bitset(size) { - min = max = -1; -} - -void VEBBitset::Insert(int value) { - bitset[value] = 1; - if (min == -1 || value < min) { - min = value; - } - if (max == - 1 || value > max) { - max = value; - } -} - -int VEBBitset::GetSuccessor(int value) { - int next_value = bitset.find_next(value); - if (next_value == bitset.npos) { - return -1; - } - return next_value; -} diff --git a/extractor/veb_bitset.h b/extractor/veb_bitset.h deleted file mode 100644 index f8a91234..00000000 --- a/extractor/veb_bitset.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _VEB_BITSET_H_ -#define _VEB_BITSET_H_ - -#include <boost/dynamic_bitset.hpp> - -#include "veb.h" - -class VEBBitset: public VEB { - public: - VEBBitset(int size); - - void Insert(int value); - - int GetMinimum(); - - int GetSuccessor(int value); - - private: - boost::dynamic_bitset<> bitset; -}; - -#endif diff --git a/extractor/veb_test.cc b/extractor/veb_test.cc deleted file mode 100644 index c40c9f28..00000000 --- a/extractor/veb_test.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include <gtest/gtest.h> - -#include <algorithm> -#include <vector> - -#include "veb.h" - -using namespace std; - -namespace { - -class VEBTest : public ::testing::Test { - protected: - void VEBSortTester(vector<int> values, int max_value) { - shared_ptr<VEB> veb = VEB::Create(max_value); - for (int value: values) { - veb->Insert(value); - } - - sort(values.begin(), values.end()); - EXPECT_EQ(values.front(), veb->GetMinimum()); - EXPECT_EQ(values.back(), veb->GetMaximum()); - for (size_t i = 0; i + 1 < values.size(); ++i) { - EXPECT_EQ(values[i + 1], veb->GetSuccessor(values[i])); - } - EXPECT_EQ(-1, veb->GetSuccessor(values.back())); - } -}; - -TEST_F(VEBTest, SmallRange) { - vector<int> values{8, 13, 5, 1, 4, 15, 2, 10, 6, 7}; - VEBSortTester(values, 16); -} - -TEST_F(VEBTest, MediumRange) { - vector<int> values{167, 243, 88, 12, 137, 199, 212, 45, 150, 189}; - VEBSortTester(values, 255); -} - -TEST_F(VEBTest, LargeRangeSparse) { - vector<int> values; - for (size_t i = 0; i < 100; ++i) { - values.push_back(i * 1000000); - } - VEBSortTester(values, 100000000); -} - -TEST_F(VEBTest, LargeRangeDense) { - vector<int> values; - for (size_t i = 0; i < 1000000; ++i) { - values.push_back(i); - } - VEBSortTester(values, 1000000); -} - -} // namespace diff --git a/extractor/veb_tree.cc b/extractor/veb_tree.cc deleted file mode 100644 index f8945445..00000000 --- a/extractor/veb_tree.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include <cmath> - -#include "veb_tree.h" - -VEBTree::VEBTree(int size) { - int num_bits = ceil(log2(size)); - - lower_bits = num_bits >> 1; - upper_size = (size >> lower_bits) + 1; - - clusters.reserve(upper_size); - clusters.resize(upper_size); -} - -int VEBTree::GetNextValue(int value) { - return value & ((1 << lower_bits) - 1); -} - -int VEBTree::GetCluster(int value) { - return value >> lower_bits; -} - -int VEBTree::Compose(int cluster, int value) { - return (cluster << lower_bits) + value; -} - -void VEBTree::Insert(int value) { - if (min == -1 && max == -1) { - min = max = value; - return; - } - - if (value < min) { - swap(min, value); - } - - int cluster = GetCluster(value), next_value = GetNextValue(value); - if (clusters[cluster] == NULL) { - clusters[cluster] = VEB::Create(1 << lower_bits); - if (summary == NULL) { - summary = VEB::Create(upper_size); - } - summary->Insert(cluster); - } - clusters[cluster]->Insert(next_value); - - if (value > max) { - max = value; - } -} - -int VEBTree::GetSuccessor(int value) { - if (value >= max) { - return -1; - } - if (value < min) { - return min; - } - - int cluster = GetCluster(value), next_value = GetNextValue(value); - if (clusters[cluster] != NULL && - next_value < clusters[cluster]->GetMaximum()) { - return Compose(cluster, clusters[cluster]->GetSuccessor(next_value)); - } else { - int next_cluster = summary->GetSuccessor(cluster); - if (next_cluster == -1) { - return -1; - } - return Compose(next_cluster, clusters[next_cluster]->GetMinimum()); - } -} diff --git a/extractor/veb_tree.h b/extractor/veb_tree.h deleted file mode 100644 index 578d3e6a..00000000 --- a/extractor/veb_tree.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _VEB_TREE_H_ -#define _VEB_TREE_H_ - -#include <memory> -#include <vector> - -using namespace std; - -#include "veb.h" - -class VEBTree: public VEB { - public: - VEBTree(int size); - - void Insert(int value); - - int GetSuccessor(int value); - - private: - int GetNextValue(int value); - int GetCluster(int value); - int Compose(int cluster, int value); - - int lower_bits, upper_size; - shared_ptr<VEB> summary; - vector<shared_ptr<VEB> > clusters; -}; - -#endif |