diff options
Diffstat (limited to 'extractor')
35 files changed, 50 insertions, 1862 deletions
| diff --git a/extractor/Makefile.am b/extractor/Makefile.am index df796cea..721df18b 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -1,7 +1,6 @@  bin_PROGRAMS = compile run_extractor  EXTRA_PROGRAMS = alignment_test \ -    binary_search_merger_test \      data_array_test \      fast_intersector_test \      feature_count_source_target_test \ @@ -12,10 +11,6 @@ EXTRA_PROGRAMS = alignment_test \      feature_sample_source_count_test \      feature_target_given_source_coherent_test \      grammar_extractor_test \ -    intersector_test \ -    linear_merger_test \ -    matching_comparator_test \ -    matching_test \      matchings_finder_test \      phrase_test \      precomputation_test \ @@ -26,12 +21,10 @@ EXTRA_PROGRAMS = alignment_test \      scorer_test \      suffix_array_test \      target_phrase_extractor_test \ -    translation_table_test \ -    veb_test +    translation_table_test  if HAVE_GTEST    RUNNABLE_TESTS = alignment_test \ -    binary_search_merger_test \      data_array_test \      fast_intersector_test \      feature_count_source_target_test \ @@ -42,10 +35,6 @@ if HAVE_GTEST      feature_sample_source_count_test \      feature_target_given_source_coherent_test \      grammar_extractor_test \ -    intersector_test \ -    linear_merger_test \ -    matching_comparator_test \ -    matching_test \      matchings_finder_test \      phrase_test \      precomputation_test \ @@ -56,8 +45,7 @@ if HAVE_GTEST      scorer_test \      suffix_array_test \      target_phrase_extractor_test \ -    translation_table_test \ -    veb_test +    translation_table_test  endif  noinst_PROGRAMS = $(RUNNABLE_TESTS) @@ -66,8 +54,6 @@ TESTS = $(RUNNABLE_TESTS)  alignment_test_SOURCES = alignment_test.cc  alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -binary_search_merger_test_SOURCES = binary_search_merger_test.cc -binary_search_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a  data_array_test_SOURCES = data_array_test.cc  data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a  fast_intersector_test_SOURCES = fast_intersector_test.cc @@ -88,14 +74,6 @@ feature_target_given_source_coherent_test_SOURCES = features/target_given_source  feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a  grammar_extractor_test_SOURCES = grammar_extractor_test.cc  grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -intersector_test_SOURCES = intersector_test.cc -intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -linear_merger_test_SOURCES = linear_merger_test.cc -linear_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -matching_comparator_test_SOURCES = matching_comparator_test.cc -matching_comparator_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -matching_test_SOURCES = matching_test.cc -matching_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a  matchings_finder_test_SOURCES = matchings_finder_test.cc  matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a  phrase_test_SOURCES = phrase_test.cc @@ -118,8 +96,6 @@ target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc  target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a  translation_table_test_SOURCES = translation_table_test.cc  translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -veb_test_SOURCES = veb_test.cc -veb_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a  noinst_LIBRARIES = libextractor.a libcompile.a @@ -139,7 +115,6 @@ libcompile_a_SOURCES = \  libextractor_a_SOURCES = \    alignment.cc \ -  binary_search_merger.cc \    data_array.cc \    fast_intersector.cc \    features/count_source_target.cc \ @@ -152,11 +127,7 @@ libextractor_a_SOURCES = \    features/target_given_source_coherent.cc \    grammar.cc \    grammar_extractor.cc \ -  matching.cc \ -  matching_comparator.cc \    matchings_finder.cc \ -  intersector.cc \ -  linear_merger.cc \    matchings_trie.cc \    phrase.cc \    phrase_builder.cc \ @@ -172,9 +143,6 @@ libextractor_a_SOURCES = \    target_phrase_extractor.cc \    time_util.cc \    translation_table.cc \ -  veb.cc \ -  veb_bitset.cc \ -  veb_tree.cc \    vocabulary.cc  AM_CPPFLAGS = -W -Wall -Wno-sign-compare -std=c++0x $(GTEST_CPPFLAGS) $(GMOCK_CPPFLAGS) diff --git a/extractor/binary_search_merger.cc b/extractor/binary_search_merger.cc deleted file mode 100644 index c1b86a77..00000000 --- a/extractor/binary_search_merger.cc +++ /dev/null @@ -1,251 +0,0 @@ -#include "binary_search_merger.h" - -#include "data_array.h" -#include "linear_merger.h" -#include "matching.h" -#include "matching_comparator.h" -#include "phrase.h" -#include "vocabulary.h" - -double BinarySearchMerger::BAEZA_YATES_FACTOR = 1.0; - -BinarySearchMerger::BinarySearchMerger( -    shared_ptr<Vocabulary> vocabulary, -    shared_ptr<LinearMerger> linear_merger, -    shared_ptr<DataArray> data_array, -    shared_ptr<MatchingComparator> comparator, -    bool force_binary_search_merge) : -    vocabulary(vocabulary), linear_merger(linear_merger), -    data_array(data_array), comparator(comparator), -    force_binary_search_merge(force_binary_search_merge) {} - -BinarySearchMerger::BinarySearchMerger() {} - -BinarySearchMerger::~BinarySearchMerger() {} - -void BinarySearchMerger::Merge( -    vector<int>& locations, const Phrase& phrase, const Phrase& suffix, -    const vector<int>::iterator& prefix_start, -    const vector<int>::iterator& prefix_end, -    const vector<int>::iterator& suffix_start, -    const vector<int>::iterator& suffix_end, -    int prefix_subpatterns, int suffix_subpatterns) const { -  if (IsIntersectionVoid(prefix_start, prefix_end, suffix_start, suffix_end, -      prefix_subpatterns, suffix_subpatterns, suffix)) { -    return; -  } - -  int prefix_set_size = prefix_end - prefix_start; -  int suffix_set_size = suffix_end - suffix_start; -  if (ShouldUseLinearMerge(prefix_set_size, suffix_set_size)) { -    linear_merger->Merge(locations, phrase, suffix, prefix_start, prefix_end, -          suffix_start, suffix_end, prefix_subpatterns, suffix_subpatterns); -    return; -  } - -  vector<int>::iterator low, high, prefix_low, prefix_high, suffix_mid; -  if (prefix_set_size > suffix_set_size) { -    // Binary search on the prefix set. -    suffix_mid = GetMiddle(suffix_start, suffix_end, suffix_subpatterns); -    low = prefix_start, high = prefix_end; -    while (low < high) { -      vector<int>::iterator prefix_mid = -          GetMiddle(low, high, prefix_subpatterns); - -      GetComparableMatchings(prefix_start, prefix_end, prefix_mid, -          prefix_subpatterns, prefix_low, prefix_high); -      int comparison = CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, -          prefix_subpatterns, suffix_subpatterns, suffix); -      if (comparison == 0) { -        break; -      } else if (comparison < 0) { -        low = prefix_mid + prefix_subpatterns; -      } else { -        high = prefix_mid; -      } -    } -  } else { -    // Binary search on the suffix set. -    vector<int>::iterator prefix_mid = -        GetMiddle(prefix_start, prefix_end, prefix_subpatterns); - -    GetComparableMatchings(prefix_start, prefix_end, prefix_mid, -        prefix_subpatterns, prefix_low, prefix_high); -    low = suffix_start, high = suffix_end; -    while (low < high) { -      suffix_mid = GetMiddle(low, high, suffix_subpatterns); - -      int comparison = CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, -          prefix_subpatterns, suffix_subpatterns, suffix); -      if (comparison == 0) { -        break; -      } else if (comparison > 0) { -        low = suffix_mid + suffix_subpatterns; -      } else { -        high = suffix_mid; -      } -    } -  } - -  vector<int> result; -  int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); -  bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); -  vector<int>::iterator suffix_low, suffix_high; -  if (low < high) { -    // We found a group of prefixes with the same starting position that give -    // different results when compared to the found suffix. -    // Find all matching suffixes for the previously found set of prefixes. -    suffix_low = suffix_mid; -    suffix_high = suffix_mid + suffix_subpatterns; -    for (auto i = prefix_low; i != prefix_high; i += prefix_subpatterns) { -      Matching left(i, prefix_subpatterns, data_array->GetSentenceId(*i)); -      while (suffix_low != suffix_start) { -        Matching right(suffix_low - suffix_subpatterns, suffix_subpatterns, -            data_array->GetSentenceId(*(suffix_low - suffix_subpatterns))); -        if (comparator->Compare(left, right, last_chunk_len, offset) <= 0) { -          suffix_low -= suffix_subpatterns; -        } else { -          break; -        } -      } - -      for (auto j = suffix_low; j != suffix_end; j += suffix_subpatterns) { -        Matching right(j, suffix_subpatterns, data_array->GetSentenceId(*j)); -        int comparison = comparator->Compare(left, right, last_chunk_len, -                                             offset); -        if (comparison == 0) { -          vector<int> merged = left.Merge(right, phrase.Arity() + 1); -          result.insert(result.end(), merged.begin(), merged.end()); -        } else if (comparison < 0) { -          break; -        } -        suffix_high = max(suffix_high, j + suffix_subpatterns); -      } -    } - -    swap(suffix_low, suffix_high); -  } else if (prefix_set_size > suffix_set_size) { -    // We did the binary search on the prefix set. -    suffix_low = suffix_mid; -    suffix_high = suffix_mid + suffix_subpatterns; -    if (CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, -        prefix_subpatterns, suffix_subpatterns, suffix) < 0) { -      prefix_low = prefix_high; -    } else { -      prefix_high = prefix_low; -    } -  } else { -    // We did the binary search on the suffix set. -    if (CompareMatchingsSet(prefix_low, prefix_high, suffix_mid, -        prefix_subpatterns, suffix_subpatterns, suffix) < 0) { -      suffix_low = suffix_mid; -      suffix_high = suffix_mid; -    } else { -      suffix_low = suffix_mid + suffix_subpatterns; -      suffix_high = suffix_mid + suffix_subpatterns; -    } -  } - -  Merge(locations, phrase, suffix, prefix_start, prefix_low, suffix_start, -        suffix_low, prefix_subpatterns, suffix_subpatterns); -  locations.insert(locations.end(), result.begin(), result.end()); -  Merge(locations, phrase, suffix, prefix_high, prefix_end, suffix_high, -        suffix_end, prefix_subpatterns, suffix_subpatterns); -} - -bool BinarySearchMerger::IsIntersectionVoid( -    vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, -    vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, -    int prefix_subpatterns, int suffix_subpatterns, -    const Phrase& suffix) const { -  // Is any of the sets empty? -  if (prefix_start >= prefix_end || suffix_start >= suffix_end) { -    return true; -  } - -  int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); -  bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); -  // Is the first value from the first set larger than the last value in the -  // second set? -  Matching left(prefix_start, prefix_subpatterns, -      data_array->GetSentenceId(*prefix_start)); -  Matching right(suffix_end - suffix_subpatterns, suffix_subpatterns, -      data_array->GetSentenceId(*(suffix_end - suffix_subpatterns))); -  if (comparator->Compare(left, right, last_chunk_len, offset) > 0) { -    return true; -  } - -  // Is the last value from the first set smaller than the first value in the -  // second set? -  left = Matching(prefix_end - prefix_subpatterns, prefix_subpatterns, -      data_array->GetSentenceId(*(prefix_end - prefix_subpatterns))); -  right = Matching(suffix_start, suffix_subpatterns, -      data_array->GetSentenceId(*suffix_start)); -  if (comparator->Compare(left, right, last_chunk_len, offset) < 0) { -    return true; -  } - -  return false; -} - -bool BinarySearchMerger::ShouldUseLinearMerge( -    int prefix_set_size, int suffix_set_size) const { -  if (force_binary_search_merge) { -    return false; -  } - -  int min_size = min(prefix_set_size, suffix_set_size); -  int max_size = max(prefix_set_size, suffix_set_size); - -  return BAEZA_YATES_FACTOR * min_size * log2(max_size) > max_size; -} - -vector<int>::iterator BinarySearchMerger::GetMiddle( -    vector<int>::iterator low, vector<int>::iterator high, -    int num_subpatterns) const { -  return low + (((high - low) / num_subpatterns) / 2) * num_subpatterns; -} - -void BinarySearchMerger::GetComparableMatchings( -    const vector<int>::iterator& prefix_start, -    const vector<int>::iterator& prefix_end, -    const vector<int>::iterator& prefix_mid, -    int num_subpatterns, -    vector<int>::iterator& prefix_low, -    vector<int>::iterator& prefix_high) const { -  prefix_low = prefix_mid; -  while (prefix_low != prefix_start -      && *prefix_mid == *(prefix_low - num_subpatterns)) { -    prefix_low -= num_subpatterns; -  } -  prefix_high = prefix_mid + num_subpatterns; -  while (prefix_high != prefix_end -      && *prefix_mid == *prefix_high) { -    prefix_high += num_subpatterns; -  } -} - -int BinarySearchMerger::CompareMatchingsSet( -    const vector<int>::iterator& prefix_start, -    const vector<int>::iterator& prefix_end, -    const vector<int>::iterator& suffix_mid, -    int prefix_subpatterns, -    int suffix_subpatterns, -    const Phrase& suffix) const { -  int result = 0; -  int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); -  bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); - -  Matching right(suffix_mid, suffix_subpatterns, -      data_array->GetSentenceId(*suffix_mid)); -  for (auto i = prefix_start; i != prefix_end; i += prefix_subpatterns) { -    Matching left(i, prefix_subpatterns, data_array->GetSentenceId(*i)); -    int comparison = comparator->Compare(left, right, last_chunk_len, offset); -    if (i == prefix_start) { -      result = comparison; -    } else if (comparison != result) { -      return 0; -    } -  } -  return result; -} diff --git a/extractor/binary_search_merger.h b/extractor/binary_search_merger.h deleted file mode 100644 index c887e012..00000000 --- a/extractor/binary_search_merger.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef _BINARY_SEARCH_MERGER_H_ -#define _BINARY_SEARCH_MERGER_H_ - -#include <memory> -#include <vector> - -using namespace std; - -class DataArray; -class LinearMerger; -class MatchingComparator; -class Phrase; -class Vocabulary; - -class BinarySearchMerger { - public: -  BinarySearchMerger(shared_ptr<Vocabulary> vocabulary, -                     shared_ptr<LinearMerger> linear_merger, -                     shared_ptr<DataArray> data_array, -                     shared_ptr<MatchingComparator> comparator, -                     bool force_binary_search_merge = false); - -  virtual ~BinarySearchMerger(); - -  virtual void Merge( -      vector<int>& locations, const Phrase& phrase, const Phrase& suffix, -      const vector<int>::iterator& prefix_start, -      const vector<int>::iterator& prefix_end, -      const vector<int>::iterator& suffix_start, -      const vector<int>::iterator& suffix_end, -      int prefix_subpatterns, int suffix_subpatterns) const; - -  static double BAEZA_YATES_FACTOR; - - protected: -  BinarySearchMerger(); - - private: -  bool IsIntersectionVoid( -      vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, -      vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, -      int prefix_subpatterns, int suffix_subpatterns, -      const Phrase& suffix) const; - -  bool ShouldUseLinearMerge(int prefix_set_size, int suffix_set_size) const; - -  vector<int>::iterator GetMiddle(vector<int>::iterator low, -                                  vector<int>::iterator high, -                                  int num_subpatterns) const; - -  void GetComparableMatchings( -      const vector<int>::iterator& prefix_start, -      const vector<int>::iterator& prefix_end, -      const vector<int>::iterator& prefix_mid, -      int num_subpatterns, -      vector<int>::iterator& prefix_low, -      vector<int>::iterator& prefix_high) const; - -  int CompareMatchingsSet( -      const vector<int>::iterator& prefix_low, -      const vector<int>::iterator& prefix_high, -      const vector<int>::iterator& suffix_mid, -      int prefix_subpatterns, -      int suffix_subpatterns, -      const Phrase& suffix) const; - -  shared_ptr<Vocabulary> vocabulary; -  shared_ptr<LinearMerger> linear_merger; -  shared_ptr<DataArray> data_array; -  shared_ptr<MatchingComparator> comparator; -  // Should be true only for testing. -  bool force_binary_search_merge; -}; - -#endif diff --git a/extractor/binary_search_merger_test.cc b/extractor/binary_search_merger_test.cc deleted file mode 100644 index b1baa62f..00000000 --- a/extractor/binary_search_merger_test.cc +++ /dev/null @@ -1,157 +0,0 @@ -#include <gtest/gtest.h> - -#include <memory> - -#include "binary_search_merger.h" -#include "matching_comparator.h" -#include "mocks/mock_data_array.h" -#include "mocks/mock_vocabulary.h" -#include "mocks/mock_linear_merger.h" -#include "phrase.h" -#include "phrase_location.h" -#include "phrase_builder.h" - -using namespace std; -using namespace ::testing; - -namespace { - -class BinarySearchMergerTest : public Test { - protected: -  virtual void SetUp() { -    shared_ptr<MockVocabulary> vocabulary = make_shared<MockVocabulary>(); -    EXPECT_CALL(*vocabulary, GetTerminalValue(_)) -        .WillRepeatedly(Return("word")); - -    shared_ptr<MockDataArray> data_array = make_shared<MockDataArray>(); -    EXPECT_CALL(*data_array, GetSentenceId(_)) -        .WillRepeatedly(Return(1)); - -    shared_ptr<MatchingComparator> comparator = -        make_shared<MatchingComparator>(1, 20); - -    phrase_builder = make_shared<PhraseBuilder>(vocabulary); - -    // We are going to force the binary_search_merger to do all the work, so we -    // need to check that the linear_merger never gets called. -    shared_ptr<MockLinearMerger> linear_merger = -        make_shared<MockLinearMerger>(); -    EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - -    binary_search_merger = make_shared<BinarySearchMerger>( -        vocabulary, linear_merger, data_array, comparator, true); -  } - -  shared_ptr<BinarySearchMerger> binary_search_merger; -  shared_ptr<PhraseBuilder> phrase_builder; -}; - -TEST_F(BinarySearchMergerTest, aXbTest) { -  vector<int> locations; -  // Encoding for him X it (see Adam's dissertation). -  vector<int> symbols{1, -1, 2}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs{2, 6, 10, 15}; -  vector<int> suffix_locs{0, 4, 8, 13}; - -  binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - -  vector<int> expected_locations{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; -  EXPECT_EQ(expected_locations, locations); -} - -TEST_F(BinarySearchMergerTest, aXbXcTest) { -  vector<int> locations; -  // Encoding for it X him X it (see Adam's dissertation). -  vector<int> symbols{1, -1, 2, -2, 1}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2, -2, 1}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs{0, 2, 0, 6, 0, 10, 4, 6, 4, 10, 4, 15, 8, 10, 8, 15, -                          13, 15}; -  vector<int> suffix_locs{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; - -  binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 2); - -  vector<int> expected_locs{0, 2, 4, 0, 2, 8, 0, 2, 13, 0, 6, 8, 0, 6, 13, 0, -                            10, 13, 4, 6, 8, 4, 6, 13, 4, 10, 13, 8, 10, 13}; -  EXPECT_EQ(expected_locs, locations); -} - -TEST_F(BinarySearchMergerTest, abXcXdTest) { -  // Sentence: Anna has many many nuts and sour apples and juicy apples. -  // Phrase: Anna has X and X apples. -  vector<int> locations; -  vector<int> symbols{1, 2, -1, 3, -2, 4}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{2, -1, 3, -2, 4}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs{1, 6, 1, 9}; -  vector<int> suffix_locs{2, 6, 8, 2, 6, 11, 2, 9, 11}; - -  binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 3); - -  vector<int> expected_locs{1, 6, 8, 1, 6, 11, 1, 9, 11}; -  EXPECT_EQ(expected_locs, locations); -} - -TEST_F(BinarySearchMergerTest, LargeTest) { -  vector<int> locations; -  vector<int> symbols{1, -1, 2}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs; -  for (int i = 0; i < 100; ++i) { -    prefix_locs.push_back(i * 20 + 1); -  } -  vector<int> suffix_locs; -  for (int i = 0; i < 100; ++i) { -    suffix_locs.push_back(i * 20 + 5); -    suffix_locs.push_back(i * 20 + 13); -  } - -  binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - -  EXPECT_EQ(400, locations.size()); -  for (int i = 0; i < 100; ++i) { -    EXPECT_EQ(i * 20 + 1, locations[4 * i]); -    EXPECT_EQ(i * 20 + 5, locations[4 * i + 1]); -    EXPECT_EQ(i * 20 + 1, locations[4 * i + 2]); -    EXPECT_EQ(i * 20 + 13, locations[4 * i + 3]); -  } -} - -TEST_F(BinarySearchMergerTest, EmptyResultTest) { -  vector<int> locations; -  vector<int> symbols{1, -1, 2}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs; -  for (int i = 0; i < 100; ++i) { -    prefix_locs.push_back(i * 200 + 1); -  } -  vector<int> suffix_locs; -  for (int i = 0; i < 100; ++i) { -    suffix_locs.push_back(i * 200 + 101); -  } - -  binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - -  EXPECT_EQ(0, locations.size()); -} - -}  // namespace diff --git a/extractor/data_array.h b/extractor/data_array.h index 7c120b3c..96950789 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -15,6 +15,9 @@ enum Side {    TARGET  }; +// TODO: This class has features for both the source and target data arrays. +// Maybe we can save some memory by having more specific implementations (e.g. +// sentence_id is only needed for the source data array).  class DataArray {   public:    static int NULL_WORD; @@ -48,7 +51,6 @@ class DataArray {    virtual int GetSentenceStart(int position) const; -  //TODO(pauldb): Add unit tests.    virtual int GetSentenceLength(int sentence_id) const;    virtual int GetSentenceId(int position) const; @@ -67,8 +69,6 @@ class DataArray {    unordered_map<string, int> word2id;    vector<string> id2word;    vector<int> data; -  // TODO(pauldb): We only need sentence_id for the source language. Maybe we -  // can save some memory here.    vector<int> sentence_id;    vector<int> sentence_start;  }; diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc index 772ba10e..ba5ce09e 100644 --- a/extractor/data_array_test.cc +++ b/extractor/data_array_test.cc @@ -26,18 +26,28 @@ class DataArrayTest : public Test {  };  TEST_F(DataArrayTest, TestGetData) { -  vector<int> expected_source_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1}; +  vector<int> expected_source_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1}; +  vector<string> expected_source_words = { +      "ana", "are", "mere", ".", "__END_OF_LINE__", +      "ana", "bea", "mult", "lapte", ".", "__END_OF_LINE__" +  };    EXPECT_EQ(expected_source_data, source_data->GetData());    EXPECT_EQ(expected_source_data.size(), source_data->GetSize());    for (size_t i = 0; i < expected_source_data.size(); ++i) {      EXPECT_EQ(expected_source_data[i], source_data->AtIndex(i)); +    EXPECT_EQ(expected_source_words[i], source_data->GetWordAtIndex(i));    } -  vector<int> expected_target_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1}; +  vector<int> expected_target_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1}; +  vector<string> expected_target_words = { +      "anna", "has", "apples", ".", "__END_OF_LINE__", +      "anna", "drinks", "a", "lot", "of", "milk", ".", "__END_OF_LINE__" +  };    EXPECT_EQ(expected_target_data, target_data->GetData());    EXPECT_EQ(expected_target_data.size(), target_data->GetSize());    for (size_t i = 0; i < expected_target_data.size(); ++i) {      EXPECT_EQ(expected_target_data[i], target_data->AtIndex(i)); +    EXPECT_EQ(expected_target_words[i], target_data->GetWordAtIndex(i));    }  } @@ -61,10 +71,26 @@ TEST_F(DataArrayTest, TestSentenceData) {    EXPECT_EQ(5, source_data->GetSentenceStart(1));    EXPECT_EQ(11, source_data->GetSentenceStart(2)); +  EXPECT_EQ(4, source_data->GetSentenceLength(0)); +  EXPECT_EQ(5, source_data->GetSentenceLength(1)); + +  vector<int> expected_source_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; +  for (size_t i = 0; i < expected_source_ids.size(); ++i) { +    EXPECT_EQ(expected_source_ids[i], source_data->GetSentenceId(i)); +  } +    EXPECT_EQ(2, target_data->GetNumSentences());    EXPECT_EQ(0, target_data->GetSentenceStart(0));    EXPECT_EQ(5, target_data->GetSentenceStart(1));    EXPECT_EQ(13, target_data->GetSentenceStart(2)); + +  EXPECT_EQ(4, target_data->GetSentenceLength(0)); +  EXPECT_EQ(7, target_data->GetSentenceLength(1)); + +  vector<int> expected_target_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; +  for (size_t i = 0; i < expected_target_ids.size(); ++i) { +    EXPECT_EQ(expected_target_ids[i], target_data->GetSentenceId(i)); +  }  }  }  // namespace diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index a03e805f..b8f6f0c7 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -16,13 +16,12 @@ GrammarExtractor::GrammarExtractor(      shared_ptr<Alignment> alignment, shared_ptr<Precomputation> precomputation,      shared_ptr<Scorer> scorer, int min_gap_size, int max_rule_span,      int max_nonterminals, int max_rule_symbols, int max_samples, -    bool use_fast_intersect, bool use_baeza_yates, bool require_tight_phrases) : +    bool require_tight_phrases) :      vocabulary(make_shared<Vocabulary>()),      rule_factory(make_shared<HieroCachingRuleFactory>(          source_suffix_array, target_data_array, alignment, vocabulary,          precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals, -        max_rule_symbols, max_samples, use_fast_intersect, use_baeza_yates, -        require_tight_phrases)) {} +        max_rule_symbols, max_samples, require_tight_phrases)) {}  GrammarExtractor::GrammarExtractor(      shared_ptr<Vocabulary> vocabulary, diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index a8f2090d..f50a8d14 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -29,8 +29,6 @@ class GrammarExtractor {        int max_nonterminals,        int max_rule_symbols,        int max_samples, -      bool use_fast_intersect, -      bool use_baeza_yates,        bool require_tight_phrases);    // For testing only. diff --git a/extractor/intersector.cc b/extractor/intersector.cc deleted file mode 100644 index 39a7648d..00000000 --- a/extractor/intersector.cc +++ /dev/null @@ -1,154 +0,0 @@ -#include "intersector.h" - -#include "data_array.h" -#include "matching_comparator.h" -#include "phrase.h" -#include "phrase_location.h" -#include "precomputation.h" -#include "suffix_array.h" -#include "veb.h" -#include "vocabulary.h" - -Intersector::Intersector(shared_ptr<Vocabulary> vocabulary, -                         shared_ptr<Precomputation> precomputation, -                         shared_ptr<SuffixArray> suffix_array, -                         shared_ptr<MatchingComparator> comparator, -                         bool use_baeza_yates) : -    vocabulary(vocabulary), -    suffix_array(suffix_array), -    use_baeza_yates(use_baeza_yates) { -  shared_ptr<DataArray> data_array = suffix_array->GetData(); -  linear_merger = make_shared<LinearMerger>(vocabulary, data_array, comparator); -  binary_search_merger = make_shared<BinarySearchMerger>( -      vocabulary, linear_merger, data_array, comparator); -  ConvertIndexes(precomputation, data_array); -} - -Intersector::Intersector(shared_ptr<Vocabulary> vocabulary, -                         shared_ptr<Precomputation> precomputation, -                         shared_ptr<SuffixArray> suffix_array, -                         shared_ptr<LinearMerger> linear_merger, -                         shared_ptr<BinarySearchMerger> binary_search_merger, -                         bool use_baeza_yates) : -    vocabulary(vocabulary), -    suffix_array(suffix_array), -    linear_merger(linear_merger), -    binary_search_merger(binary_search_merger), -    use_baeza_yates(use_baeza_yates) { -  ConvertIndexes(precomputation, suffix_array->GetData()); -} - -Intersector::Intersector() {} - -Intersector::~Intersector() {} - -void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation, -                                 shared_ptr<DataArray> data_array) { -  const Index& precomputed_index = precomputation->GetInvertedIndex(); -  for (pair<vector<int>, vector<int> > entry: precomputed_index) { -    vector<int> phrase = ConvertPhrase(entry.first, data_array); -    inverted_index[phrase] = entry.second; - -    phrase.push_back(vocabulary->GetNonterminalIndex(1)); -    inverted_index[phrase] = entry.second; -    phrase.pop_back(); -    phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1)); -    inverted_index[phrase] = entry.second; -  } - -  const Index& precomputed_collocations = precomputation->GetCollocations(); -  for (pair<vector<int>, vector<int> > entry: precomputed_collocations) { -    vector<int> phrase = ConvertPhrase(entry.first, data_array); -    collocations[phrase] = entry.second; -  } -} - -vector<int> Intersector::ConvertPhrase(const vector<int>& old_phrase, -                                       shared_ptr<DataArray> data_array) { -  vector<int> new_phrase; -  new_phrase.reserve(old_phrase.size()); - -  int arity = 0; -  for (int word_id: old_phrase) { -    if (word_id == Precomputation::NON_TERMINAL) { -      ++arity; -      new_phrase.push_back(vocabulary->GetNonterminalIndex(arity)); -    } else { -      new_phrase.push_back( -          vocabulary->GetTerminalIndex(data_array->GetWord(word_id))); -    } -  } - -  return new_phrase; -} - -PhraseLocation Intersector::Intersect( -    const Phrase& prefix, PhraseLocation& prefix_location, -    const Phrase& suffix, PhraseLocation& suffix_location, -    const Phrase& phrase) { -  vector<int> symbols = phrase.Get(); - -  // We should never attempt to do an intersect query for a pattern starting or -  // ending with a non terminal. The RuleFactory should handle these cases, -  // initializing the matchings list with the one for the pattern without the -  // starting or ending terminal. -  assert(vocabulary->IsTerminal(symbols.front()) -      && vocabulary->IsTerminal(symbols.back())); - -  if (collocations.count(symbols)) { -    return PhraseLocation(collocations[symbols], phrase.Arity() + 1); -  } - -  vector<int> locations; -  ExtendPhraseLocation(prefix, prefix_location); -  ExtendPhraseLocation(suffix, suffix_location); -  shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings; -  shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings; -  int prefix_subpatterns = prefix_location.num_subpatterns; -  int suffix_subpatterns = suffix_location.num_subpatterns; -  if (use_baeza_yates) { -    binary_search_merger->Merge(locations, phrase, suffix, -        prefix_matchings->begin(), prefix_matchings->end(), -        suffix_matchings->begin(), suffix_matchings->end(), -        prefix_subpatterns, suffix_subpatterns); -  } else { -    linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(), -        prefix_matchings->end(), suffix_matchings->begin(), -        suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns); -  } -  return PhraseLocation(locations, phrase.Arity() + 1); -} - -void Intersector::ExtendPhraseLocation( -    const Phrase& phrase, PhraseLocation& phrase_location) { -  int low = phrase_location.sa_low, high = phrase_location.sa_high; -  if (phrase_location.matchings != NULL) { -    return; -  } - - -  phrase_location.num_subpatterns = 1; -  phrase_location.sa_low = phrase_location.sa_high = 0; - -  vector<int> symbols = phrase.Get(); -  if (inverted_index.count(symbols)) { -    phrase_location.matchings = -        make_shared<vector<int> >(inverted_index[symbols]); -    return; -  } - -  vector<int> matchings; -  matchings.reserve(high - low + 1); -  shared_ptr<VEB> veb = VEB::Create(suffix_array->GetSize()); -  for (int i = low; i < high; ++i) { -    veb->Insert(suffix_array->GetSuffix(i)); -  } - -  int value = veb->GetMinimum(); -  while (value != -1) { -    matchings.push_back(value); -    value = veb->GetSuccessor(value); -  } - -  phrase_location.matchings = make_shared<vector<int> >(matchings); -} diff --git a/extractor/intersector.h b/extractor/intersector.h deleted file mode 100644 index 8b159f17..00000000 --- a/extractor/intersector.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef _INTERSECTOR_H_ -#define _INTERSECTOR_H_ - -#include <memory> -#include <unordered_map> -#include <vector> - -#include <boost/functional/hash.hpp> - -#include "binary_search_merger.h" -#include "linear_merger.h" - -using namespace std; - -typedef boost::hash<vector<int> > VectorHash; -typedef unordered_map<vector<int>, vector<int>, VectorHash> Index; - -class DataArray; -class MatchingComparator; -class Phrase; -class PhraseLocation; -class Precomputation; -class SuffixArray; -class Vocabulary; - -class Intersector { - public: -  Intersector( -      shared_ptr<Vocabulary> vocabulary, -      shared_ptr<Precomputation> precomputation, -      shared_ptr<SuffixArray> source_suffix_array, -      shared_ptr<MatchingComparator> comparator, -      bool use_baeza_yates); - -  // For testing. -  Intersector( -      shared_ptr<Vocabulary> vocabulary, -      shared_ptr<Precomputation> precomputation, -      shared_ptr<SuffixArray> source_suffix_array, -      shared_ptr<LinearMerger> linear_merger, -      shared_ptr<BinarySearchMerger> binary_search_merger, -      bool use_baeza_yates); - -  virtual ~Intersector(); - -  virtual PhraseLocation Intersect( -      const Phrase& prefix, PhraseLocation& prefix_location, -      const Phrase& suffix, PhraseLocation& suffix_location, -      const Phrase& phrase); - - protected: -  Intersector(); - - private: -  void ConvertIndexes(shared_ptr<Precomputation> precomputation, -                      shared_ptr<DataArray> data_array); - -  vector<int> ConvertPhrase(const vector<int>& old_phrase, -                            shared_ptr<DataArray> data_array); - -  void ExtendPhraseLocation(const Phrase& phrase, -      PhraseLocation& phrase_location); - -  shared_ptr<Vocabulary> vocabulary; -  shared_ptr<SuffixArray> suffix_array; -  shared_ptr<LinearMerger> linear_merger; -  shared_ptr<BinarySearchMerger> binary_search_merger; -  Index inverted_index; -  Index collocations; -  bool use_baeza_yates; - -  // TODO(pauldb): Don't forget to remove these. - public: -  double sort_time; -  double linear_merge_time; -  double binary_merge_time; -}; - -#endif diff --git a/extractor/intersector_test.cc b/extractor/intersector_test.cc deleted file mode 100644 index ec318362..00000000 --- a/extractor/intersector_test.cc +++ /dev/null @@ -1,193 +0,0 @@ -#include <gtest/gtest.h> - -#include <memory> -#include <vector> - -#include "intersector.h" -#include "mocks/mock_binary_search_merger.h" -#include "mocks/mock_data_array.h" -#include "mocks/mock_linear_merger.h" -#include "mocks/mock_precomputation.h" -#include "mocks/mock_suffix_array.h" -#include "mocks/mock_vocabulary.h" - -using namespace std; -using namespace ::testing; - -namespace { - -class IntersectorTest : public Test { - protected: -  virtual void SetUp() { -    data = {2, 3, 4, 3, 4, 3}; -    vector<string> words = {"a", "b", "c", "b", "c", "b"}; -    data_array = make_shared<MockDataArray>(); -    EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data)); - -    vocabulary = make_shared<MockVocabulary>(); -    for (size_t i = 0; i < data.size(); ++i) { -      EXPECT_CALL(*data_array, GetWord(data[i])) -          .WillRepeatedly(Return(words[i])); -      EXPECT_CALL(*vocabulary, GetTerminalIndex(words[i])) -          .WillRepeatedly(Return(data[i])); -      EXPECT_CALL(*vocabulary, GetTerminalValue(data[i])) -          .WillRepeatedly(Return(words[i])); -    } - -    vector<int> suffixes = {6, 0, 5, 3, 1, 4, 2}; -    suffix_array = make_shared<MockSuffixArray>(); -    EXPECT_CALL(*suffix_array, GetData()) -        .WillRepeatedly(Return(data_array)); -    EXPECT_CALL(*suffix_array, GetSize()) -        .WillRepeatedly(Return(suffixes.size())); -    for (size_t i = 0; i < suffixes.size(); ++i) { -      EXPECT_CALL(*suffix_array, GetSuffix(i)) -          .WillRepeatedly(Return(suffixes[i])); -    } - -    vector<int> key = {2, -1, 4}; -    vector<int> values = {0, 2}; -    collocations[key] = values; -    precomputation = make_shared<MockPrecomputation>(); -    EXPECT_CALL(*precomputation, GetInvertedIndex()) -       .WillRepeatedly(ReturnRef(inverted_index)); -    EXPECT_CALL(*precomputation, GetCollocations()) -       .WillRepeatedly(ReturnRef(collocations)); - -    linear_merger = make_shared<MockLinearMerger>(); -    binary_search_merger = make_shared<MockBinarySearchMerger>(); - -    phrase_builder = make_shared<PhraseBuilder>(vocabulary); -  } - -  Index inverted_index; -  Index collocations; -  vector<int> data; -  shared_ptr<MockVocabulary> vocabulary; -  shared_ptr<MockDataArray> data_array; -  shared_ptr<MockSuffixArray> suffix_array; -  shared_ptr<MockPrecomputation> precomputation; -  shared_ptr<MockLinearMerger> linear_merger; -  shared_ptr<MockBinarySearchMerger> binary_search_merger; -  shared_ptr<PhraseBuilder> phrase_builder; -  shared_ptr<Intersector> intersector; -}; - -TEST_F(IntersectorTest, TestCachedCollocation) { -  intersector = make_shared<Intersector>(vocabulary, precomputation, -      suffix_array, linear_merger, binary_search_merger, false); - -  vector<int> prefix_symbols = {2, -1}; -  Phrase prefix = phrase_builder->Build(prefix_symbols); -  vector<int> suffix_symbols = {-1, 4}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); -  vector<int> symbols = {2, -1, 4}; -  Phrase phrase = phrase_builder->Build(symbols); -  PhraseLocation prefix_locs(0, 1), suffix_locs(2, 3); - -  PhraseLocation result = intersector->Intersect( -      prefix, prefix_locs, suffix, suffix_locs, phrase); - -  vector<int> expected_locs = {0, 2}; -  PhraseLocation expected_result(expected_locs, 2); - -  EXPECT_EQ(expected_result, result); -  EXPECT_EQ(PhraseLocation(0, 1), prefix_locs); -  EXPECT_EQ(PhraseLocation(2, 3), suffix_locs); -} - -TEST_F(IntersectorTest, TestLinearMergeaXb) { -  vector<int> prefix_symbols = {3, -1}; -  Phrase prefix = phrase_builder->Build(prefix_symbols); -  vector<int> suffix_symbols = {-1, 4}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); -  vector<int> symbols = {3, -1, 4}; -  Phrase phrase = phrase_builder->Build(symbols); -  PhraseLocation prefix_locs(2, 5), suffix_locs(5, 7); - -  vector<int> ex_prefix_locs = {1, 3, 5}; -  PhraseLocation extended_prefix_locs(ex_prefix_locs, 1); -  vector<int> ex_suffix_locs = {2, 4}; -  PhraseLocation extended_suffix_locs(ex_suffix_locs, 1); - -  vector<int> expected_locs = {1, 4}; -  EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)) -      .Times(1) -      .WillOnce(SetArgReferee<0>(expected_locs)); -  EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - -  intersector = make_shared<Intersector>(vocabulary, precomputation, -      suffix_array, linear_merger, binary_search_merger, false); - -  PhraseLocation result = intersector->Intersect( -      prefix, prefix_locs, suffix, suffix_locs, phrase); -  PhraseLocation expected_result(expected_locs, 2); - -  EXPECT_EQ(expected_result, result); -  EXPECT_EQ(extended_prefix_locs, prefix_locs); -  EXPECT_EQ(extended_suffix_locs, suffix_locs); -} - -TEST_F(IntersectorTest, TestBinarySearchMergeaXb) { -  vector<int> prefix_symbols = {3, -1}; -  Phrase prefix = phrase_builder->Build(prefix_symbols); -  vector<int> suffix_symbols = {-1, 4}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); -  vector<int> symbols = {3, -1, 4}; -  Phrase phrase = phrase_builder->Build(symbols); -  PhraseLocation prefix_locs(2, 5), suffix_locs(5, 7); - -  vector<int> ex_prefix_locs = {1, 3, 5}; -  PhraseLocation extended_prefix_locs(ex_prefix_locs, 1); -  vector<int> ex_suffix_locs = {2, 4}; -  PhraseLocation extended_suffix_locs(ex_suffix_locs, 1); - -  vector<int> expected_locs = {1, 4}; -  EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)) -      .Times(1) -      .WillOnce(SetArgReferee<0>(expected_locs)); -  EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - -  intersector = make_shared<Intersector>(vocabulary, precomputation, -      suffix_array, linear_merger, binary_search_merger, true); - -  PhraseLocation result = intersector->Intersect( -      prefix, prefix_locs, suffix, suffix_locs, phrase); -  PhraseLocation expected_result(expected_locs, 2); - -  EXPECT_EQ(expected_result, result); -  EXPECT_EQ(extended_prefix_locs, prefix_locs); -  EXPECT_EQ(extended_suffix_locs, suffix_locs); -} - -TEST_F(IntersectorTest, TestMergeaXbXc) { -  vector<int> prefix_symbols = {2, -1, 4, -1}; -  Phrase prefix = phrase_builder->Build(prefix_symbols); -  vector<int> suffix_symbols = {-1, 4, -1, 4}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); -  vector<int> symbols = {2, -1, 4, -1, 4}; -  Phrase phrase = phrase_builder->Build(symbols); - -  vector<int> ex_prefix_locs = {0, 2, 0, 4}; -  PhraseLocation extended_prefix_locs(ex_prefix_locs, 2); -  vector<int> ex_suffix_locs = {2, 4}; -  PhraseLocation extended_suffix_locs(ex_suffix_locs, 2); -  vector<int> expected_locs = {0, 2, 4}; -  EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)) -      .Times(1) -      .WillOnce(SetArgReferee<0>(expected_locs)); -  EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0); - -  intersector = make_shared<Intersector>(vocabulary, precomputation, -      suffix_array, linear_merger, binary_search_merger, false); - -  PhraseLocation result = intersector->Intersect( -      prefix, extended_prefix_locs, suffix, extended_suffix_locs, phrase); -  PhraseLocation expected_result(expected_locs, 3); - -  EXPECT_EQ(expected_result, result); -  EXPECT_EQ(ex_prefix_locs, *extended_prefix_locs.matchings); -  EXPECT_EQ(ex_suffix_locs, *extended_suffix_locs.matchings); -} - -}  // namespace diff --git a/extractor/linear_merger.cc b/extractor/linear_merger.cc deleted file mode 100644 index e7a32788..00000000 --- a/extractor/linear_merger.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "linear_merger.h" - -#include <cmath> - -#include "data_array.h" -#include "matching.h" -#include "matching_comparator.h" -#include "phrase.h" -#include "phrase_location.h" -#include "vocabulary.h" - -LinearMerger::LinearMerger(shared_ptr<Vocabulary> vocabulary, -                           shared_ptr<DataArray> data_array, -                           shared_ptr<MatchingComparator> comparator) : -    vocabulary(vocabulary), data_array(data_array), comparator(comparator) {} - -LinearMerger::LinearMerger() {} - -LinearMerger::~LinearMerger() {} - -void LinearMerger::Merge( -    vector<int>& locations, const Phrase& phrase, const Phrase& suffix, -    vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, -    vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, -    int prefix_subpatterns, int suffix_subpatterns) { -  int last_chunk_len = suffix.GetChunkLen(suffix.Arity()); -  bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0)); - -  while (prefix_start != prefix_end) { -    Matching left(prefix_start, prefix_subpatterns, -        data_array->GetSentenceId(*prefix_start)); - -    while (suffix_start != suffix_end) { -      Matching right(suffix_start, suffix_subpatterns, -          data_array->GetSentenceId(*suffix_start)); -      if (comparator->Compare(left, right, last_chunk_len, offset) > 0) { -        suffix_start += suffix_subpatterns; -      } else { -        break; -      } -    } - -    int start_position = *prefix_start; -    vector<int> :: iterator i = suffix_start; -    while (prefix_start != prefix_end && *prefix_start == start_position) { -      Matching left(prefix_start, prefix_subpatterns, -          data_array->GetSentenceId(*prefix_start)); - -      while (i != suffix_end) { -        Matching right(i, suffix_subpatterns, data_array->GetSentenceId(*i)); -        int comparison = comparator->Compare(left, right, last_chunk_len, -                                             offset); -        if (comparison == 0) { -          vector<int> merged = left.Merge(right, phrase.Arity() + 1); -          locations.insert(locations.end(), merged.begin(), merged.end()); -        } else if (comparison < 0) { -          break; -        } -        i += suffix_subpatterns; -      } - -      prefix_start += prefix_subpatterns; -    } -  } -} diff --git a/extractor/linear_merger.h b/extractor/linear_merger.h deleted file mode 100644 index c3c7111e..00000000 --- a/extractor/linear_merger.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _LINEAR_MERGER_H_ -#define _LINEAR_MERGER_H_ - -#include <memory> -#include <vector> - -using namespace std; - -class MatchingComparator; -class Phrase; -class PhraseLocation; -class DataArray; -class Vocabulary; - -class LinearMerger { - public: -  LinearMerger(shared_ptr<Vocabulary> vocabulary, -               shared_ptr<DataArray> data_array, -               shared_ptr<MatchingComparator> comparator); - -  virtual ~LinearMerger(); - -  virtual void Merge( -      vector<int>& locations, const Phrase& phrase, const Phrase& suffix, -      vector<int>::iterator prefix_start, vector<int>::iterator prefix_end, -      vector<int>::iterator suffix_start, vector<int>::iterator suffix_end, -      int prefix_subpatterns, int suffix_subpatterns); - - protected: -  LinearMerger(); - - private: -  shared_ptr<Vocabulary> vocabulary; -  shared_ptr<DataArray> data_array; -  shared_ptr<MatchingComparator> comparator; -}; - -#endif diff --git a/extractor/linear_merger_test.cc b/extractor/linear_merger_test.cc deleted file mode 100644 index a6963430..00000000 --- a/extractor/linear_merger_test.cc +++ /dev/null @@ -1,149 +0,0 @@ -#include <gtest/gtest.h> - -#include <memory> - -#include "linear_merger.h" -#include "matching_comparator.h" -#include "mocks/mock_data_array.h" -#include "mocks/mock_vocabulary.h" -#include "phrase.h" -#include "phrase_location.h" -#include "phrase_builder.h" - -using namespace std; -using namespace ::testing; - -namespace { - -class LinearMergerTest : public Test { - protected: -  virtual void SetUp() { -    shared_ptr<MockVocabulary> vocabulary = make_shared<MockVocabulary>(); -    EXPECT_CALL(*vocabulary, GetTerminalValue(_)) -        .WillRepeatedly(Return("word")); - -    shared_ptr<MockDataArray> data_array = make_shared<MockDataArray>(); -    EXPECT_CALL(*data_array, GetSentenceId(_)) -        .WillRepeatedly(Return(1)); - -    shared_ptr<MatchingComparator> comparator = -        make_shared<MatchingComparator>(1, 20); - -    phrase_builder = make_shared<PhraseBuilder>(vocabulary); -    linear_merger = make_shared<LinearMerger>(vocabulary, data_array, -                                              comparator); -  } - -  shared_ptr<LinearMerger> linear_merger; -  shared_ptr<PhraseBuilder> phrase_builder; -}; - -TEST_F(LinearMergerTest, aXbTest) { -  vector<int> locations; -  // Encoding for him X it (see Adam's dissertation). -  vector<int> symbols{1, -1, 2}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs{2, 6, 10, 15}; -  vector<int> suffix_locs{0, 4, 8, 13}; - -  linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - -  vector<int> expected_locations{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; -  EXPECT_EQ(expected_locations, locations); -} - -TEST_F(LinearMergerTest, aXbXcTest) { -  vector<int> locations; -  // Encoding for it X him X it (see Adam's dissertation). -  vector<int> symbols{1, -1, 2, -2, 1}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2, -2, 1}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs{0, 2, 0, 6, 0, 10, 4, 6, 4, 10, 4, 15, 8, 10, 8, 15, -                          13, 15}; -  vector<int> suffix_locs{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13}; - -  linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 2); - -  vector<int> expected_locs{0, 2, 4, 0, 2, 8, 0, 2, 13, 0, 6, 8, 0, 6, 13, 0, -                            10, 13, 4, 6, 8, 4, 6, 13, 4, 10, 13, 8, 10, 13}; -  EXPECT_EQ(expected_locs, locations); -} - -TEST_F(LinearMergerTest, abXcXdTest) { -  // Sentence: Anna has many many nuts and sour apples and juicy apples. -  // Phrase: Anna has X and X apples. -  vector<int> locations; -  vector<int> symbols{1, 2, -1, 3, -2, 4}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{2, -1, 3, -2, 4}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs{1, 6, 1, 9}; -  vector<int> suffix_locs{2, 6, 8, 2, 6, 11, 2, 9, 11}; - -  linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 3); - -  vector<int> expected_locs{1, 6, 8, 1, 6, 11, 1, 9, 11}; -  EXPECT_EQ(expected_locs, locations); -} - -TEST_F(LinearMergerTest, LargeTest) { -  vector<int> locations; -  vector<int> symbols{1, -1, 2}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs; -  for (int i = 0; i < 100; ++i) { -    prefix_locs.push_back(i * 20 + 1); -  } -  vector<int> suffix_locs; -  for (int i = 0; i < 100; ++i) { -    suffix_locs.push_back(i * 20 + 5); -    suffix_locs.push_back(i * 20 + 13); -  } - -  linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - -  EXPECT_EQ(400, locations.size()); -  for (int i = 0; i < 100; ++i) { -    EXPECT_EQ(i * 20 + 1, locations[4 * i]); -    EXPECT_EQ(i * 20 + 5, locations[4 * i + 1]); -    EXPECT_EQ(i * 20 + 1, locations[4 * i + 2]); -    EXPECT_EQ(i * 20 + 13, locations[4 * i + 3]); -  } -} - -TEST_F(LinearMergerTest, EmptyResultTest) { -  vector<int> locations; -  vector<int> symbols{1, -1, 2}; -  Phrase phrase = phrase_builder->Build(symbols); -  vector<int> suffix_symbols{-1, 2}; -  Phrase suffix = phrase_builder->Build(suffix_symbols); - -  vector<int> prefix_locs; -  for (int i = 0; i < 100; ++i) { -    prefix_locs.push_back(i * 200 + 1); -  } -  vector<int> suffix_locs; -  for (int i = 0; i < 100; ++i) { -    suffix_locs.push_back(i * 200 + 101); -  } - -  linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(), -      prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1); - -  EXPECT_EQ(0, locations.size()); -} - -}  // namespace diff --git a/extractor/matching.cc b/extractor/matching.cc deleted file mode 100644 index 16a3ed6f..00000000 --- a/extractor/matching.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "matching.h" - -Matching::Matching(vector<int>::iterator start, int len, int sentence_id) : -    positions(start, start + len), sentence_id(sentence_id) {} - -vector<int> Matching::Merge(const Matching& other, int num_subpatterns) const { -  vector<int> result = positions; -  if (num_subpatterns > positions.size()) { -    result.push_back(other.positions.back()); -  } -  return result; -} diff --git a/extractor/matching.h b/extractor/matching.h deleted file mode 100644 index 4c46559e..00000000 --- a/extractor/matching.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _MATCHING_H_ -#define _MATCHING_H_ - -#include <memory> -#include <vector> - -using namespace std; - -struct Matching { -  Matching(vector<int>::iterator start, int len, int sentence_id); - -  vector<int> Merge(const Matching& other, int num_subpatterns) const; - -  vector<int> positions; -  int sentence_id; -}; - -#endif diff --git a/extractor/matching_comparator.cc b/extractor/matching_comparator.cc deleted file mode 100644 index 03db95c0..00000000 --- a/extractor/matching_comparator.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include "matching_comparator.h" - -#include "matching.h" -#include "vocabulary.h" - -MatchingComparator::MatchingComparator(int min_gap_size, int max_rule_span) : -    min_gap_size(min_gap_size), max_rule_span(max_rule_span) {} - -int MatchingComparator::Compare(const Matching& left, -                                const Matching& right, -                                int last_chunk_len, -                                bool offset) const { -  if (left.sentence_id != right.sentence_id) { -    return left.sentence_id < right.sentence_id ? -1 : 1; -  } - -  if (left.positions.size() == 1 && right.positions.size() == 1) { -    // The prefix and the suffix must be separated by a non-terminal, otherwise -    // we would be doing a suffix array lookup. -    if (right.positions[0] - left.positions[0] <= min_gap_size) { -      return 1; -    } -  } else if (offset) { -    for (size_t i = 1; i < left.positions.size(); ++i) { -      if (left.positions[i] != right.positions[i - 1]) { -        return left.positions[i] < right.positions[i - 1] ? -1 : 1; -      } -    } -  } else { -    if (left.positions[0] + 1 != right.positions[0]) { -      return left.positions[0] + 1 < right.positions[0] ? -1 : 1; -    } -    for (size_t i = 1; i < left.positions.size(); ++i) { -      if (left.positions[i] != right.positions[i]) { -        return left.positions[i] < right.positions[i] ? -1 : 1; -      } -    } -  } - -  if (right.positions.back() + last_chunk_len - left.positions.front() > -      max_rule_span) { -    return -1; -  } - -  return 0; -} diff --git a/extractor/matching_comparator.h b/extractor/matching_comparator.h deleted file mode 100644 index 6e1bb487..00000000 --- a/extractor/matching_comparator.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _MATCHING_COMPARATOR_H_ -#define _MATCHING_COMPARATOR_H_ - -#include <memory> - -using namespace std; - -class Vocabulary; -class Matching; - -class MatchingComparator { - public: -  MatchingComparator(int min_gap_size, int max_rule_span); - -  int Compare(const Matching& left, const Matching& right, -              int last_chunk_len, bool offset) const; - - private: -  int min_gap_size; -  int max_rule_span; -}; - -#endif diff --git a/extractor/matching_comparator_test.cc b/extractor/matching_comparator_test.cc deleted file mode 100644 index b8f898cf..00000000 --- a/extractor/matching_comparator_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -#include <gtest/gtest.h> - -#include "matching.h" -#include "matching_comparator.h" - -using namespace ::testing; - -namespace { - -class MatchingComparatorTest : public Test { - protected: -  virtual void SetUp() { -    comparator = make_shared<MatchingComparator>(1, 20); -  } - -  shared_ptr<MatchingComparator> comparator; -}; - -TEST_F(MatchingComparatorTest, SmallerSentenceId) { -  vector<int> left_locations{1}; -  Matching left(left_locations.begin(), 1, 1); -  vector<int> right_locations{100}; -  Matching right(right_locations.begin(), 1, 5); -  EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, GreaterSentenceId) { -  vector<int> left_locations{100}; -  Matching left(left_locations.begin(), 1, 5); -  vector<int> right_locations{1}; -  Matching right(right_locations.begin(), 1, 1); -  EXPECT_EQ(1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, SmalleraXb) { -  vector<int> left_locations{1}; -  Matching left(left_locations.begin(), 1, 1); -  vector<int> right_locations{21}; -  Matching right(right_locations.begin(), 1, 1); -  // The matching exceeds the max rule span. -  EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, EqualaXb) { -  vector<int> left_locations{1}; -  Matching left(left_locations.begin(), 1, 1); -  vector<int> lower_right_locations{3}; -  Matching right(lower_right_locations.begin(), 1, 1); -  EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); - -  vector<int> higher_right_locations{20}; -  right = Matching(higher_right_locations.begin(), 1, 1); -  EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, GreateraXb) { -  vector<int> left_locations{1}; -  Matching left(left_locations.begin(), 1, 1); -  vector<int> right_locations{2}; -  Matching right(right_locations.begin(), 1, 1); -  // The gap between the prefix and the suffix is of size 0, less than the -  // min gap size. -  EXPECT_EQ(1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, SmalleraXbXc) { -  vector<int> left_locations{1, 3}; -  Matching left(left_locations.begin(), 2, 1); -  vector<int> right_locations{4, 6}; -  // The common part doesn't match. -  Matching right(right_locations.begin(), 2, 1); -  EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); - -  // The common part matches, but the rule exceeds the max span. -  vector<int> other_right_locations{3, 21}; -  right = Matching(other_right_locations.begin(), 2, 1); -  EXPECT_EQ(-1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, EqualaXbXc) { -  vector<int> left_locations{1, 3}; -  Matching left(left_locations.begin(), 2, 1); -  vector<int> right_locations{3, 5}; -  // The leftmost possible match. -  Matching right(right_locations.begin(), 2, 1); -  EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); - -  // The rightmost possible match. -  vector<int> other_right_locations{3, 20}; -  right = Matching(other_right_locations.begin(), 2, 1); -  EXPECT_EQ(0, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, GreateraXbXc) { -  vector<int> left_locations{1, 4}; -  Matching left(left_locations.begin(), 2, 1); -  vector<int> right_locations{3, 5}; -  // The common part doesn't match. -  Matching right(right_locations.begin(), 2, 1); -  EXPECT_EQ(1, comparator->Compare(left, right, 1, true)); -} - -TEST_F(MatchingComparatorTest, SmallerabXcXd) { -  vector<int> left_locations{9, 13}; -  Matching left(left_locations.begin(), 2, 1); -  // The suffix doesn't start on the next position. -  vector<int> right_locations{11, 13, 15}; -  Matching right(right_locations.begin(), 3, 1); -  EXPECT_EQ(-1, comparator->Compare(left, right, 1, false)); - -  // The common part doesn't match. -  vector<int> other_right_locations{10, 16, 18}; -  right = Matching(other_right_locations.begin(), 3, 1); -  EXPECT_EQ(-1, comparator->Compare(left, right, 1, false)); -} - -TEST_F(MatchingComparatorTest, EqualabXcXd) { -  vector<int> left_locations{10, 13}; -  Matching left(left_locations.begin(), 2, 1); -  vector<int> right_locations{11, 13, 15}; -  Matching right(right_locations.begin(), 3, 1); -  EXPECT_EQ(0, comparator->Compare(left, right, 1, false)); -} - -TEST_F(MatchingComparatorTest, GreaterabXcXd) { -  vector<int> left_locations{9, 15}; -  Matching left(left_locations.begin(), 2, 1); -  // The suffix doesn't start on the next position. -  vector<int> right_locations{7, 15, 17}; -  Matching right(right_locations.begin(), 3, 1); -  EXPECT_EQ(1, comparator->Compare(left, right, 1, false)); - -  // The common part doesn't match. -  vector<int> other_right_locations{10, 13, 16}; -  right = Matching(other_right_locations.begin(), 3, 1); -  EXPECT_EQ(1, comparator->Compare(left, right, 1, false)); -} - -}  // namespace diff --git a/extractor/matching_test.cc b/extractor/matching_test.cc deleted file mode 100644 index 9593aa86..00000000 --- a/extractor/matching_test.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include <gtest/gtest.h> - -#include <vector> - -#include "matching.h" - -using namespace std; - -namespace { - -TEST(MatchingTest, SameSize) { -  vector<int> positions{1, 2, 3}; -  Matching left(positions.begin(), positions.size(), 0); -  Matching right(positions.begin(), positions.size(), 0); -  EXPECT_EQ(positions, left.Merge(right, positions.size())); -} - -TEST(MatchingTest, DifferentSize) { -  vector<int> positions{1, 2, 3}; -  Matching left(positions.begin(), positions.size() - 1, 0); -  Matching right(positions.begin() + 1, positions.size() - 1, 0); -  vector<int> result = left.Merge(right, positions.size()); -} - -}  // namespace diff --git a/extractor/mocks/mock_binary_search_merger.h b/extractor/mocks/mock_binary_search_merger.h deleted file mode 100644 index e23386f0..00000000 --- a/extractor/mocks/mock_binary_search_merger.h +++ /dev/null @@ -1,15 +0,0 @@ -#include <gmock/gmock.h> - -#include <vector> - -#include "../binary_search_merger.h" -#include "../phrase.h" - -using namespace std; - -class MockBinarySearchMerger: public BinarySearchMerger { - public: -  MOCK_CONST_METHOD9(Merge, void(vector<int>&, const Phrase&, const Phrase&, -      const vector<int>::iterator&, const vector<int>::iterator&, -      const vector<int>::iterator&, const vector<int>::iterator&, int, int)); -}; diff --git a/extractor/mocks/mock_intersector.h b/extractor/mocks/mock_intersector.h deleted file mode 100644 index 372fa7ea..00000000 --- a/extractor/mocks/mock_intersector.h +++ /dev/null @@ -1,11 +0,0 @@ -#include <gmock/gmock.h> - -#include "../intersector.h" -#include "../phrase.h" -#include "../phrase_location.h" - -class MockIntersector : public Intersector { - public: -  MOCK_METHOD5(Intersect, PhraseLocation(const Phrase&, PhraseLocation&, -      const Phrase&, PhraseLocation&, const Phrase&)); -}; diff --git a/extractor/mocks/mock_linear_merger.h b/extractor/mocks/mock_linear_merger.h deleted file mode 100644 index 522c1f31..00000000 --- a/extractor/mocks/mock_linear_merger.h +++ /dev/null @@ -1,15 +0,0 @@ -#include <gmock/gmock.h> - -#include <vector> - -#include "../linear_merger.h" -#include "../phrase.h" - -using namespace std; - -class MockLinearMerger: public LinearMerger { - public: -  MOCK_METHOD9(Merge, void(vector<int>&, const Phrase&, const Phrase&, -      vector<int>::iterator, vector<int>::iterator, vector<int>::iterator, -      vector<int>::iterator, int, int)); -}; diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc index fbdc7cce..4908dac0 100644 --- a/extractor/rule_factory.cc +++ b/extractor/rule_factory.cc @@ -7,9 +7,7 @@  #include "grammar.h"  #include "fast_intersector.h" -#include "intersector.h"  #include "matchings_finder.h" -#include "matching_comparator.h"  #include "phrase.h"  #include "rule.h"  #include "rule_extractor.h" @@ -50,8 +48,6 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(      int max_nonterminals,      int max_rule_symbols,      int max_samples, -    bool use_fast_intersect, -    bool use_baeza_yates,      bool require_tight_phrases) :      vocabulary(vocabulary),      scorer(scorer), @@ -59,13 +55,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(      max_rule_span(max_rule_span),      max_nonterminals(max_nonterminals),      max_chunks(max_nonterminals + 1), -    max_rule_symbols(max_rule_symbols), -    use_fast_intersect(use_fast_intersect) { +    max_rule_symbols(max_rule_symbols) {    matchings_finder = make_shared<MatchingsFinder>(source_suffix_array); -  shared_ptr<MatchingComparator> comparator = -      make_shared<MatchingComparator>(min_gap_size, max_rule_span); -  intersector = make_shared<Intersector>(vocabulary, precomputation, -      source_suffix_array, comparator, use_baeza_yates);    fast_intersector = make_shared<FastIntersector>(source_suffix_array,        precomputation, vocabulary, max_rule_span, min_gap_size);    phrase_builder = make_shared<PhraseBuilder>(vocabulary); @@ -78,7 +69,6 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(  HieroCachingRuleFactory::HieroCachingRuleFactory(      shared_ptr<MatchingsFinder> finder, -    shared_ptr<Intersector> intersector,      shared_ptr<FastIntersector> fast_intersector,      shared_ptr<PhraseBuilder> phrase_builder,      shared_ptr<RuleExtractor> rule_extractor, @@ -89,10 +79,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(      int max_rule_span,      int max_nonterminals,      int max_chunks, -    int max_rule_symbols, -    bool use_fast_intersect) : +    int max_rule_symbols) :      matchings_finder(finder), -    intersector(intersector),      fast_intersector(fast_intersector),      phrase_builder(phrase_builder),      rule_extractor(rule_extractor), @@ -103,15 +91,13 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(      max_rule_span(max_rule_span),      max_nonterminals(max_nonterminals),      max_chunks(max_chunks), -    max_rule_symbols(max_rule_symbols), -    use_fast_intersect(use_fast_intersect) {} +    max_rule_symbols(max_rule_symbols) {}  HieroCachingRuleFactory::HieroCachingRuleFactory() {}  HieroCachingRuleFactory::~HieroCachingRuleFactory() {}  Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) { -  intersector->sort_time = 0;    Clock::time_point start_time = Clock::now();    double total_extract_time = 0;    double total_intersect_time = 0; @@ -164,17 +150,8 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {          PhraseLocation phrase_location;          if (next_phrase.Arity() > 0) {            Clock::time_point intersect_start = Clock::now(); -          if (use_fast_intersect) { -            phrase_location = fast_intersector->Intersect( -                node->matchings, next_suffix_link->matchings, next_phrase); -          } else { -            phrase_location = intersector->Intersect( -                node->phrase, -                node->matchings, -                next_suffix_link->phrase, -                next_suffix_link->matchings, -                next_phrase); -          } +          phrase_location = fast_intersector->Intersect( +              node->matchings, next_suffix_link->matchings, next_phrase);            Clock::time_point intersect_stop = Clock::now();            total_intersect_time += GetDuration(intersect_start, intersect_stop);          } else { diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index a39386a8..2d1c7f5a 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -14,7 +14,6 @@ class DataArray;  class Grammar;  class MatchingsFinder;  class FastIntersector; -class Intersector;  class Precomputation;  class Rule;  class RuleExtractor; @@ -38,14 +37,11 @@ class HieroCachingRuleFactory {        int max_nonterminals,        int max_rule_symbols,        int max_samples, -      bool use_fast_intersect, -      bool use_beaza_yates,        bool require_tight_phrases);    // For testing only.    HieroCachingRuleFactory(        shared_ptr<MatchingsFinder> finder, -      shared_ptr<Intersector> intersector,        shared_ptr<FastIntersector> fast_intersector,        shared_ptr<PhraseBuilder> phrase_builder,        shared_ptr<RuleExtractor> rule_extractor, @@ -56,8 +52,7 @@ class HieroCachingRuleFactory {        int max_rule_span,        int max_nonterminals,        int max_chunks, -      int max_rule_symbols, -      bool use_fast_intersect); +      int max_rule_symbols);    virtual ~HieroCachingRuleFactory(); @@ -83,7 +78,6 @@ class HieroCachingRuleFactory {                              const shared_ptr<TrieNode>& node);    shared_ptr<MatchingsFinder> matchings_finder; -  shared_ptr<Intersector> intersector;    shared_ptr<FastIntersector> fast_intersector;    MatchingsTrie trie;    shared_ptr<PhraseBuilder> phrase_builder; @@ -96,7 +90,6 @@ class HieroCachingRuleFactory {    int max_nonterminals;    int max_chunks;    int max_rule_symbols; -  bool use_fast_intersect;  };  #endif diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc index d329382a..fc709461 100644 --- a/extractor/rule_factory_test.cc +++ b/extractor/rule_factory_test.cc @@ -6,7 +6,6 @@  #include "grammar.h"  #include "mocks/mock_fast_intersector.h" -#include "mocks/mock_intersector.h"  #include "mocks/mock_matchings_finder.h"  #include "mocks/mock_rule_extractor.h"  #include "mocks/mock_sampler.h" @@ -25,7 +24,6 @@ class RuleFactoryTest : public Test {   protected:    virtual void SetUp() {      finder = make_shared<MockMatchingsFinder>(); -    intersector = make_shared<MockIntersector>();      fast_intersector = make_shared<MockFastIntersector>();      vocabulary = make_shared<MockVocabulary>(); @@ -55,7 +53,6 @@ class RuleFactoryTest : public Test {    vector<string> feature_names;    shared_ptr<MockMatchingsFinder> finder; -  shared_ptr<MockIntersector> intersector;    shared_ptr<MockFastIntersector> fast_intersector;    shared_ptr<MockVocabulary> vocabulary;    shared_ptr<PhraseBuilder> phrase_builder; @@ -66,81 +63,39 @@ class RuleFactoryTest : public Test {  };  TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) { -  factory = make_shared<HieroCachingRuleFactory>(finder, intersector, -      fast_intersector, phrase_builder, extractor, vocabulary, sampler, -      scorer, 1, 10, 2, 3, 5, false); +  factory = make_shared<HieroCachingRuleFactory>(finder, fast_intersector, +      phrase_builder, extractor, vocabulary, sampler, scorer, 1, 10, 2, 3, 5);    EXPECT_CALL(*finder, Find(_, _, _))        .Times(6)        .WillRepeatedly(Return(PhraseLocation(0, 1))); -  EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)) +  EXPECT_CALL(*fast_intersector, Intersect(_, _, _))        .Times(1)        .WillRepeatedly(Return(PhraseLocation(0, 1))); -  EXPECT_CALL(*fast_intersector, Intersect(_, _, _)).Times(0);    vector<int> word_ids = {2, 3, 4};    Grammar grammar = factory->GetGrammar(word_ids);    EXPECT_EQ(feature_names, grammar.GetFeatureNames());    EXPECT_EQ(7, grammar.GetRules().size()); - -  // Test for fast intersector. -  factory = make_shared<HieroCachingRuleFactory>(finder, intersector, -      fast_intersector, phrase_builder, extractor, vocabulary, sampler, -      scorer, 1, 10, 2, 3, 5, true); - -  EXPECT_CALL(*finder, Find(_, _, _)) -      .Times(6) -      .WillRepeatedly(Return(PhraseLocation(0, 1))); - -  EXPECT_CALL(*fast_intersector, Intersect(_, _, _)) -      .Times(1) -      .WillRepeatedly(Return(PhraseLocation(0, 1))); -  EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)).Times(0); - -  grammar = factory->GetGrammar(word_ids); -  EXPECT_EQ(feature_names, grammar.GetFeatureNames()); -  EXPECT_EQ(7, grammar.GetRules().size());  }  TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) { -  factory = make_shared<HieroCachingRuleFactory>(finder, intersector, -      fast_intersector, phrase_builder, extractor, vocabulary, sampler, -      scorer, 1, 10, 2, 3, 5, false); +  factory = make_shared<HieroCachingRuleFactory>(finder, fast_intersector, +      phrase_builder, extractor, vocabulary, sampler, scorer, 1, 10, 2, 3, 5);    EXPECT_CALL(*finder, Find(_, _, _))        .Times(12)        .WillRepeatedly(Return(PhraseLocation(0, 1))); -  EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)) +  EXPECT_CALL(*fast_intersector, Intersect(_, _, _))        .Times(16)        .WillRepeatedly(Return(PhraseLocation(0, 1))); -  EXPECT_CALL(*fast_intersector, Intersect(_, _, _)).Times(0); -    vector<int> word_ids = {2, 3, 4, 2, 3};    Grammar grammar = factory->GetGrammar(word_ids);    EXPECT_EQ(feature_names, grammar.GetFeatureNames());    EXPECT_EQ(28, grammar.GetRules().size()); - -  // Test for fast intersector. -  factory = make_shared<HieroCachingRuleFactory>(finder, intersector, -      fast_intersector, phrase_builder, extractor, vocabulary, sampler, -      scorer, 1, 10, 2, 3, 5, true); - -  EXPECT_CALL(*finder, Find(_, _, _)) -      .Times(12) -      .WillRepeatedly(Return(PhraseLocation(0, 1))); - -  EXPECT_CALL(*fast_intersector, Intersect(_, _, _)) -      .Times(16) -      .WillRepeatedly(Return(PhraseLocation(0, 1))); - -  EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)).Times(0); - -  grammar = factory->GetGrammar(word_ids); -  EXPECT_EQ(feature_names, grammar.GetFeatureNames()); -  EXPECT_EQ(28, grammar.GetRules().size());  }  }  // namespace diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 2b01e832..36dbd7a0 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -66,13 +66,9 @@ int main(int argc, char** argv) {          "Minimum number of occurences for a pharse to be considered frequent")      ("max_samples", po::value<int>()->default_value(300),          "Maximum number of samples") -    ("fast_intersect", po::value<bool>()->default_value(false), -        "Enable fast intersect")      // TODO(pauldb): Check if this works when set to false.      ("tight_phrases", po::value<bool>()->default_value(true), -        "False if phrases may be loose (better, but slower)") -    ("baeza_yates", po::value<bool>()->default_value(true), -        "Use double binary search"); +        "False if phrases may be loose (better, but slower)");    po::variables_map vm;    po::store(po::parse_command_line(argc, argv, desc), vm); @@ -178,8 +174,6 @@ int main(int argc, char** argv) {        vm["max_nonterminals"].as<int>(),        vm["max_rule_symbols"].as<int>(),        vm["max_samples"].as<int>(), -      vm["fast_intersect"].as<bool>(), -      vm["baeza_yates"].as<bool>(),        vm["tight_phrases"].as<bool>());    int grammar_id = 0; diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc index 23c458a4..ab8a0913 100644 --- a/extractor/suffix_array.cc +++ b/extractor/suffix_array.cc @@ -136,7 +136,7 @@ void SuffixArray::TernaryQuicksort(int left, int right, int step,  vector<int> SuffixArray::BuildLCPArray() const {    Clock::time_point start_time = Clock::now(); -  cerr << "Constructing LCP array..." << endl; +  cerr << "\tConstructing LCP array..." << endl;    vector<int> lcp(suffix_array.size());    vector<int> rank(suffix_array.size()); @@ -165,7 +165,7 @@ vector<int> SuffixArray::BuildLCPArray() const {    }    Clock::time_point stop_time = Clock::now(); -  cerr << "Constructing LCP took " +  cerr << "\tConstructing LCP took "         << GetDuration(start_time, stop_time) << " seconds" << endl;    return lcp; diff --git a/extractor/veb.cc b/extractor/veb.cc deleted file mode 100644 index f38f672e..00000000 --- a/extractor/veb.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "veb.h" - -#include "veb_bitset.h" -#include "veb_tree.h" - -int VEB::MIN_BOTTOM_BITS = 5; -int VEB::MIN_BOTTOM_SIZE = 1 << VEB::MIN_BOTTOM_BITS; - -shared_ptr<VEB> VEB::Create(int size) { -  if (size > MIN_BOTTOM_SIZE) { -    return shared_ptr<VEB>(new VEBTree(size)); -  } else { -    return shared_ptr<VEB>(new VEBBitset(size)); -  } -} - -int VEB::GetMinimum() { -  return min; -} - -int VEB::GetMaximum() { -  return max; -} - -VEB::VEB(int min, int max) : min(min), max(max) {} diff --git a/extractor/veb.h b/extractor/veb.h deleted file mode 100644 index c8209cf7..00000000 --- a/extractor/veb.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _VEB_H_ -#define _VEB_H_ - -#include <memory> - -using namespace std; - -class VEB { - public: -  static shared_ptr<VEB> Create(int size); - -  virtual void Insert(int value) = 0; - -  virtual int GetSuccessor(int value) = 0; - -  int GetMinimum(); - -  int GetMaximum(); - -  static int MIN_BOTTOM_BITS; -  static int MIN_BOTTOM_SIZE; - - protected: -  VEB(int min = -1, int max = -1); - -  int min, max; -}; - -#endif diff --git a/extractor/veb_bitset.cc b/extractor/veb_bitset.cc deleted file mode 100644 index 4e364cc5..00000000 --- a/extractor/veb_bitset.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "veb_bitset.h" - -using namespace std; - -VEBBitset::VEBBitset(int size) : bitset(size) { -  min = max = -1; -} - -void VEBBitset::Insert(int value) { -  bitset[value] = 1; -  if (min == -1 || value < min) { -    min = value; -  } -  if (max == - 1 || value > max) { -    max = value; -  } -} - -int VEBBitset::GetSuccessor(int value) { -  int next_value = bitset.find_next(value); -  if (next_value == bitset.npos) { -    return -1; -  } -  return next_value; -} diff --git a/extractor/veb_bitset.h b/extractor/veb_bitset.h deleted file mode 100644 index f8a91234..00000000 --- a/extractor/veb_bitset.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _VEB_BITSET_H_ -#define _VEB_BITSET_H_ - -#include <boost/dynamic_bitset.hpp> - -#include "veb.h" - -class VEBBitset: public VEB { - public: -  VEBBitset(int size); - -  void Insert(int value); - -  int GetMinimum(); - -  int GetSuccessor(int value); - - private: -  boost::dynamic_bitset<> bitset; -}; - -#endif diff --git a/extractor/veb_test.cc b/extractor/veb_test.cc deleted file mode 100644 index c40c9f28..00000000 --- a/extractor/veb_test.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include <gtest/gtest.h> - -#include <algorithm> -#include <vector> - -#include "veb.h" - -using namespace std; - -namespace { - -class VEBTest : public ::testing::Test { - protected: -  void VEBSortTester(vector<int> values, int max_value) { -    shared_ptr<VEB> veb = VEB::Create(max_value); -    for (int value: values) { -      veb->Insert(value); -    } - -    sort(values.begin(), values.end()); -    EXPECT_EQ(values.front(), veb->GetMinimum()); -    EXPECT_EQ(values.back(), veb->GetMaximum()); -    for (size_t i = 0; i + 1 < values.size(); ++i) { -      EXPECT_EQ(values[i + 1], veb->GetSuccessor(values[i])); -    } -    EXPECT_EQ(-1, veb->GetSuccessor(values.back())); -  } -}; - -TEST_F(VEBTest, SmallRange) { -  vector<int> values{8, 13, 5, 1, 4, 15, 2, 10, 6, 7}; -  VEBSortTester(values, 16); -} - -TEST_F(VEBTest, MediumRange) { -  vector<int> values{167, 243, 88, 12, 137, 199, 212, 45, 150, 189}; -  VEBSortTester(values, 255); -} - -TEST_F(VEBTest, LargeRangeSparse) { -  vector<int> values; -  for (size_t i = 0; i < 100; ++i) { -    values.push_back(i * 1000000); -  } -  VEBSortTester(values, 100000000); -} - -TEST_F(VEBTest, LargeRangeDense) { -  vector<int> values; -  for (size_t i = 0; i < 1000000; ++i) { -    values.push_back(i); -  } -  VEBSortTester(values, 1000000); -} - -}  // namespace diff --git a/extractor/veb_tree.cc b/extractor/veb_tree.cc deleted file mode 100644 index f8945445..00000000 --- a/extractor/veb_tree.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include <cmath> - -#include "veb_tree.h" - -VEBTree::VEBTree(int size) { -  int num_bits = ceil(log2(size)); - -  lower_bits = num_bits >> 1; -  upper_size = (size >> lower_bits) + 1; - -  clusters.reserve(upper_size); -  clusters.resize(upper_size); -} - -int VEBTree::GetNextValue(int value) { -  return value & ((1 << lower_bits) - 1); -} - -int VEBTree::GetCluster(int value) { -  return value >> lower_bits; -} - -int VEBTree::Compose(int cluster, int value) { -  return (cluster << lower_bits) + value; -} - -void VEBTree::Insert(int value) { -  if (min == -1 && max == -1) { -    min = max = value; -    return; -  } - -  if (value < min) { -    swap(min, value); -  } - -  int cluster = GetCluster(value), next_value = GetNextValue(value); -  if (clusters[cluster] == NULL) { -    clusters[cluster] = VEB::Create(1 << lower_bits); -    if (summary == NULL) { -      summary = VEB::Create(upper_size); -    } -    summary->Insert(cluster); -  } -  clusters[cluster]->Insert(next_value); - -  if (value > max) { -    max = value; -  } -} - -int VEBTree::GetSuccessor(int value) { -  if (value >= max) { -    return -1; -  } -  if (value < min) { -    return min; -  } - -  int cluster = GetCluster(value), next_value = GetNextValue(value); -  if (clusters[cluster] != NULL && -      next_value < clusters[cluster]->GetMaximum()) { -    return Compose(cluster, clusters[cluster]->GetSuccessor(next_value)); -  } else { -    int next_cluster = summary->GetSuccessor(cluster); -    if (next_cluster == -1) { -      return -1; -    } -    return Compose(next_cluster, clusters[next_cluster]->GetMinimum()); -  } -} diff --git a/extractor/veb_tree.h b/extractor/veb_tree.h deleted file mode 100644 index 578d3e6a..00000000 --- a/extractor/veb_tree.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _VEB_TREE_H_ -#define _VEB_TREE_H_ - -#include <memory> -#include <vector> - -using namespace std; - -#include "veb.h" - -class VEBTree: public VEB { - public: -  VEBTree(int size); - -  void Insert(int value); - -  int GetSuccessor(int value); - - private: -  int GetNextValue(int value); -  int GetCluster(int value); -  int Compose(int cluster, int value); - -  int lower_bits, upper_size; -  shared_ptr<VEB> summary; -  vector<shared_ptr<VEB> > clusters; -}; - -#endif | 
