summaryrefslogtreecommitdiff
path: root/extractor
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-02-22 16:52:25 +0000
committerPaul Baltescu <pauldb89@gmail.com>2013-02-22 16:52:25 +0000
commit6717daa034bfcfa429275f64d84607ae1568d488 (patch)
treec04c1546eed0e9acaa21907a5f8ab5aedc1a08fc /extractor
parentbb79fdc140a5593124d3943769a644c6cc3d87f6 (diff)
parentd69c289e172562039bcbe987657280332ab6315e (diff)
Merge branch 'master' into experiment
Diffstat (limited to 'extractor')
-rw-r--r--extractor/Makefile.am36
-rw-r--r--extractor/binary_search_merger.cc251
-rw-r--r--extractor/binary_search_merger.h75
-rw-r--r--extractor/binary_search_merger_test.cc157
-rw-r--r--extractor/data_array.h6
-rw-r--r--extractor/data_array_test.cc30
-rw-r--r--extractor/grammar_extractor.cc5
-rw-r--r--extractor/grammar_extractor.h2
-rw-r--r--extractor/intersector.cc154
-rw-r--r--extractor/intersector.h79
-rw-r--r--extractor/intersector_test.cc193
-rw-r--r--extractor/linear_merger.cc65
-rw-r--r--extractor/linear_merger.h38
-rw-r--r--extractor/linear_merger_test.cc149
-rw-r--r--extractor/matching.cc12
-rw-r--r--extractor/matching.h18
-rw-r--r--extractor/matching_comparator.cc46
-rw-r--r--extractor/matching_comparator.h23
-rw-r--r--extractor/matching_comparator_test.cc139
-rw-r--r--extractor/matching_test.cc25
-rw-r--r--extractor/mocks/mock_binary_search_merger.h15
-rw-r--r--extractor/mocks/mock_intersector.h11
-rw-r--r--extractor/mocks/mock_linear_merger.h15
-rw-r--r--extractor/rule_factory.cc33
-rw-r--r--extractor/rule_factory.h9
-rw-r--r--extractor/rule_factory_test.cc57
-rw-r--r--extractor/run_extractor.cc8
-rw-r--r--extractor/suffix_array.cc4
-rw-r--r--extractor/veb.cc25
-rw-r--r--extractor/veb.h29
-rw-r--r--extractor/veb_bitset.cc25
-rw-r--r--extractor/veb_bitset.h22
-rw-r--r--extractor/veb_test.cc56
-rw-r--r--extractor/veb_tree.cc71
-rw-r--r--extractor/veb_tree.h29
35 files changed, 50 insertions, 1862 deletions
diff --git a/extractor/Makefile.am b/extractor/Makefile.am
index df796cea..721df18b 100644
--- a/extractor/Makefile.am
+++ b/extractor/Makefile.am
@@ -1,7 +1,6 @@
bin_PROGRAMS = compile run_extractor
EXTRA_PROGRAMS = alignment_test \
- binary_search_merger_test \
data_array_test \
fast_intersector_test \
feature_count_source_target_test \
@@ -12,10 +11,6 @@ EXTRA_PROGRAMS = alignment_test \
feature_sample_source_count_test \
feature_target_given_source_coherent_test \
grammar_extractor_test \
- intersector_test \
- linear_merger_test \
- matching_comparator_test \
- matching_test \
matchings_finder_test \
phrase_test \
precomputation_test \
@@ -26,12 +21,10 @@ EXTRA_PROGRAMS = alignment_test \
scorer_test \
suffix_array_test \
target_phrase_extractor_test \
- translation_table_test \
- veb_test
+ translation_table_test
if HAVE_GTEST
RUNNABLE_TESTS = alignment_test \
- binary_search_merger_test \
data_array_test \
fast_intersector_test \
feature_count_source_target_test \
@@ -42,10 +35,6 @@ if HAVE_GTEST
feature_sample_source_count_test \
feature_target_given_source_coherent_test \
grammar_extractor_test \
- intersector_test \
- linear_merger_test \
- matching_comparator_test \
- matching_test \
matchings_finder_test \
phrase_test \
precomputation_test \
@@ -56,8 +45,7 @@ if HAVE_GTEST
scorer_test \
suffix_array_test \
target_phrase_extractor_test \
- translation_table_test \
- veb_test
+ translation_table_test
endif
noinst_PROGRAMS = $(RUNNABLE_TESTS)
@@ -66,8 +54,6 @@ TESTS = $(RUNNABLE_TESTS)
alignment_test_SOURCES = alignment_test.cc
alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
-binary_search_merger_test_SOURCES = binary_search_merger_test.cc
-binary_search_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
data_array_test_SOURCES = data_array_test.cc
data_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
fast_intersector_test_SOURCES = fast_intersector_test.cc
@@ -88,14 +74,6 @@ feature_target_given_source_coherent_test_SOURCES = features/target_given_source
feature_target_given_source_coherent_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
grammar_extractor_test_SOURCES = grammar_extractor_test.cc
grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
-intersector_test_SOURCES = intersector_test.cc
-intersector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
-linear_merger_test_SOURCES = linear_merger_test.cc
-linear_merger_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
-matching_comparator_test_SOURCES = matching_comparator_test.cc
-matching_comparator_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
-matching_test_SOURCES = matching_test.cc
-matching_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
matchings_finder_test_SOURCES = matchings_finder_test.cc
matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
phrase_test_SOURCES = phrase_test.cc
@@ -118,8 +96,6 @@ target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc
target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
translation_table_test_SOURCES = translation_table_test.cc
translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a
-veb_test_SOURCES = veb_test.cc
-veb_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
noinst_LIBRARIES = libextractor.a libcompile.a
@@ -139,7 +115,6 @@ libcompile_a_SOURCES = \
libextractor_a_SOURCES = \
alignment.cc \
- binary_search_merger.cc \
data_array.cc \
fast_intersector.cc \
features/count_source_target.cc \
@@ -152,11 +127,7 @@ libextractor_a_SOURCES = \
features/target_given_source_coherent.cc \
grammar.cc \
grammar_extractor.cc \
- matching.cc \
- matching_comparator.cc \
matchings_finder.cc \
- intersector.cc \
- linear_merger.cc \
matchings_trie.cc \
phrase.cc \
phrase_builder.cc \
@@ -172,9 +143,6 @@ libextractor_a_SOURCES = \
target_phrase_extractor.cc \
time_util.cc \
translation_table.cc \
- veb.cc \
- veb_bitset.cc \
- veb_tree.cc \
vocabulary.cc
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -std=c++0x $(GTEST_CPPFLAGS) $(GMOCK_CPPFLAGS)
diff --git a/extractor/binary_search_merger.cc b/extractor/binary_search_merger.cc
deleted file mode 100644
index c1b86a77..00000000
--- a/extractor/binary_search_merger.cc
+++ /dev/null
@@ -1,251 +0,0 @@
-#include "binary_search_merger.h"
-
-#include "data_array.h"
-#include "linear_merger.h"
-#include "matching.h"
-#include "matching_comparator.h"
-#include "phrase.h"
-#include "vocabulary.h"
-
-double BinarySearchMerger::BAEZA_YATES_FACTOR = 1.0;
-
-BinarySearchMerger::BinarySearchMerger(
- shared_ptr<Vocabulary> vocabulary,
- shared_ptr<LinearMerger> linear_merger,
- shared_ptr<DataArray> data_array,
- shared_ptr<MatchingComparator> comparator,
- bool force_binary_search_merge) :
- vocabulary(vocabulary), linear_merger(linear_merger),
- data_array(data_array), comparator(comparator),
- force_binary_search_merge(force_binary_search_merge) {}
-
-BinarySearchMerger::BinarySearchMerger() {}
-
-BinarySearchMerger::~BinarySearchMerger() {}
-
-void BinarySearchMerger::Merge(
- vector<int>& locations, const Phrase& phrase, const Phrase& suffix,
- const vector<int>::iterator& prefix_start,
- const vector<int>::iterator& prefix_end,
- const vector<int>::iterator& suffix_start,
- const vector<int>::iterator& suffix_end,
- int prefix_subpatterns, int suffix_subpatterns) const {
- if (IsIntersectionVoid(prefix_start, prefix_end, suffix_start, suffix_end,
- prefix_subpatterns, suffix_subpatterns, suffix)) {
- return;
- }
-
- int prefix_set_size = prefix_end - prefix_start;
- int suffix_set_size = suffix_end - suffix_start;
- if (ShouldUseLinearMerge(prefix_set_size, suffix_set_size)) {
- linear_merger->Merge(locations, phrase, suffix, prefix_start, prefix_end,
- suffix_start, suffix_end, prefix_subpatterns, suffix_subpatterns);
- return;
- }
-
- vector<int>::iterator low, high, prefix_low, prefix_high, suffix_mid;
- if (prefix_set_size > suffix_set_size) {
- // Binary search on the prefix set.
- suffix_mid = GetMiddle(suffix_start, suffix_end, suffix_subpatterns);
- low = prefix_start, high = prefix_end;
- while (low < high) {
- vector<int>::iterator prefix_mid =
- GetMiddle(low, high, prefix_subpatterns);
-
- GetComparableMatchings(prefix_start, prefix_end, prefix_mid,
- prefix_subpatterns, prefix_low, prefix_high);
- int comparison = CompareMatchingsSet(prefix_low, prefix_high, suffix_mid,
- prefix_subpatterns, suffix_subpatterns, suffix);
- if (comparison == 0) {
- break;
- } else if (comparison < 0) {
- low = prefix_mid + prefix_subpatterns;
- } else {
- high = prefix_mid;
- }
- }
- } else {
- // Binary search on the suffix set.
- vector<int>::iterator prefix_mid =
- GetMiddle(prefix_start, prefix_end, prefix_subpatterns);
-
- GetComparableMatchings(prefix_start, prefix_end, prefix_mid,
- prefix_subpatterns, prefix_low, prefix_high);
- low = suffix_start, high = suffix_end;
- while (low < high) {
- suffix_mid = GetMiddle(low, high, suffix_subpatterns);
-
- int comparison = CompareMatchingsSet(prefix_low, prefix_high, suffix_mid,
- prefix_subpatterns, suffix_subpatterns, suffix);
- if (comparison == 0) {
- break;
- } else if (comparison > 0) {
- low = suffix_mid + suffix_subpatterns;
- } else {
- high = suffix_mid;
- }
- }
- }
-
- vector<int> result;
- int last_chunk_len = suffix.GetChunkLen(suffix.Arity());
- bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0));
- vector<int>::iterator suffix_low, suffix_high;
- if (low < high) {
- // We found a group of prefixes with the same starting position that give
- // different results when compared to the found suffix.
- // Find all matching suffixes for the previously found set of prefixes.
- suffix_low = suffix_mid;
- suffix_high = suffix_mid + suffix_subpatterns;
- for (auto i = prefix_low; i != prefix_high; i += prefix_subpatterns) {
- Matching left(i, prefix_subpatterns, data_array->GetSentenceId(*i));
- while (suffix_low != suffix_start) {
- Matching right(suffix_low - suffix_subpatterns, suffix_subpatterns,
- data_array->GetSentenceId(*(suffix_low - suffix_subpatterns)));
- if (comparator->Compare(left, right, last_chunk_len, offset) <= 0) {
- suffix_low -= suffix_subpatterns;
- } else {
- break;
- }
- }
-
- for (auto j = suffix_low; j != suffix_end; j += suffix_subpatterns) {
- Matching right(j, suffix_subpatterns, data_array->GetSentenceId(*j));
- int comparison = comparator->Compare(left, right, last_chunk_len,
- offset);
- if (comparison == 0) {
- vector<int> merged = left.Merge(right, phrase.Arity() + 1);
- result.insert(result.end(), merged.begin(), merged.end());
- } else if (comparison < 0) {
- break;
- }
- suffix_high = max(suffix_high, j + suffix_subpatterns);
- }
- }
-
- swap(suffix_low, suffix_high);
- } else if (prefix_set_size > suffix_set_size) {
- // We did the binary search on the prefix set.
- suffix_low = suffix_mid;
- suffix_high = suffix_mid + suffix_subpatterns;
- if (CompareMatchingsSet(prefix_low, prefix_high, suffix_mid,
- prefix_subpatterns, suffix_subpatterns, suffix) < 0) {
- prefix_low = prefix_high;
- } else {
- prefix_high = prefix_low;
- }
- } else {
- // We did the binary search on the suffix set.
- if (CompareMatchingsSet(prefix_low, prefix_high, suffix_mid,
- prefix_subpatterns, suffix_subpatterns, suffix) < 0) {
- suffix_low = suffix_mid;
- suffix_high = suffix_mid;
- } else {
- suffix_low = suffix_mid + suffix_subpatterns;
- suffix_high = suffix_mid + suffix_subpatterns;
- }
- }
-
- Merge(locations, phrase, suffix, prefix_start, prefix_low, suffix_start,
- suffix_low, prefix_subpatterns, suffix_subpatterns);
- locations.insert(locations.end(), result.begin(), result.end());
- Merge(locations, phrase, suffix, prefix_high, prefix_end, suffix_high,
- suffix_end, prefix_subpatterns, suffix_subpatterns);
-}
-
-bool BinarySearchMerger::IsIntersectionVoid(
- vector<int>::iterator prefix_start, vector<int>::iterator prefix_end,
- vector<int>::iterator suffix_start, vector<int>::iterator suffix_end,
- int prefix_subpatterns, int suffix_subpatterns,
- const Phrase& suffix) const {
- // Is any of the sets empty?
- if (prefix_start >= prefix_end || suffix_start >= suffix_end) {
- return true;
- }
-
- int last_chunk_len = suffix.GetChunkLen(suffix.Arity());
- bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0));
- // Is the first value from the first set larger than the last value in the
- // second set?
- Matching left(prefix_start, prefix_subpatterns,
- data_array->GetSentenceId(*prefix_start));
- Matching right(suffix_end - suffix_subpatterns, suffix_subpatterns,
- data_array->GetSentenceId(*(suffix_end - suffix_subpatterns)));
- if (comparator->Compare(left, right, last_chunk_len, offset) > 0) {
- return true;
- }
-
- // Is the last value from the first set smaller than the first value in the
- // second set?
- left = Matching(prefix_end - prefix_subpatterns, prefix_subpatterns,
- data_array->GetSentenceId(*(prefix_end - prefix_subpatterns)));
- right = Matching(suffix_start, suffix_subpatterns,
- data_array->GetSentenceId(*suffix_start));
- if (comparator->Compare(left, right, last_chunk_len, offset) < 0) {
- return true;
- }
-
- return false;
-}
-
-bool BinarySearchMerger::ShouldUseLinearMerge(
- int prefix_set_size, int suffix_set_size) const {
- if (force_binary_search_merge) {
- return false;
- }
-
- int min_size = min(prefix_set_size, suffix_set_size);
- int max_size = max(prefix_set_size, suffix_set_size);
-
- return BAEZA_YATES_FACTOR * min_size * log2(max_size) > max_size;
-}
-
-vector<int>::iterator BinarySearchMerger::GetMiddle(
- vector<int>::iterator low, vector<int>::iterator high,
- int num_subpatterns) const {
- return low + (((high - low) / num_subpatterns) / 2) * num_subpatterns;
-}
-
-void BinarySearchMerger::GetComparableMatchings(
- const vector<int>::iterator& prefix_start,
- const vector<int>::iterator& prefix_end,
- const vector<int>::iterator& prefix_mid,
- int num_subpatterns,
- vector<int>::iterator& prefix_low,
- vector<int>::iterator& prefix_high) const {
- prefix_low = prefix_mid;
- while (prefix_low != prefix_start
- && *prefix_mid == *(prefix_low - num_subpatterns)) {
- prefix_low -= num_subpatterns;
- }
- prefix_high = prefix_mid + num_subpatterns;
- while (prefix_high != prefix_end
- && *prefix_mid == *prefix_high) {
- prefix_high += num_subpatterns;
- }
-}
-
-int BinarySearchMerger::CompareMatchingsSet(
- const vector<int>::iterator& prefix_start,
- const vector<int>::iterator& prefix_end,
- const vector<int>::iterator& suffix_mid,
- int prefix_subpatterns,
- int suffix_subpatterns,
- const Phrase& suffix) const {
- int result = 0;
- int last_chunk_len = suffix.GetChunkLen(suffix.Arity());
- bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0));
-
- Matching right(suffix_mid, suffix_subpatterns,
- data_array->GetSentenceId(*suffix_mid));
- for (auto i = prefix_start; i != prefix_end; i += prefix_subpatterns) {
- Matching left(i, prefix_subpatterns, data_array->GetSentenceId(*i));
- int comparison = comparator->Compare(left, right, last_chunk_len, offset);
- if (i == prefix_start) {
- result = comparison;
- } else if (comparison != result) {
- return 0;
- }
- }
- return result;
-}
diff --git a/extractor/binary_search_merger.h b/extractor/binary_search_merger.h
deleted file mode 100644
index c887e012..00000000
--- a/extractor/binary_search_merger.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef _BINARY_SEARCH_MERGER_H_
-#define _BINARY_SEARCH_MERGER_H_
-
-#include <memory>
-#include <vector>
-
-using namespace std;
-
-class DataArray;
-class LinearMerger;
-class MatchingComparator;
-class Phrase;
-class Vocabulary;
-
-class BinarySearchMerger {
- public:
- BinarySearchMerger(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<LinearMerger> linear_merger,
- shared_ptr<DataArray> data_array,
- shared_ptr<MatchingComparator> comparator,
- bool force_binary_search_merge = false);
-
- virtual ~BinarySearchMerger();
-
- virtual void Merge(
- vector<int>& locations, const Phrase& phrase, const Phrase& suffix,
- const vector<int>::iterator& prefix_start,
- const vector<int>::iterator& prefix_end,
- const vector<int>::iterator& suffix_start,
- const vector<int>::iterator& suffix_end,
- int prefix_subpatterns, int suffix_subpatterns) const;
-
- static double BAEZA_YATES_FACTOR;
-
- protected:
- BinarySearchMerger();
-
- private:
- bool IsIntersectionVoid(
- vector<int>::iterator prefix_start, vector<int>::iterator prefix_end,
- vector<int>::iterator suffix_start, vector<int>::iterator suffix_end,
- int prefix_subpatterns, int suffix_subpatterns,
- const Phrase& suffix) const;
-
- bool ShouldUseLinearMerge(int prefix_set_size, int suffix_set_size) const;
-
- vector<int>::iterator GetMiddle(vector<int>::iterator low,
- vector<int>::iterator high,
- int num_subpatterns) const;
-
- void GetComparableMatchings(
- const vector<int>::iterator& prefix_start,
- const vector<int>::iterator& prefix_end,
- const vector<int>::iterator& prefix_mid,
- int num_subpatterns,
- vector<int>::iterator& prefix_low,
- vector<int>::iterator& prefix_high) const;
-
- int CompareMatchingsSet(
- const vector<int>::iterator& prefix_low,
- const vector<int>::iterator& prefix_high,
- const vector<int>::iterator& suffix_mid,
- int prefix_subpatterns,
- int suffix_subpatterns,
- const Phrase& suffix) const;
-
- shared_ptr<Vocabulary> vocabulary;
- shared_ptr<LinearMerger> linear_merger;
- shared_ptr<DataArray> data_array;
- shared_ptr<MatchingComparator> comparator;
- // Should be true only for testing.
- bool force_binary_search_merge;
-};
-
-#endif
diff --git a/extractor/binary_search_merger_test.cc b/extractor/binary_search_merger_test.cc
deleted file mode 100644
index b1baa62f..00000000
--- a/extractor/binary_search_merger_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <memory>
-
-#include "binary_search_merger.h"
-#include "matching_comparator.h"
-#include "mocks/mock_data_array.h"
-#include "mocks/mock_vocabulary.h"
-#include "mocks/mock_linear_merger.h"
-#include "phrase.h"
-#include "phrase_location.h"
-#include "phrase_builder.h"
-
-using namespace std;
-using namespace ::testing;
-
-namespace {
-
-class BinarySearchMergerTest : public Test {
- protected:
- virtual void SetUp() {
- shared_ptr<MockVocabulary> vocabulary = make_shared<MockVocabulary>();
- EXPECT_CALL(*vocabulary, GetTerminalValue(_))
- .WillRepeatedly(Return("word"));
-
- shared_ptr<MockDataArray> data_array = make_shared<MockDataArray>();
- EXPECT_CALL(*data_array, GetSentenceId(_))
- .WillRepeatedly(Return(1));
-
- shared_ptr<MatchingComparator> comparator =
- make_shared<MatchingComparator>(1, 20);
-
- phrase_builder = make_shared<PhraseBuilder>(vocabulary);
-
- // We are going to force the binary_search_merger to do all the work, so we
- // need to check that the linear_merger never gets called.
- shared_ptr<MockLinearMerger> linear_merger =
- make_shared<MockLinearMerger>();
- EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0);
-
- binary_search_merger = make_shared<BinarySearchMerger>(
- vocabulary, linear_merger, data_array, comparator, true);
- }
-
- shared_ptr<BinarySearchMerger> binary_search_merger;
- shared_ptr<PhraseBuilder> phrase_builder;
-};
-
-TEST_F(BinarySearchMergerTest, aXbTest) {
- vector<int> locations;
- // Encoding for him X it (see Adam's dissertation).
- vector<int> symbols{1, -1, 2};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs{2, 6, 10, 15};
- vector<int> suffix_locs{0, 4, 8, 13};
-
- binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1);
-
- vector<int> expected_locations{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13};
- EXPECT_EQ(expected_locations, locations);
-}
-
-TEST_F(BinarySearchMergerTest, aXbXcTest) {
- vector<int> locations;
- // Encoding for it X him X it (see Adam's dissertation).
- vector<int> symbols{1, -1, 2, -2, 1};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2, -2, 1};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs{0, 2, 0, 6, 0, 10, 4, 6, 4, 10, 4, 15, 8, 10, 8, 15,
- 13, 15};
- vector<int> suffix_locs{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13};
-
- binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 2);
-
- vector<int> expected_locs{0, 2, 4, 0, 2, 8, 0, 2, 13, 0, 6, 8, 0, 6, 13, 0,
- 10, 13, 4, 6, 8, 4, 6, 13, 4, 10, 13, 8, 10, 13};
- EXPECT_EQ(expected_locs, locations);
-}
-
-TEST_F(BinarySearchMergerTest, abXcXdTest) {
- // Sentence: Anna has many many nuts and sour apples and juicy apples.
- // Phrase: Anna has X and X apples.
- vector<int> locations;
- vector<int> symbols{1, 2, -1, 3, -2, 4};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{2, -1, 3, -2, 4};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs{1, 6, 1, 9};
- vector<int> suffix_locs{2, 6, 8, 2, 6, 11, 2, 9, 11};
-
- binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 3);
-
- vector<int> expected_locs{1, 6, 8, 1, 6, 11, 1, 9, 11};
- EXPECT_EQ(expected_locs, locations);
-}
-
-TEST_F(BinarySearchMergerTest, LargeTest) {
- vector<int> locations;
- vector<int> symbols{1, -1, 2};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs;
- for (int i = 0; i < 100; ++i) {
- prefix_locs.push_back(i * 20 + 1);
- }
- vector<int> suffix_locs;
- for (int i = 0; i < 100; ++i) {
- suffix_locs.push_back(i * 20 + 5);
- suffix_locs.push_back(i * 20 + 13);
- }
-
- binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1);
-
- EXPECT_EQ(400, locations.size());
- for (int i = 0; i < 100; ++i) {
- EXPECT_EQ(i * 20 + 1, locations[4 * i]);
- EXPECT_EQ(i * 20 + 5, locations[4 * i + 1]);
- EXPECT_EQ(i * 20 + 1, locations[4 * i + 2]);
- EXPECT_EQ(i * 20 + 13, locations[4 * i + 3]);
- }
-}
-
-TEST_F(BinarySearchMergerTest, EmptyResultTest) {
- vector<int> locations;
- vector<int> symbols{1, -1, 2};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs;
- for (int i = 0; i < 100; ++i) {
- prefix_locs.push_back(i * 200 + 1);
- }
- vector<int> suffix_locs;
- for (int i = 0; i < 100; ++i) {
- suffix_locs.push_back(i * 200 + 101);
- }
-
- binary_search_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1);
-
- EXPECT_EQ(0, locations.size());
-}
-
-} // namespace
diff --git a/extractor/data_array.h b/extractor/data_array.h
index 7c120b3c..96950789 100644
--- a/extractor/data_array.h
+++ b/extractor/data_array.h
@@ -15,6 +15,9 @@ enum Side {
TARGET
};
+// TODO: This class has features for both the source and target data arrays.
+// Maybe we can save some memory by having more specific implementations (e.g.
+// sentence_id is only needed for the source data array).
class DataArray {
public:
static int NULL_WORD;
@@ -48,7 +51,6 @@ class DataArray {
virtual int GetSentenceStart(int position) const;
- //TODO(pauldb): Add unit tests.
virtual int GetSentenceLength(int sentence_id) const;
virtual int GetSentenceId(int position) const;
@@ -67,8 +69,6 @@ class DataArray {
unordered_map<string, int> word2id;
vector<string> id2word;
vector<int> data;
- // TODO(pauldb): We only need sentence_id for the source language. Maybe we
- // can save some memory here.
vector<int> sentence_id;
vector<int> sentence_start;
};
diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc
index 772ba10e..ba5ce09e 100644
--- a/extractor/data_array_test.cc
+++ b/extractor/data_array_test.cc
@@ -26,18 +26,28 @@ class DataArrayTest : public Test {
};
TEST_F(DataArrayTest, TestGetData) {
- vector<int> expected_source_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1};
+ vector<int> expected_source_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1};
+ vector<string> expected_source_words = {
+ "ana", "are", "mere", ".", "__END_OF_LINE__",
+ "ana", "bea", "mult", "lapte", ".", "__END_OF_LINE__"
+ };
EXPECT_EQ(expected_source_data, source_data->GetData());
EXPECT_EQ(expected_source_data.size(), source_data->GetSize());
for (size_t i = 0; i < expected_source_data.size(); ++i) {
EXPECT_EQ(expected_source_data[i], source_data->AtIndex(i));
+ EXPECT_EQ(expected_source_words[i], source_data->GetWordAtIndex(i));
}
- vector<int> expected_target_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1};
+ vector<int> expected_target_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1};
+ vector<string> expected_target_words = {
+ "anna", "has", "apples", ".", "__END_OF_LINE__",
+ "anna", "drinks", "a", "lot", "of", "milk", ".", "__END_OF_LINE__"
+ };
EXPECT_EQ(expected_target_data, target_data->GetData());
EXPECT_EQ(expected_target_data.size(), target_data->GetSize());
for (size_t i = 0; i < expected_target_data.size(); ++i) {
EXPECT_EQ(expected_target_data[i], target_data->AtIndex(i));
+ EXPECT_EQ(expected_target_words[i], target_data->GetWordAtIndex(i));
}
}
@@ -61,10 +71,26 @@ TEST_F(DataArrayTest, TestSentenceData) {
EXPECT_EQ(5, source_data->GetSentenceStart(1));
EXPECT_EQ(11, source_data->GetSentenceStart(2));
+ EXPECT_EQ(4, source_data->GetSentenceLength(0));
+ EXPECT_EQ(5, source_data->GetSentenceLength(1));
+
+ vector<int> expected_source_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1};
+ for (size_t i = 0; i < expected_source_ids.size(); ++i) {
+ EXPECT_EQ(expected_source_ids[i], source_data->GetSentenceId(i));
+ }
+
EXPECT_EQ(2, target_data->GetNumSentences());
EXPECT_EQ(0, target_data->GetSentenceStart(0));
EXPECT_EQ(5, target_data->GetSentenceStart(1));
EXPECT_EQ(13, target_data->GetSentenceStart(2));
+
+ EXPECT_EQ(4, target_data->GetSentenceLength(0));
+ EXPECT_EQ(7, target_data->GetSentenceLength(1));
+
+ vector<int> expected_target_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1};
+ for (size_t i = 0; i < expected_target_ids.size(); ++i) {
+ EXPECT_EQ(expected_target_ids[i], target_data->GetSentenceId(i));
+ }
}
} // namespace
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc
index a03e805f..b8f6f0c7 100644
--- a/extractor/grammar_extractor.cc
+++ b/extractor/grammar_extractor.cc
@@ -16,13 +16,12 @@ GrammarExtractor::GrammarExtractor(
shared_ptr<Alignment> alignment, shared_ptr<Precomputation> precomputation,
shared_ptr<Scorer> scorer, int min_gap_size, int max_rule_span,
int max_nonterminals, int max_rule_symbols, int max_samples,
- bool use_fast_intersect, bool use_baeza_yates, bool require_tight_phrases) :
+ bool require_tight_phrases) :
vocabulary(make_shared<Vocabulary>()),
rule_factory(make_shared<HieroCachingRuleFactory>(
source_suffix_array, target_data_array, alignment, vocabulary,
precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals,
- max_rule_symbols, max_samples, use_fast_intersect, use_baeza_yates,
- require_tight_phrases)) {}
+ max_rule_symbols, max_samples, require_tight_phrases)) {}
GrammarExtractor::GrammarExtractor(
shared_ptr<Vocabulary> vocabulary,
diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h
index a8f2090d..f50a8d14 100644
--- a/extractor/grammar_extractor.h
+++ b/extractor/grammar_extractor.h
@@ -29,8 +29,6 @@ class GrammarExtractor {
int max_nonterminals,
int max_rule_symbols,
int max_samples,
- bool use_fast_intersect,
- bool use_baeza_yates,
bool require_tight_phrases);
// For testing only.
diff --git a/extractor/intersector.cc b/extractor/intersector.cc
deleted file mode 100644
index 39a7648d..00000000
--- a/extractor/intersector.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "intersector.h"
-
-#include "data_array.h"
-#include "matching_comparator.h"
-#include "phrase.h"
-#include "phrase_location.h"
-#include "precomputation.h"
-#include "suffix_array.h"
-#include "veb.h"
-#include "vocabulary.h"
-
-Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<SuffixArray> suffix_array,
- shared_ptr<MatchingComparator> comparator,
- bool use_baeza_yates) :
- vocabulary(vocabulary),
- suffix_array(suffix_array),
- use_baeza_yates(use_baeza_yates) {
- shared_ptr<DataArray> data_array = suffix_array->GetData();
- linear_merger = make_shared<LinearMerger>(vocabulary, data_array, comparator);
- binary_search_merger = make_shared<BinarySearchMerger>(
- vocabulary, linear_merger, data_array, comparator);
- ConvertIndexes(precomputation, data_array);
-}
-
-Intersector::Intersector(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<SuffixArray> suffix_array,
- shared_ptr<LinearMerger> linear_merger,
- shared_ptr<BinarySearchMerger> binary_search_merger,
- bool use_baeza_yates) :
- vocabulary(vocabulary),
- suffix_array(suffix_array),
- linear_merger(linear_merger),
- binary_search_merger(binary_search_merger),
- use_baeza_yates(use_baeza_yates) {
- ConvertIndexes(precomputation, suffix_array->GetData());
-}
-
-Intersector::Intersector() {}
-
-Intersector::~Intersector() {}
-
-void Intersector::ConvertIndexes(shared_ptr<Precomputation> precomputation,
- shared_ptr<DataArray> data_array) {
- const Index& precomputed_index = precomputation->GetInvertedIndex();
- for (pair<vector<int>, vector<int> > entry: precomputed_index) {
- vector<int> phrase = ConvertPhrase(entry.first, data_array);
- inverted_index[phrase] = entry.second;
-
- phrase.push_back(vocabulary->GetNonterminalIndex(1));
- inverted_index[phrase] = entry.second;
- phrase.pop_back();
- phrase.insert(phrase.begin(), vocabulary->GetNonterminalIndex(1));
- inverted_index[phrase] = entry.second;
- }
-
- const Index& precomputed_collocations = precomputation->GetCollocations();
- for (pair<vector<int>, vector<int> > entry: precomputed_collocations) {
- vector<int> phrase = ConvertPhrase(entry.first, data_array);
- collocations[phrase] = entry.second;
- }
-}
-
-vector<int> Intersector::ConvertPhrase(const vector<int>& old_phrase,
- shared_ptr<DataArray> data_array) {
- vector<int> new_phrase;
- new_phrase.reserve(old_phrase.size());
-
- int arity = 0;
- for (int word_id: old_phrase) {
- if (word_id == Precomputation::NON_TERMINAL) {
- ++arity;
- new_phrase.push_back(vocabulary->GetNonterminalIndex(arity));
- } else {
- new_phrase.push_back(
- vocabulary->GetTerminalIndex(data_array->GetWord(word_id)));
- }
- }
-
- return new_phrase;
-}
-
-PhraseLocation Intersector::Intersect(
- const Phrase& prefix, PhraseLocation& prefix_location,
- const Phrase& suffix, PhraseLocation& suffix_location,
- const Phrase& phrase) {
- vector<int> symbols = phrase.Get();
-
- // We should never attempt to do an intersect query for a pattern starting or
- // ending with a non terminal. The RuleFactory should handle these cases,
- // initializing the matchings list with the one for the pattern without the
- // starting or ending terminal.
- assert(vocabulary->IsTerminal(symbols.front())
- && vocabulary->IsTerminal(symbols.back()));
-
- if (collocations.count(symbols)) {
- return PhraseLocation(collocations[symbols], phrase.Arity() + 1);
- }
-
- vector<int> locations;
- ExtendPhraseLocation(prefix, prefix_location);
- ExtendPhraseLocation(suffix, suffix_location);
- shared_ptr<vector<int> > prefix_matchings = prefix_location.matchings;
- shared_ptr<vector<int> > suffix_matchings = suffix_location.matchings;
- int prefix_subpatterns = prefix_location.num_subpatterns;
- int suffix_subpatterns = suffix_location.num_subpatterns;
- if (use_baeza_yates) {
- binary_search_merger->Merge(locations, phrase, suffix,
- prefix_matchings->begin(), prefix_matchings->end(),
- suffix_matchings->begin(), suffix_matchings->end(),
- prefix_subpatterns, suffix_subpatterns);
- } else {
- linear_merger->Merge(locations, phrase, suffix, prefix_matchings->begin(),
- prefix_matchings->end(), suffix_matchings->begin(),
- suffix_matchings->end(), prefix_subpatterns, suffix_subpatterns);
- }
- return PhraseLocation(locations, phrase.Arity() + 1);
-}
-
-void Intersector::ExtendPhraseLocation(
- const Phrase& phrase, PhraseLocation& phrase_location) {
- int low = phrase_location.sa_low, high = phrase_location.sa_high;
- if (phrase_location.matchings != NULL) {
- return;
- }
-
-
- phrase_location.num_subpatterns = 1;
- phrase_location.sa_low = phrase_location.sa_high = 0;
-
- vector<int> symbols = phrase.Get();
- if (inverted_index.count(symbols)) {
- phrase_location.matchings =
- make_shared<vector<int> >(inverted_index[symbols]);
- return;
- }
-
- vector<int> matchings;
- matchings.reserve(high - low + 1);
- shared_ptr<VEB> veb = VEB::Create(suffix_array->GetSize());
- for (int i = low; i < high; ++i) {
- veb->Insert(suffix_array->GetSuffix(i));
- }
-
- int value = veb->GetMinimum();
- while (value != -1) {
- matchings.push_back(value);
- value = veb->GetSuccessor(value);
- }
-
- phrase_location.matchings = make_shared<vector<int> >(matchings);
-}
diff --git a/extractor/intersector.h b/extractor/intersector.h
deleted file mode 100644
index 8b159f17..00000000
--- a/extractor/intersector.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef _INTERSECTOR_H_
-#define _INTERSECTOR_H_
-
-#include <memory>
-#include <unordered_map>
-#include <vector>
-
-#include <boost/functional/hash.hpp>
-
-#include "binary_search_merger.h"
-#include "linear_merger.h"
-
-using namespace std;
-
-typedef boost::hash<vector<int> > VectorHash;
-typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
-
-class DataArray;
-class MatchingComparator;
-class Phrase;
-class PhraseLocation;
-class Precomputation;
-class SuffixArray;
-class Vocabulary;
-
-class Intersector {
- public:
- Intersector(
- shared_ptr<Vocabulary> vocabulary,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<SuffixArray> source_suffix_array,
- shared_ptr<MatchingComparator> comparator,
- bool use_baeza_yates);
-
- // For testing.
- Intersector(
- shared_ptr<Vocabulary> vocabulary,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<SuffixArray> source_suffix_array,
- shared_ptr<LinearMerger> linear_merger,
- shared_ptr<BinarySearchMerger> binary_search_merger,
- bool use_baeza_yates);
-
- virtual ~Intersector();
-
- virtual PhraseLocation Intersect(
- const Phrase& prefix, PhraseLocation& prefix_location,
- const Phrase& suffix, PhraseLocation& suffix_location,
- const Phrase& phrase);
-
- protected:
- Intersector();
-
- private:
- void ConvertIndexes(shared_ptr<Precomputation> precomputation,
- shared_ptr<DataArray> data_array);
-
- vector<int> ConvertPhrase(const vector<int>& old_phrase,
- shared_ptr<DataArray> data_array);
-
- void ExtendPhraseLocation(const Phrase& phrase,
- PhraseLocation& phrase_location);
-
- shared_ptr<Vocabulary> vocabulary;
- shared_ptr<SuffixArray> suffix_array;
- shared_ptr<LinearMerger> linear_merger;
- shared_ptr<BinarySearchMerger> binary_search_merger;
- Index inverted_index;
- Index collocations;
- bool use_baeza_yates;
-
- // TODO(pauldb): Don't forget to remove these.
- public:
- double sort_time;
- double linear_merge_time;
- double binary_merge_time;
-};
-
-#endif
diff --git a/extractor/intersector_test.cc b/extractor/intersector_test.cc
deleted file mode 100644
index ec318362..00000000
--- a/extractor/intersector_test.cc
+++ /dev/null
@@ -1,193 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <memory>
-#include <vector>
-
-#include "intersector.h"
-#include "mocks/mock_binary_search_merger.h"
-#include "mocks/mock_data_array.h"
-#include "mocks/mock_linear_merger.h"
-#include "mocks/mock_precomputation.h"
-#include "mocks/mock_suffix_array.h"
-#include "mocks/mock_vocabulary.h"
-
-using namespace std;
-using namespace ::testing;
-
-namespace {
-
-class IntersectorTest : public Test {
- protected:
- virtual void SetUp() {
- data = {2, 3, 4, 3, 4, 3};
- vector<string> words = {"a", "b", "c", "b", "c", "b"};
- data_array = make_shared<MockDataArray>();
- EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data));
-
- vocabulary = make_shared<MockVocabulary>();
- for (size_t i = 0; i < data.size(); ++i) {
- EXPECT_CALL(*data_array, GetWord(data[i]))
- .WillRepeatedly(Return(words[i]));
- EXPECT_CALL(*vocabulary, GetTerminalIndex(words[i]))
- .WillRepeatedly(Return(data[i]));
- EXPECT_CALL(*vocabulary, GetTerminalValue(data[i]))
- .WillRepeatedly(Return(words[i]));
- }
-
- vector<int> suffixes = {6, 0, 5, 3, 1, 4, 2};
- suffix_array = make_shared<MockSuffixArray>();
- EXPECT_CALL(*suffix_array, GetData())
- .WillRepeatedly(Return(data_array));
- EXPECT_CALL(*suffix_array, GetSize())
- .WillRepeatedly(Return(suffixes.size()));
- for (size_t i = 0; i < suffixes.size(); ++i) {
- EXPECT_CALL(*suffix_array, GetSuffix(i))
- .WillRepeatedly(Return(suffixes[i]));
- }
-
- vector<int> key = {2, -1, 4};
- vector<int> values = {0, 2};
- collocations[key] = values;
- precomputation = make_shared<MockPrecomputation>();
- EXPECT_CALL(*precomputation, GetInvertedIndex())
- .WillRepeatedly(ReturnRef(inverted_index));
- EXPECT_CALL(*precomputation, GetCollocations())
- .WillRepeatedly(ReturnRef(collocations));
-
- linear_merger = make_shared<MockLinearMerger>();
- binary_search_merger = make_shared<MockBinarySearchMerger>();
-
- phrase_builder = make_shared<PhraseBuilder>(vocabulary);
- }
-
- Index inverted_index;
- Index collocations;
- vector<int> data;
- shared_ptr<MockVocabulary> vocabulary;
- shared_ptr<MockDataArray> data_array;
- shared_ptr<MockSuffixArray> suffix_array;
- shared_ptr<MockPrecomputation> precomputation;
- shared_ptr<MockLinearMerger> linear_merger;
- shared_ptr<MockBinarySearchMerger> binary_search_merger;
- shared_ptr<PhraseBuilder> phrase_builder;
- shared_ptr<Intersector> intersector;
-};
-
-TEST_F(IntersectorTest, TestCachedCollocation) {
- intersector = make_shared<Intersector>(vocabulary, precomputation,
- suffix_array, linear_merger, binary_search_merger, false);
-
- vector<int> prefix_symbols = {2, -1};
- Phrase prefix = phrase_builder->Build(prefix_symbols);
- vector<int> suffix_symbols = {-1, 4};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
- vector<int> symbols = {2, -1, 4};
- Phrase phrase = phrase_builder->Build(symbols);
- PhraseLocation prefix_locs(0, 1), suffix_locs(2, 3);
-
- PhraseLocation result = intersector->Intersect(
- prefix, prefix_locs, suffix, suffix_locs, phrase);
-
- vector<int> expected_locs = {0, 2};
- PhraseLocation expected_result(expected_locs, 2);
-
- EXPECT_EQ(expected_result, result);
- EXPECT_EQ(PhraseLocation(0, 1), prefix_locs);
- EXPECT_EQ(PhraseLocation(2, 3), suffix_locs);
-}
-
-TEST_F(IntersectorTest, TestLinearMergeaXb) {
- vector<int> prefix_symbols = {3, -1};
- Phrase prefix = phrase_builder->Build(prefix_symbols);
- vector<int> suffix_symbols = {-1, 4};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
- vector<int> symbols = {3, -1, 4};
- Phrase phrase = phrase_builder->Build(symbols);
- PhraseLocation prefix_locs(2, 5), suffix_locs(5, 7);
-
- vector<int> ex_prefix_locs = {1, 3, 5};
- PhraseLocation extended_prefix_locs(ex_prefix_locs, 1);
- vector<int> ex_suffix_locs = {2, 4};
- PhraseLocation extended_suffix_locs(ex_suffix_locs, 1);
-
- vector<int> expected_locs = {1, 4};
- EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _))
- .Times(1)
- .WillOnce(SetArgReferee<0>(expected_locs));
- EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0);
-
- intersector = make_shared<Intersector>(vocabulary, precomputation,
- suffix_array, linear_merger, binary_search_merger, false);
-
- PhraseLocation result = intersector->Intersect(
- prefix, prefix_locs, suffix, suffix_locs, phrase);
- PhraseLocation expected_result(expected_locs, 2);
-
- EXPECT_EQ(expected_result, result);
- EXPECT_EQ(extended_prefix_locs, prefix_locs);
- EXPECT_EQ(extended_suffix_locs, suffix_locs);
-}
-
-TEST_F(IntersectorTest, TestBinarySearchMergeaXb) {
- vector<int> prefix_symbols = {3, -1};
- Phrase prefix = phrase_builder->Build(prefix_symbols);
- vector<int> suffix_symbols = {-1, 4};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
- vector<int> symbols = {3, -1, 4};
- Phrase phrase = phrase_builder->Build(symbols);
- PhraseLocation prefix_locs(2, 5), suffix_locs(5, 7);
-
- vector<int> ex_prefix_locs = {1, 3, 5};
- PhraseLocation extended_prefix_locs(ex_prefix_locs, 1);
- vector<int> ex_suffix_locs = {2, 4};
- PhraseLocation extended_suffix_locs(ex_suffix_locs, 1);
-
- vector<int> expected_locs = {1, 4};
- EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _))
- .Times(1)
- .WillOnce(SetArgReferee<0>(expected_locs));
- EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0);
-
- intersector = make_shared<Intersector>(vocabulary, precomputation,
- suffix_array, linear_merger, binary_search_merger, true);
-
- PhraseLocation result = intersector->Intersect(
- prefix, prefix_locs, suffix, suffix_locs, phrase);
- PhraseLocation expected_result(expected_locs, 2);
-
- EXPECT_EQ(expected_result, result);
- EXPECT_EQ(extended_prefix_locs, prefix_locs);
- EXPECT_EQ(extended_suffix_locs, suffix_locs);
-}
-
-TEST_F(IntersectorTest, TestMergeaXbXc) {
- vector<int> prefix_symbols = {2, -1, 4, -1};
- Phrase prefix = phrase_builder->Build(prefix_symbols);
- vector<int> suffix_symbols = {-1, 4, -1, 4};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
- vector<int> symbols = {2, -1, 4, -1, 4};
- Phrase phrase = phrase_builder->Build(symbols);
-
- vector<int> ex_prefix_locs = {0, 2, 0, 4};
- PhraseLocation extended_prefix_locs(ex_prefix_locs, 2);
- vector<int> ex_suffix_locs = {2, 4};
- PhraseLocation extended_suffix_locs(ex_suffix_locs, 2);
- vector<int> expected_locs = {0, 2, 4};
- EXPECT_CALL(*linear_merger, Merge(_, _, _, _, _, _, _, _, _))
- .Times(1)
- .WillOnce(SetArgReferee<0>(expected_locs));
- EXPECT_CALL(*binary_search_merger, Merge(_, _, _, _, _, _, _, _, _)).Times(0);
-
- intersector = make_shared<Intersector>(vocabulary, precomputation,
- suffix_array, linear_merger, binary_search_merger, false);
-
- PhraseLocation result = intersector->Intersect(
- prefix, extended_prefix_locs, suffix, extended_suffix_locs, phrase);
- PhraseLocation expected_result(expected_locs, 3);
-
- EXPECT_EQ(expected_result, result);
- EXPECT_EQ(ex_prefix_locs, *extended_prefix_locs.matchings);
- EXPECT_EQ(ex_suffix_locs, *extended_suffix_locs.matchings);
-}
-
-} // namespace
diff --git a/extractor/linear_merger.cc b/extractor/linear_merger.cc
deleted file mode 100644
index e7a32788..00000000
--- a/extractor/linear_merger.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-#include "linear_merger.h"
-
-#include <cmath>
-
-#include "data_array.h"
-#include "matching.h"
-#include "matching_comparator.h"
-#include "phrase.h"
-#include "phrase_location.h"
-#include "vocabulary.h"
-
-LinearMerger::LinearMerger(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<DataArray> data_array,
- shared_ptr<MatchingComparator> comparator) :
- vocabulary(vocabulary), data_array(data_array), comparator(comparator) {}
-
-LinearMerger::LinearMerger() {}
-
-LinearMerger::~LinearMerger() {}
-
-void LinearMerger::Merge(
- vector<int>& locations, const Phrase& phrase, const Phrase& suffix,
- vector<int>::iterator prefix_start, vector<int>::iterator prefix_end,
- vector<int>::iterator suffix_start, vector<int>::iterator suffix_end,
- int prefix_subpatterns, int suffix_subpatterns) {
- int last_chunk_len = suffix.GetChunkLen(suffix.Arity());
- bool offset = !vocabulary->IsTerminal(suffix.GetSymbol(0));
-
- while (prefix_start != prefix_end) {
- Matching left(prefix_start, prefix_subpatterns,
- data_array->GetSentenceId(*prefix_start));
-
- while (suffix_start != suffix_end) {
- Matching right(suffix_start, suffix_subpatterns,
- data_array->GetSentenceId(*suffix_start));
- if (comparator->Compare(left, right, last_chunk_len, offset) > 0) {
- suffix_start += suffix_subpatterns;
- } else {
- break;
- }
- }
-
- int start_position = *prefix_start;
- vector<int> :: iterator i = suffix_start;
- while (prefix_start != prefix_end && *prefix_start == start_position) {
- Matching left(prefix_start, prefix_subpatterns,
- data_array->GetSentenceId(*prefix_start));
-
- while (i != suffix_end) {
- Matching right(i, suffix_subpatterns, data_array->GetSentenceId(*i));
- int comparison = comparator->Compare(left, right, last_chunk_len,
- offset);
- if (comparison == 0) {
- vector<int> merged = left.Merge(right, phrase.Arity() + 1);
- locations.insert(locations.end(), merged.begin(), merged.end());
- } else if (comparison < 0) {
- break;
- }
- i += suffix_subpatterns;
- }
-
- prefix_start += prefix_subpatterns;
- }
- }
-}
diff --git a/extractor/linear_merger.h b/extractor/linear_merger.h
deleted file mode 100644
index c3c7111e..00000000
--- a/extractor/linear_merger.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _LINEAR_MERGER_H_
-#define _LINEAR_MERGER_H_
-
-#include <memory>
-#include <vector>
-
-using namespace std;
-
-class MatchingComparator;
-class Phrase;
-class PhraseLocation;
-class DataArray;
-class Vocabulary;
-
-class LinearMerger {
- public:
- LinearMerger(shared_ptr<Vocabulary> vocabulary,
- shared_ptr<DataArray> data_array,
- shared_ptr<MatchingComparator> comparator);
-
- virtual ~LinearMerger();
-
- virtual void Merge(
- vector<int>& locations, const Phrase& phrase, const Phrase& suffix,
- vector<int>::iterator prefix_start, vector<int>::iterator prefix_end,
- vector<int>::iterator suffix_start, vector<int>::iterator suffix_end,
- int prefix_subpatterns, int suffix_subpatterns);
-
- protected:
- LinearMerger();
-
- private:
- shared_ptr<Vocabulary> vocabulary;
- shared_ptr<DataArray> data_array;
- shared_ptr<MatchingComparator> comparator;
-};
-
-#endif
diff --git a/extractor/linear_merger_test.cc b/extractor/linear_merger_test.cc
deleted file mode 100644
index a6963430..00000000
--- a/extractor/linear_merger_test.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <memory>
-
-#include "linear_merger.h"
-#include "matching_comparator.h"
-#include "mocks/mock_data_array.h"
-#include "mocks/mock_vocabulary.h"
-#include "phrase.h"
-#include "phrase_location.h"
-#include "phrase_builder.h"
-
-using namespace std;
-using namespace ::testing;
-
-namespace {
-
-class LinearMergerTest : public Test {
- protected:
- virtual void SetUp() {
- shared_ptr<MockVocabulary> vocabulary = make_shared<MockVocabulary>();
- EXPECT_CALL(*vocabulary, GetTerminalValue(_))
- .WillRepeatedly(Return("word"));
-
- shared_ptr<MockDataArray> data_array = make_shared<MockDataArray>();
- EXPECT_CALL(*data_array, GetSentenceId(_))
- .WillRepeatedly(Return(1));
-
- shared_ptr<MatchingComparator> comparator =
- make_shared<MatchingComparator>(1, 20);
-
- phrase_builder = make_shared<PhraseBuilder>(vocabulary);
- linear_merger = make_shared<LinearMerger>(vocabulary, data_array,
- comparator);
- }
-
- shared_ptr<LinearMerger> linear_merger;
- shared_ptr<PhraseBuilder> phrase_builder;
-};
-
-TEST_F(LinearMergerTest, aXbTest) {
- vector<int> locations;
- // Encoding for him X it (see Adam's dissertation).
- vector<int> symbols{1, -1, 2};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs{2, 6, 10, 15};
- vector<int> suffix_locs{0, 4, 8, 13};
-
- linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1);
-
- vector<int> expected_locations{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13};
- EXPECT_EQ(expected_locations, locations);
-}
-
-TEST_F(LinearMergerTest, aXbXcTest) {
- vector<int> locations;
- // Encoding for it X him X it (see Adam's dissertation).
- vector<int> symbols{1, -1, 2, -2, 1};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2, -2, 1};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs{0, 2, 0, 6, 0, 10, 4, 6, 4, 10, 4, 15, 8, 10, 8, 15,
- 13, 15};
- vector<int> suffix_locs{2, 4, 2, 8, 2, 13, 6, 8, 6, 13, 10, 13};
-
- linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 2);
-
- vector<int> expected_locs{0, 2, 4, 0, 2, 8, 0, 2, 13, 0, 6, 8, 0, 6, 13, 0,
- 10, 13, 4, 6, 8, 4, 6, 13, 4, 10, 13, 8, 10, 13};
- EXPECT_EQ(expected_locs, locations);
-}
-
-TEST_F(LinearMergerTest, abXcXdTest) {
- // Sentence: Anna has many many nuts and sour apples and juicy apples.
- // Phrase: Anna has X and X apples.
- vector<int> locations;
- vector<int> symbols{1, 2, -1, 3, -2, 4};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{2, -1, 3, -2, 4};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs{1, 6, 1, 9};
- vector<int> suffix_locs{2, 6, 8, 2, 6, 11, 2, 9, 11};
-
- linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 2, 3);
-
- vector<int> expected_locs{1, 6, 8, 1, 6, 11, 1, 9, 11};
- EXPECT_EQ(expected_locs, locations);
-}
-
-TEST_F(LinearMergerTest, LargeTest) {
- vector<int> locations;
- vector<int> symbols{1, -1, 2};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs;
- for (int i = 0; i < 100; ++i) {
- prefix_locs.push_back(i * 20 + 1);
- }
- vector<int> suffix_locs;
- for (int i = 0; i < 100; ++i) {
- suffix_locs.push_back(i * 20 + 5);
- suffix_locs.push_back(i * 20 + 13);
- }
-
- linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1);
-
- EXPECT_EQ(400, locations.size());
- for (int i = 0; i < 100; ++i) {
- EXPECT_EQ(i * 20 + 1, locations[4 * i]);
- EXPECT_EQ(i * 20 + 5, locations[4 * i + 1]);
- EXPECT_EQ(i * 20 + 1, locations[4 * i + 2]);
- EXPECT_EQ(i * 20 + 13, locations[4 * i + 3]);
- }
-}
-
-TEST_F(LinearMergerTest, EmptyResultTest) {
- vector<int> locations;
- vector<int> symbols{1, -1, 2};
- Phrase phrase = phrase_builder->Build(symbols);
- vector<int> suffix_symbols{-1, 2};
- Phrase suffix = phrase_builder->Build(suffix_symbols);
-
- vector<int> prefix_locs;
- for (int i = 0; i < 100; ++i) {
- prefix_locs.push_back(i * 200 + 1);
- }
- vector<int> suffix_locs;
- for (int i = 0; i < 100; ++i) {
- suffix_locs.push_back(i * 200 + 101);
- }
-
- linear_merger->Merge(locations, phrase, suffix, prefix_locs.begin(),
- prefix_locs.end(), suffix_locs.begin(), suffix_locs.end(), 1, 1);
-
- EXPECT_EQ(0, locations.size());
-}
-
-} // namespace
diff --git a/extractor/matching.cc b/extractor/matching.cc
deleted file mode 100644
index 16a3ed6f..00000000
--- a/extractor/matching.cc
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "matching.h"
-
-Matching::Matching(vector<int>::iterator start, int len, int sentence_id) :
- positions(start, start + len), sentence_id(sentence_id) {}
-
-vector<int> Matching::Merge(const Matching& other, int num_subpatterns) const {
- vector<int> result = positions;
- if (num_subpatterns > positions.size()) {
- result.push_back(other.positions.back());
- }
- return result;
-}
diff --git a/extractor/matching.h b/extractor/matching.h
deleted file mode 100644
index 4c46559e..00000000
--- a/extractor/matching.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _MATCHING_H_
-#define _MATCHING_H_
-
-#include <memory>
-#include <vector>
-
-using namespace std;
-
-struct Matching {
- Matching(vector<int>::iterator start, int len, int sentence_id);
-
- vector<int> Merge(const Matching& other, int num_subpatterns) const;
-
- vector<int> positions;
- int sentence_id;
-};
-
-#endif
diff --git a/extractor/matching_comparator.cc b/extractor/matching_comparator.cc
deleted file mode 100644
index 03db95c0..00000000
--- a/extractor/matching_comparator.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "matching_comparator.h"
-
-#include "matching.h"
-#include "vocabulary.h"
-
-MatchingComparator::MatchingComparator(int min_gap_size, int max_rule_span) :
- min_gap_size(min_gap_size), max_rule_span(max_rule_span) {}
-
-int MatchingComparator::Compare(const Matching& left,
- const Matching& right,
- int last_chunk_len,
- bool offset) const {
- if (left.sentence_id != right.sentence_id) {
- return left.sentence_id < right.sentence_id ? -1 : 1;
- }
-
- if (left.positions.size() == 1 && right.positions.size() == 1) {
- // The prefix and the suffix must be separated by a non-terminal, otherwise
- // we would be doing a suffix array lookup.
- if (right.positions[0] - left.positions[0] <= min_gap_size) {
- return 1;
- }
- } else if (offset) {
- for (size_t i = 1; i < left.positions.size(); ++i) {
- if (left.positions[i] != right.positions[i - 1]) {
- return left.positions[i] < right.positions[i - 1] ? -1 : 1;
- }
- }
- } else {
- if (left.positions[0] + 1 != right.positions[0]) {
- return left.positions[0] + 1 < right.positions[0] ? -1 : 1;
- }
- for (size_t i = 1; i < left.positions.size(); ++i) {
- if (left.positions[i] != right.positions[i]) {
- return left.positions[i] < right.positions[i] ? -1 : 1;
- }
- }
- }
-
- if (right.positions.back() + last_chunk_len - left.positions.front() >
- max_rule_span) {
- return -1;
- }
-
- return 0;
-}
diff --git a/extractor/matching_comparator.h b/extractor/matching_comparator.h
deleted file mode 100644
index 6e1bb487..00000000
--- a/extractor/matching_comparator.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _MATCHING_COMPARATOR_H_
-#define _MATCHING_COMPARATOR_H_
-
-#include <memory>
-
-using namespace std;
-
-class Vocabulary;
-class Matching;
-
-class MatchingComparator {
- public:
- MatchingComparator(int min_gap_size, int max_rule_span);
-
- int Compare(const Matching& left, const Matching& right,
- int last_chunk_len, bool offset) const;
-
- private:
- int min_gap_size;
- int max_rule_span;
-};
-
-#endif
diff --git a/extractor/matching_comparator_test.cc b/extractor/matching_comparator_test.cc
deleted file mode 100644
index b8f898cf..00000000
--- a/extractor/matching_comparator_test.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "matching.h"
-#include "matching_comparator.h"
-
-using namespace ::testing;
-
-namespace {
-
-class MatchingComparatorTest : public Test {
- protected:
- virtual void SetUp() {
- comparator = make_shared<MatchingComparator>(1, 20);
- }
-
- shared_ptr<MatchingComparator> comparator;
-};
-
-TEST_F(MatchingComparatorTest, SmallerSentenceId) {
- vector<int> left_locations{1};
- Matching left(left_locations.begin(), 1, 1);
- vector<int> right_locations{100};
- Matching right(right_locations.begin(), 1, 5);
- EXPECT_EQ(-1, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, GreaterSentenceId) {
- vector<int> left_locations{100};
- Matching left(left_locations.begin(), 1, 5);
- vector<int> right_locations{1};
- Matching right(right_locations.begin(), 1, 1);
- EXPECT_EQ(1, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, SmalleraXb) {
- vector<int> left_locations{1};
- Matching left(left_locations.begin(), 1, 1);
- vector<int> right_locations{21};
- Matching right(right_locations.begin(), 1, 1);
- // The matching exceeds the max rule span.
- EXPECT_EQ(-1, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, EqualaXb) {
- vector<int> left_locations{1};
- Matching left(left_locations.begin(), 1, 1);
- vector<int> lower_right_locations{3};
- Matching right(lower_right_locations.begin(), 1, 1);
- EXPECT_EQ(0, comparator->Compare(left, right, 1, true));
-
- vector<int> higher_right_locations{20};
- right = Matching(higher_right_locations.begin(), 1, 1);
- EXPECT_EQ(0, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, GreateraXb) {
- vector<int> left_locations{1};
- Matching left(left_locations.begin(), 1, 1);
- vector<int> right_locations{2};
- Matching right(right_locations.begin(), 1, 1);
- // The gap between the prefix and the suffix is of size 0, less than the
- // min gap size.
- EXPECT_EQ(1, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, SmalleraXbXc) {
- vector<int> left_locations{1, 3};
- Matching left(left_locations.begin(), 2, 1);
- vector<int> right_locations{4, 6};
- // The common part doesn't match.
- Matching right(right_locations.begin(), 2, 1);
- EXPECT_EQ(-1, comparator->Compare(left, right, 1, true));
-
- // The common part matches, but the rule exceeds the max span.
- vector<int> other_right_locations{3, 21};
- right = Matching(other_right_locations.begin(), 2, 1);
- EXPECT_EQ(-1, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, EqualaXbXc) {
- vector<int> left_locations{1, 3};
- Matching left(left_locations.begin(), 2, 1);
- vector<int> right_locations{3, 5};
- // The leftmost possible match.
- Matching right(right_locations.begin(), 2, 1);
- EXPECT_EQ(0, comparator->Compare(left, right, 1, true));
-
- // The rightmost possible match.
- vector<int> other_right_locations{3, 20};
- right = Matching(other_right_locations.begin(), 2, 1);
- EXPECT_EQ(0, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, GreateraXbXc) {
- vector<int> left_locations{1, 4};
- Matching left(left_locations.begin(), 2, 1);
- vector<int> right_locations{3, 5};
- // The common part doesn't match.
- Matching right(right_locations.begin(), 2, 1);
- EXPECT_EQ(1, comparator->Compare(left, right, 1, true));
-}
-
-TEST_F(MatchingComparatorTest, SmallerabXcXd) {
- vector<int> left_locations{9, 13};
- Matching left(left_locations.begin(), 2, 1);
- // The suffix doesn't start on the next position.
- vector<int> right_locations{11, 13, 15};
- Matching right(right_locations.begin(), 3, 1);
- EXPECT_EQ(-1, comparator->Compare(left, right, 1, false));
-
- // The common part doesn't match.
- vector<int> other_right_locations{10, 16, 18};
- right = Matching(other_right_locations.begin(), 3, 1);
- EXPECT_EQ(-1, comparator->Compare(left, right, 1, false));
-}
-
-TEST_F(MatchingComparatorTest, EqualabXcXd) {
- vector<int> left_locations{10, 13};
- Matching left(left_locations.begin(), 2, 1);
- vector<int> right_locations{11, 13, 15};
- Matching right(right_locations.begin(), 3, 1);
- EXPECT_EQ(0, comparator->Compare(left, right, 1, false));
-}
-
-TEST_F(MatchingComparatorTest, GreaterabXcXd) {
- vector<int> left_locations{9, 15};
- Matching left(left_locations.begin(), 2, 1);
- // The suffix doesn't start on the next position.
- vector<int> right_locations{7, 15, 17};
- Matching right(right_locations.begin(), 3, 1);
- EXPECT_EQ(1, comparator->Compare(left, right, 1, false));
-
- // The common part doesn't match.
- vector<int> other_right_locations{10, 13, 16};
- right = Matching(other_right_locations.begin(), 3, 1);
- EXPECT_EQ(1, comparator->Compare(left, right, 1, false));
-}
-
-} // namespace
diff --git a/extractor/matching_test.cc b/extractor/matching_test.cc
deleted file mode 100644
index 9593aa86..00000000
--- a/extractor/matching_test.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <vector>
-
-#include "matching.h"
-
-using namespace std;
-
-namespace {
-
-TEST(MatchingTest, SameSize) {
- vector<int> positions{1, 2, 3};
- Matching left(positions.begin(), positions.size(), 0);
- Matching right(positions.begin(), positions.size(), 0);
- EXPECT_EQ(positions, left.Merge(right, positions.size()));
-}
-
-TEST(MatchingTest, DifferentSize) {
- vector<int> positions{1, 2, 3};
- Matching left(positions.begin(), positions.size() - 1, 0);
- Matching right(positions.begin() + 1, positions.size() - 1, 0);
- vector<int> result = left.Merge(right, positions.size());
-}
-
-} // namespace
diff --git a/extractor/mocks/mock_binary_search_merger.h b/extractor/mocks/mock_binary_search_merger.h
deleted file mode 100644
index e23386f0..00000000
--- a/extractor/mocks/mock_binary_search_merger.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <gmock/gmock.h>
-
-#include <vector>
-
-#include "../binary_search_merger.h"
-#include "../phrase.h"
-
-using namespace std;
-
-class MockBinarySearchMerger: public BinarySearchMerger {
- public:
- MOCK_CONST_METHOD9(Merge, void(vector<int>&, const Phrase&, const Phrase&,
- const vector<int>::iterator&, const vector<int>::iterator&,
- const vector<int>::iterator&, const vector<int>::iterator&, int, int));
-};
diff --git a/extractor/mocks/mock_intersector.h b/extractor/mocks/mock_intersector.h
deleted file mode 100644
index 372fa7ea..00000000
--- a/extractor/mocks/mock_intersector.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <gmock/gmock.h>
-
-#include "../intersector.h"
-#include "../phrase.h"
-#include "../phrase_location.h"
-
-class MockIntersector : public Intersector {
- public:
- MOCK_METHOD5(Intersect, PhraseLocation(const Phrase&, PhraseLocation&,
- const Phrase&, PhraseLocation&, const Phrase&));
-};
diff --git a/extractor/mocks/mock_linear_merger.h b/extractor/mocks/mock_linear_merger.h
deleted file mode 100644
index 522c1f31..00000000
--- a/extractor/mocks/mock_linear_merger.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <gmock/gmock.h>
-
-#include <vector>
-
-#include "../linear_merger.h"
-#include "../phrase.h"
-
-using namespace std;
-
-class MockLinearMerger: public LinearMerger {
- public:
- MOCK_METHOD9(Merge, void(vector<int>&, const Phrase&, const Phrase&,
- vector<int>::iterator, vector<int>::iterator, vector<int>::iterator,
- vector<int>::iterator, int, int));
-};
diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc
index fbdc7cce..4908dac0 100644
--- a/extractor/rule_factory.cc
+++ b/extractor/rule_factory.cc
@@ -7,9 +7,7 @@
#include "grammar.h"
#include "fast_intersector.h"
-#include "intersector.h"
#include "matchings_finder.h"
-#include "matching_comparator.h"
#include "phrase.h"
#include "rule.h"
#include "rule_extractor.h"
@@ -50,8 +48,6 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(
int max_nonterminals,
int max_rule_symbols,
int max_samples,
- bool use_fast_intersect,
- bool use_baeza_yates,
bool require_tight_phrases) :
vocabulary(vocabulary),
scorer(scorer),
@@ -59,13 +55,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(
max_rule_span(max_rule_span),
max_nonterminals(max_nonterminals),
max_chunks(max_nonterminals + 1),
- max_rule_symbols(max_rule_symbols),
- use_fast_intersect(use_fast_intersect) {
+ max_rule_symbols(max_rule_symbols) {
matchings_finder = make_shared<MatchingsFinder>(source_suffix_array);
- shared_ptr<MatchingComparator> comparator =
- make_shared<MatchingComparator>(min_gap_size, max_rule_span);
- intersector = make_shared<Intersector>(vocabulary, precomputation,
- source_suffix_array, comparator, use_baeza_yates);
fast_intersector = make_shared<FastIntersector>(source_suffix_array,
precomputation, vocabulary, max_rule_span, min_gap_size);
phrase_builder = make_shared<PhraseBuilder>(vocabulary);
@@ -78,7 +69,6 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(
HieroCachingRuleFactory::HieroCachingRuleFactory(
shared_ptr<MatchingsFinder> finder,
- shared_ptr<Intersector> intersector,
shared_ptr<FastIntersector> fast_intersector,
shared_ptr<PhraseBuilder> phrase_builder,
shared_ptr<RuleExtractor> rule_extractor,
@@ -89,10 +79,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(
int max_rule_span,
int max_nonterminals,
int max_chunks,
- int max_rule_symbols,
- bool use_fast_intersect) :
+ int max_rule_symbols) :
matchings_finder(finder),
- intersector(intersector),
fast_intersector(fast_intersector),
phrase_builder(phrase_builder),
rule_extractor(rule_extractor),
@@ -103,15 +91,13 @@ HieroCachingRuleFactory::HieroCachingRuleFactory(
max_rule_span(max_rule_span),
max_nonterminals(max_nonterminals),
max_chunks(max_chunks),
- max_rule_symbols(max_rule_symbols),
- use_fast_intersect(use_fast_intersect) {}
+ max_rule_symbols(max_rule_symbols) {}
HieroCachingRuleFactory::HieroCachingRuleFactory() {}
HieroCachingRuleFactory::~HieroCachingRuleFactory() {}
Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {
- intersector->sort_time = 0;
Clock::time_point start_time = Clock::now();
double total_extract_time = 0;
double total_intersect_time = 0;
@@ -164,17 +150,8 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {
PhraseLocation phrase_location;
if (next_phrase.Arity() > 0) {
Clock::time_point intersect_start = Clock::now();
- if (use_fast_intersect) {
- phrase_location = fast_intersector->Intersect(
- node->matchings, next_suffix_link->matchings, next_phrase);
- } else {
- phrase_location = intersector->Intersect(
- node->phrase,
- node->matchings,
- next_suffix_link->phrase,
- next_suffix_link->matchings,
- next_phrase);
- }
+ phrase_location = fast_intersector->Intersect(
+ node->matchings, next_suffix_link->matchings, next_phrase);
Clock::time_point intersect_stop = Clock::now();
total_intersect_time += GetDuration(intersect_start, intersect_stop);
} else {
diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h
index a39386a8..2d1c7f5a 100644
--- a/extractor/rule_factory.h
+++ b/extractor/rule_factory.h
@@ -14,7 +14,6 @@ class DataArray;
class Grammar;
class MatchingsFinder;
class FastIntersector;
-class Intersector;
class Precomputation;
class Rule;
class RuleExtractor;
@@ -38,14 +37,11 @@ class HieroCachingRuleFactory {
int max_nonterminals,
int max_rule_symbols,
int max_samples,
- bool use_fast_intersect,
- bool use_beaza_yates,
bool require_tight_phrases);
// For testing only.
HieroCachingRuleFactory(
shared_ptr<MatchingsFinder> finder,
- shared_ptr<Intersector> intersector,
shared_ptr<FastIntersector> fast_intersector,
shared_ptr<PhraseBuilder> phrase_builder,
shared_ptr<RuleExtractor> rule_extractor,
@@ -56,8 +52,7 @@ class HieroCachingRuleFactory {
int max_rule_span,
int max_nonterminals,
int max_chunks,
- int max_rule_symbols,
- bool use_fast_intersect);
+ int max_rule_symbols);
virtual ~HieroCachingRuleFactory();
@@ -83,7 +78,6 @@ class HieroCachingRuleFactory {
const shared_ptr<TrieNode>& node);
shared_ptr<MatchingsFinder> matchings_finder;
- shared_ptr<Intersector> intersector;
shared_ptr<FastIntersector> fast_intersector;
MatchingsTrie trie;
shared_ptr<PhraseBuilder> phrase_builder;
@@ -96,7 +90,6 @@ class HieroCachingRuleFactory {
int max_nonterminals;
int max_chunks;
int max_rule_symbols;
- bool use_fast_intersect;
};
#endif
diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc
index d329382a..fc709461 100644
--- a/extractor/rule_factory_test.cc
+++ b/extractor/rule_factory_test.cc
@@ -6,7 +6,6 @@
#include "grammar.h"
#include "mocks/mock_fast_intersector.h"
-#include "mocks/mock_intersector.h"
#include "mocks/mock_matchings_finder.h"
#include "mocks/mock_rule_extractor.h"
#include "mocks/mock_sampler.h"
@@ -25,7 +24,6 @@ class RuleFactoryTest : public Test {
protected:
virtual void SetUp() {
finder = make_shared<MockMatchingsFinder>();
- intersector = make_shared<MockIntersector>();
fast_intersector = make_shared<MockFastIntersector>();
vocabulary = make_shared<MockVocabulary>();
@@ -55,7 +53,6 @@ class RuleFactoryTest : public Test {
vector<string> feature_names;
shared_ptr<MockMatchingsFinder> finder;
- shared_ptr<MockIntersector> intersector;
shared_ptr<MockFastIntersector> fast_intersector;
shared_ptr<MockVocabulary> vocabulary;
shared_ptr<PhraseBuilder> phrase_builder;
@@ -66,81 +63,39 @@ class RuleFactoryTest : public Test {
};
TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) {
- factory = make_shared<HieroCachingRuleFactory>(finder, intersector,
- fast_intersector, phrase_builder, extractor, vocabulary, sampler,
- scorer, 1, 10, 2, 3, 5, false);
+ factory = make_shared<HieroCachingRuleFactory>(finder, fast_intersector,
+ phrase_builder, extractor, vocabulary, sampler, scorer, 1, 10, 2, 3, 5);
EXPECT_CALL(*finder, Find(_, _, _))
.Times(6)
.WillRepeatedly(Return(PhraseLocation(0, 1)));
- EXPECT_CALL(*intersector, Intersect(_, _, _, _, _))
+ EXPECT_CALL(*fast_intersector, Intersect(_, _, _))
.Times(1)
.WillRepeatedly(Return(PhraseLocation(0, 1)));
- EXPECT_CALL(*fast_intersector, Intersect(_, _, _)).Times(0);
vector<int> word_ids = {2, 3, 4};
Grammar grammar = factory->GetGrammar(word_ids);
EXPECT_EQ(feature_names, grammar.GetFeatureNames());
EXPECT_EQ(7, grammar.GetRules().size());
-
- // Test for fast intersector.
- factory = make_shared<HieroCachingRuleFactory>(finder, intersector,
- fast_intersector, phrase_builder, extractor, vocabulary, sampler,
- scorer, 1, 10, 2, 3, 5, true);
-
- EXPECT_CALL(*finder, Find(_, _, _))
- .Times(6)
- .WillRepeatedly(Return(PhraseLocation(0, 1)));
-
- EXPECT_CALL(*fast_intersector, Intersect(_, _, _))
- .Times(1)
- .WillRepeatedly(Return(PhraseLocation(0, 1)));
- EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)).Times(0);
-
- grammar = factory->GetGrammar(word_ids);
- EXPECT_EQ(feature_names, grammar.GetFeatureNames());
- EXPECT_EQ(7, grammar.GetRules().size());
}
TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) {
- factory = make_shared<HieroCachingRuleFactory>(finder, intersector,
- fast_intersector, phrase_builder, extractor, vocabulary, sampler,
- scorer, 1, 10, 2, 3, 5, false);
+ factory = make_shared<HieroCachingRuleFactory>(finder, fast_intersector,
+ phrase_builder, extractor, vocabulary, sampler, scorer, 1, 10, 2, 3, 5);
EXPECT_CALL(*finder, Find(_, _, _))
.Times(12)
.WillRepeatedly(Return(PhraseLocation(0, 1)));
- EXPECT_CALL(*intersector, Intersect(_, _, _, _, _))
+ EXPECT_CALL(*fast_intersector, Intersect(_, _, _))
.Times(16)
.WillRepeatedly(Return(PhraseLocation(0, 1)));
- EXPECT_CALL(*fast_intersector, Intersect(_, _, _)).Times(0);
-
vector<int> word_ids = {2, 3, 4, 2, 3};
Grammar grammar = factory->GetGrammar(word_ids);
EXPECT_EQ(feature_names, grammar.GetFeatureNames());
EXPECT_EQ(28, grammar.GetRules().size());
-
- // Test for fast intersector.
- factory = make_shared<HieroCachingRuleFactory>(finder, intersector,
- fast_intersector, phrase_builder, extractor, vocabulary, sampler,
- scorer, 1, 10, 2, 3, 5, true);
-
- EXPECT_CALL(*finder, Find(_, _, _))
- .Times(12)
- .WillRepeatedly(Return(PhraseLocation(0, 1)));
-
- EXPECT_CALL(*fast_intersector, Intersect(_, _, _))
- .Times(16)
- .WillRepeatedly(Return(PhraseLocation(0, 1)));
-
- EXPECT_CALL(*intersector, Intersect(_, _, _, _, _)).Times(0);
-
- grammar = factory->GetGrammar(word_ids);
- EXPECT_EQ(feature_names, grammar.GetFeatureNames());
- EXPECT_EQ(28, grammar.GetRules().size());
}
} // namespace
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 2b01e832..36dbd7a0 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -66,13 +66,9 @@ int main(int argc, char** argv) {
"Minimum number of occurences for a pharse to be considered frequent")
("max_samples", po::value<int>()->default_value(300),
"Maximum number of samples")
- ("fast_intersect", po::value<bool>()->default_value(false),
- "Enable fast intersect")
// TODO(pauldb): Check if this works when set to false.
("tight_phrases", po::value<bool>()->default_value(true),
- "False if phrases may be loose (better, but slower)")
- ("baeza_yates", po::value<bool>()->default_value(true),
- "Use double binary search");
+ "False if phrases may be loose (better, but slower)");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
@@ -178,8 +174,6 @@ int main(int argc, char** argv) {
vm["max_nonterminals"].as<int>(),
vm["max_rule_symbols"].as<int>(),
vm["max_samples"].as<int>(),
- vm["fast_intersect"].as<bool>(),
- vm["baeza_yates"].as<bool>(),
vm["tight_phrases"].as<bool>());
int grammar_id = 0;
diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc
index 23c458a4..ab8a0913 100644
--- a/extractor/suffix_array.cc
+++ b/extractor/suffix_array.cc
@@ -136,7 +136,7 @@ void SuffixArray::TernaryQuicksort(int left, int right, int step,
vector<int> SuffixArray::BuildLCPArray() const {
Clock::time_point start_time = Clock::now();
- cerr << "Constructing LCP array..." << endl;
+ cerr << "\tConstructing LCP array..." << endl;
vector<int> lcp(suffix_array.size());
vector<int> rank(suffix_array.size());
@@ -165,7 +165,7 @@ vector<int> SuffixArray::BuildLCPArray() const {
}
Clock::time_point stop_time = Clock::now();
- cerr << "Constructing LCP took "
+ cerr << "\tConstructing LCP took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
return lcp;
diff --git a/extractor/veb.cc b/extractor/veb.cc
deleted file mode 100644
index f38f672e..00000000
--- a/extractor/veb.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-#include "veb.h"
-
-#include "veb_bitset.h"
-#include "veb_tree.h"
-
-int VEB::MIN_BOTTOM_BITS = 5;
-int VEB::MIN_BOTTOM_SIZE = 1 << VEB::MIN_BOTTOM_BITS;
-
-shared_ptr<VEB> VEB::Create(int size) {
- if (size > MIN_BOTTOM_SIZE) {
- return shared_ptr<VEB>(new VEBTree(size));
- } else {
- return shared_ptr<VEB>(new VEBBitset(size));
- }
-}
-
-int VEB::GetMinimum() {
- return min;
-}
-
-int VEB::GetMaximum() {
- return max;
-}
-
-VEB::VEB(int min, int max) : min(min), max(max) {}
diff --git a/extractor/veb.h b/extractor/veb.h
deleted file mode 100644
index c8209cf7..00000000
--- a/extractor/veb.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _VEB_H_
-#define _VEB_H_
-
-#include <memory>
-
-using namespace std;
-
-class VEB {
- public:
- static shared_ptr<VEB> Create(int size);
-
- virtual void Insert(int value) = 0;
-
- virtual int GetSuccessor(int value) = 0;
-
- int GetMinimum();
-
- int GetMaximum();
-
- static int MIN_BOTTOM_BITS;
- static int MIN_BOTTOM_SIZE;
-
- protected:
- VEB(int min = -1, int max = -1);
-
- int min, max;
-};
-
-#endif
diff --git a/extractor/veb_bitset.cc b/extractor/veb_bitset.cc
deleted file mode 100644
index 4e364cc5..00000000
--- a/extractor/veb_bitset.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-#include "veb_bitset.h"
-
-using namespace std;
-
-VEBBitset::VEBBitset(int size) : bitset(size) {
- min = max = -1;
-}
-
-void VEBBitset::Insert(int value) {
- bitset[value] = 1;
- if (min == -1 || value < min) {
- min = value;
- }
- if (max == - 1 || value > max) {
- max = value;
- }
-}
-
-int VEBBitset::GetSuccessor(int value) {
- int next_value = bitset.find_next(value);
- if (next_value == bitset.npos) {
- return -1;
- }
- return next_value;
-}
diff --git a/extractor/veb_bitset.h b/extractor/veb_bitset.h
deleted file mode 100644
index f8a91234..00000000
--- a/extractor/veb_bitset.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _VEB_BITSET_H_
-#define _VEB_BITSET_H_
-
-#include <boost/dynamic_bitset.hpp>
-
-#include "veb.h"
-
-class VEBBitset: public VEB {
- public:
- VEBBitset(int size);
-
- void Insert(int value);
-
- int GetMinimum();
-
- int GetSuccessor(int value);
-
- private:
- boost::dynamic_bitset<> bitset;
-};
-
-#endif
diff --git a/extractor/veb_test.cc b/extractor/veb_test.cc
deleted file mode 100644
index c40c9f28..00000000
--- a/extractor/veb_test.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <vector>
-
-#include "veb.h"
-
-using namespace std;
-
-namespace {
-
-class VEBTest : public ::testing::Test {
- protected:
- void VEBSortTester(vector<int> values, int max_value) {
- shared_ptr<VEB> veb = VEB::Create(max_value);
- for (int value: values) {
- veb->Insert(value);
- }
-
- sort(values.begin(), values.end());
- EXPECT_EQ(values.front(), veb->GetMinimum());
- EXPECT_EQ(values.back(), veb->GetMaximum());
- for (size_t i = 0; i + 1 < values.size(); ++i) {
- EXPECT_EQ(values[i + 1], veb->GetSuccessor(values[i]));
- }
- EXPECT_EQ(-1, veb->GetSuccessor(values.back()));
- }
-};
-
-TEST_F(VEBTest, SmallRange) {
- vector<int> values{8, 13, 5, 1, 4, 15, 2, 10, 6, 7};
- VEBSortTester(values, 16);
-}
-
-TEST_F(VEBTest, MediumRange) {
- vector<int> values{167, 243, 88, 12, 137, 199, 212, 45, 150, 189};
- VEBSortTester(values, 255);
-}
-
-TEST_F(VEBTest, LargeRangeSparse) {
- vector<int> values;
- for (size_t i = 0; i < 100; ++i) {
- values.push_back(i * 1000000);
- }
- VEBSortTester(values, 100000000);
-}
-
-TEST_F(VEBTest, LargeRangeDense) {
- vector<int> values;
- for (size_t i = 0; i < 1000000; ++i) {
- values.push_back(i);
- }
- VEBSortTester(values, 1000000);
-}
-
-} // namespace
diff --git a/extractor/veb_tree.cc b/extractor/veb_tree.cc
deleted file mode 100644
index f8945445..00000000
--- a/extractor/veb_tree.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <cmath>
-
-#include "veb_tree.h"
-
-VEBTree::VEBTree(int size) {
- int num_bits = ceil(log2(size));
-
- lower_bits = num_bits >> 1;
- upper_size = (size >> lower_bits) + 1;
-
- clusters.reserve(upper_size);
- clusters.resize(upper_size);
-}
-
-int VEBTree::GetNextValue(int value) {
- return value & ((1 << lower_bits) - 1);
-}
-
-int VEBTree::GetCluster(int value) {
- return value >> lower_bits;
-}
-
-int VEBTree::Compose(int cluster, int value) {
- return (cluster << lower_bits) + value;
-}
-
-void VEBTree::Insert(int value) {
- if (min == -1 && max == -1) {
- min = max = value;
- return;
- }
-
- if (value < min) {
- swap(min, value);
- }
-
- int cluster = GetCluster(value), next_value = GetNextValue(value);
- if (clusters[cluster] == NULL) {
- clusters[cluster] = VEB::Create(1 << lower_bits);
- if (summary == NULL) {
- summary = VEB::Create(upper_size);
- }
- summary->Insert(cluster);
- }
- clusters[cluster]->Insert(next_value);
-
- if (value > max) {
- max = value;
- }
-}
-
-int VEBTree::GetSuccessor(int value) {
- if (value >= max) {
- return -1;
- }
- if (value < min) {
- return min;
- }
-
- int cluster = GetCluster(value), next_value = GetNextValue(value);
- if (clusters[cluster] != NULL &&
- next_value < clusters[cluster]->GetMaximum()) {
- return Compose(cluster, clusters[cluster]->GetSuccessor(next_value));
- } else {
- int next_cluster = summary->GetSuccessor(cluster);
- if (next_cluster == -1) {
- return -1;
- }
- return Compose(next_cluster, clusters[next_cluster]->GetMinimum());
- }
-}
diff --git a/extractor/veb_tree.h b/extractor/veb_tree.h
deleted file mode 100644
index 578d3e6a..00000000
--- a/extractor/veb_tree.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _VEB_TREE_H_
-#define _VEB_TREE_H_
-
-#include <memory>
-#include <vector>
-
-using namespace std;
-
-#include "veb.h"
-
-class VEBTree: public VEB {
- public:
- VEBTree(int size);
-
- void Insert(int value);
-
- int GetSuccessor(int value);
-
- private:
- int GetNextValue(int value);
- int GetCluster(int value);
- int Compose(int cluster, int value);
-
- int lower_bits, upper_size;
- shared_ptr<VEB> summary;
- vector<shared_ptr<VEB> > clusters;
-};
-
-#endif