diff options
-rw-r--r-- | extractor/grammar_extractor_test.cc | 7 | ||||
-rw-r--r-- | extractor/mocks/mock_rule_factory.h | 2 | ||||
-rw-r--r-- | extractor/rule_factory_test.cc | 8 | ||||
-rw-r--r-- | extractor/sampler.cc | 18 | ||||
-rw-r--r-- | extractor/sampler_test.cc | 24 | ||||
-rw-r--r-- | extractor/sampler_test_blacklist.cc | 102 |
6 files changed, 138 insertions, 23 deletions
diff --git a/extractor/grammar_extractor_test.cc b/extractor/grammar_extractor_test.cc index 823bb8b4..f32a9599 100644 --- a/extractor/grammar_extractor_test.cc +++ b/extractor/grammar_extractor_test.cc @@ -39,12 +39,15 @@ TEST(GrammarExtractorTest, TestAnnotatingWords) { vector<Rule> rules; vector<string> feature_names; Grammar grammar(rules, feature_names); - EXPECT_CALL(*factory, GetGrammar(word_ids)) + unordered_set<int> blacklisted_sentence_ids; + shared_ptr<DataArray> source_data_array; + EXPECT_CALL(*factory, GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array)) .WillOnce(Return(grammar)); GrammarExtractor extractor(vocabulary, factory); string sentence = "Anna has many many apples ."; - extractor.GetGrammar(sentence); + + extractor.GetGrammar(sentence, blacklisted_sentence_ids, source_data_array); } } // namespace diff --git a/extractor/mocks/mock_rule_factory.h b/extractor/mocks/mock_rule_factory.h index 7389b396..86a084b5 100644 --- a/extractor/mocks/mock_rule_factory.h +++ b/extractor/mocks/mock_rule_factory.h @@ -7,7 +7,7 @@ namespace extractor { class MockHieroCachingRuleFactory : public HieroCachingRuleFactory { public: - MOCK_METHOD1(GetGrammar, Grammar(const vector<int>& word_ids)); + MOCK_METHOD3(GetGrammar, Grammar(const vector<int>& word_ids, const unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array)); }; } // namespace extractor diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc index 08af3dcd..f26cc567 100644 --- a/extractor/rule_factory_test.cc +++ b/extractor/rule_factory_test.cc @@ -76,7 +76,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) { .WillRepeatedly(Return(PhraseLocation(0, 1))); vector<int> word_ids = {2, 3, 4}; - Grammar grammar = factory->GetGrammar(word_ids); + unordered_set<int> blacklisted_sentence_ids; + shared_ptr<DataArray> source_data_array; + Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(7, grammar.GetRules().size()); } @@ -94,7 +96,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) { .WillRepeatedly(Return(PhraseLocation(0, 1))); vector<int> word_ids = {2, 3, 4, 2, 3}; - Grammar grammar = factory->GetGrammar(word_ids); + unordered_set<int> blacklisted_sentence_ids; + shared_ptr<DataArray> source_data_array; + Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(28, grammar.GetRules().size()); } diff --git a/extractor/sampler.cc b/extractor/sampler.cc index cb470962..d332dd90 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -19,25 +19,25 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_set<int // Sample suffix array range. num_subpatterns = 1; int low = location.sa_low, high = location.sa_high; - double step = Round(max(1.0, (double) (high - low) / max_samples)); - int i = low, last = i; + double step = max(1.0, (double) (high - low) / max_samples); + double i = low, last = i; bool found; while (sample.size() < max_samples && i < high) { - int x = suffix_array->GetSuffix(i); + int x = suffix_array->GetSuffix(Round(i)); int id = source_data_array->GetSentenceId(x); if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { found = false; - int backoff_step = 1; + double backoff_step = 1; while (true) { if ((double)backoff_step >= step) break; - int j = i - backoff_step; - x = suffix_array->GetSuffix(j); + double j = i - backoff_step; + x = suffix_array->GetSuffix(Round(j)); id = source_data_array->GetSentenceId(x); - if (j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { found = true; last = i; break; } - int k = i + backoff_step; - x = suffix_array->GetSuffix(k); + double k = i + backoff_step; + x = suffix_array->GetSuffix(Round(k)); id = source_data_array->GetSentenceId(x); if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { found = true; last = k; break; diff --git a/extractor/sampler_test.cc b/extractor/sampler_test.cc index e9abebfa..965567ba 100644 --- a/extractor/sampler_test.cc +++ b/extractor/sampler_test.cc @@ -3,6 +3,7 @@ #include <memory> #include "mocks/mock_suffix_array.h" +#include "mocks/mock_data_array.h" #include "phrase_location.h" #include "sampler.h" @@ -15,6 +16,8 @@ namespace { class SamplerTest : public Test { protected: virtual void SetUp() { + source_data_array = make_shared<MockDataArray>(); + EXPECT_CALL(*source_data_array, GetSentenceId(_)).WillRepeatedly(Return(9999)); suffix_array = make_shared<MockSuffixArray>(); for (int i = 0; i < 10; ++i) { EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); @@ -23,51 +26,54 @@ class SamplerTest : public Test { shared_ptr<MockSuffixArray> suffix_array; shared_ptr<Sampler> sampler; + shared_ptr<MockDataArray> source_data_array; }; TEST_F(SamplerTest, TestSuffixArrayRange) { PhraseLocation location(0, 10); + unordered_set<int> blacklist; sampler = make_shared<Sampler>(suffix_array, 1); vector<int> expected_locations = {0}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 2); expected_locations = {0, 5}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 3); expected_locations = {0, 3, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 4); expected_locations = {0, 3, 5, 8}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 100); expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); } TEST_F(SamplerTest, TestSubstringsSample) { vector<int> locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + unordered_set<int> blacklist; PhraseLocation location(locations, 2); sampler = make_shared<Sampler>(suffix_array, 1); vector<int> expected_locations = {0, 1}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 2); expected_locations = {0, 1, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 3); expected_locations = {0, 1, 4, 5, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 7); expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); } } // namespace diff --git a/extractor/sampler_test_blacklist.cc b/extractor/sampler_test_blacklist.cc new file mode 100644 index 00000000..3305b990 --- /dev/null +++ b/extractor/sampler_test_blacklist.cc @@ -0,0 +1,102 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "mocks/mock_suffix_array.h" +#include "mocks/mock_data_array.h" +#include "phrase_location.h" +#include "sampler.h" + +using namespace std; +using namespace ::testing; + +namespace extractor { +namespace { + +class SamplerTestBlacklist : public Test { + protected: + virtual void SetUp() { + source_data_array = make_shared<MockDataArray>(); + for (int i = 0; i < 10; ++i) { + EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(i)); + } + for (int i = -10; i < 0; ++i) { + EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(0)); + } + suffix_array = make_shared<MockSuffixArray>(); + for (int i = -10; i < 10; ++i) { + EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); + } + } + + shared_ptr<MockSuffixArray> suffix_array; + shared_ptr<Sampler> sampler; + shared_ptr<MockDataArray> source_data_array; +}; + +TEST_F(SamplerTestBlacklist, TestSuffixArrayRange) { + PhraseLocation location(0, 10); + unordered_set<int> blacklist; + vector<int> expected_locations; + + blacklist.insert(0); + sampler = make_shared<Sampler>(suffix_array, 1); + expected_locations = {1}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + for (int i = 0; i < 9; i++) { + blacklist.insert(i); + } + sampler = make_shared<Sampler>(suffix_array, 1); + expected_locations = {9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(5); + sampler = make_shared<Sampler>(suffix_array, 2); + expected_locations = {1, 4}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(1); + blacklist.insert(2); + blacklist.insert(3); + sampler = make_shared<Sampler>(suffix_array, 2); + expected_locations = {4, 5}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(3); + blacklist.insert(7); + sampler = make_shared<Sampler>(suffix_array, 3); + expected_locations = {1, 2, 6}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(3); + blacklist.insert(5); + blacklist.insert(8); + sampler = make_shared<Sampler>(suffix_array, 4); + expected_locations = {1, 2, 4, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + sampler = make_shared<Sampler>(suffix_array, 100); + expected_locations = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(9); + sampler = make_shared<Sampler>(suffix_array, 100); + expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); +} + +} // namespace +} // namespace extractor |