From 2d2d5eced93d58bc77894d8c328195cd9950b96d Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Wed, 13 Nov 2013 18:00:10 +0100
Subject: unit tests for extractor loo sampling
---
extractor/grammar_extractor_test.cc | 7 ++-
extractor/mocks/mock_rule_factory.h | 2 +-
extractor/rule_factory_test.cc | 8 ++-
extractor/sampler.cc | 18 +++----
extractor/sampler_test.cc | 24 +++++----
extractor/sampler_test_blacklist.cc | 102 ++++++++++++++++++++++++++++++++++++
6 files changed, 138 insertions(+), 23 deletions(-)
create mode 100644 extractor/sampler_test_blacklist.cc
(limited to 'extractor')
diff --git a/extractor/grammar_extractor_test.cc b/extractor/grammar_extractor_test.cc
index 823bb8b4..f32a9599 100644
--- a/extractor/grammar_extractor_test.cc
+++ b/extractor/grammar_extractor_test.cc
@@ -39,12 +39,15 @@ TEST(GrammarExtractorTest, TestAnnotatingWords) {
vector rules;
vector feature_names;
Grammar grammar(rules, feature_names);
- EXPECT_CALL(*factory, GetGrammar(word_ids))
+ unordered_set blacklisted_sentence_ids;
+ shared_ptr source_data_array;
+ EXPECT_CALL(*factory, GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array))
.WillOnce(Return(grammar));
GrammarExtractor extractor(vocabulary, factory);
string sentence = "Anna has many many apples .";
- extractor.GetGrammar(sentence);
+
+ extractor.GetGrammar(sentence, blacklisted_sentence_ids, source_data_array);
}
} // namespace
diff --git a/extractor/mocks/mock_rule_factory.h b/extractor/mocks/mock_rule_factory.h
index 7389b396..86a084b5 100644
--- a/extractor/mocks/mock_rule_factory.h
+++ b/extractor/mocks/mock_rule_factory.h
@@ -7,7 +7,7 @@ namespace extractor {
class MockHieroCachingRuleFactory : public HieroCachingRuleFactory {
public:
- MOCK_METHOD1(GetGrammar, Grammar(const vector& word_ids));
+ MOCK_METHOD3(GetGrammar, Grammar(const vector& word_ids, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array));  // three-argument mock: word ids, blacklisted sentence ids, source data array. NOTE(review): presumably has to mirror HieroCachingRuleFactory::GetGrammar exactly to override it -- confirm against rule_factory.h
};
} // namespace extractor
diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc
index 08af3dcd..f26cc567 100644
--- a/extractor/rule_factory_test.cc
+++ b/extractor/rule_factory_test.cc
@@ -76,7 +76,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) {
.WillRepeatedly(Return(PhraseLocation(0, 1)));
vector word_ids = {2, 3, 4};
- Grammar grammar = factory->GetGrammar(word_ids);
+ unordered_set blacklisted_sentence_ids;
+ shared_ptr source_data_array;
+ Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array);
EXPECT_EQ(feature_names, grammar.GetFeatureNames());
EXPECT_EQ(7, grammar.GetRules().size());
}
@@ -94,7 +96,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) {
.WillRepeatedly(Return(PhraseLocation(0, 1)));
vector word_ids = {2, 3, 4, 2, 3};
- Grammar grammar = factory->GetGrammar(word_ids);
+ unordered_set blacklisted_sentence_ids;
+ shared_ptr source_data_array;
+ Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array);
EXPECT_EQ(feature_names, grammar.GetFeatureNames());
EXPECT_EQ(28, grammar.GetRules().size());
}
diff --git a/extractor/sampler.cc b/extractor/sampler.cc
index cb470962..d332dd90 100644
--- a/extractor/sampler.cc
+++ b/extractor/sampler.cc
@@ -19,25 +19,25 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, unordered_setGetSuffix(i);
+ int x = suffix_array->GetSuffix(Round(i));
int id = source_data_array->GetSentenceId(x);
if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) {
found = false;
- int backoff_step = 1;
+ double backoff_step = 1;
while (true) {
if ((double)backoff_step >= step) break;
- int j = i - backoff_step;
- x = suffix_array->GetSuffix(j);
+ double j = i - backoff_step;
+ x = suffix_array->GetSuffix(Round(j));
id = source_data_array->GetSentenceId(x);
- if (j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
+ if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
found = true; last = i; break;
}
- int k = i + backoff_step;
- x = suffix_array->GetSuffix(k);
+ double k = i + backoff_step;
+ x = suffix_array->GetSuffix(Round(k));
id = source_data_array->GetSentenceId(x);
if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) {
found = true; last = k; break;
diff --git a/extractor/sampler_test.cc b/extractor/sampler_test.cc
index e9abebfa..965567ba 100644
--- a/extractor/sampler_test.cc
+++ b/extractor/sampler_test.cc
@@ -3,6 +3,7 @@
#include
#include "mocks/mock_suffix_array.h"
+#include "mocks/mock_data_array.h"
#include "phrase_location.h"
#include "sampler.h"
@@ -15,6 +16,8 @@ namespace {
class SamplerTest : public Test {
protected:
virtual void SetUp() {
+ source_data_array = make_shared();
+ EXPECT_CALL(*source_data_array, GetSentenceId(_)).WillRepeatedly(Return(9999));
suffix_array = make_shared();
for (int i = 0; i < 10; ++i) {
EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i));
@@ -23,51 +26,54 @@ class SamplerTest : public Test {
shared_ptr suffix_array;
shared_ptr sampler;
+ shared_ptr source_data_array;
};
TEST_F(SamplerTest, TestSuffixArrayRange) {
PhraseLocation location(0, 10);
+ unordered_set blacklist;  // never populated in this test, so every Sample() call below runs with an empty blacklist
sampler = make_shared(suffix_array, 1);
vector expected_locations = {0};
- EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));  // Sample() now also receives the blacklist and the fixture's mock data array; expected locations are unchanged
sampler = make_shared(suffix_array, 2);
expected_locations = {0, 5};
- EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
sampler = make_shared(suffix_array, 3);
expected_locations = {0, 3, 7};
- EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
sampler = make_shared(suffix_array, 4);
expected_locations = {0, 3, 5, 8};
- EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
sampler = make_shared(suffix_array, 100);
expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
- EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
}
TEST_F(SamplerTest, TestSubstringsSample) {
vector locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ unordered_set blacklist;  // never populated in this test, so every Sample() call below runs with an empty blacklist
PhraseLocation location(locations, 2);
sampler = make_shared(suffix_array, 1);
vector expected_locations = {0, 1};
- EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));  // Sample() now also receives the blacklist and the fixture's mock data array; expected locations are unchanged
sampler = make_shared(suffix_array, 2);
expected_locations = {0, 1, 6, 7};
- EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));
sampler = make_shared(suffix_array, 3);
expected_locations = {0, 1, 4, 5, 6, 7};
- EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));
sampler = make_shared(suffix_array, 7);
expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
- EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location));
+ EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array));
}
} // namespace
diff --git a/extractor/sampler_test_blacklist.cc b/extractor/sampler_test_blacklist.cc
new file mode 100644
index 00000000..3305b990
--- /dev/null
+++ b/extractor/sampler_test_blacklist.cc
@@ -0,0 +1,102 @@
+#include
+
+#include
+
+#include "mocks/mock_suffix_array.h"
+#include "mocks/mock_data_array.h"
+#include "phrase_location.h"
+#include "sampler.h"
+
+using namespace std;
+using namespace ::testing;
+
+namespace extractor {
+namespace {
+
+class SamplerTestBlacklist : public Test {  // Fixture for the blacklist sampling tests: SetUp() builds the mocked data array and suffix array that the test cases pass to the sampler.
+ protected:
+ virtual void SetUp() {
+ source_data_array = make_shared();
+ for (int i = 0; i < 10; ++i) {  // positions 0..9 each report their own index as sentence id
+ EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(i));
+ }
+ for (int i = -10; i < 0; ++i) {  // negative positions all report sentence id 0; presumably needed because the sampler's backward backoff can probe below the range start -- confirm against sampler.cc
+ EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(0));
+ }
+ suffix_array = make_shared();
+ for (int i = -10; i < 10; ++i) {  // identity mapping: GetSuffix(i) returns i for every i in [-10, 10)
+ EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i));
+ }
+ }
+
+ shared_ptr suffix_array;
+ shared_ptr sampler;  // assigned inside each test case, not in SetUp()
+ shared_ptr source_data_array;
+};
+
+TEST_F(SamplerTestBlacklist, TestSuffixArrayRange) {  // Checks which positions Sample() returns for the range (0, 10) under several blacklists and sampler settings; the blacklist is cleared between cases.
+ PhraseLocation location(0, 10);
+ unordered_set blacklist;
+ vector expected_locations;
+
+ blacklist.insert(0);  // case 1: id 0 blacklisted, sampler built with 1 -> expects {1}
+ sampler = make_shared(suffix_array, 1);
+ expected_locations = {1};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ for (int i = 0; i < 9; i++) {  // case 2: ids 0..8 blacklisted, sampler built with 1 -> expects only {9}
+ blacklist.insert(i);
+ }
+ sampler = make_shared(suffix_array, 1);
+ expected_locations = {9};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ blacklist.insert(0);  // case 3: ids 0 and 5 blacklisted (the two positions sampler_test.cc expects with an empty blacklist), sampler built with 2 -> expects {1, 4}
+ blacklist.insert(5);
+ sampler = make_shared(suffix_array, 2);
+ expected_locations = {1, 4};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ blacklist.insert(0);  // case 4: ids 0..3 blacklisted, sampler built with 2 -> expects {4, 5}
+ blacklist.insert(1);
+ blacklist.insert(2);
+ blacklist.insert(3);
+ sampler = make_shared(suffix_array, 2);
+ expected_locations = {4, 5};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ blacklist.insert(0);  // case 5: ids 0, 3 and 7 blacklisted (the positions sampler_test.cc expects with an empty blacklist), sampler built with 3 -> expects {1, 2, 6}
+ blacklist.insert(3);
+ blacklist.insert(7);
+ sampler = make_shared(suffix_array, 3);
+ expected_locations = {1, 2, 6};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ blacklist.insert(0);  // case 6: ids 0, 3, 5 and 8 blacklisted (the positions sampler_test.cc expects with an empty blacklist), sampler built with 4 -> expects {1, 2, 4, 7}
+ blacklist.insert(3);
+ blacklist.insert(5);
+ blacklist.insert(8);
+ sampler = make_shared(suffix_array, 4);
+ expected_locations = {1, 2, 4, 7};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ blacklist.insert(0);  // case 7: id 0 blacklisted, sampler built with 100 -> expects every position except 0
+ sampler = make_shared(suffix_array, 100);
+ expected_locations = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+ blacklist.clear();
+
+ blacklist.insert(9);  // case 8: id 9 blacklisted, sampler built with 100 -> expects every position except 9
+ sampler = make_shared(suffix_array, 100);
+ expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array));
+}
+
+} // namespace
+} // namespace extractor
--
cgit v1.2.3