From 581ecdbad3c691e4e749098bd8568d9cdc48323c Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Mon, 1 Apr 2013 17:23:25 +0100 Subject: Minor speedup: pass sentence_id around. --- extractor/mocks/mock_rule_extractor_helper.h | 14 ++--- extractor/rule_extractor.cc | 21 ++++---- extractor/rule_extractor.h | 4 +- extractor/rule_extractor_helper.cc | 21 +++----- extractor/rule_extractor_helper.h | 11 ++-- extractor/rule_extractor_helper_test.cc | 76 +++++++++++++++------------- extractor/rule_extractor_test.cc | 10 ++-- 7 files changed, 80 insertions(+), 77 deletions(-) (limited to 'extractor') diff --git a/extractor/mocks/mock_rule_extractor_helper.h b/extractor/mocks/mock_rule_extractor_helper.h index cf196ab5..468468f6 100644 --- a/extractor/mocks/mock_rule_extractor_helper.h +++ b/extractor/mocks/mock_rule_extractor_helper.h @@ -14,13 +14,13 @@ class MockRuleExtractorHelper : public RuleExtractorHelper { public: MOCK_CONST_METHOD5(GetLinksSpans, void(vector&, vector&, vector&, vector&, int)); - MOCK_CONST_METHOD3(CheckAlignedTerminals, bool(const vector&, - const vector&, const vector&)); - MOCK_CONST_METHOD3(CheckTightPhrases, bool(const vector&, - const vector&, const vector&)); + MOCK_CONST_METHOD4(CheckAlignedTerminals, bool(const vector&, + const vector&, const vector&, int)); + MOCK_CONST_METHOD4(CheckTightPhrases, bool(const vector&, + const vector&, const vector&, int)); MOCK_CONST_METHOD1(GetGapOrder, vector(const vector >&)); - MOCK_CONST_METHOD3(GetSourceIndexes, Indexes(const vector&, - const vector&, int)); + MOCK_CONST_METHOD4(GetSourceIndexes, Indexes(const vector&, + const vector&, int, int)); // We need to implement these methods, because Google Mock doesn't support // methods with more than 10 arguments. @@ -40,7 +40,7 @@ class MockRuleExtractorHelper : public RuleExtractorHelper { vector >& target_gaps, const vector&, const vector&, const vector&, const vector&, const vector&, const vector&, - int, int, int, int, int& num_symbols, + int, int, int, int, int, int, int& num_symbols, bool& met_constraints) const { source_gaps = this->source_gaps; target_gaps = this->target_gaps; diff --git a/extractor/rule_extractor.cc b/extractor/rule_extractor.cc index 9f5e8e00..fa7386a4 100644 --- a/extractor/rule_extractor.cc +++ b/extractor/rule_extractor.cc @@ -140,8 +140,10 @@ vector RuleExtractor::ExtractAlignments( } // Basic checks to see if we can extract phrase pairs for this occurrence. - if (!helper->CheckAlignedTerminals(matching, chunklen, source_low) || - !helper->CheckTightPhrases(matching, chunklen, source_low)) { + if (!helper->CheckAlignedTerminals(matching, chunklen, source_low, + source_sent_start) || + !helper->CheckTightPhrases(matching, chunklen, source_low, + source_sent_start)) { return extracts; } @@ -167,7 +169,8 @@ vector RuleExtractor::ExtractAlignments( if (!helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, source_back_low, source_back_high, - num_symbols, met_constraints)) { + sentence_id, source_sent_start, num_symbols, + met_constraints)) { return extracts; } @@ -177,7 +180,7 @@ vector RuleExtractor::ExtractAlignments( Phrase source_phrase = phrase_builder->Extend( phrase, starts_with_x, ends_with_x); unordered_map source_indexes = helper->GetSourceIndexes( - matching, chunklen, starts_with_x); + matching, chunklen, starts_with_x, source_sent_start); if (met_constraints) { AddExtracts(extracts, source_phrase, source_indexes, target_gaps, target_low, target_phrase_low, target_phrase_high, sentence_id); @@ -196,8 +199,8 @@ vector RuleExtractor::ExtractAlignments( for (int j = 1 - i; j < 2; ++j) { AddNonterminalExtremities(extracts, matching, chunklen, source_phrase, source_back_low, source_back_high, source_low, source_high, - target_low, target_high, target_gaps, sentence_id, starts_with_x, - ends_with_x, i, j); + target_low, target_high, target_gaps, sentence_id, source_sent_start, + starts_with_x, ends_with_x, i, j); } } @@ -230,8 +233,8 @@ void RuleExtractor::AddNonterminalExtremities( int source_back_low, int source_back_high, const vector& source_low, const vector& source_high, const vector& target_low, const vector& target_high, vector > target_gaps, - int sentence_id, int starts_with_x, int ends_with_x, int extend_left, - int extend_right) const { + int sentence_id, int source_sent_start, int starts_with_x, int ends_with_x, + int extend_left, int extend_right) const { int source_x_low = source_back_low, source_x_high = source_back_high; // Check if the extended source phrase will remain tight. @@ -332,7 +335,7 @@ void RuleExtractor::AddNonterminalExtremities( Phrase new_source_phrase = phrase_builder->Extend(source_phrase, extend_left, extend_right); unordered_map source_indexes = helper->GetSourceIndexes( - matching, chunklen, extend_left || starts_with_x); + matching, chunklen, extend_left || starts_with_x, source_sent_start); AddExtracts(extracts, new_source_phrase, source_indexes, target_gaps, target_low, target_x_low, target_x_high, sentence_id); } diff --git a/extractor/rule_extractor.h b/extractor/rule_extractor.h index bfec0225..26e6f21c 100644 --- a/extractor/rule_extractor.h +++ b/extractor/rule_extractor.h @@ -102,8 +102,8 @@ class RuleExtractor { int source_back_low, int source_back_high, const vector& source_low, const vector& source_high, const vector& target_low, const vector& target_high, vector > target_gaps, - int sentence_id, int starts_with_x, int ends_with_x, int extend_left, - int extend_right) const; + int sentence_id, int source_sent_start, int starts_with_x, + int ends_with_x, int extend_left, int extend_right) const; private: shared_ptr target_data_array; diff --git a/extractor/rule_extractor_helper.cc b/extractor/rule_extractor_helper.cc index 6410d147..8a9516f2 100644 --- a/extractor/rule_extractor_helper.cc +++ b/extractor/rule_extractor_helper.cc @@ -54,14 +54,12 @@ void RuleExtractorHelper::GetLinksSpans( bool RuleExtractorHelper::CheckAlignedTerminals( const vector& matching, const vector& chunklen, - const vector& source_low) const { + const vector& source_low, + int source_sent_start) const { if (!require_aligned_terminal) { return true; } - int sentence_id = source_data_array->GetSentenceId(matching[0]); - int source_sent_start = source_data_array->GetSentenceStart(sentence_id); - int num_aligned_chunks = 0; for (size_t i = 0; i < chunklen.size(); ++i) { for (size_t j = 0; j < chunklen[i]; ++j) { @@ -83,14 +81,13 @@ bool RuleExtractorHelper::CheckAlignedTerminals( bool RuleExtractorHelper::CheckTightPhrases( const vector& matching, const vector& chunklen, - const vector& source_low) const { + const vector& source_low, + int source_sent_start) const { if (!require_tight_phrases) { return true; } // Check if the chunk extremities are aligned. - int sentence_id = source_data_array->GetSentenceId(matching[0]); - int source_sent_start = source_data_array->GetSentenceStart(sentence_id); for (size_t i = 0; i + 1 < chunklen.size(); ++i) { int gap_start = matching[i] + chunklen[i] - source_sent_start; int gap_end = matching[i + 1] - 1 - source_sent_start; @@ -272,10 +269,8 @@ bool RuleExtractorHelper::GetGaps( const vector& source_low, const vector& source_high, const vector& target_low, const vector& target_high, int source_phrase_low, int source_phrase_high, int source_back_low, - int source_back_high, int& num_symbols, bool& met_constraints) const { - int sentence_id = source_data_array->GetSentenceId(matching[0]); - int source_sent_start = source_data_array->GetSentenceStart(sentence_id); - + int source_back_high, int sentence_id, int source_sent_start, + int& num_symbols, bool& met_constraints) const { if (source_back_low < source_phrase_low) { source_gaps.push_back(make_pair(source_back_low, source_phrase_low)); if (num_symbols >= max_rule_symbols) { @@ -351,10 +346,8 @@ vector RuleExtractorHelper::GetGapOrder( unordered_map RuleExtractorHelper::GetSourceIndexes( const vector& matching, const vector& chunklen, - int starts_with_x) const { + int starts_with_x, int source_sent_start) const { unordered_map source_indexes; - int sentence_id = source_data_array->GetSentenceId(matching[0]); - int source_sent_start = source_data_array->GetSentenceStart(sentence_id); int num_symbols = starts_with_x; for (size_t i = 0; i < matching.size(); ++i) { for (size_t j = 0; j < chunklen[i]; ++j) { diff --git a/extractor/rule_extractor_helper.h b/extractor/rule_extractor_helper.h index bea75bc3..d4ae45d4 100644 --- a/extractor/rule_extractor_helper.h +++ b/extractor/rule_extractor_helper.h @@ -36,12 +36,14 @@ class RuleExtractorHelper { // Check if one chunk (all chunks) is aligned at least in one point. virtual bool CheckAlignedTerminals(const vector& matching, const vector& chunklen, - const vector& source_low) const; + const vector& source_low, + int source_sent_start) const; // Check if the chunks are tight. virtual bool CheckTightPhrases(const vector& matching, const vector& chunklen, - const vector& source_low) const; + const vector& source_low, + int source_sent_start) const; // Find the target span and the reflected source span for a source phrase // occurrence. @@ -62,7 +64,8 @@ class RuleExtractorHelper { const vector& source_low, const vector& source_high, const vector& target_low, const vector& target_high, int source_phrase_low, int source_phrase_high, int source_back_low, - int source_back_high, int& num_symbols, bool& met_constraints) const; + int source_back_high, int sentence_id, int source_sent_start, + int& num_symbols, bool& met_constraints) const; // Get the order of the nonterminals in the target phrase. virtual vector GetGapOrder(const vector >& gaps) const; @@ -70,7 +73,7 @@ class RuleExtractorHelper { // Map each terminal symbol with its position in the source phrase. virtual unordered_map GetSourceIndexes( const vector& matching, const vector& chunklen, - int starts_with_x) const; + int starts_with_x, int source_sent_start) const; protected: RuleExtractorHelper(); diff --git a/extractor/rule_extractor_helper_test.cc b/extractor/rule_extractor_helper_test.cc index 24a322df..9b82abb1 100644 --- a/extractor/rule_extractor_helper_test.cc +++ b/extractor/rule_extractor_helper_test.cc @@ -18,10 +18,6 @@ class RuleExtractorHelperTest : public Test { source_data_array = make_shared(); EXPECT_CALL(*source_data_array, GetSentenceLength(_)) .WillRepeatedly(Return(12)); - EXPECT_CALL(*source_data_array, GetSentenceId(_)) - .WillRepeatedly(Return(5)); - EXPECT_CALL(*source_data_array, GetSentenceStart(_)) - .WillRepeatedly(Return(10)); target_data_array = make_shared(); EXPECT_CALL(*target_data_array, GetSentenceLength(_)) @@ -68,7 +64,8 @@ TEST_F(RuleExtractorHelperTest, TestCheckAlignedFalse) { EXPECT_CALL(*source_data_array, GetSentenceStart(_)).Times(0); vector matching, chunklen, source_low; - EXPECT_TRUE(helper->CheckAlignedTerminals(matching, chunklen, source_low)); + EXPECT_TRUE(helper->CheckAlignedTerminals(matching, chunklen, + source_low, 10)); } TEST_F(RuleExtractorHelperTest, TestCheckAlignedTerminal) { @@ -78,9 +75,11 @@ TEST_F(RuleExtractorHelperTest, TestCheckAlignedTerminal) { vector matching = {10, 12}; vector chunklen = {1, 3}; vector source_low = {-1, 1, -1, 3, -1}; - EXPECT_TRUE(helper->CheckAlignedTerminals(matching, chunklen, source_low)); + EXPECT_TRUE(helper->CheckAlignedTerminals(matching, chunklen, + source_low, 10)); source_low = {-1, 1, -1, -1, -1}; - EXPECT_FALSE(helper->CheckAlignedTerminals(matching, chunklen, source_low)); + EXPECT_FALSE(helper->CheckAlignedTerminals(matching, chunklen, + source_low, 10)); } TEST_F(RuleExtractorHelperTest, TestCheckAlignedChunks) { @@ -90,11 +89,14 @@ TEST_F(RuleExtractorHelperTest, TestCheckAlignedChunks) { vector matching = {10, 12}; vector chunklen = {1, 3}; vector source_low = {2, 1, -1, 3, -1}; - EXPECT_TRUE(helper->CheckAlignedTerminals(matching, chunklen, source_low)); + EXPECT_TRUE(helper->CheckAlignedTerminals(matching, chunklen, + source_low, 10)); source_low = {-1, 1, -1, 3, -1}; - EXPECT_FALSE(helper->CheckAlignedTerminals(matching, chunklen, source_low)); + EXPECT_FALSE(helper->CheckAlignedTerminals(matching, chunklen, + source_low, 10)); source_low = {2, 1, -1, -1, -1}; - EXPECT_FALSE(helper->CheckAlignedTerminals(matching, chunklen, source_low)); + EXPECT_FALSE(helper->CheckAlignedTerminals(matching, chunklen, + source_low, 10)); } @@ -105,7 +107,7 @@ TEST_F(RuleExtractorHelperTest, TestCheckTightPhrasesFalse) { EXPECT_CALL(*source_data_array, GetSentenceStart(_)).Times(0); vector matching, chunklen, source_low; - EXPECT_TRUE(helper->CheckTightPhrases(matching, chunklen, source_low)); + EXPECT_TRUE(helper->CheckTightPhrases(matching, chunklen, source_low, 10)); } TEST_F(RuleExtractorHelperTest, TestCheckTightPhrases) { @@ -116,20 +118,20 @@ TEST_F(RuleExtractorHelperTest, TestCheckTightPhrases) { vector chunklen = {2, 3, 1}; // No missing links. vector source_low = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - EXPECT_TRUE(helper->CheckTightPhrases(matching, chunklen, source_low)); + EXPECT_TRUE(helper->CheckTightPhrases(matching, chunklen, source_low, 10)); // Missing link at the beginning or ending of a gap. source_low = {0, 1, -1, 3, 4, 5, 6, 7, 8}; - EXPECT_FALSE(helper->CheckTightPhrases(matching, chunklen, source_low)); + EXPECT_FALSE(helper->CheckTightPhrases(matching, chunklen, source_low, 10)); source_low = {0, 1, 2, -1, 4, 5, 6, 7, 8}; - EXPECT_FALSE(helper->CheckTightPhrases(matching, chunklen, source_low)); + EXPECT_FALSE(helper->CheckTightPhrases(matching, chunklen, source_low, 10)); source_low = {0, 1, 2, 3, 4, 5, 6, -1, 8}; - EXPECT_FALSE(helper->CheckTightPhrases(matching, chunklen, source_low)); + EXPECT_FALSE(helper->CheckTightPhrases(matching, chunklen, source_low, 10)); // Missing link inside the gap. chunklen = {1, 3, 1}; source_low = {0, 1, -1, 3, 4, 5, 6, 7, 8}; - EXPECT_TRUE(helper->CheckTightPhrases(matching, chunklen, source_low)); + EXPECT_TRUE(helper->CheckTightPhrases(matching, chunklen, source_low, 10)); } TEST_F(RuleExtractorHelperTest, TestFindFixPointBadEdgeCase) { @@ -428,8 +430,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsExceedNumSymbols) { EXPECT_FALSE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); source_low = {0, 1, 2, 3, 4, 5, 5}; source_high = {1, 2, 3, 4, 5, 6, 6}; @@ -441,8 +443,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsExceedNumSymbols) { EXPECT_FALSE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); } TEST_F(RuleExtractorHelperTest, TestGetGapsExtensionsNotTight) { @@ -467,8 +469,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsExtensionsNotTight) { EXPECT_FALSE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); source_phrase_low = 1, source_phrase_high = 6; source_back_low = 1, source_back_high = 7; @@ -476,8 +478,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsExtensionsNotTight) { EXPECT_FALSE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); } TEST_F(RuleExtractorHelperTest, TestGetGapsNotTightExtremities) { @@ -502,8 +504,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsNotTightExtremities) { EXPECT_TRUE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); EXPECT_FALSE(met_constraints); vector > expected_gaps = {make_pair(2, 3), make_pair(4, 5)}; EXPECT_EQ(expected_gaps, source_gaps); @@ -519,8 +521,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsNotTightExtremities) { EXPECT_TRUE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); EXPECT_FALSE(met_constraints); EXPECT_EQ(expected_gaps, source_gaps); EXPECT_EQ(expected_gaps, target_gaps); @@ -548,8 +550,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapsWithExtensions) { EXPECT_TRUE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); vector > expected_source_gaps = { make_pair(1, 2), make_pair(3, 4), make_pair(5, 6) }; @@ -582,8 +584,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGaps) { EXPECT_TRUE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); vector > expected_source_gaps = { make_pair(2, 3), make_pair(4, 5) }; @@ -616,8 +618,8 @@ TEST_F(RuleExtractorHelperTest, TestGetGapIntegrityChecksFailed) { EXPECT_FALSE(helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low, source_high, target_low, target_high, source_phrase_low, source_phrase_high, - source_back_low, source_back_high, num_symbols, - met_constraints)); + source_back_low, source_back_high, 5, 10, + num_symbols, met_constraints)); } TEST_F(RuleExtractorHelperTest, TestGetSourceIndexes) { @@ -629,12 +631,14 @@ TEST_F(RuleExtractorHelperTest, TestGetSourceIndexes) { unordered_map expected_indexes = { {3, 1}, {4, 2}, {5, 3}, {8, 5}, {9, 6}, {11, 8} }; - EXPECT_EQ(expected_indexes, helper->GetSourceIndexes(matching, chunklen, 1)); + EXPECT_EQ(expected_indexes, helper->GetSourceIndexes(matching, chunklen, + 1, 10)); matching = {12, 17}; chunklen = {2, 4}; expected_indexes = {{2, 0}, {3, 1}, {7, 3}, {8, 4}, {9, 5}, {10, 6}}; - EXPECT_EQ(expected_indexes, helper->GetSourceIndexes(matching, chunklen, 0)); + EXPECT_EQ(expected_indexes, helper->GetSourceIndexes(matching, chunklen, + 0, 10)); } } // namespace diff --git a/extractor/rule_extractor_test.cc b/extractor/rule_extractor_test.cc index 1b543fc9..5c1501c7 100644 --- a/extractor/rule_extractor_test.cc +++ b/extractor/rule_extractor_test.cc @@ -32,12 +32,12 @@ class RuleExtractorTest : public Test { .WillRepeatedly(Return(10)); helper = make_shared(); - EXPECT_CALL(*helper, CheckAlignedTerminals(_, _, _)) + EXPECT_CALL(*helper, CheckAlignedTerminals(_, _, _, _)) .WillRepeatedly(Return(true)); - EXPECT_CALL(*helper, CheckTightPhrases(_, _, _)) + EXPECT_CALL(*helper, CheckTightPhrases(_, _, _, _)) .WillRepeatedly(Return(true)); unordered_map source_indexes; - EXPECT_CALL(*helper, GetSourceIndexes(_, _, _)) + EXPECT_CALL(*helper, GetSourceIndexes(_, _, _, _)) .WillRepeatedly(Return(source_indexes)); vocabulary = make_shared(); @@ -78,7 +78,7 @@ TEST_F(RuleExtractorTest, TestExtractRulesAlignedTerminalsFail) { vector matching = {2}; PhraseLocation phrase_location(matching, 1); EXPECT_CALL(*helper, GetLinksSpans(_, _, _, _, _)).Times(1); - EXPECT_CALL(*helper, CheckAlignedTerminals(_, _, _)) + EXPECT_CALL(*helper, CheckAlignedTerminals(_, _, _, _)) .WillRepeatedly(Return(false)); vector rules = extractor->ExtractRules(phrase, phrase_location); EXPECT_EQ(0, rules.size()); @@ -90,7 +90,7 @@ TEST_F(RuleExtractorTest, TestExtractRulesTightPhrasesFail) { vector matching = {2}; PhraseLocation phrase_location(matching, 1); EXPECT_CALL(*helper, GetLinksSpans(_, _, _, _, _)).Times(1); - EXPECT_CALL(*helper, CheckTightPhrases(_, _, _)) + EXPECT_CALL(*helper, CheckTightPhrases(_, _, _, _)) .WillRepeatedly(Return(false)); vector rules = extractor->ExtractRules(phrase, phrase_location); EXPECT_EQ(0, rules.size()); -- cgit v1.2.3