From f528ac27dab11770f01595b043675dba2947a263 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sun, 24 Nov 2013 13:19:28 +0000 Subject: Reduce memory overhead for constructing the intersector. --- extractor/data_array_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'extractor/data_array_test.cc') diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc index 6c329e34..7b085cd9 100644 --- a/extractor/data_array_test.cc +++ b/extractor/data_array_test.cc @@ -56,6 +56,18 @@ TEST_F(DataArrayTest, TestGetData) { } } +TEST_F(DataArrayTest, TestSubstrings) { + vector expected_word_ids = {3, 4, 5}; + vector expected_words = {"are", "mere", "."}; + EXPECT_EQ(expected_word_ids, source_data.GetWordIds(1, 3)); + EXPECT_EQ(expected_words, source_data.GetWords(1, 3)); + + expected_word_ids = {7, 8}; + expected_words = {"a", "lot"}; + EXPECT_EQ(expected_word_ids, target_data.GetWordIds(7, 2)); + EXPECT_EQ(expected_words, target_data.GetWords(7, 2)); +} + TEST_F(DataArrayTest, TestVocabulary) { EXPECT_EQ(9, source_data.GetVocabularySize()); EXPECT_TRUE(source_data.HasWord("mere")); -- cgit v1.2.3 From 467ef6ce78cfe7341a696ebf0948e377be619ae5 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Mon, 25 Nov 2013 18:19:13 +0000 Subject: Reduce unordered_map calls. --- extractor/Makefile.am | 3 +-- extractor/data_array.cc | 4 ---- extractor/data_array.h | 3 --- extractor/data_array_test.cc | 4 ---- extractor/mocks/mock_data_array.h | 1 - extractor/suffix_array.cc | 4 ++-- extractor/suffix_array_test.cc | 6 +----- extractor/translation_table.cc | 14 ++++++-------- extractor/translation_table_test.cc | 10 ++-------- 9 files changed, 12 insertions(+), 37 deletions(-) (limited to 'extractor/data_array_test.cc') diff --git a/extractor/Makefile.am b/extractor/Makefile.am index faf25d89..65a3d436 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -53,8 +53,7 @@ endif noinst_PROGRAMS = $(RUNNABLE_TESTS) -# TESTS = $(RUNNABLE_TESTS) -TESTS = precomputation_test +TESTS = $(RUNNABLE_TESTS) alignment_test_SOURCES = alignment_test.cc alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a diff --git a/extractor/data_array.cc b/extractor/data_array.cc index 6757cae7..ac0493fd 100644 --- a/extractor/data_array.cc +++ b/extractor/data_array.cc @@ -115,10 +115,6 @@ int DataArray::GetSentenceId(int position) const { return sentence_id[position]; } -bool DataArray::HasWord(const string& word) const { - return word2id.count(word); -} - int DataArray::GetWordId(const string& word) const { auto result = word2id.find(word); return result == word2id.end() ? -1 : result->second; diff --git a/extractor/data_array.h b/extractor/data_array.h index e9af5bd0..c5dc8a26 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -65,9 +65,6 @@ class DataArray { // Returns the number of distinct words in the data array. virtual int GetVocabularySize() const; - // Returns whether a word has ever been observed in the data array. - virtual bool HasWord(const string& word) const; - // Returns the word id for a given word or -1 if it the word has never been // observed. virtual int GetWordId(const string& word) const; diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc index 6c329e34..b6b56561 100644 --- a/extractor/data_array_test.cc +++ b/extractor/data_array_test.cc @@ -58,16 +58,12 @@ TEST_F(DataArrayTest, TestGetData) { TEST_F(DataArrayTest, TestVocabulary) { EXPECT_EQ(9, source_data.GetVocabularySize()); - EXPECT_TRUE(source_data.HasWord("mere")); EXPECT_EQ(4, source_data.GetWordId("mere")); EXPECT_EQ("mere", source_data.GetWord(4)); - EXPECT_FALSE(source_data.HasWord("banane")); EXPECT_EQ(11, target_data.GetVocabularySize()); - EXPECT_TRUE(target_data.HasWord("apples")); EXPECT_EQ(4, target_data.GetWordId("apples")); EXPECT_EQ("apples", target_data.GetWord(4)); - EXPECT_FALSE(target_data.HasWord("bananas")); } TEST_F(DataArrayTest, TestSentenceData) { diff --git a/extractor/mocks/mock_data_array.h b/extractor/mocks/mock_data_array.h index d39cb0c4..edc525fa 100644 --- a/extractor/mocks/mock_data_array.h +++ b/extractor/mocks/mock_data_array.h @@ -11,7 +11,6 @@ class MockDataArray : public DataArray { MOCK_CONST_METHOD1(GetWordAtIndex, string(int index)); MOCK_CONST_METHOD0(GetSize, int()); MOCK_CONST_METHOD0(GetVocabularySize, int()); - MOCK_CONST_METHOD1(HasWord, bool(const string& word)); MOCK_CONST_METHOD1(GetWordId, int(const string& word)); MOCK_CONST_METHOD1(GetWord, string(int word_id)); MOCK_CONST_METHOD1(GetSentenceLength, int(int sentence_id)); diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc index ac230d13..4a514b12 100644 --- a/extractor/suffix_array.cc +++ b/extractor/suffix_array.cc @@ -187,12 +187,12 @@ shared_ptr SuffixArray::GetData() const { PhraseLocation SuffixArray::Lookup(int low, int high, const string& word, int offset) const { - if (!data_array->HasWord(word)) { + int word_id = data_array->GetWordId(word); + if (word_id == -1) { // Return empty phrase location. return PhraseLocation(0, 0); } - int word_id = data_array->GetWordId(word); if (offset == 0) { return PhraseLocation(word_start[word_id], word_start[word_id + 1]); } diff --git a/extractor/suffix_array_test.cc b/extractor/suffix_array_test.cc index a9fd1eab..161edbc0 100644 --- a/extractor/suffix_array_test.cc +++ b/extractor/suffix_array_test.cc @@ -55,22 +55,18 @@ TEST_F(SuffixArrayTest, TestLookup) { EXPECT_CALL(*data_array, AtIndex(i)).WillRepeatedly(Return(data[i])); } - EXPECT_CALL(*data_array, HasWord("word1")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word1")).WillRepeatedly(Return(6)); EXPECT_EQ(PhraseLocation(11, 14), suffix_array.Lookup(0, 14, "word1", 0)); - EXPECT_CALL(*data_array, HasWord("word2")).WillRepeatedly(Return(false)); + EXPECT_CALL(*data_array, GetWordId("word2")).WillRepeatedly(Return(-1)); EXPECT_EQ(PhraseLocation(0, 0), suffix_array.Lookup(0, 14, "word2", 0)); - EXPECT_CALL(*data_array, HasWord("word3")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word3")).WillRepeatedly(Return(4)); EXPECT_EQ(PhraseLocation(11, 13), suffix_array.Lookup(11, 14, "word3", 1)); - EXPECT_CALL(*data_array, HasWord("word4")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word4")).WillRepeatedly(Return(1)); EXPECT_EQ(PhraseLocation(11, 13), suffix_array.Lookup(11, 13, "word4", 2)); - EXPECT_CALL(*data_array, HasWord("word5")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word5")).WillRepeatedly(Return(2)); EXPECT_EQ(PhraseLocation(11, 13), suffix_array.Lookup(11, 13, "word5", 3)); diff --git a/extractor/translation_table.cc b/extractor/translation_table.cc index 1b1ba112..11e29e1e 100644 --- a/extractor/translation_table.cc +++ b/extractor/translation_table.cc @@ -90,13 +90,12 @@ void TranslationTable::IncrementLinksCount( double TranslationTable::GetTargetGivenSourceScore( const string& source_word, const string& target_word) { - if (!source_data_array->HasWord(source_word) || - !target_data_array->HasWord(target_word)) { + int source_id = source_data_array->GetWordId(source_word); + int target_id = target_data_array->GetWordId(target_word); + if (source_id == -1 || target_id == -1) { return -1; } - int source_id = source_data_array->GetWordId(source_word); - int target_id = target_data_array->GetWordId(target_word); auto entry = make_pair(source_id, target_id); auto it = translation_probabilities.find(entry); if (it == translation_probabilities.end()) { @@ -107,13 +106,12 @@ double TranslationTable::GetTargetGivenSourceScore( double TranslationTable::GetSourceGivenTargetScore( const string& source_word, const string& target_word) { - if (!source_data_array->HasWord(source_word) || - !target_data_array->HasWord(target_word)) { + int source_id = source_data_array->GetWordId(source_word); + int target_id = target_data_array->GetWordId(target_word); + if (source_id == -1 || target_id == -1) { return -1; } - int source_id = source_data_array->GetWordId(source_word); - int target_id = target_data_array->GetWordId(target_word); auto entry = make_pair(source_id, target_id); auto it = translation_probabilities.find(entry); if (it == translation_probabilities.end()) { diff --git a/extractor/translation_table_test.cc b/extractor/translation_table_test.cc index 72551a12..3cfc0011 100644 --- a/extractor/translation_table_test.cc +++ b/extractor/translation_table_test.cc @@ -36,13 +36,10 @@ class TranslationTableTest : public Test { .WillRepeatedly(Return(source_sentence_start[i])); } for (size_t i = 0; i < words.size(); ++i) { - EXPECT_CALL(*source_data_array, HasWord(words[i])) - .WillRepeatedly(Return(true)); EXPECT_CALL(*source_data_array, GetWordId(words[i])) .WillRepeatedly(Return(i + 2)); } - EXPECT_CALL(*source_data_array, HasWord("d")) - .WillRepeatedly(Return(false)); + EXPECT_CALL(*source_data_array, GetWordId("d")).WillRepeatedly(Return(-1)); vector target_data = {2, 3, 2, 3, 4, 5, 0, 3, 6, 0, 2, 7, 0}; vector target_sentence_start = {0, 7, 10, 13}; @@ -54,13 +51,10 @@ class TranslationTableTest : public Test { .WillRepeatedly(Return(target_sentence_start[i])); } for (size_t i = 0; i < words.size(); ++i) { - EXPECT_CALL(*target_data_array, HasWord(words[i])) - .WillRepeatedly(Return(true)); EXPECT_CALL(*target_data_array, GetWordId(words[i])) .WillRepeatedly(Return(i + 2)); } - EXPECT_CALL(*target_data_array, HasWord("d")) - .WillRepeatedly(Return(false)); + EXPECT_CALL(*target_data_array, GetWordId("d")).WillRepeatedly(Return(-1)); vector> links1 = { make_pair(0, 0), make_pair(1, 1), make_pair(2, 2), make_pair(3, 3), -- cgit v1.2.3