From 7990c750829af93f0a1e0fc14534582f52ee9e8c Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sat, 9 Nov 2013 20:21:23 +0000 Subject: Remove unnecessary boost filesystem dependencies. --- extractor/alignment.cc | 2 -- extractor/alignment.h | 2 -- extractor/data_array.cc | 3 --- extractor/data_array.h | 2 -- extractor/precomputation.h | 2 -- extractor/suffix_array.cc | 1 - extractor/suffix_array.h | 2 -- extractor/translation_table.h | 2 -- 8 files changed, 16 deletions(-) diff --git a/extractor/alignment.cc b/extractor/alignment.cc index 2278c825..4a7a14f4 100644 --- a/extractor/alignment.cc +++ b/extractor/alignment.cc @@ -8,9 +8,7 @@ #include #include -#include -namespace fs = boost::filesystem; using namespace std; namespace extractor { diff --git a/extractor/alignment.h b/extractor/alignment.h index dc5a8b55..76c27da2 100644 --- a/extractor/alignment.h +++ b/extractor/alignment.h @@ -4,13 +4,11 @@ #include #include -#include #include #include #include #include -namespace fs = boost::filesystem; using namespace std; namespace extractor { diff --git a/extractor/data_array.cc b/extractor/data_array.cc index 2e4bdafb..82efcd51 100644 --- a/extractor/data_array.cc +++ b/extractor/data_array.cc @@ -5,9 +5,6 @@ #include #include -#include - -namespace fs = boost::filesystem; using namespace std; namespace extractor { diff --git a/extractor/data_array.h b/extractor/data_array.h index 2be6a09c..5207366d 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -5,13 +5,11 @@ #include #include -#include #include #include #include #include -namespace fs = boost::filesystem; using namespace std; namespace extractor { diff --git a/extractor/precomputation.h b/extractor/precomputation.h index 9f0c9424..e5fa3e37 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -7,13 +7,11 @@ #include #include -#include #include #include #include #include -namespace fs = boost::filesystem; using namespace std; namespace extractor { diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc index 0cf4d1f6..ac230d13 100644 --- a/extractor/suffix_array.cc +++ b/extractor/suffix_array.cc @@ -10,7 +10,6 @@ #include "phrase_location.h" #include "time_util.h" -namespace fs = boost::filesystem; using namespace std; using namespace chrono; diff --git a/extractor/suffix_array.h b/extractor/suffix_array.h index 8ee454ec..df80c152 100644 --- a/extractor/suffix_array.h +++ b/extractor/suffix_array.h @@ -5,12 +5,10 @@ #include #include -#include #include #include #include -namespace fs = boost::filesystem; using namespace std; namespace extractor { diff --git a/extractor/translation_table.h b/extractor/translation_table.h index 2a37bab7..97620727 100644 --- a/extractor/translation_table.h +++ b/extractor/translation_table.h @@ -5,14 +5,12 @@ #include #include -#include #include #include #include #include using namespace std; -namespace fs = boost::filesystem; namespace extractor { -- cgit v1.2.3 From 491c130ef1b0cbe52cf0880be30e4b3614646471 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sat, 23 Nov 2013 17:44:32 +0000 Subject: Update .gitignore. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5f573137..4acc057f 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,7 @@ jam-files/bjam jam-files/engine/bin.* jam-files/engine/bootstrap/ klm/lm/bin/ +klm/lm/builder/builder klm/lm/builder/lmplz klm/lm/build_binary klm/lm/ngram_query -- cgit v1.2.3 From 79206291f78fba893fda6a61ff0ae9264d00bb82 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sat, 23 Nov 2013 18:39:39 +0000 Subject: Fix broken extractor test. --- configure.ac | 2 +- extractor/mocks/mock_rule_factory.h | 4 +++- extractor/rule_factory.h | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 03bae25c..8136a7c7 100644 --- a/configure.ac +++ b/configure.ac @@ -199,7 +199,7 @@ AC_CONFIG_FILES([mteval/Makefile]) AC_CONFIG_FILES([mteval/meteor_jar.cc]) AC_CONFIG_FILES([decoder/Makefile]) AC_CONFIG_FILES([python/setup.py]) -#AC_CONFIG_FILES([extractor/Makefile]) +AC_CONFIG_FILES([extractor/Makefile]) AC_CONFIG_FILES([word-aligner/Makefile]) # KenLM stuff diff --git a/extractor/mocks/mock_rule_factory.h b/extractor/mocks/mock_rule_factory.h index 86a084b5..6b7b6586 100644 --- a/extractor/mocks/mock_rule_factory.h +++ b/extractor/mocks/mock_rule_factory.h @@ -7,7 +7,9 @@ namespace extractor { class MockHieroCachingRuleFactory : public HieroCachingRuleFactory { public: - MOCK_METHOD3(GetGrammar, Grammar(const vector& word_ids, const unordered_set blacklisted_sentence_ids, const shared_ptr source_data_array)); + MOCK_METHOD3(GetGrammar, Grammar(const vector& word_ids, const + unordered_set& blacklisted_sentence_ids, + const shared_ptr source_data_array)); }; } // namespace extractor diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index df63a9d8..a1ff76e4 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -72,7 +72,10 @@ class HieroCachingRuleFactory { // Constructs SCFG rules for a given sentence. // (See class description for more details.) - virtual Grammar GetGrammar(const vector& word_ids, const unordered_set& blacklisted_sentence_ids, const shared_ptr source_data_array); + virtual Grammar GetGrammar( + const vector& word_ids, + const unordered_set& blacklisted_sentence_ids, + const shared_ptr source_data_array); protected: HieroCachingRuleFactory(); -- cgit v1.2.3 From f528ac27dab11770f01595b043675dba2947a263 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sun, 24 Nov 2013 13:19:28 +0000 Subject: Reduce memory overhead for constructing the intersector. --- extractor/compile.cc | 4 ++ extractor/data_array.cc | 14 ++++- extractor/data_array.h | 10 +++- extractor/data_array_test.cc | 12 ++++ extractor/fast_intersector.cc | 40 ++++--------- extractor/fast_intersector.h | 8 +-- extractor/fast_intersector_test.cc | 10 ++-- extractor/grammar_extractor.cc | 5 +- extractor/grammar_extractor.h | 1 + extractor/mocks/mock_data_array.h | 4 +- extractor/mocks/mock_precomputation.h | 3 +- extractor/precomputation.cc | 96 +++++++++++++++-------------- extractor/precomputation.h | 45 +++++++------- extractor/precomputation_test.cc | 110 ++++++++++++++++++++++++---------- extractor/run_extractor.cc | 5 ++ extractor/suffix_array_test.cc | 2 +- extractor/translation_table_test.cc | 4 +- 17 files changed, 225 insertions(+), 148 deletions(-) diff --git a/extractor/compile.cc b/extractor/compile.cc index 65fdd509..0d62757e 100644 --- a/extractor/compile.cc +++ b/extractor/compile.cc @@ -13,6 +13,7 @@ #include "suffix_array.h" #include "time_util.h" #include "translation_table.h" +#include "vocabulary.h" namespace ar = boost::archive; namespace fs = boost::filesystem; @@ -125,9 +126,12 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; + shared_ptr vocabulary; + start_time = Clock::now(); cerr << "Precomputing collocations..." << endl; Precomputation precomputation( + vocabulary, source_suffix_array, vm["frequent"].as(), vm["super_frequent"].as(), diff --git a/extractor/data_array.cc b/extractor/data_array.cc index 82efcd51..dacc4283 100644 --- a/extractor/data_array.cc +++ b/extractor/data_array.cc @@ -78,7 +78,7 @@ void DataArray::CreateDataArray(const vector& lines) { DataArray::~DataArray() {} -const vector& DataArray::GetData() const { +vector DataArray::GetData() const { return data; } @@ -90,6 +90,18 @@ string DataArray::GetWordAtIndex(int index) const { return id2word[data[index]]; } +vector DataArray::GetWordIds(int index, int size) const { + return vector(data.begin() + index, data.begin() + index + size); +} + +vector DataArray::GetWords(int start_index, int size) const { + vector words; + for (int word_id: GetWordIds(start_index, size)) { + words.push_back(id2word[word_id]); + } + return words; +} + int DataArray::GetSize() const { return data.size(); } diff --git a/extractor/data_array.h b/extractor/data_array.h index 5207366d..e3823d18 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -51,7 +51,7 @@ class DataArray { virtual ~DataArray(); // Returns a vector containing the word ids. - virtual const vector& GetData() const; + virtual vector GetData() const; // Returns the word id at the specified position. virtual int AtIndex(int index) const; @@ -59,6 +59,14 @@ class DataArray { // Returns the original word at the specified position. virtual string GetWordAtIndex(int index) const; + // Returns the substring of word ids starting at the specified position and + // having the specified length. + virtual vector GetWordIds(int start_index, int size) const; + + // Returns the substring of words starting at the specified position and + // having the specified length. + virtual vector GetWords(int start_index, int size) const; + // Returns the size of the data array. virtual int GetSize() const; diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc index 6c329e34..7b085cd9 100644 --- a/extractor/data_array_test.cc +++ b/extractor/data_array_test.cc @@ -56,6 +56,18 @@ TEST_F(DataArrayTest, TestGetData) { } } +TEST_F(DataArrayTest, TestSubstrings) { + vector expected_word_ids = {3, 4, 5}; + vector expected_words = {"are", "mere", "."}; + EXPECT_EQ(expected_word_ids, source_data.GetWordIds(1, 3)); + EXPECT_EQ(expected_words, source_data.GetWords(1, 3)); + + expected_word_ids = {7, 8}; + expected_words = {"a", "lot"}; + EXPECT_EQ(expected_word_ids, target_data.GetWordIds(7, 2)); + EXPECT_EQ(expected_words, target_data.GetWords(7, 2)); +} + TEST_F(DataArrayTest, TestVocabulary) { EXPECT_EQ(9, source_data.GetVocabularySize()); EXPECT_TRUE(source_data.HasWord("mere")); diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc index a8591a72..0d1fa6d8 100644 --- a/extractor/fast_intersector.cc +++ b/extractor/fast_intersector.cc @@ -11,41 +11,22 @@ namespace extractor { -FastIntersector::FastIntersector(shared_ptr suffix_array, - shared_ptr precomputation, - shared_ptr vocabulary, - int max_rule_span, - int min_gap_size) : +FastIntersector::FastIntersector( + shared_ptr suffix_array, + shared_ptr precomputation, + shared_ptr vocabulary, + int max_rule_span, + int min_gap_size) : suffix_array(suffix_array), + precomputation(precomputation), vocabulary(vocabulary), max_rule_span(max_rule_span), - min_gap_size(min_gap_size) { - Index precomputed_collocations = precomputation->GetCollocations(); - for (pair, vector> entry: precomputed_collocations) { - vector phrase = ConvertPhrase(entry.first); - collocations[phrase] = entry.second; - } -} + min_gap_size(min_gap_size) {} FastIntersector::FastIntersector() {} FastIntersector::~FastIntersector() {} -vector FastIntersector::ConvertPhrase(const vector& old_phrase) { - vector new_phrase; - new_phrase.reserve(old_phrase.size()); - shared_ptr data_array = suffix_array->GetData(); - for (int word_id: old_phrase) { - if (word_id < 0) { - new_phrase.push_back(word_id); - } else { - new_phrase.push_back( - vocabulary->GetTerminalIndex(data_array->GetWord(word_id))); - } - } - return new_phrase; -} - PhraseLocation FastIntersector::Intersect( PhraseLocation& prefix_location, PhraseLocation& suffix_location, @@ -59,8 +40,9 @@ PhraseLocation FastIntersector::Intersect( assert(vocabulary->IsTerminal(symbols.front()) && vocabulary->IsTerminal(symbols.back())); - if (collocations.count(symbols)) { - return PhraseLocation(collocations[symbols], phrase.Arity() + 1); + if (precomputation->Contains(symbols)) { + return PhraseLocation(precomputation->GetCollocations(symbols), + phrase.Arity() + 1); } bool prefix_ends_with_x = diff --git a/extractor/fast_intersector.h b/extractor/fast_intersector.h index 2819d239..305373dc 100644 --- a/extractor/fast_intersector.h +++ b/extractor/fast_intersector.h @@ -12,7 +12,6 @@ using namespace std; namespace extractor { typedef boost::hash> VectorHash; -typedef unordered_map, vector, VectorHash> Index; class Phrase; class PhraseLocation; @@ -52,11 +51,6 @@ class FastIntersector { FastIntersector(); private: - // Uses the vocabulary to convert the phrase from the numberized format - // specified by the source data array to the numberized format given by the - // vocabulary. - vector ConvertPhrase(const vector& old_phrase); - // Estimates the number of computations needed if the prefix/suffix is // extended. If the last/first symbol is separated from the rest of the phrase // by a nonterminal, then for each occurrence of the prefix/suffix we need to @@ -85,10 +79,10 @@ class FastIntersector { pair GetSearchRange(bool has_marginal_x) const; shared_ptr suffix_array; + shared_ptr precomputation; shared_ptr vocabulary; int max_rule_span; int min_gap_size; - Index collocations; }; } // namespace extractor diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc index 76c3aaea..f2a26ba1 100644 --- a/extractor/fast_intersector_test.cc +++ b/extractor/fast_intersector_test.cc @@ -59,15 +59,13 @@ class FastIntersectorTest : public Test { } precomputation = make_shared(); - EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); + EXPECT_CALL(*precomputation, Contains(_)).WillRepeatedly(Return(false)); phrase_builder = make_shared(vocabulary); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); } - Index collocations; shared_ptr data_array; shared_ptr suffix_array; shared_ptr precomputation; @@ -82,9 +80,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) { Phrase phrase = phrase_builder->Build(symbols); PhraseLocation prefix_location(15, 16), suffix_location(16, 17); - collocations[symbols] = expected_location; - EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); + EXPECT_CALL(*precomputation, Contains(symbols)).WillRepeatedly(Return(true)); + EXPECT_CALL(*precomputation, GetCollocations(symbols)). + WillRepeatedly(Return(expected_location)); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index 487abcaf..4d0738f7 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -19,10 +19,11 @@ GrammarExtractor::GrammarExtractor( shared_ptr source_suffix_array, shared_ptr target_data_array, shared_ptr alignment, shared_ptr precomputation, - shared_ptr scorer, int min_gap_size, int max_rule_span, + shared_ptr scorer, shared_ptr vocabulary, + int min_gap_size, int max_rule_span, int max_nonterminals, int max_rule_symbols, int max_samples, bool require_tight_phrases) : - vocabulary(make_shared()), + vocabulary(vocabulary), rule_factory(make_shared( source_suffix_array, target_data_array, alignment, vocabulary, precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals, diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index ae407b47..8f570df2 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -32,6 +32,7 @@ class GrammarExtractor { shared_ptr alignment, shared_ptr precomputation, shared_ptr scorer, + shared_ptr vocabulary, int min_gap_size, int max_rule_span, int max_nonterminals, diff --git a/extractor/mocks/mock_data_array.h b/extractor/mocks/mock_data_array.h index 6f85abb4..4bdcf21f 100644 --- a/extractor/mocks/mock_data_array.h +++ b/extractor/mocks/mock_data_array.h @@ -6,9 +6,11 @@ namespace extractor { class MockDataArray : public DataArray { public: - MOCK_CONST_METHOD0(GetData, const vector&()); + MOCK_CONST_METHOD0(GetData, vector()); MOCK_CONST_METHOD1(AtIndex, int(int index)); MOCK_CONST_METHOD1(GetWordAtIndex, string(int index)); + MOCK_CONST_METHOD2(GetWordIds, vector(int start_index, int size)); + MOCK_CONST_METHOD2(GetWords, vector(int start_index, int size)); MOCK_CONST_METHOD0(GetSize, int()); MOCK_CONST_METHOD0(GetVocabularySize, int()); MOCK_CONST_METHOD1(HasWord, bool(const string& word)); diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h index 8753343e..5f7aa999 100644 --- a/extractor/mocks/mock_precomputation.h +++ b/extractor/mocks/mock_precomputation.h @@ -6,7 +6,8 @@ namespace extractor { class MockPrecomputation : public Precomputation { public: - MOCK_CONST_METHOD0(GetCollocations, const Index&()); + MOCK_CONST_METHOD1(Contains, bool(const vector& pattern)); + MOCK_CONST_METHOD1(GetCollocations, vector(const vector& pattern)); }; } // namespace extractor diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index 3b8aed69..38d8f489 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -5,22 +5,21 @@ #include "data_array.h" #include "suffix_array.h" +#include "vocabulary.h" using namespace std; namespace extractor { -int Precomputation::FIRST_NONTERMINAL = -1; -int Precomputation::SECOND_NONTERMINAL = -2; +int Precomputation::NONTERMINAL = -1; Precomputation::Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, - int max_rule_symbols, int min_gap_size, + shared_ptr vocabulary, shared_ptr suffix_array, + int num_frequent_patterns, int num_super_frequent_patterns, + int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency) { - vector data = suffix_array->GetData()->GetData(); vector> frequent_patterns = FindMostFrequentPatterns( - suffix_array, data, num_frequent_patterns, max_frequent_phrase_len, + suffix_array, num_frequent_patterns, max_frequent_phrase_len, min_frequency); // Construct sets containing the frequent and superfrequent contiguous @@ -34,28 +33,30 @@ Precomputation::Precomputation( } } + shared_ptr data_array = suffix_array->GetData(); vector> matchings; - for (size_t i = 0; i < data.size(); ++i) { + for (size_t i = 0; i < data_array->GetSize(); ++i) { // If the sentence is over, add all the discontiguous frequent patterns to // the index. - if (data[i] == DataArray::END_OF_LINE) { - AddCollocations(matchings, data, max_rule_span, min_gap_size, - max_rule_symbols); + if (data_array->AtIndex(i) == DataArray::END_OF_LINE) { + UpdateIndex(data_array, vocabulary, matchings, max_rule_span, + min_gap_size, max_rule_symbols); matchings.clear(); continue; } - vector pattern; // Find all the contiguous frequent patterns starting at position i. - for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) { - pattern.push_back(data[i + j - 1]); - if (frequent_patterns_set.count(pattern)) { - int is_super_frequent = super_frequent_patterns_set.count(pattern); - matchings.push_back(make_tuple(i, j, is_super_frequent)); - } else { + vector pattern; + for (int j = 1; + j <= max_frequent_phrase_len && i + j <= data_array->GetSize(); + ++j) { + pattern.push_back(data_array->AtIndex(i + j - 1)); + if (!frequent_patterns_set.count(pattern)) { // If the current pattern is not frequent, any longer pattern having the // current pattern as prefix will not be frequent. break; } + int is_super_frequent = super_frequent_patterns_set.count(pattern); + matchings.push_back(make_tuple(i, j, is_super_frequent)); } } } @@ -65,8 +66,8 @@ Precomputation::Precomputation() {} Precomputation::~Precomputation() {} vector> Precomputation::FindMostFrequentPatterns( - shared_ptr suffix_array, const vector& data, - int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency) { + shared_ptr suffix_array, int num_frequent_patterns, + int max_frequent_phrase_len, int min_frequency) { vector lcp = suffix_array->BuildLCPArray(); vector run_start(max_frequent_phrase_len); @@ -83,6 +84,7 @@ vector> Precomputation::FindMostFrequentPatterns( } } + shared_ptr data_array = suffix_array->GetData(); // Extract the most frequent patterns. vector> frequent_patterns; while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) { @@ -90,7 +92,7 @@ vector> Precomputation::FindMostFrequentPatterns( int len = heap.top().second.second; heap.pop(); - vector pattern(data.begin() + start, data.begin() + start + len); + vector pattern = data_array->GetWordIds(start, len); if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) == pattern.end()) { frequent_patterns.push_back(pattern); @@ -99,8 +101,9 @@ vector> Precomputation::FindMostFrequentPatterns( return frequent_patterns; } -void Precomputation::AddCollocations( - const vector>& matchings, const vector& data, +void Precomputation::UpdateIndex( + shared_ptr data_array, shared_ptr vocabulary, + const vector>& matchings, int max_rule_span, int min_gap_size, int max_rule_symbols) { // Select the leftmost subpattern. for (size_t i = 0; i < matchings.size(); ++i) { @@ -118,16 +121,15 @@ void Precomputation::AddCollocations( if (start2 - start1 - size1 >= min_gap_size && start2 + size2 - start1 <= max_rule_span && size1 + size2 + 1 <= max_rule_symbols) { - vector pattern(data.begin() + start1, - data.begin() + start1 + size1); - pattern.push_back(Precomputation::FIRST_NONTERMINAL); - pattern.insert(pattern.end(), data.begin() + start2, - data.begin() + start2 + size2); - AddStartPositions(collocations[pattern], start1, start2); + vector pattern; + AppendSubpattern(pattern, data_array, vocabulary, start1, size1); + pattern.push_back(Precomputation::NONTERMINAL); + AppendSubpattern(pattern, data_array, vocabulary, start2, size2); + AppendCollocation(index[pattern], {start1, start2}); // Try extending the binary collocation to a ternary collocation. if (is_super2) { - pattern.push_back(Precomputation::SECOND_NONTERMINAL); + pattern.push_back(Precomputation::NONTERMINAL); // Select the rightmost subpattern. for (size_t k = j + 1; k < matchings.size(); ++k) { int start3, size3, is_super3; @@ -140,9 +142,8 @@ void Precomputation::AddCollocations( && start3 + size3 - start1 <= max_rule_span && size1 + size2 + size3 + 2 <= max_rule_symbols && (is_super1 || is_super3)) { - pattern.insert(pattern.end(), data.begin() + start3, - data.begin() + start3 + size3); - AddStartPositions(collocations[pattern], start1, start2, start3); + AppendSubpattern(pattern, data_array, vocabulary, start3, size3); + AppendCollocation(index[pattern], {start1, start2, start3}); pattern.erase(pattern.end() - size3); } } @@ -152,25 +153,30 @@ void Precomputation::AddCollocations( } } -void Precomputation::AddStartPositions( - vector& positions, int pos1, int pos2) { - positions.push_back(pos1); - positions.push_back(pos2); +void Precomputation::AppendSubpattern( + vector& pattern, shared_ptr data_array, + shared_ptr vocabulary, int start, int size) { + vector words = data_array->GetWords(start, size); + for (const string& word: words) { + pattern.push_back(vocabulary->GetTerminalIndex(word)); + } +} + +void Precomputation::AppendCollocation( + vector& collocations, const vector& collocation) { + copy(collocation.begin(), collocation.end(), back_inserter(collocations)); } -void Precomputation::AddStartPositions( - vector& positions, int pos1, int pos2, int pos3) { - positions.push_back(pos1); - positions.push_back(pos2); - positions.push_back(pos3); +bool Precomputation::Contains(const vector& pattern) const { + return index.count(pattern); } -const Index& Precomputation::GetCollocations() const { - return collocations; +vector Precomputation::GetCollocations(const vector& pattern) const { + return index.at(pattern); } bool Precomputation::operator==(const Precomputation& other) const { - return collocations == other.collocations; + return index == other.index; } } // namespace extractor diff --git a/extractor/precomputation.h b/extractor/precomputation.h index e5fa3e37..6ade58df 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,7 +19,9 @@ namespace extractor { typedef boost::hash> VectorHash; typedef unordered_map, vector, VectorHash> Index; +class DataArray; class SuffixArray; +class Vocabulary; /** * Data structure wrapping an index with all the occurrences of the most @@ -35,9 +37,9 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, - int max_rule_symbols, int min_gap_size, + shared_ptr vocabulary, shared_ptr suffix_array, + int num_frequent_patterns, int num_super_frequent_patterns, + int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); // Creates empty precomputation data structure. @@ -45,40 +47,43 @@ class Precomputation { virtual ~Precomputation(); - // Returns a reference to the index. - virtual const Index& GetCollocations() const; + // Returns whether a pattern is contained in the index of collocations. + virtual bool Contains(const vector& pattern) const; + + // Returns the list of collocations for a given pattern. + virtual vector GetCollocations(const vector& pattern) const; bool operator==(const Precomputation& other) const; - static int FIRST_NONTERMINAL; - static int SECOND_NONTERMINAL; + static int NONTERMINAL; private: // Finds the most frequent contiguous collocations. vector> FindMostFrequentPatterns( - shared_ptr suffix_array, const vector& data, - int num_frequent_patterns, int max_frequent_phrase_len, - int min_frequency); + shared_ptr suffix_array, int num_frequent_patterns, + int max_frequent_phrase_len, int min_frequency); // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations( - const vector>& matchings, const vector& data, + void UpdateIndex( + shared_ptr data_array, shared_ptr vocabulary, + const vector>& matchings, int max_rule_span, int min_gap_size, int max_rule_symbols); - // Adds an occurrence of a binary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2); + void AppendSubpattern( + vector& pattern, shared_ptr data_array, + shared_ptr vocabulary, int start, int size); - // Adds an occurrence of a ternary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2, int pos3); + // Adds an occurrence of a collocation. + void AppendCollocation(vector& collocations, const vector& collocation); friend class boost::serialization::access; template void save(Archive& ar, unsigned int) const { - int num_entries = collocations.size(); + int num_entries = index.size(); ar << num_entries; - for (pair, vector> entry: collocations) { + for (pair, vector> entry: index) { ar << entry; } } @@ -89,13 +94,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair, vector> entry; ar >> entry; - collocations.insert(entry); + index.insert(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Index collocations; + Index index; }; } // namespace extractor diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc index e81ece5d..fd85fcf8 100644 --- a/extractor/precomputation_test.cc +++ b/extractor/precomputation_test.cc @@ -9,6 +9,7 @@ #include "mocks/mock_data_array.h" #include "mocks/mock_suffix_array.h" +#include "mocks/mock_vocabulary.h" #include "precomputation.h" using namespace std; @@ -23,7 +24,31 @@ class PrecomputationTest : public Test { virtual void SetUp() { data = {4, 2, 3, 5, 7, 2, 3, 5, 2, 3, 4, 2, 1}; data_array = make_shared(); - EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data)); + EXPECT_CALL(*data_array, GetSize()).WillRepeatedly(Return(data.size())); + for (size_t i = 0; i < data.size(); ++i) { + EXPECT_CALL(*data_array, AtIndex(i)).WillRepeatedly(Return(data[i])); + } + vector> expected_calls = {{8, 1}, {8, 2}, {6, 1}}; + for (const auto& call: expected_calls) { + int start = call.first; + int size = call.second; + vector word_ids(data.begin() + start, data.begin() + start + size); + EXPECT_CALL(*data_array, GetWordIds(start, size)) + .WillRepeatedly(Return(word_ids)); + } + + expected_calls = {{1, 1}, {5, 1}, {8, 1}, {9, 1}, {5, 2}, + {6, 1}, {8, 2}, {1, 2}, {2, 1}, {11, 1}}; + for (const auto& call: expected_calls) { + int start = call.first; + int size = call.second; + vector words; + for (size_t j = start; j < start + size; ++j) { + words.push_back(to_string(data[j])); + } + EXPECT_CALL(*data_array, GetWords(start, size)) + .WillRepeatedly(Return(words)); + } vector suffixes{12, 8, 5, 1, 9, 6, 2, 0, 10, 7, 3, 4, 13}; vector lcp{-1, 0, 2, 3, 1, 0, 1, 2, 0, 2, 0, 1, 0, 0}; @@ -35,77 +60,98 @@ class PrecomputationTest : public Test { } EXPECT_CALL(*suffix_array, BuildLCPArray()).WillRepeatedly(Return(lcp)); - precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2); + vocabulary = make_shared(); + EXPECT_CALL(*vocabulary, GetTerminalIndex("2")).WillRepeatedly(Return(2)); + EXPECT_CALL(*vocabulary, GetTerminalIndex("3")).WillRepeatedly(Return(3)); + + precomputation = Precomputation(vocabulary, suffix_array, + 3, 3, 10, 5, 1, 4, 2); } vector data; shared_ptr data_array; shared_ptr suffix_array; + shared_ptr vocabulary; Precomputation precomputation; }; TEST_F(PrecomputationTest, TestCollocations) { - Index collocations = precomputation.GetCollocations(); - vector key = {2, 3, -1, 2}; vector expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, 3, -1, 2, 3}; expected_value = {1, 5, 1, 8, 5, 8}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, 3, -1, 3}; expected_value = {1, 6, 1, 9, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 2}; expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 3}; expected_value = {2, 6, 2, 9, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 2, 3}; expected_value = {2, 5, 2, 8, 6, 8}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 2}; expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 2, 3}; expected_value = {1, 5, 1, 8, 5, 8}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 3}; expected_value = {1, 6, 1, 9, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); - key = {2, -1, 2, -2, 2}; + key = {2, -1, 2, -1, 2}; expected_value = {1, 5, 8, 5, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 2, -2, 3}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {2, -1, 2, -1, 3}; expected_value = {1, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 3, -2, 2}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {2, -1, 3, -1, 2}; expected_value = {1, 6, 8, 5, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {2, -1, 3, -2, 3}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {2, -1, 3, -1, 3}; expected_value = {1, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 2, -2, 2}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {3, -1, 2, -1, 2}; expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 2, -2, 3}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {3, -1, 2, -1, 3}; expected_value = {2, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 3, -2, 2}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {3, -1, 3, -1, 2}; expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); - key = {3, -1, 3, -2, 3}; + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); + key = {3, -1, 3, -1, 3}; expected_value = {2, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); // Exceeds max_rule_symbols. - key = {2, -1, 2, -2, 2, 3}; - EXPECT_EQ(0, collocations.count(key)); + key = {2, -1, 2, -1, 2, 3}; + EXPECT_FALSE(precomputation.Contains(key)); // Contains non frequent pattern. key = {2, -1, 5}; - EXPECT_EQ(0, collocations.count(key)); + EXPECT_FALSE(precomputation.Contains(key)); } TEST_F(PrecomputationTest, TestSerialization) { diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 6eb55073..85c8a422 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -28,6 +28,7 @@ #include "suffix_array.h" #include "time_util.h" #include "translation_table.h" +#include "vocabulary.h" namespace fs = boost::filesystem; namespace po = boost::program_options; @@ -142,11 +143,14 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; + shared_ptr vocabulary = make_shared(); + // Constructs an index storing the occurrences in the source data for each // frequent collocation. start_time = Clock::now(); cerr << "Precomputing collocations..." << endl; shared_ptr precomputation = make_shared( + vocabulary, source_suffix_array, vm["frequent"].as(), vm["super_frequent"].as(), @@ -194,6 +198,7 @@ int main(int argc, char** argv) { alignment, precomputation, scorer, + vocabulary, vm["min_gap_size"].as(), vm["max_rule_span"].as(), vm["max_nonterminals"].as(), diff --git a/extractor/suffix_array_test.cc b/extractor/suffix_array_test.cc index ba0dbcc3..a9fd1eab 100644 --- a/extractor/suffix_array_test.cc +++ b/extractor/suffix_array_test.cc @@ -21,7 +21,7 @@ class SuffixArrayTest : public Test { virtual void SetUp() { data = {6, 4, 1, 2, 4, 5, 3, 4, 6, 6, 4, 1, 2}; data_array = make_shared(); - EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data)); + EXPECT_CALL(*data_array, GetData()).WillRepeatedly(Return(data)); EXPECT_CALL(*data_array, GetVocabularySize()).WillRepeatedly(Return(7)); EXPECT_CALL(*data_array, GetSize()).WillRepeatedly(Return(13)); suffix_array = SuffixArray(data_array); diff --git a/extractor/translation_table_test.cc b/extractor/translation_table_test.cc index 606777bd..72551a12 100644 --- a/extractor/translation_table_test.cc +++ b/extractor/translation_table_test.cc @@ -28,7 +28,7 @@ class TranslationTableTest : public Test { vector source_sentence_start = {0, 6, 10, 14}; shared_ptr source_data_array = make_shared(); EXPECT_CALL(*source_data_array, GetData()) - .WillRepeatedly(ReturnRef(source_data)); + .WillRepeatedly(Return(source_data)); EXPECT_CALL(*source_data_array, GetNumSentences()) .WillRepeatedly(Return(3)); for (size_t i = 0; i < source_sentence_start.size(); ++i) { @@ -48,7 +48,7 @@ class TranslationTableTest : public Test { vector target_sentence_start = {0, 7, 10, 13}; shared_ptr target_data_array = make_shared(); EXPECT_CALL(*target_data_array, GetData()) - .WillRepeatedly(ReturnRef(target_data)); + .WillRepeatedly(Return(target_data)); for (size_t i = 0; i < target_sentence_start.size(); ++i) { EXPECT_CALL(*target_data_array, GetSentenceStart(i)) .WillRepeatedly(Return(target_sentence_start[i])); -- cgit v1.2.3 From 3973a7e4a8302b4a02fee7d2950bb469b37e2452 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sun, 24 Nov 2013 13:19:28 +0000 Subject: Reduce memory overhead for constructing the intersector. --- extractor/Makefile.am | 3 +- extractor/compile.cc | 4 ++ extractor/data_array.cc | 2 +- extractor/data_array.h | 2 +- extractor/fast_intersector.cc | 40 +++-------- extractor/fast_intersector.h | 8 +-- extractor/fast_intersector_test.cc | 10 ++- extractor/grammar_extractor.cc | 5 +- extractor/grammar_extractor.h | 1 + extractor/mocks/mock_data_array.h | 2 +- extractor/mocks/mock_precomputation.h | 3 +- extractor/precomputation.cc | 125 +++++++++++++++++++++------------- extractor/precomputation.h | 41 ++++++----- extractor/precomputation_test.cc | 73 +++++++++++++------- extractor/run_extractor.cc | 5 ++ extractor/suffix_array_test.cc | 2 +- extractor/translation_table_test.cc | 4 +- extractor/vocabulary.cc | 7 +- 18 files changed, 194 insertions(+), 143 deletions(-) diff --git a/extractor/Makefile.am b/extractor/Makefile.am index 65a3d436..faf25d89 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -53,7 +53,8 @@ endif noinst_PROGRAMS = $(RUNNABLE_TESTS) -TESTS = $(RUNNABLE_TESTS) +# TESTS = $(RUNNABLE_TESTS) +TESTS = precomputation_test alignment_test_SOURCES = alignment_test.cc alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a diff --git a/extractor/compile.cc b/extractor/compile.cc index 65fdd509..0d62757e 100644 --- a/extractor/compile.cc +++ b/extractor/compile.cc @@ -13,6 +13,7 @@ #include "suffix_array.h" #include "time_util.h" #include "translation_table.h" +#include "vocabulary.h" namespace ar = boost::archive; namespace fs = boost::filesystem; @@ -125,9 +126,12 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; + shared_ptr vocabulary; + start_time = Clock::now(); cerr << "Precomputing collocations..." << endl; Precomputation precomputation( + vocabulary, source_suffix_array, vm["frequent"].as(), vm["super_frequent"].as(), diff --git a/extractor/data_array.cc b/extractor/data_array.cc index 82efcd51..6757cae7 100644 --- a/extractor/data_array.cc +++ b/extractor/data_array.cc @@ -78,7 +78,7 @@ void DataArray::CreateDataArray(const vector& lines) { DataArray::~DataArray() {} -const vector& DataArray::GetData() const { +vector DataArray::GetData() const { return data; } diff --git a/extractor/data_array.h b/extractor/data_array.h index 5207366d..e9af5bd0 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -51,7 +51,7 @@ class DataArray { virtual ~DataArray(); // Returns a vector containing the word ids. - virtual const vector& GetData() const; + virtual vector GetData() const; // Returns the word id at the specified position. virtual int AtIndex(int index) const; diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc index a8591a72..0d1fa6d8 100644 --- a/extractor/fast_intersector.cc +++ b/extractor/fast_intersector.cc @@ -11,41 +11,22 @@ namespace extractor { -FastIntersector::FastIntersector(shared_ptr suffix_array, - shared_ptr precomputation, - shared_ptr vocabulary, - int max_rule_span, - int min_gap_size) : +FastIntersector::FastIntersector( + shared_ptr suffix_array, + shared_ptr precomputation, + shared_ptr vocabulary, + int max_rule_span, + int min_gap_size) : suffix_array(suffix_array), + precomputation(precomputation), vocabulary(vocabulary), max_rule_span(max_rule_span), - min_gap_size(min_gap_size) { - Index precomputed_collocations = precomputation->GetCollocations(); - for (pair, vector> entry: precomputed_collocations) { - vector phrase = ConvertPhrase(entry.first); - collocations[phrase] = entry.second; - } -} + min_gap_size(min_gap_size) {} FastIntersector::FastIntersector() {} FastIntersector::~FastIntersector() {} -vector FastIntersector::ConvertPhrase(const vector& old_phrase) { - vector new_phrase; - new_phrase.reserve(old_phrase.size()); - shared_ptr data_array = suffix_array->GetData(); - for (int word_id: old_phrase) { - if (word_id < 0) { - new_phrase.push_back(word_id); - } else { - new_phrase.push_back( - vocabulary->GetTerminalIndex(data_array->GetWord(word_id))); - } - } - return new_phrase; -} - PhraseLocation FastIntersector::Intersect( PhraseLocation& prefix_location, PhraseLocation& suffix_location, @@ -59,8 +40,9 @@ PhraseLocation FastIntersector::Intersect( assert(vocabulary->IsTerminal(symbols.front()) && vocabulary->IsTerminal(symbols.back())); - if (collocations.count(symbols)) { - return PhraseLocation(collocations[symbols], phrase.Arity() + 1); + if (precomputation->Contains(symbols)) { + return PhraseLocation(precomputation->GetCollocations(symbols), + phrase.Arity() + 1); } bool prefix_ends_with_x = diff --git a/extractor/fast_intersector.h b/extractor/fast_intersector.h index 2819d239..305373dc 100644 --- a/extractor/fast_intersector.h +++ b/extractor/fast_intersector.h @@ -12,7 +12,6 @@ using namespace std; namespace extractor { typedef boost::hash> VectorHash; -typedef unordered_map, vector, VectorHash> Index; class Phrase; class PhraseLocation; @@ -52,11 +51,6 @@ class FastIntersector { FastIntersector(); private: - // Uses the vocabulary to convert the phrase from the numberized format - // specified by the source data array to the numberized format given by the - // vocabulary. - vector ConvertPhrase(const vector& old_phrase); - // Estimates the number of computations needed if the prefix/suffix is // extended. If the last/first symbol is separated from the rest of the phrase // by a nonterminal, then for each occurrence of the prefix/suffix we need to @@ -85,10 +79,10 @@ class FastIntersector { pair GetSearchRange(bool has_marginal_x) const; shared_ptr suffix_array; + shared_ptr precomputation; shared_ptr vocabulary; int max_rule_span; int min_gap_size; - Index collocations; }; } // namespace extractor diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc index 76c3aaea..f2a26ba1 100644 --- a/extractor/fast_intersector_test.cc +++ b/extractor/fast_intersector_test.cc @@ -59,15 +59,13 @@ class FastIntersectorTest : public Test { } precomputation = make_shared(); - EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); + EXPECT_CALL(*precomputation, Contains(_)).WillRepeatedly(Return(false)); phrase_builder = make_shared(vocabulary); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); } - Index collocations; shared_ptr data_array; shared_ptr suffix_array; shared_ptr precomputation; @@ -82,9 +80,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) { Phrase phrase = phrase_builder->Build(symbols); PhraseLocation prefix_location(15, 16), suffix_location(16, 17); - collocations[symbols] = expected_location; - EXPECT_CALL(*precomputation, GetCollocations()) - .WillRepeatedly(ReturnRef(collocations)); + EXPECT_CALL(*precomputation, Contains(symbols)).WillRepeatedly(Return(true)); + EXPECT_CALL(*precomputation, GetCollocations(symbols)). + WillRepeatedly(Return(expected_location)); intersector = make_shared(suffix_array, precomputation, vocabulary, 15, 1); diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index 487abcaf..4d0738f7 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -19,10 +19,11 @@ GrammarExtractor::GrammarExtractor( shared_ptr source_suffix_array, shared_ptr target_data_array, shared_ptr alignment, shared_ptr precomputation, - shared_ptr scorer, int min_gap_size, int max_rule_span, + shared_ptr scorer, shared_ptr vocabulary, + int min_gap_size, int max_rule_span, int max_nonterminals, int max_rule_symbols, int max_samples, bool require_tight_phrases) : - vocabulary(make_shared()), + vocabulary(vocabulary), rule_factory(make_shared( source_suffix_array, target_data_array, alignment, vocabulary, precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals, diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index ae407b47..8f570df2 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -32,6 +32,7 @@ class GrammarExtractor { shared_ptr alignment, shared_ptr precomputation, shared_ptr scorer, + shared_ptr vocabulary, int min_gap_size, int max_rule_span, int max_nonterminals, diff --git a/extractor/mocks/mock_data_array.h b/extractor/mocks/mock_data_array.h index 6f85abb4..d39cb0c4 100644 --- a/extractor/mocks/mock_data_array.h +++ b/extractor/mocks/mock_data_array.h @@ -6,7 +6,7 @@ namespace extractor { class MockDataArray : public DataArray { public: - MOCK_CONST_METHOD0(GetData, const vector&()); + MOCK_CONST_METHOD0(GetData, vector()); MOCK_CONST_METHOD1(AtIndex, int(int index)); MOCK_CONST_METHOD1(GetWordAtIndex, string(int index)); MOCK_CONST_METHOD0(GetSize, int()); diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h index 8753343e..5f7aa999 100644 --- a/extractor/mocks/mock_precomputation.h +++ b/extractor/mocks/mock_precomputation.h @@ -6,7 +6,8 @@ namespace extractor { class MockPrecomputation : public Precomputation { public: - MOCK_CONST_METHOD0(GetCollocations, const Index&()); + MOCK_CONST_METHOD1(Contains, bool(const vector& pattern)); + MOCK_CONST_METHOD1(GetCollocations, vector(const vector& pattern)); }; } // namespace extractor diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index 3b8aed69..3e58e2a9 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -5,59 +5,67 @@ #include "data_array.h" #include "suffix_array.h" +#include "time_util.h" +#include "vocabulary.h" using namespace std; namespace extractor { -int Precomputation::FIRST_NONTERMINAL = -1; -int Precomputation::SECOND_NONTERMINAL = -2; - Precomputation::Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, - int max_rule_symbols, int min_gap_size, + shared_ptr vocabulary, shared_ptr suffix_array, + int num_frequent_patterns, int num_super_frequent_patterns, + int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency) { - vector data = suffix_array->GetData()->GetData(); + Clock::time_point start_time = Clock::now(); + shared_ptr data_array = suffix_array->GetData(); + vector data = data_array->GetData(); vector> frequent_patterns = FindMostFrequentPatterns( suffix_array, data, num_frequent_patterns, max_frequent_phrase_len, min_frequency); + Clock::time_point end_time = Clock::now(); + cerr << "Finding most frequent patterns took " + << GetDuration(start_time, end_time) << " seconds..." << endl; - // Construct sets containing the frequent and superfrequent contiguous - // collocations. - unordered_set, VectorHash> frequent_patterns_set; - unordered_set, VectorHash> super_frequent_patterns_set; + vector> pattern_annotations(frequent_patterns.size()); + unordered_map, int, VectorHash> frequent_patterns_index; for (size_t i = 0; i < frequent_patterns.size(); ++i) { - frequent_patterns_set.insert(frequent_patterns[i]); - if (i < num_super_frequent_patterns) { - super_frequent_patterns_set.insert(frequent_patterns[i]); - } + frequent_patterns_index[frequent_patterns[i]] = i; + pattern_annotations[i] = AnnotatePattern(vocabulary, data_array, + frequent_patterns[i]); } + start_time = Clock::now(); vector> matchings; + vector> annotations; for (size_t i = 0; i < data.size(); ++i) { // If the sentence is over, add all the discontiguous frequent patterns to // the index. if (data[i] == DataArray::END_OF_LINE) { - AddCollocations(matchings, data, max_rule_span, min_gap_size, - max_rule_symbols); + UpdateIndex(matchings, annotations, max_rule_span, min_gap_size, + max_rule_symbols); matchings.clear(); + annotations.clear(); continue; } - vector pattern; // Find all the contiguous frequent patterns starting at position i. + vector pattern; for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) { pattern.push_back(data[i + j - 1]); - if (frequent_patterns_set.count(pattern)) { - int is_super_frequent = super_frequent_patterns_set.count(pattern); - matchings.push_back(make_tuple(i, j, is_super_frequent)); - } else { + auto it = frequent_patterns_index.find(pattern); + if (it == frequent_patterns_index.end()) { // If the current pattern is not frequent, any longer pattern having the // current pattern as prefix will not be frequent. break; } + int is_super_frequent = it->second < num_super_frequent_patterns; + matchings.push_back(make_tuple(i, j, is_super_frequent)); + annotations.push_back(pattern_annotations[it->second]); } } + end_time = Clock::now(); + cerr << "Constructing collocations index took " + << GetDuration(start_time, end_time) << " seconds..." << endl; } Precomputation::Precomputation() {} @@ -75,9 +83,9 @@ vector> Precomputation::FindMostFrequentPatterns( for (size_t i = 1; i < lcp.size(); ++i) { for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) { int frequency = i - run_start[len]; - if (frequency >= min_frequency) { - heap.push(make_pair(frequency, - make_pair(suffix_array->GetSuffix(run_start[len]), len + 1))); + int start = suffix_array->GetSuffix(run_start[len]); + if (frequency >= min_frequency && start + len <= data.size()) { + heap.push(make_pair(frequency, make_pair(start, len + 1))); } run_start[len] = i; } @@ -99,8 +107,20 @@ vector> Precomputation::FindMostFrequentPatterns( return frequent_patterns; } -void Precomputation::AddCollocations( - const vector>& matchings, const vector& data, +vector Precomputation::AnnotatePattern( + shared_ptr vocabulary, shared_ptr data_array, + const vector& pattern) const { + vector annotation; + for (int word_id: pattern) { + annotation.push_back(vocabulary->GetTerminalIndex( + data_array->GetWord(word_id))); + } + return annotation; +} + +void Precomputation::UpdateIndex( + const vector>& matchings, + const vector>& annotations, int max_rule_span, int min_gap_size, int max_rule_symbols) { // Select the leftmost subpattern. for (size_t i = 0; i < matchings.size(); ++i) { @@ -118,16 +138,14 @@ void Precomputation::AddCollocations( if (start2 - start1 - size1 >= min_gap_size && start2 + size2 - start1 <= max_rule_span && size1 + size2 + 1 <= max_rule_symbols) { - vector pattern(data.begin() + start1, - data.begin() + start1 + size1); - pattern.push_back(Precomputation::FIRST_NONTERMINAL); - pattern.insert(pattern.end(), data.begin() + start2, - data.begin() + start2 + size2); - AddStartPositions(collocations[pattern], start1, start2); + vector pattern = annotations[i]; + pattern.push_back(-1); + AppendSubpattern(pattern, annotations[j]); + AppendCollocation(index[pattern], start1, start2); // Try extending the binary collocation to a ternary collocation. if (is_super2) { - pattern.push_back(Precomputation::SECOND_NONTERMINAL); + pattern.push_back(-2); // Select the rightmost subpattern. for (size_t k = j + 1; k < matchings.size(); ++k) { int start3, size3, is_super3; @@ -140,9 +158,8 @@ void Precomputation::AddCollocations( && start3 + size3 - start1 <= max_rule_span && size1 + size2 + size3 + 2 <= max_rule_symbols && (is_super1 || is_super3)) { - pattern.insert(pattern.end(), data.begin() + start3, - data.begin() + start3 + size3); - AddStartPositions(collocations[pattern], start1, start2, start3); + AppendSubpattern(pattern, annotations[k]); + AppendCollocation(index[pattern], start1, start2, start3); pattern.erase(pattern.end() - size3); } } @@ -152,25 +169,35 @@ void Precomputation::AddCollocations( } } -void Precomputation::AddStartPositions( - vector& positions, int pos1, int pos2) { - positions.push_back(pos1); - positions.push_back(pos2); +void Precomputation::AppendSubpattern( + vector& pattern, + const vector& subpattern) { + copy(subpattern.begin(), subpattern.end(), back_inserter(pattern)); +} + +void Precomputation::AppendCollocation( + vector& collocations, int pos1, int pos2) { + collocations.push_back(pos1); + collocations.push_back(pos2); +} + +void Precomputation::AppendCollocation( + vector& collocations, int pos1, int pos2, int pos3) { + collocations.push_back(pos1); + collocations.push_back(pos2); + collocations.push_back(pos3); } -void Precomputation::AddStartPositions( - vector& positions, int pos1, int pos2, int pos3) { - positions.push_back(pos1); - positions.push_back(pos2); - positions.push_back(pos3); +bool Precomputation::Contains(const vector& pattern) const { + return index.count(pattern); } -const Index& Precomputation::GetCollocations() const { - return collocations; +vector Precomputation::GetCollocations(const vector& pattern) const { + return index.at(pattern); } bool Precomputation::operator==(const Precomputation& other) const { - return collocations == other.collocations; + return index == other.index; } } // namespace extractor diff --git a/extractor/precomputation.h b/extractor/precomputation.h index e5fa3e37..2b34fc29 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -19,7 +19,9 @@ namespace extractor { typedef boost::hash> VectorHash; typedef unordered_map, vector, VectorHash> Index; +class DataArray; class SuffixArray; +class Vocabulary; /** * Data structure wrapping an index with all the occurrences of the most @@ -35,9 +37,9 @@ class Precomputation { public: // Constructs the index using the suffix array. Precomputation( - shared_ptr suffix_array, int num_frequent_patterns, - int num_super_frequent_patterns, int max_rule_span, - int max_rule_symbols, int min_gap_size, + shared_ptr vocabulary, shared_ptr suffix_array, + int num_frequent_patterns, int num_super_frequent_patterns, + int max_rule_span, int max_rule_symbols, int min_gap_size, int max_frequent_phrase_len, int min_frequency); // Creates empty precomputation data structure. @@ -45,13 +47,13 @@ class Precomputation { virtual ~Precomputation(); - // Returns a reference to the index. - virtual const Index& GetCollocations() const; + // Returns whether a pattern is contained in the index of collocations. + virtual bool Contains(const vector& pattern) const; - bool operator==(const Precomputation& other) const; + // Returns the list of collocations for a given pattern. + virtual vector GetCollocations(const vector& pattern) const; - static int FIRST_NONTERMINAL; - static int SECOND_NONTERMINAL; + bool operator==(const Precomputation& other) const; private: // Finds the most frequent contiguous collocations. @@ -60,25 +62,32 @@ class Precomputation { int num_frequent_patterns, int max_frequent_phrase_len, int min_frequency); + vector AnnotatePattern(shared_ptr vocabulary, + shared_ptr data_array, + const vector& pattern) const; + // Given the locations of the frequent contiguous collocations in a sentence, // it adds new entries to the index for each discontiguous collocation // matching the criteria specified in the class description. - void AddCollocations( - const vector>& matchings, const vector& data, + void UpdateIndex( + const vector>& matchings, + const vector>& annotations, int max_rule_span, int min_gap_size, int max_rule_symbols); + void AppendSubpattern(vector& pattern, const vector& subpattern); + // Adds an occurrence of a binary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2); + void AppendCollocation(vector& collocations, int pos1, int pos2); // Adds an occurrence of a ternary collocation. - void AddStartPositions(vector& positions, int pos1, int pos2, int pos3); + void AppendCollocation(vector& collocations, int pos1, int pos2, int pos3); friend class boost::serialization::access; template void save(Archive& ar, unsigned int) const { - int num_entries = collocations.size(); + int num_entries = index.size(); ar << num_entries; - for (pair, vector> entry: collocations) { + for (pair, vector> entry: index) { ar << entry; } } @@ -89,13 +98,13 @@ class Precomputation { for (size_t i = 0; i < num_entries; ++i) { pair, vector> entry; ar >> entry; - collocations.insert(entry); + index.insert(entry); } } BOOST_SERIALIZATION_SPLIT_MEMBER(); - Index collocations; + Index index; }; } // namespace extractor diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc index e81ece5d..3a98ce05 100644 --- a/extractor/precomputation_test.cc +++ b/extractor/precomputation_test.cc @@ -9,6 +9,7 @@ #include "mocks/mock_data_array.h" #include "mocks/mock_suffix_array.h" +#include "mocks/mock_vocabulary.h" #include "precomputation.h" using namespace std; @@ -23,7 +24,12 @@ class PrecomputationTest : public Test { virtual void SetUp() { data = {4, 2, 3, 5, 7, 2, 3, 5, 2, 3, 4, 2, 1}; data_array = make_shared(); - EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data)); + EXPECT_CALL(*data_array, GetData()).WillRepeatedly(Return(data)); + for (size_t i = 0; i < data.size(); ++i) { + EXPECT_CALL(*data_array, AtIndex(i)).WillRepeatedly(Return(data[i])); + } + EXPECT_CALL(*data_array, GetWord(2)).WillRepeatedly(Return("2")); + EXPECT_CALL(*data_array, GetWord(3)).WillRepeatedly(Return("3")); vector suffixes{12, 8, 5, 1, 9, 6, 2, 0, 10, 7, 3, 4, 13}; vector lcp{-1, 0, 2, 3, 1, 0, 1, 2, 0, 2, 0, 1, 0, 0}; @@ -35,77 +41,98 @@ class PrecomputationTest : public Test { } EXPECT_CALL(*suffix_array, BuildLCPArray()).WillRepeatedly(Return(lcp)); - precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2); + vocabulary = make_shared(); + EXPECT_CALL(*vocabulary, GetTerminalIndex("2")).WillRepeatedly(Return(2)); + EXPECT_CALL(*vocabulary, GetTerminalIndex("3")).WillRepeatedly(Return(3)); + + precomputation = Precomputation(vocabulary, suffix_array, + 3, 3, 10, 5, 1, 4, 2); } vector data; shared_ptr data_array; shared_ptr suffix_array; + shared_ptr vocabulary; Precomputation precomputation; }; TEST_F(PrecomputationTest, TestCollocations) { - Index collocations = precomputation.GetCollocations(); - vector key = {2, 3, -1, 2}; vector expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, 3, -1, 2, 3}; expected_value = {1, 5, 1, 8, 5, 8}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, 3, -1, 3}; expected_value = {1, 6, 1, 9, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 2}; expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 3}; expected_value = {2, 6, 2, 9, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 2, 3}; expected_value = {2, 5, 2, 8, 6, 8}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 2}; expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 2, 3}; expected_value = {1, 5, 1, 8, 5, 8}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 3}; expected_value = {1, 6, 1, 9, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 2, -2, 2}; expected_value = {1, 5, 8, 5, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 2, -2, 3}; expected_value = {1, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 3, -2, 2}; expected_value = {1, 6, 8, 5, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {2, -1, 3, -2, 3}; expected_value = {1, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 2, -2, 2}; expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 2, -2, 3}; expected_value = {2, 5, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 3, -2, 2}; expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); key = {3, -1, 3, -2, 3}; expected_value = {2, 6, 9}; - EXPECT_EQ(expected_value, collocations[key]); + EXPECT_TRUE(precomputation.Contains(key)); + EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); // Exceeds max_rule_symbols. key = {2, -1, 2, -2, 2, 3}; - EXPECT_EQ(0, collocations.count(key)); + EXPECT_FALSE(precomputation.Contains(key)); // Contains non frequent pattern. key = {2, -1, 5}; - EXPECT_EQ(0, collocations.count(key)); + EXPECT_FALSE(precomputation.Contains(key)); } TEST_F(PrecomputationTest, TestSerialization) { diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 6eb55073..85c8a422 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -28,6 +28,7 @@ #include "suffix_array.h" #include "time_util.h" #include "translation_table.h" +#include "vocabulary.h" namespace fs = boost::filesystem; namespace po = boost::program_options; @@ -142,11 +143,14 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; + shared_ptr vocabulary = make_shared(); + // Constructs an index storing the occurrences in the source data for each // frequent collocation. start_time = Clock::now(); cerr << "Precomputing collocations..." << endl; shared_ptr precomputation = make_shared( + vocabulary, source_suffix_array, vm["frequent"].as(), vm["super_frequent"].as(), @@ -194,6 +198,7 @@ int main(int argc, char** argv) { alignment, precomputation, scorer, + vocabulary, vm["min_gap_size"].as(), vm["max_rule_span"].as(), vm["max_nonterminals"].as(), diff --git a/extractor/suffix_array_test.cc b/extractor/suffix_array_test.cc index ba0dbcc3..a9fd1eab 100644 --- a/extractor/suffix_array_test.cc +++ b/extractor/suffix_array_test.cc @@ -21,7 +21,7 @@ class SuffixArrayTest : public Test { virtual void SetUp() { data = {6, 4, 1, 2, 4, 5, 3, 4, 6, 6, 4, 1, 2}; data_array = make_shared(); - EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data)); + EXPECT_CALL(*data_array, GetData()).WillRepeatedly(Return(data)); EXPECT_CALL(*data_array, GetVocabularySize()).WillRepeatedly(Return(7)); EXPECT_CALL(*data_array, GetSize()).WillRepeatedly(Return(13)); suffix_array = SuffixArray(data_array); diff --git a/extractor/translation_table_test.cc b/extractor/translation_table_test.cc index 606777bd..72551a12 100644 --- a/extractor/translation_table_test.cc +++ b/extractor/translation_table_test.cc @@ -28,7 +28,7 @@ class TranslationTableTest : public Test { vector source_sentence_start = {0, 6, 10, 14}; shared_ptr source_data_array = make_shared(); EXPECT_CALL(*source_data_array, GetData()) - .WillRepeatedly(ReturnRef(source_data)); + .WillRepeatedly(Return(source_data)); EXPECT_CALL(*source_data_array, GetNumSentences()) .WillRepeatedly(Return(3)); for (size_t i = 0; i < source_sentence_start.size(); ++i) { @@ -48,7 +48,7 @@ class TranslationTableTest : public Test { vector target_sentence_start = {0, 7, 10, 13}; shared_ptr target_data_array = make_shared(); EXPECT_CALL(*target_data_array, GetData()) - .WillRepeatedly(ReturnRef(target_data)); + .WillRepeatedly(Return(target_data)); for (size_t i = 0; i < target_sentence_start.size(); ++i) { EXPECT_CALL(*target_data_array, GetSentenceStart(i)) .WillRepeatedly(Return(target_sentence_start[i])); diff --git a/extractor/vocabulary.cc b/extractor/vocabulary.cc index 15795d1e..aef674a5 100644 --- a/extractor/vocabulary.cc +++ b/extractor/vocabulary.cc @@ -8,12 +8,13 @@ int Vocabulary::GetTerminalIndex(const string& word) { int word_id = -1; #pragma omp critical (vocabulary) { - if (!dictionary.count(word)) { + auto it = dictionary.find(word); + if (it != dictionary.end()) { + word_id = it->second; + } else { word_id = words.size(); dictionary[word] = word_id; words.push_back(word); - } else { - word_id = dictionary[word]; } } return word_id; -- cgit v1.2.3 From 467ef6ce78cfe7341a696ebf0948e377be619ae5 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Mon, 25 Nov 2013 18:19:13 +0000 Subject: Reduce unordered_map calls. --- extractor/Makefile.am | 3 +-- extractor/data_array.cc | 4 ---- extractor/data_array.h | 3 --- extractor/data_array_test.cc | 4 ---- extractor/mocks/mock_data_array.h | 1 - extractor/suffix_array.cc | 4 ++-- extractor/suffix_array_test.cc | 6 +----- extractor/translation_table.cc | 14 ++++++-------- extractor/translation_table_test.cc | 10 ++-------- 9 files changed, 12 insertions(+), 37 deletions(-) diff --git a/extractor/Makefile.am b/extractor/Makefile.am index faf25d89..65a3d436 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -53,8 +53,7 @@ endif noinst_PROGRAMS = $(RUNNABLE_TESTS) -# TESTS = $(RUNNABLE_TESTS) -TESTS = precomputation_test +TESTS = $(RUNNABLE_TESTS) alignment_test_SOURCES = alignment_test.cc alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a diff --git a/extractor/data_array.cc b/extractor/data_array.cc index 6757cae7..ac0493fd 100644 --- a/extractor/data_array.cc +++ b/extractor/data_array.cc @@ -115,10 +115,6 @@ int DataArray::GetSentenceId(int position) const { return sentence_id[position]; } -bool DataArray::HasWord(const string& word) const { - return word2id.count(word); -} - int DataArray::GetWordId(const string& word) const { auto result = word2id.find(word); return result == word2id.end() ? -1 : result->second; diff --git a/extractor/data_array.h b/extractor/data_array.h index e9af5bd0..c5dc8a26 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -65,9 +65,6 @@ class DataArray { // Returns the number of distinct words in the data array. virtual int GetVocabularySize() const; - // Returns whether a word has ever been observed in the data array. - virtual bool HasWord(const string& word) const; - // Returns the word id for a given word or -1 if it the word has never been // observed. virtual int GetWordId(const string& word) const; diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc index 6c329e34..b6b56561 100644 --- a/extractor/data_array_test.cc +++ b/extractor/data_array_test.cc @@ -58,16 +58,12 @@ TEST_F(DataArrayTest, TestGetData) { TEST_F(DataArrayTest, TestVocabulary) { EXPECT_EQ(9, source_data.GetVocabularySize()); - EXPECT_TRUE(source_data.HasWord("mere")); EXPECT_EQ(4, source_data.GetWordId("mere")); EXPECT_EQ("mere", source_data.GetWord(4)); - EXPECT_FALSE(source_data.HasWord("banane")); EXPECT_EQ(11, target_data.GetVocabularySize()); - EXPECT_TRUE(target_data.HasWord("apples")); EXPECT_EQ(4, target_data.GetWordId("apples")); EXPECT_EQ("apples", target_data.GetWord(4)); - EXPECT_FALSE(target_data.HasWord("bananas")); } TEST_F(DataArrayTest, TestSentenceData) { diff --git a/extractor/mocks/mock_data_array.h b/extractor/mocks/mock_data_array.h index d39cb0c4..edc525fa 100644 --- a/extractor/mocks/mock_data_array.h +++ b/extractor/mocks/mock_data_array.h @@ -11,7 +11,6 @@ class MockDataArray : public DataArray { MOCK_CONST_METHOD1(GetWordAtIndex, string(int index)); MOCK_CONST_METHOD0(GetSize, int()); MOCK_CONST_METHOD0(GetVocabularySize, int()); - MOCK_CONST_METHOD1(HasWord, bool(const string& word)); MOCK_CONST_METHOD1(GetWordId, int(const string& word)); MOCK_CONST_METHOD1(GetWord, string(int word_id)); MOCK_CONST_METHOD1(GetSentenceLength, int(int sentence_id)); diff --git a/extractor/suffix_array.cc b/extractor/suffix_array.cc index ac230d13..4a514b12 100644 --- a/extractor/suffix_array.cc +++ b/extractor/suffix_array.cc @@ -187,12 +187,12 @@ shared_ptr SuffixArray::GetData() const { PhraseLocation SuffixArray::Lookup(int low, int high, const string& word, int offset) const { - if (!data_array->HasWord(word)) { + int word_id = data_array->GetWordId(word); + if (word_id == -1) { // Return empty phrase location. return PhraseLocation(0, 0); } - int word_id = data_array->GetWordId(word); if (offset == 0) { return PhraseLocation(word_start[word_id], word_start[word_id + 1]); } diff --git a/extractor/suffix_array_test.cc b/extractor/suffix_array_test.cc index a9fd1eab..161edbc0 100644 --- a/extractor/suffix_array_test.cc +++ b/extractor/suffix_array_test.cc @@ -55,22 +55,18 @@ TEST_F(SuffixArrayTest, TestLookup) { EXPECT_CALL(*data_array, AtIndex(i)).WillRepeatedly(Return(data[i])); } - EXPECT_CALL(*data_array, HasWord("word1")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word1")).WillRepeatedly(Return(6)); EXPECT_EQ(PhraseLocation(11, 14), suffix_array.Lookup(0, 14, "word1", 0)); - EXPECT_CALL(*data_array, HasWord("word2")).WillRepeatedly(Return(false)); + EXPECT_CALL(*data_array, GetWordId("word2")).WillRepeatedly(Return(-1)); EXPECT_EQ(PhraseLocation(0, 0), suffix_array.Lookup(0, 14, "word2", 0)); - EXPECT_CALL(*data_array, HasWord("word3")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word3")).WillRepeatedly(Return(4)); EXPECT_EQ(PhraseLocation(11, 13), suffix_array.Lookup(11, 14, "word3", 1)); - EXPECT_CALL(*data_array, HasWord("word4")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word4")).WillRepeatedly(Return(1)); EXPECT_EQ(PhraseLocation(11, 13), suffix_array.Lookup(11, 13, "word4", 2)); - EXPECT_CALL(*data_array, HasWord("word5")).WillRepeatedly(Return(true)); EXPECT_CALL(*data_array, GetWordId("word5")).WillRepeatedly(Return(2)); EXPECT_EQ(PhraseLocation(11, 13), suffix_array.Lookup(11, 13, "word5", 3)); diff --git a/extractor/translation_table.cc b/extractor/translation_table.cc index 1b1ba112..11e29e1e 100644 --- a/extractor/translation_table.cc +++ b/extractor/translation_table.cc @@ -90,13 +90,12 @@ void TranslationTable::IncrementLinksCount( double TranslationTable::GetTargetGivenSourceScore( const string& source_word, const string& target_word) { - if (!source_data_array->HasWord(source_word) || - !target_data_array->HasWord(target_word)) { + int source_id = source_data_array->GetWordId(source_word); + int target_id = target_data_array->GetWordId(target_word); + if (source_id == -1 || target_id == -1) { return -1; } - int source_id = source_data_array->GetWordId(source_word); - int target_id = target_data_array->GetWordId(target_word); auto entry = make_pair(source_id, target_id); auto it = translation_probabilities.find(entry); if (it == translation_probabilities.end()) { @@ -107,13 +106,12 @@ double TranslationTable::GetTargetGivenSourceScore( double TranslationTable::GetSourceGivenTargetScore( const string& source_word, const string& target_word) { - if (!source_data_array->HasWord(source_word) || - !target_data_array->HasWord(target_word)) { + int source_id = source_data_array->GetWordId(source_word); + int target_id = target_data_array->GetWordId(target_word); + if (source_id == -1 || target_id == -1) { return -1; } - int source_id = source_data_array->GetWordId(source_word); - int target_id = target_data_array->GetWordId(target_word); auto entry = make_pair(source_id, target_id); auto it = translation_probabilities.find(entry); if (it == translation_probabilities.end()) { diff --git a/extractor/translation_table_test.cc b/extractor/translation_table_test.cc index 72551a12..3cfc0011 100644 --- a/extractor/translation_table_test.cc +++ b/extractor/translation_table_test.cc @@ -36,13 +36,10 @@ class TranslationTableTest : public Test { .WillRepeatedly(Return(source_sentence_start[i])); } for (size_t i = 0; i < words.size(); ++i) { - EXPECT_CALL(*source_data_array, HasWord(words[i])) - .WillRepeatedly(Return(true)); EXPECT_CALL(*source_data_array, GetWordId(words[i])) .WillRepeatedly(Return(i + 2)); } - EXPECT_CALL(*source_data_array, HasWord("d")) - .WillRepeatedly(Return(false)); + EXPECT_CALL(*source_data_array, GetWordId("d")).WillRepeatedly(Return(-1)); vector target_data = {2, 3, 2, 3, 4, 5, 0, 3, 6, 0, 2, 7, 0}; vector target_sentence_start = {0, 7, 10, 13}; @@ -54,13 +51,10 @@ class TranslationTableTest : public Test { .WillRepeatedly(Return(target_sentence_start[i])); } for (size_t i = 0; i < words.size(); ++i) { - EXPECT_CALL(*target_data_array, HasWord(words[i])) - .WillRepeatedly(Return(true)); EXPECT_CALL(*target_data_array, GetWordId(words[i])) .WillRepeatedly(Return(i + 2)); } - EXPECT_CALL(*target_data_array, HasWord("d")) - .WillRepeatedly(Return(false)); + EXPECT_CALL(*target_data_array, GetWordId("d")).WillRepeatedly(Return(-1)); vector> links1 = { make_pair(0, 0), make_pair(1, 1), make_pair(2, 2), make_pair(3, 3), -- cgit v1.2.3 From 3c73e472444ff0cd436b12f3679440a6969cbf2d Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Mon, 25 Nov 2013 23:56:31 +0000 Subject: Clean up leave-one-out sampling. --- extractor/grammar_extractor.cc | 6 ++++-- extractor/grammar_extractor.h | 4 +++- extractor/grammar_extractor_test.cc | 4 ++-- extractor/mocks/mock_rule_factory.h | 6 +++--- extractor/mocks/mock_sampler.h | 4 +++- extractor/rule_factory.cc | 7 +++++-- extractor/rule_factory.h | 3 +-- extractor/rule_factory_test.cc | 8 +++----- extractor/run_extractor.cc | 3 ++- extractor/sampler.cc | 12 ++++++++---- extractor/sampler.h | 4 +++- extractor/sampler_test.cc | 30 +++++++++++++++++++++--------- 12 files changed, 58 insertions(+), 33 deletions(-) diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index 4d0738f7..1dc94c25 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -35,10 +35,12 @@ GrammarExtractor::GrammarExtractor( vocabulary(vocabulary), rule_factory(rule_factory) {} -Grammar GrammarExtractor::GetGrammar(const string& sentence, const unordered_set& blacklisted_sentence_ids, const shared_ptr source_data_array) { +Grammar GrammarExtractor::GetGrammar( + const string& sentence, + const unordered_set& blacklisted_sentence_ids) { vector words = TokenizeSentence(sentence); vector word_ids = AnnotateWords(words); - return rule_factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); + return rule_factory->GetGrammar(word_ids, blacklisted_sentence_ids); } vector GrammarExtractor::TokenizeSentence(const string& sentence) { diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index 8f570df2..eb79f53c 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -46,7 +46,9 @@ class GrammarExtractor { // Converts the sentence to a vector of word ids and uses the RuleFactory to // extract the SCFG rules which may be used to decode the sentence. - Grammar GetGrammar(const string& sentence, const unordered_set& blacklisted_sentence_ids, const shared_ptr source_data_array); + Grammar GetGrammar( + const string& sentence, + const unordered_set& blacklisted_sentence_ids); private: // Splits the sentence in a vector of words. diff --git a/extractor/grammar_extractor_test.cc b/extractor/grammar_extractor_test.cc index f32a9599..719e90ff 100644 --- a/extractor/grammar_extractor_test.cc +++ b/extractor/grammar_extractor_test.cc @@ -41,13 +41,13 @@ TEST(GrammarExtractorTest, TestAnnotatingWords) { Grammar grammar(rules, feature_names); unordered_set blacklisted_sentence_ids; shared_ptr source_data_array; - EXPECT_CALL(*factory, GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array)) + EXPECT_CALL(*factory, GetGrammar(word_ids, blacklisted_sentence_ids)) .WillOnce(Return(grammar)); GrammarExtractor extractor(vocabulary, factory); string sentence = "Anna has many many apples ."; - extractor.GetGrammar(sentence, blacklisted_sentence_ids, source_data_array); + extractor.GetGrammar(sentence, blacklisted_sentence_ids); } } // namespace diff --git a/extractor/mocks/mock_rule_factory.h b/extractor/mocks/mock_rule_factory.h index 6b7b6586..53eb5022 100644 --- a/extractor/mocks/mock_rule_factory.h +++ b/extractor/mocks/mock_rule_factory.h @@ -7,9 +7,9 @@ namespace extractor { class MockHieroCachingRuleFactory : public HieroCachingRuleFactory { public: - MOCK_METHOD3(GetGrammar, Grammar(const vector& word_ids, const - unordered_set& blacklisted_sentence_ids, - const shared_ptr source_data_array)); + MOCK_METHOD2(GetGrammar, Grammar( + const vector& word_ids, + const unordered_set& blacklisted_sentence_ids)); }; } // namespace extractor diff --git a/extractor/mocks/mock_sampler.h b/extractor/mocks/mock_sampler.h index 75c43c27..b2742f62 100644 --- a/extractor/mocks/mock_sampler.h +++ b/extractor/mocks/mock_sampler.h @@ -7,7 +7,9 @@ namespace extractor { class MockSampler : public Sampler { public: - MOCK_CONST_METHOD1(Sample, PhraseLocation(const PhraseLocation& location)); + MOCK_CONST_METHOD2(Sample, PhraseLocation( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids)); }; } // namespace extractor diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc index 6ae2d792..5b66f685 100644 --- a/extractor/rule_factory.cc +++ b/extractor/rule_factory.cc @@ -101,7 +101,9 @@ HieroCachingRuleFactory::HieroCachingRuleFactory() {} HieroCachingRuleFactory::~HieroCachingRuleFactory() {} -Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids, const unordered_set& blacklisted_sentence_ids, const shared_ptr source_data_array) { +Grammar HieroCachingRuleFactory::GetGrammar( + const vector& word_ids, + const unordered_set& blacklisted_sentence_ids) { Clock::time_point start_time = Clock::now(); double total_extract_time = 0; double total_intersect_time = 0; @@ -193,7 +195,8 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector& word_ids, const u Clock::time_point extract_start = Clock::now(); if (!state.starts_with_x) { // Extract rules for the sampled set of occurrences. - PhraseLocation sample = sampler->Sample(next_node->matchings, blacklisted_sentence_ids, source_data_array); + PhraseLocation sample = sampler->Sample( + next_node->matchings, blacklisted_sentence_ids); vector new_rules = rule_extractor->ExtractRules(next_phrase, sample); rules.insert(rules.end(), new_rules.begin(), new_rules.end()); diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index a1ff76e4..1a9fa2af 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -74,8 +74,7 @@ class HieroCachingRuleFactory { // (See class description for more details.) virtual Grammar GetGrammar( const vector& word_ids, - const unordered_set& blacklisted_sentence_ids, - const shared_ptr source_data_array); + const unordered_set& blacklisted_sentence_ids); protected: HieroCachingRuleFactory(); diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc index f26cc567..332c5959 100644 --- a/extractor/rule_factory_test.cc +++ b/extractor/rule_factory_test.cc @@ -40,7 +40,7 @@ class RuleFactoryTest : public Test { .WillRepeatedly(Return(feature_names)); sampler = make_shared(); - EXPECT_CALL(*sampler, Sample(_)) + EXPECT_CALL(*sampler, Sample(_, _)) .WillRepeatedly(Return(PhraseLocation(0, 1))); Phrase phrase; @@ -77,8 +77,7 @@ TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) { vector word_ids = {2, 3, 4}; unordered_set blacklisted_sentence_ids; - shared_ptr source_data_array; - Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); + Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(7, grammar.GetRules().size()); } @@ -97,8 +96,7 @@ TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) { vector word_ids = {2, 3, 4, 2, 3}; unordered_set blacklisted_sentence_ids; - shared_ptr source_data_array; - Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); + Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(28, grammar.GetRules().size()); } diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 85c8a422..6b22a302 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -237,7 +237,8 @@ int main(int argc, char** argv) { unordered_set blacklisted_sentence_ids; if (leave_one_out) blacklisted_sentence_ids.insert(i); - Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array); + Grammar grammar = extractor.GetGrammar( + sentences[i], blacklisted_sentence_ids); ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); output << grammar; } diff --git a/extractor/sampler.cc b/extractor/sampler.cc index 963afa7a..fc386ed1 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -12,7 +12,9 @@ Sampler::Sampler() {} Sampler::~Sampler() {} -PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_set& blacklisted_sentence_ids, const shared_ptr source_data_array) const { +PhraseLocation Sampler::Sample( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids) const { vector sample; int num_subpatterns; if (location.matchings == NULL) { @@ -22,10 +24,11 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_s double step = max(1.0, (double) (high - low) / max_samples); double i = low, last = i; bool found; + shared_ptr source_data_array = suffix_array->GetData(); while (sample.size() < max_samples && i < high) { int x = suffix_array->GetSuffix(Round(i)); int id = source_data_array->GetSentenceId(x); - if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { + if (blacklisted_sentence_ids.count(id)) { found = false; double backoff_step = 1; while (true) { @@ -33,13 +36,14 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_s double j = i - backoff_step; x = suffix_array->GetSuffix(Round(j)); id = source_data_array->GetSentenceId(x); - if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + if (x >= 0 && j > last && !blacklisted_sentence_ids.count(id)) { found = true; last = i; break; } double k = i + backoff_step; x = suffix_array->GetSuffix(Round(k)); id = source_data_array->GetSentenceId(x); - if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + if (k < min(i+step, (double)high) && + !blacklisted_sentence_ids.count(id)) { found = true; last = k; break; } if (j <= last && k >= high) break; diff --git a/extractor/sampler.h b/extractor/sampler.h index de450c48..bd8a5876 100644 --- a/extractor/sampler.h +++ b/extractor/sampler.h @@ -23,7 +23,9 @@ class Sampler { virtual ~Sampler(); // Samples uniformly at most max_samples phrase occurrences. - virtual PhraseLocation Sample(const PhraseLocation& location, const unordered_set& blacklisted_sentence_ids, const shared_ptr source_data_array) const; + virtual PhraseLocation Sample( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids) const; protected: Sampler(); diff --git a/extractor/sampler_test.cc b/extractor/sampler_test.cc index 965567ba..14e72780 100644 --- a/extractor/sampler_test.cc +++ b/extractor/sampler_test.cc @@ -19,6 +19,8 @@ class SamplerTest : public Test { source_data_array = make_shared(); EXPECT_CALL(*source_data_array, GetSentenceId(_)).WillRepeatedly(Return(9999)); suffix_array = make_shared(); + EXPECT_CALL(*suffix_array, GetData()) + .WillRepeatedly(Return(source_data_array)); for (int i = 0; i < 10; ++i) { EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); } @@ -35,23 +37,29 @@ TEST_F(SamplerTest, TestSuffixArrayRange) { sampler = make_shared(suffix_array, 1); vector expected_locations = {0}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler->Sample(location, blacklist)); + return; sampler = make_shared(suffix_array, 2); expected_locations = {0, 5}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler->Sample(location, blacklist)); sampler = make_shared(suffix_array, 3); expected_locations = {0, 3, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler->Sample(location, blacklist)); sampler = make_shared(suffix_array, 4); expected_locations = {0, 3, 5, 8}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler->Sample(location, blacklist)); sampler = make_shared(suffix_array, 100); expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler->Sample(location, blacklist)); } TEST_F(SamplerTest, TestSubstringsSample) { @@ -61,19 +69,23 @@ TEST_F(SamplerTest, TestSubstringsSample) { sampler = make_shared(suffix_array, 1); vector expected_locations = {0, 1}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklist)); sampler = make_shared(suffix_array, 2); expected_locations = {0, 1, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklist)); sampler = make_shared(suffix_array, 3); expected_locations = {0, 1, 4, 5, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklist)); sampler = make_shared(suffix_array, 7); expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklist)); } } // namespace -- cgit v1.2.3 From e633526bc2ba1f73e88989f495d70c0d2ec84a97 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Tue, 26 Nov 2013 01:14:28 +0000 Subject: Serialize vocabulary. --- extractor/Makefile.am | 12 +++++++++--- extractor/compile.cc | 4 ++++ extractor/vocabulary.cc | 4 ++++ extractor/vocabulary.h | 23 +++++++++++++++++++++- extractor/vocabulary_test.cc | 45 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 extractor/vocabulary_test.cc diff --git a/extractor/Makefile.am b/extractor/Makefile.am index 65a3d436..64a5a2b5 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -24,7 +24,8 @@ EXTRA_PROGRAMS = alignment_test \ scorer_test \ suffix_array_test \ target_phrase_extractor_test \ - translation_table_test + translation_table_test \ + vocabulary_test if HAVE_GTEST RUNNABLE_TESTS = alignment_test \ @@ -48,12 +49,14 @@ if HAVE_GTEST scorer_test \ suffix_array_test \ target_phrase_extractor_test \ - translation_table_test + translation_table_test \ + vocabulary_test endif noinst_PROGRAMS = $(RUNNABLE_TESTS) -TESTS = $(RUNNABLE_TESTS) +# TESTS = $(RUNNABLE_TESTS) +TESTS = vocabulary_test alignment_test_SOURCES = alignment_test.cc alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a @@ -99,6 +102,8 @@ target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc target_phrase_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a translation_table_test_SOURCES = translation_table_test.cc translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a +vocabulary_test_SOURCES = vocabulary_test.cc +vocabulary_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a noinst_LIBRARIES = libextractor.a libcompile.a @@ -115,6 +120,7 @@ libcompile_a_SOURCES = \ suffix_array.cc \ time_util.cc \ translation_table.cc \ + vocabulary.cc \ alignment.h \ data_array.h \ fast_intersector.h \ diff --git a/extractor/compile.cc b/extractor/compile.cc index 0d62757e..9e8044ad 100644 --- a/extractor/compile.cc +++ b/extractor/compile.cc @@ -145,6 +145,10 @@ int main(int argc, char** argv) { ofstream precomp_fstream((output_dir / fs::path("precomp.bin")).string()); ar::binary_oarchive precomp_stream(precomp_fstream); precomp_stream << precomputation; + + ofstream vocab_fstream((output_dir / fs::path("vocab.bin")).string()); + ar::binary_oarchive vocab_stream(vocab_fstream); + vocab_stream << *vocabulary; stop_write = Clock::now(); write_duration += GetDuration(start_write, stop_write); diff --git a/extractor/vocabulary.cc b/extractor/vocabulary.cc index aef674a5..c9c2d6f4 100644 --- a/extractor/vocabulary.cc +++ b/extractor/vocabulary.cc @@ -35,4 +35,8 @@ string Vocabulary::GetTerminalValue(int symbol) { return word; } +bool Vocabulary::operator==(const Vocabulary& other) const { + return words == other.words && dictionary == other.dictionary; +} + } // namespace extractor diff --git a/extractor/vocabulary.h b/extractor/vocabulary.h index c8fd9411..db092e99 100644 --- a/extractor/vocabulary.h +++ b/extractor/vocabulary.h @@ -5,6 +5,10 @@ #include #include +#include +#include +#include + using namespace std; namespace extractor { @@ -14,7 +18,7 @@ namespace extractor { * * This strucure contains words located in the frequent collocations and words * encountered during the grammar extraction time. This dictionary is - * considerably smaller than the dictionaries in the data arrays (and so is the + * considerably smaller than the dictionaries in the data arays (and so is the * query time). Note that this is the single data structure that changes state * and needs to have thread safe read/write operations. * @@ -38,7 +42,24 @@ class Vocabulary { // Returns the word corresponding to the given word id. virtual string GetTerminalValue(int symbol); + bool operator==(const Vocabulary& vocabulary) const; + private: + friend class boost::serialization::access; + + template void save(Archive& ar, unsigned int) const { + ar << words; + } + + template void load(Archive& ar, unsigned int) { + ar >> words; + for (size_t i = 0; i < words.size(); ++i) { + dictionary[words[i]] = i; + } + } + + BOOST_SERIALIZATION_SPLIT_MEMBER(); + unordered_map dictionary; vector words; }; diff --git a/extractor/vocabulary_test.cc b/extractor/vocabulary_test.cc new file mode 100644 index 00000000..cf5e3e36 --- /dev/null +++ b/extractor/vocabulary_test.cc @@ -0,0 +1,45 @@ +#include + +#include +#include +#include + +#include +#include + +#include "vocabulary.h" + +using namespace std; +using namespace ::testing; +namespace ar = boost::archive; + +namespace extractor { +namespace { + +TEST(VocabularyTest, TestIndexes) { + Vocabulary vocabulary; + EXPECT_EQ(0, vocabulary.GetTerminalIndex("zero")); + EXPECT_EQ("zero", vocabulary.GetTerminalValue(0)); + + EXPECT_EQ(1, vocabulary.GetTerminalIndex("one")); + EXPECT_EQ("one", vocabulary.GetTerminalValue(1)); +} + +TEST(VocabularyTest, TestSerialization) { + Vocabulary vocabulary; + EXPECT_EQ(0, vocabulary.GetTerminalIndex("zero")); + EXPECT_EQ("zero", vocabulary.GetTerminalValue(0)); + + stringstream stream(ios_base::out | ios_base::in); + ar::text_oarchive output_stream(stream, ar::no_header); + output_stream << vocabulary; + + Vocabulary vocabulary_copy; + ar::text_iarchive input_stream(stream, ar::no_header); + input_stream >> vocabulary_copy; + + EXPECT_EQ(vocabulary, vocabulary_copy); +} + +} // namespace +} // namespace extractor -- cgit v1.2.3 From bed3e4b867e4132917fa0640956e8ce713f0e451 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Tue, 26 Nov 2013 15:01:14 +0000 Subject: Script for grammar extraction only. --- .gitignore | 1 + extractor/Makefile.am | 40 +------ extractor/extract.cc | 253 ++++++++++++++++++++++++++++++++++++++++++ extractor/grammar_extractor.h | 1 - extractor/run_extractor.cc | 20 ++-- extractor/sampler.cc | 23 ++-- 6 files changed, 278 insertions(+), 60 deletions(-) create mode 100644 extractor/extract.cc diff --git a/.gitignore b/.gitignore index 942539cb..f964fa0c 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ extools/score_grammar extools/sg_lexer.cc extractor/*_test extractor/compile +extractor/extract extractor/run_extractor gi/clda/src/clda gi/markov_al/ml diff --git a/extractor/Makefile.am b/extractor/Makefile.am index 64a5a2b5..7825012c 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -1,5 +1,5 @@ -bin_PROGRAMS = compile run_extractor +bin_PROGRAMS = compile run_extractor extract if HAVE_CXX11 @@ -105,44 +105,14 @@ translation_table_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $ vocabulary_test_SOURCES = vocabulary_test.cc vocabulary_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a -noinst_LIBRARIES = libextractor.a libcompile.a +noinst_LIBRARIES = libextractor.a compile_SOURCES = compile.cc -compile_LDADD = libcompile.a +compile_LDADD = libextractor.a run_extractor_SOURCES = run_extractor.cc run_extractor_LDADD = libextractor.a - -libcompile_a_SOURCES = \ - alignment.cc \ - data_array.cc \ - phrase_location.cc \ - precomputation.cc \ - suffix_array.cc \ - time_util.cc \ - translation_table.cc \ - vocabulary.cc \ - alignment.h \ - data_array.h \ - fast_intersector.h \ - grammar.h \ - grammar_extractor.h \ - matchings_finder.h \ - matchings_trie.h \ - phrase.h \ - phrase_builder.h \ - phrase_location.h \ - precomputation.h \ - rule.h \ - rule_extractor.h \ - rule_extractor_helper.h \ - rule_factory.h \ - sampler.h \ - scorer.h \ - suffix_array.h \ - target_phrase_extractor.h \ - time_util.h \ - translation_table.h \ - vocabulary.h +extract_SOURCES = extract.cc +extract_LDADD = libextractor.a libextractor_a_SOURCES = \ alignment.cc \ diff --git a/extractor/extract.cc b/extractor/extract.cc new file mode 100644 index 00000000..2d5831fa --- /dev/null +++ b/extractor/extract.cc @@ -0,0 +1,253 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "alignment.h" +#include "data_array.h" +#include "features/count_source_target.h" +#include "features/feature.h" +#include "features/is_source_singleton.h" +#include "features/is_source_target_singleton.h" +#include "features/max_lex_source_given_target.h" +#include "features/max_lex_target_given_source.h" +#include "features/sample_source_count.h" +#include "features/target_given_source_coherent.h" +#include "grammar.h" +#include "grammar_extractor.h" +#include "precomputation.h" +#include "rule.h" +#include "scorer.h" +#include "suffix_array.h" +#include "time_util.h" +#include "translation_table.h" +#include "vocabulary.h" + +namespace ar = boost::archive; +namespace fs = boost::filesystem; +namespace po = boost::program_options; +using namespace extractor; +using namespace features; +using namespace std; + +// Returns the file path in which a given grammar should be written. +fs::path GetGrammarFilePath(const fs::path& grammar_path, int file_number) { + string file_name = "grammar." + to_string(file_number); + return grammar_path / file_name; +} + +int main(int argc, char** argv) { + po::options_description general_options("General options"); + int max_threads = 1; + #pragma omp parallel + max_threads = omp_get_num_threads(); + string threads_option = "Number of threads used for grammar extraction " + "max(" + to_string(max_threads) + ")"; + general_options.add_options() + ("threads,t", po::value()->required()->default_value(1), + threads_option.c_str()) + ("grammars,g", po::value()->required(), "Grammars output path") + ("max_rule_span", po::value()->default_value(15), + "Maximum rule span") + ("max_rule_symbols", po::value()->default_value(5), + "Maximum number of symbols (terminals + nontermals) in a rule") + ("min_gap_size", po::value()->default_value(1), "Minimum gap size") + ("max_nonterminals", po::value()->default_value(2), + "Maximum number of nonterminals in a rule") + ("max_samples", po::value()->default_value(300), + "Maximum number of samples") + ("tight_phrases", po::value()->default_value(true), + "False if phrases may be loose (better, but slower)") + ("leave_one_out", po::value()->zero_tokens(), + "do leave-one-out estimation of grammars " + "(e.g. for extracting grammars for the training set"); + + po::options_description cmdline_options("Command line options"); + cmdline_options.add_options() + ("help", "Show available options") + ("config", po::value()->required(), "Path to config file"); + cmdline_options.add(general_options); + + po::options_description config_options("Config file options"); + config_options.add_options() + ("target", po::value()->required(), + "Path to target data file in binary format") + ("source", po::value()->required(), + "Path to source suffix array file in binary format") + ("alignment", po::value()->required(), + "Path to alignment file in binary format") + ("precomputation", po::value()->required(), + "Path to precomputation file in binary format") + ("vocabulary", po::value()->required(), + "Path to vocabulary file in binary format") + ("ttable", po::value()->required(), + "Path to translation table in binary format"); + config_options.add(general_options); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, cmdline_options), vm); + if (vm.count("help")) { + po::options_description all_options; + all_options.add(cmdline_options).add(config_options); + cout << all_options << endl; + return 0; + } + + po::notify(vm); + + ifstream config_stream(vm["config"].as()); + po::store(po::parse_config_file(config_stream, config_options), vm); + po::notify(vm); + + int num_threads = vm["threads"].as(); + cerr << "Grammar extraction will use " << num_threads << " threads." << endl; + + Clock::time_point read_start_time = Clock::now(); + + Clock::time_point start_time = Clock::now(); + cerr << "Reading target data in binary format..." << endl; + shared_ptr target_data_array = make_shared(); + ifstream target_fstream(vm["target"].as()); + ar::binary_iarchive target_stream(target_fstream); + target_stream >> *target_data_array; + Clock::time_point end_time = Clock::now(); + cerr << "Reading target data took " << GetDuration(start_time, end_time) + << " seconds" << endl; + + start_time = Clock::now(); + cerr << "Reading source suffix array in binary format..." << endl; + shared_ptr source_suffix_array = make_shared(); + ifstream source_fstream(vm["source"].as()); + ar::binary_iarchive source_stream(source_fstream); + source_stream >> *source_suffix_array; + end_time = Clock::now(); + cerr << "Reading source suffix array took " + << GetDuration(start_time, end_time) << " seconds" << endl; + + start_time = Clock::now(); + cerr << "Reading alignment in binary format..." << endl; + shared_ptr alignment = make_shared(); + ifstream alignment_fstream(vm["alignment"].as()); + ar::binary_iarchive alignment_stream(alignment_fstream); + alignment_stream >> *alignment; + end_time = Clock::now(); + cerr << "Reading alignment took " << GetDuration(start_time, end_time) + << " seconds" << endl; + + start_time = Clock::now(); + cerr << "Reading precomputation in binary format..." << endl; + shared_ptr precomputation = make_shared(); + ifstream precomputation_fstream(vm["precomputation"].as()); + ar::binary_iarchive precomputation_stream(precomputation_fstream); + precomputation_stream >> *precomputation; + end_time = Clock::now(); + cerr << "Reading precomputation took " << GetDuration(start_time, end_time) + << " seconds" << endl; + + start_time = Clock::now(); + cerr << "Reading vocabulary in binary format..." << endl; + shared_ptr vocabulary = make_shared(); + ifstream vocabulary_fstream(vm["vocabulary"].as()); + ar::binary_iarchive vocabulary_stream(vocabulary_fstream); + vocabulary_stream >> *vocabulary; + end_time = Clock::now(); + cerr << "Reading vocabulary took " << GetDuration(start_time, end_time) + << " seconds" << endl; + + start_time = Clock::now(); + cerr << "Reading translation table in binary format..." << endl; + shared_ptr table = make_shared(); + ifstream ttable_fstream(vm["ttable"].as()); + ar::binary_iarchive ttable_stream(ttable_fstream); + ttable_stream >> *table; + end_time = Clock::now(); + cerr << "Reading translation table took " << GetDuration(start_time, end_time) + << " seconds" << endl; + + Clock::time_point read_end_time = Clock::now(); + cerr << "Total time spent loading data structures into memory: " + << GetDuration(read_start_time, read_end_time) << " seconds" << endl; + + Clock::time_point extraction_start_time = Clock::now(); + // Features used to score each grammar rule. + vector> features = { + make_shared(), + make_shared(), + make_shared(), + make_shared(table), + make_shared(table), + make_shared(), + make_shared() + }; + shared_ptr scorer = make_shared(features); + + GrammarExtractor extractor( + source_suffix_array, + target_data_array, + alignment, + precomputation, + scorer, + vocabulary, + vm["min_gap_size"].as(), + vm["max_rule_span"].as(), + vm["max_nonterminals"].as(), + vm["max_rule_symbols"].as(), + vm["max_samples"].as(), + vm["tight_phrases"].as()); + + // Creates the grammars directory if it doesn't exist. + fs::path grammar_path = vm["grammars"].as(); + if (!fs::is_directory(grammar_path)) { + fs::create_directory(grammar_path); + } + + // Reads all sentences for which we extract grammar rules (the paralellization + // is simplified if we read all sentences upfront). + string sentence; + vector sentences; + while (getline(cin, sentence)) { + sentences.push_back(sentence); + } + + // Extracts the grammar for each sentence and saves it to a file. + vector suffixes(sentences.size()); + bool leave_one_out = vm.count("leave_one_out"); + #pragma omp parallel for schedule(dynamic) num_threads(num_threads) + for (size_t i = 0; i < sentences.size(); ++i) { + string suffix; + int position = sentences[i].find("|||"); + if (position != sentences[i].npos) { + suffix = sentences[i].substr(position); + sentences[i] = sentences[i].substr(0, position); + } + suffixes[i] = suffix; + + unordered_set blacklisted_sentence_ids; + if (leave_one_out) { + blacklisted_sentence_ids.insert(i); + } + Grammar grammar = extractor.GetGrammar( + sentences[i], blacklisted_sentence_ids); + ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); + // output << grammar; + } + + for (size_t i = 0; i < sentences.size(); ++i) { + cout << " " << sentences[i] << " " << suffixes[i] << endl; + } + + Clock::time_point extraction_stop_time = Clock::now(); + cerr << "Overall extraction step took " + << GetDuration(extraction_start_time, extraction_stop_time) + << " seconds" << endl; + + return 0; +} diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index eb79f53c..0f3069b0 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -15,7 +15,6 @@ class DataArray; class Grammar; class HieroCachingRuleFactory; class Precomputation; -class Rule; class Scorer; class SuffixArray; class Vocabulary; diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 6b22a302..f1aa5e35 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -5,10 +5,10 @@ #include #include -#include #include #include #include +#include #include "alignment.h" #include "data_array.h" @@ -78,7 +78,8 @@ int main(int argc, char** argv) { ("tight_phrases", po::value()->default_value(true), "False if phrases may be loose (better, but slower)") ("leave_one_out", po::value()->zero_tokens(), - "do leave-one-out estimation of grammars (e.g. for extracting grammars for the training set"); + "do leave-one-out estimation of grammars " + "(e.g. for extracting grammars for the training set"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -99,11 +100,6 @@ int main(int argc, char** argv) { return 1; } - bool leave_one_out = false; - if (vm.count("leave_one_out")) { - leave_one_out = true; - } - int num_threads = vm["threads"].as(); cerr << "Grammar extraction will use " << num_threads << " threads." << endl; @@ -178,8 +174,8 @@ int main(int argc, char** argv) { << GetDuration(preprocess_start_time, preprocess_stop_time) << " seconds" << endl; - // Features used to score each grammar rule. Clock::time_point extraction_start_time = Clock::now(); + // Features used to score each grammar rule. vector> features = { make_shared(), make_shared(), @@ -206,9 +202,6 @@ int main(int argc, char** argv) { vm["max_samples"].as(), vm["tight_phrases"].as()); - // Releases extra memory used by the initial precomputation. - precomputation.reset(); - // Creates the grammars directory if it doesn't exist. fs::path grammar_path = vm["grammars"].as(); if (!fs::is_directory(grammar_path)) { @@ -224,6 +217,7 @@ int main(int argc, char** argv) { } // Extracts the grammar for each sentence and saves it to a file. + bool leave_one_out = vm.count("leave_one_out"); vector suffixes(sentences.size()); #pragma omp parallel for schedule(dynamic) num_threads(num_threads) for (size_t i = 0; i < sentences.size(); ++i) { @@ -236,7 +230,9 @@ int main(int argc, char** argv) { suffixes[i] = suffix; unordered_set blacklisted_sentence_ids; - if (leave_one_out) blacklisted_sentence_ids.insert(i); + if (leave_one_out) { + blacklisted_sentence_ids.insert(i); + } Grammar grammar = extractor.GetGrammar( sentences[i], blacklisted_sentence_ids); ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); diff --git a/extractor/sampler.cc b/extractor/sampler.cc index fc386ed1..887aaec1 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -15,6 +15,7 @@ Sampler::~Sampler() {} PhraseLocation Sampler::Sample( const PhraseLocation& location, const unordered_set& blacklisted_sentence_ids) const { + shared_ptr source_data_array = suffix_array->GetData(); vector sample; int num_subpatterns; if (location.matchings == NULL) { @@ -22,32 +23,30 @@ PhraseLocation Sampler::Sample( num_subpatterns = 1; int low = location.sa_low, high = location.sa_high; double step = max(1.0, (double) (high - low) / max_samples); - double i = low, last = i; - bool found; - shared_ptr source_data_array = suffix_array->GetData(); + double i = low, last = i - 1; while (sample.size() < max_samples && i < high) { int x = suffix_array->GetSuffix(Round(i)); int id = source_data_array->GetSentenceId(x); + bool found = false; if (blacklisted_sentence_ids.count(id)) { - found = false; - double backoff_step = 1; - while (true) { - if ((double)backoff_step >= step) break; + for (int backoff_step = 1; backoff_step <= step; ++backoff_step) { double j = i - backoff_step; x = suffix_array->GetSuffix(Round(j)); id = source_data_array->GetSentenceId(x); if (x >= 0 && j > last && !blacklisted_sentence_ids.count(id)) { - found = true; last = i; break; + found = true; + last = i; + break; } double k = i + backoff_step; x = suffix_array->GetSuffix(Round(k)); id = source_data_array->GetSentenceId(x); - if (k < min(i+step, (double)high) && + if (k < min(i+step, (double) high) && !blacklisted_sentence_ids.count(id)) { - found = true; last = k; break; + found = true; + last = k; + break; } - if (j <= last && k >= high) break; - backoff_step++; } } else { found = true; -- cgit v1.2.3 From 304103565d3b79cc9c98c1ee0356a8824fc982c2 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Tue, 26 Nov 2013 16:03:16 +0000 Subject: Write config file after compiling data structures. --- extractor/compile.cc | 30 +++++++++++++++++++++++------- extractor/extract.cc | 4 ++-- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/extractor/compile.cc b/extractor/compile.cc index 9e8044ad..3ee668ce 100644 --- a/extractor/compile.cc +++ b/extractor/compile.cc @@ -30,6 +30,8 @@ int main(int argc, char** argv) { ("bitext,b", po::value(), "Parallel text (source ||| target)") ("alignment,a", po::value()->required(), "Bitext word alignment") ("output,o", po::value()->required(), "Output path") + ("config,c", po::value()->required(), + "Path where the config file will be generated") ("frequent", po::value()->default_value(100), "Number of precomputed frequent patterns") ("super_frequent", po::value()->default_value(10), @@ -82,8 +84,12 @@ int main(int argc, char** argv) { target_data_array = make_shared(vm["target"].as()); } + ofstream config_stream(vm["config"].as()); + Clock::time_point start_write = Clock::now(); - ofstream target_fstream((output_dir / fs::path("target.bin")).string()); + string target_path = (output_dir / fs::path("target.bin")).string(); + config_stream << "target = " << target_path << endl; + ofstream target_fstream(target_path); ar::binary_oarchive target_stream(target_fstream); target_stream << *target_data_array; Clock::time_point stop_write = Clock::now(); @@ -100,7 +106,9 @@ int main(int argc, char** argv) { make_shared(source_data_array); start_write = Clock::now(); - ofstream source_fstream((output_dir / fs::path("source.bin")).string()); + string source_path = (output_dir / fs::path("source.bin")).string(); + config_stream << "source = " << source_path << endl; + ofstream source_fstream(source_path); ar::binary_oarchive output_stream(source_fstream); output_stream << *source_suffix_array; stop_write = Clock::now(); @@ -116,7 +124,9 @@ int main(int argc, char** argv) { make_shared(vm["alignment"].as()); start_write = Clock::now(); - ofstream alignment_fstream((output_dir / fs::path("alignment.bin")).string()); + string alignment_path = (output_dir / fs::path("alignment.bin")).string(); + config_stream << "alignment = " << alignment_path << endl; + ofstream alignment_fstream(alignment_path); ar::binary_oarchive alignment_stream(alignment_fstream); alignment_stream << *alignment; stop_write = Clock::now(); @@ -126,7 +136,7 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; - shared_ptr vocabulary; + shared_ptr vocabulary = make_shared(); start_time = Clock::now(); cerr << "Precomputing collocations..." << endl; @@ -142,11 +152,15 @@ int main(int argc, char** argv) { vm["min_frequency"].as()); start_write = Clock::now(); - ofstream precomp_fstream((output_dir / fs::path("precomp.bin")).string()); + string precomputation_path = (output_dir / fs::path("precomp.bin")).string(); + config_stream << "precomputation = " << precomputation_path << endl; + ofstream precomp_fstream(precomputation_path); ar::binary_oarchive precomp_stream(precomp_fstream); precomp_stream << precomputation; - ofstream vocab_fstream((output_dir / fs::path("vocab.bin")).string()); + string vocabulary_path = (output_dir / fs::path("vocab.bin")).string(); + config_stream << "vocabulary = " << vocabulary_path << endl; + ofstream vocab_fstream(vocabulary_path); ar::binary_oarchive vocab_stream(vocab_fstream); vocab_stream << *vocabulary; stop_write = Clock::now(); @@ -161,7 +175,9 @@ int main(int argc, char** argv) { TranslationTable table(source_data_array, target_data_array, alignment); start_write = Clock::now(); - ofstream table_fstream((output_dir / fs::path("bilex.bin")).string()); + string table_path = (output_dir / fs::path("bilex.bin")).string(); + config_stream << "ttable = " << table_path << endl; + ofstream table_fstream(table_path); ar::binary_oarchive table_stream(table_fstream); table_stream << table; stop_write = Clock::now(); diff --git a/extractor/extract.cc b/extractor/extract.cc index 2d5831fa..387cbe9b 100644 --- a/extractor/extract.cc +++ b/extractor/extract.cc @@ -72,7 +72,7 @@ int main(int argc, char** argv) { po::options_description cmdline_options("Command line options"); cmdline_options.add_options() ("help", "Show available options") - ("config", po::value()->required(), "Path to config file"); + ("config,c", po::value()->required(), "Path to config file"); cmdline_options.add(general_options); po::options_description config_options("Config file options"); @@ -236,7 +236,7 @@ int main(int argc, char** argv) { Grammar grammar = extractor.GetGrammar( sentences[i], blacklisted_sentence_ids); ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); - // output << grammar; + output << grammar; } for (size_t i = 0; i < sentences.size(); ++i) { -- cgit v1.2.3 From a6e6a369f40d8fb6a191fd7f74fc5efa8bfae2a0 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Wed, 27 Nov 2013 14:33:36 +0000 Subject: Unify sampling backoff strategy. --- extractor/Makefile.am | 24 ++++-- extractor/backoff_sampler.cc | 66 ++++++++++++++++ extractor/backoff_sampler.h | 41 ++++++++++ extractor/matchings_sampler.cc | 38 +++++++++ extractor/matchings_sampler.h | 31 ++++++++ extractor/matchings_sampler_test.cc | 118 ++++++++++++++++++++++++++++ extractor/mocks/mock_matchings_sampler.h | 15 ++++ extractor/mocks/mock_suffix_array_sampler.h | 15 ++++ extractor/phrase_location.cc | 2 + extractor/phrase_location_sampler.cc | 34 ++++++++ extractor/phrase_location_sampler.h | 35 +++++++++ extractor/phrase_location_sampler_test.cc | 50 ++++++++++++ extractor/precomputation.cc | 3 +- extractor/precomputation_test.cc | 2 +- extractor/rule_factory.cc | 4 +- extractor/sampler.cc | 78 ------------------ extractor/sampler.h | 22 +----- extractor/sampler_test.cc | 92 ---------------------- extractor/sampler_test_blacklist.cc | 102 ------------------------ extractor/suffix_array_sampler.cc | 40 ++++++++++ extractor/suffix_array_sampler.h | 34 ++++++++ extractor/suffix_array_sampler_test.cc | 114 +++++++++++++++++++++++++++ 22 files changed, 657 insertions(+), 303 deletions(-) create mode 100644 extractor/backoff_sampler.cc create mode 100644 extractor/backoff_sampler.h create mode 100644 extractor/matchings_sampler.cc create mode 100644 extractor/matchings_sampler.h create mode 100644 extractor/matchings_sampler_test.cc create mode 100644 extractor/mocks/mock_matchings_sampler.h create mode 100644 extractor/mocks/mock_suffix_array_sampler.h create mode 100644 extractor/phrase_location_sampler.cc create mode 100644 extractor/phrase_location_sampler.h create mode 100644 extractor/phrase_location_sampler_test.cc delete mode 100644 extractor/sampler.cc delete mode 100644 extractor/sampler_test.cc delete mode 100644 extractor/sampler_test_blacklist.cc create mode 100644 extractor/suffix_array_sampler.cc create mode 100644 extractor/suffix_array_sampler.h create mode 100644 extractor/suffix_array_sampler_test.cc diff --git a/extractor/Makefile.am b/extractor/Makefile.am index 7825012c..e5b439f9 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -15,13 +15,15 @@ EXTRA_PROGRAMS = alignment_test \ feature_target_given_source_coherent_test \ grammar_extractor_test \ matchings_finder_test \ + matchings_sampler_test \ + phrase_location_sampler_test \ phrase_test \ precomputation_test \ rule_extractor_helper_test \ rule_extractor_test \ rule_factory_test \ - sampler_test \ scorer_test \ + suffix_array_sampler_test \ suffix_array_test \ target_phrase_extractor_test \ translation_table_test \ @@ -40,13 +42,15 @@ if HAVE_GTEST feature_target_given_source_coherent_test \ grammar_extractor_test \ matchings_finder_test \ + matchings_sampler_test \ + phrase_location_sampler_test \ phrase_test \ precomputation_test \ rule_extractor_helper_test \ rule_extractor_test \ rule_factory_test \ - sampler_test \ scorer_test \ + suffix_array_sampler_test \ suffix_array_test \ target_phrase_extractor_test \ translation_table_test \ @@ -55,8 +59,7 @@ endif noinst_PROGRAMS = $(RUNNABLE_TESTS) -# TESTS = $(RUNNABLE_TESTS) -TESTS = vocabulary_test +TESTS = $(RUNNABLE_TESTS) alignment_test_SOURCES = alignment_test.cc alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a @@ -82,6 +85,10 @@ grammar_extractor_test_SOURCES = grammar_extractor_test.cc grammar_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a matchings_finder_test_SOURCES = matchings_finder_test.cc matchings_finder_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a +matchings_sampler_test_SOURCES = matchings_sampler_test.cc +matchings_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a +phrase_location_sampler_test_SOURCES = phrase_location_sampler_test.cc +phrase_location_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a phrase_test_SOURCES = phrase_test.cc phrase_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a precomputation_test_SOURCES = precomputation_test.cc @@ -92,10 +99,10 @@ rule_extractor_test_SOURCES = rule_extractor_test.cc rule_extractor_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a rule_factory_test_SOURCES = rule_factory_test.cc rule_factory_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a -sampler_test_SOURCES = sampler_test.cc -sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a scorer_test_SOURCES = scorer_test.cc scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a +suffix_array_sampler_test_SOURCES = suffix_array_sampler_test.cc +suffix_array_sampler_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a suffix_array_test_SOURCES = suffix_array_test.cc suffix_array_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(GMOCK_LDFLAGS) $(GMOCK_LIBS) libextractor.a target_phrase_extractor_test_SOURCES = target_phrase_extractor_test.cc @@ -116,6 +123,7 @@ extract_LDADD = libextractor.a libextractor_a_SOURCES = \ alignment.cc \ + backoff_sampler.cc \ data_array.cc \ fast_intersector.cc \ features/count_source_target.cc \ @@ -129,18 +137,20 @@ libextractor_a_SOURCES = \ grammar.cc \ grammar_extractor.cc \ matchings_finder.cc \ + matchings_sampler.cc \ matchings_trie.cc \ phrase.cc \ phrase_builder.cc \ phrase_location.cc \ + phrase_location_sampler.cc \ precomputation.cc \ rule.cc \ rule_extractor.cc \ rule_extractor_helper.cc \ rule_factory.cc \ - sampler.cc \ scorer.cc \ suffix_array.cc \ + suffix_array_sampler.cc \ target_phrase_extractor.cc \ time_util.cc \ translation_table.cc \ diff --git a/extractor/backoff_sampler.cc b/extractor/backoff_sampler.cc new file mode 100644 index 00000000..28b12909 --- /dev/null +++ b/extractor/backoff_sampler.cc @@ -0,0 +1,66 @@ +#include "backoff_sampler.h" + +#include "data_array.h" +#include "phrase_location.h" + +namespace extractor { + +BackoffSampler::BackoffSampler( + shared_ptr source_data_array, int max_samples) : + source_data_array(source_data_array), max_samples(max_samples) {} + +BackoffSampler::BackoffSampler() {} + +PhraseLocation BackoffSampler::Sample( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids) const { + vector samples; + int low = GetRangeLow(location), high = GetRangeHigh(location); + int last_position = low - 1; + double step = max(1.0, (double) (high - low) / max_samples); + for (double num_samples = 0, i = low; + num_samples < max_samples && i < high; + ++num_samples, i += step) { + int position = GetPosition(location, round(i)); + int sentence_id = source_data_array->GetSentenceId(position); + bool found = false; + if (last_position >= position || + blacklisted_sentence_ids.count(sentence_id)) { + for (double backoff_step = 1; backoff_step < step; ++backoff_step) { + double j = i - backoff_step; + if (round(j) >= 0) { + position = GetPosition(location, round(j)); + sentence_id = source_data_array->GetSentenceId(position); + if (position > last_position && + !blacklisted_sentence_ids.count(sentence_id)) { + found = true; + last_position = position; + break; + } + } + + double k = i + backoff_step; + if (round(k) < high) { + position = GetPosition(location, round(k)); + sentence_id = source_data_array->GetSentenceId(position); + if (!blacklisted_sentence_ids.count(sentence_id)) { + found = true; + last_position = position; + break; + } + } + } + } else { + found = true; + last_position = position; + } + + if (found) { + AppendMatching(samples, position, location); + } + } + + return PhraseLocation(samples, GetNumSubpatterns(location)); +} + +} // namespace extractor diff --git a/extractor/backoff_sampler.h b/extractor/backoff_sampler.h new file mode 100644 index 00000000..5c244105 --- /dev/null +++ b/extractor/backoff_sampler.h @@ -0,0 +1,41 @@ +#ifndef _BACKOFF_SAMPLER_H_ +#define _BACKOFF_SAMPLER_H_ + +#include + +#include "sampler.h" + +namespace extractor { + +class DataArray; +class PhraseLocation; + +class BackoffSampler : public Sampler { + public: + BackoffSampler(shared_ptr source_data_array, int max_samples); + + BackoffSampler(); + + PhraseLocation Sample( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids) const; + + private: + virtual int GetNumSubpatterns(const PhraseLocation& location) const = 0; + + virtual int GetRangeLow(const PhraseLocation& location) const = 0; + + virtual int GetRangeHigh(const PhraseLocation& location) const = 0; + + virtual int GetPosition(const PhraseLocation& location, int index) const = 0; + + virtual void AppendMatching(vector& samples, int index, + const PhraseLocation& location) const = 0; + + shared_ptr source_data_array; + int max_samples; +}; + +} // namespace extractor + +#endif diff --git a/extractor/matchings_sampler.cc b/extractor/matchings_sampler.cc new file mode 100644 index 00000000..bb916e49 --- /dev/null +++ b/extractor/matchings_sampler.cc @@ -0,0 +1,38 @@ +#include "matchings_sampler.h" + +#include "data_array.h" +#include "phrase_location.h" + +namespace extractor { + +MatchingsSampler::MatchingsSampler( + shared_ptr data_array, int max_samples) : + BackoffSampler(data_array, max_samples) {} + +MatchingsSampler::MatchingsSampler() {} + +int MatchingsSampler::GetNumSubpatterns(const PhraseLocation& location) const { + return location.num_subpatterns; +} + +int MatchingsSampler::GetRangeLow(const PhraseLocation&) const { + return 0; +} + +int MatchingsSampler::GetRangeHigh(const PhraseLocation& location) const { + return location.matchings->size() / location.num_subpatterns; +} + +int MatchingsSampler::GetPosition(const PhraseLocation& location, + int index) const { + return (*location.matchings)[index * location.num_subpatterns]; +} + +void MatchingsSampler::AppendMatching(vector& samples, int index, + const PhraseLocation& location) const { + copy(location.matchings->begin() + index, + location.matchings->begin() + index + location.num_subpatterns, + back_inserter(samples)); +} + +} // namespace extractor diff --git a/extractor/matchings_sampler.h b/extractor/matchings_sampler.h new file mode 100644 index 00000000..ca4fce93 --- /dev/null +++ b/extractor/matchings_sampler.h @@ -0,0 +1,31 @@ +#ifndef _MATCHINGS_SAMPLER_H_ +#define _MATCHINGS_SAMPLER_H_ + +#include "backoff_sampler.h" + +namespace extractor { + +class DataArray; + +class MatchingsSampler : public BackoffSampler { + public: + MatchingsSampler(shared_ptr data_array, int max_samples); + + MatchingsSampler(); + + private: + int GetNumSubpatterns(const PhraseLocation& location) const; + + int GetRangeLow(const PhraseLocation& location) const; + + int GetRangeHigh(const PhraseLocation& location) const; + + int GetPosition(const PhraseLocation& location, int index) const; + + void AppendMatching(vector& samples, int index, + const PhraseLocation& location) const; +}; + +} // namespace extractor + +#endif diff --git a/extractor/matchings_sampler_test.cc b/extractor/matchings_sampler_test.cc new file mode 100644 index 00000000..bc927152 --- /dev/null +++ b/extractor/matchings_sampler_test.cc @@ -0,0 +1,118 @@ +#include + +#include + +#include "mocks/mock_data_array.h" +#include "matchings_sampler.h" +#include "phrase_location.h" + +using namespace std; +using namespace ::testing; + +namespace extractor { +namespace { + +class MatchingsSamplerTest : public Test { + protected: + virtual void SetUp() { + vector locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + location = PhraseLocation(locations, 2); + + data_array = make_shared(); + for (int i = 0; i < 10; ++i) { + EXPECT_CALL(*data_array, GetSentenceId(i)).WillRepeatedly(Return(i / 2)); + } + } + + unordered_set blacklisted_sentence_ids; + PhraseLocation location; + shared_ptr data_array; + shared_ptr sampler; +}; + +TEST_F(MatchingsSamplerTest, TestSample) { + sampler = make_shared(data_array, 1); + vector expected_locations = {0, 1}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 2); + expected_locations = {0, 1, 6, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 3); + expected_locations = {0, 1, 4, 5, 6, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 7); + expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); +} + +TEST_F(MatchingsSamplerTest, TestBackoffSample) { + sampler = make_shared(data_array, 1); + blacklisted_sentence_ids = {0}; + vector expected_locations = {2, 3}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {0, 1, 2, 3}; + expected_locations = {8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {0, 1, 2, 3, 4}; + expected_locations = {}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 2); + blacklisted_sentence_ids = {0, 3}; + expected_locations = {2, 3, 4, 5}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 3); + blacklisted_sentence_ids = {0, 3}; + expected_locations = {2, 3, 4, 5, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {0, 2, 3}; + expected_locations = {2, 3, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 4); + blacklisted_sentence_ids = {0, 1, 2, 3}; + expected_locations = {8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {1, 3}; + expected_locations = {0, 1, 4, 5, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + sampler = make_shared(data_array, 7); + blacklisted_sentence_ids = {0, 1, 2, 3, 4}; + expected_locations = {}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {0, 2, 4}; + expected_locations = {2, 3, 6, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {1, 3}; + expected_locations = {0, 1, 4, 5, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 2), + sampler->Sample(location, blacklisted_sentence_ids)); +} + +} +} // namespace extractor diff --git a/extractor/mocks/mock_matchings_sampler.h b/extractor/mocks/mock_matchings_sampler.h new file mode 100644 index 00000000..de2009c3 --- /dev/null +++ b/extractor/mocks/mock_matchings_sampler.h @@ -0,0 +1,15 @@ +#include + +#include "phrase_location.h" +#include "matchings_sampler.h" + +namespace extractor { + +class MockMatchingsSampler : public MatchingsSampler { + public: + MOCK_CONST_METHOD2(Sample, PhraseLocation( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids)); +}; + +} // namespace extractor diff --git a/extractor/mocks/mock_suffix_array_sampler.h b/extractor/mocks/mock_suffix_array_sampler.h new file mode 100644 index 00000000..d799b969 --- /dev/null +++ b/extractor/mocks/mock_suffix_array_sampler.h @@ -0,0 +1,15 @@ +#include + +#include "phrase_location.h" +#include "suffix_array_sampler.h" + +namespace extractor { + +class MockSuffixArraySampler : public SuffixArrayRangeSampler { + public: + MOCK_CONST_METHOD2(Sample, PhraseLocation( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids)); +}; + +} // namespace extractor diff --git a/extractor/phrase_location.cc b/extractor/phrase_location.cc index 13140cac..2c367893 100644 --- a/extractor/phrase_location.cc +++ b/extractor/phrase_location.cc @@ -1,5 +1,7 @@ #include "phrase_location.h" +#include + namespace extractor { PhraseLocation::PhraseLocation(int sa_low, int sa_high) : diff --git a/extractor/phrase_location_sampler.cc b/extractor/phrase_location_sampler.cc new file mode 100644 index 00000000..a2eec105 --- /dev/null +++ b/extractor/phrase_location_sampler.cc @@ -0,0 +1,34 @@ +#include "phrase_location_sampler.h" + +#include "matchings_sampler.h" +#include "phrase_location.h" +#include "suffix_array.h" +#include "suffix_array_sampler.h" + +namespace extractor { + +PhraseLocationSampler::PhraseLocationSampler( + shared_ptr suffix_array, int max_samples) { + matchings_sampler = make_shared( + suffix_array->GetData(), max_samples); + suffix_array_sampler = make_shared( + suffix_array, max_samples); +} + +PhraseLocationSampler::PhraseLocationSampler( + shared_ptr matchings_sampler, + shared_ptr suffix_array_sampler) : + matchings_sampler(matchings_sampler), + suffix_array_sampler(suffix_array_sampler) {} + +PhraseLocation PhraseLocationSampler::Sample( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids) const { + if (location.matchings == NULL) { + return suffix_array_sampler->Sample(location, blacklisted_sentence_ids); + } else { + return matchings_sampler->Sample(location, blacklisted_sentence_ids); + } +} + +} // namespace extractor diff --git a/extractor/phrase_location_sampler.h b/extractor/phrase_location_sampler.h new file mode 100644 index 00000000..0e88335e --- /dev/null +++ b/extractor/phrase_location_sampler.h @@ -0,0 +1,35 @@ +#ifndef _PHRASE_LOCATION_SAMPLER_H_ +#define _PHRASE_LOCATION_SAMPLER_H_ + +#include + +#include "sampler.h" + +namespace extractor { + +class MatchingsSampler; +class PhraseLocation; +class SuffixArray; +class SuffixArrayRangeSampler; + +class PhraseLocationSampler : public Sampler { + public: + PhraseLocationSampler(shared_ptr suffix_array, int max_samples); + + // For testing only. + PhraseLocationSampler( + shared_ptr matchings_sampler, + shared_ptr suffix_array_sampler); + + PhraseLocation Sample( + const PhraseLocation& location, + const unordered_set& blacklisted_sentence_ids) const; + + private: + shared_ptr matchings_sampler; + shared_ptr suffix_array_sampler; +}; + +} // namespace extractor + +#endif diff --git a/extractor/phrase_location_sampler_test.cc b/extractor/phrase_location_sampler_test.cc new file mode 100644 index 00000000..e7520ce7 --- /dev/null +++ b/extractor/phrase_location_sampler_test.cc @@ -0,0 +1,50 @@ +#include + +#include + +#include "mocks/mock_matchings_sampler.h" +#include "mocks/mock_suffix_array_sampler.h" +#include "phrase_location.h" +#include "phrase_location_sampler.h" + +using namespace std; +using namespace ::testing; + +namespace extractor { +namespace { + +class MatchingsSamplerTest : public Test { + protected: + virtual void SetUp() { + matchings_sampler = make_shared(); + suffix_array_sampler = make_shared(); + + sampler = make_shared( + matchings_sampler, suffix_array_sampler); + } + + shared_ptr matchings_sampler; + shared_ptr suffix_array_sampler; + shared_ptr sampler; +}; + +TEST_F(MatchingsSamplerTest, TestSuffixArrayRange) { + vector locations = {0, 1, 2, 3}; + PhraseLocation location(0, 3), result(locations, 2); + unordered_set blacklisted_sentence_ids; + EXPECT_CALL(*suffix_array_sampler, Sample(location, blacklisted_sentence_ids)) + .WillOnce(Return(result)); + EXPECT_EQ(result, sampler->Sample(location, blacklisted_sentence_ids)); +} + +TEST_F(MatchingsSamplerTest, TestMatchings) { + vector locations = {0, 1, 2, 3}; + PhraseLocation location(locations, 2), result(locations, 2); + unordered_set blacklisted_sentence_ids; + EXPECT_CALL(*matchings_sampler, Sample(location, blacklisted_sentence_ids)) + .WillOnce(Return(result)); + EXPECT_EQ(result, sampler->Sample(location, blacklisted_sentence_ids)); +} + +} +} // namespace extractor diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index b79daae3..3e58e2a9 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -91,7 +91,6 @@ vector> Precomputation::FindMostFrequentPatterns( } } - shared_ptr data_array = suffix_array->GetData(); // Extract the most frequent patterns. vector> frequent_patterns; while (frequent_patterns.size() < num_frequent_patterns && !heap.empty()) { @@ -99,7 +98,7 @@ vector> Precomputation::FindMostFrequentPatterns( int len = heap.top().second.second; heap.pop(); - vector pattern = data_array->GetWordIds(start, len); + vector pattern(data.begin() + start, data.begin() + start + len); if (find(pattern.begin(), pattern.end(), DataArray::END_OF_LINE) == pattern.end()) { frequent_patterns.push_back(pattern); diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc index d5f5ef63..3a98ce05 100644 --- a/extractor/precomputation_test.cc +++ b/extractor/precomputation_test.cc @@ -94,7 +94,7 @@ TEST_F(PrecomputationTest, TestCollocations) { EXPECT_TRUE(precomputation.Contains(key)); EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); - key = {2, -1, 2, -1, 2}; + key = {2, -1, 2, -2, 2}; expected_value = {1, 5, 8, 5, 8, 11}; EXPECT_TRUE(precomputation.Contains(key)); EXPECT_EQ(expected_value, precomputation.GetCollocations(key)); diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc index 5b66f685..18a60695 100644 --- a/extractor/rule_factory.cc +++ b/extractor/rule_factory.cc @@ -12,6 +12,7 @@ #include "phrase_builder.h" #include "rule.h" #include "rule_extractor.h" +#include "phrase_location_sampler.h" #include "sampler.h" #include "scorer.h" #include "suffix_array.h" @@ -68,7 +69,8 @@ HieroCachingRuleFactory::HieroCachingRuleFactory( target_data_array, alignment, phrase_builder, scorer, vocabulary, max_rule_span, min_gap_size, max_nonterminals, max_rule_symbols, true, false, require_tight_phrases); - sampler = make_shared(source_suffix_array, max_samples); + sampler = make_shared( + source_suffix_array, max_samples); } HieroCachingRuleFactory::HieroCachingRuleFactory( diff --git a/extractor/sampler.cc b/extractor/sampler.cc deleted file mode 100644 index 887aaec1..00000000 --- a/extractor/sampler.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include "sampler.h" - -#include "phrase_location.h" -#include "suffix_array.h" - -namespace extractor { - -Sampler::Sampler(shared_ptr suffix_array, int max_samples) : - suffix_array(suffix_array), max_samples(max_samples) {} - -Sampler::Sampler() {} - -Sampler::~Sampler() {} - -PhraseLocation Sampler::Sample( - const PhraseLocation& location, - const unordered_set& blacklisted_sentence_ids) const { - shared_ptr source_data_array = suffix_array->GetData(); - vector sample; - int num_subpatterns; - if (location.matchings == NULL) { - // Sample suffix array range. - num_subpatterns = 1; - int low = location.sa_low, high = location.sa_high; - double step = max(1.0, (double) (high - low) / max_samples); - double i = low, last = i - 1; - while (sample.size() < max_samples && i < high) { - int x = suffix_array->GetSuffix(Round(i)); - int id = source_data_array->GetSentenceId(x); - bool found = false; - if (blacklisted_sentence_ids.count(id)) { - for (int backoff_step = 1; backoff_step <= step; ++backoff_step) { - double j = i - backoff_step; - x = suffix_array->GetSuffix(Round(j)); - id = source_data_array->GetSentenceId(x); - if (x >= 0 && j > last && !blacklisted_sentence_ids.count(id)) { - found = true; - last = i; - break; - } - double k = i + backoff_step; - x = suffix_array->GetSuffix(Round(k)); - id = source_data_array->GetSentenceId(x); - if (k < min(i+step, (double) high) && - !blacklisted_sentence_ids.count(id)) { - found = true; - last = k; - break; - } - } - } else { - found = true; - last = i; - } - if (found) sample.push_back(x); - i += step; - } - } else { - // Sample vector of occurrences. - num_subpatterns = location.num_subpatterns; - int num_matchings = location.matchings->size() / num_subpatterns; - double step = max(1.0, (double) num_matchings / max_samples); - for (double i = 0, num_samples = 0; - i < num_matchings && num_samples < max_samples; - i += step, ++num_samples) { - int start = Round(i) * num_subpatterns; - sample.insert(sample.end(), location.matchings->begin() + start, - location.matchings->begin() + start + num_subpatterns); - } - } - return PhraseLocation(sample, num_subpatterns); -} - -int Sampler::Round(double x) const { - return x + 0.5; -} - -} // namespace extractor diff --git a/extractor/sampler.h b/extractor/sampler.h index bd8a5876..3c4e37f1 100644 --- a/extractor/sampler.h +++ b/extractor/sampler.h @@ -4,38 +4,20 @@ #include #include -#include "data_array.h" - using namespace std; namespace extractor { class PhraseLocation; -class SuffixArray; /** - * Provides uniform sampling for a PhraseLocation. + * Base sampler class. */ class Sampler { public: - Sampler(shared_ptr suffix_array, int max_samples); - - virtual ~Sampler(); - - // Samples uniformly at most max_samples phrase occurrences. virtual PhraseLocation Sample( const PhraseLocation& location, - const unordered_set& blacklisted_sentence_ids) const; - - protected: - Sampler(); - - private: - // Round floating point number to the nearest integer. - int Round(double x) const; - - shared_ptr suffix_array; - int max_samples; + const unordered_set& blacklisted_sentence_ids) const = 0; }; } // namespace extractor diff --git a/extractor/sampler_test.cc b/extractor/sampler_test.cc deleted file mode 100644 index 14e72780..00000000 --- a/extractor/sampler_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -#include - -#include - -#include "mocks/mock_suffix_array.h" -#include "mocks/mock_data_array.h" -#include "phrase_location.h" -#include "sampler.h" - -using namespace std; -using namespace ::testing; - -namespace extractor { -namespace { - -class SamplerTest : public Test { - protected: - virtual void SetUp() { - source_data_array = make_shared(); - EXPECT_CALL(*source_data_array, GetSentenceId(_)).WillRepeatedly(Return(9999)); - suffix_array = make_shared(); - EXPECT_CALL(*suffix_array, GetData()) - .WillRepeatedly(Return(source_data_array)); - for (int i = 0; i < 10; ++i) { - EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); - } - } - - shared_ptr suffix_array; - shared_ptr sampler; - shared_ptr source_data_array; -}; - -TEST_F(SamplerTest, TestSuffixArrayRange) { - PhraseLocation location(0, 10); - unordered_set blacklist; - - sampler = make_shared(suffix_array, 1); - vector expected_locations = {0}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), - sampler->Sample(location, blacklist)); - return; - - sampler = make_shared(suffix_array, 2); - expected_locations = {0, 5}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), - sampler->Sample(location, blacklist)); - - sampler = make_shared(suffix_array, 3); - expected_locations = {0, 3, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), - sampler->Sample(location, blacklist)); - - sampler = make_shared(suffix_array, 4); - expected_locations = {0, 3, 5, 8}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), - sampler->Sample(location, blacklist)); - - sampler = make_shared(suffix_array, 100); - expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), - sampler->Sample(location, blacklist)); -} - -TEST_F(SamplerTest, TestSubstringsSample) { - vector locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - unordered_set blacklist; - PhraseLocation location(locations, 2); - - sampler = make_shared(suffix_array, 1); - vector expected_locations = {0, 1}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), - sampler->Sample(location, blacklist)); - - sampler = make_shared(suffix_array, 2); - expected_locations = {0, 1, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), - sampler->Sample(location, blacklist)); - - sampler = make_shared(suffix_array, 3); - expected_locations = {0, 1, 4, 5, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), - sampler->Sample(location, blacklist)); - - sampler = make_shared(suffix_array, 7); - expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), - sampler->Sample(location, blacklist)); -} - -} // namespace -} // namespace extractor diff --git a/extractor/sampler_test_blacklist.cc b/extractor/sampler_test_blacklist.cc deleted file mode 100644 index 3305b990..00000000 --- a/extractor/sampler_test_blacklist.cc +++ /dev/null @@ -1,102 +0,0 @@ -#include - -#include - -#include "mocks/mock_suffix_array.h" -#include "mocks/mock_data_array.h" -#include "phrase_location.h" -#include "sampler.h" - -using namespace std; -using namespace ::testing; - -namespace extractor { -namespace { - -class SamplerTestBlacklist : public Test { - protected: - virtual void SetUp() { - source_data_array = make_shared(); - for (int i = 0; i < 10; ++i) { - EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(i)); - } - for (int i = -10; i < 0; ++i) { - EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(0)); - } - suffix_array = make_shared(); - for (int i = -10; i < 10; ++i) { - EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); - } - } - - shared_ptr suffix_array; - shared_ptr sampler; - shared_ptr source_data_array; -}; - -TEST_F(SamplerTestBlacklist, TestSuffixArrayRange) { - PhraseLocation location(0, 10); - unordered_set blacklist; - vector expected_locations; - - blacklist.insert(0); - sampler = make_shared(suffix_array, 1); - expected_locations = {1}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - for (int i = 0; i < 9; i++) { - blacklist.insert(i); - } - sampler = make_shared(suffix_array, 1); - expected_locations = {9}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - blacklist.insert(0); - blacklist.insert(5); - sampler = make_shared(suffix_array, 2); - expected_locations = {1, 4}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - blacklist.insert(0); - blacklist.insert(1); - blacklist.insert(2); - blacklist.insert(3); - sampler = make_shared(suffix_array, 2); - expected_locations = {4, 5}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - blacklist.insert(0); - blacklist.insert(3); - blacklist.insert(7); - sampler = make_shared(suffix_array, 3); - expected_locations = {1, 2, 6}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - blacklist.insert(0); - blacklist.insert(3); - blacklist.insert(5); - blacklist.insert(8); - sampler = make_shared(suffix_array, 4); - expected_locations = {1, 2, 4, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - blacklist.insert(0); - sampler = make_shared(suffix_array, 100); - expected_locations = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); - blacklist.clear(); - - blacklist.insert(9); - sampler = make_shared(suffix_array, 100); - expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); -} - -} // namespace -} // namespace extractor diff --git a/extractor/suffix_array_sampler.cc b/extractor/suffix_array_sampler.cc new file mode 100644 index 00000000..4a4ced34 --- /dev/null +++ b/extractor/suffix_array_sampler.cc @@ -0,0 +1,40 @@ +#include "suffix_array_sampler.h" + +#include "data_array.h" +#include "phrase_location.h" +#include "suffix_array.h" + +namespace extractor { + +SuffixArrayRangeSampler::SuffixArrayRangeSampler( + shared_ptr source_suffix_array, int max_samples) : + BackoffSampler(source_suffix_array->GetData(), max_samples), + source_suffix_array(source_suffix_array) {} + +SuffixArrayRangeSampler::SuffixArrayRangeSampler() {} + +int SuffixArrayRangeSampler::GetNumSubpatterns(const PhraseLocation&) const { + return 1; +} + +int SuffixArrayRangeSampler::GetRangeLow( + const PhraseLocation& location) const { + return location.sa_low; +} + +int SuffixArrayRangeSampler::GetRangeHigh( + const PhraseLocation& location) const { + return location.sa_high; +} + +int SuffixArrayRangeSampler::GetPosition( + const PhraseLocation&, int position) const { + return source_suffix_array->GetSuffix(position); +} + +void SuffixArrayRangeSampler::AppendMatching( + vector& samples, int index, const PhraseLocation&) const { + samples.push_back(source_suffix_array->GetSuffix(index)); +} + +} // namespace extractor diff --git a/extractor/suffix_array_sampler.h b/extractor/suffix_array_sampler.h new file mode 100644 index 00000000..bb3c2653 --- /dev/null +++ b/extractor/suffix_array_sampler.h @@ -0,0 +1,34 @@ +#ifndef _SUFFIX_ARRAY_SAMPLER_H_ +#define _SUFFIX_ARRAY_SAMPLER_H_ + +#include "backoff_sampler.h" + +namespace extractor { + +class SuffixArray; + +class SuffixArrayRangeSampler : public BackoffSampler { + public: + SuffixArrayRangeSampler(shared_ptr suffix_array, + int max_samples); + + SuffixArrayRangeSampler(); + + private: + int GetNumSubpatterns(const PhraseLocation& location) const; + + int GetRangeLow(const PhraseLocation& location) const; + + int GetRangeHigh(const PhraseLocation& location) const; + + int GetPosition(const PhraseLocation& location, int index) const; + + void AppendMatching(vector& samples, int index, + const PhraseLocation& location) const; + + shared_ptr source_suffix_array; +}; + +} // namespace extractor + +#endif diff --git a/extractor/suffix_array_sampler_test.cc b/extractor/suffix_array_sampler_test.cc new file mode 100644 index 00000000..4b88c027 --- /dev/null +++ b/extractor/suffix_array_sampler_test.cc @@ -0,0 +1,114 @@ +#include + +#include + +#include "mocks/mock_data_array.h" +#include "mocks/mock_suffix_array.h" +#include "suffix_array_sampler.h" + +using namespace std; +using namespace ::testing; + +namespace extractor { +namespace { + +class SuffixArraySamplerTest : public Test { + protected: + virtual void SetUp() { + data_array = make_shared(); + for (int i = 0; i < 10; ++i) { + EXPECT_CALL(*data_array, GetSentenceId(i)).WillRepeatedly(Return(i)); + } + + suffix_array = make_shared(); + EXPECT_CALL(*suffix_array, GetData()).WillRepeatedly(Return(data_array)); + for (int i = 0; i < 10; ++i) { + EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); + } + } + + shared_ptr data_array; + shared_ptr suffix_array; +}; + +TEST_F(SuffixArraySamplerTest, TestSample) { + PhraseLocation location(0, 10); + unordered_set blacklisted_sentence_ids; + + SuffixArrayRangeSampler sampler(suffix_array, 1); + vector expected_locations = {0}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 2); + expected_locations = {0, 5}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 3); + expected_locations = {0, 3, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 4); + expected_locations = {0, 3, 5, 8}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 100); + expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); +} + +TEST_F(SuffixArraySamplerTest, TestBackoffSample) { + PhraseLocation location(0, 10); + + SuffixArrayRangeSampler sampler(suffix_array, 1); + unordered_set blacklisted_sentence_ids = {0}; + vector expected_locations = {1}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + expected_locations = {9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 2); + blacklisted_sentence_ids = {0, 5}; + expected_locations = {1, 4}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {0, 1, 2, 3}; + expected_locations = {4, 5}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 3); + blacklisted_sentence_ids = {0, 3, 7}; + expected_locations = {1, 2, 6}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 4); + blacklisted_sentence_ids = {0, 3, 5, 8}; + expected_locations = {1, 2, 4, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + sampler = SuffixArrayRangeSampler(suffix_array, 100); + blacklisted_sentence_ids = {0}; + expected_locations = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); + + blacklisted_sentence_ids = {9}; + expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), + sampler.Sample(location, blacklisted_sentence_ids)); +} + +} +} // namespace extractor -- cgit v1.2.3 From 84f9ead0ce9deca3f019d1af6a011fa7da08ff6a Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Thu, 28 Nov 2013 01:39:14 +0000 Subject: Fixes. --- extractor/backoff_sampler.cc | 26 +++++++++++++------------- extractor/matchings_sampler.cc | 5 +++-- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/extractor/backoff_sampler.cc b/extractor/backoff_sampler.cc index 28b12909..891276c6 100644 --- a/extractor/backoff_sampler.cc +++ b/extractor/backoff_sampler.cc @@ -16,47 +16,47 @@ PhraseLocation BackoffSampler::Sample( const unordered_set& blacklisted_sentence_ids) const { vector samples; int low = GetRangeLow(location), high = GetRangeHigh(location); - int last_position = low - 1; + int last = low - 1; double step = max(1.0, (double) (high - low) / max_samples); for (double num_samples = 0, i = low; num_samples < max_samples && i < high; ++num_samples, i += step) { - int position = GetPosition(location, round(i)); + int sample = round(i); + int position = GetPosition(location, sample); int sentence_id = source_data_array->GetSentenceId(position); bool found = false; - if (last_position >= position || + if (last >= sample || blacklisted_sentence_ids.count(sentence_id)) { for (double backoff_step = 1; backoff_step < step; ++backoff_step) { double j = i - backoff_step; - if (round(j) >= 0) { - position = GetPosition(location, round(j)); + sample = round(j); + if (sample >= 0) { + position = GetPosition(location, sample); sentence_id = source_data_array->GetSentenceId(position); - if (position > last_position && - !blacklisted_sentence_ids.count(sentence_id)) { + if (sample > last && !blacklisted_sentence_ids.count(sentence_id)) { found = true; - last_position = position; break; } } double k = i + backoff_step; - if (round(k) < high) { - position = GetPosition(location, round(k)); + sample = round(k); + if (sample < high) { + position = GetPosition(location, sample); sentence_id = source_data_array->GetSentenceId(position); if (!blacklisted_sentence_ids.count(sentence_id)) { found = true; - last_position = position; break; } } } } else { found = true; - last_position = position; } if (found) { - AppendMatching(samples, position, location); + last = sample; + AppendMatching(samples, sample, location); } } diff --git a/extractor/matchings_sampler.cc b/extractor/matchings_sampler.cc index bb916e49..75a62366 100644 --- a/extractor/matchings_sampler.cc +++ b/extractor/matchings_sampler.cc @@ -30,8 +30,9 @@ int MatchingsSampler::GetPosition(const PhraseLocation& location, void MatchingsSampler::AppendMatching(vector& samples, int index, const PhraseLocation& location) const { - copy(location.matchings->begin() + index, - location.matchings->begin() + index + location.num_subpatterns, + int start = index * location.num_subpatterns; + copy(location.matchings->begin() + start, + location.matchings->begin() + start + location.num_subpatterns, back_inserter(samples)); } -- cgit v1.2.3 From 8a2ed1a3ba8d5c0204ec76cec801c2396aaf27d3 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Thu, 28 Nov 2013 17:23:37 +0000 Subject: Fix mira on taipan/tiger. --- environment/LocalConfig.pm | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm index f7c3b1c7..4fa0ab74 100644 --- a/environment/LocalConfig.pm +++ b/environment/LocalConfig.pm @@ -63,11 +63,11 @@ my $CCONFIG = { 'JobControl' => 'fork', 'DefaultJobs' => 8, }, - 'OxfordDeathSnakes' => { - 'HOST_REGEXP' => qr/^(taipan|tiger).cs.ox.ac.uk$/, - 'JobControl' => 'fork', - 'DefaultJobs' => 12, - }, +# 'OxfordDeathSnakes' => { +# 'HOST_REGEXP' => qr/^(taipan|tiger).cs.ox.ac.uk$/, +# 'JobControl' => 'fork', +# 'DefaultJobs' => 12, +# }, 'cluster.cl.uni-heidelberg.de' => { 'HOST_REGEXP' => qr/node25/, 'JobControl' => 'qsub', -- cgit v1.2.3 From 728dada3675d3c71d4c544579b7fb53d766308f9 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sat, 30 Nov 2013 00:18:57 +0000 Subject: Update .gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index f964fa0c..a3f979db 100644 --- a/.gitignore +++ b/.gitignore @@ -189,8 +189,10 @@ utils/mfcr_test utils/phmt utils/reconstruct_weights utils/small_vector_test +utils/sv_test utils/ts utils/weights_test +training/crf/mpi_adagrad_optimize training/crf/mpi_batch_optimize training/crf/mpi_baum_welch training/crf/mpi_compute_cllh -- cgit v1.2.3 From cacd7f909597848dc62ff5bca61d5b1ca3c630fb Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Sat, 30 Nov 2013 00:49:20 +0000 Subject: Update extractor README. --- extractor/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/extractor/README.md b/extractor/README.md index c9db8de8..642fbd1d 100644 --- a/extractor/README.md +++ b/extractor/README.md @@ -1,8 +1,14 @@ C++ implementation of the online grammar extractor originally developed by [Adam Lopez](http://www.cs.jhu.edu/~alopez/). -To run the extractor you need to: +The grammar extraction takes place in two steps: (a) precomputing a number of data structures and (b) actually extracting the grammars. All the flags below have the same meaning as in the cython implementation. - cdec/extractor/run_extractor -t -a -b -g < > +To compile the data structures you need to run: + + cdec/extractor/compile -a -b -c -o + +To extract the grammars you need to run: + + cdec/extract/extract -t -c -g < > To run unit tests you need first to configure `cdec` with the [Google Test](https://code.google.com/p/googletest/) and [Google Mock](https://code.google.com/p/googlemock/) libraries: -- cgit v1.2.3 From 5f55de43762bbab3702edfee5ad3dc9c0a38db01 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 30 Nov 2013 18:03:30 -0500 Subject: fix l1 implementation to ensure greater sparsity --- training/crf/mpi_adagrad_optimize.cc | 58 ++++++++++++++++++++++++++++-------- training/crf/mpi_compute_cllh.cc | 1 + 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/training/crf/mpi_adagrad_optimize.cc b/training/crf/mpi_adagrad_optimize.cc index af963e3a..e1ee789c 100644 --- a/training/crf/mpi_adagrad_optimize.cc +++ b/training/crf/mpi_adagrad_optimize.cc @@ -188,7 +188,7 @@ class AdaGradOptimizer { explicit AdaGradOptimizer(double e) : eta(e), G() {} - void update(const SparseVector& g, vector* x) { + void update(const SparseVector& g, vector* x, SparseVector* sx) { if (x->size() > G.size()) G.resize(x->size(), 0.0); #if HAVE_CXX11 for (auto& gi : g) { @@ -199,6 +199,7 @@ class AdaGradOptimizer { if (gi.second) { G[gi.first] += gi.second * gi.second; (*x)[gi.first] -= eta / sqrt(G[gi.first]) * gi.second; + sx->add_value(gi.first, -eta / sqrt(G[gi.first]) * gi.second); } } } @@ -213,7 +214,7 @@ class AdaGradL1Optimizer { eta(e), lambda(l), G() {} - void update(const SparseVector& g, vector* x) { + void update(const SparseVector& g, vector* x, SparseVector* sx) { t += 1.0; if (x->size() > G.size()) { G.resize(x->size(), 0.0); @@ -228,13 +229,37 @@ class AdaGradL1Optimizer { if (gi.second) { u[gi.first] += gi.second; G[gi.first] += gi.second * gi.second; - double z = fabs(u[gi.first] / t) - lambda; - double s = 1; - if (u[gi.first] > 0) s = -1; - if (z > 0 && G[gi.first]) - (*x)[gi.first] = eta * s * z * t / sqrt(G[gi.first]); - else - (*x)[gi.first] = 0.0; + sx->set_value(gi.first, 1.0); // this is a dummy value to trigger recomputation + } + } + + // compute updates (avoid invalidating iterators by putting them all + // in the vector vupdate and applying them after this) + vector> vupdate; +#if HAVE_CXX11 + for (auto& xi : *sx) { +#else + for (SparseVector::const_iterator it = sx->begin(); it != sx->end(); ++it) { + const pair& gi = *it; +#endif + double z = fabs(u[xi.first] / t) - lambda; + double s = 1; + if (u[xi.first] > 0) s = -1; + if (z > 0 && G[xi.first]) { + vupdate.push_back(make_pair(xi.first, eta * s * z * t / sqrt(G[xi.first]))); + } else { + vupdate.push_back(make_pair(xi.first, 0.0)); + } + } + + // apply updates + for (unsigned i = 0; i < vupdate.size(); ++i) { + if (vupdate[i].second) { + sx->set_value(vupdate[i].first, vupdate[i].second); + (*x)[vupdate[i].first] = vupdate[i].second; + } else { + (*x)[vupdate[i].first] = 0.0; + sx->erase(vupdate[i].first); } } } @@ -323,6 +348,8 @@ int main(int argc, char** argv) { lambdas.swap(init_weights); init_weights.clear(); } + SparseVector lambdas_sparse; + Weights::InitSparseVector(lambdas, &lambdas_sparse); //AdaGradOptimizer adagrad(conf["eta"].as()); AdaGradL1Optimizer adagrad(conf["eta"].as(), conf["regularization_strength"].as()); @@ -338,6 +365,13 @@ int main(int argc, char** argv) { mpi::timer timer; #endif ++iter; + if (iter > 1) { + lambdas_sparse.init_vector(&lambdas); + if (rank == 0) { + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); + } + } observer.Reset(); if (rank == 0) { converged = (iter == max_iteration); @@ -376,12 +410,10 @@ int main(int argc, char** argv) { if (rank == 0) { g /= minibatch_size; lambdas.resize(FD::NumFeats(), 0.0); // might have seen new features - adagrad.update(g, &lambdas); - Weights::SanityCheck(lambdas); - Weights::ShowLargestFeatures(lambdas); + adagrad.update(g, &lambdas, &lambdas_sparse); } #ifdef HAVE_MPI - broadcast(world, lambdas, 0); + broadcast(world, lambdas_sparse, 0); broadcast(world, converged, 0); world.barrier(); if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; } diff --git a/training/crf/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc index 066389d0..7e38da3a 100644 --- a/training/crf/mpi_compute_cllh.cc +++ b/training/crf/mpi_compute_cllh.cc @@ -120,6 +120,7 @@ int main(int argc, char** argv) { reduce(world, observer.trg_words, total_words, std::plus(), 0); #else objective = observer.acc_obj; + total_words = observer.trg_words; #endif if (rank == 0) { -- cgit v1.2.3 From 407b100cd3e4ae987504b53101151fba287ad999 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 30 Nov 2013 22:05:08 -0500 Subject: fix format --- training/crf/mpi_adagrad_optimize.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/crf/mpi_adagrad_optimize.cc b/training/crf/mpi_adagrad_optimize.cc index e1ee789c..39bd763e 100644 --- a/training/crf/mpi_adagrad_optimize.cc +++ b/training/crf/mpi_adagrad_optimize.cc @@ -387,7 +387,7 @@ int main(int argc, char** argv) { } ostringstream vv; double minutes = (cur_time - start_time) / 60.0; - vv << "total walltime=" << minutes << "min iter=" << iter << " minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << non_zeros(lambdas) << '/' << FD::NumFeats() << " passes_thru_data=" << (iter * size_per_proc / static_cast(corpus.size())); + vv << "total walltime=" << minutes << " min iter=" << iter << " minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << non_zeros(lambdas) << '/' << FD::NumFeats() << " passes_thru_data=" << (iter * size_per_proc / static_cast(corpus.size())); const string svv = vv.str(); cerr << svv << endl; Weights::WriteToFile(fname, lambdas, true, &svv); -- cgit v1.2.3