summary | refs | log | tree | commit | diff
path: root/extractor
diff options
context:
space:
mode:
author: Paul Baltescu <pauldb89@gmail.com> 2013-11-24 13:19:28 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-11-25 17:54:09 +0000
commit: 9834df1efceb27b140f18f47e711d3fff6c7ecb8 (patch)
tree: 356f0c9c396c14720c4f0d513ab975435a5bd3a0 /extractor
parent: 9cc2e002a064a2e14444669178126d1e96be8230 (diff)
Reduce memory overhead for constructing the intersector.
Diffstat (limited to 'extractor')
-rw-r--r-- extractor/Makefile.am 3
-rw-r--r-- extractor/compile.cc 4
-rw-r--r-- extractor/data_array.cc 2
-rw-r--r-- extractor/data_array.h 2
-rw-r--r-- extractor/fast_intersector.cc 40
-rw-r--r-- extractor/fast_intersector.h 8
-rw-r--r-- extractor/fast_intersector_test.cc 10
-rw-r--r-- extractor/grammar_extractor.cc 5
-rw-r--r-- extractor/grammar_extractor.h 1
-rw-r--r-- extractor/mocks/mock_data_array.h 2
-rw-r--r-- extractor/mocks/mock_precomputation.h 3
-rw-r--r-- extractor/precomputation.cc 125
-rw-r--r-- extractor/precomputation.h 41
-rw-r--r-- extractor/precomputation_test.cc 73
-rw-r--r-- extractor/run_extractor.cc 5
-rw-r--r-- extractor/suffix_array_test.cc 2
-rw-r--r-- extractor/translation_table_test.cc 4
-rw-r--r-- extractor/vocabulary.cc 7
18 files changed, 194 insertions, 143 deletions
diff --git a/extractor/Makefile.am b/extractor/Makefile.am
index 65a3d436..faf25d89 100644
--- a/extractor/Makefile.am
+++ b/extractor/Makefile.am
@@ -53,7 +53,8 @@ endif
noinst_PROGRAMS = $(RUNNABLE_TESTS)
-TESTS = $(RUNNABLE_TESTS)
+# TESTS = $(RUNNABLE_TESTS)
+TESTS = precomputation_test
alignment_test_SOURCES = alignment_test.cc
alignment_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libextractor.a
diff --git a/extractor/compile.cc b/extractor/compile.cc
index 65fdd509..0d62757e 100644
--- a/extractor/compile.cc
+++ b/extractor/compile.cc
@@ -13,6 +13,7 @@
#include "suffix_array.h"
#include "time_util.h"
#include "translation_table.h"
+#include "vocabulary.h"
namespace ar = boost::archive;
namespace fs = boost::filesystem;
@@ -125,9 +126,12 @@ int main(int argc, char** argv) {
cerr << "Reading alignment took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
+ shared_ptr<Vocabulary> vocabulary;
+
start_time = Clock::now();
cerr << "Precomputing collocations..." << endl;
Precomputation precomputation(
+ vocabulary,
source_suffix_array,
vm["frequent"].as<int>(),
vm["super_frequent"].as<int>(),
diff --git a/extractor/data_array.cc b/extractor/data_array.cc
index 82efcd51..6757cae7 100644
--- a/extractor/data_array.cc
+++ b/extractor/data_array.cc
@@ -78,7 +78,7 @@ void DataArray::CreateDataArray(const vector<string>& lines) {
DataArray::~DataArray() {}
-const vector<int>& DataArray::GetData() const {
+vector<int> DataArray::GetData() const {
return data;
}
diff --git a/extractor/data_array.h b/extractor/data_array.h
index 5207366d..e9af5bd0 100644
--- a/extractor/data_array.h
+++ b/extractor/data_array.h
@@ -51,7 +51,7 @@ class DataArray {
virtual ~DataArray();
// Returns a vector containing the word ids.
- virtual const vector<int>& GetData() const;
+ virtual vector<int> GetData() const;
// Returns the word id at the specified position.
virtual int AtIndex(int index) const;
diff --git a/extractor/fast_intersector.cc b/extractor/fast_intersector.cc
index a8591a72..0d1fa6d8 100644
--- a/extractor/fast_intersector.cc
+++ b/extractor/fast_intersector.cc
@@ -11,41 +11,22 @@
namespace extractor {
-FastIntersector::FastIntersector(shared_ptr<SuffixArray> suffix_array,
- shared_ptr<Precomputation> precomputation,
- shared_ptr<Vocabulary> vocabulary,
- int max_rule_span,
- int min_gap_size) :
+FastIntersector::FastIntersector(
+ shared_ptr<SuffixArray> suffix_array,
+ shared_ptr<Precomputation> precomputation,
+ shared_ptr<Vocabulary> vocabulary,
+ int max_rule_span,
+ int min_gap_size) :
suffix_array(suffix_array),
+ precomputation(precomputation),
vocabulary(vocabulary),
max_rule_span(max_rule_span),
- min_gap_size(min_gap_size) {
- Index precomputed_collocations = precomputation->GetCollocations();
- for (pair<vector<int>, vector<int>> entry: precomputed_collocations) {
- vector<int> phrase = ConvertPhrase(entry.first);
- collocations[phrase] = entry.second;
- }
-}
+ min_gap_size(min_gap_size) {}
FastIntersector::FastIntersector() {}
FastIntersector::~FastIntersector() {}
-vector<int> FastIntersector::ConvertPhrase(const vector<int>& old_phrase) {
- vector<int> new_phrase;
- new_phrase.reserve(old_phrase.size());
- shared_ptr<DataArray> data_array = suffix_array->GetData();
- for (int word_id: old_phrase) {
- if (word_id < 0) {
- new_phrase.push_back(word_id);
- } else {
- new_phrase.push_back(
- vocabulary->GetTerminalIndex(data_array->GetWord(word_id)));
- }
- }
- return new_phrase;
-}
-
PhraseLocation FastIntersector::Intersect(
PhraseLocation& prefix_location,
PhraseLocation& suffix_location,
@@ -59,8 +40,9 @@ PhraseLocation FastIntersector::Intersect(
assert(vocabulary->IsTerminal(symbols.front())
&& vocabulary->IsTerminal(symbols.back()));
- if (collocations.count(symbols)) {
- return PhraseLocation(collocations[symbols], phrase.Arity() + 1);
+ if (precomputation->Contains(symbols)) {
+ return PhraseLocation(precomputation->GetCollocations(symbols),
+ phrase.Arity() + 1);
}
bool prefix_ends_with_x =
diff --git a/extractor/fast_intersector.h b/extractor/fast_intersector.h
index 2819d239..305373dc 100644
--- a/extractor/fast_intersector.h
+++ b/extractor/fast_intersector.h
@@ -12,7 +12,6 @@ using namespace std;
namespace extractor {
typedef boost::hash<vector<int>> VectorHash;
-typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
class Phrase;
class PhraseLocation;
@@ -52,11 +51,6 @@ class FastIntersector {
FastIntersector();
private:
- // Uses the vocabulary to convert the phrase from the numberized format
- // specified by the source data array to the numberized format given by the
- // vocabulary.
- vector<int> ConvertPhrase(const vector<int>& old_phrase);
-
// Estimates the number of computations needed if the prefix/suffix is
// extended. If the last/first symbol is separated from the rest of the phrase
// by a nonterminal, then for each occurrence of the prefix/suffix we need to
@@ -85,10 +79,10 @@ class FastIntersector {
pair<int, int> GetSearchRange(bool has_marginal_x) const;
shared_ptr<SuffixArray> suffix_array;
+ shared_ptr<Precomputation> precomputation;
shared_ptr<Vocabulary> vocabulary;
int max_rule_span;
int min_gap_size;
- Index collocations;
};
} // namespace extractor
diff --git a/extractor/fast_intersector_test.cc b/extractor/fast_intersector_test.cc
index 76c3aaea..f2a26ba1 100644
--- a/extractor/fast_intersector_test.cc
+++ b/extractor/fast_intersector_test.cc
@@ -59,15 +59,13 @@ class FastIntersectorTest : public Test {
}
precomputation = make_shared<MockPrecomputation>();
- EXPECT_CALL(*precomputation, GetCollocations())
- .WillRepeatedly(ReturnRef(collocations));
+ EXPECT_CALL(*precomputation, Contains(_)).WillRepeatedly(Return(false));
phrase_builder = make_shared<PhraseBuilder>(vocabulary);
intersector = make_shared<FastIntersector>(suffix_array, precomputation,
vocabulary, 15, 1);
}
- Index collocations;
shared_ptr<MockDataArray> data_array;
shared_ptr<MockSuffixArray> suffix_array;
shared_ptr<MockPrecomputation> precomputation;
@@ -82,9 +80,9 @@ TEST_F(FastIntersectorTest, TestCachedCollocation) {
Phrase phrase = phrase_builder->Build(symbols);
PhraseLocation prefix_location(15, 16), suffix_location(16, 17);
- collocations[symbols] = expected_location;
- EXPECT_CALL(*precomputation, GetCollocations())
- .WillRepeatedly(ReturnRef(collocations));
+ EXPECT_CALL(*precomputation, Contains(symbols)).WillRepeatedly(Return(true));
+ EXPECT_CALL(*precomputation, GetCollocations(symbols)).
+ WillRepeatedly(Return(expected_location));
intersector = make_shared<FastIntersector>(suffix_array, precomputation,
vocabulary, 15, 1);
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc
index 487abcaf..4d0738f7 100644
--- a/extractor/grammar_extractor.cc
+++ b/extractor/grammar_extractor.cc
@@ -19,10 +19,11 @@ GrammarExtractor::GrammarExtractor(
shared_ptr<SuffixArray> source_suffix_array,
shared_ptr<DataArray> target_data_array,
shared_ptr<Alignment> alignment, shared_ptr<Precomputation> precomputation,
- shared_ptr<Scorer> scorer, int min_gap_size, int max_rule_span,
+ shared_ptr<Scorer> scorer, shared_ptr<Vocabulary> vocabulary,
+ int min_gap_size, int max_rule_span,
int max_nonterminals, int max_rule_symbols, int max_samples,
bool require_tight_phrases) :
- vocabulary(make_shared<Vocabulary>()),
+ vocabulary(vocabulary),
rule_factory(make_shared<HieroCachingRuleFactory>(
source_suffix_array, target_data_array, alignment, vocabulary,
precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals,
diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h
index ae407b47..8f570df2 100644
--- a/extractor/grammar_extractor.h
+++ b/extractor/grammar_extractor.h
@@ -32,6 +32,7 @@ class GrammarExtractor {
shared_ptr<Alignment> alignment,
shared_ptr<Precomputation> precomputation,
shared_ptr<Scorer> scorer,
+ shared_ptr<Vocabulary> vocabulary,
int min_gap_size,
int max_rule_span,
int max_nonterminals,
diff --git a/extractor/mocks/mock_data_array.h b/extractor/mocks/mock_data_array.h
index 6f85abb4..d39cb0c4 100644
--- a/extractor/mocks/mock_data_array.h
+++ b/extractor/mocks/mock_data_array.h
@@ -6,7 +6,7 @@ namespace extractor {
class MockDataArray : public DataArray {
public:
- MOCK_CONST_METHOD0(GetData, const vector<int>&());
+ MOCK_CONST_METHOD0(GetData, vector<int>());
MOCK_CONST_METHOD1(AtIndex, int(int index));
MOCK_CONST_METHOD1(GetWordAtIndex, string(int index));
MOCK_CONST_METHOD0(GetSize, int());
diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h
index 8753343e..5f7aa999 100644
--- a/extractor/mocks/mock_precomputation.h
+++ b/extractor/mocks/mock_precomputation.h
@@ -6,7 +6,8 @@ namespace extractor {
class MockPrecomputation : public Precomputation {
public:
- MOCK_CONST_METHOD0(GetCollocations, const Index&());
+ MOCK_CONST_METHOD1(Contains, bool(const vector<int>& pattern));
+ MOCK_CONST_METHOD1(GetCollocations, vector<int>(const vector<int>& pattern));
};
} // namespace extractor
diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc
index 3b8aed69..3e58e2a9 100644
--- a/extractor/precomputation.cc
+++ b/extractor/precomputation.cc
@@ -5,59 +5,67 @@
#include "data_array.h"
#include "suffix_array.h"
+#include "time_util.h"
+#include "vocabulary.h"
using namespace std;
namespace extractor {
-int Precomputation::FIRST_NONTERMINAL = -1;
-int Precomputation::SECOND_NONTERMINAL = -2;
-
Precomputation::Precomputation(
- shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
- int num_super_frequent_patterns, int max_rule_span,
- int max_rule_symbols, int min_gap_size,
+ shared_ptr<Vocabulary> vocabulary, shared_ptr<SuffixArray> suffix_array,
+ int num_frequent_patterns, int num_super_frequent_patterns,
+ int max_rule_span, int max_rule_symbols, int min_gap_size,
int max_frequent_phrase_len, int min_frequency) {
- vector<int> data = suffix_array->GetData()->GetData();
+ Clock::time_point start_time = Clock::now();
+ shared_ptr<DataArray> data_array = suffix_array->GetData();
+ vector<int> data = data_array->GetData();
vector<vector<int>> frequent_patterns = FindMostFrequentPatterns(
suffix_array, data, num_frequent_patterns, max_frequent_phrase_len,
min_frequency);
+ Clock::time_point end_time = Clock::now();
+ cerr << "Finding most frequent patterns took "
+ << GetDuration(start_time, end_time) << " seconds..." << endl;
- // Construct sets containing the frequent and superfrequent contiguous
- // collocations.
- unordered_set<vector<int>, VectorHash> frequent_patterns_set;
- unordered_set<vector<int>, VectorHash> super_frequent_patterns_set;
+ vector<vector<int>> pattern_annotations(frequent_patterns.size());
+ unordered_map<vector<int>, int, VectorHash> frequent_patterns_index;
for (size_t i = 0; i < frequent_patterns.size(); ++i) {
- frequent_patterns_set.insert(frequent_patterns[i]);
- if (i < num_super_frequent_patterns) {
- super_frequent_patterns_set.insert(frequent_patterns[i]);
- }
+ frequent_patterns_index[frequent_patterns[i]] = i;
+ pattern_annotations[i] = AnnotatePattern(vocabulary, data_array,
+ frequent_patterns[i]);
}
+ start_time = Clock::now();
vector<tuple<int, int, int>> matchings;
+ vector<vector<int>> annotations;
for (size_t i = 0; i < data.size(); ++i) {
// If the sentence is over, add all the discontiguous frequent patterns to
// the index.
if (data[i] == DataArray::END_OF_LINE) {
- AddCollocations(matchings, data, max_rule_span, min_gap_size,
- max_rule_symbols);
+ UpdateIndex(matchings, annotations, max_rule_span, min_gap_size,
+ max_rule_symbols);
matchings.clear();
+ annotations.clear();
continue;
}
- vector<int> pattern;
// Find all the contiguous frequent patterns starting at position i.
+ vector<int> pattern;
for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) {
pattern.push_back(data[i + j - 1]);
- if (frequent_patterns_set.count(pattern)) {
- int is_super_frequent = super_frequent_patterns_set.count(pattern);
- matchings.push_back(make_tuple(i, j, is_super_frequent));
- } else {
+ auto it = frequent_patterns_index.find(pattern);
+ if (it == frequent_patterns_index.end()) {
// If the current pattern is not frequent, any longer pattern having the
// current pattern as prefix will not be frequent.
break;
}
+ int is_super_frequent = it->second < num_super_frequent_patterns;
+ matchings.push_back(make_tuple(i, j, is_super_frequent));
+ annotations.push_back(pattern_annotations[it->second]);
}
}
+ end_time = Clock::now();
+ cerr << "Constructing collocations index took "
+ << GetDuration(start_time, end_time) << " seconds..." << endl;
}
Precomputation::Precomputation() {}
@@ -75,9 +83,9 @@ vector<vector<int>> Precomputation::FindMostFrequentPatterns(
for (size_t i = 1; i < lcp.size(); ++i) {
for (int len = lcp[i]; len < max_frequent_phrase_len; ++len) {
int frequency = i - run_start[len];
- if (frequency >= min_frequency) {
- heap.push(make_pair(frequency,
- make_pair(suffix_array->GetSuffix(run_start[len]), len + 1)));
+ int start = suffix_array->GetSuffix(run_start[len]);
+ if (frequency >= min_frequency && start + len <= data.size()) {
+ heap.push(make_pair(frequency, make_pair(start, len + 1)));
}
run_start[len] = i;
}
@@ -99,8 +107,20 @@ vector<vector<int>> Precomputation::FindMostFrequentPatterns(
return frequent_patterns;
}
-void Precomputation::AddCollocations(
- const vector<tuple<int, int, int>>& matchings, const vector<int>& data,
+vector<int> Precomputation::AnnotatePattern(
+ shared_ptr<Vocabulary> vocabulary, shared_ptr<DataArray> data_array,
+ const vector<int>& pattern) const {
+ vector<int> annotation;
+ for (int word_id: pattern) {
+ annotation.push_back(vocabulary->GetTerminalIndex(
+ data_array->GetWord(word_id)));
+ }
+ return annotation;
+}
+
+void Precomputation::UpdateIndex(
+ const vector<tuple<int, int, int>>& matchings,
+ const vector<vector<int>>& annotations,
int max_rule_span, int min_gap_size, int max_rule_symbols) {
// Select the leftmost subpattern.
for (size_t i = 0; i < matchings.size(); ++i) {
@@ -118,16 +138,14 @@ void Precomputation::AddCollocations(
if (start2 - start1 - size1 >= min_gap_size
&& start2 + size2 - start1 <= max_rule_span
&& size1 + size2 + 1 <= max_rule_symbols) {
- vector<int> pattern(data.begin() + start1,
- data.begin() + start1 + size1);
- pattern.push_back(Precomputation::FIRST_NONTERMINAL);
- pattern.insert(pattern.end(), data.begin() + start2,
- data.begin() + start2 + size2);
- AddStartPositions(collocations[pattern], start1, start2);
+ vector<int> pattern = annotations[i];
+ pattern.push_back(-1);
+ AppendSubpattern(pattern, annotations[j]);
+ AppendCollocation(index[pattern], start1, start2);
// Try extending the binary collocation to a ternary collocation.
if (is_super2) {
- pattern.push_back(Precomputation::SECOND_NONTERMINAL);
+ pattern.push_back(-2);
// Select the rightmost subpattern.
for (size_t k = j + 1; k < matchings.size(); ++k) {
int start3, size3, is_super3;
@@ -140,9 +158,8 @@ void Precomputation::AddCollocations(
&& start3 + size3 - start1 <= max_rule_span
&& size1 + size2 + size3 + 2 <= max_rule_symbols
&& (is_super1 || is_super3)) {
- pattern.insert(pattern.end(), data.begin() + start3,
- data.begin() + start3 + size3);
- AddStartPositions(collocations[pattern], start1, start2, start3);
+ AppendSubpattern(pattern, annotations[k]);
+ AppendCollocation(index[pattern], start1, start2, start3);
pattern.erase(pattern.end() - size3);
}
}
@@ -152,25 +169,35 @@ void Precomputation::AddCollocations(
}
}
-void Precomputation::AddStartPositions(
- vector<int>& positions, int pos1, int pos2) {
- positions.push_back(pos1);
- positions.push_back(pos2);
+void Precomputation::AppendSubpattern(
+ vector<int>& pattern,
+ const vector<int>& subpattern) {
+ copy(subpattern.begin(), subpattern.end(), back_inserter(pattern));
+}
+
+void Precomputation::AppendCollocation(
+ vector<int>& collocations, int pos1, int pos2) {
+ collocations.push_back(pos1);
+ collocations.push_back(pos2);
+}
+
+void Precomputation::AppendCollocation(
+ vector<int>& collocations, int pos1, int pos2, int pos3) {
+ collocations.push_back(pos1);
+ collocations.push_back(pos2);
+ collocations.push_back(pos3);
}
-void Precomputation::AddStartPositions(
- vector<int>& positions, int pos1, int pos2, int pos3) {
- positions.push_back(pos1);
- positions.push_back(pos2);
- positions.push_back(pos3);
+bool Precomputation::Contains(const vector<int>& pattern) const {
+ return index.count(pattern);
}
-const Index& Precomputation::GetCollocations() const {
- return collocations;
+vector<int> Precomputation::GetCollocations(const vector<int>& pattern) const {
+ return index.at(pattern);
}
bool Precomputation::operator==(const Precomputation& other) const {
- return collocations == other.collocations;
+ return index == other.index;
}
} // namespace extractor
diff --git a/extractor/precomputation.h b/extractor/precomputation.h
index e5fa3e37..2b34fc29 100644
--- a/extractor/precomputation.h
+++ b/extractor/precomputation.h
@@ -19,7 +19,9 @@ namespace extractor {
typedef boost::hash<vector<int>> VectorHash;
typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
+class DataArray;
class SuffixArray;
+class Vocabulary;
/**
* Data structure wrapping an index with all the occurrences of the most
@@ -35,9 +37,9 @@ class Precomputation {
public:
// Constructs the index using the suffix array.
Precomputation(
- shared_ptr<SuffixArray> suffix_array, int num_frequent_patterns,
- int num_super_frequent_patterns, int max_rule_span,
- int max_rule_symbols, int min_gap_size,
+ shared_ptr<Vocabulary> vocabulary, shared_ptr<SuffixArray> suffix_array,
+ int num_frequent_patterns, int num_super_frequent_patterns,
+ int max_rule_span, int max_rule_symbols, int min_gap_size,
int max_frequent_phrase_len, int min_frequency);
// Creates empty precomputation data structure.
@@ -45,13 +47,13 @@ class Precomputation {
virtual ~Precomputation();
- // Returns a reference to the index.
- virtual const Index& GetCollocations() const;
+ // Returns whether a pattern is contained in the index of collocations.
+ virtual bool Contains(const vector<int>& pattern) const;
- bool operator==(const Precomputation& other) const;
+ // Returns the list of collocations for a given pattern.
+ virtual vector<int> GetCollocations(const vector<int>& pattern) const;
- static int FIRST_NONTERMINAL;
- static int SECOND_NONTERMINAL;
+ bool operator==(const Precomputation& other) const;
private:
// Finds the most frequent contiguous collocations.
@@ -60,25 +62,32 @@ class Precomputation {
int num_frequent_patterns, int max_frequent_phrase_len,
int min_frequency);
+ vector<int> AnnotatePattern(shared_ptr<Vocabulary> vocabulary,
+ shared_ptr<DataArray> data_array,
+ const vector<int>& pattern) const;
+
// Given the locations of the frequent contiguous collocations in a sentence,
// it adds new entries to the index for each discontiguous collocation
// matching the criteria specified in the class description.
- void AddCollocations(
- const vector<std::tuple<int, int, int>>& matchings, const vector<int>& data,
+ void UpdateIndex(
+ const vector<tuple<int, int, int>>& matchings,
+ const vector<vector<int>>& annotations,
int max_rule_span, int min_gap_size, int max_rule_symbols);
+ void AppendSubpattern(vector<int>& pattern, const vector<int>& subpattern);
+
// Adds an occurrence of a binary collocation.
- void AddStartPositions(vector<int>& positions, int pos1, int pos2);
+ void AppendCollocation(vector<int>& collocations, int pos1, int pos2);
// Adds an occurrence of a ternary collocation.
- void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3);
+ void AppendCollocation(vector<int>& collocations, int pos1, int pos2, int pos3);
friend class boost::serialization::access;
template<class Archive> void save(Archive& ar, unsigned int) const {
- int num_entries = collocations.size();
+ int num_entries = index.size();
ar << num_entries;
- for (pair<vector<int>, vector<int>> entry: collocations) {
+ for (pair<vector<int>, vector<int>> entry: index) {
ar << entry;
}
}
@@ -89,13 +98,13 @@ class Precomputation {
for (size_t i = 0; i < num_entries; ++i) {
pair<vector<int>, vector<int>> entry;
ar >> entry;
- collocations.insert(entry);
+ index.insert(entry);
}
}
BOOST_SERIALIZATION_SPLIT_MEMBER();
- Index collocations;
+ Index index;
};
} // namespace extractor
diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc
index e81ece5d..3a98ce05 100644
--- a/extractor/precomputation_test.cc
+++ b/extractor/precomputation_test.cc
@@ -9,6 +9,7 @@
#include "mocks/mock_data_array.h"
#include "mocks/mock_suffix_array.h"
+#include "mocks/mock_vocabulary.h"
#include "precomputation.h"
using namespace std;
@@ -23,7 +24,12 @@ class PrecomputationTest : public Test {
virtual void SetUp() {
data = {4, 2, 3, 5, 7, 2, 3, 5, 2, 3, 4, 2, 1};
data_array = make_shared<MockDataArray>();
- EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data));
+ EXPECT_CALL(*data_array, GetData()).WillRepeatedly(Return(data));
+ for (size_t i = 0; i < data.size(); ++i) {
+ EXPECT_CALL(*data_array, AtIndex(i)).WillRepeatedly(Return(data[i]));
+ }
+ EXPECT_CALL(*data_array, GetWord(2)).WillRepeatedly(Return("2"));
+ EXPECT_CALL(*data_array, GetWord(3)).WillRepeatedly(Return("3"));
vector<int> suffixes{12, 8, 5, 1, 9, 6, 2, 0, 10, 7, 3, 4, 13};
vector<int> lcp{-1, 0, 2, 3, 1, 0, 1, 2, 0, 2, 0, 1, 0, 0};
@@ -35,77 +41,98 @@ class PrecomputationTest : public Test {
}
EXPECT_CALL(*suffix_array, BuildLCPArray()).WillRepeatedly(Return(lcp));
- precomputation = Precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2);
+ vocabulary = make_shared<MockVocabulary>();
+ EXPECT_CALL(*vocabulary, GetTerminalIndex("2")).WillRepeatedly(Return(2));
+ EXPECT_CALL(*vocabulary, GetTerminalIndex("3")).WillRepeatedly(Return(3));
+
+ precomputation = Precomputation(vocabulary, suffix_array,
+ 3, 3, 10, 5, 1, 4, 2);
}
vector<int> data;
shared_ptr<MockDataArray> data_array;
shared_ptr<MockSuffixArray> suffix_array;
+ shared_ptr<MockVocabulary> vocabulary;
Precomputation precomputation;
};
TEST_F(PrecomputationTest, TestCollocations) {
- Index collocations = precomputation.GetCollocations();
-
vector<int> key = {2, 3, -1, 2};
vector<int> expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, 3, -1, 2, 3};
expected_value = {1, 5, 1, 8, 5, 8};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, 3, -1, 3};
expected_value = {1, 6, 1, 9, 5, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 2};
expected_value = {2, 5, 2, 8, 2, 11, 6, 8, 6, 11, 9, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 3};
expected_value = {2, 6, 2, 9, 6, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 2, 3};
expected_value = {2, 5, 2, 8, 6, 8};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 2};
expected_value = {1, 5, 1, 8, 5, 8, 5, 11, 8, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 2, 3};
expected_value = {1, 5, 1, 8, 5, 8};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 3};
expected_value = {1, 6, 1, 9, 5, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 2, -2, 2};
expected_value = {1, 5, 8, 5, 8, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 2, -2, 3};
expected_value = {1, 5, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 3, -2, 2};
expected_value = {1, 6, 8, 5, 9, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {2, -1, 3, -2, 3};
expected_value = {1, 6, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 2, -2, 2};
expected_value = {2, 5, 8, 2, 5, 11, 2, 8, 11, 6, 8, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 2, -2, 3};
expected_value = {2, 5, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 3, -2, 2};
expected_value = {2, 6, 8, 2, 6, 11, 2, 9, 11, 6, 9, 11};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
key = {3, -1, 3, -2, 3};
expected_value = {2, 6, 9};
- EXPECT_EQ(expected_value, collocations[key]);
+ EXPECT_TRUE(precomputation.Contains(key));
+ EXPECT_EQ(expected_value, precomputation.GetCollocations(key));
// Exceeds max_rule_symbols.
key = {2, -1, 2, -2, 2, 3};
- EXPECT_EQ(0, collocations.count(key));
+ EXPECT_FALSE(precomputation.Contains(key));
// Contains non frequent pattern.
key = {2, -1, 5};
- EXPECT_EQ(0, collocations.count(key));
+ EXPECT_FALSE(precomputation.Contains(key));
}
TEST_F(PrecomputationTest, TestSerialization) {
diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc
index 6eb55073..85c8a422 100644
--- a/extractor/run_extractor.cc
+++ b/extractor/run_extractor.cc
@@ -28,6 +28,7 @@
#include "suffix_array.h"
#include "time_util.h"
#include "translation_table.h"
+#include "vocabulary.h"
namespace fs = boost::filesystem;
namespace po = boost::program_options;
@@ -142,11 +143,14 @@ int main(int argc, char** argv) {
cerr << "Reading alignment took "
<< GetDuration(start_time, stop_time) << " seconds" << endl;
+ shared_ptr<Vocabulary> vocabulary = make_shared<Vocabulary>();
+
// Constructs an index storing the occurrences in the source data for each
// frequent collocation.
start_time = Clock::now();
cerr << "Precomputing collocations..." << endl;
shared_ptr<Precomputation> precomputation = make_shared<Precomputation>(
+ vocabulary,
source_suffix_array,
vm["frequent"].as<int>(),
vm["super_frequent"].as<int>(),
@@ -194,6 +198,7 @@ int main(int argc, char** argv) {
alignment,
precomputation,
scorer,
+ vocabulary,
vm["min_gap_size"].as<int>(),
vm["max_rule_span"].as<int>(),
vm["max_nonterminals"].as<int>(),
diff --git a/extractor/suffix_array_test.cc b/extractor/suffix_array_test.cc
index ba0dbcc3..a9fd1eab 100644
--- a/extractor/suffix_array_test.cc
+++ b/extractor/suffix_array_test.cc
@@ -21,7 +21,7 @@ class SuffixArrayTest : public Test {
virtual void SetUp() {
data = {6, 4, 1, 2, 4, 5, 3, 4, 6, 6, 4, 1, 2};
data_array = make_shared<MockDataArray>();
- EXPECT_CALL(*data_array, GetData()).WillRepeatedly(ReturnRef(data));
+ EXPECT_CALL(*data_array, GetData()).WillRepeatedly(Return(data));
EXPECT_CALL(*data_array, GetVocabularySize()).WillRepeatedly(Return(7));
EXPECT_CALL(*data_array, GetSize()).WillRepeatedly(Return(13));
suffix_array = SuffixArray(data_array);
diff --git a/extractor/translation_table_test.cc b/extractor/translation_table_test.cc
index 606777bd..72551a12 100644
--- a/extractor/translation_table_test.cc
+++ b/extractor/translation_table_test.cc
@@ -28,7 +28,7 @@ class TranslationTableTest : public Test {
vector<int> source_sentence_start = {0, 6, 10, 14};
shared_ptr<MockDataArray> source_data_array = make_shared<MockDataArray>();
EXPECT_CALL(*source_data_array, GetData())
- .WillRepeatedly(ReturnRef(source_data));
+ .WillRepeatedly(Return(source_data));
EXPECT_CALL(*source_data_array, GetNumSentences())
.WillRepeatedly(Return(3));
for (size_t i = 0; i < source_sentence_start.size(); ++i) {
@@ -48,7 +48,7 @@ class TranslationTableTest : public Test {
vector<int> target_sentence_start = {0, 7, 10, 13};
shared_ptr<MockDataArray> target_data_array = make_shared<MockDataArray>();
EXPECT_CALL(*target_data_array, GetData())
- .WillRepeatedly(ReturnRef(target_data));
+ .WillRepeatedly(Return(target_data));
for (size_t i = 0; i < target_sentence_start.size(); ++i) {
EXPECT_CALL(*target_data_array, GetSentenceStart(i))
.WillRepeatedly(Return(target_sentence_start[i]));
diff --git a/extractor/vocabulary.cc b/extractor/vocabulary.cc
index 15795d1e..aef674a5 100644
--- a/extractor/vocabulary.cc
+++ b/extractor/vocabulary.cc
@@ -8,12 +8,13 @@ int Vocabulary::GetTerminalIndex(const string& word) {
int word_id = -1;
#pragma omp critical (vocabulary)
{
- if (!dictionary.count(word)) {
+ auto it = dictionary.find(word);
+ if (it != dictionary.end()) {
+ word_id = it->second;
+ } else {
word_id = words.size();
dictionary[word] = word_id;
words.push_back(word);
- } else {
- word_id = dictionary[word];
}
}
return word_id;