diff options
-rw-r--r-- | extractor/matchings_trie.cc | 3 | ||||
-rw-r--r-- | extractor/mocks/mock_precomputation.h | 1 | ||||
-rw-r--r-- | extractor/precomputation.cc | 19 | ||||
-rw-r--r-- | extractor/precomputation.h | 2 | ||||
-rw-r--r-- | extractor/precomputation_test.cc | 34 | ||||
-rw-r--r-- | extractor/run_extractor.cc | 15 | ||||
-rw-r--r-- | python/pkg/cdec/sa/compile.py | 11 | ||||
-rw-r--r-- | python/src/sa/_sa.c | 2 |
8 files changed, 11 insertions, 76 deletions
diff --git a/extractor/matchings_trie.cc b/extractor/matchings_trie.cc index 921ec582..8ea795db 100644 --- a/extractor/matchings_trie.cc +++ b/extractor/matchings_trie.cc @@ -14,6 +14,9 @@ void MatchingsTrie::ResetTree(shared_ptr<TrieNode> root) { for (auto child: root->children) { ResetTree(child.second); } + if (root->suffix_link != NULL) { + root->suffix_link.reset(); + } root.reset(); } } diff --git a/extractor/mocks/mock_precomputation.h b/extractor/mocks/mock_precomputation.h index 987bdb2f..9bc72235 100644 --- a/extractor/mocks/mock_precomputation.h +++ b/extractor/mocks/mock_precomputation.h @@ -4,6 +4,5 @@ class MockPrecomputation : public Precomputation { public: - MOCK_CONST_METHOD0(GetInvertedIndex, const Index&()); MOCK_CONST_METHOD0(GetCollocations, const Index&()); }; diff --git a/extractor/precomputation.cc b/extractor/precomputation.cc index 8a76beb1..189ac42c 100644 --- a/extractor/precomputation.cc +++ b/extractor/precomputation.cc @@ -41,7 +41,6 @@ Precomputation::Precomputation( for (int j = 1; j <= max_frequent_phrase_len && i + j <= data.size(); ++j) { pattern.push_back(data[i + j - 1]); if (frequent_patterns_set.count(pattern)) { - inverted_index[pattern].push_back(i); int is_super_frequent = super_frequent_patterns_set.count(pattern); matchings.push_back(make_tuple(i, j, is_super_frequent)); } else { @@ -158,19 +157,7 @@ void Precomputation::WriteBinary(const fs::path& filepath) const { FILE* file = fopen(filepath.string().c_str(), "w"); // TODO(pauldb): Refactor this code. - int size = inverted_index.size(); - fwrite(&size, sizeof(int), 1, file); - for (auto entry: inverted_index) { - size = entry.first.size(); - fwrite(&size, sizeof(int), 1, file); - fwrite(entry.first.data(), sizeof(int), size, file); - - size = entry.second.size(); - fwrite(&size, sizeof(int), 1, file); - fwrite(entry.second.data(), sizeof(int), size, file); - } - - size = collocations.size(); + int size = collocations.size(); fwrite(&size, sizeof(int), 1, file); for (auto entry: collocations) { size = entry.first.size(); @@ -183,10 +170,6 @@ void Precomputation::WriteBinary(const fs::path& filepath) const { } } -const Index& Precomputation::GetInvertedIndex() const { - return inverted_index; -} - const Index& Precomputation::GetCollocations() const { return collocations; } diff --git a/extractor/precomputation.h b/extractor/precomputation.h index 28426bfa..3d44c2a6 100644 --- a/extractor/precomputation.h +++ b/extractor/precomputation.h @@ -30,7 +30,6 @@ class Precomputation { void WriteBinary(const fs::path& filepath) const; - virtual const Index& GetInvertedIndex() const; virtual const Index& GetCollocations() const; static int NON_TERMINAL; @@ -49,7 +48,6 @@ class Precomputation { void AddStartPositions(vector<int>& positions, int pos1, int pos2); void AddStartPositions(vector<int>& positions, int pos1, int pos2, int pos3); - Index inverted_index; Index collocations; }; diff --git a/extractor/precomputation_test.cc b/extractor/precomputation_test.cc index 9edb29db..6b77b9c0 100644 --- a/extractor/precomputation_test.cc +++ b/extractor/precomputation_test.cc @@ -35,40 +35,6 @@ class PrecomputationTest : public Test { shared_ptr<MockSuffixArray> suffix_array; }; -TEST_F(PrecomputationTest, TestInvertedIndex) { - Precomputation precomputation(suffix_array, 100, 3, 10, 5, 1, 4, 2); - Index inverted_index = precomputation.GetInvertedIndex(); - - EXPECT_EQ(8, inverted_index.size()); - vector<int> key = {2}; - vector<int> expected_value = {1, 5, 8, 11}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {3}; - expected_value = {2, 6, 9}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {4}; - expected_value = {0, 10}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {5}; - expected_value = {3, 7}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {4, 2}; - expected_value = {0, 10}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {2, 3}; - expected_value = {1, 5, 8}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {3, 5}; - expected_value = {2, 6}; - EXPECT_EQ(expected_value, inverted_index[key]); - key = {2, 3, 5}; - expected_value = {1, 5}; - EXPECT_EQ(expected_value, inverted_index[key]); - - key = {2, 4}; - EXPECT_EQ(0, inverted_index.count(key)); -} - TEST_F(PrecomputationTest, TestCollocations) { Precomputation precomputation(suffix_array, 3, 3, 10, 5, 1, 4, 2); Index collocations = precomputation.GetCollocations(); diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 36dbd7a0..5255737d 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -31,14 +31,6 @@ namespace fs = boost::filesystem; namespace po = boost::program_options; using namespace std; -void my_pause() { - cerr << "pausing..." << endl; - for (int i = 0; i < 10000000; ++i) { - cerr << endl; - } - cerr << "end pause" << endl; -} - int main(int argc, char** argv) { // TODO(pauldb): Also take arguments from config file. po::options_description desc("Command line options"); @@ -122,7 +114,7 @@ int main(int argc, char** argv) { cerr << "Reading alignment took " << GetDuration(start_time, stop_time) << " seconds" << endl; - cerr << "Precomputating collocations..." << endl; + cerr << "Precomputing collocations..." << endl; start_time = Clock::now(); shared_ptr<Precomputation> precomputation = make_shared<Precomputation>( source_suffix_array, @@ -150,6 +142,8 @@ int main(int argc, char** argv) { << GetDuration(preprocess_start_time, preprocess_stop_time) << " seconds" << endl; + cerr << "creating grammar extractor" << endl; + Clock::time_point extraction_start_time = Clock::now(); vector<shared_ptr<Feature> > features = { make_shared<TargetGivenSourceCoherent>(), @@ -176,6 +170,9 @@ int main(int argc, char** argv) { vm["max_samples"].as<int>(), vm["tight_phrases"].as<bool>()); + // Release extra memory used by the initial precomputation. + precomputation.reset(); + int grammar_id = 0; fs::path grammar_path = vm["grammars"].as<string>(); if (!fs::is_directory(grammar_path)) { diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py index 1362e7b5..d4cd8387 100644 --- a/python/pkg/cdec/sa/compile.py +++ b/python/pkg/cdec/sa/compile.py @@ -21,10 +21,6 @@ def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phr train_min_gap_size=min_gap) return precomp -def my_pause(): - for i in range(10000000): - print >> sys.stderr, "" - def main(): preprocess_start_time = monitor_cpu() sys.setrecursionlimit(sys.getrecursionlimit() * 100) @@ -89,8 +85,6 @@ def main(): stop_time = monitor_cpu() logger.info('Compiling source suffix array took %f seconds', stop_time - start_time) - my_pause() - start_time = monitor_cpu() logger.info('Compiling target data array') if args.bitext: @@ -100,7 +94,6 @@ def main(): e.write_binary(e_bin) stop_time = monitor_cpu() logger.info('Compiling target data array took %f seconds', stop_time - start_time) - my_pause() start_time = monitor_cpu() logger.info('Precomputing frequent phrases') @@ -108,15 +101,12 @@ def main(): stop_time = monitor_cpu() logger.info('Compiling precomputations took %f seconds', stop_time - start_time) - my_pause() - start_time = monitor_cpu() logger.info('Compiling alignment') a = cdec.sa.Alignment(from_text=args.alignment) a.write_binary(a_bin) stop_time = monitor_cpu() logger.info('Compiling alignment took %f seonds', stop_time - start_time) - my_pause() start_time = monitor_cpu() logger.info('Compiling bilexical dictionary') @@ -124,7 +114,6 @@ def main(): lex.write_binary(lex_bin) stop_time = monitor_cpu() logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time) - my_pause() # Write configuration config = cdec.configobj.ConfigObj(args.config, unrepr=True) diff --git a/python/src/sa/_sa.c b/python/src/sa/_sa.c index 92abd4c3..4bdabc17 100644 --- a/python/src/sa/_sa.c +++ b/python/src/sa/_sa.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.16 on Fri Feb 22 16:52:09 2013 */ +/* Generated by Cython 0.16 on Wed Mar 6 11:06:18 2013 */ #define PY_SSIZE_T_CLEAN #include "Python.h" |