From 4433ed33ccda66f815668062b4c2cda2f2b88dce Mon Sep 17 00:00:00 2001
From: "trevor.cohn"
Date: Mon, 19 Jul 2010 02:44:15 +0000
Subject: Fixed a couple of bugs in the monolingual context extractor

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@314 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 extools/extractor_monolingual.cc | 76 +++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc
index 5db768e3..e26aa402 100644
--- a/extools/extractor_monolingual.cc
+++ b/extools/extractor_monolingual.cc
@@ -27,6 +27,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("input,i", po::value<string>()->default_value("-"), "Input file")
         ("phrases,p", po::value<string>(), "File contatining phrases of interest")
         ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts")
+        ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.")
         ("silent", "Write nothing to stderr except errors")
         ("help,h", "Print this help message and exit");
   po::options_description clo("Command line options");
@@ -90,20 +91,62 @@ struct TrieNode
   unordered_map<int, TrieNode*> next;
 };
 
-void WriteContext(const vector<int>& sentence, int start, int end, int ctx_size)
-{
-  for (int i = start; i < end; ++i)
+struct CountCombiner {
+  CountCombiner(const size_t& csize) : combiner_size(csize) {
+    if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; }
+  }
+  ~CountCombiner() {
+    if (!cache.empty()) WriteAndClearCache();
+  }
+
+  void Count(const vector<WordID>& key,
+             const vector<WordID>& val,
+             const int count_type)
   {
-    if (i != start) cout << " ";
-    cout << sentence[i];
+    if (combiner_size != 1) {
+      cache[key][val] += count_type;
+      if (combiner_size > 1 && cache.size() > combiner_size)
+        WriteAndClearCache();
+    } else {
+      cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| C=" << count_type << "\n";
+    }
   }
-  cout << '\t';
+
+ private:
+  void WriteAndClearCache() {
+    for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin();
+         it != cache.end(); ++it) {
+      cout << TD::GetString(it->first) << '\t';
+      const Vec2PhraseCount& vals = it->second;
+      bool needdiv = false;
+      for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) {
+        if (needdiv) cout << " ||| "; else needdiv = true;
+        cout << TD::GetString(vi->first) << " ||| C=" << vi->second;
+      }
+      cout << '\n';
+    }
+    cout << flush;
+    cache.clear();
+  }
+
+  const size_t combiner_size;
+  typedef unordered_map<vector<WordID>, int, boost::hash<vector<WordID> > > Vec2PhraseCount;
+  unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache;
+};
+
+void WriteContext(const vector<int>& sentence, int start, int end, int ctx_size, CountCombiner &combiner)
+{
+  vector<WordID> phrase, context;
+  for (int i = start; i < end; ++i)
+    phrase.push_back(sentence[i]);
+
   for (int i = ctx_size; i > 0; --i)
-    cout << TD::Convert(sentence[start-i]) << " ";
-  cout << " " << TD::Convert(kGAP);
+    context.push_back(sentence[start-i]);
+  context.push_back(kGAP);
   for (int i = 0; i < ctx_size; ++i)
-    cout << " " << TD::Convert(sentence[end+i]);
-  cout << "\n";
+    context.push_back(sentence[end+i]);
+
+  combiner.Count(phrase, context, 1);
 }
 
 inline bool IsWhitespace(char c) {
@@ -151,6 +194,7 @@ int main(int argc, char** argv)
 
   bool silent = conf.count("silent") > 0;
   const int ctx_size = conf["phrase_context_size"].as<int>();
+  CountCombiner cc(conf["combiner_size"].as<size_t>());
 
   char buf[MAX_LINE_LENGTH];
   TrieNode phrase_trie(0);
@@ -167,6 +211,7 @@ int main(int argc, char** argv)
   while (iin) {
     ++line;
     iin.getline(buf, MAX_LINE_LENGTH);
+    //cout << "line: " << line << " '" << buf << "'" << endl;
     if (buf[0] == 0) continue;
     if (!silent) {
       if (line % 200 == 0) cerr << '.';
     }
 
     vector<int> sentence = ReadSentence(buf, ctx_size);
-    vector<TrieNode*> tries(1, &phrase_trie);
+    //cout << "sentence: " << TD::GetString(sentence) << endl;
+    vector<TrieNode*> tries;
     for (int i = ctx_size; i < (int)sentence.size() - ctx_size; ++i) {
-      vector<TrieNode*> tries_prime(1, &phrase_trie);
+      //cout << "i: " << i << " token: " << TD::Convert(sentence[i]) << " tries: " << tries.size() << endl;
+      vector<TrieNode*> tries_prime;
+      tries.push_back(&phrase_trie);
       for (vector<TrieNode*>::iterator tit = tries.begin(); tit != tries.end(); ++tit) {
         TrieNode* next = (*tit)->follow(sentence[i]);
         if (next != 0) {
+          //cout << "\tfollowed edge: " << next->finish << endl;
          if (next->finish)
-            WriteContext(sentence, i - next->length, i, ctx_size);
+            WriteContext(sentence, i + 1 - next->length, i + 1, ctx_size, cc);
          tries_prime.push_back(next);
        }
      }
      swap(tries, tries_prime);
    }
+    //cout << "/sentence" << endl;
  }
  if (!silent) cerr << endl;
  return 0;
--
cgit v1.2.3
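
Note (not part of the patch): the standalone sketch below restates the two behaviours this commit touches -- the CountCombiner cache that batches (phrase, context) counts before printing them as "phrase<TAB>context ||| C=n", and the corrected [i + 1 - length, i + 1) phrase span handed to WriteContext when a trie match finishes at token i. It uses plain std::string tokens, a std::map cache, a "<GAP>" marker and "<s>"/"</s>" padding as illustrative stand-ins; those are assumptions, not cdec's real WordID/TD/kGAP machinery.

// Standalone illustration of the caching and span logic in the patch above.
// Token type, "<GAP>" and the padding scheme are placeholders, not cdec's API.
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

typedef std::vector<std::string> Tokens;

static std::string Join(const Tokens& toks) {
  std::string out;
  for (std::size_t i = 0; i < toks.size(); ++i) {
    if (i) out += ' ';
    out += toks[i];
  }
  return out;
}

// Batches counts in memory; size 1 disables the cache (write immediately),
// size 0 means "no limit" (only the destructor flushes), mirroring the
// --combiner_size help text in the patch.
struct CountCombiner {
  explicit CountCombiner(std::size_t csize) : combiner_size(csize) {}
  ~CountCombiner() { if (!cache.empty()) WriteAndClearCache(); }

  void Count(const Tokens& phrase, const Tokens& context, int count) {
    if (combiner_size == 1) {  // cache disabled: emit the pair right away
      std::cout << Join(phrase) << '\t' << Join(context) << " ||| C=" << count << "\n";
      return;
    }
    cache[phrase][context] += count;
    if (combiner_size > 1 && cache.size() > combiner_size)
      WriteAndClearCache();
  }

 private:
  typedef std::map<Tokens, int> Vec2PhraseCount;

  void WriteAndClearCache() {
    for (std::map<Tokens, Vec2PhraseCount>::const_iterator it = cache.begin();
         it != cache.end(); ++it) {
      std::cout << Join(it->first) << '\t';
      bool needdiv = false;
      for (Vec2PhraseCount::const_iterator vi = it->second.begin();
           vi != it->second.end(); ++vi) {
        if (needdiv) std::cout << " ||| "; else needdiv = true;
        std::cout << Join(vi->first) << " ||| C=" << vi->second;
      }
      std::cout << '\n';
    }
    std::cout << std::flush;
    cache.clear();
  }

  const std::size_t combiner_size;
  std::map<Tokens, Vec2PhraseCount> cache;
};

// Same contract as the patched WriteContext: the phrase occupies [start, end)
// and the context is ctx_size tokens on either side around a gap marker.
static void WriteContext(const Tokens& sentence, int start, int end, int ctx_size,
                         CountCombiner& combiner) {
  Tokens phrase(sentence.begin() + start, sentence.begin() + end);
  Tokens context;
  for (int i = ctx_size; i > 0; --i) context.push_back(sentence[start - i]);
  context.push_back("<GAP>");  // stand-in for the patch's kGAP symbol
  for (int i = 0; i < ctx_size; ++i) context.push_back(sentence[end + i]);
  combiner.Count(phrase, context, 1);
}

int main() {
  // Sentence padded with ctx_size boundary tokens on each side, so real words
  // start at index ctx_size, matching the main loop bounds in the patch.
  const char* toks[] = {"<s>", "<s>", "the", "big", "dog", "</s>", "</s>"};
  Tokens sentence(toks, toks + 7);

  CountCombiner cc(0);  // unlimited cache; flushed when cc goes out of scope
  // A trie match for "the big" finishes at i = 3 with length 2; the fixed call
  // passes [i + 1 - length, i + 1) = [2, 4), i.e. exactly the matched phrase.
  const int i = 3, length = 2, ctx_size = 2;
  WriteContext(sentence, i + 1 - length, i + 1, ctx_size, cc);
  return 0;
}

Run on its own, the sketch prints "the big", a tab, then "<s> <s> <GAP> dog </s> ||| C=1" when the combiner is destroyed, which is the phrase/context/count layout the combined output lines follow.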