From 686e494171c7b336c20349d7c38f3d2259ed5410 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Mon, 19 Jul 2010 18:55:05 +0000 Subject: Added count pruning. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@318 ec762483-ff6d-05da-a07a-a48fb63a330f --- extools/extractor_monolingual.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc index e26aa402..ea3e128d 100644 --- a/extools/extractor_monolingual.cc +++ b/extools/extractor_monolingual.cc @@ -28,6 +28,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("phrases,p", po::value(), "File contatining phrases of interest") ("phrase_context_size,S", po::value()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts") ("combiner_size,c", po::value()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.") + ("prune", po::value()->default_value(0), "Prune items with count less than threshold; applies each time the cache is dumped.") ("silent", "Write nothing to stderr except errors") ("help,h", "Print this help message and exit"); po::options_description clo("Command line options"); @@ -92,7 +93,7 @@ struct TrieNode }; struct CountCombiner { - CountCombiner(const size_t& csize) : combiner_size(csize) { + CountCombiner(const size_t& csize, const size_t& prune) : combiner_size(csize), threshold(prune) { if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; } } ~CountCombiner() { @@ -116,20 +117,29 @@ struct CountCombiner { void WriteAndClearCache() { for (unordered_map, Vec2PhraseCount, boost::hash > >::iterator it = cache.begin(); it != cache.end(); ++it) { - cout << TD::GetString(it->first) << '\t'; const Vec2PhraseCount& vals = it->second; - bool needdiv = false; - for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) { - if (needdiv) cout << " ||| "; else needdiv = true; + bool first = true; + for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) + { + if (threshold > 1 && combiner_size != 1 && vi->second < threshold) + continue; + + if (!first) cout << " ||| "; + else + { + cout << TD::GetString(it->first) << '\t'; + first = false; + } cout << TD::GetString(vi->first) << " ||| C=" << vi->second; - } - cout << '\n'; + } + if (!first) + cout << '\n'; } cout << flush; cache.clear(); } - const size_t combiner_size; + const size_t combiner_size, threshold; typedef unordered_map, int, boost::hash > > Vec2PhraseCount; unordered_map, Vec2PhraseCount, boost::hash > > cache; }; @@ -194,7 +204,7 @@ int main(int argc, char** argv) bool silent = conf.count("silent") > 0; const int ctx_size = conf["phrase_context_size"].as(); - CountCombiner cc(conf["combiner_size"].as()); + CountCombiner cc(conf["combiner_size"].as(), conf["prune"].as()); char buf[MAX_LINE_LENGTH]; TrieNode phrase_trie(0); -- cgit v1.2.3