diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-19 18:55:05 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-19 18:55:05 +0000 |
commit | 686e494171c7b336c20349d7c38f3d2259ed5410 (patch) | |
tree | fdb8cd50689d7ef6893a91072e28f883faba1acb | |
parent | 0e4d1fc42b5f3707d81549d11fad8fcd4f9af89f (diff) |
Added count pruning.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@318 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | extools/extractor_monolingual.cc | 28 |
1 files changed, 19 insertions, 9 deletions
diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc index e26aa402..ea3e128d 100644 --- a/extools/extractor_monolingual.cc +++ b/extools/extractor_monolingual.cc @@ -28,6 +28,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("phrases,p", po::value<string>(), "File contatining phrases of interest") ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts") ("combiner_size,c", po::value<size_t>()->default_value(800000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.") + ("prune", po::value<size_t>()->default_value(0), "Prune items with count less than threshold; applies each time the cache is dumped.") ("silent", "Write nothing to stderr except errors") ("help,h", "Print this help message and exit"); po::options_description clo("Command line options"); @@ -92,7 +93,7 @@ struct TrieNode }; struct CountCombiner { - CountCombiner(const size_t& csize) : combiner_size(csize) { + CountCombiner(const size_t& csize, const size_t& prune) : combiner_size(csize), threshold(prune) { if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; } } ~CountCombiner() { @@ -116,20 +117,29 @@ struct CountCombiner { void WriteAndClearCache() { for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin(); it != cache.end(); ++it) { - cout << TD::GetString(it->first) << '\t'; const Vec2PhraseCount& vals = it->second; - bool needdiv = false; - for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) { - if (needdiv) cout << " ||| "; else needdiv = true; + bool first = true; + for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) + { + if (threshold > 1 && combiner_size != 1 && vi->second < threshold) + continue; + + if (!first) cout << " ||| "; + else + { + cout << TD::GetString(it->first) << '\t'; + first = false; + } cout << TD::GetString(vi->first) << " ||| C=" << vi->second; - } - cout << '\n'; + } + if (!first) + cout << '\n'; } cout << flush; cache.clear(); } - const size_t combiner_size; + const size_t combiner_size, threshold; typedef unordered_map<vector<WordID>, int, boost::hash<vector<WordID> > > Vec2PhraseCount; unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache; }; @@ -194,7 +204,7 @@ int main(int argc, char** argv) bool silent = conf.count("silent") > 0; const int ctx_size = conf["phrase_context_size"].as<int>(); - CountCombiner cc(conf["combiner_size"].as<size_t>()); + CountCombiner cc(conf["combiner_size"].as<size_t>(), conf["prune"].as<size_t>()); char buf[MAX_LINE_LENGTH]; TrieNode phrase_trie(0); |