diff options
Diffstat (limited to 'extools/extractor_monolingual.cc')
-rw-r--r-- | extools/extractor_monolingual.cc | 256 |
1 files changed, 0 insertions, 256 deletions
diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc deleted file mode 100644 index 049ebc85..00000000 --- a/extools/extractor_monolingual.cc +++ /dev/null @@ -1,256 +0,0 @@ -#include <iostream> -#include <vector> -#include <utility> -#include <tr1/unordered_map> - -#include <boost/functional/hash.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <boost/lexical_cast.hpp> - -#include "tdict.h" -#include "fdict.h" -#include "wordid.h" -#include "filelib.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -static const size_t MAX_LINE_LENGTH = 100000; -WordID kBOS, kEOS, kDIVIDER, kGAP; -int kCOUNT; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i", po::value<string>()->default_value("-"), "Input file") - ("phrases,p", po::value<string>(), "File contatining phrases of interest") - ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts") - ("combiner_size,c", po::value<size_t>()->default_value(30000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.") - ("prune", po::value<size_t>()->default_value(0), "Prune items with count less than threshold; applies each time the cache is dumped.") - ("silent", "Write nothing to stderr except errors") - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help") || conf->count("input") != 1 || conf->count("phrases") != 1) { - cerr << "\nUsage: extractor_monolingual [-options]\n"; - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct TrieNode -{ - TrieNode(int l) : finish(false), length(l) {}; - ~TrieNode() - { - for (unordered_map<int, TrieNode*>::iterator - it = next.begin(); it != next.end(); ++it) - delete it->second; - next.clear(); - } - - TrieNode *follow(int token) - { - unordered_map<int, TrieNode*>::iterator - found = next.find(token); - if (found != next.end()) - return found->second; - else - return 0; - } - - void insert(const vector<int> &tokens) - { - insert(tokens.begin(), tokens.end()); - } - - void insert(vector<int>::const_iterator begin, vector<int>::const_iterator end) - { - if (begin == end) - finish = true; - else - { - int token = *begin; - unordered_map<int, TrieNode*>::iterator - nit = next.find(token); - if (nit == next.end()) - nit = next.insert(make_pair(token, new TrieNode(length+1))).first; - ++begin; - nit->second->insert(begin, end); - } - } - - bool finish; - int length; - unordered_map<int, TrieNode*> next; -}; - -struct CountCombiner { - CountCombiner(const size_t& csize, const size_t& prune) : combiner_size(csize), threshold(prune) { - if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; } - } - ~CountCombiner() { - if (!cache.empty()) WriteAndClearCache(); - } - - void Count(const vector<WordID>& key, - const vector<WordID>& val, - const int count_type) - { - if (combiner_size != 1) { - cache[key][val] += count_type; - if (combiner_size > 1 && cache.size() > combiner_size) - WriteAndClearCache(); - } else { - cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| C=" << count_type << "\n"; - } - } - - private: - void WriteAndClearCache() { - for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin(); - it != cache.end(); ++it) { - const Vec2PhraseCount& vals = it->second; - bool first = true; - for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi) - { - if (threshold > 1 && combiner_size != 1 && vi->second < threshold) - continue; - - if (!first) cout << " ||| "; - else - { - cout << TD::GetString(it->first) << '\t'; - first = false; - } - cout << TD::GetString(vi->first) << " ||| C=" << vi->second; - } - if (!first) - cout << '\n'; - } - cout << flush; - cache.clear(); - } - - const size_t combiner_size, threshold; - typedef unordered_map<vector<WordID>, int, boost::hash<vector<WordID> > > Vec2PhraseCount; - unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache; -}; - -void WriteContext(const vector<int>& sentence, int start, int end, int ctx_size, CountCombiner &combiner) -{ - vector<WordID> phrase, context; - for (int i = start; i < end; ++i) - phrase.push_back(sentence[i]); - - for (int i = ctx_size; i > 0; --i) - context.push_back(sentence[start-i]); - context.push_back(kGAP); - for (int i = 0; i < ctx_size; ++i) - context.push_back(sentence[end+i]); - - combiner.Count(phrase, context, 1); -} - -inline bool IsWhitespace(char c) { - return c == ' ' || c == '\t'; -} - -inline void SkipWhitespace(const char* buf, int* ptr) { - while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); } -} - -vector<int> ReadSentence(const char *buf, int padding) -{ - int ptr = 0; - SkipWhitespace(buf, &ptr); - int start = ptr; - vector<int> sentence; - for (int i = 0; i < padding; ++i) - sentence.push_back(kBOS); - - while (char c = buf[ptr]) - { - if (!IsWhitespace(c)) - ++ptr; - else { - sentence.push_back(TD::Convert(string(buf, start, ptr-start))); - SkipWhitespace(buf, &ptr); - start = ptr; - } - } - for (int i = 0; i < padding; ++i) - sentence.push_back(kEOS); - - return sentence; -} - -int main(int argc, char** argv) -{ - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - kBOS = TD::Convert("<s>"); - kEOS = TD::Convert("</s>"); - kDIVIDER = TD::Convert("|||"); - kGAP = TD::Convert("<PHRASE>"); - kCOUNT = FD::Convert("C"); - - bool silent = conf.count("silent") > 0; - const int ctx_size = conf["phrase_context_size"].as<int>(); - CountCombiner cc(conf["combiner_size"].as<size_t>(), conf["prune"].as<size_t>()); - - char buf[MAX_LINE_LENGTH]; - TrieNode phrase_trie(0); - ReadFile rpf(conf["phrases"].as<string>()); - istream& pin = *rpf.stream(); - while (pin) { - pin.getline(buf, MAX_LINE_LENGTH); - phrase_trie.insert(ReadSentence(buf, 0)); - } - - ReadFile rif(conf["input"].as<string>()); - istream &iin = *rif.stream(); - int line = 0; - while (iin) { - ++line; - iin.getline(buf, MAX_LINE_LENGTH); - //cout << "line: " << line << " '" << buf << "'" << endl; - if (buf[0] == 0) continue; - if (!silent) { - if (line % 200 == 0) cerr << '.'; - if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; - } - - vector<int> sentence = ReadSentence(buf, ctx_size); - //cout << "sentence: " << TD::GetString(sentence) << endl; - vector<TrieNode*> tries; - for (int i = ctx_size; i < (int)sentence.size() - ctx_size; ++i) - { - //cout << "i: " << i << " token: " << TD::Convert(sentence[i]) << " tries: " << tries.size() << endl; - vector<TrieNode*> tries_prime; - tries.push_back(&phrase_trie); - for (vector<TrieNode*>::iterator tit = tries.begin(); tit != tries.end(); ++tit) - { - TrieNode* next = (*tit)->follow(sentence[i]); - if (next != 0) - { - //cout << "\tfollowed edge: " << next->finish << endl; - if (next->finish) - WriteContext(sentence, i + 1 - next->length, i + 1, ctx_size, cc); - tries_prime.push_back(next); - } - } - swap(tries, tries_prime); - } - //cout << "/sentence" << endl; - } - if (!silent) cerr << endl; - return 0; -} |