summaryrefslogtreecommitdiff
path: root/extools/extractor_monolingual.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extools/extractor_monolingual.cc')
-rw-r--r--extools/extractor_monolingual.cc256
1 files changed, 0 insertions, 256 deletions
diff --git a/extools/extractor_monolingual.cc b/extools/extractor_monolingual.cc
deleted file mode 100644
index 049ebc85..00000000
--- a/extools/extractor_monolingual.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/lexical_cast.hpp>
-
-#include "tdict.h"
-#include "fdict.h"
-#include "wordid.h"
-#include "filelib.h"
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static const size_t MAX_LINE_LENGTH = 100000;
-WordID kBOS, kEOS, kDIVIDER, kGAP;
-int kCOUNT;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i", po::value<string>()->default_value("-"), "Input file")
- ("phrases,p", po::value<string>(), "File contatining phrases of interest")
- ("phrase_context_size,S", po::value<int>()->default_value(2), "Use this many words of context on left and write when writing base phrase contexts")
- ("combiner_size,c", po::value<size_t>()->default_value(30000), "Number of unique items to store in cache before writing rule counts. Set to 1 to disable cache. Set to 0 for no limit.")
- ("prune", po::value<size_t>()->default_value(0), "Prune items with count less than threshold; applies each time the cache is dumped.")
- ("silent", "Write nothing to stderr except errors")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("input") != 1 || conf->count("phrases") != 1) {
- cerr << "\nUsage: extractor_monolingual [-options]\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct TrieNode
-{
- TrieNode(int l) : finish(false), length(l) {};
- ~TrieNode()
- {
- for (unordered_map<int, TrieNode*>::iterator
- it = next.begin(); it != next.end(); ++it)
- delete it->second;
- next.clear();
- }
-
- TrieNode *follow(int token)
- {
- unordered_map<int, TrieNode*>::iterator
- found = next.find(token);
- if (found != next.end())
- return found->second;
- else
- return 0;
- }
-
- void insert(const vector<int> &tokens)
- {
- insert(tokens.begin(), tokens.end());
- }
-
- void insert(vector<int>::const_iterator begin, vector<int>::const_iterator end)
- {
- if (begin == end)
- finish = true;
- else
- {
- int token = *begin;
- unordered_map<int, TrieNode*>::iterator
- nit = next.find(token);
- if (nit == next.end())
- nit = next.insert(make_pair(token, new TrieNode(length+1))).first;
- ++begin;
- nit->second->insert(begin, end);
- }
- }
-
- bool finish;
- int length;
- unordered_map<int, TrieNode*> next;
-};
-
-struct CountCombiner {
- CountCombiner(const size_t& csize, const size_t& prune) : combiner_size(csize), threshold(prune) {
- if (csize == 0) { cerr << "Using unlimited combiner cache.\n"; }
- }
- ~CountCombiner() {
- if (!cache.empty()) WriteAndClearCache();
- }
-
- void Count(const vector<WordID>& key,
- const vector<WordID>& val,
- const int count_type)
- {
- if (combiner_size != 1) {
- cache[key][val] += count_type;
- if (combiner_size > 1 && cache.size() > combiner_size)
- WriteAndClearCache();
- } else {
- cout << TD::GetString(key) << '\t' << TD::GetString(val) << " ||| C=" << count_type << "\n";
- }
- }
-
- private:
- void WriteAndClearCache() {
- for (unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > >::iterator it = cache.begin();
- it != cache.end(); ++it) {
- const Vec2PhraseCount& vals = it->second;
- bool first = true;
- for (Vec2PhraseCount::const_iterator vi = vals.begin(); vi != vals.end(); ++vi)
- {
- if (threshold > 1 && combiner_size != 1 && vi->second < threshold)
- continue;
-
- if (!first) cout << " ||| ";
- else
- {
- cout << TD::GetString(it->first) << '\t';
- first = false;
- }
- cout << TD::GetString(vi->first) << " ||| C=" << vi->second;
- }
- if (!first)
- cout << '\n';
- }
- cout << flush;
- cache.clear();
- }
-
- const size_t combiner_size, threshold;
- typedef unordered_map<vector<WordID>, int, boost::hash<vector<WordID> > > Vec2PhraseCount;
- unordered_map<vector<WordID>, Vec2PhraseCount, boost::hash<vector<WordID> > > cache;
-};
-
-void WriteContext(const vector<int>& sentence, int start, int end, int ctx_size, CountCombiner &combiner)
-{
- vector<WordID> phrase, context;
- for (int i = start; i < end; ++i)
- phrase.push_back(sentence[i]);
-
- for (int i = ctx_size; i > 0; --i)
- context.push_back(sentence[start-i]);
- context.push_back(kGAP);
- for (int i = 0; i < ctx_size; ++i)
- context.push_back(sentence[end+i]);
-
- combiner.Count(phrase, context, 1);
-}
-
-inline bool IsWhitespace(char c) {
- return c == ' ' || c == '\t';
-}
-
-inline void SkipWhitespace(const char* buf, int* ptr) {
- while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
-}
-
-vector<int> ReadSentence(const char *buf, int padding)
-{
- int ptr = 0;
- SkipWhitespace(buf, &ptr);
- int start = ptr;
- vector<int> sentence;
- for (int i = 0; i < padding; ++i)
- sentence.push_back(kBOS);
-
- while (char c = buf[ptr])
- {
- if (!IsWhitespace(c))
- ++ptr;
- else {
- sentence.push_back(TD::Convert(string(buf, start, ptr-start)));
- SkipWhitespace(buf, &ptr);
- start = ptr;
- }
- }
- for (int i = 0; i < padding; ++i)
- sentence.push_back(kEOS);
-
- return sentence;
-}
-
-int main(int argc, char** argv)
-{
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- kBOS = TD::Convert("<s>");
- kEOS = TD::Convert("</s>");
- kDIVIDER = TD::Convert("|||");
- kGAP = TD::Convert("<PHRASE>");
- kCOUNT = FD::Convert("C");
-
- bool silent = conf.count("silent") > 0;
- const int ctx_size = conf["phrase_context_size"].as<int>();
- CountCombiner cc(conf["combiner_size"].as<size_t>(), conf["prune"].as<size_t>());
-
- char buf[MAX_LINE_LENGTH];
- TrieNode phrase_trie(0);
- ReadFile rpf(conf["phrases"].as<string>());
- istream& pin = *rpf.stream();
- while (pin) {
- pin.getline(buf, MAX_LINE_LENGTH);
- phrase_trie.insert(ReadSentence(buf, 0));
- }
-
- ReadFile rif(conf["input"].as<string>());
- istream &iin = *rif.stream();
- int line = 0;
- while (iin) {
- ++line;
- iin.getline(buf, MAX_LINE_LENGTH);
- //cout << "line: " << line << " '" << buf << "'" << endl;
- if (buf[0] == 0) continue;
- if (!silent) {
- if (line % 200 == 0) cerr << '.';
- if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush;
- }
-
- vector<int> sentence = ReadSentence(buf, ctx_size);
- //cout << "sentence: " << TD::GetString(sentence) << endl;
- vector<TrieNode*> tries;
- for (int i = ctx_size; i < (int)sentence.size() - ctx_size; ++i)
- {
- //cout << "i: " << i << " token: " << TD::Convert(sentence[i]) << " tries: " << tries.size() << endl;
- vector<TrieNode*> tries_prime;
- tries.push_back(&phrase_trie);
- for (vector<TrieNode*>::iterator tit = tries.begin(); tit != tries.end(); ++tit)
- {
- TrieNode* next = (*tit)->follow(sentence[i]);
- if (next != 0)
- {
- //cout << "\tfollowed edge: " << next->finish << endl;
- if (next->finish)
- WriteContext(sentence, i + 1 - next->length, i + 1, ctx_size, cc);
- tries_prime.push_back(next);
- }
- }
- swap(tries, tries_prime);
- }
- //cout << "/sentence" << endl;
- }
- if (!silent) cerr << endl;
- return 0;
-}