summaryrefslogtreecommitdiff
path: root/extools/filter_grammar.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-08-12 23:33:21 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2012-08-12 23:33:21 -0400
commitda176941c1f481f14e93bd7d055cc29cac0ea8c8 (patch)
treec7ec8c0f75b386e6ca6d37da830e5a2e369b1cca /extools/filter_grammar.cc
parent4760209baa483403db3bcb9bf1a32ae87a7b576d (diff)
use new union api
Diffstat (limited to 'extools/filter_grammar.cc')
-rw-r--r--extools/filter_grammar.cc135
1 files changed, 0 insertions, 135 deletions
diff --git a/extools/filter_grammar.cc b/extools/filter_grammar.cc
deleted file mode 100644
index cafcc923..00000000
--- a/extools/filter_grammar.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Filter a grammar in striped format
- */
-#include <iostream>
-#include <string>
-#include <map>
-#include <vector>
-#include <utility>
-#include <tr1/unordered_map>
-
-#include "suffix_tree.h"
-#include "sparse_vector.h"
-#include "sentence_pair.h"
-#include "extract.h"
-#include "fdict.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "striped_grammar.h"
-
-#include <boost/shared_ptr.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-using namespace std;
-using namespace std::tr1;
-namespace po = boost::program_options;
-
-static const size_t MAX_LINE_LENGTH = 64000000;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("test_set,t", po::value<string>(), "Filter for this test set")
- ("top_e_given_f,n", po::value<size_t>()->default_value(30), "Keep top N rules, according to p(e|f). 0 for all")
- ("help,h", "Print this help message and exit");
- po::options_description clo("Command line options");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("test_set")==0) {
- cerr << "\nUsage: filter_grammar -t TEST-SET.fr [-options] < grammar\n";
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct SourceFilter {
- // return true to keep the rule, otherwise false
- virtual bool Matches(const vector<WordID>& key) const = 0;
- virtual ~SourceFilter() {}
-};
-
-struct DumbSuffixTreeFilter : SourceFilter {
- DumbSuffixTreeFilter(const string& corpus) {
- cerr << "Build suffix tree from test set in " << corpus << endl;
- assert(FileExists(corpus));
- ReadFile rfts(corpus);
- istream& testSet = *rfts.stream();
- char* buf = new char[MAX_LINE_LENGTH];
- AnnotatedParallelSentence sent;
-
- /* process the data set to build suffix tree
- */
- while(!testSet.eof()) {
- testSet.getline(buf, MAX_LINE_LENGTH);
- if (buf[0] == 0) continue;
-
- //hack to read in the test set using AnnotatedParallelSentence
- strcat(buf," ||| fake ||| 0-0");
- sent.ParseInputLine(buf);
-
- //add each successive suffix to the tree
- for(int i=0; i<sent.f_len; i++)
- root.InsertPath(sent.f, i, sent.f_len - 1);
- }
- delete[] buf;
- }
- virtual bool Matches(const vector<WordID>& src_rhs) const {
- const Node<int>* curnode = &root;
- for(int i=0; i < src_rhs.size(); i++) {
- if (src_rhs[i] <= 0) {
- curnode = &root;
- } else if (curnode) {
- curnode = curnode->Extend(src_rhs[i]);
- if (!curnode) return false;
- }
- }
- return true;
- }
- Node<int> root;
-};
-
-boost::shared_ptr<SourceFilter> filter;
-multimap<float, ID2RuleStatistics::const_iterator> options;
-int kCOUNT;
-int max_options;
-
-void cb(WordID lhs, const vector<WordID>& src_rhs, const ID2RuleStatistics& rules, void*) {
- options.clear();
- if (!filter || filter->Matches(src_rhs)) {
- for (ID2RuleStatistics::const_iterator it = rules.begin(); it != rules.end(); ++it) {
- options.insert(make_pair(-it->second.counts.get(kCOUNT), it));
- }
- int ocount = 0;
- cout << '[' << TD::Convert(-lhs) << ']' << " ||| ";
- WriteNamed(src_rhs, &cout);
- cout << '\t';
- bool first = true;
- for (multimap<float,ID2RuleStatistics::const_iterator>::iterator it = options.begin(); it != options.end(); ++it) {
- if (first) { first = false; } else { cout << " ||| "; }
- WriteAnonymous(it->second->first, &cout);
- cout << " ||| " << it->second->second;
- ++ocount;
- if (ocount == max_options) break;
- }
- cout << endl;
- }
-}
-
-int main(int argc, char** argv){
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- max_options = conf["top_e_given_f"].as<size_t>();;
- kCOUNT = FD::Convert("CFE");
- istream& unscored_grammar = cin;
- cerr << "Loading test set " << conf["test_set"].as<string>() << "...\n";
- filter.reset(new DumbSuffixTreeFilter(conf["test_set"].as<string>()));
- cerr << "Filtering...\n";
- StripedGrammarLexer::ReadStripedGrammar(&unscored_grammar, cb, NULL);
-}
-