From 7cc92b65a3185aa242088d830e166e495674efc9 Mon Sep 17 00:00:00 2001 From: redpony Date: Tue, 22 Jun 2010 05:12:27 +0000 Subject: initial checkin git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f --- training/collapse_weights.cc | 102 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 training/collapse_weights.cc (limited to 'training/collapse_weights.cc') diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc new file mode 100644 index 00000000..5e0f3f72 --- /dev/null +++ b/training/collapse_weights.cc @@ -0,0 +1,102 @@ +#include +#include +#include + +#include +#include +#include + +#include "prob.h" +#include "filelib.h" +#include "trule.h" +#include "weights.h" + +namespace po = boost::program_options; +using namespace std; + +typedef std::tr1::unordered_map, prob_t, boost::hash > > MarginalMap; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("grammar,g", po::value(), "Grammar file") + ("weights,w", po::value(), "Weights file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config,c", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + const string cfg = (*conf)["config"].as(); + cerr << "Configuration file: " << cfg << endl; + ifstream config(cfg.c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string wfile = conf["weights"].as(); + const string gfile = conf["grammar"].as(); + Weights wm; + wm.InitFromFile(wfile); + vector w; + wm.InitVector(&w); + MarginalMap e_tots; + MarginalMap f_tots; + prob_t tot; + { + ReadFile rf(gfile); + assert(*rf.stream()); + istream& in = *rf.stream(); + cerr << "Computing marginals...\n"; + int lc = 0; + while(in) { + string line; + getline(in, line); + ++lc; + if (line.empty()) continue; + TRule tr(line, true); + if (tr.GetFeatureValues().empty()) + cerr << "Line " << lc << ": empty features - may introduce bias\n"; + prob_t prob; + prob.logeq(tr.GetFeatureValues().dot(w)); + e_tots[tr.e_] += prob; + f_tots[tr.f_] += prob; + tot += prob; + } + } + bool normalized = (fabs(log(tot)) < 0.001); + cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl; + ReadFile rf(gfile); + istream&in = *rf.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + TRule tr(line, true); + const double lp = tr.GetFeatureValues().dot(w); + if (isinf(lp)) { continue; } + tr.scores_.clear(); + + cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot); + if (!normalized) { + cout << ";ZF_and_E=" << lp; + } + cout << ";F_given_E=" << lp - log(e_tots[tr.e_]) + << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl; + } + return 0; +} + -- cgit v1.2.3