From 671c21451542e2dd20e45b4033d44d8e8735f87b Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Thu, 3 Dec 2009 16:33:55 -0500
Subject: initial check in

---
 training/mr_em_train.cc | 270 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 270 insertions(+)
 create mode 100644 training/mr_em_train.cc

diff --git a/training/mr_em_train.cc b/training/mr_em_train.cc
new file mode 100644
index 00000000..a15fbe4c
--- /dev/null
+++ b/training/mr_em_train.cc
@@ -0,0 +1,270 @@
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+
+#include "config.h"
+#ifdef HAVE_BOOST_DIGAMMA
+#include <boost/math/special_functions/digamma.hpp>
+using boost::math::digamma;
+#endif
+
+#include "tdict.h"
+#include "filelib.h"
+#include "trule.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+
+using namespace std;
+using boost::shared_ptr;
+namespace po = boost::program_options;
+
+#ifndef HAVE_BOOST_DIGAMMA
+#warning Using Mark Johnson's digamma()
+double digamma(double x) {
+  double result = 0, xx, xx2, xx4;
+  assert(x > 0);
+  for ( ; x < 7; ++x)
+    result -= 1/x;
+  x -= 1.0/2.0;
+  xx = 1.0/x;
+  xx2 = xx*xx;
+  xx4 = xx2*xx2;
+  result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
+  return result;
+}
+#endif
+
+void SanityCheck(const vector<double>& w) {
+  for (int i = 0; i < w.size(); ++i) {
+    assert(!isnan(w[i]));
+  }
+}
+
+struct FComp {
+  const vector<double>& w_;
+  FComp(const vector<double>& w) : w_(w) {}
+  bool operator()(int a, int b) const {
+    return w_[a] > w_[b];
+  }
+};
+
+void ShowLargestFeatures(const vector<double>& w) {
+  vector<int> fnums(w.size() - 1);
+  for (int i = 1; i < w.size(); ++i)
+    fnums[i-1] = i;
+  vector<int>::iterator mid = fnums.begin();
+  mid += (w.size() > 10 ? 10 : w.size()) - 1;
+  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
+  cerr << "MOST PROBABLE:";
+  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
+    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
+  }
+  cerr << endl;
+}
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("output,o",po::value<string>()->default_value("-"),"Output log probs file")
+        ("grammar,g",po::value<vector<string> >()->composing(),"SCFG grammar file(s)")
+        ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)")
+        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("grammar")) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+// describes a multinomial or multinomial with a prior
+// does not contain the parameters- just the list of events
+// and any hyperparameters
+struct MultinomialInfo {
+  MultinomialInfo() : alpha(1.0) {}
+  vector<int> events;  // the events that this multinomial generates
+  double alpha;        // hyperparameter for (optional) Dirichlet prior
+};
+
+typedef map<WordID, MultinomialInfo> ModelDefinition;
+
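+// Reads the grammar file(s) and builds one multinomial per LHS nonterminal:
+// each rule must carry exactly one indicator feature with value 1, and that
+// feature becomes an event generated by the LHS's multinomial.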
+void LoadModelEvents(const po::variables_map& conf, ModelDefinition* pm) {
+  ModelDefinition& m = *pm;
+  m.clear();
+  vector<string> gfiles = conf["grammar"].as<vector<string> >();
+  for (int i = 0; i < gfiles.size(); ++i) {
+    ReadFile rf(gfiles[i]);
+    istream& in = *rf.stream();
+    int lc = 0;
+    while(in) {
+      string line;
+      getline(in, line);
+      if (line.empty()) continue;
+      ++lc;
+      TRule r(line, true);
+      const SparseVector<double>& f = r.GetFeatureValues();
+      if (f.num_active() == 0) {
+        cerr << "[WARNING] no feature found in " << gfiles[i] << ':' << lc << endl;
+        continue;
+      }
+      if (f.num_active() > 1) {
+        cerr << "[ERROR] more than one feature found in " << gfiles[i] << ':' << lc << endl;
+        exit(1);
+      }
+      SparseVector<double>::const_iterator it = f.begin();
+      if (it->second != 1.0) {
+        cerr << "[ERROR] feature with value != 1 found in " << gfiles[i] << ':' << lc << endl;
+        exit(1);
+      }
+      m[r.GetLHS()].events.push_back(it->first);
+    }
+  }
+  for (ModelDefinition::iterator it = m.begin(); it != m.end(); ++it) {
+    const vector<int>& v = it->second.events;
+    cerr << "Multinomial [" << TD::Convert(it->first*-1) << "]\n";
+    if (v.size() < 1000) {
+      cerr << "  generates:";
+      for (int i = 0; i < v.size(); ++i) {
+        cerr << " " << FD::Convert(v[i]);
+      }
+      cerr << endl;
+    }
+  }
+}
+
+void Maximize(const ModelDefinition& m, const bool use_vb, vector<double>* counts) {
+  for (ModelDefinition::const_iterator it = m.begin(); it != m.end(); ++it) {
+    const MultinomialInfo& mult_info = it->second;
+    const vector<int>& events = mult_info.events;
+    cerr << "Multinomial [" << TD::Convert(it->first*-1) << "]";
+    double tot = 0;
+    for (int i = 0; i < events.size(); ++i)
+      tot += (*counts)[events[i]];
+    cerr << " = " << tot << endl;
+    assert(tot > 0.0);
+    double ltot = log(tot);
+    if (use_vb)
+      ltot = digamma(tot + events.size() * mult_info.alpha);
+    for (int i = 0; i < events.size(); ++i) {
+      if (use_vb) {
+        (*counts)[events[i]] = digamma((*counts)[events[i]] + mult_info.alpha) - ltot;
+      } else {
+        (*counts)[events[i]] = log((*counts)[events[i]]) - ltot;
+      }
+    }
+    if (events.size() < 50) {
+      for (int i = 0; i < events.size(); ++i) {
+        cerr << "  p(" << FD::Convert(events[i]) << ")=" << exp((*counts)[events[i]]);
+      }
+      cerr << endl;
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  const bool use_b64 = conf["input_format"].as<string>() == "b64";
+  const bool use_vb = conf["optimization_method"].as<string>() == "vb";
+  if (use_vb)
+    cerr << "Using variational Bayes, make sure alphas are set\n";
+
+  ModelDefinition model_def;
+  LoadModelEvents(conf, &model_def);
+
+  const string s_obj = "**OBJ**";
+  int num_feats = FD::NumFeats();
+  cerr << "Number of features: " << num_feats << endl;
+
+  vector<double> counts(num_feats, 0);
+  double logprob = 0;
+  // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
+  // 0<TAB>**OBJ**=1.1;Feat1=1.0;
+
+  // E-step
+  while(cin) {
+    string line;
+    getline(cin, line);
+    if (line.empty()) continue;
+    int feat;
+    double val;
+    size_t i = line.find("\t");
+    assert(i != string::npos);
+    ++i;
+    if (use_b64) {
+      SparseVector<double> g;
+      double obj;
+      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
+        cerr << "B64 decoder returned error, skipping!\n";
+        continue;
+      }
+      logprob += obj;
+      const SparseVector<double>& cg = g;
+      for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
+        if (it->first >= num_feats) {
+          cerr << "Unexpected feature: " << FD::Convert(it->first) << endl;
+          abort();
+        }
+        counts[it->first] += it->second;
+      }
+    } else {  // text encoding - your counts will not be accurate!
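+      // Text lines look like "id<TAB>**OBJ**=12.2;Feat1=2.3;"; the pseudo-
+      // feature **OBJ** carries the log-likelihood rather than a count.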
+      while (i < line.size()) {
+        size_t start = i;
+        while (i < line.size() && line[i] != '=') ++i;
+        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
+        string fname = line.substr(start, i - start);
+        if (fname == s_obj) {
+          feat = -1;
+        } else {
+          feat = FD::Convert(line.substr(start, i - start));
+          if (feat >= num_feats) {
+            cerr << "Unexpected feature: " << line.substr(start, i - start) << endl;
+            abort();
+          }
+        }
+        ++i;
+        start = i;
+        while (i < line.size() && line[i] != ';') ++i;
+        if (i - start == 0) continue;
+        val = atof(line.substr(start, i - start).c_str());
+        ++i;
+        if (feat == -1) {
+          logprob += val;
+        } else {
+          counts[feat] += val;
+        }
+      }
+    }
+  }
+
+  cerr << "LOGPROB: " << logprob << endl;
+  // M-step
+  Maximize(model_def, use_vb, &counts);
+
+  SanityCheck(counts);
+  ShowLargestFeatures(counts);
+  Weights weights;
+  weights.InitFromVector(counts);
+  weights.WriteToFile(conf["output"].as<string>(), false);
+
+  return 0;
+}
--
cgit v1.2.3