diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-06 20:41:52 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-07-06 20:41:52 -0400 |
commit | 3b004be48979da652cc64e7a01e685190eb79498 (patch) | |
tree | dc3f7f1eb8920b5a2ab0256d3dd8f825ec20a483 | |
parent | fe4b60f8669f0bdfcc67832e5487b33bd4b28938 (diff) |
tool to compute feature expectations in translation charts
-rw-r--r-- | training/Makefile.am | 4 | ||||
-rw-r--r-- | training/feature_expectations.cc | 232 |
2 files changed, 236 insertions, 0 deletions
diff --git a/training/Makefile.am b/training/Makefile.am index 0d9085e4..e075e417 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -14,6 +14,7 @@ bin_PROGRAMS = \ mpi_batch_optimize \ mpi_em_optimize \ compute_cllh \ + feature_expectations \ augment_grammar noinst_PROGRAMS = \ @@ -28,6 +29,9 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +feature_expectations_SOURCES = feature_expectations.cc +feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc new file mode 100644 index 00000000..f1a85495 --- /dev/null +++ b/training/feature_expectations.cc @@ -0,0 +1,232 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> +#include <cassert> +#include <cmath> +#include <tr1/memory> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "verbose.h" +#include "hg.h" +#include "prob.h" +#include "inside_outside.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "online_optimizer.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +#ifdef HAVE_MPI +#include <boost/mpi/timer.hpp> +#include <boost/mpi.hpp> +namespace mpi = boost::mpi; +#endif + +using namespace std; +namespace po = boost::program_options; + +struct FComp { + const vector<double>& w_; + FComp(const vector<double>& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; + +void ShowFeatures(const vector<double>& w) { + vector<int> fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + sort(fnums.begin(), fnums.end(), FComp(w)); + for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) { + if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl; + } +} + +void ReadConfig(const string& ini, vector<string>* out) { + ReadFile rf(ini); + istream& in = *rf.stream(); + while(in) { + string line; + getline(in, line); + if (!in) continue; + out->push_back(line); + } +} + +void StoreConfig(const vector<string>& cfg, istringstream* o) { + ostringstream os; + for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } + o->str(os.str()); +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input,i",po::value<string>(),"Corpus of source language sentences") + ("weights,w",po::value<string>(),"Input feature weights file") + ("decoder_config,c",po::value<string>(), "cdec.ini file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int id = 0; + while(in) { + getline(in, line); + if (!in) break; + if (id % size == rank) { + c->push_back(line); + order->push_back(id); + } + ++id; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct TrainingObserver : public DecoderObserver { + void Reset() { + acc_exp.clear(); + total_complete = 0; + } + + virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { + cur_model_exp.clear(); + state = 1; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + assert(state == 1); + state = 2; + const prob_t z = InsideOutside<prob_t, + EdgeProb, + SparseVector<prob_t>, + EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); + cur_model_exp /= z; + acc_exp += cur_model_exp; + } + + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + cerr << "IGNORING ALIGNMENT FOREST!\n"; + } + + virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { + if (state == 2) { + ++total_complete; + } + } + + void GetExpectations(SparseVector<double>* g) const { + g->clear(); + for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it) + g->set_value(it->first, it->second); + } + + int total_complete; + SparseVector<prob_t> cur_model_exp; + SparseVector<prob_t> acc_exp; + int state; +}; + +#ifdef HAVE_MPI +namespace boost { namespace mpi { + template<> + struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > + : mpl::true_ { }; +} } // end namespace boost::mpi +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return 1; + + // load initial weights + Weights weights; + if (conf.count("weights")) + weights.InitFromFile(conf["weights"].as<string>()); + + vector<string> corpus; + vector<int> ids; + ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids); + assert(corpus.size() > 0); + + vector<string> cdec_ini; + ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); + istringstream ini; + StoreConfig(cdec_ini, &ini); + Decoder decoder(&ini); + if (decoder.GetConf()["input"].as<string>() != "-") { + cerr << "cdec.ini must not set an input file\n"; + return 1; + } + + SparseVector<double> x; + weights.InitSparseVector(&x); + TrainingObserver observer; + + weights.InitFromVector(x); + vector<double> lambdas; + weights.InitVector(&lambdas); + decoder.SetWeights(lambdas); + observer.Reset(); + for (unsigned i = 0; i < corpus.size(); ++i) { + int id = ids[i]; + decoder.SetId(id); + decoder.Decode(corpus[i], &observer); + } + SparseVector<double> local_exps, exps; + observer.GetExpectations(&local_exps); +#ifdef HAVE_MPI + reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0); +#else + exps.swap(local_exps); +#endif + + weights.InitFromVector(exps); + weights.InitVector(&lambdas); + ShowFeatures(lambdas); + + return 0; +} |