From 9d4cfa88a71c0cba9a7d3e21cb2b58f78b097b48 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 19 Jun 2012 23:07:51 +0100 Subject: compute held-out ppl in mpi_batch_optimize --- training/Makefile.am | 4 +-- training/cllh_observer.cc | 52 +++++++++++++++++++++++++++++ training/cllh_observer.h | 26 +++++++++++++++ training/mpi_batch_optimize.cc | 75 ++++++++++++++++++++---------------------- training/mpi_compute_cllh.cc | 59 +-------------------------------- 5 files changed, 117 insertions(+), 99 deletions(-) create mode 100644 training/cllh_observer.cc create mode 100644 training/cllh_observer.h diff --git a/training/Makefile.am b/training/Makefile.am index 8124b107..19ee8f0d 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -41,10 +41,10 @@ mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev mpi_extract_features_SOURCES = mpi_extract_features.cc mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc +mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc +mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz augment_grammar_SOURCES = augment_grammar.cc diff --git a/training/cllh_observer.cc b/training/cllh_observer.cc new file mode 100644 index 00000000..58232769 --- /dev/null +++ b/training/cllh_observer.cc @@ -0,0 +1,52 @@ +#include "cllh_observer.h" + +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include 
"sentence_metadata.h" + +using namespace std; + +static const double kMINUS_EPSILON = -1e-6; + +ConditionalLikelihoodObserver::~ConditionalLikelihoodObserver() {} + +void ConditionalLikelihoodObserver::NotifyDecodingStart(const SentenceMetadata&) { + cur_obj = 0; + state = 1; +} + +void ConditionalLikelihoodObserver::NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + assert(state == 1); + state = 2; + SparseVector cur_model_exp; + const prob_t z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); + cur_obj = log(z); +} + +void ConditionalLikelihoodObserver::NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + assert(state == 2); + state = 3; + SparseVector ref_exp; + const prob_t ref_z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); + + double log_ref_z = log(ref_z); + + // rounding errors means that <0 is too strict + if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { + cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; + exit(1); + } + assert(!isnan(log_ref_z)); + acc_obj += (cur_obj - log_ref_z); + trg_words += smeta.GetReference().size(); +} + diff --git a/training/cllh_observer.h b/training/cllh_observer.h new file mode 100644 index 00000000..0de47331 --- /dev/null +++ b/training/cllh_observer.h @@ -0,0 +1,26 @@ +#ifndef _CLLH_OBSERVER_H_ +#define _CLLH_OBSERVER_H_ + +#include "decoder.h" + +struct ConditionalLikelihoodObserver : public DecoderObserver { + + ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {} + ~ConditionalLikelihoodObserver(); + + void Reset() { + acc_obj = 0; + trg_words = 0; + } + + virtual void NotifyDecodingStart(const SentenceMetadata&); + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg); + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg); + + unsigned trg_words; + double acc_obj; + double cur_obj; + int state; +}; + +#endif diff --git 
a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index 9f12dba9..0db062a7 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -15,6 +15,8 @@ namespace mpi = boost::mpi; #include #include +#include "sentence_metadata.h" +#include "cllh_observer.h" #include "verbose.h" #include "hg.h" #include "prob.h" @@ -36,14 +38,14 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input_weights,w",po::value(),"Input feature weights file") ("training_data,t",po::value(),"Training data") - ("decoder_config,d",po::value(),"Decoder configuration file") + ("test_data,T",po::value(),"(optional) test data") + ("decoder_config,c",po::value(),"Decoder configuration file") ("output_weights,o",po::value()->default_value("-"),"Output feature weights file") ("optimization_method,m", po::value()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)") ("correction_buffers,M", po::value()->default_value(10), "Number of gradients for LBFGS to maintain in memory") ("gaussian_prior,p","Use a Gaussian prior on the weights") - ("means,u", po::value(), "File containing the means for Gaussian prior") - ("per_sentence_grammar_scratch,P", po::value(), "(Optional) location of scratch space to copy per-sentence grammars for fast access, useful if a RAM disk is available") - ("sigma_squared", po::value()->default_value(1.0), "Sigma squared term for spherical Gaussian prior"); + ("sigma_squared", po::value()->default_value(1.0), "Sigma squared term for spherical Gaussian prior") + ("means,u", po::value(), "(optional) file containing the means for Gaussian prior"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value(), "Configuration file") @@ -86,6 +88,7 @@ struct TrainingObserver : public DecoderObserver { acc_grad.clear(); acc_obj = 0; total_complete = 0; + trg_words = 0; } void SetLocalGradientAndObjective(vector* g, double* o) const { @@ -143,6 +146,7 @@ struct 
TrainingObserver : public DecoderObserver { ref_exp -= cur_model_exp; acc_grad -= ref_exp; acc_obj += (cur_obj - log_ref_z); + trg_words += smeta.GetReference().size(); } virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { @@ -157,6 +161,7 @@ struct TrainingObserver : public DecoderObserver { SparseVector acc_grad; double acc_obj; double cur_obj; + unsigned trg_words; int state; }; @@ -187,36 +192,6 @@ struct VectorPlus : public binary_function, vector, vector > { } }; -void MovePerSentenceGrammars(const string& root, int size, int rank, vector* c) { - if (!DirectoryExists(root)) { - cerr << "Can't find scratch space at " << root << endl; - abort(); - } - ostringstream os; - os << root << "/psg." << size << "_of_" << rank; - const string path = os.str(); - MkDirP(path); - string sent; - map attr; - for (unsigned i = 0; i < c->size(); ++i) { - sent = (*c)[i]; - attr.clear(); - ProcessAndStripSGML(&sent, &attr); - map::iterator it = attr.find("grammar"); - if (it != attr.end()) { - string src_file = it->second; - bool is_gzipped = (src_file.size() > 3) && (src_file.rfind(".gz") == (src_file.size() - 3)); - string new_name = path + "/" + md5(sent); - if (is_gzipped) new_name += ".gz"; - CopyFile(src_file, new_name); - it->second = new_name; - } - ostringstream ns; - ns << SGMLOpenSegTag(attr) << ' ' << sent << " "; - (*c)[i] = ns.str(); - } -} - int main(int argc, char** argv) { #ifdef HAVE_MPI mpi::environment env(argc, argv); @@ -284,22 +259,24 @@ int main(int argc, char** argv) { rcv_grad.clear(); bool converged = false; - vector corpus; + vector corpus, test_corpus; ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); - - if (conf.count("per_sentence_grammar_scratch")) - MovePerSentenceGrammars(conf["per_sentence_grammar_scratch"].as(), rank, size, &corpus); + if (conf.count("test_data")) + ReadTrainingCorpus(conf["test_data"].as(), rank, size, &test_corpus); TrainingObserver observer; + 
ConditionalLikelihoodObserver cllh_observer; while (!converged) { observer.Reset(); + cllh_observer.Reset(); #ifdef HAVE_MPI mpi::timer timer; world.barrier(); #endif if (rank == 0) { cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n"; + cerr << " Testset size: " << test_corpus.size() << " sentences / proc)\n"; } for (int i = 0; i < corpus.size(); ++i) decoder->Decode(corpus[i], &observer); @@ -307,18 +284,38 @@ int main(int argc, char** argv) { fill(gradient.begin(), gradient.end(), 0); observer.SetLocalGradientAndObjective(&gradient, &objective); - double to = 0; + unsigned total_words = 0; #ifdef HAVE_MPI + double to = 0; rcv_grad.resize(num_feats, 0.0); mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus(), 0); swap(gradient, rcv_grad); rcv_grad.clear(); + reduce(world, observer.trg_words, total_words, std::plus(), 0); mpi::reduce(world, objective, to, plus(), 0); objective = to; +#else + total_words = observer.trg_words; +#endif + if (rank == 0) + cerr << "TRAINING CORPUS: ln p(f|e)=" << objective << "\t log_2 p(f|e) = " << (objective/log(2)) << "\t cond. entropy = " << (objective/log(2) / total_words) << "\t ppl = " << pow(2, (objective/log(2) / total_words)) << endl; + + for (int i = 0; i < test_corpus.size(); ++i) + decoder->Decode(test_corpus[i], &cllh_observer); + + double test_objective = 0; + unsigned test_total_words = 0; +#ifdef HAVE_MPI + reduce(world, cllh_observer.acc_obj, test_objective, std::plus(), 0); + reduce(world, cllh_observer.trg_words, test_total_words, std::plus(), 0); +#else + /* no MPI reduce in this branch: take the held-out totals straight from cllh_observer (not the training observer) */ test_objective = cllh_observer.acc_obj; test_total_words = cllh_observer.trg_words; #endif if (rank == 0) { // run optimizer only on rank=0 node + if (test_corpus.size()) + cerr << " TEST CORPUS: ln p(f|e)=" << test_objective << "\t log_2 p(f|e) = " << (test_objective/log(2)) << "\t cond. 
entropy = " << (test_objective/log(2) / test_total_words) << "\t ppl = " << pow(2, (test_objective/log(2) / test_total_words)) << endl; if (gaussian_prior) { const double sigsq = conf["sigma_squared"].as(); double norm = 0; diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc index d5caa745..066389d0 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/mpi_compute_cllh.cc @@ -10,6 +10,7 @@ #include #include +#include "cllh_observer.h" #include "sentence_metadata.h" #include "verbose.h" #include "hg.h" @@ -67,64 +68,6 @@ void ReadInstances(const string& fname, int rank, int size, vector* c) { static const double kMINUS_EPSILON = -1e-6; -struct ConditionalLikelihoodObserver : public DecoderObserver { - - ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {} - - virtual void NotifyDecodingStart(const SentenceMetadata&) { - cur_obj = 0; - state = 1; - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { - assert(state == 1); - state = 2; - SparseVector cur_model_exp; - const prob_t z = InsideOutside, - EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); - cur_obj = log(z); - } - - // compute "empirical" expectations, numerator of objective - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - SparseVector ref_exp; - const prob_t ref_z = InsideOutside, - EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); - - double log_ref_z; -#if 0 - if (crf_uniform_empirical) { - log_ref_z = ref_exp.dot(feature_weights); - } else { - log_ref_z = log(ref_z); - } -#else - log_ref_z = log(ref_z); -#endif - - // rounding errors means that <0 is too strict - if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { - cerr << "DIFF. ERR! 
log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; - exit(1); - } - assert(!isnan(log_ref_z)); - acc_obj += (cur_obj - log_ref_z); - trg_words += smeta.GetReference().size(); - } - - unsigned trg_words; - double acc_obj; - double cur_obj; - int state; -}; - #ifdef HAVE_MPI namespace mpi = boost::mpi; #endif -- cgit v1.2.3