From d04386922f3e25017207bbc26ea460ee7d85c630 Mon Sep 17 00:00:00 2001 From: redpony Date: Fri, 1 Oct 2010 20:13:48 +0000 Subject: compute obj, fixes for grammar filter git-svn-id: https://ws10smt.googlecode.com/svn/trunk@668 ec762483-ff6d-05da-a07a-a48fb63a330f --- training/Makefile.am | 6 +- training/cllh_filter_grammar.cc | 38 +++++++-- training/compute_cllh.cc | 185 ++++++++++++++++++++++++++++++++++++++++ training/model1.cc | 9 +- 4 files changed, 225 insertions(+), 13 deletions(-) create mode 100644 training/compute_cllh.cc diff --git a/training/Makefile.am b/training/Makefile.am index 83c15ecc..b3f93529 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -20,13 +20,17 @@ TESTS = lbfgs_test optimize_test if MPI bin_PROGRAMS += mpi_batch_optimize \ - mpi_online_optimize + mpi_online_optimize \ + compute_cllh mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +compute_cllh_SOURCES = compute_cllh.cc +compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz endif cllh_filter_grammar_SOURCES = cllh_filter_grammar.cc diff --git a/training/cllh_filter_grammar.cc b/training/cllh_filter_grammar.cc index 90fe9fba..6998ec2b 100644 --- a/training/cllh_filter_grammar.cc +++ b/training/cllh_filter_grammar.cc @@ -22,7 +22,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("training_data,t",po::value(),"Training data corpus") ("decoder_config,c",po::value(),"Decoder configuration file") - ("ncpus,n",po::value()->default_value(1),"Number of CPUs to use"); + ("shards,s",po::value()->default_value(1),"Number of shards") + ("starting_shard,S",po::value()->default_value(0), "In this invocation only process shards >= S") + ("work_limit,l",po::value()->default_value(9999), "Process maximially this many shards") + ("ncpus,C",po::value()->default_value(1),"Number of CPUs to use"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value(), "Configuration file") @@ -49,6 +52,8 @@ void ReadTrainingCorpus(const string& fname, int rank, int size, vector* istream& in = *rf.stream(); string line; int lc = 0; + assert(size > 0); + assert(rank < size); while(in) { getline(in, line); if (!in) break; @@ -112,6 +117,7 @@ void work(const string& fname, int rank, int size, Decoder* decoder) { vector ids; ReadTrainingCorpus(fname, rank, size, &corpus, &ids); assert(corpus.size() > 0); + assert(corpus.size() == ids.size()); cerr << " " << rank << '/' << size << ": has " << corpus.size() << " sentences to process\n"; ostringstream oc; oc << "corpus." << rank << "_of_" << size; WriteFile foc(oc.str()); @@ -121,13 +127,14 @@ void work(const string& fname, int rank, int size, Decoder* decoder) { set all_used; TrainingObserver observer; for (int i = 0; i < corpus.size(); ++i) { - int ex_num = ids[i]; - decoder->SetId(ex_num); - decoder->Decode(corpus[ex_num], &observer); + const int sent_id = ids[i]; + const string& input = corpus[i]; + decoder->SetId(sent_id); + decoder->Decode(input, &observer); if (observer.failed) { - (*foc.stream()) << "*** id=" << ex_num << " is unreachable\n"; + // do nothing } else { - (*foc.stream()) << corpus[ex_num] << endl; + (*foc.stream()) << input << endl; for (set::iterator it = observer.used.begin(); it != observer.used.end(); ++it) { if (all_used.insert(*it).second) (*fog.stream()) << **it << endl; @@ -143,6 +150,11 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); const string fname = conf["training_data"].as(); const unsigned ncpus = conf["ncpus"].as(); + const unsigned shards = conf["shards"].as(); + const unsigned start = conf["starting_shard"].as(); + const unsigned work_limit = conf["work_limit"].as(); + const unsigned eff_shards = min(start + work_limit, shards); + cerr << "Processing shards " << start << "/" << shards << " to " << eff_shards << "/" << shards << endl; assert(ncpus > 0); ReadFile ini_rf(conf["decoder_config"].as()); Decoder decoder(ini_rf.stream()); @@ -162,8 +174,13 @@ int main(int argc, char** argv) { if (pid > 0) { children.push_back(pid); } else { - work(fname, i, ncpus, &decoder); - cerr << " " << i << "/" << ncpus << " finished.\n"; + for (int j = start; j < eff_shards; ++j) { + if (j % ncpus == i) { + cerr << " CPU " << i << " processing shard " << j << endl; + work(fname, j, shards, &decoder); + cerr << " Shard " << j << "/" << shards << " finished.\n"; + } + } _exit(0); } } @@ -171,7 +188,10 @@ int main(int argc, char** argv) { int status; int w = waitpid(children[i], &status, 0); if (w < 0) { cerr << "Error while waiting for children!"; return 1; } - cerr << "Child " << i << ": status=" << status << " sig?=" << WIFSIGNALED(status) << " sig=" << WTERMSIG(status) << endl; + if (WIFSIGNALED(status)) { + cerr << "Child " << i << " received signal " << WTERMSIG(status) << endl; + if (WTERMSIG(status) == 11) { cerr << " this is a SEGV- you may be trying to print temporarily created rules\n"; } + } } return 0; } diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc new file mode 100644 index 00000000..f25e17c3 --- /dev/null +++ b/training/compute_cllh.cc @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "verbose.h" +#include "hg.h" +#include "prob.h" +#include "inside_outside.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w",po::value(),"Input feature weights file") + ("training_data,t",po::value(),"Training data corpus") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { + cerr << dcmdline_options << endl; + MPI::Finalize(); + exit(1); + } +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int lc = 0; + while(in) { + getline(in, line); + if (!in) break; + if (lc % size == rank) { + c->push_back(line); + ids->push_back(lc); + } + ++lc; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct TrainingObserver : public DecoderObserver { + void Reset() { + acc_obj = 0; + } + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + cur_obj = 0; + state = 1; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + assert(state == 1); + state = 2; + SparseVector cur_model_exp; + const prob_t z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); + cur_obj = log(z); + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + assert(state == 2); + state = 3; + SparseVector ref_exp; + const prob_t ref_z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); + + double log_ref_z; +#if 0 + if (crf_uniform_empirical) { + log_ref_z = ref_exp.dot(feature_weights); + } else { + log_ref_z = log(ref_z); + } +#else + log_ref_z = log(ref_z); +#endif + + // rounding errors means that <0 is too strict + if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { + cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; + exit(1); + } + assert(!isnan(log_ref_z)); + acc_obj += (cur_obj - log_ref_z); + } + + double acc_obj; + double cur_obj; + int state; +}; + +namespace mpi = boost::mpi; + +int main(int argc, char** argv) { + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + // load initial weights + Weights weights; + if (conf.count("weights")) + weights.InitFromFile(conf["weights"].as()); + + // freeze feature set + //const bool freeze_feature_set = conf.count("freeze_feature_set"); + //if (freeze_feature_set) FD::Freeze(); + + // load cdec.ini and set up decoder + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + abort(); + } + + vector corpus; vector ids; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + assert(corpus.size() > 0); + assert(corpus.size() == ids.size()); + + vector wv; + weights.InitVector(&wv); + decoder.SetWeights(wv); + TrainingObserver observer; + double objective = 0; + bool converged = false; + + observer.Reset(); + if (rank == 0) + cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; + + for (int i = 0; i < corpus.size(); ++i) { + decoder.SetId(ids[i]); + decoder.Decode(corpus[i], &observer); + } + + reduce(world, observer.acc_obj, objective, std::plus(), 0); + + if (rank == 0) + cout << "OBJECTIVE: " << objective << endl; + + return 0; +} diff --git a/training/model1.cc b/training/model1.cc index 92a70985..3e27689f 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -29,8 +29,8 @@ int main(int argc, char** argv) { double denom = 0.0; int lc = 0; bool flag = false; + string line; while(true) { - string line; getline(in, line); if (!in) break; ++lc; @@ -41,8 +41,11 @@ int main(int argc, char** argv) { Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); LatticeTools::ConvertTextToLattice(strg, &trg); - assert(src.size() > 0); - assert(trg.size() > 0); + if (src.size() == 0 || trg.size() == 0) { + cerr << "Error: " << lc << "\n" << line << endl; + assert(src.size() > 0); + assert(trg.size() > 0); + } denom += 1.0; vector probs(src.size() + 1); for (int j = 0; j < trg.size(); ++j) { -- cgit v1.2.3