From 1a49acb8c540e39f1a95f464197e12f44ad98a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 6 Jul 2011 20:41:52 -0400 Subject: tool to compute feature expectations in translation charts --- training/Makefile.am | 4 + training/feature_expectations.cc | 232 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 training/feature_expectations.cc (limited to 'training') diff --git a/training/Makefile.am b/training/Makefile.am index 0d9085e4..e075e417 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -14,6 +14,7 @@ bin_PROGRAMS = \ mpi_batch_optimize \ mpi_em_optimize \ compute_cllh \ + feature_expectations \ augment_grammar noinst_PROGRAMS = \ @@ -28,6 +29,9 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +feature_expectations_SOURCES = feature_expectations.cc +feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc new file mode 100644 index 00000000..f1a85495 --- /dev/null +++ b/training/feature_expectations.cc @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "verbose.h" +#include "hg.h" +#include "prob.h" +#include "inside_outside.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "online_optimizer.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +#ifdef HAVE_MPI +#include +#include +namespace mpi = boost::mpi; +#endif + +using namespace std; +namespace po = boost::program_options; + +struct FComp { + const vector& w_; + FComp(const vector& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; + +void ShowFeatures(const vector& w) { + vector fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + sort(fnums.begin(), fnums.end(), FComp(w)); + for (vector::iterator i = fnums.begin(); i != fnums.end(); ++i) { + if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl; + } +} + +void ReadConfig(const string& ini, vector* out) { + ReadFile rf(ini); + istream& in = *rf.stream(); + while(in) { + string line; + getline(in, line); + if (!in) continue; + out->push_back(line); + } +} + +void StoreConfig(const vector& cfg, istringstream* o) { + ostringstream os; + for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } + o->str(os.str()); +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input,i",po::value(),"Corpus of source language sentences") + ("weights,w",po::value(),"Input feature weights file") + ("decoder_config,c",po::value(), "cdec.ini file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* order) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int id = 0; + while(in) { + getline(in, line); + if (!in) break; + if (id % size == rank) { + c->push_back(line); + order->push_back(id); + } + ++id; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct TrainingObserver : public DecoderObserver { + void Reset() { + acc_exp.clear(); + total_complete = 0; + } + + virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { + cur_model_exp.clear(); + state = 1; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + assert(state == 1); + state = 2; + const prob_t z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); + cur_model_exp /= z; + acc_exp += cur_model_exp; + } + + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + cerr << "IGNORING ALIGNMENT FOREST!\n"; + } + + virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { + if (state == 2) { + ++total_complete; + } + } + + void GetExpectations(SparseVector* g) const { + g->clear(); + for (SparseVector::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it) + g->set_value(it->first, it->second); + } + + int total_complete; + SparseVector cur_model_exp; + SparseVector acc_exp; + int state; +}; + +#ifdef HAVE_MPI +namespace boost { namespace mpi { + template<> + struct is_commutative >, SparseVector > + : mpl::true_ { }; +} } // end namespace boost::mpi +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return 1; + + // load initial weights + Weights weights; + if (conf.count("weights")) + weights.InitFromFile(conf["weights"].as()); + + vector corpus; + vector ids; + ReadTrainingCorpus(conf["input"].as(), rank, size, &corpus, &ids); + assert(corpus.size() > 0); + + vector cdec_ini; + ReadConfig(conf["decoder_config"].as(), &cdec_ini); + istringstream ini; + StoreConfig(cdec_ini, &ini); + Decoder decoder(&ini); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + return 1; + } + + SparseVector x; + weights.InitSparseVector(&x); + TrainingObserver observer; + + weights.InitFromVector(x); + vector lambdas; + weights.InitVector(&lambdas); + decoder.SetWeights(lambdas); + observer.Reset(); + for (unsigned i = 0; i < corpus.size(); ++i) { + int id = ids[i]; + decoder.SetId(id); + decoder.Decode(corpus[i], &observer); + } + SparseVector local_exps, exps; + observer.GetExpectations(&local_exps); +#ifdef HAVE_MPI + reduce(world, local_exps, exps, std::plus >(), 0); +#else + exps.swap(local_exps); +#endif + + weights.InitFromVector(exps); + weights.InitVector(&lambdas); + ShowFeatures(lambdas); + + return 0; +} -- cgit v1.2.3 From bb86637332d49f71c485df34576e464eaf053656 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 13 Sep 2011 17:36:23 +0100 Subject: get rid of bad Weights class so it no longer keeps a copy of a vector inside it --- decoder/decoder.cc | 64 ++++++++--------- decoder/decoder.h | 9 ++- mira/kbest_mira.cc | 62 ++++------------- pro-train/mr_pro_map.cc | 8 +-- pro-train/mr_pro_reduce.cc | 16 ++--- training/Makefile.am | 8 --- training/augment_grammar.cc | 4 +- training/collapse_weights.cc | 6 +- training/compute_cllh.cc | 23 +++--- training/grammar_convert.cc | 8 +-- training/mpi_batch_optimize.cc | 127 ++++++++-------------------------- training/mpi_online_optimize.cc | 69 +++++++----------- training/mr_optimize_reduce.cc | 19 ++--- utils/fdict.h | 2 + utils/phmt.cc | 8 +-- utils/weights.cc | 75 ++++++++++++-------- utils/weights.h | 22 +++--- vest/mr_vest_generate_mapper_input.cc | 6 +- 18 files changed, 201 insertions(+), 335 deletions(-) (limited to 'training') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 25eb2de4..4d4b6245 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -159,8 +159,7 @@ struct RescoringPass { shared_ptr models; shared_ptr inter_conf; vector ffs; - shared_ptr w; // null == use previous weights - vector weight_vector; + shared_ptr > weight_vector; int fid_summary; // 0 == no summary feature double density_prune; // 0 == don't density prune double beam_prune; // 0 == don't beam prune @@ -169,7 +168,7 @@ struct RescoringPass { ostream& operator<<(ostream& os, const RescoringPass& rp) { os << "[num_fn=" << rp.ffs.size(); if (rp.inter_conf) { os << " int_alg=" << *rp.inter_conf; } - if (rp.w) os << " new_weights"; + //if (rp.weight_vector.size() > 0) os << " new_weights"; if (rp.fid_summary) os << " summary_feature=" << FD::Convert(rp.fid_summary); if (rp.density_prune) os << " density_prune=" << rp.density_prune; if (rp.beam_prune) os << " beam_prune=" << rp.beam_prune; @@ -181,13 +180,8 @@ struct DecoderImpl { DecoderImpl(po::variables_map& conf, int argc, char** argv, istream* cfg); ~DecoderImpl(); bool Decode(const string& input, DecoderObserver*); - void SetWeights(const vector& weights) { - init_weights = weights; - for (int i = 0; i < rescoring_passes.size(); ++i) { - if (rescoring_passes[i].models) - rescoring_passes[i].models->SetWeights(weights); - rescoring_passes[i].weight_vector = weights; - } + vector& CurrentWeightVector() { + return *rescoring_passes.back().weight_vector; } void SetId(int next_sent_id) { sent_id = next_sent_id - 1; } @@ -300,8 +294,7 @@ struct DecoderImpl { OracleBleu oracle; string formalism; shared_ptr translator; - Weights w_init_weights; // used with initial parse - vector init_weights; // weights used with initial parse + shared_ptr > init_weights; // weights used with initial parse vector > pffs; #ifdef FSA_RESCORING CFGOptions cfg_options; @@ -557,13 +550,18 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream exit(1); } - // load initial feature weights (and possibly freeze feature set) - if (conf.count("weights")) { - w_init_weights.InitFromFile(str("weights",conf)); - w_init_weights.InitVector(&init_weights); - init_weights.resize(FD::NumFeats()); + // load perfect hash function for features + if (conf.count("cmph_perfect_feature_hash")) { + cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as() << " ...\n"; + FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); + cerr << " " << FD::NumFeats() << " features in map\n"; } + // load initial feature weights (and possibly freeze feature set) + init_weights.reset(new vector); + if (conf.count("weights")) + Weights::InitFromFile(str("weights",conf), init_weights.get()); + // cube pruning pop-limit: we may want to configure this on a per-pass basis pop_limit = conf["cubepruning_pop_limit"].as(); @@ -582,9 +580,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream RescoringPass& rp = rescoring_passes.back(); // only configure new weights if pass > 0, otherwise we reuse the initial chart weights if (nth_pass_condition && conf.count(ws)) { - rp.w.reset(new Weights); - rp.w->InitFromFile(str(ws.c_str(), conf)); - rp.w->InitVector(&rp.weight_vector); + rp.weight_vector.reset(new vector()); + Weights::InitFromFile(str(ws.c_str(), conf), rp.weight_vector.get()); } bool has_stateful = false; if (conf.count(ff)) { @@ -624,11 +621,15 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } // set up weight vectors since later phases may reuse weights from earlier phases - const vector* prev = &init_weights; + shared_ptr > prev_weights = init_weights; for (int pass = 0; pass < rescoring_passes.size(); ++pass) { RescoringPass& rp = rescoring_passes[pass]; - if (!rp.w) { rp.weight_vector = *prev; } else { prev = &rp.weight_vector; } - rp.models.reset(new ModelSet(rp.weight_vector, rp.ffs)); + if (!rp.weight_vector) { + rp.weight_vector = prev_weights; + } else { + prev_weights = rp.weight_vector; + } + rp.models.reset(new ModelSet(*rp.weight_vector, rp.ffs)); string ps = "Pass1 "; ps[4] += pass; if (!SILENT) show_models(conf,*rp.models,ps.c_str()); } @@ -650,12 +651,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream FD::Freeze(); // this means we can't see the feature names of not-weighted features } - if (conf.count("cmph_perfect_feature_hash")) { - cerr << "Loading perfect hash function from " << conf["cmph_perfect_feature_hash"].as() << " ...\n"; - FD::EnableHash(conf["cmph_perfect_feature_hash"].as()); - cerr << " " << FD::NumFeats() << " features in map\n"; - } - // set up translation back end if (formalism == "scfg") translator.reset(new SCFGTranslator(conf)); @@ -685,7 +680,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } if (!fsa_ffs.empty()) { cerr<<"FSA: "; - show_all_features(fsa_ffs,init_weights,cerr,cerr,true,true); + show_all_features(fsa_ffs,*init_weights,cerr,cerr,true,true); } #endif @@ -733,7 +728,8 @@ bool Decoder::Decode(const string& input, DecoderObserver* o) { if (del) delete o; return res; } -void Decoder::SetWeights(const vector& weights) { pimpl_->SetWeights(weights); } +vector& Decoder::CurrentWeightVector() { return pimpl_->CurrentWeightVector(); } +const vector& Decoder::CurrentWeightVector() const { return pimpl_->CurrentWeightVector(); } void Decoder::SetSupplementalGrammar(const std::string& grammar_string) { assert(pimpl_->translator->GetDecoderType() == "SCFG"); static_cast(*pimpl_->translator).SetSupplementalGrammar(grammar_string); @@ -774,7 +770,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { translator->ProcessMarkupHints(smeta.sgml_); Timer t("Translation"); const bool translation_successful = - translator->Translate(to_translate, &smeta, init_weights, &forest); + translator->Translate(to_translate, &smeta, *init_weights, &forest); translator->SentenceComplete(); if (!translation_successful) { @@ -812,7 +808,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { for (int pass = 0; pass < rescoring_passes.size(); ++pass) { const RescoringPass& rp = rescoring_passes[pass]; - const vector& cur_weights = rp.weight_vector; + const vector& cur_weights = *rp.weight_vector; if (!SILENT) cerr << endl << " RESCORING PASS #" << (pass+1) << " " << rp << endl; #ifdef FSA_RESCORING cfg_options.maybe_output_source(forest); @@ -933,7 +929,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { #endif } - const vector& last_weights = (rescoring_passes.empty() ? init_weights : rescoring_passes.back().weight_vector); + const vector& last_weights = (rescoring_passes.empty() ? *init_weights : *rescoring_passes.back().weight_vector); // Oracle Rescoring if(get_oracle_forest) { diff --git a/decoder/decoder.h b/decoder/decoder.h index 5491369f..9d009ffa 100644 --- a/decoder/decoder.h +++ b/decoder/decoder.h @@ -7,6 +7,8 @@ #include #include +#include "weights.h" // weight_t + #undef CP_TIME //#define CP_TIME #ifdef CP_TIME @@ -39,7 +41,12 @@ struct Decoder { Decoder(int argc, char** argv); Decoder(std::istream* config_file); bool Decode(const std::string& input, DecoderObserver* observer = NULL); - void SetWeights(const std::vector& weights); + + // access this to either *read* or *write* to the decoder's last + // weight vector (i.e., the weights of the finest past) + std::vector& CurrentWeightVector(); + const std::vector& CurrentWeightVector() const; + void SetId(int id); ~Decoder(); const boost::program_options::variables_map& GetConf() const { return conf; } diff --git a/mira/kbest_mira.cc b/mira/kbest_mira.cc index 6918a9a1..459a5e6f 100644 --- a/mira/kbest_mira.cc +++ b/mira/kbest_mira.cc @@ -32,21 +32,6 @@ namespace po = boost::program_options; bool invert_score; boost::shared_ptr rng; -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - void RandomPermutation(int len, vector* p_ids) { vector& ids = *p_ids; ids.resize(len); @@ -58,21 +43,6 @@ void RandomPermutation(int len, vector* p_ids) { } } -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - --mid; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -209,14 +179,16 @@ int main(int argc, char** argv) { cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; return 1; } - // load initial weights - Weights weights; - weights.InitFromFile(conf["input_weights"].as()); - SparseVector lambdas; - weights.InitSparseVector(&lambdas); ReadFile ini_rf(conf["decoder_config"].as()); Decoder decoder(ini_rf.stream()); + + // load initial weights + vector& dense_weights = decoder.CurrentWeightVector(); + SparseVector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + const double max_step_size = conf["max_step_size"].as(); const double mt_metric_scale = conf["mt_metric_scale"].as(); @@ -230,7 +202,6 @@ int main(int argc, char** argv) { double tot_loss = 0; int dots = 0; int cur_pass = 0; - vector dense_weights; SparseVector tot; tot += lambdas; // initial weights normalizer++; // count for initial weights @@ -240,27 +211,22 @@ int main(int argc, char** argv) { vector order; RandomPermutation(corpus.size(), &order); while (lcount <= max_iteration) { - dense_weights.clear(); - weights.InitFromVector(lambdas); - weights.InitVector(&dense_weights); - decoder.SetWeights(dense_weights); + lambdas.init_vector(&dense_weights); if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; } if (corpus.size() == cur_sent) { cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n"; - ShowLargestFeatures(dense_weights); + Weights::ShowLargestFeatures(dense_weights); cur_sent = 0; tot_loss = 0; dots = 0; ostringstream os; os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; - weights.WriteToFile(os.str(), true, &msg); SparseVector x = tot; x /= normalizer; ostringstream sa; sa << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz"; - Weights ww; - ww.InitFromVector(x); - ww.WriteToFile(sa.str(), true, &msga); + x.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); ++cur_pass; RandomPermutation(corpus.size(), &order); } @@ -294,11 +260,11 @@ int main(int argc, char** argv) { ++cur_sent; } cerr << endl; - weights.WriteToFile("weights.mira-final.gz", true, &msg); + Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg); tot /= normalizer; - weights.InitFromVector(tot); + tot.init_vector(dense_weights); msg = "# MIRA tuned weights (averaged vector)"; - weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); + Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg); cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; return 0; } diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 4324e8de..bc59285b 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -301,12 +301,8 @@ int main(int argc, char** argv) { const unsigned gamma = conf["candidate_pairs"].as(); const unsigned xi = conf["best_pairs"].as(); string weightsf = conf["weights"].as(); - vector weights; - { - Weights w; - w.InitFromFile(weightsf); - w.InitVector(&weights); - } + vector weights; + Weights::InitFromFile(weightsf, &weights); string kbest_repo = conf["kbest_repository"].as(); MkDirP(kbest_repo); while(in) { diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc index 9b422f33..9caaa1d1 100644 --- a/pro-train/mr_pro_reduce.cc +++ b/pro-train/mr_pro_reduce.cc @@ -194,7 +194,7 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); string line; vector > > training, testing; - SparseVector old_weights; + SparseVector old_weights; const bool tune_regularizer = conf.count("tune_regularizer"); if (tune_regularizer && !conf.count("testset")) { cerr << "--tune_regularizer requires --testset to be set\n"; @@ -210,9 +210,9 @@ int main(int argc, char** argv) { const double psi = conf["interpolation"].as(); if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; } if (conf.count("weights")) { - Weights w; - w.InitFromFile(conf["weights"].as()); - w.InitSparseVector(&old_weights); + vector dt; + Weights::InitFromFile(conf["weights"].as(), &dt); + Weights::InitSparseVector(dt, &old_weights); } ReadCorpus(&cin, &training); if (conf.count("testset")) { @@ -220,8 +220,8 @@ int main(int argc, char** argv) { ReadCorpus(rf.stream(), &testing); } cerr << "Number of features: " << FD::NumFeats() << endl; - vector x(FD::NumFeats(), 0.0); // x[0] is bias - for (SparseVector::const_iterator it = old_weights.begin(); + vector x(FD::NumFeats(), 0.0); // x[0] is bias + for (SparseVector::const_iterator it = old_weights.begin(); it != old_weights.end(); ++it) x[it->first] = it->second; double tppl = 0.0; @@ -257,7 +257,6 @@ int main(int argc, char** argv) { sigsq = sp[best_i].first; tppl = LearnParameters(training, testing, sigsq, conf["memory_buffers"].as(), &x); } - Weights w; if (conf.count("weights")) { for (int i = 1; i < x.size(); ++i) x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi); @@ -271,7 +270,6 @@ int main(int argc, char** argv) { cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; } } - w.InitFromVector(x); - w.WriteToFile("-"); + Weights::WriteToFile("-", x); return 0; } diff --git a/training/Makefile.am b/training/Makefile.am index e075e417..6e2c06f5 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -12,9 +12,7 @@ bin_PROGRAMS = \ cllh_filter_grammar \ mpi_online_optimize \ mpi_batch_optimize \ - mpi_em_optimize \ compute_cllh \ - feature_expectations \ augment_grammar noinst_PROGRAMS = \ @@ -29,12 +27,6 @@ mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -feature_expectations_SOURCES = feature_expectations.cc -feature_expectations_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc -mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - compute_cllh_SOURCES = compute_cllh.cc compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc index df8d4ee8..e89a92d5 100644 --- a/training/augment_grammar.cc +++ b/training/augment_grammar.cc @@ -134,9 +134,7 @@ int main(int argc, char** argv) { } else { ngram = NULL; } extra_feature = conf.count("extra_lex_feature") > 0; if (conf.count("collapse_weights")) { - Weights w; - w.InitFromFile(conf["collapse_weights"].as()); - w.InitVector(&col_weights); + Weights::InitFromFile(conf["collapse_weights"].as(), &col_weights); } clear_features = conf.count("clear_features_after_collapse") > 0; gather_rules = false; diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc index 4fb742fb..dc480f6c 100644 --- a/training/collapse_weights.cc +++ b/training/collapse_weights.cc @@ -59,10 +59,8 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); const string wfile = conf["weights"].as(); const string gfile = conf["grammar"].as(); - Weights wm; - wm.InitFromFile(wfile); - vector w; - wm.InitVector(&w); + vector w; + Weights::InitFromFile(wfile, &w); MarginalMap e_tots; MarginalMap f_tots; prob_t tot; diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc index 332f6d0c..b496d196 100644 --- a/training/compute_cllh.cc +++ b/training/compute_cllh.cc @@ -148,15 +148,6 @@ int main(int argc, char** argv) { if (!InitCommandLine(argc, argv, &conf)) return false; - // load initial weights - Weights weights; - if (conf.count("weights")) - weights.InitFromFile(conf["weights"].as()); - - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - // load cdec.ini and set up decoder ReadFile ini_rf(conf["decoder_config"].as()); Decoder decoder(ini_rf.stream()); @@ -165,17 +156,22 @@ int main(int argc, char** argv) { abort(); } + // load weights + vector& weights = decoder.CurrentWeightVector(); + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &weights); + + // freeze feature set + //const bool freeze_feature_set = conf.count("freeze_feature_set"); + //if (freeze_feature_set) FD::Freeze(); + vector corpus; vector ids; ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); assert(corpus.size() > 0); assert(corpus.size() == ids.size()); - vector wv; - weights.InitVector(&wv); - decoder.SetWeights(wv); TrainingObserver observer; double objective = 0; - bool converged = false; observer.Reset(); if (rank == 0) @@ -197,3 +193,4 @@ int main(int argc, char** argv) { return 0; } + diff --git a/training/grammar_convert.cc b/training/grammar_convert.cc index 8d292f8a..bf8abb26 100644 --- a/training/grammar_convert.cc +++ b/training/grammar_convert.cc @@ -251,12 +251,10 @@ int main(int argc, char **argv) { const bool is_split_input = (conf["format"].as() == "split"); const bool is_json_input = is_split_input || (conf["format"].as() == "json"); const bool collapse_weights = conf.count("collapse_weights"); - Weights wts; vector w; - if (conf.count("weights")) { - wts.InitFromFile(conf["weights"].as()); - wts.InitVector(&w); - } + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &w); + if (collapse_weights && !w.size()) { cerr << "--collapse_weights requires a weights file to be specified!\n"; exit(1); diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index 39a8af7d..cc5953f6 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -31,42 +31,12 @@ using namespace std; using boost::shared_ptr; namespace po = boost::program_options; -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("input_weights,w",po::value(),"Input feature weights file") ("training_data,t",po::value(),"Training data") ("decoder_config,d",po::value(),"Decoder configuration file") - ("sharded_input,s",po::value(), "Corpus and grammar files are 'sharded' so each processor loads its own input and grammar file. Argument is the directory containing the shards.") ("output_weights,o",po::value()->default_value("-"),"Output feature weights file") ("optimization_method,m", po::value()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)") ("correction_buffers,M", po::value()->default_value(10), "Number of gradients for LBFGS to maintain in memory") @@ -88,14 +58,10 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { } po::notify(*conf); - if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data") | conf->count("sharded_input")) || !conf->count("decoder_config")) { + if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) { cerr << dcmdline_options << endl; return false; } - if (conf->count("training_data") && conf->count("sharded_input")) { - cerr << "Cannot specify both --training_data and --sharded_input\n"; - return false; - } return true; } @@ -236,42 +202,9 @@ int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; - string shard_dir; - if (conf.count("sharded_input")) { - shard_dir = conf["sharded_input"].as(); - if (!DirectoryExists(shard_dir)) { - if (rank == 0) cerr << "Can't find shard directory: " << shard_dir << endl; - return 1; - } - if (rank == 0) - cerr << "Shard directory: " << shard_dir << endl; - } - - // load initial weights - Weights weights; - if (rank == 0) { cerr << "Loading weights...\n"; } - weights.InitFromFile(conf["input_weights"].as()); - if (rank == 0) { cerr << "Done loading weights.\n"; } - - // freeze feature set (should be optional?) - const bool freeze_feature_set = true; - if (freeze_feature_set) FD::Freeze(); - // load cdec.ini and set up decoder vector cdec_ini; ReadConfig(conf["decoder_config"].as(), &cdec_ini); - if (shard_dir.size()) { - if (rank == 0) { - for (int i = 0; i < cdec_ini.size(); ++i) { - if (cdec_ini[i].find("grammar=") == 0) { - cerr << "!!! using sharded input and " << conf["decoder_config"].as() << " contains a grammar specification:\n" << cdec_ini[i] << "\n VERIFY THAT THIS IS CORRECT!\n"; - } - } - } - ostringstream g; - g << "grammar=" << shard_dir << "/grammar." << rank << "_of_" << size << ".gz"; - cdec_ini.push_back(g.str()); - } istringstream ini; StoreConfig(cdec_ini, &ini); if (rank == 0) cerr << "Loading grammar...\n"; @@ -282,22 +215,28 @@ int main(int argc, char** argv) { } if (rank == 0) cerr << "Done loading grammar!\n"; + // load initial weights + if (rank == 0) { cerr << "Loading weights...\n"; } + vector& lambdas = decoder->CurrentWeightVector(); + Weights::InitFromFile(conf["input_weights"].as(), &lambdas); + if (rank == 0) { cerr << "Done loading weights.\n"; } + + // freeze feature set (should be optional?) + const bool freeze_feature_set = true; + if (freeze_feature_set) FD::Freeze(); + const int num_feats = FD::NumFeats(); if (rank == 0) cerr << "Number of features: " << num_feats << endl; + lambdas.resize(num_feats); + const bool gaussian_prior = conf.count("gaussian_prior"); - vector means(num_feats, 0); + vector means(num_feats, 0); if (conf.count("means")) { if (!gaussian_prior) { cerr << "Don't use --means without --gaussian_prior!\n"; exit(1); } - Weights wm; - wm.InitFromFile(conf["means"].as()); - if (num_feats != FD::NumFeats()) { - cerr << "[ERROR] Means file had unexpected features!\n"; - exit(1); - } - wm.InitVector(&means); + Weights::InitFromFile(conf["means"].as(), &means); } shared_ptr o; if (rank == 0) { @@ -309,26 +248,13 @@ int main(int argc, char** argv) { cerr << "Optimizer: " << o->Name() << endl; } double objective = 0; - vector lambdas(num_feats, 0.0); - weights.InitVector(&lambdas); - if (lambdas.size() != num_feats) { - cerr << "Initial weights file did not have all features specified!\n feats=" - << num_feats << "\n weights file=" << lambdas.size() << endl; - lambdas.resize(num_feats, 0.0); - } vector gradient(num_feats, 0.0); - vector rcv_grad(num_feats, 0.0); + vector rcv_grad; + rcv_grad.clear(); bool converged = false; vector corpus; - if (shard_dir.size()) { - ostringstream os; os << shard_dir << "/corpus." << rank << "_of_" << size; - ReadTrainingCorpus(os.str(), 0, 1, &corpus); - cerr << os.str() << " has " << corpus.size() << " training examples. " << endl; - if (corpus.size() > 500) { corpus.resize(500); cerr << " TRUNCATING\n"; } - } else { - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); - } + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); TrainingObserver observer; @@ -341,19 +267,20 @@ int main(int argc, char** argv) { if (rank == 0) { cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n"; } - decoder->SetWeights(lambdas); for (int i = 0; i < corpus.size(); ++i) decoder->Decode(corpus[i], &observer); cerr << " process " << rank << '/' << size << " done\n"; fill(gradient.begin(), gradient.end(), 0); - fill(rcv_grad.begin(), rcv_grad.end(), 0); observer.SetLocalGradientAndObjective(&gradient, &objective); double to = 0; #ifdef HAVE_MPI + rcv_grad.resize(num_feats, 0.0); mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus(), 0); - mpi::reduce(world, objective, to, plus(), 0); swap(gradient, rcv_grad); + rcv_grad.clear(); + + mpi::reduce(world, objective, to, plus(), 0); objective = to; #endif @@ -378,7 +305,7 @@ int main(int argc, char** argv) { for (int i = 0; i < gradient.size(); ++i) gnorm += gradient[i] * gradient[i]; cerr << " GNORM=" << sqrt(gnorm) << endl; - vector old = lambdas; + vector old = lambdas; int c = 0; while (old == lambdas) { ++c; @@ -387,9 +314,8 @@ int main(int argc, char** argv) { assert(c < 5); } old.clear(); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); - weights.InitFromVector(lambdas); + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); converged = o->HasConverged(); if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } @@ -399,7 +325,7 @@ int main(int argc, char** argv) { ostringstream vv; vv << "Objective = " << objective << " (eval count=" << o->EvaluationCount() << ")"; const string svv = vv.str(); - weights.WriteToFile(fname, true, &svv); + Weights::WriteToFile(fname, lambdas, true, &svv); } // rank == 0 int cint = converged; #ifdef HAVE_MPI @@ -411,3 +337,4 @@ int main(int argc, char** argv) { } return 0; } + diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 32033c19..2ef4a2e7 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -31,35 +31,6 @@ namespace mpi = boost::mpi; using namespace std; namespace po = boost::program_options; -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -250,10 +221,25 @@ int main(int argc, char** argv) { if (!InitCommandLine(argc, argv, &conf)) return 1; + vector > agenda; + if (!LoadAgenda(conf["training_agenda"].as(), &agenda)) + return 1; + if (rank == 0) + cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n"; + + assert(agenda.size() > 0); + + if (1) { // hack to load the feature hash functions -- TODO this should not be in cdec.ini + const string& cur_config = agenda[0].first; + const unsigned max_iteration = agenda[0].second; + ReadFile ini_rf(cur_config); + Decoder decoder(ini_rf.stream()); + } + // load initial weights - Weights weights; + vector init_weights; if (conf.count("input_weights")) - weights.InitFromFile(conf["input_weights"].as()); + Weights::InitFromFile(conf["input_weights"].as(), &init_weights); vector frozen_fids; if (conf.count("frozen_features")) { @@ -310,19 +296,12 @@ int main(int argc, char** argv) { rng.reset(new MT19937); SparseVector x; - weights.InitSparseVector(&x); + Weights::InitSparseVector(init_weights, &x); TrainingObserver observer; int write_weights_every_ith = 100; // TODO configure int titer = -1; - vector > agenda; - if (!LoadAgenda(conf["training_agenda"].as(), &agenda)) - return 1; - if (rank == 0) - cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n"; - - vector lambdas; for (int ai = 0; ai < agenda.size(); ++ai) { const string& cur_config = agenda[ai].first; const unsigned max_iteration = agenda[ai].second; @@ -331,6 +310,8 @@ int main(int argc, char** argv) { // load cdec.ini and set up decoder ReadFile ini_rf(cur_config); Decoder decoder(ini_rf.stream()); + vector& lambdas = decoder.CurrentWeightVector(); + if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); } if (rank == 0) o->ResetEpoch(); // resets the learning rate-- TODO is this good? @@ -341,15 +322,13 @@ int main(int argc, char** argv) { #ifdef HAVE_MPI mpi::timer timer; #endif - weights.InitFromVector(x); - weights.InitVector(&lambdas); + x.init_vector(&lambdas); ++iter; ++titer; observer.Reset(); - decoder.SetWeights(lambdas); if (rank == 0) { converged = (iter == max_iteration); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); string fname = "weights.cur.gz"; if (iter % write_weights_every_ith == 0) { ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz"; @@ -360,7 +339,7 @@ int main(int argc, char** argv) { vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast(corpus.size())) << " eta=" << lr->eta(titer); const string svv = vv.str(); cerr << svv << endl; - weights.WriteToFile(fname, true, &svv); + Weights::WriteToFile(fname, lambdas, true, &svv); } for (int i = 0; i < size_per_proc; ++i) { diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc index b931991d..15e28fa1 100644 --- a/training/mr_optimize_reduce.cc +++ b/training/mr_optimize_reduce.cc @@ -88,25 +88,19 @@ int main(int argc, char** argv) { const bool use_b64 = conf["input_format"].as() == "b64"; - Weights weights; - weights.InitFromFile(conf["input_weights"].as()); + vector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &lambdas); const string s_obj = "**OBJ**"; int num_feats = FD::NumFeats(); cerr << "Number of features: " << num_feats << endl; const bool gaussian_prior = conf.count("gaussian_prior"); - vector means(num_feats, 0); + vector means(num_feats, 0); if (conf.count("means")) { if (!gaussian_prior) { cerr << "Don't use --means without --gaussian_prior!\n"; exit(1); } - Weights wm; - wm.InitFromFile(conf["means"].as()); - if (num_feats != FD::NumFeats()) { - cerr << "[ERROR] Means file had unexpected features!\n"; - exit(1); - } - wm.InitVector(&means); + Weights::InitFromFile(conf["means"].as(), &means); } shared_ptr o; const string omethod = conf["optimization_method"].as(); @@ -124,8 +118,6 @@ int main(int argc, char** argv) { cerr << "No state file found, assuming ITERATION 1\n"; } - vector lambdas(num_feats, 0); - weights.InitVector(&lambdas); double objective = 0; vector gradient(num_feats, 0); // 0**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; @@ -223,8 +215,7 @@ int main(int argc, char** argv) { old.clear(); SanityCheck(lambdas); ShowLargestFeatures(lambdas); - weights.InitFromVector(lambdas); - weights.WriteToFile(conf["output_weights"].as(), false); + Weights::WriteToFile(conf["output_weights"].as(), lambdas, false); const bool conv = o->HasConverged(); if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } diff --git a/utils/fdict.h b/utils/fdict.h index 771e8b91..f0871b9a 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -28,6 +28,8 @@ struct FD { } static void EnableHash(const std::string& cmph_file) { #ifdef HAVE_CMPH + assert(dict_.max() == 0); // dictionary must not have + // been added to hash_ = new PerfectHashFunction(cmph_file); #endif } diff --git a/utils/phmt.cc b/utils/phmt.cc index 1f59afaf..48d9f093 100644 --- a/utils/phmt.cc +++ b/utils/phmt.cc @@ -19,22 +19,18 @@ int main(int argc, char** argv) { cerr << "LexFE = " << FD::Convert("LexFE") << endl; cerr << "LexEF = " << FD::Convert("LexEF") << endl; { - Weights w; vector v(FD::NumFeats()); v[FD::Convert("LexFE")] = 1.0; v[FD::Convert("LexEF")] = 0.5; - w.InitFromVector(v); cerr << "Writing...\n"; - w.WriteToFile("weights.bin"); + Weights::WriteToFile("weights.bin", v); cerr << "Done.\n"; } { - Weights w; vector v(FD::NumFeats()); cerr << "Reading...\n"; - w.InitFromFile("weights.bin"); + Weights::InitFromFile("weights.bin", &v); cerr << "Done.\n"; - w.InitVector(&v); assert(v[FD::Convert("LexFE")] == 1.0); assert(v[FD::Convert("LexEF")] == 0.5); } diff --git a/utils/weights.cc b/utils/weights.cc index 0916b72a..c49000be 100644 --- a/utils/weights.cc +++ b/utils/weights.cc @@ -8,7 +8,10 @@ using namespace std; -void Weights::InitFromFile(const std::string& filename, vector* feature_list) { +void Weights::InitFromFile(const string& filename, + vector* pweights, + vector* feature_list) { + vector& weights = *pweights; if (!SILENT) cerr << "Reading weights from " << filename << endl; ReadFile in_file(filename); istream& in = *in_file.stream(); @@ -47,16 +50,16 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ int end = 0; while(end < buf.size() && buf[end] != ' ') ++end; const int fid = FD::Convert(buf.substr(start, end - start)); + if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); } while(end < buf.size() && buf[end] == ' ') ++end; val = strtod(&buf.c_str()[end], NULL); if (isnan(val)) { cerr << FD::Convert(fid) << " has weight NaN!\n"; abort(); } - if (wv_.size() <= fid) - wv_.resize(fid + 1); - wv_[fid] = val; - if (feature_list) { feature_list->push_back(FD::Convert(fid)); } + if (weights.size() <= fid) + weights.resize(fid + 1); + weights[fid] = val; ++weight_count; if (!SILENT) { if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; } @@ -76,8 +79,8 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl; abort(); } - wv_.resize(num_keys[0]); - in.get(reinterpret_cast(&wv_[0]), num_keys[0] * sizeof(weight_t)); + weights.resize(num_keys[0]); + in.get(reinterpret_cast(&weights[0]), num_keys[0] * sizeof(weight_t)); if (!in.good()) { cerr << "Error loading weights!\n"; abort(); @@ -85,7 +88,10 @@ void Weights::InitFromFile(const std::string& filename, vector* feature_ } } -void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const { +void Weights::WriteToFile(const string& fname, + const vector& weights, + bool hide_zero_value_features, + const string* extra) { WriteFile out(fname); ostream& o = *out.stream(); assert(o); @@ -96,41 +102,54 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature o.precision(17); const int num_feats = FD::NumFeats(); for (int i = 1; i < num_feats; ++i) { - const weight_t val = (i < wv_.size() ? wv_[i] : 0.0); + const weight_t val = (i < weights.size() ? weights[i] : 0.0); if (hide_zero_value_features && val == 0.0) continue; o << FD::Convert(i) << ' ' << val << endl; } } else { o.write("_PHWf", 5); const size_t keys = FD::NumFeats(); - assert(keys <= wv_.size()); + assert(keys <= weights.size()); o.write(reinterpret_cast(&keys), sizeof(keys)); - o.write(reinterpret_cast(&wv_[0]), keys * sizeof(weight_t)); + o.write(reinterpret_cast(&weights[0]), keys * sizeof(weight_t)); } } -void Weights::InitVector(std::vector* w) const { - *w = wv_; +void Weights::InitSparseVector(const vector& dv, + SparseVector* sv) { + sv->clear(); + for (unsigned i = 1; i < dv.size(); ++i) { + if (dv[i]) sv->set_value(i, dv[i]); + } } -void Weights::InitSparseVector(SparseVector* w) const { - for (int i = 1; i < wv_.size(); ++i) { - const weight_t& weight = wv_[i]; - if (weight) w->set_value(i, weight); +void Weights::SanityCheck(const vector& w) { + for (int i = 0; i < w.size(); ++i) { + assert(!isnan(w[i])); + assert(!isinf(w[i])); } } -void Weights::InitFromVector(const std::vector& w) { - wv_ = w; - if (wv_.size() > FD::NumFeats()) - cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; - wv_.resize(FD::NumFeats(), 0); -} +struct FComp { + const vector& w_; + FComp(const vector& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; -void Weights::InitFromVector(const SparseVector& w) { - wv_.clear(); - wv_.resize(FD::NumFeats(), 0.0); - for (int i = 1; i < FD::NumFeats(); ++i) - wv_[i] = w.value(i); +void Weights::ShowLargestFeatures(const vector& w) { + vector fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + vector::iterator mid = fnums.begin(); + mid += (w.size() > 10 ? 10 : w.size()); + partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); + cerr << "TOP FEATURES:"; + for (vector::iterator i = fnums.begin(); i != mid; ++i) { + cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; + } + cerr << endl; } + diff --git a/utils/weights.h b/utils/weights.h index 7664810b..30f71db0 100644 --- a/utils/weights.h +++ b/utils/weights.h @@ -10,15 +10,21 @@ typedef double weight_t; class Weights { public: - Weights() {} - void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); - void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const; - void InitVector(std::vector* w) const; - void InitSparseVector(SparseVector* w) const; - void InitFromVector(const std::vector& w); - void InitFromVector(const SparseVector& w); + static void InitFromFile(const std::string& fname, + std::vector* weights, + std::vector* feature_list = NULL); + static void WriteToFile(const std::string& fname, + const std::vector& weights, + bool hide_zero_value_features = true, + const std::string* extra = NULL); + static void InitSparseVector(const std::vector& dv, + SparseVector* sv); + // check for infinities, NaNs, etc + static void SanityCheck(const std::vector& w); + // write weights with largest magnitude to cerr + static void ShowLargestFeatures(const std::vector& w); private: - std::vector wv_; + Weights(); }; #endif diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index b84c44bc..0c094fd5 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -223,16 +223,16 @@ struct oracle_directions { cerr << "Forest repo: " << forest_repository << endl; assert(DirectoryExists(forest_repository)); vector features; - weights.InitFromFile(weights_file, &features); + vector dorigin; + Weights::InitFromFile(weights_file, &dorigin, &features); if (optimize_features.size()) features=optimize_features; - weights.InitSparseVector(&origin); + Weights::InitSparseVector(dorigin, &origin); fids.clear(); AddFeatureIds(features); oracles.resize(dev_set_size); } - Weights weights; void AddFeatureIds(vector const& features) { int i = fids.size(); fids.resize(fids.size()+features.size()); -- cgit v1.2.3 From bac41f58fc6c1fd7901aa0d59490fc59ddc40095 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 13 Sep 2011 21:55:02 +0100 Subject: binary to extract features encountered --- training/Makefile.am | 4 ++ training/mpi_extract_features.cc | 151 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 training/mpi_extract_features.cc (limited to 'training') diff --git a/training/Makefile.am b/training/Makefile.am index 6e2c06f5..7ceeda34 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -11,6 +11,7 @@ bin_PROGRAMS = \ collapse_weights \ cllh_filter_grammar \ mpi_online_optimize \ + mpi_extract_features \ mpi_batch_optimize \ compute_cllh \ augment_grammar @@ -24,6 +25,9 @@ TESTS = lbfgs_test optimize_test mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mpi_extract_features_SOURCES = mpi_extract_features.cc +mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/mpi_extract_features.cc b/training/mpi_extract_features.cc new file mode 100644 index 00000000..6750aa15 --- /dev/null +++ b/training/mpi_extract_features.cc @@ -0,0 +1,151 @@ +#include +#include +#include +#include + +#include "config.h" +#ifdef HAVE_MPI +#include +#endif +#include +#include + +#include "ff_register.h" +#include "verbose.h" +#include "filelib.h" +#include "fdict.h" +#include "decoder.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("training_data,t",po::value(),"Training data corpus") + ("decoder_config,c",po::value(),"Decoder configuration file") + ("weights,w", po::value(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations") + ("output_prefix,o",po::value()->default_value("features"),"Output path prefix"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { + cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the feature strings encountered.\n"; + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int lc = 0; + while(in) { + getline(in, line); + if (!in) break; + if (lc % size == rank) c->push_back(line); + ++lc; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct TrainingObserver : public DecoderObserver { + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + } +}; + +#ifdef HAVE_MPI +namespace mpi = boost::mpi; +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return false; + + // load cdec.ini and set up decoder + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + abort(); + } + + if (FD::UsingPerfectHashFunction()) { + cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n"; + return 1; + } + + // load optional weights + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &decoder.CurrentWeightVector()); + + vector corpus; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); + assert(corpus.size() > 0); + + TrainingObserver observer; + + if (rank == 0) + cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n"; + + for (int i = 0; i < corpus.size(); ++i) + decoder.Decode(corpus[i], &observer); + + { + ostringstream os; + os << conf["output_prefix"].as() << '.' << rank << "_of_" << size; + WriteFile wf(os.str()); + ostream& out = *wf.stream(); + const unsigned num_feats = FD::NumFeats(); + for (unsigned i = 1; i < num_feats; ++i) { + out << FD::Convert(i) << endl; + } + cerr << "Wrote " << os.str() << endl; + } + +#ifdef HAVE_MPI + world.barrier(); +#else +#endif + + return 0; +} + -- cgit v1.2.3 From 33a891dbaf7c77e822f4692c9ac4056f5b29e88c Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Thu, 15 Sep 2011 12:52:59 +0100 Subject: script to filter reachable sentences, weight cleanup --- decoder/apply_models.cc | 3 +- decoder/hg.h | 8 +- training/Makefile.am | 10 +- training/cllh_filter_grammar.cc | 197 -------------------------------------- training/mpi_extract_reachable.cc | 163 +++++++++++++++++++++++++++++++ utils/feature_vector.h | 4 +- 6 files changed, 174 insertions(+), 211 deletions(-) delete mode 100644 training/cllh_filter_grammar.cc create mode 100644 training/mpi_extract_reachable.cc (limited to 'training') diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 26cdb881..40fd27e4 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -276,8 +276,7 @@ public: make_heap(cand.begin(), cand.end(), HeapCandCompare()); State2Node state2node; // "buf" in Figure 2 int pops = 0; - int pop_limit_eff=max(1,int(v.promise*pop_limit_)); - while(!cand.empty() && pops < pop_limit_eff) { + while(!cand.empty() && pops < pop_limit_) { pop_heap(cand.begin(), cand.end(), HeapCandCompare()); Candidate* item = cand.back(); cand.pop_back(); diff --git a/decoder/hg.h b/decoder/hg.h index e5ef05f8..f0ddbb76 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -49,16 +49,14 @@ public: // TODO get rid of cat_? // TODO keep cat_ and add span and/or state? :) struct Node { - Node() : id_(), cat_(), promise(1) {} + Node() : id_(), cat_() {} int id_; // equal to this object's position in the nodes_ vector WordID cat_; // non-terminal category if <0, 0 if not set WordID NT() const { return -cat_; } EdgesVector in_edges_; // an in edge is an edge with this node as its head. (in edges come from the bottom up to us) indices in edges_ EdgesVector out_edges_; // an out edge is an edge with this node as its tail. (out edges leave us up toward the top/goal). indices in edges_ - double promise; // set in global pruning; in [0,infty) so that mean is 1. use: e.g. scale cube poplimit. //TODO: appears to be useless, compile without this? on the other hand, pretty cheap. void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting cat_=o.cat_; - promise=o.promise; } void copy_reindex(Node const& o,indices_after const& n2,indices_after const& e2) { copy_fixed(o); @@ -81,7 +79,7 @@ public: int head_node_; // refers to a position in nodes_ TailNodeVector tail_nodes_; // contents refer to positions in nodes_ TRulePtr rule_; - FeatureVector feature_values_; + SparseVector feature_values_; prob_t edge_prob_; // dot product of weights and feat_values int id_; // equal to this object's position in the edges_ vector @@ -468,7 +466,7 @@ public: /// drop edge i if edge_margin[i] < prune_below, unless preserve_mask[i] void MarginPrune(EdgeProbs const& edge_margin,prob_t prune_below,EdgeMask const* preserve_mask=0,bool safe_inside=false,bool verbose=false); - //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another. at least make that an option. i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs. this would also make me more comfortable about allocating Node.promise + //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another. at least make that an option. i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs. // beam_alpha=0 means don't beam prune, otherwise drop things that are e^beam_alpha times worse than best - // prunes any edge whose prob_t on the best path taking that edge is more than e^alpha times //density=0 means don't density prune: // for density>=1.0, keep this many times the edges needed for the 1best derivation diff --git a/training/Makefile.am b/training/Makefile.am index 7ceeda34..5752859e 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -9,9 +9,9 @@ bin_PROGRAMS = \ atools \ plftools \ collapse_weights \ - cllh_filter_grammar \ - mpi_online_optimize \ + mpi_extract_reachable \ mpi_extract_features \ + mpi_online_optimize \ mpi_batch_optimize \ compute_cllh \ augment_grammar @@ -25,6 +25,9 @@ TESTS = lbfgs_test optimize_test mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc +mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + mpi_extract_features_SOURCES = mpi_extract_features.cc mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz @@ -34,9 +37,6 @@ mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/ compute_cllh_SOURCES = compute_cllh.cc compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -cllh_filter_grammar_SOURCES = cllh_filter_grammar.cc -cllh_filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - augment_grammar_SOURCES = augment_grammar.cc augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/cllh_filter_grammar.cc b/training/cllh_filter_grammar.cc deleted file mode 100644 index 6998ec2b..00000000 --- a/training/cllh_filter_grammar.cc +++ /dev/null @@ -1,197 +0,0 @@ -#include -#include -#include -#include // fork -#include // waitpid - -#include -#include - -#include "tdict.h" -#include "ff_register.h" -#include "verbose.h" -#include "hg.h" -#include "decoder.h" -#include "filelib.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("training_data,t",po::value(),"Training data corpus") - ("decoder_config,c",po::value(),"Decoder configuration file") - ("shards,s",po::value()->default_value(1),"Number of shards") - ("starting_shard,S",po::value()->default_value(0), "In this invocation only process shards >= S") - ("work_limit,l",po::value()->default_value(9999), "Process maximially this many shards") - ("ncpus,C",po::value()->default_value(1),"Number of CPUs to use"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - assert(size > 0); - assert(rank < size); - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } - ++lc; - } -} - -struct TrainingObserver : public DecoderObserver { - TrainingObserver() : s_lhs(-TD::Convert("S")), goal_lhs(-TD::Convert("Goal")) {} - - void Reset() { - total_complete = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - state = 1; - used.clear(); - failed = true; - } - - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - for (int i = 0; i < hg->edges_.size(); ++i) { - const TRule* rule = hg->edges_[i].rule_.get(); - if (rule->lhs_ == s_lhs || rule->lhs_ == goal_lhs) // fragile hack to filter out glue rules - continue; - used.insert(rule); - } - state = 2; - } - - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - if (state == 3) { - failed = false; - } else { - failed = true; - } - } - - set used; - - const int s_lhs; - const int goal_lhs; - bool failed; - int total_complete; - int state; -}; - -void work(const string& fname, int rank, int size, Decoder* decoder) { - cerr << "Worker " << rank << '/' << size << " starting.\n"; - vector corpus; - vector ids; - ReadTrainingCorpus(fname, rank, size, &corpus, &ids); - assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - cerr << " " << rank << '/' << size << ": has " << corpus.size() << " sentences to process\n"; - ostringstream oc; oc << "corpus." << rank << "_of_" << size; - WriteFile foc(oc.str()); - ostringstream og; og << "grammar." << rank << "_of_" << size << ".gz"; - WriteFile fog(og.str()); - - set all_used; - TrainingObserver observer; - for (int i = 0; i < corpus.size(); ++i) { - const int sent_id = ids[i]; - const string& input = corpus[i]; - decoder->SetId(sent_id); - decoder->Decode(input, &observer); - if (observer.failed) { - // do nothing - } else { - (*foc.stream()) << input << endl; - for (set::iterator it = observer.used.begin(); it != observer.used.end(); ++it) { - if (all_used.insert(*it).second) - (*fog.stream()) << **it << endl; - } - } - } -} - -int main(int argc, char** argv) { - register_feature_functions(); - - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string fname = conf["training_data"].as(); - const unsigned ncpus = conf["ncpus"].as(); - const unsigned shards = conf["shards"].as(); - const unsigned start = conf["starting_shard"].as(); - const unsigned work_limit = conf["work_limit"].as(); - const unsigned eff_shards = min(start + work_limit, shards); - cerr << "Processing shards " << start << "/" << shards << " to " << eff_shards << "/" << shards << endl; - assert(ncpus > 0); - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - if (decoder.GetConf()["input"].as() != "-") { - cerr << "cdec.ini must not set an input file\n"; - abort(); - } - SetSilent(true); // turn off verbose decoder output - cerr << "Forking " << ncpus << " time(s)\n"; - vector children; - for (int i = 0; i < ncpus; ++i) { - pid_t pid = fork(); - if (pid < 0) { - cerr << "Fork failed!\n"; - exit(1); - } - if (pid > 0) { - children.push_back(pid); - } else { - for (int j = start; j < eff_shards; ++j) { - if (j % ncpus == i) { - cerr << " CPU " << i << " processing shard " << j << endl; - work(fname, j, shards, &decoder); - cerr << " Shard " << j << "/" << shards << " finished.\n"; - } - } - _exit(0); - } - } - for (int i = 0; i < children.size(); ++i) { - int status; - int w = waitpid(children[i], &status, 0); - if (w < 0) { cerr << "Error while waiting for children!"; return 1; } - if (WIFSIGNALED(status)) { - cerr << "Child " << i << " received signal " << WTERMSIG(status) << endl; - if (WTERMSIG(status) == 11) { cerr << " this is a SEGV- you may be trying to print temporarily created rules\n"; } - } - } - return 0; -} diff --git a/training/mpi_extract_reachable.cc b/training/mpi_extract_reachable.cc new file mode 100644 index 00000000..2a7c2b9d --- /dev/null +++ b/training/mpi_extract_reachable.cc @@ -0,0 +1,163 @@ +#include +#include +#include +#include + +#include "config.h" +#ifdef HAVE_MPI +#include +#endif +#include +#include + +#include "ff_register.h" +#include "verbose.h" +#include "filelib.h" +#include "fdict.h" +#include "decoder.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("training_data,t",po::value(),"Training data corpus") + ("decoder_config,c",po::value(),"Decoder configuration file") + ("weights,w", po::value(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations") + ("output_prefix,o",po::value()->default_value("reachable"),"Output path prefix"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { + cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the inputs that produce reachable parallel parses.\n"; + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int lc = 0; + while(in) { + getline(in, line); + if (!in) break; + if (lc % size == rank) c->push_back(line); + ++lc; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct ReachabilityObserver : public DecoderObserver { + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + reachable = false; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + reachable = true; + } + + bool reachable; +}; + +#ifdef HAVE_MPI +namespace mpi = boost::mpi; +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return false; + + // load cdec.ini and set up decoder + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + abort(); + } + + if (FD::UsingPerfectHashFunction()) { + cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n"; + return 1; + } + + // load optional weights + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &decoder.CurrentWeightVector()); + + vector corpus; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); + assert(corpus.size() > 0); + + + if (rank == 0) + cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n"; + + size_t num_reached = 0; + { + ostringstream os; + os << conf["output_prefix"].as() << '.' << rank << "_of_" << size; + WriteFile wf(os.str()); + ostream& out = *wf.stream(); + ReachabilityObserver observer; + for (int i = 0; i < corpus.size(); ++i) { + decoder.Decode(corpus[i], &observer); + if (observer.reachable) { + out << corpus[i] << endl; + ++num_reached; + } + corpus[i].clear(); + } + cerr << "Shard " << rank << '/' << size << " finished, wrote " + << num_reached << " instances to " << os.str() << endl; + } + + size_t total = 0; +#ifdef HAVE_MPI + reduce(world, num_reached, total, std::plus(), 0); +#else + total = num_reached; +#endif + if (rank == 0) { + cerr << "-----------------------------------------\n"; + cerr << "TOTAL = " << total << " instances\n"; + } + return 0; +} + diff --git a/utils/feature_vector.h b/utils/feature_vector.h index 733aa99e..a7b61a66 100755 --- a/utils/feature_vector.h +++ b/utils/feature_vector.h @@ -3,9 +3,9 @@ #include #include "sparse_vector.h" -#include "fdict.h" +#include "weights.h" -typedef double Featval; +typedef weight_t Featval; typedef SparseVector FeatureVector; typedef SparseVector WeightVector; typedef std::vector DenseWeightVector; -- cgit v1.2.3 From ed2b1ce7181fb21d2363b0d5ce04d7fa52aef0fa Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 17 Sep 2011 01:08:45 +0100 Subject: enable ramdisk scratch for per-sentence-grammars --- training/mpi_batch_optimize.cc | 35 +++++++++++++++++++++++++++++++++++ utils/filelib.cc | 19 +++++++++++++++++++ utils/filelib.h | 5 +---- 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'training') diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index cc5953f6..0ba8c530 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -22,6 +22,7 @@ namespace mpi = boost::mpi; #include "ff_register.h" #include "decoder.h" #include "filelib.h" +#include "stringlib.h" #include "optimize.h" #include "fdict.h" #include "weights.h" @@ -42,6 +43,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("correction_buffers,M", po::value()->default_value(10), "Number of gradients for LBFGS to maintain in memory") ("gaussian_prior,p","Use a Gaussian prior on the weights") ("means,u", po::value(), "File containing the means for Gaussian prior") + ("per_sentence_grammar_scratch,P", po::value(), "(Optional) location of scratch space to copy per-sentence grammars for fast access, useful if a RAM disk is available") ("sigma_squared", po::value()->default_value(1.0), "Sigma squared term for spherical Gaussian prior"); po::options_description clo("Command line options"); clo.add_options() @@ -186,6 +188,36 @@ struct VectorPlus : public binary_function, vector, vector > { } }; +void MovePerSentenceGrammars(const string& root, int size, int rank, vector* c) { + if (!DirectoryExists(root)) { + cerr << "Can't find scratch space at " << root << endl; + abort(); + } + ostringstream os; + os << root << "/psg." << size << "_of_" << rank; + const string path = os.str(); + MkDirP(path); + string sent; + map attr; + for (unsigned i = 0; i < c->size(); ++i) { + sent = (*c)[i]; + attr.clear(); + ProcessAndStripSGML(&sent, &attr); + map::iterator it = attr.find("grammar"); + if (it != attr.end()) { + string src_file = it->second; + bool is_gzipped = (src_file.size() > 3) && (src_file.rfind(".gz") == (src_file.size() - 3)); + string new_name = path + "/" + md5(sent); + if (is_gzipped) new_name += ".gz"; + CopyFile(src_file, new_name); + it->second = new_name; + } + ostringstream ns; + ns << SGMLOpenSegTag(attr) << ' ' << sent << " "; + (*c)[i] = ns.str(); + } +} + int main(int argc, char** argv) { #ifdef HAVE_MPI mpi::environment env(argc, argv); @@ -257,6 +289,9 @@ int main(int argc, char** argv) { ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); + if (conf.count("per_sentence_grammar_scratch")) + MovePerSentenceGrammars(conf["per_sentence_grammar_scratch"].as(), rank, size, &corpus); + TrainingObserver observer; while (!converged) { observer.Reset(); diff --git a/utils/filelib.cc b/utils/filelib.cc index a0969b1a..d206fc19 100644 --- a/utils/filelib.cc +++ b/utils/filelib.cc @@ -2,6 +2,12 @@ #include #include +#include +#include +#include +#include +#include +#include using namespace std; @@ -32,3 +38,16 @@ void MkDirP(const string& dir) { } } +#if 0 +void CopyFile(const string& inf, const string& outf) { + WriteFile w(outf); + CopyFile(inf,*w); +} +#else +void CopyFile(const string& inf, const string& outf) { + ofstream of(outf.c_str(), fstream::trunc|fstream::binary); + ifstream in(inf.c_str(), fstream::binary); + of << in.rdbuf(); +} +#endif + diff --git a/utils/filelib.h b/utils/filelib.h index a8622246..bb6e7415 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -113,9 +113,6 @@ inline void CopyFile(std::string const& inf,std::ostream &out) { CopyFile(*r,out); } -inline void CopyFile(std::string const& inf,std::string const& outf) { - WriteFile w(outf); - CopyFile(inf,*w); -} +void CopyFile(std::string const& inf,std::string const& outf); #endif -- cgit v1.2.3 From 41b28681f9a286d2ee98dab7915c0e735704286e Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Sat, 17 Sep 2011 01:39:07 +0100 Subject: add dep --- training/cluster-em.pl | 114 ---------------- training/cluster-ptrain.pl | 206 ----------------------------- training/compute_cllh.cc | 196 --------------------------- training/make-lexcrf-grammar.pl | 285 ---------------------------------------- training/mpi_compute_cllh.cc | 196 +++++++++++++++++++++++++++ utils/stringlib.cc | 14 +- 6 files changed, 203 insertions(+), 808 deletions(-) delete mode 100755 training/cluster-em.pl delete mode 100755 training/cluster-ptrain.pl delete mode 100644 training/compute_cllh.cc delete mode 100755 training/make-lexcrf-grammar.pl create mode 100644 training/mpi_compute_cllh.cc (limited to 'training') diff --git a/training/cluster-em.pl b/training/cluster-em.pl deleted file mode 100755 index 267ab642..00000000 --- a/training/cluster-em.pl +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } -use Getopt::Long; -my $parallel = 0; - -my $CWD=`pwd`; chomp $CWD; -my $BIN_DIR = "$CWD/.."; -my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce"; -my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights"; -my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter"; -my $DECODER = "$BIN_DIR/decoder/cdec"; -my $COMBINER_CACHE_SIZE = 10000000; -my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl"; -die "Can't find $REDUCER" unless -f $REDUCER; -die "Can't execute $REDUCER" unless -x $REDUCER; -die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS; -die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS; -die "Can't find $ADAPTER" unless -f $ADAPTER; -die "Can't execute $ADAPTER" unless -x $ADAPTER; -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -my $restart = ''; -if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } - -die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2); - -my $training_corpus = shift @ARGV; -my $config = shift @ARGV; -my $pmem="2500mb"; -my $nodes = 40; -my $max_iteration = 1000; -my $CFLAG = "-C 1"; -if ($parallel) { - die "Can't find $PARALLEL" unless -f $PARALLEL; - die "Can't execute $PARALLEL" unless -x $PARALLEL; -} else { $CFLAG = "-C 500"; } - -my $initial_weights = ''; - -print STDERR < \$DECODER, - "distributed" => \$DISTRIBUTED, - "sigma_squared=f" => \$sigsq, - "lbfgs_memory_buffers=i" => \$mem_buffers, - "max_iteration=i" => \$max_iteration, - "means=s" => \$means_file, - "optimizer=s" => \$OALG, - "gaussian_prior" => \$PRIOR, - "restart_if_necessary" => \$RESTART_IF_NECESSARY, - "jobs=i" => \$nodes, - "pmem=s" => \$pmem - ) or usage(); -usage() unless scalar @ARGV==3; -my $config_file = shift @ARGV; -my $training_corpus = shift @ARGV; -my $initial_weights = shift @ARGV; -unless ($DISTRIBUTED) { $LOCAL = 1; } -die "Can't find $config_file" unless -f $config_file; -die "Can't find $DECODER" unless -f $DECODER; -die "Can't execute $DECODER" unless -x $DECODER; -if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } -if ($PRIOR) { - $PRIOR_FLAG="-p --sigma_squared $sigsq"; - if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } -} - -if ($parallel) { - die "Can't find $PARALLEL" unless -f $PARALLEL; - die "Can't execute $PARALLEL" unless -x $PARALLEL; -} -unless ($parallel) { $CFLAG = "-C 500"; } -unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } -my $clines = num_lines($training_corpus); -my $dir = "$CWD/ptrain"; - -if ($RESTART_IF_NECESSARY && -d $dir) { - $restart = 1; -} - -print STDERR <$dir/training.in"; - my $lc = 0; - while() { - chomp; - s/^\s+//; - s/\s+$//; - die "Expected A ||| B in input file" unless / \|\|\| /; - print TO "$_\n"; - $lc++; - } - close T; - close TO; -} -$training_corpus = "$dir/training.in"; - -my $iter_attempts = 1; -while ($iter < $max_iteration) { - my $cur_time = `date`; chomp $cur_time; - print STDERR "\nStarting iteration $iter...\n"; - print STDERR " time: $cur_time\n"; - my $start = time; - my $next_iter = $iter + 1; - my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; - my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; - my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; - my $cmd = ""; - if ($parallel) { $cmd = $pcmd; } - $cmd .= "$dec_cmd | $opt_cmd"; - - print STDERR "EXECUTING: $cmd\n"; - my $result = `$cmd`; - my $exit_code = $? >> 8; - if ($exit_code == 99) { - $iter_attempts++; - if ($iter_attempts > $MAX_ITER_ATTEMPTS) { - die "Received restart request $iter_attempts times from optimizer, giving up\n"; - } - print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n"; - next; - } - if ($? != 0) { - die "Error running iteration $iter: $!"; - } - chomp $result; - my $end = time; - my $diff = ($end - $start); - print STDERR " ITERATION $iter TOOK $diff SECONDS\n"; - $iter = $next_iter; - if ($result =~ /1$/) { - print STDERR "Training converged.\n"; - last; - } - $iter_attempts = 1; -} - -print "FINAL WEIGHTS: $dir/weights.$iter\n"; -`mv $dir/weights.$iter.gz $dir/weights.final.gz`; - -sub usage { - die <) { $lines++; } - close $fh; - return $lines; -} diff --git a/training/compute_cllh.cc b/training/compute_cllh.cc deleted file mode 100644 index b496d196..00000000 --- a/training/compute_cllh.cc +++ /dev/null @@ -1,196 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" -#ifdef HAVE_MPI -#include -#endif -#include -#include - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "weights.h" - -using namespace std; -namespace po = boost::program_options; - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("weights,w",po::value(),"Input feature weights file") - ("training_data,t",po::value(),"Training data corpus") - ("decoder_config,c",po::value(),"Decoder configuration file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } - ++lc; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_obj = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata&) { - cur_obj = 0; - state = 1; - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { - assert(state == 1); - state = 2; - SparseVector cur_model_exp; - const prob_t z = InsideOutside, - EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); - cur_obj = log(z); - } - - // compute "empirical" expectations, numerator of objective - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - SparseVector ref_exp; - const prob_t ref_z = InsideOutside, - EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); - - double log_ref_z; -#if 0 - if (crf_uniform_empirical) { - log_ref_z = ref_exp.dot(feature_weights); - } else { - log_ref_z = log(ref_z); - } -#else - log_ref_z = log(ref_z); -#endif - - // rounding errors means that <0 is too strict - if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { - cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; - exit(1); - } - assert(!isnan(log_ref_z)); - acc_obj += (cur_obj - log_ref_z); - } - - double acc_obj; - double cur_obj; - int state; -}; - -#ifdef HAVE_MPI -namespace mpi = boost::mpi; -#endif - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - mpi::environment env(argc, argv); - mpi::communicator world; - const int size = world.size(); - const int rank = world.rank(); -#else - const int size = 1; - const int rank = 0; -#endif - if (size > 1) SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) - return false; - - // load cdec.ini and set up decoder - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - if (decoder.GetConf()["input"].as() != "-") { - cerr << "cdec.ini must not set an input file\n"; - abort(); - } - - // load weights - vector& weights = decoder.CurrentWeightVector(); - if (conf.count("weights")) - Weights::InitFromFile(conf["weights"].as(), &weights); - - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - - vector corpus; vector ids; - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); - assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - - TrainingObserver observer; - double objective = 0; - - observer.Reset(); - if (rank == 0) - cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; - - for (int i = 0; i < corpus.size(); ++i) { - decoder.SetId(ids[i]); - decoder.Decode(corpus[i], &observer); - } - -#ifdef HAVE_MPI - reduce(world, observer.acc_obj, objective, std::plus(), 0); -#else - objective = observer.acc_obj; -#endif - - if (rank == 0) - cout << "OBJECTIVE: " << objective << endl; - - return 0; -} - diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl deleted file mode 100755 index 8cdf7718..00000000 --- a/training/make-lexcrf-grammar.pl +++ /dev/null @@ -1,285 +0,0 @@ -#!/usr/bin/perl -w -use utf8; -use strict; -my ($effile, $model1) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1; - -open EF, "<$effile" or die; -open M1, "<$model1" or die; -binmode(EF,":utf8"); -binmode(M1,":utf8"); -binmode(STDOUT,":utf8"); -my %model1; -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - $model1{$f}->{$e} = $lp; -} - -my $ADD_MODEL1 = 0; # found that model1 hurts performance -my $IS_FRENCH_F = 1; # indicates that the f language is french -my $IS_ARABIC_F = 0; # indicates that the f language is arabic -my $IS_URDU_F = 0; # indicates that the f language is arabic -my $ADD_PREFIX_ID = 0; -my $ADD_LEN = 1; -my $ADD_SIM = 1; -my $ADD_DICE = 1; -my $ADD_111 = 1; -my $ADD_ID = 1; -my $ADD_PUNC = 1; -my $ADD_NUM_MM = 1; -my $ADD_NULL = 1; -my $ADD_STEM_ID = 1; -my $BEAM_RATIO = 50; - -my %fdict; -my %fcounts; -my %ecounts; - -my %sdict; - -while() { - chomp; - my ($f, $e) = split /\s*\|\|\|\s*/; - my @es = split /\s+/, $e; - my @fs = split /\s+/, $f; - for my $ew (@es){ $ecounts{$ew}++; } - push @fs, '' if $ADD_NULL; - for my $fw (@fs){ $fcounts{$fw}++; } - for my $fw (@fs){ - for my $ew (@es){ - $fdict{$fw}->{$ew}++; - } - } -} - -print STDERR "Dice 0\n" if $ADD_DICE; -print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111; -print STDERR "Identical 0\n" if $ADD_ID; -print STDERR "PuncMiss 0\n" if $ADD_PUNC; -print STDERR "IsNull 0\n" if $ADD_NULL; -print STDERR "Model1 0\n" if $ADD_MODEL1; -print STDERR "DLen 0\n" if $ADD_LEN; -print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM; -print STDERR "OrthoSim 0\n" if $ADD_SIM; -print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID); -my $fc = 1000000; -my $sids = 1000000; -for my $f (sort keys %fdict) { - my $re = $fdict{$f}; - my $max; - for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { - my $efcount = $re->{$e}; - unless (defined $max) { $max = $efcount; } - my $m1 = $model1{$f}->{$e}; - unless (defined $m1) { next; } - $fc++; - my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); - my $feats = "F$fc=1"; - my $oe = $e; - my $of = $f; # normalized form - if ($IS_FRENCH_F) { - # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French - $of =~ s/â/as/g; - $of =~ s/ê/es/g; - $of =~ s/î/is/g; - $of =~ s/ô/os/g; - $of =~ s/û/us/g; - } elsif ($IS_ARABIC_F) { - if (length($of) > 1 && !($of =~ /\d/)) { - $of =~ s/\$/sh/g; - } - } elsif ($IS_URDU_F) { - if (length($of) > 1 && !($of =~ /\d/)) { - $of =~ s/\$/sh/g; - } - $oe =~ s/^-e-//; - $oe =~ s/^al-/al/; - $of =~ s/([a-z])\~/$1$1/g; - $of =~ s/E/'/g; - $of =~ s/^Aw/o/g; - $of =~ s/\|/a/g; - $of =~ s/@/h/g; - $of =~ s/c/ch/g; - $of =~ s/x/kh/g; - $of =~ s/\*/dh/g; - $of =~ s/w/o/g; - $of =~ s/Z/dh/g; - $of =~ s/y/i/g; - $of =~ s/Y/a/g; - $of = lc $of; - } - my $len_e = length($oe); - my $len_f = length($of); - $feats .= " Model1=$m1" if ($ADD_MODEL1); - $feats .= " Dice=$dice" if $ADD_DICE; - my $is_null = undef; - if ($ADD_NULL && $f eq '') { - $feats .= " IsNull=1"; - $is_null = 1; - } - if ($ADD_LEN) { - if (!$is_null) { - my $dlen = abs($len_e - $len_f); - $feats .= " DLen=$dlen"; - } - } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); - my $both_non_numeric = (!$e_num && !$f_num); - if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) { - $feats .= " NumMM=1"; - } - if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) { - $feats .= " NumMatch=1"; - } - if ($ADD_STEM_ID) { - my $el = 4; - my $fl = 4; - if ($oe =~ /^al|re|co/) { $el++; } - if ($of =~ /^al|re|co/) { $fl++; } - if ($oe =~ /^trans|inter/) { $el+=2; } - if ($of =~ /^trans|inter/) { $fl+=2; } - if ($fl > length($of)) { $fl = length($of); } - if ($el > length($oe)) { $el = length($oe); } - my $sf = substr $of, 0, $fl; - my $se = substr $oe, 0, $el; - my $id = $sdict{$sf}->{$se}; - if (!$id) { - $sids++; - $sdict{$sf}->{$se} = $sids; - $id = $sids; - print STDERR "S$sids 0\n" - } - $feats .= " S$id=1"; - } - if ($ADD_PREFIX_ID) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { $feats .= " PfxIdentical=1"; } - } - } - if ($ADD_SIM) { - my $ld = 0; - my $eff = $len_e; - if ($eff < $len_f) { $eff = $len_f; } - if (!$is_null) { - $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); - } - $feats .= " OrthoSim=$ld"; - } - my $ident = ($e eq $f); - if ($ident && $ADD_ID) { $feats .= " Identical=1"; } - if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) { - if ($ident && $ADD_ID) { - $feats .= " Id_OneOneOne=1"; - } - $feats .= " OneOneOne=1"; - } - if ($ADD_PUNC) { - if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) || - ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) { - $feats .= " PuncMiss=1"; - } - } - my $r = (0.5 - rand)/5; - print STDERR "F$fc $r\n"; - print "$f ||| $e ||| $feats\n"; - } -} - -sub levenshtein -{ - # $s1 and $s2 are the two strings - # $len1 and $len2 are their respective lengths - # - my ($s1, $s2) = @_; - my ($len1, $len2) = (length $s1, length $s2); - - # If one of the strings is empty, the distance is the length - # of the other string - # - return $len2 if ($len1 == 0); - return $len1 if ($len2 == 0); - - my %mat; - - # Init the distance matrix - # - # The first row to 0..$len1 - # The first column to 0..$len2 - # The rest to 0 - # - # The first row and column are initialized so to denote distance - # from the empty string - # - for (my $i = 0; $i <= $len1; ++$i) - { - for (my $j = 0; $j <= $len2; ++$j) - { - $mat{$i}{$j} = 0; - $mat{0}{$j} = $j; - } - - $mat{$i}{0} = $i; - } - - # Some char-by-char processing is ahead, so prepare - # array of chars from the strings - # - my @ar1 = split(//, $s1); - my @ar2 = split(//, $s2); - - for (my $i = 1; $i <= $len1; ++$i) - { - for (my $j = 1; $j <= $len2; ++$j) - { - # Set the cost to 1 iff the ith char of $s1 - # equals the jth of $s2 - # - # Denotes a substitution cost. When the char are equal - # there is no need to substitute, so the cost is 0 - # - my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - - # Cell $mat{$i}{$j} equals the minimum of: - # - # - The cell immediately above plus 1 - # - The cell immediately to the left plus 1 - # - The cell diagonally above and to the left plus the cost - # - # We can either insert a new char, delete a char or - # substitute an existing char (with an associated cost) - # - $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, - $mat{$i}{$j-1} + 1, - $mat{$i-1}{$j-1} + $cost]); - } - } - - # Finally, the Levenshtein distance equals the rightmost bottom cell - # of the matrix - # - # Note that $mat{$x}{$y} denotes the distance between the substrings - # 1..$x and 1..$y - # - return $mat{$len1}{$len2}; -} - - -# minimal element of a list -# -sub min -{ - my @list = @{$_[0]}; - my $min = $list[0]; - - foreach my $i (@list) - { - $min = $i if ($i < $min); - } - - return $min; -} - diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc new file mode 100644 index 00000000..b496d196 --- /dev/null +++ b/training/mpi_compute_cllh.cc @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include + +#include "config.h" +#ifdef HAVE_MPI +#include +#endif +#include +#include + +#include "verbose.h" +#include "hg.h" +#include "prob.h" +#include "inside_outside.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "weights.h" + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w",po::value(),"Input feature weights file") + ("training_data,t",po::value(),"Training data corpus") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int lc = 0; + while(in) { + getline(in, line); + if (!in) break; + if (lc % size == rank) { + c->push_back(line); + ids->push_back(lc); + } + ++lc; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct TrainingObserver : public DecoderObserver { + void Reset() { + acc_obj = 0; + } + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + cur_obj = 0; + state = 1; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + assert(state == 1); + state = 2; + SparseVector cur_model_exp; + const prob_t z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); + cur_obj = log(z); + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { + assert(state == 2); + state = 3; + SparseVector ref_exp; + const prob_t ref_z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(*hg, &ref_exp); + + double log_ref_z; +#if 0 + if (crf_uniform_empirical) { + log_ref_z = ref_exp.dot(feature_weights); + } else { + log_ref_z = log(ref_z); + } +#else + log_ref_z = log(ref_z); +#endif + + // rounding errors means that <0 is too strict + if ((cur_obj - log_ref_z) < kMINUS_EPSILON) { + cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; + exit(1); + } + assert(!isnan(log_ref_z)); + acc_obj += (cur_obj - log_ref_z); + } + + double acc_obj; + double cur_obj; + int state; +}; + +#ifdef HAVE_MPI +namespace mpi = boost::mpi; +#endif + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return false; + + // load cdec.ini and set up decoder + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + if (decoder.GetConf()["input"].as() != "-") { + cerr << "cdec.ini must not set an input file\n"; + abort(); + } + + // load weights + vector& weights = decoder.CurrentWeightVector(); + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &weights); + + // freeze feature set + //const bool freeze_feature_set = conf.count("freeze_feature_set"); + //if (freeze_feature_set) FD::Freeze(); + + vector corpus; vector ids; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + assert(corpus.size() > 0); + assert(corpus.size() == ids.size()); + + TrainingObserver observer; + double objective = 0; + + observer.Reset(); + if (rank == 0) + cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; + + for (int i = 0; i < corpus.size(); ++i) { + decoder.SetId(ids[i]); + decoder.Decode(corpus[i], &observer); + } + +#ifdef HAVE_MPI + reduce(world, observer.acc_obj, objective, std::plus(), 0); +#else + objective = observer.acc_obj; +#endif + + if (rank == 0) + cout << "OBJECTIVE: " << objective << endl; + + return 0; +} + diff --git a/utils/stringlib.cc b/utils/stringlib.cc index 3a56965c..1a152985 100644 --- a/utils/stringlib.cc +++ b/utils/stringlib.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -104,11 +105,11 @@ public: typedef unsigned int size_type; // must be 32bit MD5(); - MD5(const std::string& text); + MD5(const string& text); void update(const unsigned char *buf, size_type length); void update(const char *buf, size_type length); MD5& finalize(); - std::string hexdigest() const; + string hexdigest() const; private: void init(); @@ -209,7 +210,7 @@ MD5::MD5() ////////////////////////////////////////////// // nifty shortcut ctor, compute MD5 for string and finalize it right away -MD5::MD5(const std::string &text) +MD5::MD5(const string &text) { init(); update(text.c_str(), text.length()); @@ -433,8 +434,7 @@ MD5& MD5::finalize() ////////////////////////////// // return hex representation of digest as string -std::string MD5::hexdigest() const -{ +string MD5::hexdigest() const { if (!finalized) return ""; @@ -443,12 +443,12 @@ std::string MD5::hexdigest() const sprintf(buf+i*2, "%02x", digest[i]); buf[32]=0; - return std::string(buf); + return string(buf); } ////////////////////////////// -std::string md5(const std::string& str) { +string md5(const string& str) { MD5 md5 = MD5(str); return md5.hexdigest(); } -- cgit v1.2.3 From c1d44af44ce582a34320c8d81e3b62f71416de34 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 17 Sep 2011 16:19:11 +0100 Subject: make fix --- training/Makefile.am | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'training') diff --git a/training/Makefile.am b/training/Makefile.am index 5752859e..0b598fd5 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -13,7 +13,7 @@ bin_PROGRAMS = \ mpi_extract_features \ mpi_online_optimize \ mpi_batch_optimize \ - compute_cllh \ + mpi_compute_cllh \ augment_grammar noinst_PROGRAMS = \ @@ -34,8 +34,8 @@ mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteva mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -compute_cllh_SOURCES = compute_cllh.cc -compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc +mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz augment_grammar_SOURCES = augment_grammar.cc augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -- cgit v1.2.3 From 08c4a7fae8f0bec4f76c4e0928e357100eb7a1ca Mon Sep 17 00:00:00 2001 From: Guest_account Guest_account prguest11 Date: Tue, 11 Oct 2011 16:16:53 +0100 Subject: remove implicit conversion-to-double operator from LogVal that caused overflow errors, clean up some pf code --- decoder/aligner.cc | 2 +- decoder/cfg.cc | 2 +- decoder/cfg_format.h | 2 +- decoder/decoder.cc | 10 ++++---- decoder/hg.cc | 4 ++-- decoder/rule_lexer.l | 2 ++ decoder/trule.h | 15 +++++++++++- gi/pf/brat.cc | 11 --------- gi/pf/cbgi.cc | 10 -------- gi/pf/dpnaive.cc | 12 ---------- gi/pf/itg.cc | 11 --------- gi/pf/pfbrat.cc | 11 --------- gi/pf/pfdist.cc | 11 --------- gi/pf/pfnaive.cc | 11 --------- mteval/mbr_kbest.cc | 4 ++-- phrasinator/ccrp_nt.h | 24 +++++++++++++++---- training/mpi_batch_optimize.cc | 2 +- training/mpi_compute_cllh.cc | 51 +++++++++++++++++++---------------------- training/mpi_online_optimize.cc | 4 ++-- utils/logval.h | 10 ++++---- 20 files changed, 78 insertions(+), 131 deletions(-) (limited to 'training') diff --git a/decoder/aligner.cc b/decoder/aligner.cc index 292ee123..53e059fb 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -165,7 +165,7 @@ inline void WriteProbGrid(const Array2D& m, ostream* pos) { if (m(i,j) == prob_t::Zero()) { os << "\t---X---"; } else { - snprintf(b, 1024, "%0.5f", static_cast(m(i,j))); + snprintf(b, 1024, "%0.5f", m(i,j).as_float()); os << '\t' << b; } } diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 651978d2..cd7e66e9 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -639,7 +639,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const { o << '['<& src, SparseVector* trg) { for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it) - trg->set_value(it->first, it->second); + trg->set_value(it->first, it->second.as_float()); } }; @@ -788,10 +788,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { const bool show_tree_structure=conf.count("show_tree_structure"); if (!SILENT) forest_stats(forest," Init. forest",show_tree_structure,oracle.show_derivation); if (conf.count("show_expected_length")) { - const PRPair res = - Inside, - PRWeightFunction >(forest); - cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; + const PRPair res = + Inside, + PRWeightFunction >(forest); + cerr << " Expected length (words): " << (res.r / res.p).as_float() << "\t" << res << endl; } if (conf.count("show_partition")) { diff --git a/decoder/hg.cc b/decoder/hg.cc index 3ad17f1a..180986d7 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -157,14 +157,14 @@ prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector* posts) co const ScaledEdgeProb weight(scale); const ScaledTransitionEventWeightFunction w2(scale); SparseVector pv; - const double inside = InsideOutside, ScaledTransitionEventWeightFunction>(*this, &pv, weight, w2); posts->resize(edges_.size()); for (int i = 0; i < edges_.size(); ++i) (*posts)[i] = prob_t(pv.value(i)); - return prob_t(inside); + return inside; } prob_t Hypergraph::ComputeBestPathThroughEdges(vector* post) const { diff --git a/decoder/rule_lexer.l b/decoder/rule_lexer.l index 9331d8ed..083a5bb1 100644 --- a/decoder/rule_lexer.l +++ b/decoder/rule_lexer.l @@ -220,6 +220,8 @@ NT [^\t \[\],]+ std::cerr << "Line " << lex_line << ": LHS and RHS arity mismatch!\n"; abort(); } + // const bool ignore_grammar_features = false; + // if (ignore_grammar_features) scfglex_num_feats = 0; TRulePtr rp(new TRule(scfglex_lhs, scfglex_src_rhs, scfglex_src_rhs_size, scfglex_trg_rhs, scfglex_trg_rhs_size, scfglex_feat_ids, scfglex_feat_vals, scfglex_num_feats, scfglex_src_arity, scfglex_als, scfglex_num_als)); check_and_update_ctf_stack(rp); TRulePtr coarse_rp = ((ctf_level == 0) ? TRulePtr() : ctf_rule_stack.top()); diff --git a/decoder/trule.h b/decoder/trule.h index 4df4ec90..8eb2a059 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -5,7 +5,9 @@ #include #include #include -#include + +#include "boost/shared_ptr.hpp" +#include "boost/functional/hash.hpp" #include "sparse_vector.h" #include "wordid.h" @@ -162,4 +164,15 @@ class TRule { bool SanityCheck() const; }; +inline size_t hash_value(const TRule& r) { + size_t h = boost::hash_value(r.e_); + boost::hash_combine(h, -r.lhs_); + boost::hash_combine(h, boost::hash_value(r.f_)); + return h; +} + +inline bool operator==(const TRule& a, const TRule& b) { + return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); +} + #endif diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 4c6ba3ef..7b60ef23 100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; struct FSTState; -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/cbgi.cc b/gi/pf/cbgi.cc index 20204e8a..97f1ba34 100644 --- a/gi/pf/cbgi.cc +++ b/gi/pf/cbgi.cc @@ -27,16 +27,6 @@ double log_decay(unsigned x, const double& b) { return log(b - 1) - x * log(b); } -size_t hash_value(const TRule& r) { - // TODO fix hash function - size_t h = boost::hash_value(r.e_) * boost::hash_value(r.f_) * r.lhs_; - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - struct SimpleBase { SimpleBase(unsigned esize, unsigned fsize, unsigned ntsize = 144) : uniform_e(-log(esize)), diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index 582d1be7..608f73d5 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -20,18 +20,6 @@ namespace po = boost::program_options; static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; -struct FSTState; - -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index 2c2a86f9..ac3c16a3 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,17 +27,6 @@ ostream& operator<<(ostream& os, const vector& p) { return os << ']'; } -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 4c6ba3ef..7b60ef23 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -25,17 +25,6 @@ static unsigned kMAX_SRC_PHRASE; static unsigned kMAX_TRG_PHRASE; struct FSTState; -size_t hash_value(const TRule& r) { - size_t h = 2 - r.lhs_; - boost::hash_combine(h, boost::hash_value(r.e_)); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); return log(lambda) * x - lgamma(x + 1) - lambda; diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index 18dfd03b..81abd61b 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -24,17 +24,6 @@ namespace po = boost::program_options; shared_ptr prng; -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 43c604c3..c30e7c4f 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -24,17 +24,6 @@ namespace po = boost::program_options; shared_ptr prng; -size_t hash_value(const TRule& r) { - size_t h = boost::hash_value(r.e_); - boost::hash_combine(h, -r.lhs_); - boost::hash_combine(h, boost::hash_value(r.f_)); - return h; -} - -bool operator==(const TRule& a, const TRule& b) { - return (a.lhs_ == b.lhs_ && a.e_ == b.e_ && a.f_ == b.f_); -} - void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 2867b36b..64a6a8bf 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -32,7 +32,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } struct LossComparer { - bool operator()(const pair, double>& a, const pair, double>& b) const { + bool operator()(const pair, prob_t>& a, const pair, prob_t>& b) const { return a.second < b.second; } }; @@ -108,7 +108,7 @@ int main(int argc, char** argv) { ScoreP s = scorer->ScoreCandidate(list[j].first); double loss = 1.0 - s->ComputeScore(); if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); + double weighted_loss = loss * (joints[j] / marginal).as_float(); wl_acc += weighted_loss; if ((!output_list) && wl_acc > mbr_loss) break; } diff --git a/phrasinator/ccrp_nt.h b/phrasinator/ccrp_nt.h index 163b643a..811bce73 100644 --- a/phrasinator/ccrp_nt.h +++ b/phrasinator/ccrp_nt.h @@ -50,15 +50,26 @@ class CCRP_NoTable { return it->second; } - void increment(const Dish& dish) { - ++custs_[dish]; + int increment(const Dish& dish) { + int table_diff = 0; + if (++custs_[dish] == 1) + table_diff = 1; ++num_customers_; + return table_diff; } - void decrement(const Dish& dish) { - if ((--custs_[dish]) == 0) + int decrement(const Dish& dish) { + int table_diff = 0; + int nc = --custs_[dish]; + if (nc == 0) { custs_.erase(dish); + table_diff = -1; + } else if (nc < 0) { + std::cerr << "Dish counts dropped below zero for: " << dish << std::endl; + abort(); + } --num_customers_; + return table_diff; } double prob(const Dish& dish, const double& p0) const { @@ -66,6 +77,11 @@ class CCRP_NoTable { return (at_table + p0 * concentration_) / (num_customers_ + concentration_); } + double logprob(const Dish& dish, const double& logp0) const { + const unsigned at_table = num_customers(dish); + return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + } + double log_crp_prob() const { return log_crp_prob(concentration_); } diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc index 0ba8c530..046e921c 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/mpi_batch_optimize.cc @@ -92,7 +92,7 @@ struct TrainingObserver : public DecoderObserver { void SetLocalGradientAndObjective(vector* g, double* o) const { *o = acc_obj; for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - (*g)[it->first] = it->second; + (*g)[it->first] = it->second.as_float(); } virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { diff --git a/training/mpi_compute_cllh.cc b/training/mpi_compute_cllh.cc index b496d196..d5caa745 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/mpi_compute_cllh.cc @@ -1,6 +1,4 @@ -#include #include -#include #include #include #include @@ -12,6 +10,7 @@ #include #include +#include "sentence_metadata.h" #include "verbose.h" #include "hg.h" #include "prob.h" @@ -52,7 +51,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { return true; } -void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* ids) { +void ReadInstances(const string& fname, int rank, int size, vector* c) { + assert(fname != "-"); ReadFile rf(fname); istream& in = *rf.stream(); string line; @@ -60,20 +60,16 @@ void ReadTrainingCorpus(const string& fname, int rank, int size, vector* while(in) { getline(in, line); if (!in) break; - if (lc % size == rank) { - c->push_back(line); - ids->push_back(lc); - } + if (lc % size == rank) c->push_back(line); ++lc; } } static const double kMINUS_EPSILON = -1e-6; -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_obj = 0; - } +struct ConditionalLikelihoodObserver : public DecoderObserver { + + ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {} virtual void NotifyDecodingStart(const SentenceMetadata&) { cur_obj = 0; @@ -120,8 +116,10 @@ struct TrainingObserver : public DecoderObserver { } assert(!isnan(log_ref_z)); acc_obj += (cur_obj - log_ref_z); + trg_words += smeta.GetReference().size(); } + unsigned trg_words; double acc_obj; double cur_obj; int state; @@ -161,35 +159,32 @@ int main(int argc, char** argv) { if (conf.count("weights")) Weights::InitFromFile(conf["weights"].as(), &weights); - // freeze feature set - //const bool freeze_feature_set = conf.count("freeze_feature_set"); - //if (freeze_feature_set) FD::Freeze(); - - vector corpus; vector ids; - ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + vector corpus; + ReadInstances(conf["training_data"].as(), rank, size, &corpus); assert(corpus.size() > 0); - assert(corpus.size() == ids.size()); - - TrainingObserver observer; - double objective = 0; - observer.Reset(); if (rank == 0) - cerr << "Each processor is decoding " << corpus.size() << " training examples...\n"; + cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n"; - for (int i = 0; i < corpus.size(); ++i) { - decoder.SetId(ids[i]); + ConditionalLikelihoodObserver observer; + for (int i = 0; i < corpus.size(); ++i) decoder.Decode(corpus[i], &observer); - } + double objective = 0; + unsigned total_words = 0; #ifdef HAVE_MPI reduce(world, observer.acc_obj, objective, std::plus(), 0); + reduce(world, observer.trg_words, total_words, std::plus(), 0); #else objective = observer.acc_obj; #endif - if (rank == 0) - cout << "OBJECTIVE: " << objective << endl; + if (rank == 0) { + cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl; + cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl; + cout << " CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl; + cout << " PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl; + } return 0; } diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index 2ef4a2e7..f87b7274 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -94,7 +94,7 @@ struct TrainingObserver : public DecoderObserver { void SetLocalGradientAndObjective(vector* g, double* o) const { *o = acc_obj; for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - (*g)[it->first] = it->second; + (*g)[it->first] = it->second.as_float(); } virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { @@ -158,7 +158,7 @@ struct TrainingObserver : public DecoderObserver { void GetGradient(SparseVector* g) const { g->clear(); for (SparseVector::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it) - g->set_value(it->first, it->second); + g->set_value(it->first, it->second.as_float()); } int total_complete; diff --git a/utils/logval.h b/utils/logval.h index 6fdc2c42..8a59d0b1 100644 --- a/utils/logval.h +++ b/utils/logval.h @@ -25,12 +25,13 @@ class LogVal { typedef LogVal Self; LogVal() : s_(), v_(LOGVAL_LOG0) {} - explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + const Self& operator=(double x) { s_ = std::signbit(x); v_ = s_ ? std::log(-x) : std::log(x); return *this; } LogVal(init_minus_1) : s_(true),v_(0) { } LogVal(init_1) : s_(),v_(0) { } LogVal(init_0) : s_(),v_(LOGVAL_LOG0) { } - LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} - LogVal(unsigned x) : s_(0), v_(std::log(x)) { } + explicit LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} + explicit LogVal(unsigned x) : s_(0), v_(std::log(x)) { } LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} LogVal(double lnx,init_lnx) : s_(),v_(lnx) {} static Self exp(T lnx) { return Self(lnx,false); } @@ -141,9 +142,6 @@ class LogVal { return pow(1/root); } - operator T() const { - if (s_) return -std::exp(v_); else return std::exp(v_); - } T as_float() const { if (s_) return -std::exp(v_); else return std::exp(v_); } -- cgit v1.2.3 From 708e8220d2cbc41fbbf2e8a713edf1d2cf3bc767 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 14 Oct 2011 11:51:12 +0100 Subject: remove FSA integration code. will have to be resurrected another day --- decoder/Makefile.am | 1 - decoder/apply_fsa_models.cc | 798 ---------------------------------------- decoder/cdec_ff.cc | 13 - decoder/feature_accum.h | 129 ------- decoder/ff_factory.h | 2 - decoder/ff_from_fsa.h | 304 --------------- decoder/ff_fsa.h | 401 -------------------- decoder/ff_fsa_data.h | 131 ------- decoder/ff_fsa_dynamic.h | 208 ----------- decoder/ff_lm.cc | 48 --- decoder/ff_lm_fsa.h | 140 ------- decoder/ff_register.h | 38 -- decoder/hg_test.cc | 16 +- training/mpi_online_optimize.cc | 2 + 14 files changed, 10 insertions(+), 2221 deletions(-) delete mode 100755 decoder/apply_fsa_models.cc delete mode 100755 decoder/feature_accum.h delete mode 100755 decoder/ff_from_fsa.h delete mode 100755 decoder/ff_fsa.h delete mode 100755 decoder/ff_fsa_data.h delete mode 100755 decoder/ff_fsa_dynamic.h delete mode 100755 decoder/ff_lm_fsa.h (limited to 'training') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index ede1cff0..6b9360d8 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -42,7 +42,6 @@ libcdec_a_SOURCES = \ cfg.cc \ dwarf.cc \ ff_dwarf.cc \ - apply_fsa_models.cc \ rule_lexer.cc \ fst_translator.cc \ csplit.cc \ diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc deleted file mode 100755 index 3e93cadd..00000000 --- a/decoder/apply_fsa_models.cc +++ /dev/null @@ -1,798 +0,0 @@ -//see apply_fsa_models.README for notes on the l2r earley fsa+cfg intersection -//implementation in this file (also some comments in this file) -#define SAFE_VALGRIND 1 - -#include "apply_fsa_models.h" -#include -#include -#include -#include - -#include "writer.h" -#include "hg.h" -#include "ff_fsa_dynamic.h" -#include "ff_from_fsa.h" -#include "feature_vector.h" -#include "stringlib.h" -#include "apply_models.h" -#include "cfg.h" -#include "hg_cfg.h" -#include "utoa.h" -#include "hash.h" -#include "value_array.h" -#include "d_ary_heap.h" -#include "agenda.h" -#include "show.h" -#include "string_to.h" - - -#define DFSA(x) x -//fsa earley chart - -#define DPFSA(x) x -//prefix trie - -#define DBUILDTRIE(x) - -#define PRINT_PREFIX 1 -#if PRINT_PREFIX -# define IF_PRINT_PREFIX(x) x -#else -# define IF_PRINT_PREFIX(x) -#endif -// keep backpointers in prefix trie so you can print a meaningful node id - -static const unsigned FSA_AGENDA_RESERVE=10; // TODO: increase to 1<<24 (16M) - -using namespace std; - -//impl details (not exported). flat namespace for my ease. - -typedef CFG::RHS RHS; -typedef CFG::BinRhs BinRhs; -typedef CFG::NTs NTs; -typedef CFG::NT NT; -typedef CFG::NTHandle NTHandle; -typedef CFG::Rules Rules; -typedef CFG::Rule Rule; -typedef CFG::RuleHandle RuleHandle; - -namespace { - -/* - -1) A -> x . * (trie) - -this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps? - -vs. just - -2) * -> x . y - -here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first. - -vs. - -3) * -> x . * - -with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all. - -I'm using option 1. -*/ - -// if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently. if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion. this is effectively left-branching binarization, of course. - -template -struct fsa_map_type { - typedef std::map type; // change to HASH_MAP ? -}; -//template typedef - and macro to make it less painful -#define FSA_MAP(k,v) fsa_map_type >::type - -struct PrefixTrieNode; -typedef PrefixTrieNode *NodeP; -typedef PrefixTrieNode const *NodePc; - -// for debugging prints only -struct TrieBackP { - WordID w; - NodePc from; - TrieBackP(WordID w=0,NodePc from=0) : w(w),from(from) { } -}; - -FsaFeatureFunction const* print_fsa=0; -CFG const* print_cfg=0; -inline ostream& print_cfg_rhs(std::ostream &o,WordID w,CFG const*pcfg=print_cfg) { - if (pcfg) - pcfg->print_rhs_name(o,w); - else - CFG::static_print_rhs_name(o,w); - return o; -} - -inline std::string nt_name(WordID n,CFG const*pcfg=print_cfg) { - if (pcfg) return pcfg->nt_name(n); - return CFG::static_nt_name(n); -} - -template -ostream& print_by_nt(std::ostream &o,V const& v,CFG const*pcfg=print_cfg,char const* header="\nNT -> X\n") { - o< "< -ostream& print_map_by_nt(std::ostream &o,V const& v,CFG const*pcfg=print_cfg,char const* header="\nNT -> X\n") { - o<first,pcfg) << " -> "<second<<"\n"; - } - return o; -} - -struct PrefixTrieEdge { - PrefixTrieEdge() - // : dest(0),w(TD::max_wordid) - {} - PrefixTrieEdge(WordID w,NodeP dest) - : dest(dest),w(w) - {} -// explicit PrefixTrieEdge(best_t p) : p(p),dest(0) { } - - best_t p;// viterbi additional prob, i.e. product over path incl. p_final = total rule prob. note: for final edge, set this. - //DPFSA() - // we can probably just store deltas, but for debugging remember the full p - // best_t delta; // - NodeP dest; - bool is_final() const { return dest==0; } - best_t p_dest() const; - WordID w; // for root and and is_final(), this will be (negated) NTHandle. - - // for sorting most probable first in adj; actually >(p) - inline bool operator <(PrefixTrieEdge const& o) const { - return o.p"< BPs; - void back_vec(BPs &ns) const { - IF_PRINT_PREFIX(if(backp.from) { ns.push_back(backp); backp.from->back_vec(ns); }) - } - - BPs back_vec() const { - BPs ret; - back_vec(ret); - return ret; - } - - unsigned size() const { - unsigned a=adj.size(); - unsigned e=edge_for.size(); - return a>e?a:e; - } - - void print_back_str(std::ostream &o) const { - BPs back=back_vec(); - unsigned i=back.size(); - if (!i) { - o<<"PrefixTrieNode@"<<(uintptr_t)this; - return; - } - bool first=true; - while (i--<=0) { - if (!first) o<<','; - first=false; - WordID w=back[i].w; - print_cfg_rhs(o,w); - } - } - std::string back_str() const { - std::ostringstream o; - print_back_str(o); - return o.str(); - } - -// best_t p_final; // additional prob beyond what we already paid. while building, this is the total prob -// instead of storing final, we'll say that an edge with a NULL dest is a final edge. this way it gets sorted into the list of adj. - - // instead of completed map, we have trie start w/ lhs. - NTHandle lhs; // nonneg. - instead of storing this in Item. - IF_PRINT_PREFIX(BP backp;) - - enum { ROOT=-1 }; - explicit PrefixTrieNode(NTHandle lhs=ROOT,best_t p=1) : p(p),lhs(lhs),IF_PRINT_PREFIX(backp()) { - //final=false; - } - bool is_root() const { return lhs==ROOT; } // means adj are the nonneg lhs indices, and we have the index edge_for still available - - // outgoing edges will be ordered highest p to worst p - - typedef FSA_MAP(WordID,PrefixTrieEdge) PrefixTrieEdgeFor; -public: - PrefixTrieEdgeFor edge_for; //TODO: move builder elsewhere? then need 2nd hash or edge include pointer to builder. just clear this later - bool have_adj() const { - return adj.size()>=edge_for.size(); - } - bool no_adj() const { - return adj.empty(); - } - - void index_adj() { - index_adj(edge_for); - } - template - void index_adj(M &m) { - assert(have_adj()); - m.clear(); - for (int i=0;i - void index_lhs(PV &v) { - for (int i=0,e=adj.size();i!=e;++i) { - PrefixTrieEdge const& edge=adj[i]; - // assert(edge.p.is_1()); // actually, after done_building, e will have telescoped dest->p/p. - NTHandle n=-edge.w; - assert(n>=0); -// SHOWM3(DPFSA,"index_lhs",i,edge,n); - v[n]=edge.dest; - } - } - - template - void done_root(PV &v) { - assert(is_root()); - SHOWM1(DBUILDTRIE,"done_root",OSTRF1(print_map_by_nt,edge_for)); - done_building_r(); //sets adj - SHOWM1(DBUILDTRIE,"done_root",OSTRF1(print_by_nt,adj)); -// SHOWM1(DBUILDTRIE,done_root,adj); -// index_adj(); // we want an index for the root node?. don't think so - index_lhs handles it. also we stopped clearing edge_for. - index_lhs(v); // uses adj - } - - // call only once. - void done_building_r() { - done_building(); - for (int i=0;idone_building_r(); - } - - // for done_building; compute incremental (telescoped) edge p - PrefixTrieEdge /*const&*/ operator()(PrefixTrieEdgeFor::value_type & pair) const { - PrefixTrieEdge &e=pair.second;//const_cast(pair.second); - e.p=e.p_dest()/p; - return e; - } - - // call only once. - void done_building() { - SHOWM3(DBUILDTRIE,"done_building",edge_for.size(),adj.size(),1); -#if 1 - adj.reinit_map(edge_for,*this); -#else - adj.reinit(edge_for.size()); - SHOWM3(DBUILDTRIE,"done_building_reinit",edge_for.size(),adj.size(),2); - Adj::iterator o=adj.begin(); - for (PrefixTrieEdgeFor::iterator i=edge_for.begin(),e=edge_for.end();i!=e;++i) { - SHOWM3(DBUILDTRIE,"edge_for",o-adj.begin(),i->first,i->second); - PrefixTrieEdge &edge=i->second; - edge.p=(edge.dest->p)/p; - *o++=edge; -// (*this)(*i); - } -#endif - SHOWM1(DBUILDTRIE,"done building adj",prange(adj.begin(),adj.end(),true)); - assert(adj.size()==edge_for.size()); -// if (final) p_final/=p; - std::sort(adj.begin(),adj.end()); - //TODO: store adjacent differences on edges (compared to - } - - typedef ValueArray Adj; -// typedef vector Adj; - Adj adj; - - typedef WordID W; - - // let's compute p_min so that every rule reachable from the created node has p at least this low. - NodeP improve_edge(PrefixTrieEdge const& e,best_t rulep) { - NodeP d=e.dest; - maybe_improve(d->p,rulep); - return d; - } - - inline NodeP build(W w,best_t rulep) { - return build(lhs,w,rulep); - } - inline NodeP build_lhs(NTHandle n,best_t rulep) { - return build(n,-n,rulep); - } - - NodeP build(NTHandle lhs_,W w,best_t rulep) { - PrefixTrieEdgeFor::iterator i=edge_for.find(w); - if (i!=edge_for.end()) - return improve_edge(i->second,rulep); - NodeP r=new PrefixTrieNode(lhs_,rulep); - IF_PRINT_PREFIX(r->backp=BP(w,this)); -// edge_for.insert(i,PrefixTrieEdgeFor::value_type(w,PrefixTrieEdge(w,r))); - add(edge_for,w,PrefixTrieEdge(w,r)); - SHOWM4(DBUILDTRIE,"built node",this,w,*r,r); - return r; - } - - void set_final(NTHandle lhs_,best_t pf) { - assert(no_adj()); -// final=true; - PrefixTrieEdge &e=edge_for[null_wordid]; - e.p=pf; - e.dest=0; - e.w=lhs_; - maybe_improve(p,pf); - } - -private: - void destroy_children() { - assert(adj.size()>=edge_for.size()); - for (int i=0,e=adj.size();i" << p; - o << ',' << size() << ','; - print_back_str(o); - } - PRINT_SELF(PrefixTrieNode) -}; - -inline best_t PrefixTrieEdge::p_dest() const { - return dest ? dest->p : p; // for final edge, p was set (no sentinel node) -} - - -//Trie starts with lhs (nonneg index), then continues w/ rhs (mixed >0 word, else NT) -// trie ends with final edge, which points to a per-lhs prefix node -struct PrefixTrie { - void print(std::ostream &o) const { - o << cfgp << ' ' << root; - } - PRINT_SELF(PrefixTrie); - CFG *cfgp; - Rules const* rulesp; - Rules const& rules() const { return *rulesp; } - CFG const& cfg() const { return *cfgp; } - PrefixTrieNode root; - typedef std::vector LhsToTrie; // will have to check lhs2[lhs].p for best cost of some rule with that lhs, then use edge deltas after? they're just caching a very cheap computation, really - LhsToTrie lhs2; // no reason to use a map or hash table; every NT in the CFG will have some rule rhses. lhs_to_trie[i]=root.edge_for[i], i.e. we still have a root trie node conceptually, we just access through this since it's faster. - typedef LhsToTrie LhsToComplete; - LhsToComplete lhs2complete; // the sentinel "we're completing" node (dot at end) for that lhs. special case of suffix-set=same trie minimization (aka right branching binarization) // these will be used to track kbest completions, along with a l state (r state will be in the list) - PrefixTrie(CFG &cfg) : cfgp(&cfg),rulesp(&cfg.rules),lhs2(cfg.nts.size(),0),lhs2complete(cfg.nts.size()) { -// cfg.SortLocalBestFirst(); // instead we'll sort in done_building_r - print_cfg=cfgp; - SHOWM2(DBUILDTRIE,"PrefixTrie()",rulesp->size(),lhs2.size()); - cfg.VisitRuleIds(*this); - root.done_root(lhs2); - SHOWM3(DBUILDTRIE,"done w/ PrefixTrie: ",root,root.adj.size(),lhs2.size()); - DBUILDTRIE(print_by_nt(cerr,lhs2,cfgp)); - SHOWM1(DBUILDTRIE,"lhs2",OSTRF2(print_by_nt,lhs2,cfgp)); - } - - void operator()(int ri) { - Rule const& r=rules()[ri]; - NTHandle lhs=r.lhs; - best_t p=r.p; -// NodeP n=const_cast(root).build_lhs(lhs,p); - NodeP n=root.build_lhs(lhs,p); - SHOWM4(DBUILDTRIE,"Prefixtrie rule id, root",ri,root,p,*n); - for (RHS::const_iterator i=r.rhs.begin(),e=r.rhs.end();;++i) { - SHOWM2(DBUILDTRIE,"PrefixTrie build or final",i-r.rhs.begin(),*n); - if (i==e) { - n->set_final(lhs,p); - break; - } - n=n->build(*i,p); - SHOWM2(DBUILDTRIE,"PrefixTrie built",*i,*n); - } -// root.build(lhs,r.p)->build(r.rhs,r.p); - } - inline NodeP lhs2_ex(NTHandle n) const { - NodeP r=lhs2[n]; - if (!r) throw std::runtime_error("PrefixTrie: no CFG rule w/ lhs "+cfgp->nt_name(n)); - return r; - } -private: - PrefixTrie(PrefixTrie const& o); -}; - - - -typedef std::size_t ItemHash; - - -struct ItemKey { - explicit ItemKey(NodeP start,Bytes const& start_state) : dot(start),q(start_state),r(start_state) { } - explicit ItemKey(NodeP dot) : dot(dot) { } - NodeP dot; // dot is a function of the stuff already recognized, and gives a set of suffixes y to complete to finish a rhs for lhs() -> dot y. for a lhs A -> . *, this will point to lh2[A] - Bytes q,r; // (q->r are the fsa states; if r is empty it means - bool operator==(ItemKey const& o) const { - return dot==o.dot && q==o.q && r==o.r; - } - inline ItemHash hash() const { - ItemHash h=GOLDEN_MEAN_FRACTION*(ItemHash)(dot-NULL); // i.e. lower order bits of ptr are nonrandom - using namespace boost; - hash_combine(h,q); - hash_combine(h,r); - return h; - } - template - void print(O &o) const { - o<<"lhs="<print_back_str(o); - if (print_fsa) { - o<<'/'; - print_fsa->print_state(o,&q[0]); - o<<"->"; - print_fsa->print_state(o,&r[0]); - } - } - NTHandle lhs() const { return dot->lhs; } - PRINT_SELF(ItemKey) -}; -inline ItemHash hash_value(ItemKey const& x) { - return x.hash(); -} -ItemKey null_item((PrefixTrieNode*)0); - -struct Item; -typedef Item *ItemP; - -/* we use a single type of item so it can live in a single best-first queue. we hold them by pointer so they can have mutable state, e.g. priority/location, but also lists of predictions and kbest completions (i.e. completions[L,r] = L -> * (r,s), by 1best for each possible s. we may discover more s later. we could use different subtypes since we hold by pointer, but for now everything will be packed as variants of Item */ -#undef INIT_LOCATION -#if D_ARY_TRACK_OUT_OF_HEAP -# define INIT_LOCATION , location(D_ARY_HEAP_NULL_INDEX) -#elif !defined(NDEBUG) || SAFE_VALGRIND - // avoid spurious valgrind warning - FIXME: still complains??? -# define INIT_LOCATION , location() -#else -# define INIT_LOCATION -#endif - -// these should go in a global best-first queue -struct ItemPrio { - // NOTE: sum = viterbi (max) - ItemPrio() : priority(init_0()),inner(init_0()) { } - explicit ItemPrio(best_t priority) : priority(priority),inner(init_0()) { } - best_t priority; // includes inner prob. (forward) - /* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all - constrained paths of length i that end in state X[k]->x.y*/ - best_t inner; - /* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all - paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */ - template - void print(O &o) const { - o<=0; - } - explicit Item(FFState const& state,NodeP dot,best_t prio,int next=0) : ItemPrio(prio),ItemKey(dot,state),trienext(next),from(0) - INIT_LOCATION - { -// t=ADJ; -// if (dot->adj.size()) - dot->p_delta(next,priority); -// SHOWM1(DFSA,"Item(state,dot,prio)",prio); - } - typedef std::queue Predicted; -// Predicted predicted; // this is empty, unless this is a predicted L -> .asdf item, or a to-complete L -> asdf . - int trienext; // index of dot->adj to complete (if dest==0), or predict (if NT), or scan (if word). note: we could store pointer inside adj since it and trie are @ fixed addrs. less pointer arith, more space. - ItemP from; //backpointer - 0 for L -> . asdf for the rest; L -> a .sdf, it's the L -> .asdf item. - ItemP predicted_from() const { - ItemP p=(ItemP)this; - while(p->from) p=p->from; - return p; - } - template - void print(O &o) const { - o<< '['; - o< -struct ApplyFsa { - ApplyFsa(HgCFG &i, - const SentenceMetadata& smeta, - const FsaFeatureFunction& fsa, - DenseWeightVector const& weights, - ApplyFsaBy const& by, - Hypergraph* oh - ) - :hgcfg(i),smeta(smeta),fsa(fsa),weights(weights),by(by),oh(oh) - { - stateless=!fsa.state_bytes(); - } - void Compute() { - if (by.IsBottomUp() || stateless) - ApplyBottomUp(); - else - ApplyEarley(); - } - void ApplyBottomUp(); - void ApplyEarley(); - CFG const& GetCFG(); -private: - CFG cfg; - HgCFG &hgcfg; - SentenceMetadata const& smeta; - FsaFF const& fsa; -// WeightVector weight_vector; - DenseWeightVector weights; - ApplyFsaBy by; - Hypergraph* oh; - std::string cfg_out; - bool stateless; -}; - -template -void ApplyFsa::ApplyBottomUp() -{ - assert(by.IsBottomUp()); - FeatureFunctionFromFsa buff(&fsa); - buff.Init(); // mandatory to call this (normally factory would do it) - vector ffs(1,&buff); - ModelSet models(weights, ffs); - IntersectionConfiguration i(stateless ? BU_FULL : by.BottomUpAlgorithm(),by.pop_limit); - ApplyModelSet(hgcfg.ih,smeta,models,i,oh); -} - -template -void ApplyFsa::ApplyEarley() -{ - hgcfg.GiveCFG(cfg); - print_cfg=&cfg; - print_fsa=&fsa; - Chart chart(cfg,smeta,fsa); - // don't need to uniq - option to do that already exists in cfg_options - //TODO: - chart.best_first(); - *oh=hgcfg.ih; -} - - -void ApplyFsaModels(HgCFG &i, - const SentenceMetadata& smeta, - const FsaFeatureFunction& fsa, - DenseWeightVector const& weight_vector, - ApplyFsaBy const& by, - Hypergraph* oh) -{ - ApplyFsa a(i,smeta,fsa,weight_vector,by,oh); - a.Compute(); -} - -/* -namespace { -char const* anames[]={ - "BU_CUBE", - "BU_FULL", - "EARLEY", - 0 -}; -} -*/ - -//TODO: named enum type in boost? - -std::string ApplyFsaBy::name() const { -// return anames[algorithm]; - return GetName(algorithm); -} - -std::string ApplyFsaBy::all_names() { - return FsaByNames(" "); - /* - std::ostringstream o; - for (int i=0;i=N_ALGORITHMS) - throw std::runtime_error("Unknown ApplyFsaBy type id: "+itos(i)+" - legal types: "+all_names()); -*/ - GetName(i); // checks validity - algorithm=i; -} - -int ApplyFsaBy::BottomUpAlgorithm() const { - assert(IsBottomUp()); - return algorithm==BU_CUBE ? - IntersectionConfiguration::CUBE - :IntersectionConfiguration::FULL; -} - -void ApplyFsaModels(Hypergraph const& ih, - const SentenceMetadata& smeta, - const FsaFeatureFunction& fsa, - DenseWeightVector const& weights, // pre: in is weighted by these (except with fsa featval=0 before this) - ApplyFsaBy const& cfg, - Hypergraph* out) -{ - HgCFG i(ih); - ApplyFsaModels(i,smeta,fsa,weights,cfg,out); -} diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 69f40c93..4ce5749e 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -12,8 +12,6 @@ #include "ff_rules.h" #include "ff_ruleshape.h" #include "ff_bleu.h" -#include "ff_lm_fsa.h" -#include "ff_sample_fsa.h" #include "ff_source_syntax.h" #include "ff_register.h" #include "ff_charset.h" @@ -31,15 +29,6 @@ void register_feature_functions() { } registered = true; - //TODO: these are worthless example target FSA ffs. remove later - RegisterFsaImpl(true); - RegisterFsaImpl(true); - RegisterFsaImpl(true); -// ff_registry.Register("LanguageModelFsaDynamic",new FFFactory > >); // to test correctness of FsaFeatureFunctionDynamic erasure - RegisterFsaDynToFF(); - RegisterFsaImpl(true); // same as LM but using fsa wrapper - RegisterFsaDynToFF(); - RegisterFF(); RegisterFF(); @@ -47,8 +36,6 @@ void register_feature_functions() { RegisterFF(); RegisterFF(); - ff_registry.Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa - //TODO: use for all features the new Register which requires static FF::usage(false,false) give name #ifdef HAVE_RANDLM ff_registry.Register("RandLM", new FFFactory); diff --git a/decoder/feature_accum.h b/decoder/feature_accum.h deleted file mode 100755 index 4b8338eb..00000000 --- a/decoder/feature_accum.h +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef FEATURE_ACCUM_H -#define FEATURE_ACCUM_H - -#include "ff.h" -#include "sparse_vector.h" -#include "value_array.h" - -struct SparseFeatureAccumulator : public FeatureVector { - typedef FeatureVector State; - SparseFeatureAccumulator() { assert(!"this code is disabled"); } - template - FeatureVector const& describe(FF const& ) { return *this; } - void Store(FeatureVector *fv) const { -//NO fv->set_from(*this); - } - template - void Store(FF const& /* ff */,FeatureVector *fv) const { -//NO fv->set_from(*this); - } - template - void Add(FF const& /* ff */,FeatureVector const& fv) { - (*this)+=fv; - } - void Add(FeatureVector const& fv) { - (*this)+=fv; - } - /* - SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {} - FeatureAccumulator(Features const& fids) {} - FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {} - void Add(Features const& fids,FeatureVector const& fv) { - *this += fv; - } - */ - void Add(int i,Featval v) { -//NO (*this)[i]+=v; - } - void Add(Features const& fids,int i,Featval v) { -//NO (*this)[i]+=v; - } -}; - -struct SingleFeatureAccumulator { - typedef Featval State; - typedef SingleFeatureAccumulator Self; - State v; - /* - void operator +=(State const& o) { - v+=o; - } - */ - void operator +=(Self const& s) { - v+=s.v; - } - SingleFeatureAccumulator() : v() {} - template - State const& describe(FF const& ) const { return v; } - - template - void Store(FF const& ff,FeatureVector *fv) const { - fv->set_value(ff.fid_,v); - } - void Store(Features const& fids,FeatureVector *fv) const { - assert(fids.size()==1); - fv->set_value(fids[0],v); - } - /* - SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); } - SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv) - { - assert(fids.size()==1); - v=fv.get_singleton(); - } - */ - - template - void Add(FF const& ff,FeatureVector const& fv) { - v+=fv.get(ff.fid_); - } - void Add(FeatureVector const& fv) { - v+=fv.get_singleton(); - } - - void Add(Features const& fids,FeatureVector const& fv) { - v += fv.get(fids[0]); - } - void Add(Featval dv) { - v+=dv; - } - void Add(int,Featval dv) { - v+=dv; - } - void Add(FeatureVector const& fids,int i,Featval dv) { - assert(fids.size()==1 && i==0); - v+=dv; - } -}; - - -#if 0 -// omitting this so we can default construct an accum. might be worth resurrecting in the future -struct ArrayFeatureAccumulator : public ValueArray { - typedef ValueArray State; - template - ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { } - ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } - ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } - ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) { - for (int i=0,e=iset_value(fids[i],(*this)[i]); - } - void Add(Features const& fids,FeatureVector const& fv) { - for (int i=0,e=i -#include "ff_fsa_dynamic.h" - class FeatureFunction; class FsaFeatureFunction; diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h deleted file mode 100755 index f8d79e03..00000000 --- a/decoder/ff_from_fsa.h +++ /dev/null @@ -1,304 +0,0 @@ -#ifndef FF_FROM_FSA_H -#define FF_FROM_FSA_H - -#include "ff_fsa.h" - -#ifndef TD__none -// replacing dependency on SRILM -#define TD__none -1 -#endif - -#ifndef FSA_FF_DEBUG -# define FSA_FF_DEBUG 0 -#endif -#if FSA_FF_DEBUG -# define FSAFFDBG(e,x) FSADBGif(debug(),e,x) -# define FSAFFDBGnl(e) FSADBGif_nl(debug(),e) -#else -# define FSAFFDBG(e,x) -# define FSAFFDBGnl(e) -#endif - -/* regular bottom up scorer from Fsa feature - uses guarantee about markov order=N to score ASAP - encoding of state: if less than N-1 (ctxlen) words - - usage: - typedef FeatureFunctionFromFsa LanguageModelFromFsa; -*/ - -template -class FeatureFunctionFromFsa : public FeatureFunction { - typedef void const* SP; - typedef WordID *W; - typedef WordID const* WP; -public: - template - FeatureFunctionFromFsa(I const& param) : ff(param) { - debug_=true; // because factory won't set until after we construct. - } - template - FeatureFunctionFromFsa(I & param) : ff(param) { - debug_=true; // because factory won't set until after we construct. - } - - static std::string usage(bool args,bool verbose) { - return Impl::usage(args,verbose); - } - void init_name_debug(std::string const& n,bool debug) { - FeatureFunction::init_name_debug(n,debug); - ff.init_name_debug(n,debug); - } - - // this should override - Features features() const { - DBGINIT("FeatureFunctionFromFsa features() name="<=1) - for (int j=0,ee=e.size();;++j) { // items in target side of rule - for(;;++j) { - if (j>=ee) goto rhs_done; // j may go 1 past ee due to k possibly getting to end - if (RHS_WORD(j)) break; - } - // word @j - int k=j; - while(k{"<") - FSAFFDBG(edge," end="<{"< -# define FSADBG(e,x) FSADBGif(d().debug(),e,x) -# define FSADBGnl(e) FSADBGif_nl(d().debug(),e,x) -#else -# define FSADBG(e,x) -# define FSADBGnl(e) -#endif - -#include "fast_lexical_cast.hpp" -#include -#include -#include "ff.h" -#include "sparse_vector.h" -#include "tdict.h" -#include "hg.h" -#include "ff_fsa_data.h" - -/* -usage: see ff_sample_fsa.h or ff_lm_fsa.h - - then, to decode, see ff_from_fsa.h (or TODO: left->right target-earley style rescoring) - - */ - - -template -struct FsaFeatureFunctionBase : public FsaFeatureFunctionData { - Impl const& d() const { return static_cast(*this); } - Impl & d() { return static_cast(*this); } - - // this will get called by factory - override if you have multiple or dynamically named features. note: may be called repeatedly - void Init() { - Init(name()); - DBGINIT("base (single feature) FsaFeatureFunctionBase::Init name="<set_value(fid,val) possibly with duplicates. state and next_state will never be the same memory. - //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support. however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping) - -protected: - // overrides have different name because of inheritance method hiding; - - // simple/common case; 1 fid. these need not be overriden if you have multiple feature ids - Featval Scan1(WordID w,void const* state,void *next_state) const { - assert(0); - return 0; - } - Featval Scan1Meta(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */, - WordID w,void const* state,void *next_state) const { - return d().Scan1(w,state,next_state); - } -public: - - // must override this or Scan1Meta or Scan1 - template - inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID w,void const* state,void *next_state,Accum *a) const { - Add(d().Scan1Meta(smeta,edge,w,state,next_state),a); - } - - // bounce back and forth between two state vars starting at cs, returning end state location. if we required src=dest addr safe state updating, this concept wouldn't need to exist. - // required that you override this if you score phrases differently than word-by-word, however, you can just use the SCAN_PHRASE_ACCUM_OVERRIDE macro to do that in terms of ScanPhraseAccum - template - void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { - // extra code - IT'S FOR EFFICIENCY, MAN! IT'S OK! definitely no bugs here. - if (!ssz) { - for (;io - odd: - d().ScanAccum(smeta,edge,i[0],os,es,accum); // o->e - } - return es; - } - - - static const bool simple_phrase_score=true; // if d().simple_phrase_score_, then you should expect different Phrase scores for phrase length > M. so, set this false if you provide ScanPhraseAccum (SCAN_PHRASE_ACCUM_OVERRIDE macro does this) - - // override this (and use SCAN_PHRASE_ACCUM_OVERRIDE ) if you want e.g. maximum possible order ngram scores with markov_order < n-1. in the future SparseFeatureAccumulator will probably be the only option for type-erased FSA ffs. - // note you'll still have to override ScanAccum - template - void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge, - WordID const* i, WordID const* end, - void const* state,void *next_state,Accum *accum) const { - if (!ssz) { - for (;i \ - void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const { \ - ScanPhraseAccum(smeta,edge,i,end,cs,ns,accum); \ - return ns; \ - } \ - template \ - void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, \ - WordID const* i, WordID const* end, \ - void const* state,Accum *accum) const { \ - char s2[ssz]; ScanPhraseAccum(smeta,edge,i,end,state,(void*)s2,accum); \ - } - - // override this or bounce along with above. note: you can just call ScanPhraseAccum - // doesn't set state (for heuristic in ff_from_fsa) - template - void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID const* i, WordID const* end, - void const* state,Accum *accum) const { - char s1[ssz]; - char s2[ssz]; - state_copy(s1,state); - d().ScanPhraseAccumBounce(smeta,edge,i,end,(void*)s1,(void*)s2,accum); - } - - // for single-feat only. but will work for different accums - template - inline void Add(Featval v,Accum *a) const { - a->Add(fid_,v); - } - inline void set_feat(FeatureVector *features,Featval v) const { - features->set_value(fid_,v); - } - - // don't set state-bytes etc. in ctor because it may depend on parsing param string - FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) - : FsaFeatureFunctionData(statesz,end_sentence_phrase) - { - name_=name(); // should allow FsaDynamic wrapper to get name copied to it with sync - } - -}; - -template -struct MultipleFeatureFsa : public FsaFeatureFunctionBase { - typedef SparseFeatureAccumulator Accum; -}; - - - - -// if State is pod. sets state size and allocs start, h_start -// usage: -// struct ShorterThanPrev : public FsaTypedBase -// i.e. Impl is a CRTP -template -struct FsaTypedBase : public FsaFeatureFunctionBase { - Impl const& d() const { return static_cast(*this); } - Impl & d() { return static_cast(*this); } -protected: - typedef FsaFeatureFunctionBase Base; - typedef St State; - static inline State & state(void *state) { - return *(State*)state; - } - static inline State const& state(void const* state) { - return *(State const*)state; - } - void set_starts(State const& s,State const& heuristic_s) { - if (0) { // already in ctor - Base::start.resize(sizeof(State)); - Base::h_start.resize(sizeof(State)); - } - assert(Base::start.size()==sizeof(State)); - assert(Base::h_start.size()==sizeof(State)); - state(Base::start.begin())=s; - state(Base::h_start.begin())=heuristic_s; - } - FsaTypedBase(St const& start_st=St() - ,St const& h_start_st=St() - ,Sentence const& end_sentence_phrase=Sentence()) - : Base(sizeof(State),end_sentence_phrase) { - set_starts(start_st,h_start_st); - } -public: - void print_state(std::ostream &o,void const*st) const { - o< - inline void ScanT(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,St const& prev_st,St &new_st,Accum *a) const { - Add(d().ScanT1(smeta,edge,w,prev_st,new_st),a); - } - - // note: you're on your own when it comes to Phrase overrides. see FsaFeatureFunctionBase. sorry. - - template - inline void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID w,void const* st,void *next_state,Accum *a) const { - Impl const& im=d(); - FSADBG(edge,"Scan "<describe(im)<<" "<"< -struct FsaScanner { -// enum {ALIGN=8}; - static const int ALIGN=8; - FF const& ff; - SentenceMetadata const& smeta; - int ssz; - Bytes states; // first is at begin, second is at (char*)begin+stride - void *st0; // states - void *st1; // states+stride - void *cs; // initially st0, alternates between st0 and st1 - inline void *nexts() const { - return (cs==st0)?st1:st0; - } - Hypergraph::Edge const& edge; - FsaScanner(FF const& ff,SentenceMetadata const& smeta,Hypergraph::Edge const& edge) : ff(ff),smeta(smeta),edge(edge) - { - ssz=ff.state_bytes(); - int stride=((ssz+ALIGN-1)/ALIGN)*ALIGN; // round up to multiple of ALIGN - states.resize(stride+ssz); - st0=states.begin(); - st1=(char*)st0+stride; -// for (int i=0;i<2;++i) st[i]=cs+(i*stride); - } - void reset(void const* state) { - cs=st0; - std::memcpy(st0,state,ssz); - } - template - void scan(WordID w,Accum *a) { - void *ns=nexts(); - ff.ScanAccum(smeta,edge,w,cs,ns,a); - cs=ns; - } - template - void scan(WordID const* i,WordID const* end,Accum *a) { - // faster. and allows greater-order excursions - cs=ff.ScanPhraseAccumBounce(smeta,edge,i,end,cs,nexts(),a); - } -}; - - -//TODO: combine 2 FsaFeatures typelist style (can recurse for more) - - - - -#endif diff --git a/decoder/ff_fsa_data.h b/decoder/ff_fsa_data.h deleted file mode 100755 index d215e940..00000000 --- a/decoder/ff_fsa_data.h +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef FF_FSA_DATA_H -#define FF_FSA_DATA_H - -#include //C99 -#include -#include "sentences.h" -#include "feature_accum.h" -#include "value_array.h" -#include "ff.h" //debug -typedef ValueArray Bytes; - -// stuff I see no reason to have virtual. but because it's impossible (w/o virtual inheritance to have dynamic fsa ff know where the impl's data starts, implemented a sync (copy) method that needs to be called. init_name_debug was already necessary to keep state in sync between ff and ff_from_fsa, so no sync should be needed after it. supposing all modifications were through setters, then no explicit sync call would ever be needed; updates could be mirrored. -struct FsaFeatureFunctionData -{ - void init_name_debug(std::string const& n,bool debug) { - name_=n; - debug_=debug; - } - //HACK for diamond inheritance (w/o costing performance) - FsaFeatureFunctionData *sync_to_; - - void sync() const { // call this if you modify any fields after your constructor is done - if (sync_to_) { - DBGINIT("sync to "<<*sync_to_); - *sync_to_=*this; - DBGINIT("synced result="<<*sync_to_<< " from this="<<*this); - } else { - DBGINIT("nobody to sync to - from FeatureFunctionData this="<<*this); - } - } - - friend std::ostream &operator<<(std::ostream &o,FsaFeatureFunctionData const& d) { - o << "[FSA "< - static inline T* state_as(void *p) { return (T*)p; } - template - static inline T const* state_as(void const* p) { return (T*)p; } - std::string describe_features(FeatureVector const& feats) { - std::ostringstream o; - o<" for lm. -protected: - int ssz; // don't forget to set this. default 0 (it may depend on params of course) - // this can be called instead or after constructor (also set bytes and end_phrase_) - void set_state_bytes(int sb=0) { - if (start.size()!=sb) start.resize(sb); - if (h_start.size()!=sb) h_start.resize(sb); - ssz=sb; - } - void set_end_phrase(WordID single) { - end_phrase_=singleton_sentence(single); - } - - inline void static to_state(void *state,char const* begin,char const* end) { - std::memcpy(state,begin,end-begin); - } - inline void static to_state(void *state,char const* begin,int n) { - std::memcpy(state,begin,n); - } - template - inline void static to_state(void *state,T const* begin,int n=1) { - to_state(state,(char const*)begin,n*sizeof(T)); - } - template - inline void static to_state(void *state,T const* begin,T const* end) { - to_state(state,(char const*)begin,(char const*)end); - } - inline static char hexdigit(int i) { - int j=i-10; - return j>=0?'a'+j:'0'+i; - } - inline static void print_hex_byte(std::ostream &o,unsigned c) { - o<>4); - o<Add(v); - } - -}; - -#endif diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h deleted file mode 100755 index 6f75bbe5..00000000 --- a/decoder/ff_fsa_dynamic.h +++ /dev/null @@ -1,208 +0,0 @@ -#ifndef FF_FSA_DYNAMIC_H -#define FF_FSA_DYNAMIC_H - -struct SentenceMetadata; - -#include "ff_fsa_data.h" -#include "hg.h" // can't forward declare nested Hypergraph::Edge class -#include - -// the type-erased interface - -//FIXME: diamond inheritance problem. make a copy of the fixed data? or else make the dynamic version not wrap but rather be templated CRTP base (yuck) -struct FsaFeatureFunction : public FsaFeatureFunctionData { - static const bool simple_phrase_score=false; - virtual int markov_order() const = 0; - - // see ff_fsa.h - FsaFeatureFunctionBase gives you reasonable impls of these if you override just ScanAccum - virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID w,void const* state,void *next_state,Accum *a) const = 0; - virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge, - WordID const* i, WordID const* end, - void const* state,void *next_state,Accum *accum) const = 0; - virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID const* i, WordID const* end, - void const* state,Accum *accum) const = 0; - virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const = 0; - - virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { return 0; } - // called after constructor, before use - virtual void Init() = 0; - virtual std::string usage_v(bool param,bool verbose) const { - return FeatureFunction::usage_helper("unnamed_dynamic_fsa_feature","","",param,verbose); - } - virtual void init_name_debug(std::string const& n,bool debug) { - FsaFeatureFunctionData::init_name_debug(n,debug); - } - - virtual void print_state(std::ostream &o,void const*state) const { - FsaFeatureFunctionData::print_state(o,state); - } - virtual std::string describe() const { return "[FSA unnamed_dynamic_fsa_feature]"; } - - //end_phrase() - virtual ~FsaFeatureFunction() {} - - // no need to override: - std::string describe_state(void const* state) const { - std::ostringstream o; - print_state(o,state); - return o.str(); - } -}; - -// conforming to above interface, type erases FsaImpl -// you might be wondering: why do this? answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch -// usage: typedef FsaFeatureFunctionDynamic MyFsaDyn; -template -struct FsaFeatureFunctionDynamic : public FsaFeatureFunction { - static const bool simple_phrase_score=Impl::simple_phrase_score; - Impl& d() { return impl;//static_cast(*this); - } - Impl const& d() const { return impl; - //static_cast(*this); - } - int markov_order() const { return d().markov_order(); } - - std::string describe() const { - return d().describe(); - } - - virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID w,void const* state,void *next_state,Accum *a) const { - return d().ScanAccum(smeta,edge,w,state,next_state,a); - } - - virtual void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge, - WordID const* i, WordID const* end, - void const* state,void *next_state,Accum *a) const { - return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a); - } - - virtual void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID const* i, WordID const* end, - void const* state,Accum *a) const { - return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a); - } - - virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const { - return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a); - } - - virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { - return d().early_score_words(smeta,edge,i,end,accum); - } - - static std::string usage(bool param,bool verbose) { - return Impl::usage(param,verbose); - } - - std::string usage_v(bool param,bool verbose) const { - return Impl::usage(param,verbose); - } - - virtual void print_state(std::ostream &o,void const*state) const { - return d().print_state(o,state); - } - - void init_name_debug(std::string const& n,bool debug) { - FsaFeatureFunction::init_name_debug(n,debug); - d().init_name_debug(n,debug); - } - - virtual void Init() { - d().sync_to_=(FsaFeatureFunctionData*)this; - d().Init(); - d().sync(); - } - - template - FsaFeatureFunctionDynamic(I const& param) : impl(param) { - Init(); - } -private: - Impl impl; -}; - -// constructor takes ptr or shared_ptr to Impl, otherwise same as above - note: not virtual -template -struct FsaFeatureFunctionPimpl : public FsaFeatureFunctionData { - typedef boost::shared_ptr Pimpl; - static const bool simple_phrase_score=Impl::simple_phrase_score; - Impl const& d() const { return *p_; } - int markov_order() const { return d().markov_order(); } - - std::string describe() const { - return d().describe(); - } - - void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID w,void const* state,void *next_state,Accum *a) const { - return d().ScanAccum(smeta,edge,w,state,next_state,a); - } - - void ScanPhraseAccum(SentenceMetadata const& smeta,Hypergraph::Edge const & edge, - WordID const* i, WordID const* end, - void const* state,void *next_state,Accum *a) const { - return d().ScanPhraseAccum(smeta,edge,i,end,state,next_state,a); - } - - void ScanPhraseAccumOnly(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, - WordID const* i, WordID const* end, - void const* state,Accum *a) const { - return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a); - } - - void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const { - return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a); - } - - int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { - return d().early_score_words(smeta,edge,i,end,accum); - } - - static std::string usage(bool param,bool verbose) { - return Impl::usage(param,verbose); - } - - std::string usage_v(bool param,bool verbose) const { - return Impl::usage(param,verbose); - } - - void print_state(std::ostream &o,void const*state) const { - return d().print_state(o,state); - } - -#if 0 - // this and Init() don't touch p_ because we want to leave the original alone. - void init_name_debug(std::string const& n,bool debug) { - FsaFeatureFunctionData::init_name_debug(n,debug); - } -#endif - void Init() { - p_=hold_pimpl_.get(); -#if 0 - d().sync_to_=static_cast(this); - d().Init(); -#endif - *static_cast(this)=d(); - } - - FsaFeatureFunctionPimpl(Impl const* const p) : hold_pimpl_(p,null_deleter()) { - Init(); - } - FsaFeatureFunctionPimpl(Pimpl const& p) : hold_pimpl_(p) { - Init(); - } -private: - Impl const* p_; - Pimpl hold_pimpl_; -}; - -typedef FsaFeatureFunctionPimpl FsaFeatureFunctionFwd; // allow ff_from_fsa for an existing dynamic-type ff (as opposed to usual register a wrapped known-type FSA in ff_register, which is more efficient) -//typedef FsaFeatureFunctionDynamic DynamicFsaFeatureFunctionFwd; //if you really need to have a dynamic fsa facade that's also a dynamic fsa - -//TODO: combine 2 (or N) FsaFeatureFunction (type erased) - - -#endif diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index afa36b96..5e16d4e3 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -46,7 +46,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight #endif #include "ff_lm.h" -#include "ff_lm_fsa.h" #include #include @@ -69,10 +68,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight using namespace std; -string LanguageModelFsa::usage(bool param,bool verbose) { - return FeatureFunction::usage_helper("LanguageModelFsa",usage_short,usage_verbose,param,verbose); -} - string LanguageModel::usage(bool param,bool verbose) { return FeatureFunction::usage_helper(usage_name,usage_short,usage_verbose,param,verbose); } @@ -524,49 +519,6 @@ LanguageModel::LanguageModel(const string& param) { SetStateSize(LanguageModelImpl::OrderToStateSize(order)); } -//TODO: decide whether to waste a word of space so states are always none-terminated for SRILM. otherwise we have to copy -void LanguageModelFsa::set_ngram_order(int i) { - assert(i>0); - ngram_order_=i; - ctxlen_=i-1; - set_state_bytes(ctxlen_*sizeof(WordID)); - WordID *ss=(WordID*)start.begin(); - WordID *hs=(WordID*)h_start.begin(); - if (ctxlen_) { // avoid segfault in case of unigram lm (0 state) - set_end_phrase(TD::Convert("")); -// se is pretty boring in unigram case, just adds constant prob. check that this is what we want - ss[0]=TD::Convert(""); // start-sentence context (length 1) - hs[0]=0; // empty context - for (int i=1;ifloor_; - set_ngram_order(lmorder); -} - -void LanguageModelFsa::print_state(ostream &o,void const* st) const { - WordID const *wst=(WordID const*)st; - o<<'['; - bool sp=false; - for (int i=ctxlen_;i>0;sp=true) { - --i; - WordID w=wst[i]; - if (w==0) continue; - if (sp) o<<' '; - o << TD::Convert(w); - } - o<<']'; -} - Features LanguageModel::features() const { return single_feature(fid_); } diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h deleted file mode 100755 index 85b7ef44..00000000 --- a/decoder/ff_lm_fsa.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef FF_LM_FSA_H -#define FF_LM_FSA_H - -//FIXME: when FSA_LM_PHRASE 1, 3gram fsa has differences, especially with unk words, in about the 4th decimal digit (about .05%), compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0 - -// enabling for now - retest unigram+ more, solve above puzzle - -// some impls in ff_lm.cc - -#define FSA_LM_PHRASE 1 - -#define FSA_LM_DEBUG 0 -#if FSA_LM_DEBUG -# define FSALMDBG(e,x) FSADBGif(debug(),e,x) -# define FSALMDBGnl(e) FSADBGif_nl(debug(),e) -#else -# define FSALMDBG(e,x) -# define FSALMDBGnl(e) -#endif - -#include "ff_fsa.h" -#include "ff_lm.h" - -#ifndef TD__none -// replacing dependency on SRILM -#define TD__none -1 -#endif - -namespace { -WordID empty_context=TD__none; -} - -struct LanguageModelFsa : public FsaFeatureFunctionBase { - typedef WordID * W; - typedef WordID const* WP; - - // overrides; implementations in ff_lm.cc - typedef SingleFeatureAccumulator Accum; - static std::string usage(bool,bool); - LanguageModelFsa(std::string const& param); - int markov_order() const { return ctxlen_; } - void print_state(std::ostream &,void const *) const; - inline Featval floored(Featval p) const { - return pleft;--e) - if (e[-1]!=TD__none) break; - //post: [left,e] are the seen left words - return e; - } - - template - void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const { -#if USE_INFO_EDGE - Hypergraph::Edge &de=(Hypergraph::Edge &)edge; -#endif - if (!ctxlen_) { - Add(floored(pimpl_->WordProb(w,&empty_context)),a); - } else { - WordID ctx[ngram_order_]; //alloca if you don't have C99 - state_copy(ctx,old_st); - ctx[ctxlen_]=TD__none; - Featval p=floored(pimpl_->WordProb(w,ctx)); - FSALMDBG(de,"p("<ShortenContext(nst,ctxlen_); -#endif - Add(p,a); - } - } - -#if FSA_LM_PHRASE - //FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?) - template - void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const { - Hypergraph::Edge &de=(Hypergraph::Edge &)edge;(void)de; - if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all. - /* // this is forcing unigram prob always. we will instead build the phrase - if (!ctxlen_) { - Featval p=0; - for (;iWordProb(*i,e&mpty_context)); - Add(p,a); - return; - } */ - int nw=end-begin; - WP st=(WP)old_st; - WP st_end=st+ctxlen_; // may include some null already (or none if full) - int nboth=nw+ctxlen_; - WordID ctx[nboth+1]; - ctx[nboth]=TD__none; - // reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0] - W ctx_score_end=wordcpy_reverse(ctx,begin,end); - wordcpy(ctx_score_end,st,st_end); // st already reversed. - assert(ctx_score_end==ctx+nw); - // we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones. most contexts are full except for the shortest source spans. - FSALMDBG(de," scan.r->l("<ctx;--ctx_score_end) - p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end)); - //TODO: look for score discrepancy - - // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that. perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing. -#if LM_FSA_SHORTEN_CONTEXT - p+=pimpl_->ShortenContext(ctx,nboth - need to use factory rather than ctor. -#if 0 -template -inline void RegisterFsa(bool ff_also=true,bool fsa_prefix_ff=true) { - assert(!ff_also); -// global_fsa_ff_registry->RegisterFsa(); -//if (ff_also) ff_registry.RegisterFF >(prefix_fsa(DynFsa::usage(false,false)),fsa_prefix_ff); -} -#endif - -//TODO: ff from fsa that uses pointer to fsa impl? e.g. in LanguageModel we share underlying lm file by recognizing same param, but without that effort, otherwise stateful ff may duplicate state if we enable both fsa and ff_from_fsa -template -inline void RegisterFsaImpl(bool ff_also=true,bool fsa_prefix_ff=false) { - typedef FsaFeatureFunctionDynamic DynFsa; - typedef FeatureFunctionFromFsa FFFrom; - std::string name=FsaImpl::usage(false,false); - fsa_ff_registry.Register(new FsaFactory); - if (ff_also) - ff_registry.Register(prefix_fsa(name,fsa_prefix_ff),new FFFactory); -} template inline void RegisterFF() { ff_registry.Register(new FFFactory); } -template -inline void RegisterFsaDynToFF(std::string name,bool prefix=true) { - typedef FsaFeatureFunctionDynamic DynFsa; - ff_registry.Register(prefix?"DynamicFsa"+name:name,new FFFactory >); -} - -template -inline void RegisterFsaDynToFF(bool prefix=true) { - RegisterFsaDynToFF(FsaImpl::usage(false,false),prefix); -} - void register_feature_functions(); #endif diff --git a/decoder/hg_test.cc b/decoder/hg_test.cc index 3be5b82d..5d1910fb 100644 --- a/decoder/hg_test.cc +++ b/decoder/hg_test.cc @@ -57,7 +57,7 @@ TEST_F(HGTest,Union) { c3 = ViterbiESentence(hg1, &t3); int l3 = ViterbiPathLength(hg1); cerr << c3 << "\t" << TD::GetString(t3) << endl; - EXPECT_FLOAT_EQ(c2, c3); + EXPECT_FLOAT_EQ(c2.as_float(), c3.as_float()); EXPECT_EQ(TD::GetString(t2), TD::GetString(t3)); EXPECT_EQ(l2, l3); @@ -117,7 +117,7 @@ TEST_F(HGTest,InsideScore) { cerr << "cost: " << cost << "\n"; hg.PrintGraphviz(); prob_t inside = Inside(hg); - EXPECT_FLOAT_EQ(1.7934048, inside); // computed by hand + EXPECT_FLOAT_EQ(1.7934048, inside.as_float()); // computed by hand vector post; inside = hg.ComputeBestPathThroughEdges(&post); EXPECT_FLOAT_EQ(-0.3, log(inside)); // computed by hand @@ -282,13 +282,13 @@ TEST_F(HGTest, TestGenericInside) { hg.Reweight(wts); vector inside; prob_t ins = Inside(hg, &inside); - EXPECT_FLOAT_EQ(1.7934048, ins); // computed by hand + EXPECT_FLOAT_EQ(1.7934048, ins.as_float()); // computed by hand vector outside; Outside(hg, inside, &outside); EXPECT_EQ(3, outside.size()); - EXPECT_FLOAT_EQ(1.7934048, outside[0]); - EXPECT_FLOAT_EQ(1.3114071, outside[1]); - EXPECT_FLOAT_EQ(1.0, outside[2]); + EXPECT_FLOAT_EQ(1.7934048, outside[0].as_float()); + EXPECT_FLOAT_EQ(1.3114071, outside[1].as_float()); + EXPECT_FLOAT_EQ(1.0, outside[2].as_float()); } TEST_F(HGTest,TestGenericInside2) { @@ -327,8 +327,8 @@ TEST_F(HGTest,TestAddExpectations) { SparseVector feat_exps; prob_t z = InsideOutside, EdgeFeaturesAndProbWeightFunction>(hg, &feat_exps); - EXPECT_FLOAT_EQ(-2.5439765, feat_exps.value(FD::Convert("f1")) / z); - EXPECT_FLOAT_EQ(-2.6357865, feat_exps.value(FD::Convert("f2")) / z); + EXPECT_FLOAT_EQ(-2.5439765, (feat_exps.value(FD::Convert("f1")) / z).as_float()); + EXPECT_FLOAT_EQ(-2.6357865, (feat_exps.value(FD::Convert("f2")) / z).as_float()); cerr << feat_exps << endl; cerr << "Z=" << z << endl; } diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc index f87b7274..993627f0 100644 --- a/training/mpi_online_optimize.cc +++ b/training/mpi_online_optimize.cc @@ -9,6 +9,7 @@ #include #include +#include "stringlib.h" #include "verbose.h" #include "hg.h" #include "prob.h" @@ -204,6 +205,7 @@ bool LoadAgenda(const string& file, vector >* a) { } int main(int argc, char** argv) { + cerr << "THIS SOFTWARE IS DEPRECATED YOU SHOULD USE mpi_flex_optimize\n"; #ifdef HAVE_MPI mpi::environment env(argc, argv); mpi::communicator world; -- cgit v1.2.3 From e4bc27b99d21ce4f3c6a5568070186015c655155 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 14 Oct 2011 22:39:37 +0100 Subject: le optimizer --- training/Makefile.am | 4 + training/mpi_flex_optimize.cc | 346 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 350 insertions(+) create mode 100644 training/mpi_flex_optimize.cc (limited to 'training') diff --git a/training/Makefile.am b/training/Makefile.am index 0b598fd5..2a11ae52 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -12,6 +12,7 @@ bin_PROGRAMS = \ mpi_extract_reachable \ mpi_extract_features \ mpi_online_optimize \ + mpi_flex_optimize \ mpi_batch_optimize \ mpi_compute_cllh \ augment_grammar @@ -25,6 +26,9 @@ TESTS = lbfgs_test optimize_test mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc online_optimizer.cc optimize.cc +mpi_flex_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc new file mode 100644 index 00000000..87c5f331 --- /dev/null +++ b/training/mpi_flex_optimize.cc @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "stringlib.h" +#include "verbose.h" +#include "hg.h" +#include "prob.h" +#include "inside_outside.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "optimize.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +#ifdef HAVE_MPI +#include +#include +namespace mpi = boost::mpi; +#endif + +using namespace std; +namespace po = boost::program_options; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("cdec_config,c",po::value(),"Decoder configuration file") + ("weights,w",po::value(),"Initial feature weights") + ("training_data,d",po::value(),"Training data") + ("minibatch_size_per_proc,s", po::value()->default_value(6), "Number of training instances evaluated per processor in each minibatch") + ("optimization_method,m", po::value()->default_value("lbfgs"), "Optimization method (options: lbfgs, sgd, rprop)") + ("minibatch_iterations,i", po::value()->default_value(10), "Number of optimization iterations per minibatch (1 = standard SGD)") + ("iterations,I", po::value()->default_value(50), "Number of passes through the training data before termination") + ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") + ("lbfgs_memory_buffers,M", po::value()->default_value(10), "Number of memory buffers for LBFGS history") + ("eta_0,e", po::value()->default_value(0.1), "Initial learning rate for SGD") + ("L1,1","Use L1 regularization") + ("L2,2","Use L2 regularization") + ("regularization_strength,C", po::value()->default_value(1.0), "Regularization strength (C)"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("training_data") || !conf->count("cdec_config")) { + cerr << "General-purpose minibatch online optimizer (MPI support " +#if HAVE_MPI + << "enabled" +#else + << "not enabled" +#endif + << ")\n" << dcmdline_options << endl; + return false; + } + return true; +} + +void ReadTrainingCorpus(const string& fname, int rank, int size, vector* c, vector* order) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + int id = 0; + while(in) { + getline(in, line); + if (!in) break; + if (id % size == rank) { + c->push_back(line); + order->push_back(id); + } + ++id; + } +} + +static const double kMINUS_EPSILON = -1e-6; + +struct CopyHGsObserver : public DecoderObserver { + Hypergraph* hg_; + Hypergraph* gold_hg_; + + // this can free up some memory + void RemoveRules(Hypergraph* h) { + for (unsigned i = 0; i < h->edges_.size(); ++i) + h->edges_[i].rule_.reset(); + } + + void SetCurrentHypergraphs(Hypergraph* h, Hypergraph* gold_h) { + hg_ = h; + gold_hg_ = gold_h; + } + + virtual void NotifyDecodingStart(const SentenceMetadata&) { + state = 1; + } + + // compute model expectations, denominator of objective + virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) { + *hg_ = *hg; + RemoveRules(hg_); + assert(state == 1); + state = 2; + } + + // compute "empirical" expectations, numerator of objective + virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) { + assert(state == 2); + state = 3; + *gold_hg_ = *hg; + RemoveRules(gold_hg_); + } + + virtual void NotifyDecodingComplete(const SentenceMetadata&) { + if (state == 3) { + } else { + hg_->clear(); + gold_hg_->clear(); + } + } + + int state; +}; + +void ReadConfig(const string& ini, istringstream* out) { + ReadFile rf(ini); + istream& in = *rf.stream(); + ostringstream os; + while(in) { + string line; + getline(in, line); + if (!in) continue; + os << line << endl; + } + out->str(os.str()); +} + +#ifdef HAVE_MPI +namespace boost { namespace mpi { + template<> + struct is_commutative >, SparseVector > + : mpl::true_ { }; +} } // end namespace boost::mpi +#endif + +void AddGrad(const SparseVector x, double s, SparseVector* acc) { + for (SparseVector::const_iterator it = x.begin(); it != x.end(); ++it) + acc->add_value(it->first, it->second.as_float() * s); +} + +int main(int argc, char** argv) { +#ifdef HAVE_MPI + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + const int size = 1; + const int rank = 0; +#endif + if (size > 1) SetSilent(true); // turn off verbose decoder output + register_feature_functions(); + MT19937* rng = NULL; + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) + return 1; + + boost::shared_ptr o; + const unsigned lbfgs_memory_buffers = conf["lbfgs_memory_buffers"].as(); + + istringstream ins; + ReadConfig(conf["cdec_config"].as(), &ins); + Decoder decoder(&ins); + + // load initial weights + vector init_weights; + if (conf.count("weights")) + Weights::InitFromFile(conf["weights"].as(), &init_weights); + + vector corpus; + vector ids; + ReadTrainingCorpus(conf["training_data"].as(), rank, size, &corpus, &ids); + assert(corpus.size() > 0); + + const unsigned size_per_proc = conf["minibatch_size_per_proc"].as(); + if (size_per_proc > corpus.size()) { + cerr << "Minibatch size must be smaller than corpus size!\n"; + return 1; + } + + size_t total_corpus_size = 0; +#ifdef HAVE_MPI + reduce(world, corpus.size(), total_corpus_size, std::plus(), 0); +#else + total_corpus_size = corpus.size(); +#endif + + if (conf.count("random_seed")) + rng = new MT19937(conf["random_seed"].as()); + else + rng = new MT19937; + + const unsigned minibatch_iterations = conf["minibatch_iterations"].as(); + + if (rank == 0) { + cerr << "Total corpus size: " << total_corpus_size << endl; + const unsigned batch_size = size_per_proc * size; + } + + SparseVector x; + Weights::InitSparseVector(init_weights, &x); + CopyHGsObserver observer; + + int write_weights_every_ith = 100; // TODO configure + int titer = -1; + + vector& lambdas = decoder.CurrentWeightVector(); + lambdas.swap(init_weights); + init_weights.clear(); + + int iter = -1; + bool converged = false; + while (!converged) { +#ifdef HAVE_MPI + mpi::timer timer; +#endif + x.init_vector(&lambdas); + ++iter; ++titer; +#if 0 + if (rank == 0) { + converged = (iter == max_iteration); + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); + string fname = "weights.cur.gz"; + if (iter % write_weights_every_ith == 0) { + ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz"; + fname = o.str(); + } + if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; } + ostringstream vv; + vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast(corpus.size())) << " eta=" << lr->eta(titer); + const string svv = vv.str(); + cerr << svv << endl; + Weights::WriteToFile(fname, lambdas, true, &svv); + } +#endif + + vector hgs(size_per_proc); + vector gold_hgs(size_per_proc); + for (int i = 0; i < size_per_proc; ++i) { + int ei = corpus.size() * rng->next(); + int id = ids[ei]; + observer.SetCurrentHypergraphs(&hgs[i], &gold_hgs[i]); + decoder.SetId(id); + decoder.Decode(corpus[ei], &observer); + } + + SparseVector local_grad, g; + double local_obj = 0; + o.reset(); + for (unsigned mi = 0; mi < minibatch_iterations; ++mi) { + local_grad.clear(); + g.clear(); + local_obj = 0; + + for (unsigned i = 0; i < size_per_proc; ++i) { + Hypergraph& hg = hgs[i]; + Hypergraph& hg_gold = gold_hgs[i]; + if (hg.edges_.size() < 2) continue; + + hg.Reweight(lambdas); + hg_gold.Reweight(lambdas); + SparseVector model_exp, gold_exp; + const prob_t z = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(hg, &model_exp); + local_obj += log(z); + model_exp /= z; + AddGrad(model_exp, 1.0, &local_grad); + model_exp.clear(); + + const prob_t goldz = InsideOutside, + EdgeFeaturesAndProbWeightFunction>(hg_gold, &gold_exp); + local_obj -= log(goldz); + + if (log(z) - log(goldz) < kMINUS_EPSILON) { + cerr << "DIFF. ERR! log_model_z < log_gold_z: " << log(z) << " " << log(goldz) << endl; + return 1; + } + + gold_exp /= goldz; + AddGrad(gold_exp, -1.0, &local_grad); + } + + double obj = 0; +#ifdef HAVE_MPI + // TODO obj + reduce(world, local_grad, g, std::plus >(), 0); +#else + obj = local_obj; + g.swap(local_grad); +#endif + local_grad.clear(); + if (rank == 0) { + g /= (size_per_proc * size); + if (!o) + o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers)); + vector gg(FD::NumFeats()); + if (gg.size() != lambdas.size()) { lambdas.resize(gg.size()); } + for (SparseVector::const_iterator it = g.begin(); it != g.end(); ++it) + if (it->first) { gg[it->first] = it->second; } + cerr << "OBJ: " << obj << endl; + o->Optimize(obj, gg, &lambdas); + } +#ifdef HAVE_MPI + broadcast(world, x, 0); + broadcast(world, converged, 0); + world.barrier(); + if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; } +#endif + } + } + return 0; +} -- cgit v1.2.3