From d3fa97575ddfe1a91b13a407ade5f86bd305b2e0 Mon Sep 17 00:00:00 2001 From: Avneesh Date: Fri, 28 Sep 2012 11:09:33 -0700 Subject: adding latent SSVM code, modified Makefile.am and configure.ac files --- latent_svm/Makefile.am | 6 + latent_svm/latent_svm.cc | 412 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 418 insertions(+) create mode 100644 latent_svm/Makefile.am create mode 100644 latent_svm/latent_svm.cc diff --git a/latent_svm/Makefile.am b/latent_svm/Makefile.am new file mode 100644 index 00000000..673b9159 --- /dev/null +++ b/latent_svm/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = latent_svm + +latent_svm_SOURCES = latent_svm.cc +latent_svm_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/latent_svm/latent_svm.cc b/latent_svm/latent_svm.cc new file mode 100644 index 00000000..ab9c1d5d --- /dev/null +++ b/latent_svm/latent_svm.cc @@ -0,0 +1,412 @@ +/* +Points to note regarding variable names: +total_loss and prev_loss actually refer not to loss, but the metric (usually BLEU) +*/ +#include +#include +#include +#include +#include + +//boost libraries +#include +#include +#include + +//cdec libraries +#include "config.h" +#include "hg_sampler.h" +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +using boost::shared_ptr; +namespace po = boost::program_options; + +bool invert_score; +boost::shared_ptr rng; //random seed ptr + +void RandomPermutation(int len, vector* p_ids) { + vector& ids = *p_ids; + ids.resize(len); + for (int i = 0; i < len; ++i) ids[i] = i; + for (int i = len; i > 0; --i) { + int j = rng->next() * i; + if (j == i) i--; + swap(ids[i-1], ids[j]); + } +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w",po::value(),"[REQD] Input feature weights file") + ("input,i",po::value(),"[REQD] Input source file for development set") + ("passes,p", po::value()->default_value(15), "Number of passes through the training data") + ("weights_write_interval,n", po::value()->default_value(1000), "Number of lines between writing out weights") + ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("regularizer_strength,C", po::value()->default_value(0.01), "regularization strength") + ("mt_metric_scale,s", po::value()->default_value(1.0), "Cost function is -mt_metric_scale*BLEU") + ("costaug_log_bleu,l", "Flag converts BLEU to log space. Cost function is thus -mt_metric_scale*log(BLEU). Not on by default") + ("average,A", "Average the weights (this is a weighted average due to the scaling factor)") + ("mu,u", po::value()->default_value(0.0), "weight (between 0 and 1) to scale model score by for oracle selection") + ("stepsize_param,a", po::value()->default_value(0.01), "Stepsize parameter, during optimization") + ("stepsize_reduce,t", "Divide step size by sqrt(number of examples seen so far), as per Ratliff et al., 2007") + ("metric_threshold,T", po::value()->default_value(0.0), "Threshold for diff between oracle BLEU and cost-aug BLEU for updating the weights") + ("check_positive,P", "Check that the loss is positive before updating") + ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") + ("best_ever,b", "Keep track of the best hypothesis we've ever seen (metric score), and use that as the reference") + ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("weights") || !conf->count("input") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +double scaling_trick = 1; // see http://blog.smola.org/post/940672544/fast-quadratic-regularization-for-online-learning +/*computes and returns cost augmented score for negative example selection*/ +double cost_augmented_score(const LogVal model_score, const double mt_metric_score, const double mt_metric_scale, const bool logbleu) { + if(logbleu) { + if(mt_metric_score != 0) + // NOTE: log(model_score) is just the model score feature weights * features + return log(model_score) * scaling_trick + (- mt_metric_scale * log(mt_metric_score)); + else + return -1000000; + } + // NOTE: log(model_score) is just the model score feature weights * features + return log(model_score) * scaling_trick + (- mt_metric_scale * mt_metric_score); +} + +/*computes and returns mu score, for oracle selection*/ +double muscore(const vector& feature_weights, const SparseVector& feature_values, const double mt_metric_score, const double mu, const bool logbleu) { + if(logbleu) { + if(mt_metric_score != 0) + return feature_values.dot(feature_weights) * mu + (1 - mu) * log(mt_metric_score); + else + return feature_values.dot(feature_weights) * mu + (1 - mu) * (-1000000); // log(0) is -inf + } + return feature_values.dot(feature_weights) * mu + (1 - mu) * mt_metric_score; +} + +static const double kMINUS_EPSILON = -1e-6; + +struct HypothesisInfo { + SparseVector features; + double mt_metric_score; + // The model score changes when the feature weights change, so it is not stored here + // It must be recomputed every time +}; + +struct GoodOracle { + shared_ptr good; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, + const DocScorer& d, + vector* o, + const vector& feat_weights, + const double metric_scale, + const double Mu, + const bool bestever, + const bool LogBleu) : ds(d), feature_weights(feat_weights), oracles(*o), kbest_size(k), mt_metric_scale(metric_scale), mu(Mu), best_ever(bestever), log_bleu(LogBleu) {} + const DocScorer& ds; + const vector& feature_weights; + vector& oracles; + shared_ptr cur_best; + shared_ptr cur_costaug_best; + shared_ptr cur_ref; + const int kbest_size; + const double mt_metric_scale; + const double mu; + const bool best_ever; + const bool log_bleu; + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best; + } + + const HypothesisInfo& GetCurrentCostAugmentedHypothesis() const { + return *cur_costaug_best; + } + + const HypothesisInfo& GetCurrentReference() const { + return *cur_ref; + } + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + UpdateOracles(smeta.GetSentenceID(), *hg); + } + + shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double metric) { + shared_ptr h(new HypothesisInfo); + h->features = feats; + h->mt_metric_score = metric; + return h; + } + + void UpdateOracles(int sent_id, const Hypergraph& forest) { + //shared_ptr& cur_ref = oracles[sent_id].good; + cur_ref = oracles[sent_id].good; + if(!best_ever) + cur_ref.reset(); + + KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); + double costaug_best_score = 0; + + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + double mt_metric_score = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); //this might need to change!! + const SparseVector& feature_vals = d->feature_values; + double costaugmented_score = cost_augmented_score(d->score, mt_metric_score, mt_metric_scale, log_bleu); //note that d->score, i.e., model score, is passed in + if (i == 0) { //i.e., setting up cur_best to be model score highest, and initializing costaug_best + cur_best = MakeHypothesisInfo(feature_vals, mt_metric_score); + cur_costaug_best = cur_best; + costaug_best_score = costaugmented_score; + } + if (costaugmented_score > costaug_best_score) { // kbest_mira's cur_bad, i.e., "fear" derivation + cur_costaug_best = MakeHypothesisInfo(feature_vals, mt_metric_score); + costaug_best_score = costaugmented_score; + } + double cur_muscore = mt_metric_score; + if (!cur_ref) // kbest_mira's cur_good, i.e., "hope" derivation + cur_ref = MakeHypothesisInfo(feature_vals, cur_muscore); + else { + double cur_ref_muscore = cur_ref->mt_metric_score; + if(mu > 0) { //select oracle with mixture of model score and BLEU + cur_ref_muscore = muscore(feature_weights, cur_ref->features, cur_ref->mt_metric_score, mu, log_bleu); + cur_muscore = muscore(feature_weights, d->feature_values, mt_metric_score, mu, log_bleu); + } + if (cur_muscore > cur_ref_muscore) //replace oracle + cur_ref = MakeHypothesisInfo(feature_vals, mt_metric_score); + } + } + } +}; + +void ReadTrainingCorpus(const string& fname, vector* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < 0.000001; +} + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as())); + else + rng.reset(new MT19937); + + const bool best_ever = conf.count("best_ever") > 0; + vector corpus; + ReadTrainingCorpus(conf["input"].as(), &corpus); + + const string metric_name = conf["mt_metric"].as(); //set up scoring; this may need to be changed!! + + ScoreType type = ScoreTypeFromString(metric_name); + if (type == TER) { + invert_score = true; + } else { + invert_score = false; + } + DocScorer ds(type, conf["reference"].as >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + } + + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + + // load initial weights + vector& decoder_weights = decoder.CurrentWeightVector(); //equivalent to "dense_weights" vector in kbest_mira.cc + SparseVector sparse_weights; //equivaelnt to kbest_mira.cc "lambdas" + Weights::InitFromFile(conf["weights"].as(), &decoder_weights); + Weights::InitSparseVector(decoder_weights, &sparse_weights); + + //initializing other algorithm and output parameters + const double c = conf["regularizer_strength"].as(); + const int weights_write_interval = conf["weights_write_interval"].as(); + const double mt_metric_scale = conf["mt_metric_scale"].as(); + const double mu = conf["mu"].as(); + const double metric_threshold = conf["metric_threshold"].as(); + const double stepsize_param = conf["stepsize_param"].as(); //step size in structured SGD optimization step + const bool stepsize_reduce = conf.count("stepsize_reduce") > 0; + const bool costaug_log_bleu = conf.count("costaug_log_bleu") > 0; + const bool average = conf.count("average") > 0; + const bool checkpositive = conf.count("check_positive") > 0; + + assert(corpus.size() > 0); + vector oracles(corpus.size()); + TrainingObserver observer(conf["k_best_size"].as(), // kbest size + ds, // doc scorer + &oracles, + decoder_weights, + mt_metric_scale, + mu, + best_ever, + costaug_log_bleu); + int cur_sent = 0; + int line_count = 0; + int normalizer = 0; + double total_loss = 0; + double prev_loss = 0; + int dots = 0; // progess bar + int cur_pass = 0; + SparseVector tot; + tot += sparse_weights; //add initial weights to total + normalizer++; //add 1 to normalizer + int max_iteration = conf["passes"].as(); + string msg = "# LatentSVM tuned weights"; + vector order; + int interval_counter = 0; + RandomPermutation(corpus.size(), &order); //shuffle corpus + while (line_count <= max_iteration * corpus.size()) { //loop over all (passes * num sentences) examples + //if ((interval_counter * 40 / weights_write_interval) > dots) { ++dots; cerr << '.'; } //check this + if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.';} + if (interval_counter == weights_write_interval) { //i.e., we need to write out weights + sparse_weights *= scaling_trick; + tot *= scaling_trick; + scaling_trick = 1; + cerr << " [SENTENCE NUMBER= " << cur_sent << "\n"; + cerr << " [AVG METRIC LAST INTERVAL =" << ((total_loss - prev_loss) / weights_write_interval) << "]\n"; + cerr << " [AVG METRIC THIS PASS THUS FAR =" << (total_loss / cur_sent) << "]\n"; + cerr << " [TOTAL LOSS: =" << total_loss << "\n"; + Weights::ShowLargestFeatures(decoder_weights); + //dots = 0; + interval_counter = 0; + prev_loss = total_loss; + if (average){ + SparseVector x = tot; + x /= normalizer; + ostringstream sa; + sa << "weights.latentsvm-" << line_count/weights_write_interval << "-avg.gz"; + x.init_vector(&decoder_weights); + Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); + } + else { + ostringstream os; + os << "weights.latentsvm-" << line_count/weights_write_interval << ".gz"; + sparse_weights.init_vector(&decoder_weights); + Weights::WriteToFile(os.str(), decoder_weights, true, &msg); + } + } + if (corpus.size() == cur_sent) { //i.e., finished a pass + //cerr << " [AVG METRIC LAST PASS=" << (document_metric_score / corpus.size()) << "]\n"; + cerr << " [AVG METRIC LAST PASS=" << (total_loss / corpus.size()) << "]\n"; + cerr << " TOTAL LOSS: " << total_loss << "\n"; + Weights::ShowLargestFeatures(decoder_weights); + cur_sent = 0; + total_loss = 0; + dots = 0; + if(average) { + SparseVector x = tot; + x /= normalizer; + ostringstream sa; + sa << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz"; + x.init_vector(&decoder_weights); + Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); + } + else { + ostringstream os; + os << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; + Weights::WriteToFile(os.str(), decoder_weights, true, &msg); + } + cur_pass++; + RandomPermutation(corpus.size(), &order); + } + if (cur_sent == 0) { //i.e., starting a new pass + cerr << "PASS " << (line_count / corpus.size() + 1) << endl; + } + sparse_weights.init_vector(&decoder_weights); // copy sparse_weights to the decoder weights + decoder.SetId(order[cur_sent]); //assign current sentence + decoder.Decode(corpus[order[cur_sent]], &observer); // decode/update oracles + + const HypothesisInfo& cur_best = observer.GetCurrentBestHypothesis(); //model score best + const HypothesisInfo& cur_costaug = observer.GetCurrentCostAugmentedHypothesis(); //(model + cost) best; cost = -metric_scale*log(BLEU) or -metric_scale*BLEU + //const HypothesisInfo& cur_ref = *oracles[order[cur_sent]].good; //this oracle-best line only picks based on BLEU + const HypothesisInfo& cur_ref = observer.GetCurrentReference(); //if mu > 0, this mu-mixed oracle will be picked; otherwise, only on BLEU + total_loss += cur_best.mt_metric_score; + + double step_size = stepsize_param; + if (stepsize_reduce){ // w_{t+1} = w_t - stepsize_t * grad(Loss) + step_size /= (sqrt(cur_sent+1.0)); + } + //actual update step - compute gradient, and modify sparse_weights + if(cur_ref.mt_metric_score - cur_costaug.mt_metric_score > metric_threshold) { + const double loss = (cur_costaug.features.dot(decoder_weights) - cur_ref.features.dot(decoder_weights)) * scaling_trick + mt_metric_scale * (cur_ref.mt_metric_score - cur_costaug.mt_metric_score); + if (!checkpositive || loss > 0.0) { //can update either all the time if check positive is off, or only when loss > 0 if it's on + sparse_weights -= cur_costaug.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick); // cost augmented hyp orig - + sparse_weights += cur_ref.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick); // ref orig + + } + } + scaling_trick *= (1.0 - 2.0 * step_size * c); + + tot += sparse_weights; //for averaging purposes + normalizer++; //for averaging purposes + line_count++; + interval_counter++; + cur_sent++; + } + cerr << endl; + if(average) { + tot /= normalizer; + tot.init_vector(decoder_weights); + msg = "# Latent SSVM tuned weights (averaged vector)"; + Weights::WriteToFile("weights.latentsvm-final-avg.gz", decoder_weights, true, &msg); + cerr << "Optimization complete.\n" << "AVERAGED WEIGHTS: weights.latentsvm-final-avg.gz\n"; + } else { + Weights::WriteToFile("weights.latentsvm-final.gz", decoder_weights, true, &msg); + cerr << "Optimization complete.\n"; + } + return 0; +} + -- cgit v1.2.3 From be7f57fdd484e063775d7abf083b9fa4c403b610 Mon Sep 17 00:00:00 2001 From: Avneesh Date: Fri, 28 Sep 2012 11:22:55 -0700 Subject: latent SSVM code, new Makefile.am and configure.ac --- Makefile.am | 3 ++- configure.ac | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 24aafd63..b16fc90f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -11,6 +11,7 @@ SUBDIRS = \ training \ training/liblbfgs \ mira \ + latent_svm \ dtrain \ dpmert \ pro-train \ @@ -18,7 +19,7 @@ SUBDIRS = \ minrisk \ gi/pf \ gi/markov_al \ - rst_parser + rst_parser #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index ea9e84fb..dc68ab77 100644 --- a/configure.ac +++ b/configure.ac @@ -124,6 +124,7 @@ AC_CONFIG_FILES([minrisk/Makefile]) AC_CONFIG_FILES([klm/util/Makefile]) AC_CONFIG_FILES([klm/lm/Makefile]) AC_CONFIG_FILES([mira/Makefile]) +AC_CONFIG_FILES([latent_svm/Makefile]) AC_CONFIG_FILES([dtrain/Makefile]) AC_CONFIG_FILES([gi/pyp-topics/src/Makefile]) AC_CONFIG_FILES([gi/clda/src/Makefile]) -- cgit v1.2.3 From 4201c2acfc03c5a0d8ae6a82628e18046020d873 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 23 Mar 2013 23:09:37 -0400 Subject: fix rules features --- decoder/ff_rules.cc | 20 ++++++++++++++++---- decoder/ff_rules.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/decoder/ff_rules.cc b/decoder/ff_rules.cc index 6716d3da..410e083c 100644 --- a/decoder/ff_rules.cc +++ b/decoder/ff_rules.cc @@ -107,7 +107,12 @@ void RuleSourceBigramFeatures::TraversalFeaturesImpl(const SentenceMetadata& sme (*features) += it->second; } -RuleTargetBigramFeatures::RuleTargetBigramFeatures(const std::string& param) { +RuleTargetBigramFeatures::RuleTargetBigramFeatures(const std::string& param) : inds(1000) { + for (unsigned i = 0; i < inds.size(); ++i) { + ostringstream os; + os << (i + 1); + inds[i] = os.str(); + } } void RuleTargetBigramFeatures::PrepareForInput(const SentenceMetadata& smeta) { @@ -126,11 +131,18 @@ void RuleTargetBigramFeatures::TraversalFeaturesImpl(const SentenceMetadata& sme it = rule2_feats_.insert(make_pair(&rule, SparseVector())).first; SparseVector& f = it->second; string prev = ""; + vector nt_types(rule.Arity()); + unsigned ntc = 0; + for (int i = 0; i < rule.f_.size(); ++i) + if (rule.f_[i] < 0) nt_types[ntc++] = -rule.f_[i]; for (int i = 0; i < rule.e_.size(); ++i) { WordID w = rule.e_[i]; - if (w < 0) w = -w; - if (w == 0) return; - const string& cur = TD::Convert(w); + string cur; + if (w > 0) { + cur = TD::Convert(w); + } else { + cur = TD::Convert(nt_types[-w]) + inds[-w]; + } ostringstream os; os << "RBT:" << prev << '_' << cur; const int fid = FD::Convert(Escape(os.str())); diff --git a/decoder/ff_rules.h b/decoder/ff_rules.h index b100ec34..f210dc65 100644 --- a/decoder/ff_rules.h +++ b/decoder/ff_rules.h @@ -51,6 +51,7 @@ class RuleTargetBigramFeatures : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: + std::vector inds; mutable std::map > rule2_feats_; }; -- cgit v1.2.3 From 96fedabebafe7a38a6d5928be8fff767e411d705 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 26 Mar 2013 10:44:45 -0400 Subject: swahili abbreviations --- corpus/support/token_list | 152 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/corpus/support/token_list b/corpus/support/token_list index 366cd7ff..43dd80d9 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -294,3 +294,155 @@ Z. т.н. т.ч. н.э. +# Swahili +A.D. +Afr. +A.G. +agh. +A.H. +A.M. +a.s. +B.A. +B.C. +Bi. +B.J. +B.K. +B.O.M. +Brig. +Bro. +bt. +bw. +Bw. +Cap. +C.C. +cCM. +C.I.A. +cit. +C.M.S. +Co. +Corp. +C.S.Sp. +C.W. +D.C. +Dk. +Dkt. +Dk.B. +Dr. +E.C. +e.g. +E.M. +E.n. +etc. +Feb. +F.F.U. +F.M. +Fr. +F.W. +I.C.O. +i.e. +I.L.C. +Inc. +Jan. +J.F. +Jr. +J.S. +J.V.W.A. +K.A.R. +K.A.U. +K.C.M.C. +K.k. +K.K. +k.m. +km. +K.m. +K.N.C.U. +K.O. +K.S. +Ksh. +kt. +kumb. +k.v. +kv. +L.G. +ltd. +Ltd. +M.A. +M.D. +mf. +Mh. +Mhe. +mil. +m.m. +M.m. +Mm. +M.M. +Mr. +Mrs. +M.S. +Mt. +Mw. +M.W. +Mwl. +na. +Na. +N.F. +N.J. +n.k. +nk. +n.k.w. +N.N. +Nov. +O.C.D. +op. +P.C. +Phd. +Ph.D. +P.J. +P.o. +P.O. +P.O.P. +P.P.F. +Prof. +P.s. +P.S. +Q.C. +Rd. +s.a.w. +S.A.W. +S.D. +Sept. +sh. +Sh. +SH. +shs. +Shs. +S.J. +S.L. +S.L.P. +S.s. +S.S. +St. +s.w. +s.w.T. +taz. +Taz. +T.C. +T.E.C. +T.L.P. +T.O.H.S. +Tsh. +T.V. +tz. +uk. +Uk. +U.M.C.A. +U.N. +U.S. +Ush. +U.W.T. +Viii. +Vol. +V.T.C. +W.H. +yamb. +Y.M.C.A. -- cgit v1.2.3 From 52194eb06b5b8b95eea38503dcc3253bb6d64171 Mon Sep 17 00:00:00 2001 From: Avneesh Saluja Date: Thu, 28 Mar 2013 18:57:58 -0700 Subject: re-organized latent SVM (sub-dir of training now) --- latent_svm/Makefile.am | 6 - latent_svm/latent_svm.cc | 412 ----------------------------------------------- 2 files changed, 418 deletions(-) delete mode 100644 latent_svm/Makefile.am delete mode 100644 latent_svm/latent_svm.cc diff --git a/latent_svm/Makefile.am b/latent_svm/Makefile.am deleted file mode 100644 index 673b9159..00000000 --- a/latent_svm/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -bin_PROGRAMS = latent_svm - -latent_svm_SOURCES = latent_svm.cc -latent_svm_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/latent_svm/latent_svm.cc b/latent_svm/latent_svm.cc deleted file mode 100644 index ab9c1d5d..00000000 --- a/latent_svm/latent_svm.cc +++ /dev/null @@ -1,412 +0,0 @@ -/* -Points to note regarding variable names: -total_loss and prev_loss actually refer not to loss, but the metric (usually BLEU) -*/ -#include -#include -#include -#include -#include - -//boost libraries -#include -#include -#include - -//cdec libraries -#include "config.h" -#include "hg_sampler.h" -#include "sentence_metadata.h" -#include "scorer.h" -#include "verbose.h" -#include "viterbi.h" -#include "hg.h" -#include "prob.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -bool invert_score; -boost::shared_ptr rng; //random seed ptr - -void RandomPermutation(int len, vector* p_ids) { - vector& ids = *p_ids; - ids.resize(len); - for (int i = 0; i < len; ++i) ids[i] = i; - for (int i = len; i > 0; --i) { - int j = rng->next() * i; - if (j == i) i--; - swap(ids[i-1], ids[j]); - } -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("weights,w",po::value(),"[REQD] Input feature weights file") - ("input,i",po::value(),"[REQD] Input source file for development set") - ("passes,p", po::value()->default_value(15), "Number of passes through the training data") - ("weights_write_interval,n", po::value()->default_value(1000), "Number of lines between writing out weights") - ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") - ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") - ("regularizer_strength,C", po::value()->default_value(0.01), "regularization strength") - ("mt_metric_scale,s", po::value()->default_value(1.0), "Cost function is -mt_metric_scale*BLEU") - ("costaug_log_bleu,l", "Flag converts BLEU to log space. Cost function is thus -mt_metric_scale*log(BLEU). Not on by default") - ("average,A", "Average the weights (this is a weighted average due to the scaling factor)") - ("mu,u", po::value()->default_value(0.0), "weight (between 0 and 1) to scale model score by for oracle selection") - ("stepsize_param,a", po::value()->default_value(0.01), "Stepsize parameter, during optimization") - ("stepsize_reduce,t", "Divide step size by sqrt(number of examples seen so far), as per Ratliff et al., 2007") - ("metric_threshold,T", po::value()->default_value(0.0), "Threshold for diff between oracle BLEU and cost-aug BLEU for updating the weights") - ("check_positive,P", "Check that the loss is positive before updating") - ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") - ("best_ever,b", "Keep track of the best hypothesis we've ever seen (metric score), and use that as the reference") - ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") - ("decoder_config,c",po::value(),"Decoder configuration file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("weights") || !conf->count("input") || !conf->count("decoder_config") || !conf->count("reference")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -double scaling_trick = 1; // see http://blog.smola.org/post/940672544/fast-quadratic-regularization-for-online-learning -/*computes and returns cost augmented score for negative example selection*/ -double cost_augmented_score(const LogVal model_score, const double mt_metric_score, const double mt_metric_scale, const bool logbleu) { - if(logbleu) { - if(mt_metric_score != 0) - // NOTE: log(model_score) is just the model score feature weights * features - return log(model_score) * scaling_trick + (- mt_metric_scale * log(mt_metric_score)); - else - return -1000000; - } - // NOTE: log(model_score) is just the model score feature weights * features - return log(model_score) * scaling_trick + (- mt_metric_scale * mt_metric_score); -} - -/*computes and returns mu score, for oracle selection*/ -double muscore(const vector& feature_weights, const SparseVector& feature_values, const double mt_metric_score, const double mu, const bool logbleu) { - if(logbleu) { - if(mt_metric_score != 0) - return feature_values.dot(feature_weights) * mu + (1 - mu) * log(mt_metric_score); - else - return feature_values.dot(feature_weights) * mu + (1 - mu) * (-1000000); // log(0) is -inf - } - return feature_values.dot(feature_weights) * mu + (1 - mu) * mt_metric_score; -} - -static const double kMINUS_EPSILON = -1e-6; - -struct HypothesisInfo { - SparseVector features; - double mt_metric_score; - // The model score changes when the feature weights change, so it is not stored here - // It must be recomputed every time -}; - -struct GoodOracle { - shared_ptr good; -}; - -struct TrainingObserver : public DecoderObserver { - TrainingObserver(const int k, - const DocScorer& d, - vector* o, - const vector& feat_weights, - const double metric_scale, - const double Mu, - const bool bestever, - const bool LogBleu) : ds(d), feature_weights(feat_weights), oracles(*o), kbest_size(k), mt_metric_scale(metric_scale), mu(Mu), best_ever(bestever), log_bleu(LogBleu) {} - const DocScorer& ds; - const vector& feature_weights; - vector& oracles; - shared_ptr cur_best; - shared_ptr cur_costaug_best; - shared_ptr cur_ref; - const int kbest_size; - const double mt_metric_scale; - const double mu; - const bool best_ever; - const bool log_bleu; - - const HypothesisInfo& GetCurrentBestHypothesis() const { - return *cur_best; - } - - const HypothesisInfo& GetCurrentCostAugmentedHypothesis() const { - return *cur_costaug_best; - } - - const HypothesisInfo& GetCurrentReference() const { - return *cur_ref; - } - - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - UpdateOracles(smeta.GetSentenceID(), *hg); - } - - shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double metric) { - shared_ptr h(new HypothesisInfo); - h->features = feats; - h->mt_metric_score = metric; - return h; - } - - void UpdateOracles(int sent_id, const Hypergraph& forest) { - //shared_ptr& cur_ref = oracles[sent_id].good; - cur_ref = oracles[sent_id].good; - if(!best_ever) - cur_ref.reset(); - - KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); - double costaug_best_score = 0; - - for (int i = 0; i < kbest_size; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - double mt_metric_score = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); //this might need to change!! - const SparseVector& feature_vals = d->feature_values; - double costaugmented_score = cost_augmented_score(d->score, mt_metric_score, mt_metric_scale, log_bleu); //note that d->score, i.e., model score, is passed in - if (i == 0) { //i.e., setting up cur_best to be model score highest, and initializing costaug_best - cur_best = MakeHypothesisInfo(feature_vals, mt_metric_score); - cur_costaug_best = cur_best; - costaug_best_score = costaugmented_score; - } - if (costaugmented_score > costaug_best_score) { // kbest_mira's cur_bad, i.e., "fear" derivation - cur_costaug_best = MakeHypothesisInfo(feature_vals, mt_metric_score); - costaug_best_score = costaugmented_score; - } - double cur_muscore = mt_metric_score; - if (!cur_ref) // kbest_mira's cur_good, i.e., "hope" derivation - cur_ref = MakeHypothesisInfo(feature_vals, cur_muscore); - else { - double cur_ref_muscore = cur_ref->mt_metric_score; - if(mu > 0) { //select oracle with mixture of model score and BLEU - cur_ref_muscore = muscore(feature_weights, cur_ref->features, cur_ref->mt_metric_score, mu, log_bleu); - cur_muscore = muscore(feature_weights, d->feature_values, mt_metric_score, mu, log_bleu); - } - if (cur_muscore > cur_ref_muscore) //replace oracle - cur_ref = MakeHypothesisInfo(feature_vals, mt_metric_score); - } - } - } -}; - -void ReadTrainingCorpus(const string& fname, vector* c) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - while(in) { - getline(in, line); - if (!in) break; - c->push_back(line); - } -} - -bool ApproxEqual(double a, double b) { - if (a == b) return true; - return (fabs(a-b)/fabs(b)) < 0.000001; -} - -int main(int argc, char** argv) { - register_feature_functions(); - SetSilent(true); // turn off verbose decoder output - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - - if (conf.count("random_seed")) - rng.reset(new MT19937(conf["random_seed"].as())); - else - rng.reset(new MT19937); - - const bool best_ever = conf.count("best_ever") > 0; - vector corpus; - ReadTrainingCorpus(conf["input"].as(), &corpus); - - const string metric_name = conf["mt_metric"].as(); //set up scoring; this may need to be changed!! - - ScoreType type = ScoreTypeFromString(metric_name); - if (type == TER) { - invert_score = true; - } else { - invert_score = false; - } - DocScorer ds(type, conf["reference"].as >(), ""); - cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; - if (ds.size() != corpus.size()) { - cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; - return 1; - } - - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - - // load initial weights - vector& decoder_weights = decoder.CurrentWeightVector(); //equivalent to "dense_weights" vector in kbest_mira.cc - SparseVector sparse_weights; //equivaelnt to kbest_mira.cc "lambdas" - Weights::InitFromFile(conf["weights"].as(), &decoder_weights); - Weights::InitSparseVector(decoder_weights, &sparse_weights); - - //initializing other algorithm and output parameters - const double c = conf["regularizer_strength"].as(); - const int weights_write_interval = conf["weights_write_interval"].as(); - const double mt_metric_scale = conf["mt_metric_scale"].as(); - const double mu = conf["mu"].as(); - const double metric_threshold = conf["metric_threshold"].as(); - const double stepsize_param = conf["stepsize_param"].as(); //step size in structured SGD optimization step - const bool stepsize_reduce = conf.count("stepsize_reduce") > 0; - const bool costaug_log_bleu = conf.count("costaug_log_bleu") > 0; - const bool average = conf.count("average") > 0; - const bool checkpositive = conf.count("check_positive") > 0; - - assert(corpus.size() > 0); - vector oracles(corpus.size()); - TrainingObserver observer(conf["k_best_size"].as(), // kbest size - ds, // doc scorer - &oracles, - decoder_weights, - mt_metric_scale, - mu, - best_ever, - costaug_log_bleu); - int cur_sent = 0; - int line_count = 0; - int normalizer = 0; - double total_loss = 0; - double prev_loss = 0; - int dots = 0; // progess bar - int cur_pass = 0; - SparseVector tot; - tot += sparse_weights; //add initial weights to total - normalizer++; //add 1 to normalizer - int max_iteration = conf["passes"].as(); - string msg = "# LatentSVM tuned weights"; - vector order; - int interval_counter = 0; - RandomPermutation(corpus.size(), &order); //shuffle corpus - while (line_count <= max_iteration * corpus.size()) { //loop over all (passes * num sentences) examples - //if ((interval_counter * 40 / weights_write_interval) > dots) { ++dots; cerr << '.'; } //check this - if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.';} - if (interval_counter == weights_write_interval) { //i.e., we need to write out weights - sparse_weights *= scaling_trick; - tot *= scaling_trick; - scaling_trick = 1; - cerr << " [SENTENCE NUMBER= " << cur_sent << "\n"; - cerr << " [AVG METRIC LAST INTERVAL =" << ((total_loss - prev_loss) / weights_write_interval) << "]\n"; - cerr << " [AVG METRIC THIS PASS THUS FAR =" << (total_loss / cur_sent) << "]\n"; - cerr << " [TOTAL LOSS: =" << total_loss << "\n"; - Weights::ShowLargestFeatures(decoder_weights); - //dots = 0; - interval_counter = 0; - prev_loss = total_loss; - if (average){ - SparseVector x = tot; - x /= normalizer; - ostringstream sa; - sa << "weights.latentsvm-" << line_count/weights_write_interval << "-avg.gz"; - x.init_vector(&decoder_weights); - Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); - } - else { - ostringstream os; - os << "weights.latentsvm-" << line_count/weights_write_interval << ".gz"; - sparse_weights.init_vector(&decoder_weights); - Weights::WriteToFile(os.str(), decoder_weights, true, &msg); - } - } - if (corpus.size() == cur_sent) { //i.e., finished a pass - //cerr << " [AVG METRIC LAST PASS=" << (document_metric_score / corpus.size()) << "]\n"; - cerr << " [AVG METRIC LAST PASS=" << (total_loss / corpus.size()) << "]\n"; - cerr << " TOTAL LOSS: " << total_loss << "\n"; - Weights::ShowLargestFeatures(decoder_weights); - cur_sent = 0; - total_loss = 0; - dots = 0; - if(average) { - SparseVector x = tot; - x /= normalizer; - ostringstream sa; - sa << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz"; - x.init_vector(&decoder_weights); - Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); - } - else { - ostringstream os; - os << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; - Weights::WriteToFile(os.str(), decoder_weights, true, &msg); - } - cur_pass++; - RandomPermutation(corpus.size(), &order); - } - if (cur_sent == 0) { //i.e., starting a new pass - cerr << "PASS " << (line_count / corpus.size() + 1) << endl; - } - sparse_weights.init_vector(&decoder_weights); // copy sparse_weights to the decoder weights - decoder.SetId(order[cur_sent]); //assign current sentence - decoder.Decode(corpus[order[cur_sent]], &observer); // decode/update oracles - - const HypothesisInfo& cur_best = observer.GetCurrentBestHypothesis(); //model score best - const HypothesisInfo& cur_costaug = observer.GetCurrentCostAugmentedHypothesis(); //(model + cost) best; cost = -metric_scale*log(BLEU) or -metric_scale*BLEU - //const HypothesisInfo& cur_ref = *oracles[order[cur_sent]].good; //this oracle-best line only picks based on BLEU - const HypothesisInfo& cur_ref = observer.GetCurrentReference(); //if mu > 0, this mu-mixed oracle will be picked; otherwise, only on BLEU - total_loss += cur_best.mt_metric_score; - - double step_size = stepsize_param; - if (stepsize_reduce){ // w_{t+1} = w_t - stepsize_t * grad(Loss) - step_size /= (sqrt(cur_sent+1.0)); - } - //actual update step - compute gradient, and modify sparse_weights - if(cur_ref.mt_metric_score - cur_costaug.mt_metric_score > metric_threshold) { - const double loss = (cur_costaug.features.dot(decoder_weights) - cur_ref.features.dot(decoder_weights)) * scaling_trick + mt_metric_scale * (cur_ref.mt_metric_score - cur_costaug.mt_metric_score); - if (!checkpositive || loss > 0.0) { //can update either all the time if check positive is off, or only when loss > 0 if it's on - sparse_weights -= cur_costaug.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick); // cost augmented hyp orig - - sparse_weights += cur_ref.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick); // ref orig + - } - } - scaling_trick *= (1.0 - 2.0 * step_size * c); - - tot += sparse_weights; //for averaging purposes - normalizer++; //for averaging purposes - line_count++; - interval_counter++; - cur_sent++; - } - cerr << endl; - if(average) { - tot /= normalizer; - tot.init_vector(decoder_weights); - msg = "# Latent SSVM tuned weights (averaged vector)"; - Weights::WriteToFile("weights.latentsvm-final-avg.gz", decoder_weights, true, &msg); - cerr << "Optimization complete.\n" << "AVERAGED WEIGHTS: weights.latentsvm-final-avg.gz\n"; - } else { - Weights::WriteToFile("weights.latentsvm-final.gz", decoder_weights, true, &msg); - cerr << "Optimization complete.\n"; - } - return 0; -} - -- cgit v1.2.3 From dc13d4b6c6a35563fb7dc5e426f8dc4142a03702 Mon Sep 17 00:00:00 2001 From: Avneesh Saluja Date: Thu, 28 Mar 2013 18:58:31 -0700 Subject: latent SVM --- training/latent_svm/Makefile.am | 6 + training/latent_svm/latent_svm.cc | 412 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 418 insertions(+) create mode 100644 training/latent_svm/Makefile.am create mode 100644 training/latent_svm/latent_svm.cc diff --git a/training/latent_svm/Makefile.am b/training/latent_svm/Makefile.am new file mode 100644 index 00000000..673b9159 --- /dev/null +++ b/training/latent_svm/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = latent_svm + +latent_svm_SOURCES = latent_svm.cc +latent_svm_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/latent_svm/latent_svm.cc b/training/latent_svm/latent_svm.cc new file mode 100644 index 00000000..ab9c1d5d --- /dev/null +++ b/training/latent_svm/latent_svm.cc @@ -0,0 +1,412 @@ +/* +Points to note regarding variable names: +total_loss and prev_loss actually refer not to loss, but the metric (usually BLEU) +*/ +#include +#include +#include +#include +#include + +//boost libraries +#include +#include +#include + +//cdec libraries +#include "config.h" +#include "hg_sampler.h" +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +using boost::shared_ptr; +namespace po = boost::program_options; + +bool invert_score; +boost::shared_ptr rng; //random seed ptr + +void RandomPermutation(int len, vector* p_ids) { + vector& ids = *p_ids; + ids.resize(len); + for (int i = 0; i < len; ++i) ids[i] = i; + for (int i = len; i > 0; --i) { + int j = rng->next() * i; + if (j == i) i--; + swap(ids[i-1], ids[j]); + } +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w",po::value(),"[REQD] Input feature weights file") + ("input,i",po::value(),"[REQD] Input source file for development set") + ("passes,p", po::value()->default_value(15), "Number of passes through the training data") + ("weights_write_interval,n", po::value()->default_value(1000), "Number of lines between writing out weights") + ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("regularizer_strength,C", po::value()->default_value(0.01), "regularization strength") + ("mt_metric_scale,s", po::value()->default_value(1.0), "Cost function is -mt_metric_scale*BLEU") + ("costaug_log_bleu,l", "Flag converts BLEU to log space. Cost function is thus -mt_metric_scale*log(BLEU). Not on by default") + ("average,A", "Average the weights (this is a weighted average due to the scaling factor)") + ("mu,u", po::value()->default_value(0.0), "weight (between 0 and 1) to scale model score by for oracle selection") + ("stepsize_param,a", po::value()->default_value(0.01), "Stepsize parameter, during optimization") + ("stepsize_reduce,t", "Divide step size by sqrt(number of examples seen so far), as per Ratliff et al., 2007") + ("metric_threshold,T", po::value()->default_value(0.0), "Threshold for diff between oracle BLEU and cost-aug BLEU for updating the weights") + ("check_positive,P", "Check that the loss is positive before updating") + ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") + ("best_ever,b", "Keep track of the best hypothesis we've ever seen (metric score), and use that as the reference") + ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("weights") || !conf->count("input") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +double scaling_trick = 1; // see http://blog.smola.org/post/940672544/fast-quadratic-regularization-for-online-learning +/*computes and returns cost augmented score for negative example selection*/ +double cost_augmented_score(const LogVal model_score, const double mt_metric_score, const double mt_metric_scale, const bool logbleu) { + if(logbleu) { + if(mt_metric_score != 0) + // NOTE: log(model_score) is just the model score feature weights * features + return log(model_score) * scaling_trick + (- mt_metric_scale * log(mt_metric_score)); + else + return -1000000; + } + // NOTE: log(model_score) is just the model score feature weights * features + return log(model_score) * scaling_trick + (- mt_metric_scale * mt_metric_score); +} + +/*computes and returns mu score, for oracle selection*/ +double muscore(const vector& feature_weights, const SparseVector& feature_values, const double mt_metric_score, const double mu, const bool logbleu) { + if(logbleu) { + if(mt_metric_score != 0) + return feature_values.dot(feature_weights) * mu + (1 - mu) * log(mt_metric_score); + else + return feature_values.dot(feature_weights) * mu + (1 - mu) * (-1000000); // log(0) is -inf + } + return feature_values.dot(feature_weights) * mu + (1 - mu) * mt_metric_score; +} + +static const double kMINUS_EPSILON = -1e-6; + +struct HypothesisInfo { + SparseVector features; + double mt_metric_score; + // The model score changes when the feature weights change, so it is not stored here + // It must be recomputed every time +}; + +struct GoodOracle { + shared_ptr good; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, + const DocScorer& d, + vector* o, + const vector& feat_weights, + const double metric_scale, + const double Mu, + const bool bestever, + const bool LogBleu) : ds(d), feature_weights(feat_weights), oracles(*o), kbest_size(k), mt_metric_scale(metric_scale), mu(Mu), best_ever(bestever), log_bleu(LogBleu) {} + const DocScorer& ds; + const vector& feature_weights; + vector& oracles; + shared_ptr cur_best; + shared_ptr cur_costaug_best; + shared_ptr cur_ref; + const int kbest_size; + const double mt_metric_scale; + const double mu; + const bool best_ever; + const bool log_bleu; + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best; + } + + const HypothesisInfo& GetCurrentCostAugmentedHypothesis() const { + return *cur_costaug_best; + } + + const HypothesisInfo& GetCurrentReference() const { + return *cur_ref; + } + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + UpdateOracles(smeta.GetSentenceID(), *hg); + } + + shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double metric) { + shared_ptr h(new HypothesisInfo); + h->features = feats; + h->mt_metric_score = metric; + return h; + } + + void UpdateOracles(int sent_id, const Hypergraph& forest) { + //shared_ptr& cur_ref = oracles[sent_id].good; + cur_ref = oracles[sent_id].good; + if(!best_ever) + cur_ref.reset(); + + KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); + double costaug_best_score = 0; + + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + double mt_metric_score = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); //this might need to change!! + const SparseVector& feature_vals = d->feature_values; + double costaugmented_score = cost_augmented_score(d->score, mt_metric_score, mt_metric_scale, log_bleu); //note that d->score, i.e., model score, is passed in + if (i == 0) { //i.e., setting up cur_best to be model score highest, and initializing costaug_best + cur_best = MakeHypothesisInfo(feature_vals, mt_metric_score); + cur_costaug_best = cur_best; + costaug_best_score = costaugmented_score; + } + if (costaugmented_score > costaug_best_score) { // kbest_mira's cur_bad, i.e., "fear" derivation + cur_costaug_best = MakeHypothesisInfo(feature_vals, mt_metric_score); + costaug_best_score = costaugmented_score; + } + double cur_muscore = mt_metric_score; + if (!cur_ref) // kbest_mira's cur_good, i.e., "hope" derivation + cur_ref = MakeHypothesisInfo(feature_vals, cur_muscore); + else { + double cur_ref_muscore = cur_ref->mt_metric_score; + if(mu > 0) { //select oracle with mixture of model score and BLEU + cur_ref_muscore = muscore(feature_weights, cur_ref->features, cur_ref->mt_metric_score, mu, log_bleu); + cur_muscore = muscore(feature_weights, d->feature_values, mt_metric_score, mu, log_bleu); + } + if (cur_muscore > cur_ref_muscore) //replace oracle + cur_ref = MakeHypothesisInfo(feature_vals, mt_metric_score); + } + } + } +}; + +void ReadTrainingCorpus(const string& fname, vector* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < 0.000001; +} + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as())); + else + rng.reset(new MT19937); + + const bool best_ever = conf.count("best_ever") > 0; + vector corpus; + ReadTrainingCorpus(conf["input"].as(), &corpus); + + const string metric_name = conf["mt_metric"].as(); //set up scoring; this may need to be changed!! + + ScoreType type = ScoreTypeFromString(metric_name); + if (type == TER) { + invert_score = true; + } else { + invert_score = false; + } + DocScorer ds(type, conf["reference"].as >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + } + + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + + // load initial weights + vector& decoder_weights = decoder.CurrentWeightVector(); //equivalent to "dense_weights" vector in kbest_mira.cc + SparseVector sparse_weights; //equivaelnt to kbest_mira.cc "lambdas" + Weights::InitFromFile(conf["weights"].as(), &decoder_weights); + Weights::InitSparseVector(decoder_weights, &sparse_weights); + + //initializing other algorithm and output parameters + const double c = conf["regularizer_strength"].as(); + const int weights_write_interval = conf["weights_write_interval"].as(); + const double mt_metric_scale = conf["mt_metric_scale"].as(); + const double mu = conf["mu"].as(); + const double metric_threshold = conf["metric_threshold"].as(); + const double stepsize_param = conf["stepsize_param"].as(); //step size in structured SGD optimization step + const bool stepsize_reduce = conf.count("stepsize_reduce") > 0; + const bool costaug_log_bleu = conf.count("costaug_log_bleu") > 0; + const bool average = conf.count("average") > 0; + const bool checkpositive = conf.count("check_positive") > 0; + + assert(corpus.size() > 0); + vector oracles(corpus.size()); + TrainingObserver observer(conf["k_best_size"].as(), // kbest size + ds, // doc scorer + &oracles, + decoder_weights, + mt_metric_scale, + mu, + best_ever, + costaug_log_bleu); + int cur_sent = 0; + int line_count = 0; + int normalizer = 0; + double total_loss = 0; + double prev_loss = 0; + int dots = 0; // progess bar + int cur_pass = 0; + SparseVector tot; + tot += sparse_weights; //add initial weights to total + normalizer++; //add 1 to normalizer + int max_iteration = conf["passes"].as(); + string msg = "# LatentSVM tuned weights"; + vector order; + int interval_counter = 0; + RandomPermutation(corpus.size(), &order); //shuffle corpus + while (line_count <= max_iteration * corpus.size()) { //loop over all (passes * num sentences) examples + //if ((interval_counter * 40 / weights_write_interval) > dots) { ++dots; cerr << '.'; } //check this + if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.';} + if (interval_counter == weights_write_interval) { //i.e., we need to write out weights + sparse_weights *= scaling_trick; + tot *= scaling_trick; + scaling_trick = 1; + cerr << " [SENTENCE NUMBER= " << cur_sent << "\n"; + cerr << " [AVG METRIC LAST INTERVAL =" << ((total_loss - prev_loss) / weights_write_interval) << "]\n"; + cerr << " [AVG METRIC THIS PASS THUS FAR =" << (total_loss / cur_sent) << "]\n"; + cerr << " [TOTAL LOSS: =" << total_loss << "\n"; + Weights::ShowLargestFeatures(decoder_weights); + //dots = 0; + interval_counter = 0; + prev_loss = total_loss; + if (average){ + SparseVector x = tot; + x /= normalizer; + ostringstream sa; + sa << "weights.latentsvm-" << line_count/weights_write_interval << "-avg.gz"; + x.init_vector(&decoder_weights); + Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); + } + else { + ostringstream os; + os << "weights.latentsvm-" << line_count/weights_write_interval << ".gz"; + sparse_weights.init_vector(&decoder_weights); + Weights::WriteToFile(os.str(), decoder_weights, true, &msg); + } + } + if (corpus.size() == cur_sent) { //i.e., finished a pass + //cerr << " [AVG METRIC LAST PASS=" << (document_metric_score / corpus.size()) << "]\n"; + cerr << " [AVG METRIC LAST PASS=" << (total_loss / corpus.size()) << "]\n"; + cerr << " TOTAL LOSS: " << total_loss << "\n"; + Weights::ShowLargestFeatures(decoder_weights); + cur_sent = 0; + total_loss = 0; + dots = 0; + if(average) { + SparseVector x = tot; + x /= normalizer; + ostringstream sa; + sa << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz"; + x.init_vector(&decoder_weights); + Weights::WriteToFile(sa.str(), decoder_weights, true, &msg); + } + else { + ostringstream os; + os << "weights.latentsvm-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; + Weights::WriteToFile(os.str(), decoder_weights, true, &msg); + } + cur_pass++; + RandomPermutation(corpus.size(), &order); + } + if (cur_sent == 0) { //i.e., starting a new pass + cerr << "PASS " << (line_count / corpus.size() + 1) << endl; + } + sparse_weights.init_vector(&decoder_weights); // copy sparse_weights to the decoder weights + decoder.SetId(order[cur_sent]); //assign current sentence + decoder.Decode(corpus[order[cur_sent]], &observer); // decode/update oracles + + const HypothesisInfo& cur_best = observer.GetCurrentBestHypothesis(); //model score best + const HypothesisInfo& cur_costaug = observer.GetCurrentCostAugmentedHypothesis(); //(model + cost) best; cost = -metric_scale*log(BLEU) or -metric_scale*BLEU + //const HypothesisInfo& cur_ref = *oracles[order[cur_sent]].good; //this oracle-best line only picks based on BLEU + const HypothesisInfo& cur_ref = observer.GetCurrentReference(); //if mu > 0, this mu-mixed oracle will be picked; otherwise, only on BLEU + total_loss += cur_best.mt_metric_score; + + double step_size = stepsize_param; + if (stepsize_reduce){ // w_{t+1} = w_t - stepsize_t * grad(Loss) + step_size /= (sqrt(cur_sent+1.0)); + } + //actual update step - compute gradient, and modify sparse_weights + if(cur_ref.mt_metric_score - cur_costaug.mt_metric_score > metric_threshold) { + const double loss = (cur_costaug.features.dot(decoder_weights) - cur_ref.features.dot(decoder_weights)) * scaling_trick + mt_metric_scale * (cur_ref.mt_metric_score - cur_costaug.mt_metric_score); + if (!checkpositive || loss > 0.0) { //can update either all the time if check positive is off, or only when loss > 0 if it's on + sparse_weights -= cur_costaug.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick); // cost augmented hyp orig - + sparse_weights += cur_ref.features * step_size / ((1.0-2.0*step_size*c)*scaling_trick); // ref orig + + } + } + scaling_trick *= (1.0 - 2.0 * step_size * c); + + tot += sparse_weights; //for averaging purposes + normalizer++; //for averaging purposes + line_count++; + interval_counter++; + cur_sent++; + } + cerr << endl; + if(average) { + tot /= normalizer; + tot.init_vector(decoder_weights); + msg = "# Latent SSVM tuned weights (averaged vector)"; + Weights::WriteToFile("weights.latentsvm-final-avg.gz", decoder_weights, true, &msg); + cerr << "Optimization complete.\n" << "AVERAGED WEIGHTS: weights.latentsvm-final-avg.gz\n"; + } else { + Weights::WriteToFile("weights.latentsvm-final.gz", decoder_weights, true, &msg); + cerr << "Optimization complete.\n"; + } + return 0; +} + -- cgit v1.2.3 From 981c2e5b2b4415c9cf54532a3703ce4520c747e3 Mon Sep 17 00:00:00 2001 From: Avneesh Saluja Date: Thu, 28 Mar 2013 19:01:24 -0700 Subject: updated Makefiles --- training/Makefile.am | 1 + training/latent_svm/Makefile.am | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/training/Makefile.am b/training/Makefile.am index e95e045f..8ef3c939 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -6,6 +6,7 @@ SUBDIRS = \ dpmert \ pro \ dtrain \ + latent_svm \ mira \ rampion diff --git a/training/latent_svm/Makefile.am b/training/latent_svm/Makefile.am index 673b9159..65c5e038 100644 --- a/training/latent_svm/Makefile.am +++ b/training/latent_svm/Makefile.am @@ -1,6 +1,6 @@ bin_PROGRAMS = latent_svm latent_svm_SOURCES = latent_svm.cc -latent_svm_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +latent_svm_LDADD = ../..//decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -- cgit v1.2.3 From 19f9cb52e536ca36d38428ca7ac0104d315cbc97 Mon Sep 17 00:00:00 2001 From: Avneesh Saluja Date: Thu, 28 Mar 2013 22:42:04 -0700 Subject: updated configure.ac --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 98deac86..8632fb51 100644 --- a/configure.ac +++ b/configure.ac @@ -128,6 +128,7 @@ AC_CONFIG_FILES([training/pro/Makefile]) AC_CONFIG_FILES([training/rampion/Makefile]) AC_CONFIG_FILES([training/minrisk/Makefile]) AC_CONFIG_FILES([training/mira/Makefile]) +AC_CONFIG_FILES([training/latent_svm/Makefile]) AC_CONFIG_FILES([training/dtrain/Makefile]) # external feature function example code -- cgit v1.2.3 From 5b4d7fd72b7662e38eccf9ff46192742128be7db Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Mon, 8 Apr 2013 13:08:04 -0400 Subject: online wasn't getting set --- python/pkg/cdec/sa/extract.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 782bed8b..bf39d080 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -62,6 +62,7 @@ def extract(inp): return '{}{}'.format(grammar_file, i, sentence, suffix) def main(): + global online logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') parser.add_argument('-c', '--config', required=True, @@ -88,6 +89,8 @@ def main(): ' should be a python module\n'.format(featdef)) sys.exit(1) + online = args.online + if args.jobs > 1: logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize) pool = mp.Pool(args.jobs, make_extractor, (args,)) -- cgit v1.2.3 From 045bad698d84b1a0a54bafa9152dc4436f1cbb95 Mon Sep 17 00:00:00 2001 From: vlade Date: Sat, 13 Apr 2013 00:48:10 -0400 Subject: inital commit of mira code --- training/mira/kbest_mirav5.cc | 1148 +++++++++++++++++++++++++++++++++++++++++ training/mira/run_mira.pl | 548 ++++++++++++++++++++ 2 files changed, 1696 insertions(+) create mode 100644 training/mira/kbest_mirav5.cc create mode 100755 training/mira/run_mira.pl diff --git a/training/mira/kbest_mirav5.cc b/training/mira/kbest_mirav5.cc new file mode 100644 index 00000000..cea5cf67 --- /dev/null +++ b/training/mira/kbest_mirav5.cc @@ -0,0 +1,1148 @@ +#include +#include +#include +#include +#include +#include + +#include "config.h" + + +#include +#include +#include + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "time.h" +#include "sampler.h" + +#include "weights.h" +#include "sparse_vector.h" + +using namespace std; +using boost::shared_ptr; +namespace po = boost::program_options; + +bool invert_score; +boost::shared_ptr rng; +bool approx_score; +bool no_reweight; +bool no_select; +bool unique_kbest; +int update_list_size; +vector dense_weights_g; +double mt_metric_scale; +int optimizer; +int fear_select; +int hope_select; + +bool pseudo_doc; + +void SanityCheck(const vector& w) { + for (int i = 0; i < w.size(); ++i) { + assert(!isnan(w[i])); + assert(!isinf(w[i])); + } +} + +struct FComp { + const vector& w_; + FComp(const vector& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; + +void ShowLargestFeatures(const vector& w) { + vector fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + vector::iterator mid = fnums.begin(); + mid += (w.size() > 10 ? 10 : w.size()); + partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); + cerr << "TOP FEATURES:"; + for (vector::iterator i = fnums.begin(); i != mid; ++i) { + cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; + } + cerr << endl; +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input_weights,w",po::value(),"Input feature weights file") + ("source,i",po::value(),"Source file for development set") + ("passes,p", po::value()->default_value(15), "Number of passes through the training data") + ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("optimizer,o",po::value()->default_value(1), "Optimizer (sgd=1, mira 1-fear=2, full mira w/ cutting plane=3, full mira w/ nbest list=5, local update=4)") + ("fear,f",po::value()->default_value(1), "Fear selection (model-cost=1, max-cost=2, pred-base=3)") + ("hope,h",po::value()->default_value(1), "Hope selection (model+cost=1, max-cost=2, local-cost=3)") + ("max_step_size,C", po::value()->default_value(0.01), "regularization strength (C)") + ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") + ("mt_metric_scale,s", po::value()->default_value(1.0), "Amount to scale MT loss function by") + ("approx_score,a", "Use smoothed sentence-level BLEU score for approximate scoring") + ("no_reweight,d","Do not reweight forest for cutting plane") + ("no_select,n", "Do not use selection heuristic") + ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") + ("update_k_best,b", po::value()->default_value(1), "Size of good, bad lists to perform update with") + ("unique_k_best,u", "Unique k-best translation list") + ("weights_output,O",po::value(),"Directory to write weights to") + ("output_dir,D",po::value(),"Directory to place output in") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,H", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score + + +static const double kMINUS_EPSILON = -1e-6; +static const double EPSILON = 0.000001; +static const double SMO_EPSILON = 0.0001; +static const double PSEUDO_SCALE = 0.95; +static const int MAX_SMO = 10; +int cur_pass; + +struct HypothesisInfo { + SparseVector features; + vector hyp; + double mt_metric; + double hope; + double fear; + double alpha; + double oracle_loss; + SparseVector oracle_feat_diff; + shared_ptr oracleN; +}; + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < EPSILON; +} + +typedef shared_ptr HI; +bool HypothesisCompareB(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric > h2->mt_metric; +}; + + +bool HopeCompareB(const HI& h1, const HI& h2 ) +{ + return h1->hope > h2->hope; +}; + +bool FearCompareB(const HI& h1, const HI& h2 ) +{ + return h1->fear > h2->fear; +}; + +bool FearComparePred(const HI& h1, const HI& h2 ) +{ + return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); +}; + +bool HypothesisCompareG(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric < h2->mt_metric; +}; + + +void CuttingPlane(vector >* cur_c, bool* again, vector >& all_hyp, vector dense_weights) +{ + bool DEBUG_CUT = false; + shared_ptr max_fear, max_fear_in_set; + vector >& cur_constraint = *cur_c; + + if(no_reweight) + { + //find new hope hypothesis + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; + //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; + + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + + double hope_score = all_hyp[0]->features.dot(dense_weights); + if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl; + + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss + // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; + //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; + + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + } + //assign maximum fear derivation from all derivations + max_fear = all_hyp[0]; + + if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<fear ; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator already in constraint set + { + if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear) + max_fear_in_set = cur_constraint[i]; + } + if(DEBUG_CUT) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl; + + if(max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) + { + cur_constraint.push_back(max_fear); + *again = true; + if(DEBUG_CUT) cerr << "Optimize Again " << *again << endl; + } +} + + +double ComputeDelta(vector >* cur_p, double max_step_size,vector dense_weights ) +{ + vector >& cur_pair = *cur_p; + double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; + //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? + //double num = loss - margin; + + + double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); + const double num = margin + loss; + cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <features.dot(dense_weights) - cur_pair[0]->features.dot(dense_weights); + // double loss = cur_pair[1]->oracle_loss; //good.mt_metric - cur_bad.mt_metric); + //const double num = margin + loss; + + //cerr << "Compute Delta " << loss << " " << margin << " "; + + // double margin = cur_pair[0]->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? +/* double num = + (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) + - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); + */ + + SparseVector diff = cur_pair[0]->features; + diff -= cur_pair[1]->features; + /* SparseVector diff = cur_pair[0]->oracle_feat_diff; + diff -= cur_pair[1]->oracle_feat_diff;*/ + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = num / (diffsqnorm * max_step_size); + else + delta = 0; + cerr << " D1:" << delta; + //clip delta (enforce margin constraints) + + delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); + cerr << " D2:" << delta; + return delta; +} + + +vector > SelectPair(vector >* cur_c) +{ + bool DEBUG_SELECT= false; + vector >& cur_constraint = *cur_c; + + vector > pair; + + if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira + // if(optimizer == 2) { + pair.push_back(cur_constraint[0]); + pair.push_back(cur_constraint[1]); + return pair; + // } + } + + for(int u=0;u != cur_constraint.size();u++) + { + shared_ptr max_fear; + + if(DEBUG_SELECT) cerr<< "cur alpha " << u << " " << cur_constraint[u]->alpha; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (!max_fear || cur_constraint[i]->fear > max_fear->fear) + max_fear = cur_constraint[i]; + } + if(!max_fear) return pair; // + + if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; + + + if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->alpha > 0) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + cerr << "RETJURN from 1" << endl; + return pair; + } + } + } + if ((cur_constraint[u]->alpha > 0) && (cur_constraint[u]->fear < max_fear->fear - SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->fear > cur_constraint[u]->fear) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + return pair; + } + } + } + + } + return pair; //no more constraints to optimize, we're done here + +} + +struct GoodBadOracle { + vector > good; + vector > bad; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, const DocScorer& d, vector* o, vector* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { + // TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) { + + //calculate corpus bleu score from previous iterations 1-best for BLEU gain + if(!pseudo_doc) + if(cur_pass > 0) + { + ScoreP acc; + for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { + if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); } + acc->PlusEquals(*corpus_bleu_sent_stats[ii]); + + } + corpus_bleu_stats = acc; + corpus_bleu_score = acc->ComputeScore(); + } + //corpus_src_length = 0; +} + const DocScorer& ds; + vector& corpus_bleu_sent_stats; + vector& oracles; + vector > cur_best; + shared_ptr cur_oracle; + const int kbest_size; + Hypergraph forest; + int cur_sent; + ScoreP corpus_bleu_stats; + float corpus_bleu_score; + + float corpus_src_length; + float curr_src_length; + + const int GetCurrentSent() const { + return cur_sent; + } + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best[0]; + } + + const vector > GetCurrentBest() const { + return cur_best; + } + + const HypothesisInfo& GetCurrentOracle() const { + return *cur_oracle; + } + + const Hypergraph& GetCurrentForest() const { + return forest; + } + + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + cur_sent = smeta.GetSentenceID(); + //cerr << "SOURCE " << smeta.GetSourceLength() << endl; + curr_src_length = (float) smeta.GetSourceLength(); + //UpdateOracles(smeta.GetSentenceID(), *hg); + if(unique_kbest) + UpdateOracles(smeta.GetSentenceID(), *hg); + else + UpdateOracles > >(smeta.GetSentenceID(), *hg); + forest = *hg; + + } + + shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double score, const vector& hyp) { + shared_ptr h(new HypothesisInfo); + h->features = feats; + h->mt_metric = score; + h->hyp = hyp; + return h; + } + + template + void UpdateOracles(int sent_id, const Hypergraph& forest) { + + bool PRINT_LIST= false; + vector >& cur_good = oracles[sent_id].good; + vector >& cur_bad = oracles[sent_id].bad; + //TODO: look at keeping previous iterations hypothesis lists around + cur_best.clear(); + cur_good.clear(); + cur_bad.clear(); + + vector > all_hyp; + + typedef KBest::KBestDerivations, ESentenceTraversal,Filter> K; + K kbest(forest,kbest_size); + + //KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); + for (int i = 0; i < kbest_size; ++i) { + //const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + typename K::Derivation *d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + + float sentscore; + if(approx_score) + { + + if(cur_pass > 0 && !pseudo_doc) + { + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); + + corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best); + sent_stats->PlusEquals(*corpus_no_best, 0.5); + + //compute gain from new sentence in 1-best corpus + sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); + } + else if(pseudo_doc) + { + //cerr << "CORP:" << corpus_bleu_score << " NEW:" << sent_stats->ComputeScore() << " sentscore:" << sentscore << endl; + + //-----pseudo-corpus approach + float src_scale = corpus_src_length + curr_src_length; + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();} + + sent_stats->PlusEquals(*corpus_bleu_stats); + sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); + + } + else + { + //cerr << "Using sentence-level approximation - PASS - " << boost::lexical_cast(cur_pass) << endl; + //approx style of computation, used for 0th iteration + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeSentScore()); + + //use pseudo-doc + } + + + } + else + { + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); + } + + if (invert_score) sentscore *= -1.0; + //cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << " " << approx_sentscore << endl; + + if (i < update_list_size){ + if (i == 0) //take cur best and add its bleu statistics counts to the pseudo-doc + { } + if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); + } + + all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract oracle best and worst + + } + + if(pseudo_doc){ + //update psuedo-doc stats + string details, details2; + corpus_bleu_stats->ScoreDetails(&details2); + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); + corpus_bleu_stats->PlusEquals(*sent_stats); + + + sent_stats->ScoreDetails(&details); + + + sent_stats = corpus_bleu_stats; + corpus_bleu_stats = sent_stats->GetZero(); + corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); + + + corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); + cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n " << details2 << endl; + + + } + + + //figure out how many hyps we can keep maximum + int temp_update_size = update_list_size; + if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} + + //sort all hyps by sentscore (bleu) + sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); + + if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } + + //if(optimizer != 4 ) + if(hope_select == 1) + { + //find hope hypothesis using model + bleu + if (PRINT_LIST) cerr << "HOPE " << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; + + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + } + + + //assign cur_good the sorted list + cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} + /* if (!cur_oracle) { cur_oracle = cur_good[0]; + cerr << "Set oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } + else { + cerr << "Stay oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } */ + + shared_ptr& oracleN = cur_good[0]; + //if(optimizer != 4){ + if(fear_select == 1){ + //compute fear hyps + if (PRINT_LIST) cerr << "FEAR " << endl; + double hope_score = oracleN->features.dot(dense_weights_g); + //double hope_score = cur_oracle->features.dot(dense_weights); + if (PRINT_LIST) cerr << "hope score " << hope_score << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; + all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; + all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; + all_hyp[u]->oracleN=oracleN; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; + + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + else if(fear_select == 2) //select fear based on cost + { + cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); + reverse(cur_bad.begin(),cur_bad.end()); + } + else //pred-based, fear_select = 3 + { + sort(all_hyp.begin(),all_hyp.end(),FearComparePred); + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + + + if(PRINT_LIST){ cerr<< "BAD"<mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} + + cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl; + cerr << " CUR: " << cur_best[0]->mt_metric << endl; + cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl; + } +}; + +void ReadTrainingCorpus(const string& fname, vector* c) { + + + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +void ReadPastTranslationForScore(const int cur_pass, vector* c, DocScorer& ds, const string& od) +{ + cerr << "Reading BLEU gain file "; + string fname; + if(cur_pass == 0) + { + fname = od + "/run.raw.init"; + } + else + { + int last_pass = cur_pass - 1; + fname = od + "/run.raw." + boost::lexical_cast(last_pass) + ".B"; + } + cerr << fname << "\n"; + ReadFile rf(fname); + istream& in = *rf.stream(); + ScoreP acc; + string line; + int lc = 0; + while(in) { + getline(in, line); + if (line.empty() && !in) break; + vector sent; + TD::ConvertSentence(line, &sent); + ScoreP sentscore = ds[lc]->ScoreCandidate(sent); + c->push_back(sentscore); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + ++lc; + + } + + + assert(lc > 0); + float score = acc->ComputeScore(); + string details; + acc->ScoreDetails(&details); + cerr << "INIT RUN " << details << score << endl; + +} + + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as())); + else + rng.reset(new MT19937); + + vector corpus; + //ReadTrainingCorpus(conf["source"].as(), &corpus); + + const string metric_name = conf["mt_metric"].as(); + optimizer = conf["optimizer"].as(); + fear_select = conf["fear"].as(); + hope_select = conf["hope"].as(); + mt_metric_scale = conf["mt_metric_scale"].as(); + approx_score = conf.count("approx_score"); + no_reweight = conf.count("no_reweight"); + no_select = conf.count("no_select"); + update_list_size = conf["update_k_best"].as(); + unique_kbest = conf.count("unique_k_best"); + pseudo_doc = true; + + const string weights_dir = conf["weights_output"].as(); + const string output_dir = conf["output_dir"].as(); + ScoreType type = ScoreTypeFromString(metric_name); + + //establish metric used for tuning + if (type == TER) { + invert_score = true; + // approx_score = false; + } else { + invert_score = false; + } + + //load references + DocScorer ds(type, conf["reference"].as >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + vector corpus_bleu_sent_stats; + + //check training pass,if >0, then use previous iterations corpus bleu stats + cur_pass = conf["passes"].as(); + if(cur_pass > 0) + { + ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); + } + /* if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + }*/ + cerr << "Optimizing with " << optimizer << endl; + // load initial weights + /*Weights weights; + weights.InitFromFile(conf["input_weights"].as()); + SparseVector lambdas; + weights.InitSparseVector(&lambdas); + */ + + + + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + + vector& dense_weights = decoder.CurrentWeightVector(); + + SparseVector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + const string input = decoder.GetConf()["input"].as(); + //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); + if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; + ReadFile in_read(input); + istream *in = in_read.stream(); + assert(*in); + string buf; + + const double max_step_size = conf["max_step_size"].as(); + + + // assert(corpus.size() > 0); + vector oracles(ds.size()); + + TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles, &corpus_bleu_sent_stats); + + int cur_sent = 0; + int lcount = 0; + double objective=0; + double tot_loss = 0; + int dots = 0; + // int cur_pass = 1; + // vector dense_weights; + SparseVector tot; + SparseVector final_tot; + // tot += lambdas; // initial weights + // lcount++; // count for initial weights + + //string msg = "# MIRA tuned weights"; + // while (cur_pass <= max_iteration) { + SparseVector old_lambdas = lambdas; + tot.clear(); + tot += lambdas; + cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; + ScoreP acc, acc_h, acc_f; + + while(*in) { + getline(*in, buf); + if (buf.empty()) continue; + //for (cur_sent = 0; cur_sent < corpus.size(); cur_sent++) { + + cerr << "SENT: " << cur_sent << endl; + //TODO: allow batch updating + //dense_weights.clear(); + //weights.InitFromVector(lambdas); + //weights.InitVector(&dense_weights); + //decoder.SetWeights(dense_weights); + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + decoder.SetId(cur_sent); + decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. + + cur_sent = observer.GetCurrentSent(); + const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); + const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; + const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; + + vector >& cur_good_v = oracles[cur_sent].good; + vector >& cur_bad_v = oracles[cur_sent].bad; + vector > cur_best_v = observer.GetCurrentBest(); + + tot_loss += cur_hyp.mt_metric; + + //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus + ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + + ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); + if (!acc_h) { acc_h = hope_sentscore->GetZero(); } + acc_h->PlusEquals(*hope_sentscore); + + ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); + if (!acc_f) { acc_f = fear_sentscore->GetZero(); } + acc_f->PlusEquals(*fear_sentscore); + + if(optimizer == 4) { //single dual coordinate update, cur_good selected on BLEU score only (not model+BLEU) + // if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { + + double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); + double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); + const double loss = margin + mt_loss; + cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) < 0.0) { + SparseVector diff = cur_good.features; + diff -= cur_bad.features; + + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = loss / (diffsqnorm); + else + delta = 0; + + //double step_size = loss / diff.l2norm_sq(); + cerr << loss << " " << delta << " " << diff << endl; + if (delta > max_step_size) delta = max_step_size; + lambdas += (cur_good.features * delta); + lambdas -= (cur_bad.features * delta); + //cerr << "L: " << lambdas << endl; + // } + // } + } + else if(optimizer == 1) //sgd - nonadapted step size + { + + lambdas += (cur_good.features) * max_step_size; + lambdas -= (cur_bad.features) * max_step_size; + } + //cerr << "L: " << lambdas << endl; + else if(optimizer == 5) //full mira with n-best list of constraints from oracle, fear, best + { + vector > cur_constraint; + cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); + + bool optimize_again; + vector > cur_pair; + //SMO + for(int u=0;u!=cur_constraint.size();u++) + cur_constraint[u]->alpha =0; + + cur_constraint[0]->alpha =1; //set oracle to alpha=1 + + cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = 10, smo_iter2 = 10; + int iter, iter2 =0; + bool DEBUG_SMO = false; + while (iter2 < smo_iter2) + { + iter =0; + while (iter < smo_iter) + { + optimize_again = true; + for (int i = 0; i< cur_constraint.size(); i++) + for (int j = i+1; j< cur_constraint.size(); j++) + { + if(DEBUG_SMO) cerr << "start " << i << " " << j << endl; + cur_pair.clear(); + cur_pair.push_back(cur_constraint[j]); + cur_pair.push_back(cur_constraint[i]); + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + if (delta == 0) optimize_again = false; + // cur_pair[0]->alpha += delta; + // cur_pair[1]->alpha -= delta; + cur_constraint[j]->alpha += delta; + cur_constraint[i]->alpha -= delta; + double step_size = delta * max_step_size; + /*lambdas += (cur_pair[1]->features) * step_size; + lambdas -= (cur_pair[0]->features) * step_size;*/ + lambdas += (cur_constraint[i]->features) * step_size; + lambdas -= (cur_constraint[j]->features) * step_size; + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + + //reload weights based on update + /*dense_weights.clear(); + weights.InitFromVector(lambdas); + weights.InitVector(&dense_weights);*/ + } + iter++; + + if(!optimize_again) + { + iter = 100; + cerr << "Optimization stopped, delta =0" << endl; + } + + + } + iter2++; + } + + + } + else if(optimizer == 2 || optimizer == 3) //1-fear and cutting plane mira + { + bool DEBUG_SMO= true; + vector > cur_constraint; + cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set + bool optimize_again = true; + int cut_plane_calls = 0; + while (optimize_again) + { + if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; + if(optimizer == 2){ //1-fear + cur_constraint.push_back(cur_bad_v[0]); + + //check if we have a violation + if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) + { + optimize_again = false; + cerr << "Constraint not violated" << endl; + } + } + else + { //cutting plane to add constraints + if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl; + optimize_again = false; + cut_plane_calls++; + CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights); + if (cut_plane_calls >= MAX_SMO) optimize_again = false; + } + + if(optimize_again) + { + //SMO + for(int u=0;u!=cur_constraint.size();u++) + { + cur_constraint[u]->alpha =0; + //cur_good_v[0]->alpha = 1; cur_bad_v[0]->alpha = 0; + } + cur_constraint[0]->alpha = 1; + cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = MAX_SMO; + int iter =0; + while (iter < smo_iter) + { + //select pair to optimize from constraint set + vector > cur_pair = SelectPair(&cur_constraint); + + if(cur_pair.empty()){iter=MAX_SMO; cerr << "Undefined pair " << endl; continue;} //pair is undefined so we are done with this smo + + //double num = cur_good_v[0]->fear - cur_bad_v[0]->fear; + /*double loss = cur_good_v[0]->oracle_loss - cur_bad_v[0]->oracle_loss; + double margin = cur_good_v[0]->oracle_feat_diff.dot(dense_weights) - cur_bad_v[0]->oracle_feat_diff.dot(dense_weights); + double num = loss - margin; + SparseVector diff = cur_good_v[0]->features; + diff -= cur_bad_v[0]->features; + double delta = num / (diff.l2norm_sq() * max_step_size); + delta = max(-cur_good_v[0]->alpha, min(delta, cur_bad_v[0]->alpha)); + cur_good_v[0]->alpha += delta; + cur_bad_v[0]->alpha -= delta; + double step_size = delta * max_step_size; + lambdas += (cur_bad_v[0]->features) * step_size; + lambdas -= (cur_good_v[0]->features) * step_size; + */ + + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + cur_pair[0]->alpha += delta; + cur_pair[1]->alpha -= delta; + double step_size = delta * max_step_size; + /* lambdas += (cur_pair[1]->oracle_feat_diff) * step_size; + lambdas -= (cur_pair[0]->oracle_feat_diff) * step_size;*/ + + cerr << "step " << step_size << endl; + double alpha_sum=0; + SparseVector temp_lambdas = lambdas; + + for(int u=0;u!=cur_constraint.size();u++) + { + cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << endl; + temp_lambdas += (cur_constraint[u]->oracleN->features-cur_constraint[u]->features) * cur_constraint[u]->alpha * step_size; + alpha_sum += cur_constraint[u]->alpha; + } + cerr << "Alpha sum " << alpha_sum << " " << temp_lambdas << endl; + + lambdas += (cur_pair[1]->features) * step_size; + lambdas -= (cur_pair[0]->features) * step_size; + cerr << " Lambdas " << lambdas << endl; + //reload weights based on update + dense_weights.clear(); + //weights.InitFromVector(lambdas); + //weights.InitVector(&dense_weights); + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + iter++; + + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + // cerr << "SMO opt " << iter << " " << delta << " " << cur_good_v[0]->alpha << " " << cur_bad_v[0]->alpha << endl; + if(no_select) //don't use selection heuristic to determine when to stop SMO, rather just when delta =0 + if (delta == 0) iter = MAX_SMO; + + //only perform one dual coordinate ascent step + if(optimizer == 2) + { + optimize_again = false; + iter = MAX_SMO; + } + + } + if(optimizer == 3) + { + if(!no_reweight) + { + if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; + Hypergraph hg = observer.GetCurrentForest(); + hg.Reweight(dense_weights); + //observer.UpdateOracles(cur_sent, hg); + if(unique_kbest) + observer.UpdateOracles(cur_sent, hg); + else + observer.UpdateOracles > >(cur_sent, hg); + + + } + } + } + + + } + + //print objective after this sentence + double lambda_change = (lambdas - old_lambdas).l2norm_sq(); + double max_fear = cur_constraint[cur_constraint.size()-1]->fear; + double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear; + + for(int u=0;u!=cur_constraint.size();u++) + { + cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl; + temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear; + } + objective += temp_objective; + + cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl; + } + + + if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } + tot += lambdas; + ++lcount; + cur_sent++; + + cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; + + //clear good/bad lists from oracles for this sentences - you want to keep them around for things + + // oracles[cur_sent].good.clear(); + //oracles[cur_sent].bad.clear(); + } + + cerr << "FINAL OBJECTIVE: "<< objective << endl; + final_tot += tot; + cerr << "Translated " << lcount << " sentences " << endl; + cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; + tot_loss = 0; + /* + float corpus_score = acc->ComputeScore(); + string corpus_details; + acc->ScoreDetails(&corpus_details); + cerr << "MODEL " << corpus_details << endl; + cout << corpus_score << endl; + + corpus_score = acc_h->ComputeScore(); + acc_h->ScoreDetails(&corpus_details); + cerr << "HOPE " << corpus_details << endl; + cout << corpus_score << endl; + + corpus_score = acc_f->ComputeScore(); + acc_f->ScoreDetails(&corpus_details); + cerr << "FEAR " << corpus_details << endl; + cout << corpus_score << endl; + */ + int node_id = rng->next() * 100000; + cerr << " Writing weights to " << node_id << endl; + Weights::ShowLargestFeatures(dense_weights); + dots = 0; + ostringstream os; + os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; + string msg = "# MIRA tuned weights ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); + //Weights.InitFromVector(lambdas); + lambdas.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); + + SparseVector x = tot; + x /= lcount; + ostringstream sa; + string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); + sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; + //Weights ww; + //ww.InitFromVector(x); + x.init_vector(&dense_weights); + Weights::WriteToFile(sa.str(), dense_weights, true, &msga); + + //assign averaged lambdas to initialize next iteration + //lambdas = x; + + /* double lambda_change = (old_lambdas - lambdas).l2norm_sq(); + cerr << "Change in lambda " << lambda_change << endl; + + if ( lambda_change < EPSILON) + { + cur_pass = max_iteration; + cerr << "Weights converged - breaking" << endl; + } + + ++cur_pass; + */ + + //} iteration while loop + + /* cerr << endl; + weights.WriteToFile("weights.mira-final.gz", true, &msg); + final_tot /= (lcount + 1);//max_iteration); + tot /= (corpus.size() + 1); + weights.InitFromVector(final_tot); + cerr << tot << "||||" << final_tot << endl; + msg = "# MIRA tuned weights (averaged vector)"; + weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); + */ + cerr << "Optimization complete.\\AVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; + return 0; +} + diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl new file mode 100755 index 00000000..f4d61407 --- /dev/null +++ b/training/mira/run_mira.pl @@ -0,0 +1,548 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); +push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + + +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; + +my $iteration = 0.0; +my $max_iterations = 6; +my $metric = "ibm_bleu"; +my $iniFile; +my $weights; +my $initialWeights; +my $decode_nodes = 1; # number of decode nodes +my $pmem = "1g"; +my $dir; + +my $SCORER = $FAST_SCORE; +my $local_server = "$bin_dir/local_parallelize.pl"; +my $parallelize = "$bin_dir/../dpmert/parallelize.pl"; +my $libcall = "$bin_dir/../dpmert/libcall.pl"; +my $sentserver = "$bin_dir/../dpmert/sentserver"; +my $sentclient = "$bin_dir/../dpmert/sentclient"; +my $run_local_server = 0; +my $run_local = 0; +my $usefork; +my $pass_suffix = ''; + +my $cdec ="$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; + +#my $cdec ="$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; +die "Can't find decoder in $cdec" unless -x $cdec; +my $decoder = $cdec; +my $decoderOpt; +my $update_size=250; +my $approx_score; +my $kbest_size=250; +my $metric_scale=1; +my $optimizer=2; +my $disable_clean = 0; +my $use_make; # use make to parallelize line search +my $density_prune; +my $cpbin=1; +my $help = 0; +my $epsilon = 0.0001; +my $step_size = 0.01; +my $gpref; +my $unique_kbest; +my $freeze; +my $latent; +my $sample_max; +my $hopes=1; +my $fears=1; + +my $range = 35000; +my $minimum = 15000; +my $portn = int(rand($range)) + $minimum; + + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "decode-nodes=i" => \$decode_nodes, + "density-prune=f" => \$density_prune, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "use-fork" => \$usefork, + "epsilon=s" => \$epsilon, + "help" => \$help, + "local" => \$run_local, + "local_server" => \$run_local_server, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "optimizer=i" => \$optimizer, + "metric-scale=i" => \$metric_scale, + "kbest-size=i" => \$kbest_size, + "update-size=i" => \$update_size, + "step-size=f" => \$step_size, + "hope-select=i" => \$hopes, + "fear-select=i" => \$fears, + "approx-score" => \$approx_score, + "unique-kbest" => \$unique_kbest, + "latent" => \$latent, + "sample-max=i" => \$sample_max, + "grammar-prefix=s" => \$gpref, + "freeze" => \$freeze, + "workdir=s" => \$dir, + ) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + + +#my $refs_comma_sep = get_comma_sep_refs($refFiles); +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +#my $refs_comma_sep_4cdec = get_comma_sep_refs_4cdec($refFiles); + +unless ($dir){ + $dir = "mira"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; + +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} + + + + +if (-e $dir && dirsize($dir)>1 && -e "$dir/weights" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs + die "ERROR: working dir $dir already exists\n\n"; +} else { + -e $dir || mkdir $dir; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-mira.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + check_call("cp $initialWeights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); +} +write_config(*STDERR); + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc, $gpref); + +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while() { $devSize++; } +close F; + +my $lastPScore = 0; +my $lastWeightsFile; + +# main optimization loop +#while (1){ +for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { + + print STDERR "\n\nITERATION $opt_iter\n==========\n"; + print STDERR "Using port $portn\n"; + + # iteration-specific files + my $runFile="$dir/run.raw.$opt_iter"; + my $onebestFile="$dir/1best.$opt_iter"; + my $logdir="$dir/logs.$opt_iter"; + my $decoderLog="$logdir/decoder.sentserver.log.$opt_iter"; + my $scorerLog="$logdir/scorer.log.$opt_iter"; + my $weightdir="$dir/weights.pass$opt_iter/"; + check_call("mkdir -p $logdir"); + check_call("mkdir -p $weightdir"); + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); +# my $im1 = $opt_iter - 1; + my $weightsFile="$dir/weights.$opt_iter"; + print "ITER $iteration " ; + my $cur_pass = "-p 0$opt_iter"; + my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -a -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; + if($unique_kbest){ + $decoder_cmd .= " -u"; + } + if($latent){ + $decoder_cmd .= " -l"; + } + if($sample_max){ + $decoder_cmd .= " -t $sample_max"; + } + if ($density_prune) { + $decoder_cmd .= " --density_prune $density_prune"; + } + my $pcmd; + if ($run_local) { + $pcmd = "cat $srcFile |"; + } elsif ($use_make) { + # TODO: Throw error when decode_nodes is specified along with use_make + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; + } elsif ($run_local_server){ + $pcmd = "cat $srcFile | $local_server $usefork -p $pmem -e $logdir -n $decode_nodes --"; + } + else { + $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --baseport $portn --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + + my $retries = 0; + my $num_topbest; + while($retries < 5) { + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_topbest); + + + #score the output from this iteration + open RUN, "<$runFile" or die "Can't read $runFile: $!"; + open H, ">$runFile.H" or die; + open F, ">$runFile.F" or die; + open B, ">$runFile.B" or die; + while() { + chomp(); + (my $hope,my $best,my $fear) = split(/ \|\|\| /); + print H "$hope \n"; + print B "$best \n"; + print F "$fear \n"; + } + close RUN; + close F; close B; close H; + + my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -l $metric"); + my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -l $metric"); + my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -l $metric"); + chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f; + print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + my $iter_filler=""; + if($opt_iter < 10) + {$iter_filler="0";} + + my $nextIter = $opt_iter + 1; + my $newWeightsFile = "$dir/weights.$nextIter"; + $lastWeightsFile = "$dir/weights.$opt_iter"; + + average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir); +# check_call("cp $lastW $newWeightsFile"); +# if ($icc < 2) { +# print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; +# last; +# } + system("gzip -f $logdir/kbes*"); + print STDERR "\n==========\n"; + $iteration++; +} +#find +#my $cmd = `grep SCORE /fs/clip-galep5/lexical_tm/log.runmira.nist.20 | cat -n | sort -k +2 | tail -1`; +#$cmd =~ m/([0-9]+)/; +#$lastWeightsFile = "$dir/weights.$1"; +#check_call("ln -s $lastWeightsFile $dir/weights.tuned"); +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while() { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . join(" -$r ", @files); +} + + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while() { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; + print $fh "GRAMMAR PREFIX: $gpref\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + my $grammarpref = shift; + + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=){ + chomp $line; + if ($line =~ /^\s* tags, you must include a zero-based id attribute"; + } + } + elsif (defined $grammarpref) { + print NEWSRC "$line\n";} + else { + print NEWSRC "$line\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + print "Something wrong\n"; +} + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub average_weights { + + my $path = shift; + my $out = shift; + my $logpath = shift; + print "AVERAGE $path $out\n"; + my %feature_weights= (); + my $total =0; + my $total_mult =0; + sleep(10); + foreach my $file (glob "$path") + { + $file =~ /\/([^\/]+).gz$/; + my $fname = $1; + my $cmd = "gzip -d $file"; + $file =~ s/\.gz//; + check_bash_call($cmd); + my $mult = 0; + print "FILE $file \n"; + open SCORE, "< $file" or next; + $total++; + while( ) { + my $line = $_; + if ($line !~ m/^\#/) + { + my @s = split(" ",$line); + $feature_weights{$s[0]}+= $mult * $s[1]; + } + else + { + (my $msg,my $ran,$mult) = split(/ \|\|\| /); + print "RAN $ran $mult\n"; + } + } + $total_mult += $mult; + + close SCORE; + $cmd = "gzip $file"; check_bash_call($cmd); + } + +#print out new averaged weights + open OUT, "> $out" or next; + for my $f ( keys %feature_weights ) { + print "$f $feature_weights{$f} $total_mult\n"; + my $ave = $feature_weights{$f} / $total_mult; + + print "Printing $f $ave ||| "; + print OUT "$f $ave\n"; + } + +} -- cgit v1.2.3 From 58fad6230fc6c7b60add2382bf4afe55b9205c1a Mon Sep 17 00:00:00 2001 From: Vladimir Eidelman Date: Sat, 13 Apr 2013 21:57:37 -0400 Subject: mira run script --- environment/LocalConfig.pm | 2 +- training/mira/Makefile.am | 7 +- training/mira/kbest_cut_mira.cc | 1010 ++++++++++++++++++++++++++++++++++ training/mira/kbest_mirav5.cc | 1148 --------------------------------------- training/mira/run_mira.pl | 181 ++++-- 5 files changed, 1141 insertions(+), 1207 deletions(-) create mode 100644 training/mira/kbest_cut_mira.cc delete mode 100644 training/mira/kbest_mirav5.cc diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm index 627f7f8c..f7c3b1c7 100644 --- a/environment/LocalConfig.pm +++ b/environment/LocalConfig.pm @@ -34,7 +34,7 @@ my $CCONFIG = { #'QSubQueue' => '-q long', }, 'UMIACS' => { - 'HOST_REGEXP' => qr/^d.*\.umiacs\.umd\.edu$/, + 'HOST_REGEXP' => qr/^(n|s|d).*\.umiacs\.umd\.edu$/, 'JobControl' => 'qsub', 'QSubMemFlag' => '-l pmem=', 'QSubQueue' => '-q batch', diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am index fa4fb22d..8cddc2d7 100644 --- a/training/mira/Makefile.am +++ b/training/mira/Makefile.am @@ -1,6 +1,11 @@ -bin_PROGRAMS = kbest_mira +bin_PROGRAMS = kbest_mira \ + kbest_cut_mira kbest_mira_SOURCES = kbest_mira.cc kbest_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + +kbest_cut_mira_SOURCES = kbest_cut_mira.cc +kbest_cut_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc new file mode 100644 index 00000000..34eb00dc --- /dev/null +++ b/training/mira/kbest_cut_mira.cc @@ -0,0 +1,1010 @@ +#include +#include +#include +#include +#include +#include + +#include "config.h" + + +#include +#include +#include + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "time.h" +#include "sampler.h" + +#include "weights.h" +#include "sparse_vector.h" + +using namespace std; +using boost::shared_ptr; +namespace po = boost::program_options; + +bool invert_score; +boost::shared_ptr rng; +bool approx_score; +bool no_reweight; +bool no_select; +bool unique_kbest; +int update_list_size; +vector dense_weights_g; +double mt_metric_scale; +int optimizer; +int fear_select; +int hope_select; +bool pseudo_doc; +bool sent_approx; +bool checkloss; + +void SanityCheck(const vector& w) { + for (int i = 0; i < w.size(); ++i) { + assert(!isnan(w[i])); + assert(!isinf(w[i])); + } +} + +struct FComp { + const vector& w_; + FComp(const vector& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; + +void ShowLargestFeatures(const vector& w) { + vector fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + vector::iterator mid = fnums.begin(); + mid += (w.size() > 10 ? 10 : w.size()); + partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); + cerr << "TOP FEATURES:"; + for (vector::iterator i = fnums.begin(); i != mid; ++i) { + cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; + } + cerr << endl; +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input_weights,w",po::value(),"Input feature weights file") + ("source,i",po::value(),"Source file for development set") + ("pass,p", po::value()->default_value(15), "Current pass through the training data") + ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("optimizer,o",po::value()->default_value(1), "Optimizer (SGD=1, PA MIRA w/Delta=2, Cutting Plane MIRA=3, PA MIRA=4, Triple nbest list MIRA=5)") + ("fear,f",po::value()->default_value(1), "Fear selection (model-cost=1, maxcost=2, maxscore=3)") + ("hope,h",po::value()->default_value(1), "Hope selection (model+cost=1, mincost=2)") + ("max_step_size,C", po::value()->default_value(0.01), "regularization strength (C)") + ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") + ("mt_metric_scale,s", po::value()->default_value(1.0), "Amount to scale MT loss function by") + ("sent_approx,a", "Use smoothed sentence-level BLEU score for approximate scoring") + ("pseudo_doc,e", "Use pseudo-document BLEU score for approximate scoring") + ("no_reweight,d","Do not reweight forest for cutting plane") + ("no_select,n", "Do not use selection heuristic") + ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") + ("update_k_best,b", po::value()->default_value(1), "Size of good, bad lists to perform update with") + ("unique_k_best,u", "Unique k-best translation list") + ("weights_output,O",po::value(),"Directory to write weights to") + ("output_dir,D",po::value(),"Directory to place output in") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,H", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score + + +static const double kMINUS_EPSILON = -1e-6; +static const double EPSILON = 0.000001; +static const double SMO_EPSILON = 0.0001; +static const double PSEUDO_SCALE = 0.95; +static const int MAX_SMO = 10; +int cur_pass; + +struct HypothesisInfo { + SparseVector features; + vector hyp; + double mt_metric; + double hope; + double fear; + double alpha; + double oracle_loss; + SparseVector oracle_feat_diff; + shared_ptr oracleN; +}; + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < EPSILON; +} + +typedef shared_ptr HI; +bool HypothesisCompareB(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric > h2->mt_metric; +}; + + +bool HopeCompareB(const HI& h1, const HI& h2 ) +{ + return h1->hope > h2->hope; +}; + +bool FearCompareB(const HI& h1, const HI& h2 ) +{ + return h1->fear > h2->fear; +}; + +bool FearComparePred(const HI& h1, const HI& h2 ) +{ + return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); +}; + +bool HypothesisCompareG(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric < h2->mt_metric; +}; + + +void CuttingPlane(vector >* cur_c, bool* again, vector >& all_hyp, vector dense_weights) +{ + bool DEBUG_CUT = false; + shared_ptr max_fear, max_fear_in_set; + vector >& cur_constraint = *cur_c; + + if(no_reweight) + { + //find new hope hypothesis + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + + double hope_score = all_hyp[0]->features.dot(dense_weights); + if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl; + + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss + // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; + //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + } + //assign maximum fear derivation from all derivations + max_fear = all_hyp[0]; + + if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<fear ; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator already in constraint set + { + if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear) + max_fear_in_set = cur_constraint[i]; + } + if(DEBUG_CUT) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl; + + if(max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) + { + cur_constraint.push_back(max_fear); + *again = true; + if(DEBUG_CUT) cerr << "Optimize Again " << *again << endl; + } +} + + +double ComputeDelta(vector >* cur_p, double max_step_size,vector dense_weights ) +{ + vector >& cur_pair = *cur_p; + double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; + //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? + //double num = loss - margin; + + + double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); + const double num = margin + loss; + cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) + - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); + */ + + SparseVector diff = cur_pair[0]->features; + diff -= cur_pair[1]->features; + /* SparseVector diff = cur_pair[0]->oracle_feat_diff; + diff -= cur_pair[1]->oracle_feat_diff;*/ + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = num / (diffsqnorm * max_step_size); + else + delta = 0; + cerr << " D1:" << delta; + //clip delta (enforce margin constraints) + + delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); + cerr << " D2:" << delta; + return delta; +} + + +vector > SelectPair(vector >* cur_c) +{ + bool DEBUG_SELECT= false; + vector >& cur_constraint = *cur_c; + + vector > pair; + + if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira + // if(optimizer == 2) { + pair.push_back(cur_constraint[0]); + pair.push_back(cur_constraint[1]); + return pair; + // } + } + + for(int u=0;u != cur_constraint.size();u++) + { + shared_ptr max_fear; + + if(DEBUG_SELECT) cerr<< "cur alpha " << u << " " << cur_constraint[u]->alpha; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (!max_fear || cur_constraint[i]->fear > max_fear->fear) + max_fear = cur_constraint[i]; + } + if(!max_fear) return pair; // + + if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; + + + if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->alpha > 0) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + cerr << "RETJURN from 1" << endl; + return pair; + } + } + } + if ((cur_constraint[u]->alpha > 0) && (cur_constraint[u]->fear < max_fear->fear - SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->fear > cur_constraint[u]->fear) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + return pair; + } + } + } + + } + return pair; //no more constraints to optimize, we're done here + +} + +struct GoodBadOracle { + vector > good; + vector > bad; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, const DocScorer& d, vector* o, vector* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { + // TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) { + + //calculate corpus bleu score from previous iterations 1-best for BLEU gain + if(!pseudo_doc && !sent_approx) + if(cur_pass > 0) + { + ScoreP acc; + for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { + if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); } + acc->PlusEquals(*corpus_bleu_sent_stats[ii]); + + } + corpus_bleu_stats = acc; + corpus_bleu_score = acc->ComputeScore(); + } + //corpus_src_length = 0; +} + const DocScorer& ds; + vector& corpus_bleu_sent_stats; + vector& oracles; + vector > cur_best; + shared_ptr cur_oracle; + const int kbest_size; + Hypergraph forest; + int cur_sent; + ScoreP corpus_bleu_stats; + float corpus_bleu_score; + + float corpus_src_length; + float curr_src_length; + + const int GetCurrentSent() const { + return cur_sent; + } + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best[0]; + } + + const vector > GetCurrentBest() const { + return cur_best; + } + + const HypothesisInfo& GetCurrentOracle() const { + return *cur_oracle; + } + + const Hypergraph& GetCurrentForest() const { + return forest; + } + + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + cur_sent = smeta.GetSentenceID(); + //cerr << "SOURCE " << smeta.GetSourceLength() << endl; + curr_src_length = (float) smeta.GetSourceLength(); + //UpdateOracles(smeta.GetSentenceID(), *hg); + if(unique_kbest) + UpdateOracles(smeta.GetSentenceID(), *hg); + else + UpdateOracles > >(smeta.GetSentenceID(), *hg); + forest = *hg; + + } + + shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double score, const vector& hyp) { + shared_ptr h(new HypothesisInfo); + h->features = feats; + h->mt_metric = score; + h->hyp = hyp; + return h; + } + + template + void UpdateOracles(int sent_id, const Hypergraph& forest) { + + bool PRINT_LIST= false; + vector >& cur_good = oracles[sent_id].good; + vector >& cur_bad = oracles[sent_id].bad; + //TODO: look at keeping previous iterations hypothesis lists around + cur_best.clear(); + cur_good.clear(); + cur_bad.clear(); + + vector > all_hyp; + + typedef KBest::KBestDerivations, ESentenceTraversal,Filter> K; + K kbest(forest,kbest_size); + + //KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); + for (int i = 0; i < kbest_size; ++i) { + //const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + typename K::Derivation *d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + + float sentscore; + if(cur_pass > 0 && !pseudo_doc && !sent_approx) + { + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); + + corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best); + sent_stats->PlusEquals(*corpus_no_best, 0.5); + + //compute gain from new sentence in 1-best corpus + sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); + } + else if(pseudo_doc) //pseudo-corpus smoothing + { + float src_scale = corpus_src_length + curr_src_length; + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();} + + sent_stats->PlusEquals(*corpus_bleu_stats); + sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); + + } + else //use sentence-level smoothing ( used when cur_pass=0 if not pseudo_doc) + { + + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); + } + + if (invert_score) sentscore *= -1.0; + + if (i < update_list_size){ + if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); + } + + all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract hope and fear + } + + if(pseudo_doc){ + //update psuedo-doc stats + string details, details2; + corpus_bleu_stats->ScoreDetails(&details2); + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); + corpus_bleu_stats->PlusEquals(*sent_stats); + + sent_stats->ScoreDetails(&details); + sent_stats = corpus_bleu_stats; + corpus_bleu_stats = sent_stats->GetZero(); + corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); + + corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); + cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; + } + + + //figure out how many hyps we can keep maximum + int temp_update_size = update_list_size; + if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} + + //sort all hyps by sentscore (eg. bleu) + sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); + + if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } + + if(hope_select == 1) + { + //find hope hypothesis using model + bleu + if (PRINT_LIST) cerr << "HOPE " << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; + + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + } + + //assign cur_good the sorted list + cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} + + shared_ptr& oracleN = cur_good[0]; + + + if(fear_select == 1){ //compute fear hyps with model - bleu + if (PRINT_LIST) cerr << "FEAR " << endl; + double hope_score = oracleN->features.dot(dense_weights_g); + + if (PRINT_LIST) cerr << "hope score " << hope_score << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; + all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; + all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; + all_hyp[u]->oracleN=oracleN; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; + + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + else if(fear_select == 2) //select fear based on cost + { + cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); + reverse(cur_bad.begin(),cur_bad.end()); + } + else //pred-based, fear_select = 3 + { + sort(all_hyp.begin(),all_hyp.end(),FearComparePred); + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + + + if(PRINT_LIST){ cerr<< "BAD"<mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} + + cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl; + cerr << " CUR: " << cur_best[0]->mt_metric << endl; + cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl; + } +}; + +void ReadTrainingCorpus(const string& fname, vector* c) { + + + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +void ReadPastTranslationForScore(const int cur_pass, vector* c, DocScorer& ds, const string& od) +{ + cerr << "Reading BLEU gain file "; + string fname; + if(cur_pass == 0) + { + fname = od + "/run.raw.init"; + } + else + { + int last_pass = cur_pass - 1; + fname = od + "/run.raw." + boost::lexical_cast(last_pass) + ".B"; + } + cerr << fname << "\n"; + ReadFile rf(fname); + istream& in = *rf.stream(); + ScoreP acc; + string line; + int lc = 0; + while(in) { + getline(in, line); + if (line.empty() && !in) break; + vector sent; + TD::ConvertSentence(line, &sent); + ScoreP sentscore = ds[lc]->ScoreCandidate(sent); + c->push_back(sentscore); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + ++lc; + + } + + + assert(lc > 0); + float score = acc->ComputeScore(); + string details; + acc->ScoreDetails(&details); + cerr << "INIT RUN " << details << score << endl; + +} + + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as())); + else + rng.reset(new MT19937); + + vector corpus; + //ReadTrainingCorpus(conf["source"].as(), &corpus); + + const string metric_name = conf["mt_metric"].as(); + optimizer = conf["optimizer"].as(); + fear_select = conf["fear"].as(); + hope_select = conf["hope"].as(); + mt_metric_scale = conf["mt_metric_scale"].as(); + approx_score = conf.count("approx_score"); + no_reweight = conf.count("no_reweight"); + no_select = conf.count("no_select"); + update_list_size = conf["update_k_best"].as(); + unique_kbest = conf.count("unique_k_best"); + pseudo_doc = conf.count("pseudo_doc"); + sent_approx = conf.count("sent_approx"); + cerr << "PSEUDO " << pseudo_doc << " SENT " << sent_approx << endl; + if(pseudo_doc) + mt_metric_scale=1; + + const string weights_dir = conf["weights_output"].as(); + const string output_dir = conf["output_dir"].as(); + ScoreType type = ScoreTypeFromString(metric_name); + + //establish metric used for tuning + if (type == TER) { + invert_score = true; + // approx_score = false; + } else { + invert_score = false; + } + + //load references + DocScorer ds(type, conf["reference"].as >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + vector corpus_bleu_sent_stats; + + //check training pass,if >0, then use previous iterations corpus bleu stats + cur_pass = conf["pass"].as(); + if(cur_pass > 0) + { + ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); + } + /* if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + }*/ + cerr << "Optimizing with " << optimizer << endl; + // load initial weights + /*Weights weights; + weights.InitFromFile(conf["input_weights"].as()); + SparseVector lambdas; + weights.InitSparseVector(&lambdas); + */ + + + + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + + vector& dense_weights = decoder.CurrentWeightVector(); + + SparseVector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + const string input = decoder.GetConf()["input"].as(); + //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); + if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; + ReadFile in_read(input); + istream *in = in_read.stream(); + assert(*in); + string buf; + + const double max_step_size = conf["max_step_size"].as(); + + + // assert(corpus.size() > 0); + vector oracles(ds.size()); + + TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles, &corpus_bleu_sent_stats); + + int cur_sent = 0; + int lcount = 0; + double objective=0; + double tot_loss = 0; + int dots = 0; + // int cur_pass = 1; + // vector dense_weights; + SparseVector tot; + SparseVector final_tot; + // tot += lambdas; // initial weights + // lcount++; // count for initial weights + + //string msg = "# MIRA tuned weights"; + // while (cur_pass <= max_iteration) { + SparseVector old_lambdas = lambdas; + tot.clear(); + tot += lambdas; + cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; + ScoreP acc, acc_h, acc_f; + + while(*in) { + getline(*in, buf); + if (buf.empty()) continue; + //TODO: allow batch updating + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + decoder.SetId(cur_sent); + decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. + + cur_sent = observer.GetCurrentSent(); + cerr << "SENT: " << cur_sent << endl; + const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); + const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; + const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; + + vector >& cur_good_v = oracles[cur_sent].good; + vector >& cur_bad_v = oracles[cur_sent].bad; + vector > cur_best_v = observer.GetCurrentBest(); + + tot_loss += cur_hyp.mt_metric; + + //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus + ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + + ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); + if (!acc_h) { acc_h = hope_sentscore->GetZero(); } + acc_h->PlusEquals(*hope_sentscore); + + ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); + if (!acc_f) { acc_f = fear_sentscore->GetZero(); } + acc_f->PlusEquals(*fear_sentscore); + + if(optimizer == 4) { //passive-aggresive update (single dual coordinate step) + + double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); + double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); + const double loss = margin + mt_loss; + cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) < 0.0 || !checkloss) { + SparseVector diff = cur_good.features; + diff -= cur_bad.features; + + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = loss / (diffsqnorm); + else + delta = 0; + + if (delta > max_step_size) delta = max_step_size; + lambdas += (cur_good.features * delta); + lambdas -= (cur_bad.features * delta); + + } + } + else if(optimizer == 1) //sgd - nonadapted step size + { + + lambdas += (cur_good.features) * max_step_size; + lambdas -= (cur_bad.features) * max_step_size; + } + else if(optimizer == 5) //full mira with n-best list of constraints from hope, fear, model best + { + vector > cur_constraint; + cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); + + bool optimize_again; + vector > cur_pair; + //SMO + for(int u=0;u!=cur_constraint.size();u++) + cur_constraint[u]->alpha =0; + + cur_constraint[0]->alpha =1; //set oracle to alpha=1 + + cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = MAX_SMO, smo_iter2 = MAX_SMO; + int iter, iter2 =0; + bool DEBUG_SMO = false; + while (iter2 < smo_iter2) + { + iter =0; + while (iter < smo_iter) + { + optimize_again = true; + for (int i = 0; i< cur_constraint.size(); i++) + for (int j = i+1; j< cur_constraint.size(); j++) + { + if(DEBUG_SMO) cerr << "start " << i << " " << j << endl; + cur_pair.clear(); + cur_pair.push_back(cur_constraint[j]); + cur_pair.push_back(cur_constraint[i]); + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + if (delta == 0) optimize_again = false; + cur_constraint[j]->alpha += delta; + cur_constraint[i]->alpha -= delta; + double step_size = delta * max_step_size; + + lambdas += (cur_constraint[i]->features) * step_size; + lambdas -= (cur_constraint[j]->features) * step_size; + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + } + iter++; + + if(!optimize_again) + { + iter = MAX_SMO; + cerr << "Optimization stopped, delta =0" << endl; + } + } + iter2++; + } + } + else if(optimizer == 2 || optimizer == 3) //PA and Cutting Plane MIRA update + { + bool DEBUG_SMO= true; + vector > cur_constraint; + cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set + bool optimize_again = true; + int cut_plane_calls = 0; + while (optimize_again) + { + if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; + if(optimizer == 2){ //PA + cur_constraint.push_back(cur_bad_v[0]); + + //check if we have a violation + if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) + { + optimize_again = false; + cerr << "Constraint not violated" << endl; + } + } + else + { //cutting plane to add constraints + if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl; + optimize_again = false; + cut_plane_calls++; + CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights); + if (cut_plane_calls >= MAX_SMO) optimize_again = false; + } + + if(optimize_again) + { + //SMO + for(int u=0;u!=cur_constraint.size();u++) + { + cur_constraint[u]->alpha =0; + } + cur_constraint[0]->alpha = 1; + cerr <<" Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = MAX_SMO; + int iter =0; + while (iter < smo_iter) + { + //select pair to optimize from constraint set + vector > cur_pair = SelectPair(&cur_constraint); + + if(cur_pair.empty()){ + iter=MAX_SMO; + cerr << "Undefined pair " << endl; + continue; + } //pair is undefined so we are done with this smo + + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + cur_pair[0]->alpha += delta; + cur_pair[1]->alpha -= delta; + double step_size = delta * max_step_size; + cerr << "step " << step_size << endl; + + lambdas += (cur_pair[1]->features) * step_size; + lambdas -= (cur_pair[0]->features) * step_size; + cerr << " Lambdas " << lambdas << endl; + //reload weights based on update + + dense_weights.clear(); + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + iter++; + + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + if(no_select) //don't use selection heuristic to determine when to stop SMO, rather just when delta =0 + if (delta == 0) iter = MAX_SMO; + + //only perform one dual coordinate ascent step + if(optimizer == 2) + { + optimize_again = false; + iter = MAX_SMO; + } + } + if(optimizer == 3) + { + if(!no_reweight) //reweight the forest and select a new k-best + { + if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; + Hypergraph hg = observer.GetCurrentForest(); + hg.Reweight(dense_weights); + if(unique_kbest) + observer.UpdateOracles(cur_sent, hg); + else + observer.UpdateOracles > >(cur_sent, hg); + } + } + } + + } + + //print objective after this sentence + double lambda_change = (lambdas - old_lambdas).l2norm_sq(); + double max_fear = cur_constraint[cur_constraint.size()-1]->fear; + double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear; + + for(int u=0;u!=cur_constraint.size();u++) + { + cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl; + temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear; + } + objective += temp_objective; + + cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl; + } + + + if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } + tot += lambdas; + ++lcount; + cur_sent++; + + cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; + + } + + cerr << "FINAL OBJECTIVE: "<< objective << endl; + final_tot += tot; + cerr << "Translated " << lcount << " sentences " << endl; + cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; + tot_loss = 0; + + int node_id = rng->next() * 100000; + cerr << " Writing weights to " << node_id << endl; + Weights::ShowLargestFeatures(dense_weights); + dots = 0; + ostringstream os; + os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; + string msg = "# MIRA tuned weights ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); + //Weights.InitFromVector(lambdas); + lambdas.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); + + SparseVector x = tot; + x /= lcount; + ostringstream sa; + string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); + sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; + x.init_vector(&dense_weights); + Weights::WriteToFile(sa.str(), dense_weights, true, &msga); + + + cerr << "Optimization complete.\n"; + return 0; +} + diff --git a/training/mira/kbest_mirav5.cc b/training/mira/kbest_mirav5.cc deleted file mode 100644 index cea5cf67..00000000 --- a/training/mira/kbest_mirav5.cc +++ /dev/null @@ -1,1148 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" - - -#include -#include -#include - -#include "sentence_metadata.h" -#include "scorer.h" -#include "verbose.h" -#include "viterbi.h" -#include "hg.h" -#include "prob.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "fdict.h" -#include "time.h" -#include "sampler.h" - -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -bool invert_score; -boost::shared_ptr rng; -bool approx_score; -bool no_reweight; -bool no_select; -bool unique_kbest; -int update_list_size; -vector dense_weights_g; -double mt_metric_scale; -int optimizer; -int fear_select; -int hope_select; - -bool pseudo_doc; - -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,w",po::value(),"Input feature weights file") - ("source,i",po::value(),"Source file for development set") - ("passes,p", po::value()->default_value(15), "Number of passes through the training data") - ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") - ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") - ("optimizer,o",po::value()->default_value(1), "Optimizer (sgd=1, mira 1-fear=2, full mira w/ cutting plane=3, full mira w/ nbest list=5, local update=4)") - ("fear,f",po::value()->default_value(1), "Fear selection (model-cost=1, max-cost=2, pred-base=3)") - ("hope,h",po::value()->default_value(1), "Hope selection (model+cost=1, max-cost=2, local-cost=3)") - ("max_step_size,C", po::value()->default_value(0.01), "regularization strength (C)") - ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") - ("mt_metric_scale,s", po::value()->default_value(1.0), "Amount to scale MT loss function by") - ("approx_score,a", "Use smoothed sentence-level BLEU score for approximate scoring") - ("no_reweight,d","Do not reweight forest for cutting plane") - ("no_select,n", "Do not use selection heuristic") - ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") - ("update_k_best,b", po::value()->default_value(1), "Size of good, bad lists to perform update with") - ("unique_k_best,u", "Unique k-best translation list") - ("weights_output,O",po::value(),"Directory to write weights to") - ("output_dir,D",po::value(),"Directory to place output in") - ("decoder_config,c",po::value(),"Decoder configuration file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,H", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score - - -static const double kMINUS_EPSILON = -1e-6; -static const double EPSILON = 0.000001; -static const double SMO_EPSILON = 0.0001; -static const double PSEUDO_SCALE = 0.95; -static const int MAX_SMO = 10; -int cur_pass; - -struct HypothesisInfo { - SparseVector features; - vector hyp; - double mt_metric; - double hope; - double fear; - double alpha; - double oracle_loss; - SparseVector oracle_feat_diff; - shared_ptr oracleN; -}; - -bool ApproxEqual(double a, double b) { - if (a == b) return true; - return (fabs(a-b)/fabs(b)) < EPSILON; -} - -typedef shared_ptr HI; -bool HypothesisCompareB(const HI& h1, const HI& h2 ) -{ - return h1->mt_metric > h2->mt_metric; -}; - - -bool HopeCompareB(const HI& h1, const HI& h2 ) -{ - return h1->hope > h2->hope; -}; - -bool FearCompareB(const HI& h1, const HI& h2 ) -{ - return h1->fear > h2->fear; -}; - -bool FearComparePred(const HI& h1, const HI& h2 ) -{ - return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); -}; - -bool HypothesisCompareG(const HI& h1, const HI& h2 ) -{ - return h1->mt_metric < h2->mt_metric; -}; - - -void CuttingPlane(vector >* cur_c, bool* again, vector >& all_hyp, vector dense_weights) -{ - bool DEBUG_CUT = false; - shared_ptr max_fear, max_fear_in_set; - vector >& cur_constraint = *cur_c; - - if(no_reweight) - { - //find new hope hypothesis - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights); - all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; - //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; - - } - - //sort hyps by hope score - sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); - - double hope_score = all_hyp[0]->features.dot(dense_weights); - if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl; - - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - - all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss - // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; - //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; - //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; - - } - - sort(all_hyp.begin(),all_hyp.end(),FearCompareB); - - } - //assign maximum fear derivation from all derivations - max_fear = all_hyp[0]; - - if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<fear ; - for(int i=0; i < cur_constraint.size();i++) //select maximal violator already in constraint set - { - if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear) - max_fear_in_set = cur_constraint[i]; - } - if(DEBUG_CUT) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl; - - if(max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) - { - cur_constraint.push_back(max_fear); - *again = true; - if(DEBUG_CUT) cerr << "Optimize Again " << *again << endl; - } -} - - -double ComputeDelta(vector >* cur_p, double max_step_size,vector dense_weights ) -{ - vector >& cur_pair = *cur_p; - double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; - //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? - //double num = loss - margin; - - - double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); - const double num = margin + loss; - cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <features.dot(dense_weights) - cur_pair[0]->features.dot(dense_weights); - // double loss = cur_pair[1]->oracle_loss; //good.mt_metric - cur_bad.mt_metric); - //const double num = margin + loss; - - //cerr << "Compute Delta " << loss << " " << margin << " "; - - // double margin = cur_pair[0]->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? -/* double num = - (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) - - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); - */ - - SparseVector diff = cur_pair[0]->features; - diff -= cur_pair[1]->features; - /* SparseVector diff = cur_pair[0]->oracle_feat_diff; - diff -= cur_pair[1]->oracle_feat_diff;*/ - double diffsqnorm = diff.l2norm_sq(); - double delta; - if (diffsqnorm > 0) - delta = num / (diffsqnorm * max_step_size); - else - delta = 0; - cerr << " D1:" << delta; - //clip delta (enforce margin constraints) - - delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); - cerr << " D2:" << delta; - return delta; -} - - -vector > SelectPair(vector >* cur_c) -{ - bool DEBUG_SELECT= false; - vector >& cur_constraint = *cur_c; - - vector > pair; - - if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira - // if(optimizer == 2) { - pair.push_back(cur_constraint[0]); - pair.push_back(cur_constraint[1]); - return pair; - // } - } - - for(int u=0;u != cur_constraint.size();u++) - { - shared_ptr max_fear; - - if(DEBUG_SELECT) cerr<< "cur alpha " << u << " " << cur_constraint[u]->alpha; - for(int i=0; i < cur_constraint.size();i++) //select maximal violator - { - if(i != u) - if (!max_fear || cur_constraint[i]->fear > max_fear->fear) - max_fear = cur_constraint[i]; - } - if(!max_fear) return pair; // - - if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; - - - if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) - { - for(int i=0; i < cur_constraint.size();i++) //select maximal violator - { - if(i != u) - if (cur_constraint[i]->alpha > 0) - { - pair.push_back(cur_constraint[u]); - pair.push_back(cur_constraint[i]); - cerr << "RETJURN from 1" << endl; - return pair; - } - } - } - if ((cur_constraint[u]->alpha > 0) && (cur_constraint[u]->fear < max_fear->fear - SMO_EPSILON)) - { - for(int i=0; i < cur_constraint.size();i++) //select maximal violator - { - if(i != u) - if (cur_constraint[i]->fear > cur_constraint[u]->fear) - { - pair.push_back(cur_constraint[u]); - pair.push_back(cur_constraint[i]); - return pair; - } - } - } - - } - return pair; //no more constraints to optimize, we're done here - -} - -struct GoodBadOracle { - vector > good; - vector > bad; -}; - -struct TrainingObserver : public DecoderObserver { - TrainingObserver(const int k, const DocScorer& d, vector* o, vector* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { - // TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) { - - //calculate corpus bleu score from previous iterations 1-best for BLEU gain - if(!pseudo_doc) - if(cur_pass > 0) - { - ScoreP acc; - for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { - if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); } - acc->PlusEquals(*corpus_bleu_sent_stats[ii]); - - } - corpus_bleu_stats = acc; - corpus_bleu_score = acc->ComputeScore(); - } - //corpus_src_length = 0; -} - const DocScorer& ds; - vector& corpus_bleu_sent_stats; - vector& oracles; - vector > cur_best; - shared_ptr cur_oracle; - const int kbest_size; - Hypergraph forest; - int cur_sent; - ScoreP corpus_bleu_stats; - float corpus_bleu_score; - - float corpus_src_length; - float curr_src_length; - - const int GetCurrentSent() const { - return cur_sent; - } - - const HypothesisInfo& GetCurrentBestHypothesis() const { - return *cur_best[0]; - } - - const vector > GetCurrentBest() const { - return cur_best; - } - - const HypothesisInfo& GetCurrentOracle() const { - return *cur_oracle; - } - - const Hypergraph& GetCurrentForest() const { - return forest; - } - - - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - cur_sent = smeta.GetSentenceID(); - //cerr << "SOURCE " << smeta.GetSourceLength() << endl; - curr_src_length = (float) smeta.GetSourceLength(); - //UpdateOracles(smeta.GetSentenceID(), *hg); - if(unique_kbest) - UpdateOracles(smeta.GetSentenceID(), *hg); - else - UpdateOracles > >(smeta.GetSentenceID(), *hg); - forest = *hg; - - } - - shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double score, const vector& hyp) { - shared_ptr h(new HypothesisInfo); - h->features = feats; - h->mt_metric = score; - h->hyp = hyp; - return h; - } - - template - void UpdateOracles(int sent_id, const Hypergraph& forest) { - - bool PRINT_LIST= false; - vector >& cur_good = oracles[sent_id].good; - vector >& cur_bad = oracles[sent_id].bad; - //TODO: look at keeping previous iterations hypothesis lists around - cur_best.clear(); - cur_good.clear(); - cur_bad.clear(); - - vector > all_hyp; - - typedef KBest::KBestDerivations, ESentenceTraversal,Filter> K; - K kbest(forest,kbest_size); - - //KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); - for (int i = 0; i < kbest_size; ++i) { - //const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - typename K::Derivation *d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - - float sentscore; - if(approx_score) - { - - if(cur_pass > 0 && !pseudo_doc) - { - ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); - ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); - - corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best); - sent_stats->PlusEquals(*corpus_no_best, 0.5); - - //compute gain from new sentence in 1-best corpus - sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); - } - else if(pseudo_doc) - { - //cerr << "CORP:" << corpus_bleu_score << " NEW:" << sent_stats->ComputeScore() << " sentscore:" << sentscore << endl; - - //-----pseudo-corpus approach - float src_scale = corpus_src_length + curr_src_length; - ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); - if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();} - - sent_stats->PlusEquals(*corpus_bleu_stats); - sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); - - } - else - { - //cerr << "Using sentence-level approximation - PASS - " << boost::lexical_cast(cur_pass) << endl; - //approx style of computation, used for 0th iteration - sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeSentScore()); - - //use pseudo-doc - } - - - } - else - { - sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); - } - - if (invert_score) sentscore *= -1.0; - //cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << " " << approx_sentscore << endl; - - if (i < update_list_size){ - if (i == 0) //take cur best and add its bleu statistics counts to the pseudo-doc - { } - if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; - cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); - } - - all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract oracle best and worst - - } - - if(pseudo_doc){ - //update psuedo-doc stats - string details, details2; - corpus_bleu_stats->ScoreDetails(&details2); - ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); - corpus_bleu_stats->PlusEquals(*sent_stats); - - - sent_stats->ScoreDetails(&details); - - - sent_stats = corpus_bleu_stats; - corpus_bleu_stats = sent_stats->GetZero(); - corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); - - - corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); - cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n " << details2 << endl; - - - } - - - //figure out how many hyps we can keep maximum - int temp_update_size = update_list_size; - if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} - - //sort all hyps by sentscore (bleu) - sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); - - if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } - - //if(optimizer != 4 ) - if(hope_select == 1) - { - //find hope hypothesis using model + bleu - if (PRINT_LIST) cerr << "HOPE " << endl; - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights_g); - all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; - if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; - - } - - //sort hyps by hope score - sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); - } - - - //assign cur_good the sorted list - cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); - if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} - /* if (!cur_oracle) { cur_oracle = cur_good[0]; - cerr << "Set oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } - else { - cerr << "Stay oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } */ - - shared_ptr& oracleN = cur_good[0]; - //if(optimizer != 4){ - if(fear_select == 1){ - //compute fear hyps - if (PRINT_LIST) cerr << "FEAR " << endl; - double hope_score = oracleN->features.dot(dense_weights_g); - //double hope_score = cur_oracle->features.dot(dense_weights); - if (PRINT_LIST) cerr << "hope score " << hope_score << endl; - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights_g); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - - /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss - all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; - all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ - - all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss - all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; - all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; - all_hyp[u]->oracleN=oracleN; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; - if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; - - } - - sort(all_hyp.begin(),all_hyp.end(),FearCompareB); - - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); - } - else if(fear_select == 2) //select fear based on cost - { - cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); - reverse(cur_bad.begin(),cur_bad.end()); - } - else //pred-based, fear_select = 3 - { - sort(all_hyp.begin(),all_hyp.end(),FearComparePred); - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); - } - - - if(PRINT_LIST){ cerr<< "BAD"<mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} - - cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl; - cerr << " CUR: " << cur_best[0]->mt_metric << endl; - cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl; - } -}; - -void ReadTrainingCorpus(const string& fname, vector* c) { - - - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - while(in) { - getline(in, line); - if (!in) break; - c->push_back(line); - } -} - -void ReadPastTranslationForScore(const int cur_pass, vector* c, DocScorer& ds, const string& od) -{ - cerr << "Reading BLEU gain file "; - string fname; - if(cur_pass == 0) - { - fname = od + "/run.raw.init"; - } - else - { - int last_pass = cur_pass - 1; - fname = od + "/run.raw." + boost::lexical_cast(last_pass) + ".B"; - } - cerr << fname << "\n"; - ReadFile rf(fname); - istream& in = *rf.stream(); - ScoreP acc; - string line; - int lc = 0; - while(in) { - getline(in, line); - if (line.empty() && !in) break; - vector sent; - TD::ConvertSentence(line, &sent); - ScoreP sentscore = ds[lc]->ScoreCandidate(sent); - c->push_back(sentscore); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); - ++lc; - - } - - - assert(lc > 0); - float score = acc->ComputeScore(); - string details; - acc->ScoreDetails(&details); - cerr << "INIT RUN " << details << score << endl; - -} - - -int main(int argc, char** argv) { - register_feature_functions(); - SetSilent(true); // turn off verbose decoder output - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - - if (conf.count("random_seed")) - rng.reset(new MT19937(conf["random_seed"].as())); - else - rng.reset(new MT19937); - - vector corpus; - //ReadTrainingCorpus(conf["source"].as(), &corpus); - - const string metric_name = conf["mt_metric"].as(); - optimizer = conf["optimizer"].as(); - fear_select = conf["fear"].as(); - hope_select = conf["hope"].as(); - mt_metric_scale = conf["mt_metric_scale"].as(); - approx_score = conf.count("approx_score"); - no_reweight = conf.count("no_reweight"); - no_select = conf.count("no_select"); - update_list_size = conf["update_k_best"].as(); - unique_kbest = conf.count("unique_k_best"); - pseudo_doc = true; - - const string weights_dir = conf["weights_output"].as(); - const string output_dir = conf["output_dir"].as(); - ScoreType type = ScoreTypeFromString(metric_name); - - //establish metric used for tuning - if (type == TER) { - invert_score = true; - // approx_score = false; - } else { - invert_score = false; - } - - //load references - DocScorer ds(type, conf["reference"].as >(), ""); - cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; - vector corpus_bleu_sent_stats; - - //check training pass,if >0, then use previous iterations corpus bleu stats - cur_pass = conf["passes"].as(); - if(cur_pass > 0) - { - ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); - } - /* if (ds.size() != corpus.size()) { - cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; - return 1; - }*/ - cerr << "Optimizing with " << optimizer << endl; - // load initial weights - /*Weights weights; - weights.InitFromFile(conf["input_weights"].as()); - SparseVector lambdas; - weights.InitSparseVector(&lambdas); - */ - - - - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - - vector& dense_weights = decoder.CurrentWeightVector(); - - SparseVector lambdas; - Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); - Weights::InitSparseVector(dense_weights, &lambdas); - - const string input = decoder.GetConf()["input"].as(); - //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); - if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; - ReadFile in_read(input); - istream *in = in_read.stream(); - assert(*in); - string buf; - - const double max_step_size = conf["max_step_size"].as(); - - - // assert(corpus.size() > 0); - vector oracles(ds.size()); - - TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles, &corpus_bleu_sent_stats); - - int cur_sent = 0; - int lcount = 0; - double objective=0; - double tot_loss = 0; - int dots = 0; - // int cur_pass = 1; - // vector dense_weights; - SparseVector tot; - SparseVector final_tot; - // tot += lambdas; // initial weights - // lcount++; // count for initial weights - - //string msg = "# MIRA tuned weights"; - // while (cur_pass <= max_iteration) { - SparseVector old_lambdas = lambdas; - tot.clear(); - tot += lambdas; - cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; - ScoreP acc, acc_h, acc_f; - - while(*in) { - getline(*in, buf); - if (buf.empty()) continue; - //for (cur_sent = 0; cur_sent < corpus.size(); cur_sent++) { - - cerr << "SENT: " << cur_sent << endl; - //TODO: allow batch updating - //dense_weights.clear(); - //weights.InitFromVector(lambdas); - //weights.InitVector(&dense_weights); - //decoder.SetWeights(dense_weights); - lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; - decoder.SetId(cur_sent); - decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. - - cur_sent = observer.GetCurrentSent(); - const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); - const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; - const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; - - vector >& cur_good_v = oracles[cur_sent].good; - vector >& cur_bad_v = oracles[cur_sent].bad; - vector > cur_best_v = observer.GetCurrentBest(); - - tot_loss += cur_hyp.mt_metric; - - //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus - ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); - - ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); - if (!acc_h) { acc_h = hope_sentscore->GetZero(); } - acc_h->PlusEquals(*hope_sentscore); - - ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); - if (!acc_f) { acc_f = fear_sentscore->GetZero(); } - acc_f->PlusEquals(*fear_sentscore); - - if(optimizer == 4) { //single dual coordinate update, cur_good selected on BLEU score only (not model+BLEU) - // if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { - - double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); - double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); - const double loss = margin + mt_loss; - cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) < 0.0) { - SparseVector diff = cur_good.features; - diff -= cur_bad.features; - - double diffsqnorm = diff.l2norm_sq(); - double delta; - if (diffsqnorm > 0) - delta = loss / (diffsqnorm); - else - delta = 0; - - //double step_size = loss / diff.l2norm_sq(); - cerr << loss << " " << delta << " " << diff << endl; - if (delta > max_step_size) delta = max_step_size; - lambdas += (cur_good.features * delta); - lambdas -= (cur_bad.features * delta); - //cerr << "L: " << lambdas << endl; - // } - // } - } - else if(optimizer == 1) //sgd - nonadapted step size - { - - lambdas += (cur_good.features) * max_step_size; - lambdas -= (cur_bad.features) * max_step_size; - } - //cerr << "L: " << lambdas << endl; - else if(optimizer == 5) //full mira with n-best list of constraints from oracle, fear, best - { - vector > cur_constraint; - cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); - cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); - cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); - - bool optimize_again; - vector > cur_pair; - //SMO - for(int u=0;u!=cur_constraint.size();u++) - cur_constraint[u]->alpha =0; - - cur_constraint[0]->alpha =1; //set oracle to alpha=1 - - cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; - int smo_iter = 10, smo_iter2 = 10; - int iter, iter2 =0; - bool DEBUG_SMO = false; - while (iter2 < smo_iter2) - { - iter =0; - while (iter < smo_iter) - { - optimize_again = true; - for (int i = 0; i< cur_constraint.size(); i++) - for (int j = i+1; j< cur_constraint.size(); j++) - { - if(DEBUG_SMO) cerr << "start " << i << " " << j << endl; - cur_pair.clear(); - cur_pair.push_back(cur_constraint[j]); - cur_pair.push_back(cur_constraint[i]); - double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); - - if (delta == 0) optimize_again = false; - // cur_pair[0]->alpha += delta; - // cur_pair[1]->alpha -= delta; - cur_constraint[j]->alpha += delta; - cur_constraint[i]->alpha -= delta; - double step_size = delta * max_step_size; - /*lambdas += (cur_pair[1]->features) * step_size; - lambdas -= (cur_pair[0]->features) * step_size;*/ - lambdas += (cur_constraint[i]->features) * step_size; - lambdas -= (cur_constraint[j]->features) * step_size; - if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; - - //reload weights based on update - /*dense_weights.clear(); - weights.InitFromVector(lambdas); - weights.InitVector(&dense_weights);*/ - } - iter++; - - if(!optimize_again) - { - iter = 100; - cerr << "Optimization stopped, delta =0" << endl; - } - - - } - iter2++; - } - - - } - else if(optimizer == 2 || optimizer == 3) //1-fear and cutting plane mira - { - bool DEBUG_SMO= true; - vector > cur_constraint; - cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set - bool optimize_again = true; - int cut_plane_calls = 0; - while (optimize_again) - { - if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; - if(optimizer == 2){ //1-fear - cur_constraint.push_back(cur_bad_v[0]); - - //check if we have a violation - if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) - { - optimize_again = false; - cerr << "Constraint not violated" << endl; - } - } - else - { //cutting plane to add constraints - if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl; - optimize_again = false; - cut_plane_calls++; - CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights); - if (cut_plane_calls >= MAX_SMO) optimize_again = false; - } - - if(optimize_again) - { - //SMO - for(int u=0;u!=cur_constraint.size();u++) - { - cur_constraint[u]->alpha =0; - //cur_good_v[0]->alpha = 1; cur_bad_v[0]->alpha = 0; - } - cur_constraint[0]->alpha = 1; - cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; - int smo_iter = MAX_SMO; - int iter =0; - while (iter < smo_iter) - { - //select pair to optimize from constraint set - vector > cur_pair = SelectPair(&cur_constraint); - - if(cur_pair.empty()){iter=MAX_SMO; cerr << "Undefined pair " << endl; continue;} //pair is undefined so we are done with this smo - - //double num = cur_good_v[0]->fear - cur_bad_v[0]->fear; - /*double loss = cur_good_v[0]->oracle_loss - cur_bad_v[0]->oracle_loss; - double margin = cur_good_v[0]->oracle_feat_diff.dot(dense_weights) - cur_bad_v[0]->oracle_feat_diff.dot(dense_weights); - double num = loss - margin; - SparseVector diff = cur_good_v[0]->features; - diff -= cur_bad_v[0]->features; - double delta = num / (diff.l2norm_sq() * max_step_size); - delta = max(-cur_good_v[0]->alpha, min(delta, cur_bad_v[0]->alpha)); - cur_good_v[0]->alpha += delta; - cur_bad_v[0]->alpha -= delta; - double step_size = delta * max_step_size; - lambdas += (cur_bad_v[0]->features) * step_size; - lambdas -= (cur_good_v[0]->features) * step_size; - */ - - double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); - - cur_pair[0]->alpha += delta; - cur_pair[1]->alpha -= delta; - double step_size = delta * max_step_size; - /* lambdas += (cur_pair[1]->oracle_feat_diff) * step_size; - lambdas -= (cur_pair[0]->oracle_feat_diff) * step_size;*/ - - cerr << "step " << step_size << endl; - double alpha_sum=0; - SparseVector temp_lambdas = lambdas; - - for(int u=0;u!=cur_constraint.size();u++) - { - cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << endl; - temp_lambdas += (cur_constraint[u]->oracleN->features-cur_constraint[u]->features) * cur_constraint[u]->alpha * step_size; - alpha_sum += cur_constraint[u]->alpha; - } - cerr << "Alpha sum " << alpha_sum << " " << temp_lambdas << endl; - - lambdas += (cur_pair[1]->features) * step_size; - lambdas -= (cur_pair[0]->features) * step_size; - cerr << " Lambdas " << lambdas << endl; - //reload weights based on update - dense_weights.clear(); - //weights.InitFromVector(lambdas); - //weights.InitVector(&dense_weights); - lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; - iter++; - - if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; - // cerr << "SMO opt " << iter << " " << delta << " " << cur_good_v[0]->alpha << " " << cur_bad_v[0]->alpha << endl; - if(no_select) //don't use selection heuristic to determine when to stop SMO, rather just when delta =0 - if (delta == 0) iter = MAX_SMO; - - //only perform one dual coordinate ascent step - if(optimizer == 2) - { - optimize_again = false; - iter = MAX_SMO; - } - - } - if(optimizer == 3) - { - if(!no_reweight) - { - if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; - Hypergraph hg = observer.GetCurrentForest(); - hg.Reweight(dense_weights); - //observer.UpdateOracles(cur_sent, hg); - if(unique_kbest) - observer.UpdateOracles(cur_sent, hg); - else - observer.UpdateOracles > >(cur_sent, hg); - - - } - } - } - - - } - - //print objective after this sentence - double lambda_change = (lambdas - old_lambdas).l2norm_sq(); - double max_fear = cur_constraint[cur_constraint.size()-1]->fear; - double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear; - - for(int u=0;u!=cur_constraint.size();u++) - { - cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl; - temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear; - } - objective += temp_objective; - - cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl; - } - - - if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } - tot += lambdas; - ++lcount; - cur_sent++; - - cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; - - //clear good/bad lists from oracles for this sentences - you want to keep them around for things - - // oracles[cur_sent].good.clear(); - //oracles[cur_sent].bad.clear(); - } - - cerr << "FINAL OBJECTIVE: "<< objective << endl; - final_tot += tot; - cerr << "Translated " << lcount << " sentences " << endl; - cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; - tot_loss = 0; - /* - float corpus_score = acc->ComputeScore(); - string corpus_details; - acc->ScoreDetails(&corpus_details); - cerr << "MODEL " << corpus_details << endl; - cout << corpus_score << endl; - - corpus_score = acc_h->ComputeScore(); - acc_h->ScoreDetails(&corpus_details); - cerr << "HOPE " << corpus_details << endl; - cout << corpus_score << endl; - - corpus_score = acc_f->ComputeScore(); - acc_f->ScoreDetails(&corpus_details); - cerr << "FEAR " << corpus_details << endl; - cout << corpus_score << endl; - */ - int node_id = rng->next() * 100000; - cerr << " Writing weights to " << node_id << endl; - Weights::ShowLargestFeatures(dense_weights); - dots = 0; - ostringstream os; - os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; - string msg = "# MIRA tuned weights ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); - //Weights.InitFromVector(lambdas); - lambdas.init_vector(&dense_weights); - Weights::WriteToFile(os.str(), dense_weights, true, &msg); - - SparseVector x = tot; - x /= lcount; - ostringstream sa; - string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); - sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; - //Weights ww; - //ww.InitFromVector(x); - x.init_vector(&dense_weights); - Weights::WriteToFile(sa.str(), dense_weights, true, &msga); - - //assign averaged lambdas to initialize next iteration - //lambdas = x; - - /* double lambda_change = (old_lambdas - lambdas).l2norm_sq(); - cerr << "Change in lambda " << lambda_change << endl; - - if ( lambda_change < EPSILON) - { - cur_pass = max_iteration; - cerr << "Weights converged - breaking" << endl; - } - - ++cur_pass; - */ - - //} iteration while loop - - /* cerr << endl; - weights.WriteToFile("weights.mira-final.gz", true, &msg); - final_tot /= (lcount + 1);//max_iteration); - tot /= (corpus.size() + 1); - weights.InitFromVector(final_tot); - cerr << tot << "||||" << final_tot << endl; - msg = "# MIRA tuned weights (averaged vector)"; - weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); - */ - cerr << "Optimization complete.\\AVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; - return 0; -} - diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl index f4d61407..90a4da0e 100755 --- a/training/mira/run_mira.pl +++ b/training/mira/run_mira.pl @@ -3,7 +3,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); -push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -11,51 +11,50 @@ use Getopt::Long; use IPC::Open2; use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - +my $default_jobs = env_default_jobs(); my $srcFile; my $refFiles; my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $iteration = 0.0; -my $max_iterations = 6; +my $max_iterations = 10; my $metric = "ibm_bleu"; my $iniFile; my $weights; my $initialWeights; -my $decode_nodes = 1; # number of decode nodes +my $jobs = $default_jobs; # number of decode nodes my $pmem = "1g"; my $dir; my $SCORER = $FAST_SCORE; -my $local_server = "$bin_dir/local_parallelize.pl"; -my $parallelize = "$bin_dir/../dpmert/parallelize.pl"; -my $libcall = "$bin_dir/../dpmert/libcall.pl"; -my $sentserver = "$bin_dir/../dpmert/sentserver"; -my $sentclient = "$bin_dir/../dpmert/sentclient"; -my $run_local_server = 0; + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; + my $run_local = 0; -my $usefork; my $pass_suffix = ''; -my $cdec ="$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; +my $cdec ="$bin_dir/kbest_cut_mira"; -#my $cdec ="$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; die "Can't find decoder in $cdec" unless -x $cdec; my $decoder = $cdec; my $decoderOpt; -my $update_size=250; +my $update_size; my $approx_score; my $kbest_size=250; my $metric_scale=1; my $optimizer=2; my $disable_clean = 0; -my $use_make; # use make to parallelize line search +my $use_make=0; my $density_prune; my $cpbin=1; my $help = 0; @@ -64,10 +63,10 @@ my $step_size = 0.01; my $gpref; my $unique_kbest; my $freeze; -my $latent; -my $sample_max; my $hopes=1; my $fears=1; +my $sent_approx=0; +my $pseudo_doc=0; my $range = 35000; my $minimum = 15000; @@ -78,15 +77,13 @@ my $portn = int(rand($range)) + $minimum; Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( "decoder=s" => \$decoderOpt, - "decode-nodes=i" => \$decode_nodes, + "jobs=i" => \$jobs, "density-prune=f" => \$density_prune, "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, - "use-fork" => \$usefork, "epsilon=s" => \$epsilon, "help" => \$help, "local" => \$run_local, - "local_server" => \$run_local_server, "use-make=i" => \$use_make, "max-iterations=i" => \$max_iterations, "pmem=s" => \$pmem, @@ -102,10 +99,9 @@ if (GetOptions( "step-size=f" => \$step_size, "hope-select=i" => \$hopes, "fear-select=i" => \$fears, - "approx-score" => \$approx_score, + "sent-approx" => \$sent_approx, + "pseudo-doc" => \$pseudo_doc, "unique-kbest" => \$unique_kbest, - "latent" => \$latent, - "sample-max=i" => \$sample_max, "grammar-prefix=s" => \$gpref, "freeze" => \$freeze, "workdir=s" => \$dir, @@ -235,7 +231,9 @@ close F; my $lastPScore = 0; my $lastWeightsFile; - +my $bestScoreIter=-1; +my $bestScore=-1; +unless ($update_size){$update_size = $kbest_size;} # main optimization loop #while (1){ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { @@ -260,16 +258,16 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { my $weightsFile="$dir/weights.$opt_iter"; print "ITER $iteration " ; my $cur_pass = "-p 0$opt_iter"; - my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -a -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; + my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; if($unique_kbest){ $decoder_cmd .= " -u"; } - if($latent){ - $decoder_cmd .= " -l"; - } - if($sample_max){ - $decoder_cmd .= " -t $sample_max"; + if($sent_approx){ + $decoder_cmd .= " -a"; } + if($pseudo_doc){ + $decoder_cmd .= " -e"; + } if ($density_prune) { $decoder_cmd .= " --density_prune $density_prune"; } @@ -277,13 +275,11 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { if ($run_local) { $pcmd = "cat $srcFile |"; } elsif ($use_make) { - # TODO: Throw error when decode_nodes is specified along with use_make + # TODO: Throw error when jobs is speong with use_make $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; - } elsif ($run_local_server){ - $pcmd = "cat $srcFile | $local_server $usefork -p $pmem -e $logdir -n $decode_nodes --"; - } + } else { - $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --baseport $portn --"; + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --baseport $portn --"; } my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; print STDERR "COMMAND:\n$cmd\n"; @@ -291,14 +287,14 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { my $retries = 0; my $num_topbest; - while($retries < 5) { + while($retries < 6) { $num_topbest = check_output("wc -l < $runFile"); print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; if($devSize == $num_topbest) { last; } else { print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n"; - sleep(3); + sleep(10); } $retries++; } @@ -320,12 +316,15 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { close RUN; close F; close B; close H; - my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -l $metric"); - my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -l $metric"); - my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -m $metric"); + my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -m $metric"); + my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f; print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n"; - + if ($dec_score> $bestScore){ + $bestScoreIter=$opt_iter; + $bestScore=$dec_score; + } # save space check_call("gzip -f $runFile"); check_call("gzip -f $decoderLog"); @@ -338,21 +337,11 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { $lastWeightsFile = "$dir/weights.$opt_iter"; average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir); -# check_call("cp $lastW $newWeightsFile"); -# if ($icc < 2) { -# print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; -# last; -# } system("gzip -f $logdir/kbes*"); print STDERR "\n==========\n"; $iteration++; } -#find -#my $cmd = `grep SCORE /fs/clip-galep5/lexical_tm/log.runmira.nist.20 | cat -n | sort -k +2 | tail -1`; -#$cmd =~ m/([0-9]+)/; -#$lastWeightsFile = "$dir/weights.$1"; -#check_call("ln -s $lastWeightsFile $dir/weights.tuned"); -print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; +print STDERR "\nBEST ITER: $bestScoreIter :: $bestScore\n\n\n"; print STDOUT "$lastWeightsFile\n"; @@ -409,7 +398,7 @@ sub write_config { print $fh "EVAL METRIC: $metric\n"; print $fh "START ITERATION: $iteration\n"; print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "DECODE NODES: $jobs\n"; print $fh "HEAD NODE: $host\n"; print $fh "PMEM (DECODING): $pmem\n"; print $fh "CLEANUP: $cleanup\n"; @@ -462,9 +451,87 @@ sub enseg { } sub print_help { - print "Something wrong\n"; + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete MIRA optimization using the ini file specified. + +Required: + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + --source-file + Dev set source file. + --weights + Initial weights file + +General options: + + --help + Print this message and exit. + + --max-iterations + Maximum number of iterations to run. If not specified, defaults + to $max_iterations. + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --workdir + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + --optimizer + Learning method to use for weight update. Choice are 1) SGD, 2) PA MIRA with Selection from Cutting Plane, 3) Cutting Plane MIRA, 4) PA MIRA,5) nbest MIRA with hope, fear, and model constraints + --metric-scale + Scale MT loss by this amount when computing hope/fear candidates + --kbest-size + Size of k-best list to extract from forest + --update-size + Size of k-best list to use for update (applies to optimizer 5) + --step-size + Controls aggresiveness of update (C) + --hope-select + How to select hope candidate. Choices are 1) model score - cost, 2) min cost + --fear-select + How to select fear candodate. Choices are 1) model score + cost, 2) max cost, 3) max score + --sent-approx + Use smoothed sentence-level MT metric + --pseudo-doc + Use pseudo document to approximate MT metric + --unique-kbest + Extract unique k-best from forest + --grammar-prefix + Path to sentence-specific grammar files + +Job control options: + + --jobs + Number of decoder processes to run in parallel. [default=$default_jobs] + + --pmem + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + + --local + Run single learner + --use-make + Run parallel learners on a single machine through fork. + + +Help } + sub cmdline { return join ' ',($0,@ORIG_ARGV); } -- cgit v1.2.3 From 4ae5f91f2d01d6b6824086b50eca91106232a04d Mon Sep 17 00:00:00 2001 From: Vladimir Eidelman Date: Sat, 13 Apr 2013 22:21:04 -0400 Subject: cleanup mira --- training/mira/kbest_cut_mira.cc | 128 +++++++++++----------------------------- 1 file changed, 36 insertions(+), 92 deletions(-) diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index 34eb00dc..7df9a18f 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -40,7 +40,7 @@ bool no_reweight; bool no_select; bool unique_kbest; int update_list_size; -vector dense_weights_g; +vector dense_w_local; double mt_metric_scale; int optimizer; int fear_select; @@ -170,7 +170,7 @@ bool FearCompareB(const HI& h1, const HI& h2 ) bool FearComparePred(const HI& h1, const HI& h2 ) { - return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); + return h1->features.dot(dense_w_local) > h2->features.dot(dense_w_local); }; bool HypothesisCompareG(const HI& h1, const HI& h2 ) @@ -203,12 +203,7 @@ void CuttingPlane(vector >* cur_c, bool* again, vecto for(int u=0;u!=all_hyp.size();u++) { double t_score = all_hyp[u]->features.dot(dense_weights); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss - // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; - //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; } sort(all_hyp.begin(),all_hyp.end(),FearCompareB); @@ -238,24 +233,14 @@ double ComputeDelta(vector >* cur_p, double max_step_ { vector >& cur_pair = *cur_p; double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; - //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? - //double num = loss - margin; - double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); const double num = margin + loss; cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) - - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); - */ - SparseVector diff = cur_pair[0]->features; diff -= cur_pair[1]->features; - /* SparseVector diff = cur_pair[0]->oracle_feat_diff; - diff -= cur_pair[1]->oracle_feat_diff;*/ double diffsqnorm = diff.l2norm_sq(); double delta; if (diffsqnorm > 0) @@ -264,7 +249,6 @@ double ComputeDelta(vector >* cur_p, double max_step_ delta = 0; cerr << " D1:" << delta; //clip delta (enforce margin constraints) - delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); cerr << " D2:" << delta; return delta; @@ -278,12 +262,12 @@ vector > SelectPair(vector vector > pair; - if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira - // if(optimizer == 2) { + if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for pa-mira + pair.push_back(cur_constraint[0]); pair.push_back(cur_constraint[1]); return pair; - // } + } for(int u=0;u != cur_constraint.size();u++) @@ -299,8 +283,6 @@ vector > SelectPair(vector } if(!max_fear) return pair; // - if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; - if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) { @@ -310,8 +292,7 @@ vector > SelectPair(vector if (cur_constraint[i]->alpha > 0) { pair.push_back(cur_constraint[u]); - pair.push_back(cur_constraint[i]); - cerr << "RETJURN from 1" << endl; + pair.push_back(cur_constraint[i]); return pair; } } @@ -342,11 +323,10 @@ struct GoodBadOracle { struct TrainingObserver : public DecoderObserver { TrainingObserver(const int k, const DocScorer& d, vector* o, vector* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { - // TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) { - //calculate corpus bleu score from previous iterations 1-best for BLEU gain + if(!pseudo_doc && !sent_approx) - if(cur_pass > 0) + if(cur_pass > 0) //calculate corpus bleu score from previous iterations 1-best for BLEU gain { ScoreP acc; for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { @@ -357,7 +337,7 @@ struct TrainingObserver : public DecoderObserver { corpus_bleu_stats = acc; corpus_bleu_score = acc->ComputeScore(); } - //corpus_src_length = 0; + } const DocScorer& ds; vector& corpus_bleu_sent_stats; @@ -396,9 +376,8 @@ struct TrainingObserver : public DecoderObserver { virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { cur_sent = smeta.GetSentenceID(); - //cerr << "SOURCE " << smeta.GetSourceLength() << endl; curr_src_length = (float) smeta.GetSourceLength(); - //UpdateOracles(smeta.GetSentenceID(), *hg); + if(unique_kbest) UpdateOracles(smeta.GetSentenceID(), *hg); else @@ -431,9 +410,8 @@ struct TrainingObserver : public DecoderObserver { typedef KBest::KBestDerivations, ESentenceTraversal,Filter> K; K kbest(forest,kbest_size); - //KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); for (int i = 0; i < kbest_size; ++i) { - //const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + typename K::Derivation *d = kbest.LazyKthBest(forest.nodes_.size() - 1, i); if (!d) break; @@ -489,10 +467,9 @@ struct TrainingObserver : public DecoderObserver { corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); - cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; + cerr << "ps corpus size: " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; } - //figure out how many hyps we can keep maximum int temp_update_size = update_list_size; if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} @@ -500,7 +477,8 @@ struct TrainingObserver : public DecoderObserver { //sort all hyps by sentscore (eg. bleu) sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); - if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } + if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) + cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_w_local) << endl; } if(hope_select == 1) { @@ -508,7 +486,7 @@ struct TrainingObserver : public DecoderObserver { if (PRINT_LIST) cerr << "HOPE " << endl; for(int u=0;u!=all_hyp.size();u++) { - double t_score = all_hyp[u]->features.dot(dense_weights_g); + double t_score = all_hyp[u]->features.dot(dense_w_local); all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; @@ -522,47 +500,38 @@ struct TrainingObserver : public DecoderObserver { cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} + //use hope for fear selection shared_ptr& oracleN = cur_good[0]; - if(fear_select == 1){ //compute fear hyps with model - bleu if (PRINT_LIST) cerr << "FEAR " << endl; - double hope_score = oracleN->features.dot(dense_weights_g); + double hope_score = oracleN->features.dot(dense_w_local); if (PRINT_LIST) cerr << "hope score " << hope_score << endl; for(int u=0;u!=all_hyp.size();u++) { - double t_score = all_hyp[u]->features.dot(dense_weights_g); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - - /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss - all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; - all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ + double t_score = all_hyp[u]->features.dot(dense_w_local); all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; all_hyp[u]->oracleN=oracleN; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; } sort(all_hyp.begin(),all_hyp.end(),FearCompareB); - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); } else if(fear_select == 2) //select fear based on cost { - cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); - reverse(cur_bad.begin(),cur_bad.end()); + sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareG); } - else //pred-based, fear_select = 3 + else //max model score, also known as prediction-based { sort(all_hyp.begin(),all_hyp.end(),FearComparePred); - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); } - + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); if(PRINT_LIST){ cerr<< "BAD"<mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} @@ -616,13 +585,12 @@ void ReadPastTranslationForScore(const int cur_pass, vector* c, DocScore ++lc; } - assert(lc > 0); float score = acc->ComputeScore(); string details; acc->ScoreDetails(&details); - cerr << "INIT RUN " << details << score << endl; + cerr << "Previous run: " << details << score << endl; } @@ -640,7 +608,6 @@ int main(int argc, char** argv) { rng.reset(new MT19937); vector corpus; - //ReadTrainingCorpus(conf["source"].as(), &corpus); const string metric_name = conf["mt_metric"].as(); optimizer = conf["optimizer"].as(); @@ -654,7 +621,7 @@ int main(int argc, char** argv) { unique_kbest = conf.count("unique_k_best"); pseudo_doc = conf.count("pseudo_doc"); sent_approx = conf.count("sent_approx"); - cerr << "PSEUDO " << pseudo_doc << " SENT " << sent_approx << endl; + cerr << "Using pseudo-doc:" << pseudo_doc << " Sent:" << sent_approx << endl; if(pseudo_doc) mt_metric_scale=1; @@ -665,7 +632,6 @@ int main(int argc, char** argv) { //establish metric used for tuning if (type == TER) { invert_score = true; - // approx_score = false; } else { invert_score = false; } @@ -681,20 +647,9 @@ int main(int argc, char** argv) { { ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); } - /* if (ds.size() != corpus.size()) { - cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; - return 1; - }*/ - cerr << "Optimizing with " << optimizer << endl; - // load initial weights - /*Weights weights; - weights.InitFromFile(conf["input_weights"].as()); - SparseVector lambdas; - weights.InitSparseVector(&lambdas); - */ - - + cerr << "Using optimizer:" << optimizer << endl; + ReadFile ini_rf(conf["decoder_config"].as()); Decoder decoder(ini_rf.stream()); @@ -705,7 +660,6 @@ int main(int argc, char** argv) { Weights::InitSparseVector(dense_weights, &lambdas); const string input = decoder.GetConf()["input"].as(); - //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; ReadFile in_read(input); istream *in = in_read.stream(); @@ -714,8 +668,6 @@ int main(int argc, char** argv) { const double max_step_size = conf["max_step_size"].as(); - - // assert(corpus.size() > 0); vector oracles(ds.size()); TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles, &corpus_bleu_sent_stats); @@ -725,27 +677,21 @@ int main(int argc, char** argv) { double objective=0; double tot_loss = 0; int dots = 0; - // int cur_pass = 1; - // vector dense_weights; SparseVector tot; SparseVector final_tot; - // tot += lambdas; // initial weights - // lcount++; // count for initial weights - - //string msg = "# MIRA tuned weights"; - // while (cur_pass <= max_iteration) { - SparseVector old_lambdas = lambdas; - tot.clear(); - tot += lambdas; - cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; - ScoreP acc, acc_h, acc_f; - - while(*in) { + + SparseVector old_lambdas = lambdas; + tot.clear(); + tot += lambdas; + cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; + ScoreP acc, acc_h, acc_f; + + while(*in) { getline(*in, buf); if (buf.empty()) continue; //TODO: allow batch updating lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; + dense_w_local = dense_weights; decoder.SetId(cur_sent); decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. @@ -922,7 +868,7 @@ int main(int argc, char** argv) { dense_weights.clear(); lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; + dense_w_local = dense_weights; iter++; if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; @@ -991,19 +937,17 @@ int main(int argc, char** argv) { ostringstream os; os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; string msg = "# MIRA tuned weights ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); - //Weights.InitFromVector(lambdas); lambdas.init_vector(&dense_weights); Weights::WriteToFile(os.str(), dense_weights, true, &msg); SparseVector x = tot; - x /= lcount; + x /= lcount+1; ostringstream sa; string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; x.init_vector(&dense_weights); Weights::WriteToFile(sa.str(), dense_weights, true, &msga); - cerr << "Optimization complete.\n"; return 0; } -- cgit v1.2.3 From 6860205f7de409f3056d789a62f899cac5f8bcff Mon Sep 17 00:00:00 2001 From: Vladimir Eidelman Date: Sat, 13 Apr 2013 23:35:06 -0400 Subject: cleanup script --- training/mira/run_mira.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl index 90a4da0e..e72c02e0 100755 --- a/training/mira/run_mira.pl +++ b/training/mira/run_mira.pl @@ -593,7 +593,7 @@ sub average_weights { else { (my $msg,my $ran,$mult) = split(/ \|\|\| /); - print "RAN $ran $mult\n"; + print "Processing $ran $mult\n"; } } $total_mult += $mult; -- cgit v1.2.3 From 5dca0ad9e5f0156577e84eb20a077463185da871 Mon Sep 17 00:00:00 2001 From: Vladimir Eidelman Date: Sun, 14 Apr 2013 00:00:43 -0400 Subject: cleanup script --- training/mira/run_mira.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl index 90a4da0e..e72c02e0 100755 --- a/training/mira/run_mira.pl +++ b/training/mira/run_mira.pl @@ -593,7 +593,7 @@ sub average_weights { else { (my $msg,my $ran,$mult) = split(/ \|\|\| /); - print "RAN $ran $mult\n"; + print "Processing $ran $mult\n"; } } $total_mult += $mult; -- cgit v1.2.3 From 381ea2483cbb5fb7c8d33ab05bad248652f13ffc Mon Sep 17 00:00:00 2001 From: Vladimir Eidelman Date: Sun, 14 Apr 2013 00:02:00 -0400 Subject: add example to run script --- training/mira/run_mira.pl | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl index e72c02e0..d71590ba 100755 --- a/training/mira/run_mira.pl +++ b/training/mira/run_mira.pl @@ -455,9 +455,24 @@ sub print_help { print << "Help"; Usage: $executable [options] - - $executable [options] - Runs a complete MIRA optimization using the ini file specified. + Runs a complete MIRA optimization using the ini file specified. + Example invocation: + run_mira.pl \ + --pmem 3g \ + --max-iterations 20 \ + --optimizer 2 \ + --unique-kbest \ + --jobs 15 \ + --kbest-size 500 \ + --hope-select 1 \ + --fear-select 1 \ + --ref-files "ref.0.soseos ref.1.soseos" \ + --source-file src.soseos \ + --weights weights.init \ + --workdir workdir \ + --grammar-prefix grammars/grammar \ + --step-size 0.01 \ + --metric-scale 10000 \ Required: -- cgit v1.2.3