From 2d58182ec6c961fe2f08f4a88886f3e128fb0113 Mon Sep 17 00:00:00 2001 From: Vladimir Eidelman Date: Sat, 13 Apr 2013 21:57:37 -0400 Subject: mira run script --- environment/LocalConfig.pm | 2 +- training/mira/Makefile.am | 7 +- training/mira/kbest_cut_mira.cc | 1010 ++++++++++++++++++++++++++++++++++ training/mira/kbest_mirav5.cc | 1148 --------------------------------------- training/mira/run_mira.pl | 181 ++++-- 5 files changed, 1141 insertions(+), 1207 deletions(-) create mode 100644 training/mira/kbest_cut_mira.cc delete mode 100644 training/mira/kbest_mirav5.cc diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm index 627f7f8c..f7c3b1c7 100644 --- a/environment/LocalConfig.pm +++ b/environment/LocalConfig.pm @@ -34,7 +34,7 @@ my $CCONFIG = { #'QSubQueue' => '-q long', }, 'UMIACS' => { - 'HOST_REGEXP' => qr/^d.*\.umiacs\.umd\.edu$/, + 'HOST_REGEXP' => qr/^(n|s|d).*\.umiacs\.umd\.edu$/, 'JobControl' => 'qsub', 'QSubMemFlag' => '-l pmem=', 'QSubQueue' => '-q batch', diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am index fa4fb22d..8cddc2d7 100644 --- a/training/mira/Makefile.am +++ b/training/mira/Makefile.am @@ -1,6 +1,11 @@ -bin_PROGRAMS = kbest_mira +bin_PROGRAMS = kbest_mira \ + kbest_cut_mira kbest_mira_SOURCES = kbest_mira.cc kbest_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + +kbest_cut_mira_SOURCES = kbest_cut_mira.cc +kbest_cut_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc new file mode 
100644 index 00000000..34eb00dc --- /dev/null +++ b/training/mira/kbest_cut_mira.cc @@ -0,0 +1,1010 @@ +#include +#include +#include +#include +#include +#include + +#include "config.h" + + +#include +#include +#include + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "time.h" +#include "sampler.h" + +#include "weights.h" +#include "sparse_vector.h" + +using namespace std; +using boost::shared_ptr; +namespace po = boost::program_options; + +bool invert_score; +boost::shared_ptr rng; +bool approx_score; +bool no_reweight; +bool no_select; +bool unique_kbest; +int update_list_size; +vector dense_weights_g; +double mt_metric_scale; +int optimizer; +int fear_select; +int hope_select; +bool pseudo_doc; +bool sent_approx; +bool checkloss; + +void SanityCheck(const vector& w) { + for (int i = 0; i < w.size(); ++i) { + assert(!isnan(w[i])); + assert(!isinf(w[i])); + } +} + +struct FComp { + const vector& w_; + FComp(const vector& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; + +void ShowLargestFeatures(const vector& w) { + vector fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + vector::iterator mid = fnums.begin(); + mid += (w.size() > 10 ? 
10 : w.size()); + partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); + cerr << "TOP FEATURES:"; + for (vector::iterator i = fnums.begin(); i != mid; ++i) { + cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; + } + cerr << endl; +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input_weights,w",po::value(),"Input feature weights file") + ("source,i",po::value(),"Source file for development set") + ("pass,p", po::value()->default_value(15), "Current pass through the training data") + ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("optimizer,o",po::value()->default_value(1), "Optimizer (SGD=1, PA MIRA w/Delta=2, Cutting Plane MIRA=3, PA MIRA=4, Triple nbest list MIRA=5)") + ("fear,f",po::value()->default_value(1), "Fear selection (model-cost=1, maxcost=2, maxscore=3)") + ("hope,h",po::value()->default_value(1), "Hope selection (model+cost=1, mincost=2)") + ("max_step_size,C", po::value()->default_value(0.01), "regularization strength (C)") + ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") + ("mt_metric_scale,s", po::value()->default_value(1.0), "Amount to scale MT loss function by") + ("sent_approx,a", "Use smoothed sentence-level BLEU score for approximate scoring") + ("pseudo_doc,e", "Use pseudo-document BLEU score for approximate scoring") + ("no_reweight,d","Do not reweight forest for cutting plane") + ("no_select,n", "Do not use selection heuristic") + ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") + ("update_k_best,b", po::value()->default_value(1), "Size of good, bad lists to perform update with") + ("unique_k_best,u", "Unique k-best translation list") + 
("weights_output,O",po::value(),"Directory to write weights to") + ("output_dir,D",po::value(),"Directory to place output in") + ("decoder_config,c",po::value(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,H", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score + + +static const double kMINUS_EPSILON = -1e-6; +static const double EPSILON = 0.000001; +static const double SMO_EPSILON = 0.0001; +static const double PSEUDO_SCALE = 0.95; +static const int MAX_SMO = 10; +int cur_pass; + +struct HypothesisInfo { + SparseVector features; + vector hyp; + double mt_metric; + double hope; + double fear; + double alpha; + double oracle_loss; + SparseVector oracle_feat_diff; + shared_ptr oracleN; +}; + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < EPSILON; +} + +typedef shared_ptr HI; +bool HypothesisCompareB(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric > h2->mt_metric; +}; + + +bool HopeCompareB(const HI& h1, const HI& h2 ) +{ + return h1->hope > h2->hope; +}; + +bool FearCompareB(const HI& h1, const HI& h2 ) +{ + return h1->fear > h2->fear; +}; + +bool FearComparePred(const HI& h1, const HI& h2 ) +{ + return 
h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); +}; + +bool HypothesisCompareG(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric < h2->mt_metric; +}; + + +void CuttingPlane(vector >* cur_c, bool* again, vector >& all_hyp, vector dense_weights) +{ + bool DEBUG_CUT = false; + shared_ptr max_fear, max_fear_in_set; + vector >& cur_constraint = *cur_c; + + if(no_reweight) + { + //find new hope hypothesis + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + + double hope_score = all_hyp[0]->features.dot(dense_weights); + if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl; + + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss + // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; + //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + } + //assign maximum fear derivation from all derivations + max_fear = all_hyp[0]; + + if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<fear ; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator already in constraint set + { + if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear) + max_fear_in_set = cur_constraint[i]; + } + if(DEBUG_CUT) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl; + + if(max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) + { + cur_constraint.push_back(max_fear); + *again = true; + if(DEBUG_CUT) cerr << "Optimize 
Again " << *again << endl; + } +} + + +double ComputeDelta(vector >* cur_p, double max_step_size,vector dense_weights ) +{ + vector >& cur_pair = *cur_p; + double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; + //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? + //double num = loss - margin; + + + double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); + const double num = margin + loss; + cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) + - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); + */ + + SparseVector diff = cur_pair[0]->features; + diff -= cur_pair[1]->features; + /* SparseVector diff = cur_pair[0]->oracle_feat_diff; + diff -= cur_pair[1]->oracle_feat_diff;*/ + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = num / (diffsqnorm * max_step_size); + else + delta = 0; + cerr << " D1:" << delta; + //clip delta (enforce margin constraints) + + delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); + cerr << " D2:" << delta; + return delta; +} + + +vector > SelectPair(vector >* cur_c) +{ + bool DEBUG_SELECT= false; + vector >& cur_constraint = *cur_c; + + vector > pair; + + if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira + // if(optimizer == 2) { + pair.push_back(cur_constraint[0]); + pair.push_back(cur_constraint[1]); + return pair; + // } + } + + for(int u=0;u != cur_constraint.size();u++) + { + shared_ptr max_fear; + + if(DEBUG_SELECT) cerr<< "cur alpha " << u << 
" " << cur_constraint[u]->alpha; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (!max_fear || cur_constraint[i]->fear > max_fear->fear) + max_fear = cur_constraint[i]; + } + if(!max_fear) return pair; // + + if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; + + + if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->alpha > 0) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + cerr << "RETJURN from 1" << endl; + return pair; + } + } + } + if ((cur_constraint[u]->alpha > 0) && (cur_constraint[u]->fear < max_fear->fear - SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->fear > cur_constraint[u]->fear) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + return pair; + } + } + } + + } + return pair; //no more constraints to optimize, we're done here + +} + +struct GoodBadOracle { + vector > good; + vector > bad; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, const DocScorer& d, vector* o, vector* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { + // TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) { + + //calculate corpus bleu score from previous iterations 1-best for BLEU gain + if(!pseudo_doc && !sent_approx) + if(cur_pass > 0) + { + ScoreP acc; + for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { + if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); } + acc->PlusEquals(*corpus_bleu_sent_stats[ii]); + + } + corpus_bleu_stats = acc; + corpus_bleu_score = acc->ComputeScore(); + } + //corpus_src_length = 0; +} + const DocScorer& ds; + vector& corpus_bleu_sent_stats; + vector& oracles; + vector > 
cur_best; + shared_ptr cur_oracle; + const int kbest_size; + Hypergraph forest; + int cur_sent; + ScoreP corpus_bleu_stats; + float corpus_bleu_score; + + float corpus_src_length; + float curr_src_length; + + const int GetCurrentSent() const { + return cur_sent; + } + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best[0]; + } + + const vector > GetCurrentBest() const { + return cur_best; + } + + const HypothesisInfo& GetCurrentOracle() const { + return *cur_oracle; + } + + const Hypergraph& GetCurrentForest() const { + return forest; + } + + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + cur_sent = smeta.GetSentenceID(); + //cerr << "SOURCE " << smeta.GetSourceLength() << endl; + curr_src_length = (float) smeta.GetSourceLength(); + //UpdateOracles(smeta.GetSentenceID(), *hg); + if(unique_kbest) + UpdateOracles(smeta.GetSentenceID(), *hg); + else + UpdateOracles > >(smeta.GetSentenceID(), *hg); + forest = *hg; + + } + + shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double score, const vector& hyp) { + shared_ptr h(new HypothesisInfo); + h->features = feats; + h->mt_metric = score; + h->hyp = hyp; + return h; + } + + template + void UpdateOracles(int sent_id, const Hypergraph& forest) { + + bool PRINT_LIST= false; + vector >& cur_good = oracles[sent_id].good; + vector >& cur_bad = oracles[sent_id].bad; + //TODO: look at keeping previous iterations hypothesis lists around + cur_best.clear(); + cur_good.clear(); + cur_bad.clear(); + + vector > all_hyp; + + typedef KBest::KBestDerivations, ESentenceTraversal,Filter> K; + K kbest(forest,kbest_size); + + //KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); + for (int i = 0; i < kbest_size; ++i) { + //const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + typename K::Derivation *d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + + float sentscore; + if(cur_pass > 0 && !pseudo_doc 
&& !sent_approx) + { + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); + + corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best); + sent_stats->PlusEquals(*corpus_no_best, 0.5); + + //compute gain from new sentence in 1-best corpus + sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); + } + else if(pseudo_doc) //pseudo-corpus smoothing + { + float src_scale = corpus_src_length + curr_src_length; + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();} + + sent_stats->PlusEquals(*corpus_bleu_stats); + sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); + + } + else //use sentence-level smoothing ( used when cur_pass=0 if not pseudo_doc) + { + + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); + } + + if (invert_score) sentscore *= -1.0; + + if (i < update_list_size){ + if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); + } + + all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract hope and fear + } + + if(pseudo_doc){ + //update psuedo-doc stats + string details, details2; + corpus_bleu_stats->ScoreDetails(&details2); + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); + corpus_bleu_stats->PlusEquals(*sent_stats); + + sent_stats->ScoreDetails(&details); + sent_stats = corpus_bleu_stats; + corpus_bleu_stats = sent_stats->GetZero(); + corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); + + corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); + cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; + } + + 
+ //figure out how many hyps we can keep maximum + int temp_update_size = update_list_size; + if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} + + //sort all hyps by sentscore (eg. bleu) + sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); + + if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } + + if(hope_select == 1) + { + //find hope hypothesis using model + bleu + if (PRINT_LIST) cerr << "HOPE " << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; + + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + } + + //assign cur_good the sorted list + cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} + + shared_ptr& oracleN = cur_good[0]; + + + if(fear_select == 1){ //compute fear hyps with model - bleu + if (PRINT_LIST) cerr << "FEAR " << endl; + double hope_score = oracleN->features.dot(dense_weights_g); + + if (PRINT_LIST) cerr << "hope score " << hope_score << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; + all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - 
hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; + all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; + all_hyp[u]->oracleN=oracleN; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; + + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + else if(fear_select == 2) //select fear based on cost + { + cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); + reverse(cur_bad.begin(),cur_bad.end()); + } + else //pred-based, fear_select = 3 + { + sort(all_hyp.begin(),all_hyp.end(),FearComparePred); + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + + + if(PRINT_LIST){ cerr<< "BAD"<mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} + + cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl; + cerr << " CUR: " << cur_best[0]->mt_metric << endl; + cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl; + } +}; + +void ReadTrainingCorpus(const string& fname, vector* c) { + + + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +void ReadPastTranslationForScore(const int cur_pass, vector* c, DocScorer& ds, const string& od) +{ + cerr << "Reading BLEU gain file "; + string fname; + if(cur_pass == 0) + { + fname = od + "/run.raw.init"; + } + else + { + int last_pass = cur_pass - 1; + fname = od + "/run.raw." 
+ boost::lexical_cast(last_pass) + ".B"; + } + cerr << fname << "\n"; + ReadFile rf(fname); + istream& in = *rf.stream(); + ScoreP acc; + string line; + int lc = 0; + while(in) { + getline(in, line); + if (line.empty() && !in) break; + vector sent; + TD::ConvertSentence(line, &sent); + ScoreP sentscore = ds[lc]->ScoreCandidate(sent); + c->push_back(sentscore); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + ++lc; + + } + + + assert(lc > 0); + float score = acc->ComputeScore(); + string details; + acc->ScoreDetails(&details); + cerr << "INIT RUN " << details << score << endl; + +} + + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as())); + else + rng.reset(new MT19937); + + vector corpus; + //ReadTrainingCorpus(conf["source"].as(), &corpus); + + const string metric_name = conf["mt_metric"].as(); + optimizer = conf["optimizer"].as(); + fear_select = conf["fear"].as(); + hope_select = conf["hope"].as(); + mt_metric_scale = conf["mt_metric_scale"].as(); + approx_score = conf.count("approx_score"); + no_reweight = conf.count("no_reweight"); + no_select = conf.count("no_select"); + update_list_size = conf["update_k_best"].as(); + unique_kbest = conf.count("unique_k_best"); + pseudo_doc = conf.count("pseudo_doc"); + sent_approx = conf.count("sent_approx"); + cerr << "PSEUDO " << pseudo_doc << " SENT " << sent_approx << endl; + if(pseudo_doc) + mt_metric_scale=1; + + const string weights_dir = conf["weights_output"].as(); + const string output_dir = conf["output_dir"].as(); + ScoreType type = ScoreTypeFromString(metric_name); + + //establish metric used for tuning + if (type == TER) { + invert_score = true; + // approx_score = false; + } else { + invert_score = false; + } + + //load references + DocScorer ds(type, 
conf["reference"].as >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + vector corpus_bleu_sent_stats; + + //check training pass,if >0, then use previous iterations corpus bleu stats + cur_pass = conf["pass"].as(); + if(cur_pass > 0) + { + ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); + } + /* if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + }*/ + cerr << "Optimizing with " << optimizer << endl; + // load initial weights + /*Weights weights; + weights.InitFromFile(conf["input_weights"].as()); + SparseVector lambdas; + weights.InitSparseVector(&lambdas); + */ + + + + ReadFile ini_rf(conf["decoder_config"].as()); + Decoder decoder(ini_rf.stream()); + + vector& dense_weights = decoder.CurrentWeightVector(); + + SparseVector lambdas; + Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + const string input = decoder.GetConf()["input"].as(); + //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); + if (!SILENT) cerr << "Reading input from " << ((input == "-") ? 
"STDIN" : input.c_str()) << endl; + ReadFile in_read(input); + istream *in = in_read.stream(); + assert(*in); + string buf; + + const double max_step_size = conf["max_step_size"].as(); + + + // assert(corpus.size() > 0); + vector oracles(ds.size()); + + TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles, &corpus_bleu_sent_stats); + + int cur_sent = 0; + int lcount = 0; + double objective=0; + double tot_loss = 0; + int dots = 0; + // int cur_pass = 1; + // vector dense_weights; + SparseVector tot; + SparseVector final_tot; + // tot += lambdas; // initial weights + // lcount++; // count for initial weights + + //string msg = "# MIRA tuned weights"; + // while (cur_pass <= max_iteration) { + SparseVector old_lambdas = lambdas; + tot.clear(); + tot += lambdas; + cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; + ScoreP acc, acc_h, acc_f; + + while(*in) { + getline(*in, buf); + if (buf.empty()) continue; + //TODO: allow batch updating + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + decoder.SetId(cur_sent); + decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. 
+ + cur_sent = observer.GetCurrentSent(); + cerr << "SENT: " << cur_sent << endl; + const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); + const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; + const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; + + vector >& cur_good_v = oracles[cur_sent].good; + vector >& cur_bad_v = oracles[cur_sent].bad; + vector > cur_best_v = observer.GetCurrentBest(); + + tot_loss += cur_hyp.mt_metric; + + //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus + ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + + ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); + if (!acc_h) { acc_h = hope_sentscore->GetZero(); } + acc_h->PlusEquals(*hope_sentscore); + + ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); + if (!acc_f) { acc_f = fear_sentscore->GetZero(); } + acc_f->PlusEquals(*fear_sentscore); + + if(optimizer == 4) { //passive-aggresive update (single dual coordinate step) + + double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); + double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); + const double loss = margin + mt_loss; + cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) < 0.0 || !checkloss) { + SparseVector diff = cur_good.features; + diff -= cur_bad.features; + + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = loss / (diffsqnorm); + else + delta = 0; + + if (delta > max_step_size) delta = max_step_size; + lambdas += (cur_good.features * delta); + lambdas -= (cur_bad.features * delta); + + } + } + else if(optimizer == 1) //sgd - nonadapted step size + { + + lambdas += (cur_good.features) * max_step_size; + lambdas -= (cur_bad.features) * 
max_step_size; + } + else if(optimizer == 5) //full mira with n-best list of constraints from hope, fear, model best + { + vector > cur_constraint; + cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); + + bool optimize_again; + vector > cur_pair; + //SMO + for(int u=0;u!=cur_constraint.size();u++) + cur_constraint[u]->alpha =0; + + cur_constraint[0]->alpha =1; //set oracle to alpha=1 + + cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = MAX_SMO, smo_iter2 = MAX_SMO; + int iter, iter2 =0; + bool DEBUG_SMO = false; + while (iter2 < smo_iter2) + { + iter =0; + while (iter < smo_iter) + { + optimize_again = true; + for (int i = 0; i< cur_constraint.size(); i++) + for (int j = i+1; j< cur_constraint.size(); j++) + { + if(DEBUG_SMO) cerr << "start " << i << " " << j << endl; + cur_pair.clear(); + cur_pair.push_back(cur_constraint[j]); + cur_pair.push_back(cur_constraint[i]); + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + if (delta == 0) optimize_again = false; + cur_constraint[j]->alpha += delta; + cur_constraint[i]->alpha -= delta; + double step_size = delta * max_step_size; + + lambdas += (cur_constraint[i]->features) * step_size; + lambdas -= (cur_constraint[j]->features) * step_size; + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + } + iter++; + + if(!optimize_again) + { + iter = MAX_SMO; + cerr << "Optimization stopped, delta =0" << endl; + } + } + iter2++; + } + } + else if(optimizer == 2 || optimizer == 3) //PA and Cutting Plane MIRA update + { + bool DEBUG_SMO= true; + vector > cur_constraint; + cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set + bool optimize_again 
= true; + int cut_plane_calls = 0; + while (optimize_again) + { + if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; + if(optimizer == 2){ //PA + cur_constraint.push_back(cur_bad_v[0]); + + //check if we have a violation + if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) + { + optimize_again = false; + cerr << "Constraint not violated" << endl; + } + } + else + { //cutting plane to add constraints + if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl; + optimize_again = false; + cut_plane_calls++; + CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights); + if (cut_plane_calls >= MAX_SMO) optimize_again = false; + } + + if(optimize_again) + { + //SMO + for(int u=0;u!=cur_constraint.size();u++) + { + cur_constraint[u]->alpha =0; + } + cur_constraint[0]->alpha = 1; + cerr <<" Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = MAX_SMO; + int iter =0; + while (iter < smo_iter) + { + //select pair to optimize from constraint set + vector > cur_pair = SelectPair(&cur_constraint); + + if(cur_pair.empty()){ + iter=MAX_SMO; + cerr << "Undefined pair " << endl; + continue; + } //pair is undefined so we are done with this smo + + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + cur_pair[0]->alpha += delta; + cur_pair[1]->alpha -= delta; + double step_size = delta * max_step_size; + cerr << "step " << step_size << endl; + + lambdas += (cur_pair[1]->features) * step_size; + lambdas -= (cur_pair[0]->features) * step_size; + cerr << " Lambdas " << lambdas << endl; + //reload weights based on update + + dense_weights.clear(); + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + iter++; + + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + if(no_select) //don't use selection heuristic to determine when to stop SMO, rather 
just when delta =0 + if (delta == 0) iter = MAX_SMO; + + //only perform one dual coordinate ascent step + if(optimizer == 2) + { + optimize_again = false; + iter = MAX_SMO; + } + } + if(optimizer == 3) + { + if(!no_reweight) //reweight the forest and select a new k-best + { + if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; + Hypergraph hg = observer.GetCurrentForest(); + hg.Reweight(dense_weights); + if(unique_kbest) + observer.UpdateOracles(cur_sent, hg); + else + observer.UpdateOracles > >(cur_sent, hg); + } + } + } + + } + + //print objective after this sentence + double lambda_change = (lambdas - old_lambdas).l2norm_sq(); + double max_fear = cur_constraint[cur_constraint.size()-1]->fear; + double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear; + + for(int u=0;u!=cur_constraint.size();u++) + { + cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl; + temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear; + } + objective += temp_objective; + + cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl; + } + + + if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } + tot += lambdas; + ++lcount; + cur_sent++; + + cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; + + } + + cerr << "FINAL OBJECTIVE: "<< objective << endl; + final_tot += tot; + cerr << "Translated " << lcount << " sentences " << endl; + cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; + tot_loss = 0; + + int node_id = rng->next() * 100000; + cerr << " Writing weights to " << node_id << endl; + Weights::ShowLargestFeatures(dense_weights); + dots = 0; + ostringstream os; + os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." 
<< node_id << ".gz"; + string msg = "# MIRA tuned weights ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); + //Weights.InitFromVector(lambdas); + lambdas.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); + + SparseVector x = tot; + x /= lcount; + ostringstream sa; + string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); + sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; + x.init_vector(&dense_weights); + Weights::WriteToFile(sa.str(), dense_weights, true, &msga); + + + cerr << "Optimization complete.\n"; + return 0; +} + diff --git a/training/mira/kbest_mirav5.cc b/training/mira/kbest_mirav5.cc deleted file mode 100644 index cea5cf67..00000000 --- a/training/mira/kbest_mirav5.cc +++ /dev/null @@ -1,1148 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" - - -#include -#include -#include - -#include "sentence_metadata.h" -#include "scorer.h" -#include "verbose.h" -#include "viterbi.h" -#include "hg.h" -#include "prob.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "fdict.h" -#include "time.h" -#include "sampler.h" - -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -bool invert_score; -boost::shared_ptr rng; -bool approx_score; -bool no_reweight; -bool no_select; -bool unique_kbest; -int update_list_size; -vector dense_weights_g; -double mt_metric_scale; -int optimizer; -int fear_select; -int hope_select; - -bool pseudo_doc; - -void SanityCheck(const vector& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector& w_; - FComp(const vector& w) : w_(w) {} - bool operator()(int a, int b) const { - return 
fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector& w) { - vector fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,w",po::value(),"Input feature weights file") - ("source,i",po::value(),"Source file for development set") - ("passes,p", po::value()->default_value(15), "Number of passes through the training data") - ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") - ("mt_metric,m",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") - ("optimizer,o",po::value()->default_value(1), "Optimizer (sgd=1, mira 1-fear=2, full mira w/ cutting plane=3, full mira w/ nbest list=5, local update=4)") - ("fear,f",po::value()->default_value(1), "Fear selection (model-cost=1, max-cost=2, pred-base=3)") - ("hope,h",po::value()->default_value(1), "Hope selection (model+cost=1, max-cost=2, local-cost=3)") - ("max_step_size,C", po::value()->default_value(0.01), "regularization strength (C)") - ("random_seed,S", po::value(), "Random seed (if not specified, /dev/random will be used)") - ("mt_metric_scale,s", po::value()->default_value(1.0), "Amount to scale MT loss function by") - ("approx_score,a", "Use smoothed sentence-level BLEU score for approximate scoring") - ("no_reweight,d","Do not reweight forest for cutting plane") - ("no_select,n", "Do not use selection heuristic") - ("k_best_size,k", po::value()->default_value(250), "Size of hypothesis list to search for oracles") - 
("update_k_best,b", po::value()->default_value(1), "Size of good, bad lists to perform update with") - ("unique_k_best,u", "Unique k-best translation list") - ("weights_output,O",po::value(),"Directory to write weights to") - ("output_dir,D",po::value(),"Directory to place output in") - ("decoder_config,c",po::value(),"Decoder configuration file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,H", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score - - -static const double kMINUS_EPSILON = -1e-6; -static const double EPSILON = 0.000001; -static const double SMO_EPSILON = 0.0001; -static const double PSEUDO_SCALE = 0.95; -static const int MAX_SMO = 10; -int cur_pass; - -struct HypothesisInfo { - SparseVector features; - vector hyp; - double mt_metric; - double hope; - double fear; - double alpha; - double oracle_loss; - SparseVector oracle_feat_diff; - shared_ptr oracleN; -}; - -bool ApproxEqual(double a, double b) { - if (a == b) return true; - return (fabs(a-b)/fabs(b)) < EPSILON; -} - -typedef shared_ptr HI; -bool HypothesisCompareB(const HI& h1, const HI& h2 ) -{ - return h1->mt_metric > h2->mt_metric; -}; - - -bool HopeCompareB(const HI& h1, const HI& h2 ) -{ - return h1->hope > 
h2->hope; -}; - -bool FearCompareB(const HI& h1, const HI& h2 ) -{ - return h1->fear > h2->fear; -}; - -bool FearComparePred(const HI& h1, const HI& h2 ) -{ - return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); -}; - -bool HypothesisCompareG(const HI& h1, const HI& h2 ) -{ - return h1->mt_metric < h2->mt_metric; -}; - - -void CuttingPlane(vector >* cur_c, bool* again, vector >& all_hyp, vector dense_weights) -{ - bool DEBUG_CUT = false; - shared_ptr max_fear, max_fear_in_set; - vector >& cur_constraint = *cur_c; - - if(no_reweight) - { - //find new hope hypothesis - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights); - all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; - //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; - - } - - //sort hyps by hope score - sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); - - double hope_score = all_hyp[0]->features.dot(dense_weights); - if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl; - - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - - all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss - // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; - //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; - //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; - - } - - sort(all_hyp.begin(),all_hyp.end(),FearCompareB); - - } - //assign maximum fear derivation from all derivations - max_fear = all_hyp[0]; - - if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<fear ; - for(int i=0; i < cur_constraint.size();i++) //select maximal 
violator already in constraint set - { - if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear) - max_fear_in_set = cur_constraint[i]; - } - if(DEBUG_CUT) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl; - - if(max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) - { - cur_constraint.push_back(max_fear); - *again = true; - if(DEBUG_CUT) cerr << "Optimize Again " << *again << endl; - } -} - - -double ComputeDelta(vector >* cur_p, double max_step_size,vector dense_weights ) -{ - vector >& cur_pair = *cur_p; - double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; - //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? - //double num = loss - margin; - - - double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); - const double num = margin + loss; - cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <features.dot(dense_weights) - cur_pair[0]->features.dot(dense_weights); - // double loss = cur_pair[1]->oracle_loss; //good.mt_metric - cur_bad.mt_metric); - //const double num = margin + loss; - - //cerr << "Compute Delta " << loss << " " << margin << " "; - - // double margin = cur_pair[0]->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? 
-/* double num = - (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) - - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); - */ - - SparseVector diff = cur_pair[0]->features; - diff -= cur_pair[1]->features; - /* SparseVector diff = cur_pair[0]->oracle_feat_diff; - diff -= cur_pair[1]->oracle_feat_diff;*/ - double diffsqnorm = diff.l2norm_sq(); - double delta; - if (diffsqnorm > 0) - delta = num / (diffsqnorm * max_step_size); - else - delta = 0; - cerr << " D1:" << delta; - //clip delta (enforce margin constraints) - - delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); - cerr << " D2:" << delta; - return delta; -} - - -vector > SelectPair(vector >* cur_c) -{ - bool DEBUG_SELECT= false; - vector >& cur_constraint = *cur_c; - - vector > pair; - - if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira - // if(optimizer == 2) { - pair.push_back(cur_constraint[0]); - pair.push_back(cur_constraint[1]); - return pair; - // } - } - - for(int u=0;u != cur_constraint.size();u++) - { - shared_ptr max_fear; - - if(DEBUG_SELECT) cerr<< "cur alpha " << u << " " << cur_constraint[u]->alpha; - for(int i=0; i < cur_constraint.size();i++) //select maximal violator - { - if(i != u) - if (!max_fear || cur_constraint[i]->fear > max_fear->fear) - max_fear = cur_constraint[i]; - } - if(!max_fear) return pair; // - - if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; - - - if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) - { - for(int i=0; i < cur_constraint.size();i++) //select maximal violator - { - if(i != u) - if (cur_constraint[i]->alpha > 0) - { - pair.push_back(cur_constraint[u]); - pair.push_back(cur_constraint[i]); - cerr << "RETJURN from 1" << endl; - return pair; - } - } - } - if ((cur_constraint[u]->alpha > 0) && (cur_constraint[u]->fear < max_fear->fear - SMO_EPSILON)) - { - for(int i=0; i < 
cur_constraint.size();i++) //select maximal violator - { - if(i != u) - if (cur_constraint[i]->fear > cur_constraint[u]->fear) - { - pair.push_back(cur_constraint[u]); - pair.push_back(cur_constraint[i]); - return pair; - } - } - } - - } - return pair; //no more constraints to optimize, we're done here - -} - -struct GoodBadOracle { - vector > good; - vector > bad; -}; - -struct TrainingObserver : public DecoderObserver { - TrainingObserver(const int k, const DocScorer& d, vector* o, vector* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { - // TrainingObserver(const int k, const DocScorer& d, vector* o) : ds(d), oracles(*o), kbest_size(k) { - - //calculate corpus bleu score from previous iterations 1-best for BLEU gain - if(!pseudo_doc) - if(cur_pass > 0) - { - ScoreP acc; - for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { - if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); } - acc->PlusEquals(*corpus_bleu_sent_stats[ii]); - - } - corpus_bleu_stats = acc; - corpus_bleu_score = acc->ComputeScore(); - } - //corpus_src_length = 0; -} - const DocScorer& ds; - vector& corpus_bleu_sent_stats; - vector& oracles; - vector > cur_best; - shared_ptr cur_oracle; - const int kbest_size; - Hypergraph forest; - int cur_sent; - ScoreP corpus_bleu_stats; - float corpus_bleu_score; - - float corpus_src_length; - float curr_src_length; - - const int GetCurrentSent() const { - return cur_sent; - } - - const HypothesisInfo& GetCurrentBestHypothesis() const { - return *cur_best[0]; - } - - const vector > GetCurrentBest() const { - return cur_best; - } - - const HypothesisInfo& GetCurrentOracle() const { - return *cur_oracle; - } - - const Hypergraph& GetCurrentForest() const { - return forest; - } - - - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - cur_sent = smeta.GetSentenceID(); - //cerr << "SOURCE " << smeta.GetSourceLength() << endl; - curr_src_length = (float) smeta.GetSourceLength(); - 
//UpdateOracles(smeta.GetSentenceID(), *hg); - if(unique_kbest) - UpdateOracles(smeta.GetSentenceID(), *hg); - else - UpdateOracles > >(smeta.GetSentenceID(), *hg); - forest = *hg; - - } - - shared_ptr MakeHypothesisInfo(const SparseVector& feats, const double score, const vector& hyp) { - shared_ptr h(new HypothesisInfo); - h->features = feats; - h->mt_metric = score; - h->hyp = hyp; - return h; - } - - template - void UpdateOracles(int sent_id, const Hypergraph& forest) { - - bool PRINT_LIST= false; - vector >& cur_good = oracles[sent_id].good; - vector >& cur_bad = oracles[sent_id].bad; - //TODO: look at keeping previous iterations hypothesis lists around - cur_best.clear(); - cur_good.clear(); - cur_bad.clear(); - - vector > all_hyp; - - typedef KBest::KBestDerivations, ESentenceTraversal,Filter> K; - K kbest(forest,kbest_size); - - //KBest::KBestDerivations, ESentenceTraversal> kbest(forest, kbest_size); - for (int i = 0; i < kbest_size; ++i) { - //const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - typename K::Derivation *d = - kbest.LazyKthBest(forest.nodes_.size() - 1, i); - if (!d) break; - - float sentscore; - if(approx_score) - { - - if(cur_pass > 0 && !pseudo_doc) - { - ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); - ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); - - corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best); - sent_stats->PlusEquals(*corpus_no_best, 0.5); - - //compute gain from new sentence in 1-best corpus - sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); - } - else if(pseudo_doc) - { - //cerr << "CORP:" << corpus_bleu_score << " NEW:" << sent_stats->ComputeScore() << " sentscore:" << sentscore << endl; - - //-----pseudo-corpus approach - float src_scale = corpus_src_length + curr_src_length; - ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); - if(!corpus_bleu_stats){ corpus_bleu_stats = 
sent_stats->GetZero();} - - sent_stats->PlusEquals(*corpus_bleu_stats); - sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); - - } - else - { - //cerr << "Using sentence-level approximation - PASS - " << boost::lexical_cast(cur_pass) << endl; - //approx style of computation, used for 0th iteration - sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeSentScore()); - - //use pseudo-doc - } - - - } - else - { - sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); - } - - if (invert_score) sentscore *= -1.0; - //cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << " " << approx_sentscore << endl; - - if (i < update_list_size){ - if (i == 0) //take cur best and add its bleu statistics counts to the pseudo-doc - { } - if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; - cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); - } - - all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract oracle best and worst - - } - - if(pseudo_doc){ - //update psuedo-doc stats - string details, details2; - corpus_bleu_stats->ScoreDetails(&details2); - ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); - corpus_bleu_stats->PlusEquals(*sent_stats); - - - sent_stats->ScoreDetails(&details); - - - sent_stats = corpus_bleu_stats; - corpus_bleu_stats = sent_stats->GetZero(); - corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); - - - corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); - cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n " << details2 << endl; - - - } - - - //figure out how many hyps we can keep maximum - int temp_update_size = update_list_size; - if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} - - //sort all hyps by sentscore (bleu) - 
sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); - - if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } - - //if(optimizer != 4 ) - if(hope_select == 1) - { - //find hope hypothesis using model + bleu - if (PRINT_LIST) cerr << "HOPE " << endl; - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights_g); - all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; - if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; - - } - - //sort hyps by hope score - sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); - } - - - //assign cur_good the sorted list - cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); - if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} - /* if (!cur_oracle) { cur_oracle = cur_good[0]; - cerr << "Set oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } - else { - cerr << "Stay oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } */ - - shared_ptr& oracleN = cur_good[0]; - //if(optimizer != 4){ - if(fear_select == 1){ - //compute fear hyps - if (PRINT_LIST) cerr << "FEAR " << endl; - double hope_score = oracleN->features.dot(dense_weights_g); - //double hope_score = cur_oracle->features.dot(dense_weights); - if (PRINT_LIST) cerr << "hope score " << hope_score << endl; - for(int u=0;u!=all_hyp.size();u++) - { - double t_score = all_hyp[u]->features.dot(dense_weights_g); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - - /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss - all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - 
-1*cur_oracle->mt_metric; - all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ - - all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss - all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; - all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; - all_hyp[u]->oracleN=oracleN; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; - if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; - - } - - sort(all_hyp.begin(),all_hyp.end(),FearCompareB); - - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); - } - else if(fear_select == 2) //select fear based on cost - { - cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); - reverse(cur_bad.begin(),cur_bad.end()); - } - else //pred-based, fear_select = 3 - { - sort(all_hyp.begin(),all_hyp.end(),FearComparePred); - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); - } - - - if(PRINT_LIST){ cerr<< "BAD"<mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} - - cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl; - cerr << " CUR: " << cur_best[0]->mt_metric << endl; - cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl; - } -}; - -void ReadTrainingCorpus(const string& fname, vector* c) { - - - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - while(in) { - getline(in, line); - if (!in) break; - c->push_back(line); - } -} - -void ReadPastTranslationForScore(const int cur_pass, vector* c, DocScorer& ds, const string& od) -{ - cerr << "Reading BLEU gain file "; - string fname; - if(cur_pass == 0) - { - fname = od + "/run.raw.init"; - } - else - { - int last_pass = cur_pass - 1; - fname = od + "/run.raw." 
+ boost::lexical_cast(last_pass) + ".B"; - } - cerr << fname << "\n"; - ReadFile rf(fname); - istream& in = *rf.stream(); - ScoreP acc; - string line; - int lc = 0; - while(in) { - getline(in, line); - if (line.empty() && !in) break; - vector sent; - TD::ConvertSentence(line, &sent); - ScoreP sentscore = ds[lc]->ScoreCandidate(sent); - c->push_back(sentscore); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); - ++lc; - - } - - - assert(lc > 0); - float score = acc->ComputeScore(); - string details; - acc->ScoreDetails(&details); - cerr << "INIT RUN " << details << score << endl; - -} - - -int main(int argc, char** argv) { - register_feature_functions(); - SetSilent(true); // turn off verbose decoder output - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - - if (conf.count("random_seed")) - rng.reset(new MT19937(conf["random_seed"].as())); - else - rng.reset(new MT19937); - - vector corpus; - //ReadTrainingCorpus(conf["source"].as(), &corpus); - - const string metric_name = conf["mt_metric"].as(); - optimizer = conf["optimizer"].as(); - fear_select = conf["fear"].as(); - hope_select = conf["hope"].as(); - mt_metric_scale = conf["mt_metric_scale"].as(); - approx_score = conf.count("approx_score"); - no_reweight = conf.count("no_reweight"); - no_select = conf.count("no_select"); - update_list_size = conf["update_k_best"].as(); - unique_kbest = conf.count("unique_k_best"); - pseudo_doc = true; - - const string weights_dir = conf["weights_output"].as(); - const string output_dir = conf["output_dir"].as(); - ScoreType type = ScoreTypeFromString(metric_name); - - //establish metric used for tuning - if (type == TER) { - invert_score = true; - // approx_score = false; - } else { - invert_score = false; - } - - //load references - DocScorer ds(type, conf["reference"].as >(), ""); - cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; - vector corpus_bleu_sent_stats; - - //check 
training pass,if >0, then use previous iterations corpus bleu stats - cur_pass = conf["passes"].as(); - if(cur_pass > 0) - { - ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); - } - /* if (ds.size() != corpus.size()) { - cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; - return 1; - }*/ - cerr << "Optimizing with " << optimizer << endl; - // load initial weights - /*Weights weights; - weights.InitFromFile(conf["input_weights"].as()); - SparseVector lambdas; - weights.InitSparseVector(&lambdas); - */ - - - - ReadFile ini_rf(conf["decoder_config"].as()); - Decoder decoder(ini_rf.stream()); - - vector& dense_weights = decoder.CurrentWeightVector(); - - SparseVector lambdas; - Weights::InitFromFile(conf["input_weights"].as(), &dense_weights); - Weights::InitSparseVector(dense_weights, &lambdas); - - const string input = decoder.GetConf()["input"].as(); - //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); - if (!SILENT) cerr << "Reading input from " << ((input == "-") ? 
"STDIN" : input.c_str()) << endl; - ReadFile in_read(input); - istream *in = in_read.stream(); - assert(*in); - string buf; - - const double max_step_size = conf["max_step_size"].as(); - - - // assert(corpus.size() > 0); - vector oracles(ds.size()); - - TrainingObserver observer(conf["k_best_size"].as(), ds, &oracles, &corpus_bleu_sent_stats); - - int cur_sent = 0; - int lcount = 0; - double objective=0; - double tot_loss = 0; - int dots = 0; - // int cur_pass = 1; - // vector dense_weights; - SparseVector tot; - SparseVector final_tot; - // tot += lambdas; // initial weights - // lcount++; // count for initial weights - - //string msg = "# MIRA tuned weights"; - // while (cur_pass <= max_iteration) { - SparseVector old_lambdas = lambdas; - tot.clear(); - tot += lambdas; - cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; - ScoreP acc, acc_h, acc_f; - - while(*in) { - getline(*in, buf); - if (buf.empty()) continue; - //for (cur_sent = 0; cur_sent < corpus.size(); cur_sent++) { - - cerr << "SENT: " << cur_sent << endl; - //TODO: allow batch updating - //dense_weights.clear(); - //weights.InitFromVector(lambdas); - //weights.InitVector(&dense_weights); - //decoder.SetWeights(dense_weights); - lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; - decoder.SetId(cur_sent); - decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. 
- - cur_sent = observer.GetCurrentSent(); - const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); - const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; - const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; - - vector >& cur_good_v = oracles[cur_sent].good; - vector >& cur_bad_v = oracles[cur_sent].bad; - vector > cur_best_v = observer.GetCurrentBest(); - - tot_loss += cur_hyp.mt_metric; - - //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus - ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); - - ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); - if (!acc_h) { acc_h = hope_sentscore->GetZero(); } - acc_h->PlusEquals(*hope_sentscore); - - ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); - if (!acc_f) { acc_f = fear_sentscore->GetZero(); } - acc_f->PlusEquals(*fear_sentscore); - - if(optimizer == 4) { //single dual coordinate update, cur_good selected on BLEU score only (not model+BLEU) - // if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { - - double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); - double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); - const double loss = margin + mt_loss; - cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) < 0.0) { - SparseVector diff = cur_good.features; - diff -= cur_bad.features; - - double diffsqnorm = diff.l2norm_sq(); - double delta; - if (diffsqnorm > 0) - delta = loss / (diffsqnorm); - else - delta = 0; - - //double step_size = loss / diff.l2norm_sq(); - cerr << loss << " " << delta << " " << diff << endl; - if (delta > max_step_size) delta = max_step_size; - lambdas += (cur_good.features * delta); - lambdas -= (cur_bad.features * delta); - //cerr << "L: " 
<< lambdas << endl; - // } - // } - } - else if(optimizer == 1) //sgd - nonadapted step size - { - - lambdas += (cur_good.features) * max_step_size; - lambdas -= (cur_bad.features) * max_step_size; - } - //cerr << "L: " << lambdas << endl; - else if(optimizer == 5) //full mira with n-best list of constraints from oracle, fear, best - { - vector > cur_constraint; - cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); - cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); - cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); - - bool optimize_again; - vector > cur_pair; - //SMO - for(int u=0;u!=cur_constraint.size();u++) - cur_constraint[u]->alpha =0; - - cur_constraint[0]->alpha =1; //set oracle to alpha=1 - - cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; - int smo_iter = 10, smo_iter2 = 10; - int iter, iter2 =0; - bool DEBUG_SMO = false; - while (iter2 < smo_iter2) - { - iter =0; - while (iter < smo_iter) - { - optimize_again = true; - for (int i = 0; i< cur_constraint.size(); i++) - for (int j = i+1; j< cur_constraint.size(); j++) - { - if(DEBUG_SMO) cerr << "start " << i << " " << j << endl; - cur_pair.clear(); - cur_pair.push_back(cur_constraint[j]); - cur_pair.push_back(cur_constraint[i]); - double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); - - if (delta == 0) optimize_again = false; - // cur_pair[0]->alpha += delta; - // cur_pair[1]->alpha -= delta; - cur_constraint[j]->alpha += delta; - cur_constraint[i]->alpha -= delta; - double step_size = delta * max_step_size; - /*lambdas += (cur_pair[1]->features) * step_size; - lambdas -= (cur_pair[0]->features) * step_size;*/ - lambdas += (cur_constraint[i]->features) * step_size; - lambdas -= (cur_constraint[j]->features) * step_size; - if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha 
<< endl; - - //reload weights based on update - /*dense_weights.clear(); - weights.InitFromVector(lambdas); - weights.InitVector(&dense_weights);*/ - } - iter++; - - if(!optimize_again) - { - iter = 100; - cerr << "Optimization stopped, delta =0" << endl; - } - - - } - iter2++; - } - - - } - else if(optimizer == 2 || optimizer == 3) //1-fear and cutting plane mira - { - bool DEBUG_SMO= true; - vector > cur_constraint; - cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set - bool optimize_again = true; - int cut_plane_calls = 0; - while (optimize_again) - { - if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; - if(optimizer == 2){ //1-fear - cur_constraint.push_back(cur_bad_v[0]); - - //check if we have a violation - if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) - { - optimize_again = false; - cerr << "Constraint not violated" << endl; - } - } - else - { //cutting plane to add constraints - if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl; - optimize_again = false; - cut_plane_calls++; - CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights); - if (cut_plane_calls >= MAX_SMO) optimize_again = false; - } - - if(optimize_again) - { - //SMO - for(int u=0;u!=cur_constraint.size();u++) - { - cur_constraint[u]->alpha =0; - //cur_good_v[0]->alpha = 1; cur_bad_v[0]->alpha = 0; - } - cur_constraint[0]->alpha = 1; - cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; - int smo_iter = MAX_SMO; - int iter =0; - while (iter < smo_iter) - { - //select pair to optimize from constraint set - vector > cur_pair = SelectPair(&cur_constraint); - - if(cur_pair.empty()){iter=MAX_SMO; cerr << "Undefined pair " << endl; continue;} //pair is undefined so we are done with this smo - - //double num = cur_good_v[0]->fear - cur_bad_v[0]->fear; - /*double loss = cur_good_v[0]->oracle_loss - cur_bad_v[0]->oracle_loss; - double margin = 
cur_good_v[0]->oracle_feat_diff.dot(dense_weights) - cur_bad_v[0]->oracle_feat_diff.dot(dense_weights); - double num = loss - margin; - SparseVector diff = cur_good_v[0]->features; - diff -= cur_bad_v[0]->features; - double delta = num / (diff.l2norm_sq() * max_step_size); - delta = max(-cur_good_v[0]->alpha, min(delta, cur_bad_v[0]->alpha)); - cur_good_v[0]->alpha += delta; - cur_bad_v[0]->alpha -= delta; - double step_size = delta * max_step_size; - lambdas += (cur_bad_v[0]->features) * step_size; - lambdas -= (cur_good_v[0]->features) * step_size; - */ - - double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); - - cur_pair[0]->alpha += delta; - cur_pair[1]->alpha -= delta; - double step_size = delta * max_step_size; - /* lambdas += (cur_pair[1]->oracle_feat_diff) * step_size; - lambdas -= (cur_pair[0]->oracle_feat_diff) * step_size;*/ - - cerr << "step " << step_size << endl; - double alpha_sum=0; - SparseVector temp_lambdas = lambdas; - - for(int u=0;u!=cur_constraint.size();u++) - { - cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << endl; - temp_lambdas += (cur_constraint[u]->oracleN->features-cur_constraint[u]->features) * cur_constraint[u]->alpha * step_size; - alpha_sum += cur_constraint[u]->alpha; - } - cerr << "Alpha sum " << alpha_sum << " " << temp_lambdas << endl; - - lambdas += (cur_pair[1]->features) * step_size; - lambdas -= (cur_pair[0]->features) * step_size; - cerr << " Lambdas " << lambdas << endl; - //reload weights based on update - dense_weights.clear(); - //weights.InitFromVector(lambdas); - //weights.InitVector(&dense_weights); - lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; - iter++; - - if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; - // cerr << "SMO opt " << iter << " " << delta << " " << cur_good_v[0]->alpha << " " << cur_bad_v[0]->alpha << endl; - if(no_select) //don't use selection heuristic to 
determine when to stop SMO, rather just when delta =0 - if (delta == 0) iter = MAX_SMO; - - //only perform one dual coordinate ascent step - if(optimizer == 2) - { - optimize_again = false; - iter = MAX_SMO; - } - - } - if(optimizer == 3) - { - if(!no_reweight) - { - if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; - Hypergraph hg = observer.GetCurrentForest(); - hg.Reweight(dense_weights); - //observer.UpdateOracles(cur_sent, hg); - if(unique_kbest) - observer.UpdateOracles(cur_sent, hg); - else - observer.UpdateOracles > >(cur_sent, hg); - - - } - } - } - - - } - - //print objective after this sentence - double lambda_change = (lambdas - old_lambdas).l2norm_sq(); - double max_fear = cur_constraint[cur_constraint.size()-1]->fear; - double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear; - - for(int u=0;u!=cur_constraint.size();u++) - { - cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl; - temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear; - } - objective += temp_objective; - - cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl; - } - - - if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } - tot += lambdas; - ++lcount; - cur_sent++; - - cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; - - //clear good/bad lists from oracles for this sentences - you want to keep them around for things - - // oracles[cur_sent].good.clear(); - //oracles[cur_sent].bad.clear(); - } - - cerr << "FINAL OBJECTIVE: "<< objective << endl; - final_tot += tot; - cerr << "Translated " << lcount << " sentences " << endl; - cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; - tot_loss = 0; - /* - float corpus_score = acc->ComputeScore(); - string corpus_details; - 
acc->ScoreDetails(&corpus_details); - cerr << "MODEL " << corpus_details << endl; - cout << corpus_score << endl; - - corpus_score = acc_h->ComputeScore(); - acc_h->ScoreDetails(&corpus_details); - cerr << "HOPE " << corpus_details << endl; - cout << corpus_score << endl; - - corpus_score = acc_f->ComputeScore(); - acc_f->ScoreDetails(&corpus_details); - cerr << "FEAR " << corpus_details << endl; - cout << corpus_score << endl; - */ - int node_id = rng->next() * 100000; - cerr << " Writing weights to " << node_id << endl; - Weights::ShowLargestFeatures(dense_weights); - dots = 0; - ostringstream os; - os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; - string msg = "# MIRA tuned weights ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); - //Weights.InitFromVector(lambdas); - lambdas.init_vector(&dense_weights); - Weights::WriteToFile(os.str(), dense_weights, true, &msg); - - SparseVector x = tot; - x /= lcount; - ostringstream sa; - string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast(node_id) + " ||| " + boost::lexical_cast(lcount); - sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." 
<< node_id << "-avg.gz"; - //Weights ww; - //ww.InitFromVector(x); - x.init_vector(&dense_weights); - Weights::WriteToFile(sa.str(), dense_weights, true, &msga); - - //assign averaged lambdas to initialize next iteration - //lambdas = x; - - /* double lambda_change = (old_lambdas - lambdas).l2norm_sq(); - cerr << "Change in lambda " << lambda_change << endl; - - if ( lambda_change < EPSILON) - { - cur_pass = max_iteration; - cerr << "Weights converged - breaking" << endl; - } - - ++cur_pass; - */ - - //} iteration while loop - - /* cerr << endl; - weights.WriteToFile("weights.mira-final.gz", true, &msg); - final_tot /= (lcount + 1);//max_iteration); - tot /= (corpus.size() + 1); - weights.InitFromVector(final_tot); - cerr << tot << "||||" << final_tot << endl; - msg = "# MIRA tuned weights (averaged vector)"; - weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); - */ - cerr << "Optimization complete.\\AVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; - return 0; -} - diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl index f4d61407..90a4da0e 100755 --- a/training/mira/run_mira.pl +++ b/training/mira/run_mira.pl @@ -3,7 +3,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); -push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -11,51 +11,50 @@ use Getopt::Long; use IPC::Open2; use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - +my $default_jobs = env_default_jobs(); my $srcFile; my $refFiles; my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x 
$FAST_SCORE; my $iteration = 0.0; -my $max_iterations = 6; +my $max_iterations = 10; my $metric = "ibm_bleu"; my $iniFile; my $weights; my $initialWeights; -my $decode_nodes = 1; # number of decode nodes +my $jobs = $default_jobs; # number of decode nodes my $pmem = "1g"; my $dir; my $SCORER = $FAST_SCORE; -my $local_server = "$bin_dir/local_parallelize.pl"; -my $parallelize = "$bin_dir/../dpmert/parallelize.pl"; -my $libcall = "$bin_dir/../dpmert/libcall.pl"; -my $sentserver = "$bin_dir/../dpmert/sentserver"; -my $sentclient = "$bin_dir/../dpmert/sentclient"; -my $run_local_server = 0; + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; + my $run_local = 0; -my $usefork; my $pass_suffix = ''; -my $cdec ="$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; +my $cdec ="$bin_dir/kbest_cut_mira"; -#my $cdec ="$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; die "Can't find decoder in $cdec" unless -x $cdec; my $decoder = $cdec; my $decoderOpt; -my $update_size=250; +my $update_size; my $approx_score; my $kbest_size=250; my $metric_scale=1; my $optimizer=2; my $disable_clean = 0; -my $use_make; # use make to parallelize line search +my $use_make=0; my $density_prune; my $cpbin=1; my $help = 0; @@ -64,10 +63,10 @@ my $step_size = 0.01; my $gpref; my $unique_kbest; my $freeze; -my $latent; -my $sample_max; my $hopes=1; my $fears=1; +my $sent_approx=0; +my $pseudo_doc=0; my $range = 35000; my $minimum = 15000; @@ -78,15 +77,13 @@ my $portn = int(rand($range)) + $minimum; Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( "decoder=s" => \$decoderOpt, - "decode-nodes=i" => \$decode_nodes, + "jobs=i" => \$jobs, "density-prune=f" => \$density_prune, "dont-clean" => \$disable_clean, 
"pass-suffix=s" => \$pass_suffix, - "use-fork" => \$usefork, "epsilon=s" => \$epsilon, "help" => \$help, "local" => \$run_local, - "local_server" => \$run_local_server, "use-make=i" => \$use_make, "max-iterations=i" => \$max_iterations, "pmem=s" => \$pmem, @@ -102,10 +99,9 @@ if (GetOptions( "step-size=f" => \$step_size, "hope-select=i" => \$hopes, "fear-select=i" => \$fears, - "approx-score" => \$approx_score, + "sent-approx" => \$sent_approx, + "pseudo-doc" => \$pseudo_doc, "unique-kbest" => \$unique_kbest, - "latent" => \$latent, - "sample-max=i" => \$sample_max, "grammar-prefix=s" => \$gpref, "freeze" => \$freeze, "workdir=s" => \$dir, @@ -235,7 +231,9 @@ close F; my $lastPScore = 0; my $lastWeightsFile; - +my $bestScoreIter=-1; +my $bestScore=-1; +unless ($update_size){$update_size = $kbest_size;} # main optimization loop #while (1){ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { @@ -260,16 +258,16 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { my $weightsFile="$dir/weights.$opt_iter"; print "ITER $iteration " ; my $cur_pass = "-p 0$opt_iter"; - my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -a -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; + my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; if($unique_kbest){ $decoder_cmd .= " -u"; } - if($latent){ - $decoder_cmd .= " -l"; - } - if($sample_max){ - $decoder_cmd .= " -t $sample_max"; + if($sent_approx){ + $decoder_cmd .= " -a"; } + if($pseudo_doc){ + $decoder_cmd .= " -e"; + } if ($density_prune) { $decoder_cmd .= " --density_prune $density_prune"; } @@ -277,13 +275,11 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { if ($run_local) { $pcmd = "cat $srcFile |"; } elsif ($use_make) { - 
# TODO: Throw error when jobs is specified along with use_make + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; - } elsif ($run_local_server){ - $pcmd = "cat $srcFile | $local_server $usefork -p $pmem -e $logdir -n $decode_nodes --"; - } + } else { - $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --baseport $portn --"; + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --baseport $portn --"; } my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; print STDERR "COMMAND:\n$cmd\n"; @@ -291,14 +287,14 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { my $retries = 0; my $num_topbest; - while($retries < 5) { + while($retries < 6) { $num_topbest = check_output("wc -l < $runFile"); print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; if($devSize == $num_topbest) { last; } else { print STDERR "Incorrect number of topbest. 
Waiting for distributed filesystem and retrying...\n"; - sleep(3); + sleep(10); } $retries++; } @@ -320,12 +316,15 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { close RUN; close F; close B; close H; - my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -l $metric"); - my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -l $metric"); - my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -m $metric"); + my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -m $metric"); + my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f; print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n"; - + if ($dec_score> $bestScore){ + $bestScoreIter=$opt_iter; + $bestScore=$dec_score; + } # save space check_call("gzip -f $runFile"); check_call("gzip -f $decoderLog"); @@ -338,21 +337,11 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { $lastWeightsFile = "$dir/weights.$opt_iter"; average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir); -# check_call("cp $lastW $newWeightsFile"); -# if ($icc < 2) { -# print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; -# last; -# } system("gzip -f $logdir/kbes*"); print STDERR "\n==========\n"; $iteration++; } -#find -#my $cmd = `grep SCORE /fs/clip-galep5/lexical_tm/log.runmira.nist.20 | cat -n | sort -k +2 | tail -1`; -#$cmd =~ m/([0-9]+)/; -#$lastWeightsFile = "$dir/weights.$1"; -#check_call("ln -s $lastWeightsFile $dir/weights.tuned"); -print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; +print STDERR "\nBEST ITER: $bestScoreIter :: $bestScore\n\n\n"; print STDOUT "$lastWeightsFile\n"; @@ -409,7 +398,7 @@ sub write_config { print $fh "EVAL METRIC: $metric\n"; 
print $fh "START ITERATION: $iteration\n"; print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "DECODE NODES: $jobs\n"; print $fh "HEAD NODE: $host\n"; print $fh "PMEM (DECODING): $pmem\n"; print $fh "CLEANUP: $cleanup\n"; @@ -462,9 +451,87 @@ sub enseg { } sub print_help { - print "Something wrong\n"; + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete MIRA optimization using the ini file specified. + +Required: + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + --source-file + Dev set source file. + --weights + Initial weights file + +General options: + + --help + Print this message and exit. + + --max-iterations + Maximum number of iterations to run. If not specified, defaults + to $max_iterations. + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --workdir + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + --optimizer + Learning method to use for weight update. 
Choice are 1) SGD, 2) PA MIRA with Selection from Cutting Plane, 3) Cutting Plane MIRA, 4) PA MIRA,5) nbest MIRA with hope, fear, and model constraints + --metric-scale + Scale MT loss by this amount when computing hope/fear candidates + --kbest-size + Size of k-best list to extract from forest + --update-size + Size of k-best list to use for update (applies to optimizer 5) + --step-size + Controls aggresiveness of update (C) + --hope-select + How to select hope candidate. Choices are 1) model score - cost, 2) min cost + --fear-select + How to select fear candodate. Choices are 1) model score + cost, 2) max cost, 3) max score + --sent-approx + Use smoothed sentence-level MT metric + --pseudo-doc + Use pseudo document to approximate MT metric + --unique-kbest + Extract unique k-best from forest + --grammar-prefix + Path to sentence-specific grammar files + +Job control options: + + --jobs + Number of decoder processes to run in parallel. [default=$default_jobs] + + --pmem + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + + --local + Run single learner + --use-make + Run parallel learners on a single machine through fork. + + +Help } + sub cmdline { return join ' ',($0,@ORIG_ARGV); } -- cgit v1.2.3