diff options
Diffstat (limited to 'training/mira')
-rw-r--r-- | training/mira/Makefile.am | 7 | ||||
-rw-r--r-- | training/mira/kbest_cut_mira.cc (renamed from training/mira/kbest_mirav5.cc) | 256 | ||||
-rwxr-xr-x | training/mira/run_mira.pl | 181 |
3 files changed, 189 insertions, 255 deletions
diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am index fa4fb22d..8cddc2d7 100644 --- a/training/mira/Makefile.am +++ b/training/mira/Makefile.am @@ -1,6 +1,11 @@ -bin_PROGRAMS = kbest_mira +bin_PROGRAMS = kbest_mira \ + kbest_cut_mira kbest_mira_SOURCES = kbest_mira.cc kbest_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + +kbest_cut_mira_SOURCES = kbest_cut_mira.cc +kbest_cut_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/mira/kbest_mirav5.cc b/training/mira/kbest_cut_mira.cc index cea5cf67..34eb00dc 100644 --- a/training/mira/kbest_mirav5.cc +++ b/training/mira/kbest_cut_mira.cc @@ -45,8 +45,9 @@ double mt_metric_scale; int optimizer; int fear_select; int hope_select; - bool pseudo_doc; +bool sent_approx; +bool checkloss; void SanityCheck(const vector<double>& w) { for (int i = 0; i < w.size(); ++i) { @@ -82,16 +83,17 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input_weights,w",po::value<string>(),"Input feature weights file") ("source,i",po::value<string>(),"Source file for development set") - ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data") + ("pass,p", po::value<int>()->default_value(15), "Current pass through the training data") ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") - ("optimizer,o",po::value<int>()->default_value(1), "Optimizer (sgd=1, mira 1-fear=2, full mira w/ cutting plane=3, full mira w/ nbest list=5, local update=4)") - ("fear,f",po::value<int>()->default_value(1), "Fear selection (model-cost=1, max-cost=2, pred-base=3)") - ("hope,h",po::value<int>()->default_value(1), "Hope selection (model+cost=1, max-cost=2, local-cost=3)") + ("optimizer,o",po::value<int>()->default_value(1), "Optimizer (SGD=1, PA MIRA w/Delta=2, Cutting Plane MIRA=3, PA MIRA=4, Triple nbest list MIRA=5)") + ("fear,f",po::value<int>()->default_value(1), "Fear selection (model-cost=1, maxcost=2, maxscore=3)") + ("hope,h",po::value<int>()->default_value(1), "Hope selection (model+cost=1, mincost=2)") ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)") ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by") - ("approx_score,a", "Use smoothed sentence-level BLEU score for approximate scoring") + ("sent_approx,a", "Use smoothed sentence-level BLEU score for approximate scoring") + ("pseudo_doc,e", "Use pseudo-document BLEU score for approximate scoring") ("no_reweight,d","Do not reweight forest for cutting plane") ("no_select,n", "Do not use selection heuristic") ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles") @@ -190,8 +192,6 @@ void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vecto { double t_score = all_hyp[u]->features.dot(dense_weights); all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; - //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; - } //sort hyps by hope score @@ -209,8 +209,6 @@ void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vecto // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; - //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; - } sort(all_hyp.begin(),all_hyp.end(),FearCompareB); @@ -248,13 +246,7 @@ double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_ const double num = margin + loss; cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <<endl; - // double margin = cur_pair[1]->features.dot(dense_weights) - cur_pair[0]->features.dot(dense_weights); - // double loss = cur_pair[1]->oracle_loss; //good.mt_metric - cur_bad.mt_metric); - //const double num = margin + loss; - - //cerr << "Compute Delta " << loss << " " << margin << " "; - // double margin = cur_pair[0]->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? /* double num = (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); @@ -353,7 +345,7 @@ struct TrainingObserver : public DecoderObserver { // TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k) { //calculate corpus bleu score from previous iterations 1-best for BLEU gain - if(!pseudo_doc) + if(!pseudo_doc && !sent_approx) if(cur_pass > 0) { ScoreP acc; @@ -447,10 +439,7 @@ struct TrainingObserver : public DecoderObserver { if (!d) break; float sentscore; - if(approx_score) - { - - if(cur_pass > 0 && !pseudo_doc) + if(cur_pass > 0 && !pseudo_doc && !sent_approx) { ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); @@ -461,11 +450,8 @@ struct TrainingObserver : public DecoderObserver { //compute gain from new sentence in 1-best corpus sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); } - else if(pseudo_doc) + else if(pseudo_doc) //pseudo-corpus smoothing { - //cerr << "CORP:" << corpus_bleu_score << " NEW:" << sent_stats->ComputeScore() << " sentscore:" << sentscore << endl; - - //-----pseudo-corpus approach float src_scale = corpus_src_length + curr_src_length; ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();} @@ -474,34 +460,20 @@ struct TrainingObserver : public DecoderObserver { sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); } - else + else //use sentence-level smoothing ( used when cur_pass=0 if not pseudo_doc) { - //cerr << "Using sentence-level approximation - PASS - " << boost::lexical_cast<std::string>(cur_pass) << endl; - //approx style of computation, used for 0th iteration - sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeSentScore()); - - //use pseudo-doc + + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); } - - - } - else - { - sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); - } - + if (invert_score) sentscore *= -1.0; - //cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << " " << approx_sentscore << endl; - + if (i < update_list_size){ - if (i == 0) //take cur best and add its bleu statistics counts to the pseudo-doc - { } if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); } - all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract oracle best and worst - + all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract hope and fear } if(pseudo_doc){ @@ -511,19 +483,13 @@ struct TrainingObserver : public DecoderObserver { ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); corpus_bleu_stats->PlusEquals(*sent_stats); - sent_stats->ScoreDetails(&details); - - sent_stats = corpus_bleu_stats; corpus_bleu_stats = sent_stats->GetZero(); corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); - - + corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); - cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n " << details2 << endl; - - + cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; } @@ -531,12 +497,11 @@ struct TrainingObserver : public DecoderObserver { int temp_update_size = update_list_size; if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} - //sort all hyps by sentscore (bleu) + //sort all hyps by sentscore (eg. bleu) sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } - //if(optimizer != 4 ) if(hope_select == 1) { //find hope hypothesis using model + bleu @@ -551,24 +516,19 @@ struct TrainingObserver : public DecoderObserver { //sort hyps by hope score sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); - } - + } //assign cur_good the sorted list cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} - /* if (!cur_oracle) { cur_oracle = cur_good[0]; - cerr << "Set oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } - else { - cerr << "Stay oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } */ shared_ptr<HypothesisInfo>& oracleN = cur_good[0]; - //if(optimizer != 4){ - if(fear_select == 1){ - //compute fear hyps + + + if(fear_select == 1){ //compute fear hyps with model - bleu if (PRINT_LIST) cerr << "FEAR " << endl; double hope_score = oracleN->features.dot(dense_weights_g); - //double hope_score = cur_oracle->features.dot(dense_weights); + if (PRINT_LIST) cerr << "hope score " << hope_score << endl; for(int u=0;u!=all_hyp.size();u++) { @@ -692,7 +652,11 @@ int main(int argc, char** argv) { no_select = conf.count("no_select"); update_list_size = conf["update_k_best"].as<int>(); unique_kbest = conf.count("unique_k_best"); - pseudo_doc = true; + pseudo_doc = conf.count("pseudo_doc"); + sent_approx = conf.count("sent_approx"); + cerr << "PSEUDO " << pseudo_doc << " SENT " << sent_approx << endl; + if(pseudo_doc) + mt_metric_scale=1; const string weights_dir = conf["weights_output"].as<string>(); const string output_dir = conf["output_dir"].as<string>(); @@ -712,7 +676,7 @@ int main(int argc, char** argv) { vector<ScoreP> corpus_bleu_sent_stats; //check training pass,if >0, then use previous iterations corpus bleu stats - cur_pass = conf["passes"].as<int>(); + cur_pass = conf["pass"].as<int>(); if(cur_pass > 0) { ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); @@ -779,20 +743,14 @@ int main(int argc, char** argv) { while(*in) { getline(*in, buf); if (buf.empty()) continue; - //for (cur_sent = 0; cur_sent < corpus.size(); cur_sent++) { - - cerr << "SENT: " << cur_sent << endl; //TODO: allow batch updating - //dense_weights.clear(); - //weights.InitFromVector(lambdas); - //weights.InitVector(&dense_weights); - //decoder.SetWeights(dense_weights); lambdas.init_vector(&dense_weights); dense_weights_g = dense_weights; decoder.SetId(cur_sent); decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. cur_sent = observer.GetCurrentSent(); + cerr << "SENT: " << cur_sent << endl; const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; @@ -816,14 +774,13 @@ int main(int argc, char** argv) { if (!acc_f) { acc_f = fear_sentscore->GetZero(); } acc_f->PlusEquals(*fear_sentscore); - if(optimizer == 4) { //single dual coordinate update, cur_good selected on BLEU score only (not model+BLEU) - // if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { + if(optimizer == 4) { //passive-aggresive update (single dual coordinate step) double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); const double loss = margin + mt_loss; cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) <<endl; - // if (loss > 0.0) { + if (loss > 0.0 || !checkloss) { SparseVector<double> diff = cur_good.features; diff -= cur_bad.features; @@ -834,14 +791,11 @@ int main(int argc, char** argv) { else delta = 0; - //double step_size = loss / diff.l2norm_sq(); - cerr << loss << " " << delta << " " << diff << endl; if (delta > max_step_size) delta = max_step_size; lambdas += (cur_good.features * delta); lambdas -= (cur_bad.features * delta); - //cerr << "L: " << lambdas << endl; - // } - // } + + } } else if(optimizer == 1) //sgd - nonadapted step size { @@ -849,8 +803,7 @@ int main(int argc, char** argv) { lambdas += (cur_good.features) * max_step_size; lambdas -= (cur_bad.features) * max_step_size; } - //cerr << "L: " << lambdas << endl; - else if(optimizer == 5) //full mira with n-best list of constraints from oracle, fear, best + else if(optimizer == 5) //full mira with n-best list of constraints from hope, fear, model best { vector<shared_ptr<HypothesisInfo> > cur_constraint; cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); @@ -866,7 +819,7 @@ int main(int argc, char** argv) { cur_constraint[0]->alpha =1; //set oracle to alpha=1 cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; - int smo_iter = 10, smo_iter2 = 10; + int smo_iter = MAX_SMO, smo_iter2 = MAX_SMO; int iter, iter2 =0; bool DEBUG_SMO = false; while (iter2 < smo_iter2) @@ -885,38 +838,26 @@ int main(int argc, char** argv) { double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); if (delta == 0) optimize_again = false; - // cur_pair[0]->alpha += delta; - // cur_pair[1]->alpha -= delta; cur_constraint[j]->alpha += delta; cur_constraint[i]->alpha -= delta; double step_size = delta * max_step_size; - /*lambdas += (cur_pair[1]->features) * step_size; - lambdas -= (cur_pair[0]->features) * step_size;*/ + lambdas += (cur_constraint[i]->features) * step_size; lambdas -= (cur_constraint[j]->features) * step_size; if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; - - //reload weights based on update - /*dense_weights.clear(); - weights.InitFromVector(lambdas); - weights.InitVector(&dense_weights);*/ } iter++; if(!optimize_again) { - iter = 100; + iter = MAX_SMO; cerr << "Optimization stopped, delta =0" << endl; - } - - + } } iter2++; - } - - + } } - else if(optimizer == 2 || optimizer == 3) //1-fear and cutting plane mira + else if(optimizer == 2 || optimizer == 3) //PA and Cutting Plane MIRA update { bool DEBUG_SMO= true; vector<shared_ptr<HypothesisInfo> > cur_constraint; @@ -926,7 +867,7 @@ int main(int argc, char** argv) { while (optimize_again) { if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; - if(optimizer == 2){ //1-fear + if(optimizer == 2){ //PA cur_constraint.push_back(cur_bad_v[0]); //check if we have a violation @@ -951,10 +892,9 @@ int main(int argc, char** argv) { for(int u=0;u!=cur_constraint.size();u++) { cur_constraint[u]->alpha =0; - //cur_good_v[0]->alpha = 1; cur_bad_v[0]->alpha = 0; } cur_constraint[0]->alpha = 1; - cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + cerr <<" Optimizing with " << cur_constraint.size() << " constraints" << endl; int smo_iter = MAX_SMO; int iter =0; while (iter < smo_iter) @@ -962,56 +902,30 @@ int main(int argc, char** argv) { //select pair to optimize from constraint set vector<shared_ptr<HypothesisInfo> > cur_pair = SelectPair(&cur_constraint); - if(cur_pair.empty()){iter=MAX_SMO; cerr << "Undefined pair " << endl; continue;} //pair is undefined so we are done with this smo - - //double num = cur_good_v[0]->fear - cur_bad_v[0]->fear; - /*double loss = cur_good_v[0]->oracle_loss - cur_bad_v[0]->oracle_loss; - double margin = cur_good_v[0]->oracle_feat_diff.dot(dense_weights) - cur_bad_v[0]->oracle_feat_diff.dot(dense_weights); - double num = loss - margin; - SparseVector<double> diff = cur_good_v[0]->features; - diff -= cur_bad_v[0]->features; - double delta = num / (diff.l2norm_sq() * max_step_size); - delta = max(-cur_good_v[0]->alpha, min(delta, cur_bad_v[0]->alpha)); - cur_good_v[0]->alpha += delta; - cur_bad_v[0]->alpha -= delta; - double step_size = delta * max_step_size; - lambdas += (cur_bad_v[0]->features) * step_size; - lambdas -= (cur_good_v[0]->features) * step_size; - */ - + if(cur_pair.empty()){ + iter=MAX_SMO; + cerr << "Undefined pair " << endl; + continue; + } //pair is undefined so we are done with this smo + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); cur_pair[0]->alpha += delta; cur_pair[1]->alpha -= delta; double step_size = delta * max_step_size; - /* lambdas += (cur_pair[1]->oracle_feat_diff) * step_size; - lambdas -= (cur_pair[0]->oracle_feat_diff) * step_size;*/ - cerr << "step " << step_size << endl; - double alpha_sum=0; - SparseVector<double> temp_lambdas = lambdas; - - for(int u=0;u!=cur_constraint.size();u++) - { - cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << endl; - temp_lambdas += (cur_constraint[u]->oracleN->features-cur_constraint[u]->features) * cur_constraint[u]->alpha * step_size; - alpha_sum += cur_constraint[u]->alpha; - } - cerr << "Alpha sum " << alpha_sum << " " << temp_lambdas << endl; - + lambdas += (cur_pair[1]->features) * step_size; lambdas -= (cur_pair[0]->features) * step_size; cerr << " Lambdas " << lambdas << endl; //reload weights based on update + dense_weights.clear(); - //weights.InitFromVector(lambdas); - //weights.InitVector(&dense_weights); lambdas.init_vector(&dense_weights); dense_weights_g = dense_weights; iter++; if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; - // cerr << "SMO opt " << iter << " " << delta << " " << cur_good_v[0]->alpha << " " << cur_bad_v[0]->alpha << endl; if(no_select) //don't use selection heuristic to determine when to stop SMO, rather just when delta =0 if (delta == 0) iter = MAX_SMO; @@ -1020,28 +934,23 @@ int main(int argc, char** argv) { { optimize_again = false; iter = MAX_SMO; - } - + } } if(optimizer == 3) { - if(!no_reweight) + if(!no_reweight) //reweight the forest and select a new k-best { if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; Hypergraph hg = observer.GetCurrentForest(); hg.Reweight(dense_weights); - //observer.UpdateOracles(cur_sent, hg); if(unique_kbest) observer.UpdateOracles<KBest::FilterUnique>(cur_sent, hg); else - observer.UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(cur_sent, hg); - - + observer.UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(cur_sent, hg); } } } - } //print objective after this sentence @@ -1067,10 +976,6 @@ int main(int argc, char** argv) { cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; - //clear good/bad lists from oracles for this sentences - you want to keep them around for things - - // oracles[cur_sent].good.clear(); - //oracles[cur_sent].bad.clear(); } cerr << "FINAL OBJECTIVE: "<< objective << endl; @@ -1078,23 +983,7 @@ int main(int argc, char** argv) { cerr << "Translated " << lcount << " sentences " << endl; cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; tot_loss = 0; - /* - float corpus_score = acc->ComputeScore(); - string corpus_details; - acc->ScoreDetails(&corpus_details); - cerr << "MODEL " << corpus_details << endl; - cout << corpus_score << endl; - - corpus_score = acc_h->ComputeScore(); - acc_h->ScoreDetails(&corpus_details); - cerr << "HOPE " << corpus_details << endl; - cout << corpus_score << endl; - - corpus_score = acc_f->ComputeScore(); - acc_f->ScoreDetails(&corpus_details); - cerr << "FEAR " << corpus_details << endl; - cout << corpus_score << endl; - */ + int node_id = rng->next() * 100000; cerr << " Writing weights to " << node_id << endl; Weights::ShowLargestFeatures(dense_weights); @@ -1111,38 +1000,11 @@ int main(int argc, char** argv) { ostringstream sa; string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; - //Weights ww; - //ww.InitFromVector(x); x.init_vector(&dense_weights); Weights::WriteToFile(sa.str(), dense_weights, true, &msga); - - //assign averaged lambdas to initialize next iteration - //lambdas = x; - - /* double lambda_change = (old_lambdas - lambdas).l2norm_sq(); - cerr << "Change in lambda " << lambda_change << endl; - if ( lambda_change < EPSILON) - { - cur_pass = max_iteration; - cerr << "Weights converged - breaking" << endl; - } - - ++cur_pass; - */ - //} iteration while loop - - /* cerr << endl; - weights.WriteToFile("weights.mira-final.gz", true, &msg); - final_tot /= (lcount + 1);//max_iteration); - tot /= (corpus.size() + 1); - weights.InitFromVector(final_tot); - cerr << tot << "||||" << final_tot << endl; - msg = "# MIRA tuned weights (averaged vector)"; - weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); - */ - cerr << "Optimization complete.\\AVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; - return 0; + cerr << "Optimization complete.\n"; + return 0; } diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl index f4d61407..90a4da0e 100755 --- a/training/mira/run_mira.pl +++ b/training/mira/run_mira.pl @@ -3,7 +3,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); -push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -11,51 +11,50 @@ use Getopt::Long; use IPC::Open2; use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - +my $default_jobs = env_default_jobs(); my $srcFile; my $refFiles; my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $iteration = 0.0; -my $max_iterations = 6; +my $max_iterations = 10; my $metric = "ibm_bleu"; my $iniFile; my $weights; my $initialWeights; -my $decode_nodes = 1; # number of decode nodes +my $jobs = $default_jobs; # number of decode nodes my $pmem = "1g"; my $dir; my $SCORER = $FAST_SCORE; -my $local_server = "$bin_dir/local_parallelize.pl"; -my $parallelize = "$bin_dir/../dpmert/parallelize.pl"; -my $libcall = "$bin_dir/../dpmert/libcall.pl"; -my $sentserver = "$bin_dir/../dpmert/sentserver"; -my $sentclient = "$bin_dir/../dpmert/sentclient"; -my $run_local_server = 0; + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; + my $run_local = 0; -my $usefork; my $pass_suffix = ''; -my $cdec ="$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; +my $cdec ="$bin_dir/kbest_cut_mira"; -#my $cdec ="$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; die "Can't find decoder in $cdec" unless -x $cdec; my $decoder = $cdec; my $decoderOpt; -my $update_size=250; +my $update_size; my $approx_score; my $kbest_size=250; my $metric_scale=1; my $optimizer=2; my $disable_clean = 0; -my $use_make; # use make to parallelize line search +my $use_make=0; my $density_prune; my $cpbin=1; my $help = 0; @@ -64,10 +63,10 @@ my $step_size = 0.01; my $gpref; my $unique_kbest; my $freeze; -my $latent; -my $sample_max; my $hopes=1; my $fears=1; +my $sent_approx=0; +my $pseudo_doc=0; my $range = 35000; my $minimum = 15000; @@ -78,15 +77,13 @@ my $portn = int(rand($range)) + $minimum; Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( "decoder=s" => \$decoderOpt, - "decode-nodes=i" => \$decode_nodes, + "jobs=i" => \$jobs, "density-prune=f" => \$density_prune, "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, - "use-fork" => \$usefork, "epsilon=s" => \$epsilon, "help" => \$help, "local" => \$run_local, - "local_server" => \$run_local_server, "use-make=i" => \$use_make, "max-iterations=i" => \$max_iterations, "pmem=s" => \$pmem, @@ -102,10 +99,9 @@ if (GetOptions( "step-size=f" => \$step_size, "hope-select=i" => \$hopes, "fear-select=i" => \$fears, - "approx-score" => \$approx_score, + "sent-approx" => \$sent_approx, + "pseudo-doc" => \$pseudo_doc, "unique-kbest" => \$unique_kbest, - "latent" => \$latent, - "sample-max=i" => \$sample_max, "grammar-prefix=s" => \$gpref, "freeze" => \$freeze, "workdir=s" => \$dir, @@ -235,7 +231,9 @@ close F; my $lastPScore = 0; my $lastWeightsFile; - +my $bestScoreIter=-1; +my $bestScore=-1; +unless ($update_size){$update_size = $kbest_size;} # main optimization loop #while (1){ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { @@ -260,16 +258,16 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { my $weightsFile="$dir/weights.$opt_iter"; print "ITER $iteration " ; my $cur_pass = "-p 0$opt_iter"; - my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -a -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; + my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; if($unique_kbest){ $decoder_cmd .= " -u"; } - if($latent){ - $decoder_cmd .= " -l"; - } - if($sample_max){ - $decoder_cmd .= " -t $sample_max"; + if($sent_approx){ + $decoder_cmd .= " -a"; } + if($pseudo_doc){ + $decoder_cmd .= " -e"; + } if ($density_prune) { $decoder_cmd .= " --density_prune $density_prune"; } @@ -277,13 +275,11 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { if ($run_local) { $pcmd = "cat $srcFile |"; } elsif ($use_make) { - # TODO: Throw error when decode_nodes is specified along with use_make + # TODO: Throw error when jobs is speong with use_make $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; - } elsif ($run_local_server){ - $pcmd = "cat $srcFile | $local_server $usefork -p $pmem -e $logdir -n $decode_nodes --"; - } + } else { - $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --baseport $portn --"; + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --baseport $portn --"; } my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; print STDERR "COMMAND:\n$cmd\n"; @@ -291,14 +287,14 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { my $retries = 0; my $num_topbest; - while($retries < 5) { + while($retries < 6) { $num_topbest = check_output("wc -l < $runFile"); print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; if($devSize == $num_topbest) { last; } else { print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n"; - sleep(3); + sleep(10); } $retries++; } @@ -320,12 +316,15 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { close RUN; close F; close B; close H; - my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -l $metric"); - my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -l $metric"); - my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -m $metric"); + my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -m $metric"); + my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f; print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n"; - + if ($dec_score> $bestScore){ + $bestScoreIter=$opt_iter; + $bestScore=$dec_score; + } # save space check_call("gzip -f $runFile"); check_call("gzip -f $decoderLog"); @@ -338,21 +337,11 @@ for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { $lastWeightsFile = "$dir/weights.$opt_iter"; average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir); -# check_call("cp $lastW $newWeightsFile"); -# if ($icc < 2) { -# print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; -# last; -# } system("gzip -f $logdir/kbes*"); print STDERR "\n==========\n"; $iteration++; } -#find -#my $cmd = `grep SCORE /fs/clip-galep5/lexical_tm/log.runmira.nist.20 | cat -n | sort -k +2 | tail -1`; -#$cmd =~ m/([0-9]+)/; -#$lastWeightsFile = "$dir/weights.$1"; -#check_call("ln -s $lastWeightsFile $dir/weights.tuned"); -print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; +print STDERR "\nBEST ITER: $bestScoreIter :: $bestScore\n\n\n"; print STDOUT "$lastWeightsFile\n"; @@ -409,7 +398,7 @@ sub write_config { print $fh "EVAL METRIC: $metric\n"; print $fh "START ITERATION: $iteration\n"; print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "DECODE NODES: $jobs\n"; print $fh "HEAD NODE: $host\n"; print $fh "PMEM (DECODING): $pmem\n"; print $fh "CLEANUP: $cleanup\n"; @@ -462,9 +451,87 @@ sub enseg { } sub print_help { - print "Something wrong\n"; + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable [options] <ini file> + Runs a complete MIRA optimization using the ini file specified. + +Required: + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + --source-file <file> + Dev set source file. + --weights <file> + Initial weights file + +General options: + + --help + Print this message and exit. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $max_iterations. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + --optimizer <I> + Learning method to use for weight update. Choice are 1) SGD, 2) PA MIRA with Selection from Cutting Plane, 3) Cutting Plane MIRA, 4) PA MIRA,5) nbest MIRA with hope, fear, and model constraints + --metric-scale <I> + Scale MT loss by this amount when computing hope/fear candidates + --kbest-size <I> + Size of k-best list to extract from forest + --update-size <I> + Size of k-best list to use for update (applies to optimizer 5) + --step-size <F> + Controls aggresiveness of update (C) + --hope-select<I> + How to select hope candidate. Choices are 1) model score - cost, 2) min cost + --fear-select <I> + How to select fear candodate. Choices are 1) model score + cost, 2) max cost, 3) max score + --sent-approx + Use smoothed sentence-level MT metric + --pseudo-doc + Use pseudo document to approximate MT metric + --unique-kbest + Extract unique k-best from forest + --grammar-prefix <path> + Path to sentence-specific grammar files + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + + --local + Run single learner + --use-make <I> + Run parallel learners on a single machine through fork. + + +Help } + sub cmdline { return join ' ',($0,@ORIG_ARGV); } |