diff options
-rw-r--r-- | training/mira/kbest_cut_mira.cc | 128 |
1 files changed, 36 insertions, 92 deletions
diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index 34eb00dc..7df9a18f 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -40,7 +40,7 @@ bool no_reweight; bool no_select; bool unique_kbest; int update_list_size; -vector<weight_t> dense_weights_g; +vector<weight_t> dense_w_local; double mt_metric_scale; int optimizer; int fear_select; @@ -170,7 +170,7 @@ bool FearCompareB(const HI& h1, const HI& h2 ) bool FearComparePred(const HI& h1, const HI& h2 ) { - return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); + return h1->features.dot(dense_w_local) > h2->features.dot(dense_w_local); }; bool HypothesisCompareG(const HI& h1, const HI& h2 ) @@ -203,12 +203,7 @@ void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vecto for(int u=0;u!=all_hyp.size();u++) { double t_score = all_hyp[u]->features.dot(dense_weights); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss - // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; - //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; } sort(all_hyp.begin(),all_hyp.end(),FearCompareB); @@ -238,24 +233,14 @@ double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_ { vector<shared_ptr<HypothesisInfo> >& cur_pair = *cur_p; double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; - //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? - //double num = loss - margin; - double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); const double num = margin + loss; cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <<endl; -/* double num = - (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) - - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); - */ - SparseVector<double> diff = cur_pair[0]->features; diff -= cur_pair[1]->features; - /* SparseVector<double> diff = cur_pair[0]->oracle_feat_diff; - diff -= cur_pair[1]->oracle_feat_diff;*/ double diffsqnorm = diff.l2norm_sq(); double delta; if (diffsqnorm > 0) @@ -264,7 +249,6 @@ double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_ delta = 0; cerr << " D1:" << delta; //clip delta (enforce margin constraints) - delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); cerr << " D2:" << delta; return delta; @@ -278,12 +262,12 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> vector<shared_ptr<HypothesisInfo> > pair; - if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira - // if(optimizer == 2) { + if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for pa-mira + pair.push_back(cur_constraint[0]); pair.push_back(cur_constraint[1]); return pair; - // } + } for(int u=0;u != cur_constraint.size();u++) @@ -299,8 +283,6 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> } if(!max_fear) return pair; // - if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; - if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) { @@ -310,8 +292,7 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> if (cur_constraint[i]->alpha > 0) { pair.push_back(cur_constraint[u]); - pair.push_back(cur_constraint[i]); - cerr << "RETJURN from 1" << endl; + pair.push_back(cur_constraint[i]); return pair; } } @@ -342,11 +323,10 @@ struct GoodBadOracle { struct TrainingObserver : public DecoderObserver { TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o, vector<ScoreP>* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { - // TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k) { - //calculate corpus bleu score from previous iterations 1-best for BLEU gain + if(!pseudo_doc && !sent_approx) - if(cur_pass > 0) + if(cur_pass > 0) //calculate corpus bleu score from previous iterations 1-best for BLEU gain { ScoreP acc; for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { @@ -357,7 +337,7 @@ struct TrainingObserver : public DecoderObserver { corpus_bleu_stats = acc; corpus_bleu_score = acc->ComputeScore(); } - //corpus_src_length = 0; + } const DocScorer& ds; vector<ScoreP>& corpus_bleu_sent_stats; @@ -396,9 +376,8 @@ struct TrainingObserver : public DecoderObserver { virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { cur_sent = smeta.GetSentenceID(); - //cerr << "SOURCE " << smeta.GetSourceLength() << endl; curr_src_length = (float) smeta.GetSourceLength(); - //UpdateOracles(smeta.GetSentenceID(), *hg); + if(unique_kbest) UpdateOracles<KBest::FilterUnique>(smeta.GetSentenceID(), *hg); else @@ -431,9 +410,8 @@ struct TrainingObserver : public DecoderObserver { typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,Filter> K; K kbest(forest,kbest_size); - //KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size); for (int i = 0; i < kbest_size; ++i) { - //const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + typename K::Derivation *d = kbest.LazyKthBest(forest.nodes_.size() - 1, i); if (!d) break; @@ -489,10 +467,9 @@ struct TrainingObserver : public DecoderObserver { corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); - cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; + cerr << "ps corpus size: " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl; } - //figure out how many hyps we can keep maximum int temp_update_size = update_list_size; if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} @@ -500,7 +477,8 @@ struct TrainingObserver : public DecoderObserver { //sort all hyps by sentscore (eg. bleu) sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); - if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } + if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) + cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_w_local) << endl; } if(hope_select == 1) { @@ -508,7 +486,7 @@ struct TrainingObserver : public DecoderObserver { if (PRINT_LIST) cerr << "HOPE " << endl; for(int u=0;u!=all_hyp.size();u++) { - double t_score = all_hyp[u]->features.dot(dense_weights_g); + double t_score = all_hyp[u]->features.dot(dense_w_local); all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; @@ -522,47 +500,38 @@ struct TrainingObserver : public DecoderObserver { cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} + //use hope for fear selection shared_ptr<HypothesisInfo>& oracleN = cur_good[0]; - if(fear_select == 1){ //compute fear hyps with model - bleu if (PRINT_LIST) cerr << "FEAR " << endl; - double hope_score = oracleN->features.dot(dense_weights_g); + double hope_score = oracleN->features.dot(dense_w_local); if (PRINT_LIST) cerr << "hope score " << hope_score << endl; for(int u=0;u!=all_hyp.size();u++) { - double t_score = all_hyp[u]->features.dot(dense_weights_g); - //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; - - /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss - all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; - all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ + double t_score = all_hyp[u]->features.dot(dense_w_local); all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; all_hyp[u]->oracleN=oracleN; - // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; } sort(all_hyp.begin(),all_hyp.end(),FearCompareB); - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); } else if(fear_select == 2) //select fear based on cost { - cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); - reverse(cur_bad.begin(),cur_bad.end()); + sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareG); } - else //pred-based, fear_select = 3 + else //max model score, also known as prediction-based { sort(all_hyp.begin(),all_hyp.end(),FearComparePred); - cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); } - + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); if(PRINT_LIST){ cerr<< "BAD"<<endl; for(int u=0;u!=cur_bad.size();u++) cerr << cur_bad[u]->mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} @@ -616,13 +585,12 @@ void ReadPastTranslationForScore(const int cur_pass, vector<ScoreP>* c, DocScore ++lc; } - assert(lc > 0); float score = acc->ComputeScore(); string details; acc->ScoreDetails(&details); - cerr << "INIT RUN " << details << score << endl; + cerr << "Previous run: " << details << score << endl; } @@ -640,7 +608,6 @@ int main(int argc, char** argv) { rng.reset(new MT19937); vector<string> corpus; - //ReadTrainingCorpus(conf["source"].as<string>(), &corpus); const string metric_name = conf["mt_metric"].as<string>(); optimizer = conf["optimizer"].as<int>(); @@ -654,7 +621,7 @@ int main(int argc, char** argv) { unique_kbest = conf.count("unique_k_best"); pseudo_doc = conf.count("pseudo_doc"); sent_approx = conf.count("sent_approx"); - cerr << "PSEUDO " << pseudo_doc << " SENT " << sent_approx << endl; + cerr << "Using pseudo-doc:" << pseudo_doc << " Sent:" << sent_approx << endl; if(pseudo_doc) mt_metric_scale=1; @@ -665,7 +632,6 @@ int main(int argc, char** argv) { //establish metric used for tuning if (type == TER) { invert_score = true; - // approx_score = false; } else { invert_score = false; } @@ -681,20 +647,9 @@ int main(int argc, char** argv) { { ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); } - /* if (ds.size() != corpus.size()) { - cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; - return 1; - }*/ - cerr << "Optimizing with " << optimizer << endl; - // load initial weights - /*Weights weights; - weights.InitFromFile(conf["input_weights"].as<string>()); - SparseVector<double> lambdas; - weights.InitSparseVector(&lambdas); - */ - - + cerr << "Using optimizer:" << optimizer << endl; + ReadFile ini_rf(conf["decoder_config"].as<string>()); Decoder decoder(ini_rf.stream()); @@ -705,7 +660,6 @@ int main(int argc, char** argv) { Weights::InitSparseVector(dense_weights, &lambdas); const string input = decoder.GetConf()["input"].as<string>(); - //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; ReadFile in_read(input); istream *in = in_read.stream(); @@ -714,8 +668,6 @@ int main(int argc, char** argv) { const double max_step_size = conf["max_step_size"].as<double>(); - - // assert(corpus.size() > 0); vector<GoodBadOracle> oracles(ds.size()); TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles, &corpus_bleu_sent_stats); @@ -725,27 +677,21 @@ int main(int argc, char** argv) { double objective=0; double tot_loss = 0; int dots = 0; - // int cur_pass = 1; - // vector<double> dense_weights; SparseVector<double> tot; SparseVector<double> final_tot; - // tot += lambdas; // initial weights - // lcount++; // count for initial weights - - //string msg = "# MIRA tuned weights"; - // while (cur_pass <= max_iteration) { - SparseVector<double> old_lambdas = lambdas; - tot.clear(); - tot += lambdas; - cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; - ScoreP acc, acc_h, acc_f; - - while(*in) { + + SparseVector<double> old_lambdas = lambdas; + tot.clear(); + tot += lambdas; + cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; + ScoreP acc, acc_h, acc_f; + + while(*in) { getline(*in, buf); if (buf.empty()) continue; //TODO: allow batch updating lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; + dense_w_local = dense_weights; decoder.SetId(cur_sent); decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. @@ -922,7 +868,7 @@ int main(int argc, char** argv) { dense_weights.clear(); lambdas.init_vector(&dense_weights); - dense_weights_g = dense_weights; + dense_w_local = dense_weights; iter++; if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; @@ -991,19 +937,17 @@ int main(int argc, char** argv) { ostringstream os; os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); - //Weights.InitFromVector(lambdas); lambdas.init_vector(&dense_weights); Weights::WriteToFile(os.str(), dense_weights, true, &msg); SparseVector<double> x = tot; - x /= lcount; + x /= lcount+1; ostringstream sa; string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; x.init_vector(&dense_weights); Weights::WriteToFile(sa.str(), dense_weights, true, &msga); - cerr << "Optimization complete.\n"; return 0; } |