author     Vladimir Eidelman <vlad@umiacs.umd.edu>   2013-04-13 22:21:04 -0400
committer  Vladimir Eidelman <vlad@umiacs.umd.edu>   2013-04-13 22:21:04 -0400
commit     4ae5f91f2d01d6b6824086b50eca91106232a04d (patch)
tree       7d1dd5372d824e741baa23368d0eb9b21ae058d0 /training/mira
parent     58fad6230fc6c7b60add2382bf4afe55b9205c1a (diff)
cleanup mira
Diffstat (limited to 'training/mira')
-rw-r--r--  training/mira/kbest_cut_mira.cc | 128
1 file changed, 36 insertions(+), 92 deletions(-)
diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc
index 34eb00dc..7df9a18f 100644
--- a/training/mira/kbest_cut_mira.cc
+++ b/training/mira/kbest_cut_mira.cc
@@ -40,7 +40,7 @@ bool no_reweight;
bool no_select;
bool unique_kbest;
int update_list_size;
-vector<weight_t> dense_weights_g;
+vector<weight_t> dense_w_local;
double mt_metric_scale;
int optimizer;
int fear_select;
@@ -170,7 +170,7 @@ bool FearCompareB(const HI& h1, const HI& h2 )
bool FearComparePred(const HI& h1, const HI& h2 )
{
- return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g);
+ return h1->features.dot(dense_w_local) > h2->features.dot(dense_w_local);
};
bool HypothesisCompareG(const HI& h1, const HI& h2 )
@@ -203,12 +203,7 @@ void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vecto
for(int u=0;u!=all_hyp.size();u++)
{
double t_score = all_hyp[u]->features.dot(dense_weights);
- //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score;
-
all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss
- // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric;
- //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features;
- // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score;
}
sort(all_hyp.begin(),all_hyp.end(),FearCompareB);
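
[Note: for reference, the retained "relative loss" line computes the slack of each hypothesis's margin constraint against the hope hypothesis h_0 = all_hyp[0]. Writing s(h) = \mathbf{w}^\top\mathbf{f}(h) for the model score and B(h) for the sentence-level metric, and with hope_score = s(h_0) as computed just above this loop (outside the hunk), it is

  \mathrm{fear}(u) \;=\; \underbrace{B(h_0) - B(h_u)}_{\text{relative loss}} \;-\; \underbrace{\big(s(h_0) - s(h_u)\big)}_{\text{margin}}

so sorting descending with FearCompareB puts the most-violated constraint first, which is exactly the constraint a cutting-plane iteration should add next.]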
@@ -238,24 +233,14 @@ double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_
{
vector<shared_ptr<HypothesisInfo> >& cur_pair = *cur_p;
double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss;
- //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff?
- //double num = loss - margin;
-
double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights));
const double num = margin + loss;
cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <<endl;
-/* double num =
- (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights))
- - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights));
- */
-
SparseVector<double> diff = cur_pair[0]->features;
diff -= cur_pair[1]->features;
- /* SparseVector<double> diff = cur_pair[0]->oracle_feat_diff;
- diff -= cur_pair[1]->oracle_feat_diff;*/
double diffsqnorm = diff.l2norm_sq();
double delta;
if (diffsqnorm > 0)
@@ -264,7 +249,6 @@ double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_
delta = 0;
cerr << " D1:" << delta;
//clip delta (enforce margin constraints)
-
delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha));
cerr << " D2:" << delta;
return delta;
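
[Note: the step size ComputeDelta returns is a clipped SMO-style pairwise update: constraint violation (margin + loss) divided by the squared distance between the two feature vectors, then clipped so both dual variables stay feasible. A minimal standalone sketch of just that arithmetic (the free function and its name are ours, not part of this file):

#include <algorithm>

// Mirrors the arithmetic in ComputeDelta: unclipped step, then clipping.
double ClippedDelta(double loss, double margin, double diff_sq_norm,
                    double alpha0, double alpha1) {
  double delta = (diff_sq_norm > 0) ? (margin + loss) / diff_sq_norm : 0.0;
  // Enforce the box constraints: after exchanging delta between the pair,
  // both alpha values must remain nonnegative.
  return std::max(-alpha0, std::min(delta, alpha1));
}
]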
@@ -278,12 +262,12 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo>
vector<shared_ptr<HypothesisInfo> > pair;
- if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira
- // if(optimizer == 2) {
+ if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for pa-mira
+
pair.push_back(cur_constraint[0]);
pair.push_back(cur_constraint[1]);
return pair;
- // }
+
}
for(int u=0;u != cur_constraint.size();u++)
@@ -299,8 +283,6 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo>
}
if(!max_fear) return pair; //
- if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl;
-
if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON))
{
@@ -310,8 +292,7 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo>
if (cur_constraint[i]->alpha > 0)
{
pair.push_back(cur_constraint[u]);
- pair.push_back(cur_constraint[i]);
- cerr << "RETJURN from 1" << endl;
+ pair.push_back(cur_constraint[i]);
return pair;
}
}
@@ -342,11 +323,10 @@ struct GoodBadOracle {
struct TrainingObserver : public DecoderObserver {
TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o, vector<ScoreP>* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) {
- // TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k) {
- //calculate corpus bleu score from previous iterations 1-best for BLEU gain
+
if(!pseudo_doc && !sent_approx)
- if(cur_pass > 0)
+ if(cur_pass > 0) //calculate corpus bleu score from previous iterations 1-best for BLEU gain
{
ScoreP acc;
for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) {
@@ -357,7 +337,7 @@ struct TrainingObserver : public DecoderObserver {
corpus_bleu_stats = acc;
corpus_bleu_score = acc->ComputeScore();
}
- //corpus_src_length = 0;
+
}
const DocScorer& ds;
vector<ScoreP>& corpus_bleu_sent_stats;
@@ -396,9 +376,8 @@ struct TrainingObserver : public DecoderObserver {
virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
cur_sent = smeta.GetSentenceID();
- //cerr << "SOURCE " << smeta.GetSourceLength() << endl;
curr_src_length = (float) smeta.GetSourceLength();
- //UpdateOracles(smeta.GetSentenceID(), *hg);
+
if(unique_kbest)
UpdateOracles<KBest::FilterUnique>(smeta.GetSentenceID(), *hg);
else
@@ -431,9 +410,8 @@ struct TrainingObserver : public DecoderObserver {
typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,Filter> K;
K kbest(forest,kbest_size);
- //KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
for (int i = 0; i < kbest_size; ++i) {
- //const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+
typename K::Derivation *d =
kbest.LazyKthBest(forest.nodes_.size() - 1, i);
if (!d) break;
@@ -489,10 +467,9 @@ struct TrainingObserver : public DecoderObserver {
corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE);
corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length);
- cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl;
+ cerr << "ps corpus size: " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n" << details2 << endl;
}
-
//figure out how many hyps we can keep maximum
int temp_update_size = update_list_size;
if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();}
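
[Note: on the pseudo-document lines in the previous hunk (corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE) and the matching source-length update): assuming PlusEquals with a scale folds the new sentence statistics in with an exponential decay, mirroring the explicit source-length line, this is the pseudo-document BLEU approximation of Chiang et al. (2008). With \gamma = PSEUDO_SCALE, 0 < \gamma < 1:

  S \leftarrow \gamma\,(S + S_{\text{sent}}), \qquad L_{\text{src}} \leftarrow \gamma\,(L_{\text{src}} + l_{\text{sent}})

Each sentence's sufficient statistics decay geometrically, so BLEU gain is measured against a sliding pseudo-document of roughly 1/(1-\gamma) recent sentences.]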
@@ -500,7 +477,8 @@ struct TrainingObserver : public DecoderObserver {
//sort all hyps by sentscore (eg. bleu)
sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB);
- if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; }
+ if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++)
+ cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_w_local) << endl; }
if(hope_select == 1)
{
@@ -508,7 +486,7 @@ struct TrainingObserver : public DecoderObserver {
if (PRINT_LIST) cerr << "HOPE " << endl;
for(int u=0;u!=all_hyp.size();u++)
{
- double t_score = all_hyp[u]->features.dot(dense_weights_g);
+ double t_score = all_hyp[u]->features.dot(dense_w_local);
all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score;
if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl;
@@ -522,47 +500,38 @@ struct TrainingObserver : public DecoderObserver {
cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;}
+ //use hope for fear selection
shared_ptr<HypothesisInfo>& oracleN = cur_good[0];
-
if(fear_select == 1){ //compute fear hyps with model - bleu
if (PRINT_LIST) cerr << "FEAR " << endl;
- double hope_score = oracleN->features.dot(dense_weights_g);
+ double hope_score = oracleN->features.dot(dense_w_local);
if (PRINT_LIST) cerr << "hope score " << hope_score << endl;
for(int u=0;u!=all_hyp.size();u++)
{
- double t_score = all_hyp[u]->features.dot(dense_weights_g);
- //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score;
-
- /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss
- all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric;
- all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/
+ double t_score = all_hyp[u]->features.dot(dense_w_local);
all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss
all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric;
all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features;
all_hyp[u]->oracleN=oracleN;
- // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score;
if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl;
}
sort(all_hyp.begin(),all_hyp.end(),FearCompareB);
- cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
}
else if(fear_select == 2) //select fear based on cost
{
- cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end());
- reverse(cur_bad.begin(),cur_bad.end());
+ sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareG);
}
- else //pred-based, fear_select = 3
+ else //max model score, also known as prediction-based
{
sort(all_hyp.begin(),all_hyp.end(),FearComparePred);
- cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
}
-
+ cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
if(PRINT_LIST){ cerr<< "BAD"<<endl; for(int u=0;u!=cur_bad.size();u++) cerr << cur_bad[u]->mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;}
@@ -616,13 +585,12 @@ void ReadPastTranslationForScore(const int cur_pass, vector<ScoreP>* c, DocScore
++lc;
}
-
assert(lc > 0);
float score = acc->ComputeScore();
string details;
acc->ScoreDetails(&details);
- cerr << "INIT RUN " << details << score << endl;
+ cerr << "Previous run: " << details << score << endl;
}
@@ -640,7 +608,6 @@ int main(int argc, char** argv) {
rng.reset(new MT19937);
vector<string> corpus;
- //ReadTrainingCorpus(conf["source"].as<string>(), &corpus);
const string metric_name = conf["mt_metric"].as<string>();
optimizer = conf["optimizer"].as<int>();
@@ -654,7 +621,7 @@ int main(int argc, char** argv) {
unique_kbest = conf.count("unique_k_best");
pseudo_doc = conf.count("pseudo_doc");
sent_approx = conf.count("sent_approx");
- cerr << "PSEUDO " << pseudo_doc << " SENT " << sent_approx << endl;
+ cerr << "Using pseudo-doc:" << pseudo_doc << " Sent:" << sent_approx << endl;
if(pseudo_doc)
mt_metric_scale=1;
@@ -665,7 +632,6 @@ int main(int argc, char** argv) {
//establish metric used for tuning
if (type == TER) {
invert_score = true;
- // approx_score = false;
} else {
invert_score = false;
}
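
[Note: context for the invert_score flag set here: TER is an error metric (lower is better), while the hope/fear machinery assumes higher metric scores are better. A hypothetical one-liner showing the intended normalization (not code from this file):

// Hypothetical helper: normalize any metric to 'higher is better' for tuning.
inline double TunedMetric(double raw_score, bool invert_score) {
  return invert_score ? -raw_score : raw_score;  // e.g. TER
}
]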
@@ -681,20 +647,9 @@ int main(int argc, char** argv) {
{
ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir);
}
- /* if (ds.size() != corpus.size()) {
- cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
- return 1;
- }*/
- cerr << "Optimizing with " << optimizer << endl;
- // load initial weights
- /*Weights weights;
- weights.InitFromFile(conf["input_weights"].as<string>());
- SparseVector<double> lambdas;
- weights.InitSparseVector(&lambdas);
- */
-
-
+ cerr << "Using optimizer:" << optimizer << endl;
+
ReadFile ini_rf(conf["decoder_config"].as<string>());
Decoder decoder(ini_rf.stream());
@@ -705,7 +660,6 @@ int main(int argc, char** argv) {
Weights::InitSparseVector(dense_weights, &lambdas);
const string input = decoder.GetConf()["input"].as<string>();
- //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary");
if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl;
ReadFile in_read(input);
istream *in = in_read.stream();
@@ -714,8 +668,6 @@ int main(int argc, char** argv) {
const double max_step_size = conf["max_step_size"].as<double>();
-
- // assert(corpus.size() > 0);
vector<GoodBadOracle> oracles(ds.size());
TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles, &corpus_bleu_sent_stats);
@@ -725,27 +677,21 @@ int main(int argc, char** argv) {
double objective=0;
double tot_loss = 0;
int dots = 0;
- // int cur_pass = 1;
- // vector<double> dense_weights;
SparseVector<double> tot;
SparseVector<double> final_tot;
- // tot += lambdas; // initial weights
- // lcount++; // count for initial weights
-
- //string msg = "# MIRA tuned weights";
- // while (cur_pass <= max_iteration) {
- SparseVector<double> old_lambdas = lambdas;
- tot.clear();
- tot += lambdas;
- cerr << "PASS " << cur_pass << " " << endl << lambdas << endl;
- ScoreP acc, acc_h, acc_f;
-
- while(*in) {
+
+ SparseVector<double> old_lambdas = lambdas;
+ tot.clear();
+ tot += lambdas;
+ cerr << "PASS " << cur_pass << " " << endl << lambdas << endl;
+ ScoreP acc, acc_h, acc_f;
+
+ while(*in) {
getline(*in, buf);
if (buf.empty()) continue;
//TODO: allow batch updating
lambdas.init_vector(&dense_weights);
- dense_weights_g = dense_weights;
+ dense_w_local = dense_weights;
decoder.SetId(cur_sent);
decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps.
@@ -922,7 +868,7 @@ int main(int argc, char** argv) {
dense_weights.clear();
lambdas.init_vector(&dense_weights);
- dense_weights_g = dense_weights;
+ dense_w_local = dense_weights;
iter++;
if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl;
@@ -991,19 +937,17 @@ int main(int argc, char** argv) {
ostringstream os;
os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz";
string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount);
- //Weights.InitFromVector(lambdas);
lambdas.init_vector(&dense_weights);
Weights::WriteToFile(os.str(), dense_weights, true, &msg);
SparseVector<double> x = tot;
- x /= lcount;
+ x /= lcount+1;
ostringstream sa;
string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount);
sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz";
x.init_vector(&dense_weights);
Weights::WriteToFile(sa.str(), dense_weights, true, &msga);
-
cerr << "Optimization complete.\n";
return 0;
}
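
[Note: on the divisor fix in the averaged-weights block: tot is seeded with the pass's starting weights (tot += lambdas before the loop) and, assuming it also accumulates one snapshot per update in the elided loop body, lcount updates leave lcount+1 vectors in the sum. The corrected average is therefore

  \bar{\mathbf{w}} \;=\; \frac{1}{\texttt{lcount}+1}\Big(\mathbf{w}_0 + \sum_{t=1}^{\texttt{lcount}} \mathbf{w}_t\Big)

which is what the new line computes (x /= lcount+1 parses as x /= (lcount+1), since /= binds more loosely than +).]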