diff options
Diffstat (limited to 'training')
-rw-r--r-- | training/mira/kbest_mirav5.cc | 1148 | ||||
-rwxr-xr-x | training/mira/run_mira.pl | 548 |
2 files changed, 1696 insertions, 0 deletions
diff --git a/training/mira/kbest_mirav5.cc b/training/mira/kbest_mirav5.cc new file mode 100644 index 00000000..cea5cf67 --- /dev/null +++ b/training/mira/kbest_mirav5.cc @@ -0,0 +1,1148 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <cassert> +#include <cmath> +#include <algorithm> + +#include "config.h" + + +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "time.h" +#include "sampler.h" + +#include "weights.h" +#include "sparse_vector.h" + +using namespace std; +using boost::shared_ptr; +namespace po = boost::program_options; + +bool invert_score; +boost::shared_ptr<MT19937> rng; +bool approx_score; +bool no_reweight; +bool no_select; +bool unique_kbest; +int update_list_size; +vector<weight_t> dense_weights_g; +double mt_metric_scale; +int optimizer; +int fear_select; +int hope_select; + +bool pseudo_doc; + +void SanityCheck(const vector<double>& w) { + for (int i = 0; i < w.size(); ++i) { + assert(!isnan(w[i])); + assert(!isinf(w[i])); + } +} + +struct FComp { + const vector<double>& w_; + FComp(const vector<double>& w) : w_(w) {} + bool operator()(int a, int b) const { + return fabs(w_[a]) > fabs(w_[b]); + } +}; + +void ShowLargestFeatures(const vector<double>& w) { + vector<int> fnums(w.size()); + for (int i = 0; i < w.size(); ++i) + fnums[i] = i; + vector<int>::iterator mid = fnums.begin(); + mid += (w.size() > 10 ? 10 : w.size()); + partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); + cerr << "TOP FEATURES:"; + for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { + cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; + } + cerr << endl; +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input_weights,w",po::value<string>(),"Input feature weights file") + ("source,i",po::value<string>(),"Source file for development set") + ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data") + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("optimizer,o",po::value<int>()->default_value(1), "Optimizer (sgd=1, mira 1-fear=2, full mira w/ cutting plane=3, full mira w/ nbest list=5, local update=4)") + ("fear,f",po::value<int>()->default_value(1), "Fear selection (model-cost=1, max-cost=2, pred-base=3)") + ("hope,h",po::value<int>()->default_value(1), "Hope selection (model+cost=1, max-cost=2, local-cost=3)") + ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)") + ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") + ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by") + ("approx_score,a", "Use smoothed sentence-level BLEU score for approximate scoring") + ("no_reweight,d","Do not reweight forest for cutting plane") + ("no_select,n", "Do not use selection heuristic") + ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles") + ("update_k_best,b", po::value<int>()->default_value(1), "Size of good, bad lists to perform update with") + ("unique_k_best,u", "Unique k-best translation list") + ("weights_output,O",po::value<string>(),"Directory to write weights to") + ("output_dir,D",po::value<string>(),"Directory to place output in") + ("decoder_config,c",po::value<string>(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,H", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +//load previous translation, store array of each sentences score, subtract it from current sentence and replace with new translation score + + +static const double kMINUS_EPSILON = -1e-6; +static const double EPSILON = 0.000001; +static const double SMO_EPSILON = 0.0001; +static const double PSEUDO_SCALE = 0.95; +static const int MAX_SMO = 10; +int cur_pass; + +struct HypothesisInfo { + SparseVector<double> features; + vector<WordID> hyp; + double mt_metric; + double hope; + double fear; + double alpha; + double oracle_loss; + SparseVector<double> oracle_feat_diff; + shared_ptr<HypothesisInfo> oracleN; +}; + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < EPSILON; +} + +typedef shared_ptr<HypothesisInfo> HI; +bool HypothesisCompareB(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric > h2->mt_metric; +}; + + +bool HopeCompareB(const HI& h1, const HI& h2 ) +{ + return h1->hope > h2->hope; +}; + +bool FearCompareB(const HI& h1, const HI& h2 ) +{ + return h1->fear > h2->fear; +}; + +bool FearComparePred(const HI& h1, const HI& h2 ) +{ + return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g); +}; + +bool HypothesisCompareG(const HI& h1, const HI& h2 ) +{ + return h1->mt_metric < h2->mt_metric; +}; + + +void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vector<shared_ptr<HypothesisInfo> >& all_hyp, vector<weight_t> dense_weights) +{ + bool DEBUG_CUT = false; + shared_ptr<HypothesisInfo> max_fear, max_fear_in_set; + vector<shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c; + + if(no_reweight) + { + //find new hope hypothesis + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + t_score; + //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; + + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + + double hope_score = all_hyp[0]->features.dot(dense_weights); + if(DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl; + + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*all_hyp[0]->mt_metric - hope_score + t_score; //relative loss + // all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*all_hyp[0]->mt_metric; + //all_hyp[u]->oracle_feat_diff = all_hyp[0]->features - all_hyp[u]->features; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + //if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; + + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + } + //assign maximum fear derivation from all derivations + max_fear = all_hyp[0]; + + if(DEBUG_CUT) cerr <<"Cutting Plane Max Fear "<<max_fear->fear ; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator already in constraint set + { + if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear) + max_fear_in_set = cur_constraint[i]; + } + if(DEBUG_CUT) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl; + + if(max_fear->fear > max_fear_in_set->fear + SMO_EPSILON) + { + cur_constraint.push_back(max_fear); + *again = true; + if(DEBUG_CUT) cerr << "Optimize Again " << *again << endl; + } +} + + +double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_size,vector<weight_t> dense_weights ) +{ + vector<shared_ptr<HypothesisInfo> >& cur_pair = *cur_p; + double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; + //double margin = -cur_pair[0]->oracle_feat_diff.dot(dense_weights) + cur_pair[1]->oracle_feat_diff.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? + //double num = loss - margin; + + + double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); + const double num = margin + loss; + cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << cur_pair[1]->features.dot(dense_weights) << " " << cur_pair[0]->features.dot(dense_weights) <<endl; + + // double margin = cur_pair[1]->features.dot(dense_weights) - cur_pair[0]->features.dot(dense_weights); + // double loss = cur_pair[1]->oracle_loss; //good.mt_metric - cur_bad.mt_metric); + //const double num = margin + loss; + + //cerr << "Compute Delta " << loss << " " << margin << " "; + + // double margin = cur_pair[0]->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights); //TODO: is it a problem that new oracle is used in diff? +/* double num = + (cur_pair[0]->oracle_loss - cur_pair[0]->oracle_feat_diff.dot(dense_weights)) + - (cur_pair[1]->oracle_loss - cur_pair[1]->oracle_feat_diff.dot(dense_weights)); + */ + + SparseVector<double> diff = cur_pair[0]->features; + diff -= cur_pair[1]->features; + /* SparseVector<double> diff = cur_pair[0]->oracle_feat_diff; + diff -= cur_pair[1]->oracle_feat_diff;*/ + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = num / (diffsqnorm * max_step_size); + else + delta = 0; + cerr << " D1:" << delta; + //clip delta (enforce margin constraints) + + delta = max(-cur_pair[0]->alpha, min(delta, cur_pair[1]->alpha)); + cerr << " D2:" << delta; + return delta; +} + + +vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> >* cur_c) +{ + bool DEBUG_SELECT= false; + vector<shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c; + + vector<shared_ptr<HypothesisInfo> > pair; + + if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for 1-mira + // if(optimizer == 2) { + pair.push_back(cur_constraint[0]); + pair.push_back(cur_constraint[1]); + return pair; + // } + } + + for(int u=0;u != cur_constraint.size();u++) + { + shared_ptr<HypothesisInfo> max_fear; + + if(DEBUG_SELECT) cerr<< "cur alpha " << u << " " << cur_constraint[u]->alpha; + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (!max_fear || cur_constraint[i]->fear > max_fear->fear) + max_fear = cur_constraint[i]; + } + if(!max_fear) return pair; // + + if(DEBUG_SELECT) cerr << " F" << max_fear->fear << endl; + + + if ((cur_constraint[u]->alpha == 0) && (cur_constraint[u]->fear > max_fear->fear + SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->alpha > 0) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + cerr << "RETJURN from 1" << endl; + return pair; + } + } + } + if ((cur_constraint[u]->alpha > 0) && (cur_constraint[u]->fear < max_fear->fear - SMO_EPSILON)) + { + for(int i=0; i < cur_constraint.size();i++) //select maximal violator + { + if(i != u) + if (cur_constraint[i]->fear > cur_constraint[u]->fear) + { + pair.push_back(cur_constraint[u]); + pair.push_back(cur_constraint[i]); + return pair; + } + } + } + + } + return pair; //no more constraints to optimize, we're done here + +} + +struct GoodBadOracle { + vector<shared_ptr<HypothesisInfo> > good; + vector<shared_ptr<HypothesisInfo> > bad; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o, vector<ScoreP>* cbs) : ds(d), oracles(*o), corpus_bleu_sent_stats(*cbs), kbest_size(k) { + // TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k) { + + //calculate corpus bleu score from previous iterations 1-best for BLEU gain + if(!pseudo_doc) + if(cur_pass > 0) + { + ScoreP acc; + for (int ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) { + if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); } + acc->PlusEquals(*corpus_bleu_sent_stats[ii]); + + } + corpus_bleu_stats = acc; + corpus_bleu_score = acc->ComputeScore(); + } + //corpus_src_length = 0; +} + const DocScorer& ds; + vector<ScoreP>& corpus_bleu_sent_stats; + vector<GoodBadOracle>& oracles; + vector<shared_ptr<HypothesisInfo> > cur_best; + shared_ptr<HypothesisInfo> cur_oracle; + const int kbest_size; + Hypergraph forest; + int cur_sent; + ScoreP corpus_bleu_stats; + float corpus_bleu_score; + + float corpus_src_length; + float curr_src_length; + + const int GetCurrentSent() const { + return cur_sent; + } + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best[0]; + } + + const vector<shared_ptr<HypothesisInfo> > GetCurrentBest() const { + return cur_best; + } + + const HypothesisInfo& GetCurrentOracle() const { + return *cur_oracle; + } + + const Hypergraph& GetCurrentForest() const { + return forest; + } + + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + cur_sent = smeta.GetSentenceID(); + //cerr << "SOURCE " << smeta.GetSourceLength() << endl; + curr_src_length = (float) smeta.GetSourceLength(); + //UpdateOracles(smeta.GetSentenceID(), *hg); + if(unique_kbest) + UpdateOracles<KBest::FilterUnique>(smeta.GetSentenceID(), *hg); + else + UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(smeta.GetSentenceID(), *hg); + forest = *hg; + + } + + shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score, const vector<WordID>& hyp) { + shared_ptr<HypothesisInfo> h(new HypothesisInfo); + h->features = feats; + h->mt_metric = score; + h->hyp = hyp; + return h; + } + + template <class Filter> + void UpdateOracles(int sent_id, const Hypergraph& forest) { + + bool PRINT_LIST= false; + vector<shared_ptr<HypothesisInfo> >& cur_good = oracles[sent_id].good; + vector<shared_ptr<HypothesisInfo> >& cur_bad = oracles[sent_id].bad; + //TODO: look at keeping previous iterations hypothesis lists around + cur_best.clear(); + cur_good.clear(); + cur_bad.clear(); + + vector<shared_ptr<HypothesisInfo> > all_hyp; + + typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,Filter> K; + K kbest(forest,kbest_size); + + //KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size); + for (int i = 0; i < kbest_size; ++i) { + //const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + typename K::Derivation *d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + + float sentscore; + if(approx_score) + { + + if(cur_pass > 0 && !pseudo_doc) + { + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + ScoreP corpus_no_best = corpus_bleu_stats->GetZero(); + + corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best); + sent_stats->PlusEquals(*corpus_no_best, 0.5); + + //compute gain from new sentence in 1-best corpus + sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score); + } + else if(pseudo_doc) + { + //cerr << "CORP:" << corpus_bleu_score << " NEW:" << sent_stats->ComputeScore() << " sentscore:" << sentscore << endl; + + //-----pseudo-corpus approach + float src_scale = corpus_src_length + curr_src_length; + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield); + if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();} + + sent_stats->PlusEquals(*corpus_bleu_stats); + sentscore = mt_metric_scale * src_scale * sent_stats->ComputeScore(); + + } + else + { + //cerr << "Using sentence-level approximation - PASS - " << boost::lexical_cast<std::string>(cur_pass) << endl; + //approx style of computation, used for 0th iteration + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeSentScore()); + + //use pseudo-doc + } + + + } + else + { + sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore()); + } + + if (invert_score) sentscore *= -1.0; + //cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << " " << approx_sentscore << endl; + + if (i < update_list_size){ + if (i == 0) //take cur best and add its bleu statistics counts to the pseudo-doc + { } + if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield)); + } + + all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield)); //store all hyp to extract oracle best and worst + + } + + if(pseudo_doc){ + //update psuedo-doc stats + string details, details2; + corpus_bleu_stats->ScoreDetails(&details2); + ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp); + corpus_bleu_stats->PlusEquals(*sent_stats); + + + sent_stats->ScoreDetails(&details); + + + sent_stats = corpus_bleu_stats; + corpus_bleu_stats = sent_stats->GetZero(); + corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE); + + + corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length); + cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n " << details2 << endl; + + + } + + + //figure out how many hyps we can keep maximum + int temp_update_size = update_list_size; + if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();} + + //sort all hyps by sentscore (bleu) + sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB); + + if(PRINT_LIST){ cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++) cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; } + + //if(optimizer != 4 ) + if(hope_select == 1) + { + //find hope hypothesis using model + bleu + if (PRINT_LIST) cerr << "HOPE " << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl; + + } + + //sort hyps by hope score + sort(all_hyp.begin(),all_hyp.end(),HopeCompareB); + } + + + //assign cur_good the sorted list + cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} + /* if (!cur_oracle) { cur_oracle = cur_good[0]; + cerr << "Set oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } + else { + cerr << "Stay oracle " << cur_oracle->hope << " " << cur_oracle->fear << " " << cur_oracle->mt_metric << endl; } */ + + shared_ptr<HypothesisInfo>& oracleN = cur_good[0]; + //if(optimizer != 4){ + if(fear_select == 1){ + //compute fear hyps + if (PRINT_LIST) cerr << "FEAR " << endl; + double hope_score = oracleN->features.dot(dense_weights_g); + //double hope_score = cur_oracle->features.dot(dense_weights); + if (PRINT_LIST) cerr << "hope score " << hope_score << endl; + for(int u=0;u!=all_hyp.size();u++) + { + double t_score = all_hyp[u]->features.dot(dense_weights_g); + //all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - hope_score + t_score; + + /* all_hyp[u]->fear = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric - -1*cur_oracle->mt_metric; + all_hyp[u]->oracle_feat_diff = cur_oracle->features - all_hyp[u]->features;*/ + + all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss + all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric; + all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features; + all_hyp[u]->oracleN=oracleN; + // all_hyp[u]->fear = -1 * all_hyp[u]->mt_metric + t_score; + if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl; + + } + + sort(all_hyp.begin(),all_hyp.end(),FearCompareB); + + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + else if(fear_select == 2) //select fear based on cost + { + cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end()); + reverse(cur_bad.begin(),cur_bad.end()); + } + else //pred-based, fear_select = 3 + { + sort(all_hyp.begin(),all_hyp.end(),FearComparePred); + cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size); + } + + + if(PRINT_LIST){ cerr<< "BAD"<<endl; for(int u=0;u!=cur_bad.size();u++) cerr << cur_bad[u]->mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;} + + cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl; + cerr << " CUR: " << cur_best[0]->mt_metric << endl; + cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl; + } +}; + +void ReadTrainingCorpus(const string& fname, vector<string>* c) { + + + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +void ReadPastTranslationForScore(const int cur_pass, vector<ScoreP>* c, DocScorer& ds, const string& od) +{ + cerr << "Reading BLEU gain file "; + string fname; + if(cur_pass == 0) + { + fname = od + "/run.raw.init"; + } + else + { + int last_pass = cur_pass - 1; + fname = od + "/run.raw." + boost::lexical_cast<std::string>(last_pass) + ".B"; + } + cerr << fname << "\n"; + ReadFile rf(fname); + istream& in = *rf.stream(); + ScoreP acc; + string line; + int lc = 0; + while(in) { + getline(in, line); + if (line.empty() && !in) break; + vector<WordID> sent; + TD::ConvertSentence(line, &sent); + ScoreP sentscore = ds[lc]->ScoreCandidate(sent); + c->push_back(sentscore); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + ++lc; + + } + + + assert(lc > 0); + float score = acc->ComputeScore(); + string details; + acc->ScoreDetails(&details); + cerr << "INIT RUN " << details << score << endl; + +} + + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + rng.reset(new MT19937); + + vector<string> corpus; + //ReadTrainingCorpus(conf["source"].as<string>(), &corpus); + + const string metric_name = conf["mt_metric"].as<string>(); + optimizer = conf["optimizer"].as<int>(); + fear_select = conf["fear"].as<int>(); + hope_select = conf["hope"].as<int>(); + mt_metric_scale = conf["mt_metric_scale"].as<double>(); + approx_score = conf.count("approx_score"); + no_reweight = conf.count("no_reweight"); + no_select = conf.count("no_select"); + update_list_size = conf["update_k_best"].as<int>(); + unique_kbest = conf.count("unique_k_best"); + pseudo_doc = true; + + const string weights_dir = conf["weights_output"].as<string>(); + const string output_dir = conf["output_dir"].as<string>(); + ScoreType type = ScoreTypeFromString(metric_name); + + //establish metric used for tuning + if (type == TER) { + invert_score = true; + // approx_score = false; + } else { + invert_score = false; + } + + //load references + DocScorer ds(type, conf["reference"].as<vector<string> >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + vector<ScoreP> corpus_bleu_sent_stats; + + //check training pass,if >0, then use previous iterations corpus bleu stats + cur_pass = conf["passes"].as<int>(); + if(cur_pass > 0) + { + ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir); + } + /* if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + }*/ + cerr << "Optimizing with " << optimizer << endl; + // load initial weights + /*Weights weights; + weights.InitFromFile(conf["input_weights"].as<string>()); + SparseVector<double> lambdas; + weights.InitSparseVector(&lambdas); + */ + + + + ReadFile ini_rf(conf["decoder_config"].as<string>()); + Decoder decoder(ini_rf.stream()); + + vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + + SparseVector<weight_t> lambdas; + Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + const string input = decoder.GetConf()["input"].as<string>(); + //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary"); + if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; + ReadFile in_read(input); + istream *in = in_read.stream(); + assert(*in); + string buf; + + const double max_step_size = conf["max_step_size"].as<double>(); + + + // assert(corpus.size() > 0); + vector<GoodBadOracle> oracles(ds.size()); + + TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles, &corpus_bleu_sent_stats); + + int cur_sent = 0; + int lcount = 0; + double objective=0; + double tot_loss = 0; + int dots = 0; + // int cur_pass = 1; + // vector<double> dense_weights; + SparseVector<double> tot; + SparseVector<double> final_tot; + // tot += lambdas; // initial weights + // lcount++; // count for initial weights + + //string msg = "# MIRA tuned weights"; + // while (cur_pass <= max_iteration) { + SparseVector<double> old_lambdas = lambdas; + tot.clear(); + tot += lambdas; + cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; + ScoreP acc, acc_h, acc_f; + + while(*in) { + getline(*in, buf); + if (buf.empty()) continue; + //for (cur_sent = 0; cur_sent < corpus.size(); cur_sent++) { + + cerr << "SENT: " << cur_sent << endl; + //TODO: allow batch updating + //dense_weights.clear(); + //weights.InitFromVector(lambdas); + //weights.InitVector(&dense_weights); + //decoder.SetWeights(dense_weights); + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + decoder.SetId(cur_sent); + decoder.Decode(buf, &observer); // decode the sentence, calling Notify to get the hope,fear, and model best hyps. + + cur_sent = observer.GetCurrentSent(); + const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); + const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; + const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; + + vector<shared_ptr<HypothesisInfo> >& cur_good_v = oracles[cur_sent].good; + vector<shared_ptr<HypothesisInfo> >& cur_bad_v = oracles[cur_sent].bad; + vector<shared_ptr<HypothesisInfo> > cur_best_v = observer.GetCurrentBest(); + + tot_loss += cur_hyp.mt_metric; + + //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus + ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + + ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp); + if (!acc_h) { acc_h = hope_sentscore->GetZero(); } + acc_h->PlusEquals(*hope_sentscore); + + ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp); + if (!acc_f) { acc_f = fear_sentscore->GetZero(); } + acc_f->PlusEquals(*fear_sentscore); + + if(optimizer == 4) { //single dual coordinate update, cur_good selected on BLEU score only (not model+BLEU) + // if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { + + double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights); + double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric); + const double loss = margin + mt_loss; + cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) <<endl; + // if (loss > 0.0) { + SparseVector<double> diff = cur_good.features; + diff -= cur_bad.features; + + double diffsqnorm = diff.l2norm_sq(); + double delta; + if (diffsqnorm > 0) + delta = loss / (diffsqnorm); + else + delta = 0; + + //double step_size = loss / diff.l2norm_sq(); + cerr << loss << " " << delta << " " << diff << endl; + if (delta > max_step_size) delta = max_step_size; + lambdas += (cur_good.features * delta); + lambdas -= (cur_bad.features * delta); + //cerr << "L: " << lambdas << endl; + // } + // } + } + else if(optimizer == 1) //sgd - nonadapted step size + { + + lambdas += (cur_good.features) * max_step_size; + lambdas -= (cur_bad.features) * max_step_size; + } + //cerr << "L: " << lambdas << endl; + else if(optimizer == 5) //full mira with n-best list of constraints from oracle, fear, best + { + vector<shared_ptr<HypothesisInfo> > cur_constraint; + cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); + cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); + + bool optimize_again; + vector<shared_ptr<HypothesisInfo> > cur_pair; + //SMO + for(int u=0;u!=cur_constraint.size();u++) + cur_constraint[u]->alpha =0; + + cur_constraint[0]->alpha =1; //set oracle to alpha=1 + + cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = 10, smo_iter2 = 10; + int iter, iter2 =0; + bool DEBUG_SMO = false; + while (iter2 < smo_iter2) + { + iter =0; + while (iter < smo_iter) + { + optimize_again = true; + for (int i = 0; i< cur_constraint.size(); i++) + for (int j = i+1; j< cur_constraint.size(); j++) + { + if(DEBUG_SMO) cerr << "start " << i << " " << j << endl; + cur_pair.clear(); + cur_pair.push_back(cur_constraint[j]); + cur_pair.push_back(cur_constraint[i]); + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + if (delta == 0) optimize_again = false; + // cur_pair[0]->alpha += delta; + // cur_pair[1]->alpha -= delta; + cur_constraint[j]->alpha += delta; + cur_constraint[i]->alpha -= delta; + double step_size = delta * max_step_size; + /*lambdas += (cur_pair[1]->features) * step_size; + lambdas -= (cur_pair[0]->features) * step_size;*/ + lambdas += (cur_constraint[i]->features) * step_size; + lambdas -= (cur_constraint[j]->features) * step_size; + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + + //reload weights based on update + /*dense_weights.clear(); + weights.InitFromVector(lambdas); + weights.InitVector(&dense_weights);*/ + } + iter++; + + if(!optimize_again) + { + iter = 100; + cerr << "Optimization stopped, delta =0" << endl; + } + + + } + iter2++; + } + + + } + else if(optimizer == 2 || optimizer == 3) //1-fear and cutting plane mira + { + bool DEBUG_SMO= true; + vector<shared_ptr<HypothesisInfo> > cur_constraint; + cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set + bool optimize_again = true; + int cut_plane_calls = 0; + while (optimize_again) + { + if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl; + if(optimizer == 2){ //1-fear + cur_constraint.push_back(cur_bad_v[0]); + + //check if we have a violation + if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON)) + { + optimize_again = false; + cerr << "Constraint not violated" << endl; + } + } + else + { //cutting plane to add constraints + if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl; + optimize_again = false; + cut_plane_calls++; + CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights); + if (cut_plane_calls >= MAX_SMO) optimize_again = false; + } + + if(optimize_again) + { + //SMO + for(int u=0;u!=cur_constraint.size();u++) + { + cur_constraint[u]->alpha =0; + //cur_good_v[0]->alpha = 1; cur_bad_v[0]->alpha = 0; + } + cur_constraint[0]->alpha = 1; + cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl; + int smo_iter = MAX_SMO; + int iter =0; + while (iter < smo_iter) + { + //select pair to optimize from constraint set + vector<shared_ptr<HypothesisInfo> > cur_pair = SelectPair(&cur_constraint); + + if(cur_pair.empty()){iter=MAX_SMO; cerr << "Undefined pair " << endl; continue;} //pair is undefined so we are done with this smo + + //double num = cur_good_v[0]->fear - cur_bad_v[0]->fear; + /*double loss = cur_good_v[0]->oracle_loss - cur_bad_v[0]->oracle_loss; + double margin = cur_good_v[0]->oracle_feat_diff.dot(dense_weights) - cur_bad_v[0]->oracle_feat_diff.dot(dense_weights); + double num = loss - margin; + SparseVector<double> diff = cur_good_v[0]->features; + diff -= cur_bad_v[0]->features; + double delta = num / (diff.l2norm_sq() * max_step_size); + delta = max(-cur_good_v[0]->alpha, min(delta, cur_bad_v[0]->alpha)); + cur_good_v[0]->alpha += delta; + cur_bad_v[0]->alpha -= delta; + double step_size = delta * max_step_size; + lambdas += (cur_bad_v[0]->features) * step_size; + lambdas -= (cur_good_v[0]->features) * step_size; + */ + + double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights); + + cur_pair[0]->alpha += delta; + cur_pair[1]->alpha -= delta; + double step_size = delta * max_step_size; + /* lambdas += (cur_pair[1]->oracle_feat_diff) * step_size; + lambdas -= (cur_pair[0]->oracle_feat_diff) * step_size;*/ + + cerr << "step " << step_size << endl; + double alpha_sum=0; + SparseVector<double> temp_lambdas = lambdas; + + for(int u=0;u!=cur_constraint.size();u++) + { + cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << endl; + temp_lambdas += (cur_constraint[u]->oracleN->features-cur_constraint[u]->features) * cur_constraint[u]->alpha * step_size; + alpha_sum += cur_constraint[u]->alpha; + } + cerr << "Alpha sum " << alpha_sum << " " << temp_lambdas << endl; + + lambdas += (cur_pair[1]->features) * step_size; + lambdas -= (cur_pair[0]->features) * step_size; + cerr << " Lambdas " << lambdas << endl; + //reload weights based on update + dense_weights.clear(); + //weights.InitFromVector(lambdas); + //weights.InitVector(&dense_weights); + lambdas.init_vector(&dense_weights); + dense_weights_g = dense_weights; + iter++; + + if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha << endl; + // cerr << "SMO opt " << iter << " " << delta << " " << cur_good_v[0]->alpha << " " << cur_bad_v[0]->alpha << endl; + if(no_select) //don't use selection heuristic to determine when to stop SMO, rather just when delta =0 + if (delta == 0) iter = MAX_SMO; + + //only perform one dual coordinate ascent step + if(optimizer == 2) + { + optimize_again = false; + iter = MAX_SMO; + } + + } + if(optimizer == 3) + { + if(!no_reweight) + { + if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl; + Hypergraph hg = observer.GetCurrentForest(); + hg.Reweight(dense_weights); + //observer.UpdateOracles(cur_sent, hg); + if(unique_kbest) + observer.UpdateOracles<KBest::FilterUnique>(cur_sent, hg); + else + observer.UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(cur_sent, hg); + + + } + } + } + + + } + + //print objective after this sentence + double lambda_change = (lambdas - old_lambdas).l2norm_sq(); + double max_fear = cur_constraint[cur_constraint.size()-1]->fear; + double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear; + + for(int u=0;u!=cur_constraint.size();u++) + { + cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl; + temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear; + } + objective += temp_objective; + + cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl; + } + + + if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; } + tot += lambdas; + ++lcount; + cur_sent++; + + cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl; + + //clear good/bad lists from oracles for this sentences - you want to keep them around for things + + // oracles[cur_sent].good.clear(); + //oracles[cur_sent].bad.clear(); + } + + cerr << "FINAL OBJECTIVE: "<< objective << endl; + final_tot += tot; + cerr << "Translated " << lcount << " sentences " << endl; + cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n"; + tot_loss = 0; + /* + float corpus_score = acc->ComputeScore(); + string corpus_details; + acc->ScoreDetails(&corpus_details); + cerr << "MODEL " << corpus_details << endl; + cout << corpus_score << endl; + + corpus_score = acc_h->ComputeScore(); + acc_h->ScoreDetails(&corpus_details); + cerr << "HOPE " << corpus_details << endl; + cout << corpus_score << endl; + + corpus_score = acc_f->ComputeScore(); + acc_f->ScoreDetails(&corpus_details); + cerr << "FEAR " << corpus_details << endl; + cout << corpus_score << endl; + */ + int node_id = rng->next() * 100000; + cerr << " Writing weights to " << node_id << endl; + Weights::ShowLargestFeatures(dense_weights); + dots = 0; + ostringstream os; + os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz"; + string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); + //Weights.InitFromVector(lambdas); + lambdas.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); + + SparseVector<double> x = tot; + x /= lcount; + ostringstream sa; + string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount); + sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz"; + //Weights ww; + //ww.InitFromVector(x); + x.init_vector(&dense_weights); + Weights::WriteToFile(sa.str(), dense_weights, true, &msga); + + //assign averaged lambdas to initialize next iteration + //lambdas = x; + + /* double lambda_change = (old_lambdas - lambdas).l2norm_sq(); + cerr << "Change in lambda " << lambda_change << endl; + + if ( lambda_change < EPSILON) + { + cur_pass = max_iteration; + cerr << "Weights converged - breaking" << endl; + } + + ++cur_pass; + */ + + //} iteration while loop + + /* cerr << endl; + weights.WriteToFile("weights.mira-final.gz", true, &msg); + final_tot /= (lcount + 1);//max_iteration); + tot /= (corpus.size() + 1); + weights.InitFromVector(final_tot); + cerr << tot << "||||" << final_tot << endl; + msg = "# MIRA tuned weights (averaged vector)"; + weights.WriteToFile("weights.mira-final-avg.gz", true, &msg); + */ + cerr << "Optimization complete.\\AVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; + return 0; +} + diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl new file mode 100755 index 00000000..f4d61407 --- /dev/null +++ b/training/mira/run_mira.pl @@ -0,0 +1,548 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); +push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + + +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; + +my $iteration = 0.0; +my $max_iterations = 6; +my $metric = "ibm_bleu"; +my $iniFile; +my $weights; +my $initialWeights; +my $decode_nodes = 1; # number of decode nodes +my $pmem = "1g"; +my $dir; + +my $SCORER = $FAST_SCORE; +my $local_server = "$bin_dir/local_parallelize.pl"; +my $parallelize = "$bin_dir/../dpmert/parallelize.pl"; +my $libcall = "$bin_dir/../dpmert/libcall.pl"; +my $sentserver = "$bin_dir/../dpmert/sentserver"; +my $sentclient = "$bin_dir/../dpmert/sentclient"; +my $run_local_server = 0; +my $run_local = 0; +my $usefork; +my $pass_suffix = ''; + +my $cdec ="$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; + +#my $cdec ="$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv"; +die "Can't find decoder in $cdec" unless -x $cdec; +my $decoder = $cdec; +my $decoderOpt; +my $update_size=250; +my $approx_score; +my $kbest_size=250; +my $metric_scale=1; +my $optimizer=2; +my $disable_clean = 0; +my $use_make; # use make to parallelize line search +my $density_prune; +my $cpbin=1; +my $help = 0; +my $epsilon = 0.0001; +my $step_size = 0.01; +my $gpref; +my $unique_kbest; +my $freeze; +my $latent; +my $sample_max; +my $hopes=1; +my $fears=1; + +my $range = 35000; +my $minimum = 15000; +my $portn = int(rand($range)) + $minimum; + + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "decode-nodes=i" => \$decode_nodes, + "density-prune=f" => \$density_prune, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "use-fork" => \$usefork, + "epsilon=s" => \$epsilon, + "help" => \$help, + "local" => \$run_local, + "local_server" => \$run_local_server, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "optimizer=i" => \$optimizer, + "metric-scale=i" => \$metric_scale, + "kbest-size=i" => \$kbest_size, + "update-size=i" => \$update_size, + "step-size=f" => \$step_size, + "hope-select=i" => \$hopes, + "fear-select=i" => \$fears, + "approx-score" => \$approx_score, + "unique-kbest" => \$unique_kbest, + "latent" => \$latent, + "sample-max=i" => \$sample_max, + "grammar-prefix=s" => \$gpref, + "freeze" => \$freeze, + "workdir=s" => \$dir, + ) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + + +#my $refs_comma_sep = get_comma_sep_refs($refFiles); +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +#my $refs_comma_sep_4cdec = get_comma_sep_refs_4cdec($refFiles); + +unless ($dir){ + $dir = "mira"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; + +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} + + + + +if (-e $dir && dirsize($dir)>1 && -e "$dir/weights" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs + die "ERROR: working dir $dir already exists\n\n"; +} else { + -e $dir || mkdir $dir; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-mira.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + check_call("cp $initialWeights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); +} +write_config(*STDERR); + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc, $gpref); + +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +my $lastPScore = 0; +my $lastWeightsFile; + +# main optimization loop +#while (1){ +for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) { + + print STDERR "\n\nITERATION $opt_iter\n==========\n"; + print STDERR "Using port $portn\n"; + + # iteration-specific files + my $runFile="$dir/run.raw.$opt_iter"; + my $onebestFile="$dir/1best.$opt_iter"; + my $logdir="$dir/logs.$opt_iter"; + my $decoderLog="$logdir/decoder.sentserver.log.$opt_iter"; + my $scorerLog="$logdir/scorer.log.$opt_iter"; + my $weightdir="$dir/weights.pass$opt_iter/"; + check_call("mkdir -p $logdir"); + check_call("mkdir -p $weightdir"); + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); +# my $im1 = $opt_iter - 1; + my $weightsFile="$dir/weights.$opt_iter"; + print "ITER $iteration " ; + my $cur_pass = "-p 0$opt_iter"; + my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -a -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir -h $hopes -f $fears -C $step_size"; + if($unique_kbest){ + $decoder_cmd .= " -u"; + } + if($latent){ + $decoder_cmd .= " -l"; + } + if($sample_max){ + $decoder_cmd .= " -t $sample_max"; + } + if ($density_prune) { + $decoder_cmd .= " --density_prune $density_prune"; + } + my $pcmd; + if ($run_local) { + $pcmd = "cat $srcFile |"; + } elsif ($use_make) { + # TODO: Throw error when decode_nodes is specified along with use_make + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --"; + } elsif ($run_local_server){ + $pcmd = "cat $srcFile | $local_server $usefork -p $pmem -e $logdir -n $decode_nodes --"; + } + else { + $pcmd = "cat $srcFile | $parallelize $usefork -p $pmem -e $logdir -j $decode_nodes --baseport $portn --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + + my $retries = 0; + my $num_topbest; + while($retries < 5) { + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_topbest); + + + #score the output from this iteration + open RUN, "<$runFile" or die "Can't read $runFile: $!"; + open H, ">$runFile.H" or die; + open F, ">$runFile.F" or die; + open B, ">$runFile.B" or die; + while(<RUN>) { + chomp(); + (my $hope,my $best,my $fear) = split(/ \|\|\| /); + print H "$hope \n"; + print B "$best \n"; + print F "$fear \n"; + } + close RUN; + close F; close B; close H; + + my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -l $metric"); + my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -l $metric"); + my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -l $metric"); + chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f; + print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + my $iter_filler=""; + if($opt_iter < 10) + {$iter_filler="0";} + + my $nextIter = $opt_iter + 1; + my $newWeightsFile = "$dir/weights.$nextIter"; + $lastWeightsFile = "$dir/weights.$opt_iter"; + + average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir); +# check_call("cp $lastW $newWeightsFile"); +# if ($icc < 2) { +# print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; +# last; +# } + system("gzip -f $logdir/kbes*"); + print STDERR "\n==========\n"; + $iteration++; +} +#find +#my $cmd = `grep SCORE /fs/clip-galep5/lexical_tm/log.runmira.nist.20 | cat -n | sort -k +2 | tail -1`; +#$cmd =~ m/([0-9]+)/; +#$lastWeightsFile = "$dir/weights.$1"; +#check_call("ln -s $lastWeightsFile $dir/weights.tuned"); +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . join(" -$r ", @files); +} + + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "DECODE NODES: $decode_nodes\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; + print $fh "GRAMMAR PREFIX: $gpref\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + my $grammarpref = shift; + + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } + elsif (defined $grammarpref) { + print NEWSRC "<seg id=\"$i\" grammar=\"$grammarpref.$i.gz\">$line</seg>\n";} + else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + print "Something wrong\n"; +} + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub average_weights { + + my $path = shift; + my $out = shift; + my $logpath = shift; + print "AVERAGE $path $out\n"; + my %feature_weights= (); + my $total =0; + my $total_mult =0; + sleep(10); + foreach my $file (glob "$path") + { + $file =~ /\/([^\/]+).gz$/; + my $fname = $1; + my $cmd = "gzip -d $file"; + $file =~ s/\.gz//; + check_bash_call($cmd); + my $mult = 0; + print "FILE $file \n"; + open SCORE, "< $file" or next; + $total++; + while( <SCORE> ) { + my $line = $_; + if ($line !~ m/^\#/) + { + my @s = split(" ",$line); + $feature_weights{$s[0]}+= $mult * $s[1]; + } + else + { + (my $msg,my $ran,$mult) = split(/ \|\|\| /); + print "RAN $ran $mult\n"; + } + } + $total_mult += $mult; + + close SCORE; + $cmd = "gzip $file"; check_bash_call($cmd); + } + +#print out new averaged weights + open OUT, "> $out" or next; + for my $f ( keys %feature_weights ) { + print "$f $feature_weights{$f} $total_mult\n"; + my $ave = $feature_weights{$f} / $total_mult; + + print "Printing $f $ave ||| "; + print OUT "$f $ave\n"; + } + +} |