2 files changed, 1696 insertions, 0 deletions
diff --git a/training/mira/kbest_mirav5.cc b/training/mira/kbest_mirav5.cc
new file mode 100644
index 00000000..cea5cf67
--- /dev/null
+++ b/training/mira/kbest_mirav5.cc
@@ -0,0 +1,1148 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <algorithm>
+
+#include "config.h"
+
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "scorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "time.h"
+#include "sampler.h"
+
+#include "weights.h"
+#include "sparse_vector.h"
+
+using namespace std;
+using boost::shared_ptr;
+namespace po = boost::program_options;
+
+bool invert_score;                 // set in main() when the metric is TER; UpdateOracles then negates sentence scores
+boost::shared_ptr<MT19937> rng;    // random number generator, seeded in main() from --random_seed when given
+bool approx_score;                 // --approx_score: use the approximate scoring branches of UpdateOracles
+bool no_reweight;                  // --no_reweight: CuttingPlane recomputes hope/fear over the stored hypothesis list
+bool no_select;                    // --no_select: SelectPair skips its pair-selection heuristic
+bool unique_kbest;                 // --unique_k_best: k-best lists are extracted with KBest::FilterUnique
+int update_list_size;              // --update_k_best: number of entries kept in each good/bad/best list
+vector<weight_t> dense_weights_g;  // copy of the dense weights, refreshed in main() before each decode
+double mt_metric_scale;            // --mt_metric_scale: multiplier applied to every metric score
+int optimizer;                     // --optimizer: update rule selector (values listed in the option help text)
+int fear_select;                   // --fear: fear selection strategy (values listed in the option help text)
+int hope_select;                   // --hope: hope selection strategy (values listed in the option help text)
+
+bool pseudo_doc;                   // assigned true in main(); selects the pseudo-document scoring path
+
+// Debug-build guard: asserts that every weight in w is finite (neither NaN nor +/-infinity).
+void SanityCheck(const vector<double>& w) {
+  for (size_t i = 0; i < w.size(); ++i) {
+    assert(!std::isnan(w[i]) && !std::isinf(w[i]));  // qualified: plain isnan/isinf can be ambiguous
+  }
+}
+
+// Comparison functor over feature indices: index a sorts before index b when its weight has the
+// larger absolute value. Only a reference to the weight vector is stored, so w must outlive it.
+struct FComp {
+  const vector<double>& w_;
+  FComp(const vector<double>& w) : w_(w) {}
+  bool operator()(int a, int b) const { return fabs(w_[b]) < fabs(w_[a]); }
+};
+
+// Writes to stderr the (at most ten) features whose weights have the largest absolute value.
+void ShowLargestFeatures(const vector<double>& w) {
+  vector<int> order(w.size());
+  for (size_t k = 0; k < order.size(); ++k)
+    order[k] = static_cast<int>(k);
+  const size_t top_n = w.size() > 10 ? 10 : w.size();
+  const vector<int>::iterator cut = order.begin() + top_n;
+  partial_sort(order.begin(), cut, order.end(), FComp(w));
+  cerr << "TOP FEATURES:";
+  for (vector<int>::iterator it = order.begin(); it != cut; ++it)
+    cerr << ' ' << FD::Convert(*it) << '=' << w[*it];
+  cerr << endl;
+}
+
+// Parses the command line (and the optional --config file) into *conf. Returns false, after printing
+// the usage to stderr, when --help is given or an option that main() reads unconditionally is missing.
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+    ("input_weights,w",po::value<string>(),"Input feature weights file")
+    ("source,i",po::value<string>(),"Source file for development set")
+    ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data")
+    ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
+    ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
+    ("optimizer,o",po::value<int>()->default_value(1), "Optimizer (sgd=1, mira 1-fear=2, full mira w/ cutting plane=3, full mira w/ nbest list=5, local update=4)")
+    ("fear,f",po::value<int>()->default_value(1), "Fear selection (model-cost=1, max-cost=2, pred-base=3)")
+    ("hope,h",po::value<int>()->default_value(1), "Hope selection (model+cost=1, max-cost=2, local-cost=3)")
+    ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)")
+    ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+    ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by")
+    ("approx_score,a", "Use smoothed sentence-level BLEU score for approximate scoring")
+    ("no_reweight,d","Do not reweight forest for cutting plane")
+    ("no_select,n", "Do not use selection heuristic")
+    ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+    ("update_k_best,b", po::value<int>()->default_value(1), "Size of good, bad lists to perform update with")
+    ("unique_k_best,u", "Unique k-best translation list")
+    ("weights_output,O",po::value<string>(),"[REQD] Directory to write weights to")
+    ("output_dir,D",po::value<string>(),"[REQD] Directory to place output in")
+    ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+    ("config", po::value<string>(), "Configuration file")
+    ("help,H", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+  if (conf->count("help") || !conf->count("input_weights") || !conf->count("decoder_config") || !conf->count("reference") || !conf->count("weights_output") || !conf->count("output_dir")) {
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+// Load the previous translation, store each sentence's score in an array, subtract it from the current sentence's score, and replace it with the new translation's score.
+
+
+static const double kMINUS_EPSILON = -1e-6;  // not referenced in the visible part of this file
+static const double EPSILON = 0.000001;      // relative tolerance used by ApproxEqual
+static const double SMO_EPSILON = 0.0001;    // fear-score slack used by CuttingPlane and SelectPair
+static const double PSEUDO_SCALE = 0.95;     // factor applied to the pseudo-document statistics and length after each sentence
+static const int MAX_SMO = 10;               // not referenced in the visible part of this file
+int cur_pass;                                // assigned in main() from --passes
+
+struct HypothesisInfo {  // one k-best entry together with the scores used for hope/fear selection
+  HypothesisInfo() : mt_metric(0), hope(0), fear(0), alpha(0), oracle_loss(0) {}  // scalars used to start indeterminate
+  SparseVector<double> features;               // feature values of the derivation
+  vector<WordID> hyp;                          // yield of the derivation
+  double mt_metric;                            // metric score (scaled, negated when invert_score is set)
+  double hope, fear;                           // hope = metric + model score; fear = loss vs. oracleN + model score gap
+  double alpha;                                // dual variable updated by the SMO-style loops in main()
+  double oracle_loss;                          // oracleN->mt_metric - mt_metric
+  SparseVector<double> oracle_feat_diff;       // oracleN->features - features
+  boost::shared_ptr<HypothesisInfo> oracleN;   // hypothesis this entry was scored against (set in UpdateOracles)
+};
+
+// True when a equals b exactly, or when |a - b| relative to |b| is below EPSILON.
+bool ApproxEqual(double a, double b) {
+  return a == b || fabs(a - b) / fabs(b) < EPSILON;
+}
+
+typedef shared_ptr<HypothesisInfo> HI;  // shorthand for the comparator signatures that follow
+// Sort predicate: larger mt_metric first. (The stray ';' after the body was removed.)
+bool HypothesisCompareB(const HI& h1, const HI& h2) {
+  return h1->mt_metric > h2->mt_metric;
+}
+
+
+// Sort predicate: larger hope score first. (The stray ';' after the body was removed.)
+bool HopeCompareB(const HI& h1, const HI& h2) {
+  return h1->hope > h2->hope;
+}
+
+// Sort predicate: larger fear score first. (The stray ';' after the body was removed.)
+bool FearCompareB(const HI& h1, const HI& h2) {
+  return h1->fear > h2->fear;
+}
+
+// Sort predicate: larger model score under the global dense_weights_g first. (Stray ';' removed.)
+bool FearComparePred(const HI& h1, const HI& h2) {
+  return h1->features.dot(dense_weights_g) > h2->features.dot(dense_weights_g);
+}
+
+// Sort predicate: smaller mt_metric first. (The stray ';' after the body was removed.)
+bool HypothesisCompareG(const HI& h1, const HI& h2) {
+  return h1->mt_metric < h2->mt_metric;
+}
+
+
+// Cutting-plane step: takes the maximum-fear hypothesis all_hyp[0] and, when its fear exceeds
+// the largest fear already in the constraint set *cur_c by more than SMO_EPSILON, appends it to
+// the set and sets *again to true. *again is not written otherwise.
+// With the global no_reweight set, hope and fear of every entry of all_hyp are first recomputed
+// under dense_weights (fear relative to the new hope hypothesis) and all_hyp is re-sorted by
+// decreasing fear. Without it all_hyp is used as is, so the caller has to pass it sorted by
+// decreasing fear.
+// Does nothing when all_hyp is empty; an empty constraint set always receives the new hypothesis.
+// dense_weights is taken by const reference to avoid copying the weight vector on every call.
+void CuttingPlane(vector<boost::shared_ptr<HypothesisInfo> >* cur_c, bool* again, vector<boost::shared_ptr<HypothesisInfo> >& all_hyp, const vector<weight_t>& dense_weights)
+{
+  const bool DEBUG_CUT = false;
+  vector<boost::shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c;
+
+  // all_hyp[0] is read below, which would be out of bounds for an empty list.
+  if (all_hyp.empty()) return;
+
+  if (no_reweight)
+    {
+      // Find the new hope hypothesis: metric score plus model score.
+      for (size_t u = 0; u != all_hyp.size(); ++u)
+        all_hyp[u]->hope = 1 * all_hyp[u]->mt_metric + all_hyp[u]->features.dot(dense_weights);
+      sort(all_hyp.begin(), all_hyp.end(), HopeCompareB);
+
+      const double hope_metric = all_hyp[0]->mt_metric;
+      const double hope_score = all_hyp[0]->features.dot(dense_weights);
+      if (DEBUG_CUT) cerr << "New hope derivation score " << hope_score << endl;
+
+      // Fear: metric loss relative to the hope hypothesis plus the model score gap to it.
+      for (size_t u = 0; u != all_hyp.size(); ++u)
+        {
+          const double t_score = all_hyp[u]->features.dot(dense_weights);
+          all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*hope_metric - hope_score + t_score;
+        }
+      sort(all_hyp.begin(), all_hyp.end(), FearCompareB);
+    }
+
+  // The maximum-fear derivation over all derivations sits at the front of the list.
+  const boost::shared_ptr<HypothesisInfo> max_fear = all_hyp[0];
+  if (DEBUG_CUT) cerr << "Cutting Plane Max Fear " << max_fear->fear;
+
+  // Maximal violator already in the constraint set (stays null when the set is empty).
+  boost::shared_ptr<HypothesisInfo> max_fear_in_set;
+  for (size_t i = 0; i < cur_constraint.size(); ++i)
+    {
+      if (!max_fear_in_set || cur_constraint[i]->fear > max_fear_in_set->fear)
+        max_fear_in_set = cur_constraint[i];
+    }
+  if (DEBUG_CUT && max_fear_in_set) cerr << "Max Fear in constraint set " << max_fear_in_set->fear << endl;
+
+  if (!max_fear_in_set || max_fear->fear > max_fear_in_set->fear + SMO_EPSILON)
+    {
+      cur_constraint.push_back(max_fear);
+      *again = true;
+      if (DEBUG_CUT) cerr << "Optimize Again " << *again << endl;
+    }
+}
+
+
+// Computes the clipped step for one pair of constraints in the SMO-style inner loop.
+//
+//   cur_p          points to (at least) two hypotheses; oracleN, oracle_loss and alpha must be set
+//   max_step_size  regularization constant C, which scales the denominator of the step
+//   dense_weights  current dense weights (taken by const reference: the optimizer-5 loop in main()
+//                  calls this for every constraint pair, and by-value copied the vector each time)
+//
+// Returns (margin + loss) / (||f0 - f1||^2 * C), or 0 when the two feature vectors coincide,
+// clipped to [-alpha0, alpha1]. The intermediate values are logged to stderr.
+double ComputeDelta(vector<boost::shared_ptr<HypothesisInfo> >* cur_p, double max_step_size, const vector<weight_t>& dense_weights)
+{
+  const HypothesisInfo& h0 = *(*cur_p)[0];
+  const HypothesisInfo& h1 = *(*cur_p)[1];
+
+  // Model scores of both hypotheses, computed once for the margin and for the log line.
+  const double score0 = h0.features.dot(dense_weights);
+  const double score1 = h1.features.dot(dense_weights);
+
+  // Margin: model score gap between each hypothesis and the oracle it was scored against.
+  // TODO: is it a problem that new oracle is used in diff?
+  const double loss = h0.oracle_loss - h1.oracle_loss;
+  const double margin = -(h0.oracleN->features.dot(dense_weights) - score0) + (h1.oracleN->features.dot(dense_weights) - score1);
+  const double num = margin + loss;
+  cerr << "LOSS: " << num << " Margin:" << margin << " BLEUL:" << loss << " " << score1 << " " << score0 <<endl;
+
+  SparseVector<double> diff = h0.features;
+  diff -= h1.features;
+  const double diffsqnorm = diff.l2norm_sq();
+
+  // Identical feature vectors would give a zero denominator; take no step in that case.
+  double delta = 0;
+  if (diffsqnorm > 0)
+    delta = num / (diffsqnorm * max_step_size);
+  cerr << " D1:" << delta;
+
+  // Clip delta (enforce margin constraints): the optimizer-5 loop in main() applies
+  // alpha0 += delta and alpha1 -= delta, and the clipping keeps both from dropping below zero.
+  delta = max(-h0.alpha, min(delta, h1.alpha));
+  cerr << " D2:" << delta;
+  return delta;
+}
+
+
+// Picks the next pair of constraints for an SMO-style update from *cur_c.
+// Returns a two-element vector, or an empty one when no pair is left to optimize.
+// With no_select set, or for optimizer 2 (1-fear MIRA), the heuristic search is skipped and the
+// first two constraints are returned as they are (empty result for fewer than two constraints).
+// Otherwise the constraints are scanned in order. For constraint u, max_fear is the largest fear
+// among the other constraints; u is paired up in two situations:
+//   1. alpha_u == 0 and fear_u > max_fear + SMO_EPSILON: u is paired with the first other
+//      constraint whose alpha is positive;
+//   2. alpha_u > 0 and fear_u < max_fear - SMO_EPSILON: u is paired with the first other
+//      constraint whose fear is larger than fear_u.
+vector<boost::shared_ptr<HypothesisInfo> > SelectPair(vector<boost::shared_ptr<HypothesisInfo> >* cur_c)
+{
+  const bool DEBUG_SELECT = false;
+  vector<boost::shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c;
+  const size_t n = cur_constraint.size();
+  vector<boost::shared_ptr<HypothesisInfo> > selected;
+
+  if (no_select || optimizer == 2)
+    {
+      // Fewer than two constraints cannot form a pair; indexing [1] would be out of bounds.
+      if (n < 2) return selected;
+      selected.push_back(cur_constraint[0]);
+      selected.push_back(cur_constraint[1]);
+      return selected;
+    }
+
+  for (size_t u = 0; u != n; ++u)
+    {
+      const boost::shared_ptr<HypothesisInfo>& cand = cur_constraint[u];
+      if (DEBUG_SELECT) cerr << "cur alpha " << u << " " << cand->alpha;
+
+      // Largest fear among all constraints other than u.
+      boost::shared_ptr<HypothesisInfo> max_fear;
+      for (size_t i = 0; i < n; ++i)
+        if (i != u && (!max_fear || cur_constraint[i]->fear > max_fear->fear))
+          max_fear = cur_constraint[i];
+      if (!max_fear) return selected;  // u is the only constraint
+      if (DEBUG_SELECT) cerr << " F" << max_fear->fear << endl;
+
+      if (cand->alpha == 0 && cand->fear > max_fear->fear + SMO_EPSILON)
+        {
+          for (size_t i = 0; i < n; ++i)
+            if (i != u && cur_constraint[i]->alpha > 0)
+              {
+                selected.push_back(cand);
+                selected.push_back(cur_constraint[i]);
+                if (DEBUG_SELECT) cerr << "RETURN from 1" << endl;
+                return selected;
+              }
+        }
+      if (cand->alpha > 0 && cand->fear < max_fear->fear - SMO_EPSILON)
+        {
+          for (size_t i = 0; i < n; ++i)
+            if (i != u && cur_constraint[i]->fear > cand->fear)
+              {
+                selected.push_back(cand);
+                selected.push_back(cur_constraint[i]);
+                return selected;
+              }
+        }
+    }
+  return selected;  // no more constraints to optimize, we're done here
+}
+
+// Per-sentence hypothesis lists: the hope side ("good") and the fear side ("bad").
+struct GoodBadOracle {
+  vector<shared_ptr<HypothesisInfo> > good, bad;
+};
+
+struct TrainingObserver : public DecoderObserver {
+  // k: k-best list size; d: per-sentence scorers; *o: per-sentence good/bad lists; *cbs: per-sentence
+  // statistics of the previous pass. o and cbs are kept by reference, not copied.
+  // Scalar members are zeroed: corpus_src_length is accumulated before anything else assigns it.
+  TrainingObserver(const int k, const DocScorer& d, vector<GoodBadOracle>* o, vector<ScoreP>* cbs)
+    : ds(d), corpus_bleu_sent_stats(*cbs), oracles(*o), kbest_size(k), cur_sent(0),
+      corpus_bleu_score(0), corpus_src_length(0), curr_src_length(0) {
+    // Calculate the corpus BLEU score from the previous iteration's 1-best for the BLEU gain.
+    if (!pseudo_doc && cur_pass > 0)
+      {
+        ScoreP acc;
+        for (size_t ii = 0; ii < corpus_bleu_sent_stats.size(); ii++) {
+          if (!acc) { acc = corpus_bleu_sent_stats[ii]->GetZero(); }
+          acc->PlusEquals(*corpus_bleu_sent_stats[ii]);
+        }
+        corpus_bleu_stats = acc;
+        if (acc) corpus_bleu_score = acc->ComputeScore();  // acc stays null without previous statistics
+      }
+  }
+  const DocScorer& ds;                           // reference scorers, indexed by sentence id
+  vector<ScoreP>& corpus_bleu_sent_stats;        // per-sentence statistics of the previous pass (caller-owned)
+  vector<GoodBadOracle>& oracles;                // per-sentence good/bad lists (caller-owned)
+  vector<shared_ptr<HypothesisInfo> > cur_best;  // first update_list_size k-best entries of the current sentence
+  shared_ptr<HypothesisInfo> cur_oracle;         // NOTE(review): never assigned in the visible code
+  const int kbest_size;                          // number of derivations requested per sentence
+  Hypergraph forest;                             // copy of the forest from the latest NotifyTranslationForest call
+  int cur_sent;                                  // sentence id from the latest NotifyTranslationForest call
+  ScoreP corpus_bleu_stats;                      // corpus-level or pseudo-document statistics
+  float corpus_bleu_score;                       // score of corpus_bleu_stats, computed in the constructor
+
+  float corpus_src_length;                       // pseudo-document source length, scaled by PSEUDO_SCALE per sentence
+  float curr_src_length;                         // source length of the current sentence
+
+  // Sentence id recorded by the latest NotifyTranslationForest call.
+  // (The top-level const on the returned int has no effect for callers.)
+  const int GetCurrentSent() const { return cur_sent; }
+
+  // First entry of cur_best for the current sentence.
+  // NOTE(review): cur_best is indexed without a check, so it must not be empty.
+  const HypothesisInfo& GetCurrentBestHypothesis() const { return *cur_best[0]; }
+
+  // Copy of the cur_best list of the current sentence; it is returned by value, so the
+  // shared_ptr elements are copied while the hypotheses themselves stay shared.
+  const vector<shared_ptr<HypothesisInfo> > GetCurrentBest() const { return cur_best; }
+  
+  // Dereferences cur_oracle.
+  // NOTE(review): no visible code assigns cur_oracle, so it may still be null here - confirm before use.
+  const HypothesisInfo& GetCurrentOracle() const { return *cur_oracle; }
+  
+  // Forest copy stored by the latest NotifyTranslationForest call.
+  // The referenced object is overwritten by the next NotifyTranslationForest call.
+  const Hypergraph& GetCurrentForest() const { return forest; }
+  
+
+  // DecoderObserver hook: records the sentence id and source length, rebuilds the hypothesis
+  // lists from *hg (unique or unfiltered k-best), then stores a copy of the forest.
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    const int sent_id = smeta.GetSentenceID();
+    cur_sent = sent_id;
+    curr_src_length = static_cast<float>(smeta.GetSourceLength());
+    if (!unique_kbest)
+      UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(sent_id, *hg);
+    else
+      UpdateOracles<KBest::FilterUnique>(sent_id, *hg);
+    forest = *hg;
+  }
+
+  shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score, const vector<WordID>& hyp) {
+    shared_ptr<HypothesisInfo> info(new HypothesisInfo);  // only features, mt_metric and hyp are assigned here
+    info->hyp = hyp;
+    info->mt_metric = score;
+    info->features = feats;
+    return info;
+  }
+
+  // Rebuilds the hypothesis lists of sentence sent_id from up to kbest_size derivations of forest:
+  //   cur_best               the first update_list_size derivations, in k-best order;
+  //   oracles[sent_id].good  the top entries by hope score (hope_select == 1) or by metric score;
+  //   oracles[sent_id].bad   the "fear" entries, chosen according to fear_select.
+  // Filter is the k-best filter policy (NotifyTranslationForest passes KBest::FilterUnique or
+  // KBest::NoFilter).
+  //
+  // Every metric score is multiplied by mt_metric_scale and negated when invert_score is set.
+  // With approx_score the score is (a) a gain over the previous pass's corpus statistics, (b) a
+  // score against the pseudo-document, or (c) a sentence-level score; see the branches below.
+  //
+  // Side effects: in pseudo_doc mode corpus_bleu_stats and corpus_src_length are updated with the
+  // first entry of cur_best and scaled by PSEUDO_SCALE; several lines are written to stderr.
+  // In pseudo_doc mode the pseudo-document statistics are now also created on first use in the
+  // update step, so running without approx_score no longer dereferences a null corpus_bleu_stats.
+  //
+  // NOTE(review): good[0], bad[0] and cur_best[0] are read unconditionally (here and in main()),
+  // so the forest has to yield at least one derivation.
+  template <class Filter>
+  void UpdateOracles(int sent_id, const Hypergraph& forest) {
+    bool PRINT_LIST= false;
+    // References into the caller-owned oracle table; all three lists are rebuilt from scratch.
+    vector<shared_ptr<HypothesisInfo> >& cur_good = oracles[sent_id].good;
+    vector<shared_ptr<HypothesisInfo> >& cur_bad = oracles[sent_id].bad;
+    //TODO: look at keeping previous iterations hypothesis lists around
+    cur_best.clear();
+    cur_good.clear();
+    cur_bad.clear();
+
+    vector<shared_ptr<HypothesisInfo> > all_hyp;
+
+    typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,Filter> K;
+    K kbest(forest,kbest_size);
+
+    // Extract derivations one at a time; LazyKthBest returns null once none are left.
+    for (int i = 0; i < kbest_size; ++i) {
+      typename K::Derivation *d =
+        kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+      if (!d) break;
+
+      // Metric score of this derivation, computed in one of the modes listed above.
+      float sentscore;
+      if(approx_score)
+        {
+          if(cur_pass > 0 && !pseudo_doc)
+            {
+              // (a) Gain of this candidate: the previous pass's statistics for this sentence
+              // are subtracted from the corpus statistics, the remainder is added to the
+              // candidate's statistics via PlusEquals(..., 0.5), and the two scores are compared.
+              ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield);
+              ScoreP corpus_no_best = corpus_bleu_stats->GetZero();
+
+              corpus_bleu_stats->Subtract(*corpus_bleu_sent_stats[sent_id], &*corpus_no_best);
+              sent_stats->PlusEquals(*corpus_no_best, 0.5);
+
+              //compute gain from new sentence in 1-best corpus
+              sentscore = mt_metric_scale * (sent_stats->ComputeScore() - corpus_no_best->ComputeScore());// - corpus_bleu_score);
+            }
+          else if(pseudo_doc)
+            {
+              // (b) Pseudo-corpus approach: the candidate is scored together with the
+              // pseudo-document statistics and scaled by the pseudo-document source length.
+              float src_scale = corpus_src_length + curr_src_length;
+              ScoreP sent_stats = ds[sent_id]->ScoreCandidate(d->yield);
+              if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();}
+
+              sent_stats->PlusEquals(*corpus_bleu_stats);
+              sentscore =  mt_metric_scale  * src_scale * sent_stats->ComputeScore();
+            }
+          else
+            {
+              // (c) Sentence-level approximation, used for the 0th iteration.
+              sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeSentScore());
+            }
+        }
+      else
+        {
+          sentscore = mt_metric_scale * (ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore());
+        }
+
+      if (invert_score) sentscore *= -1.0;
+
+      // The first update_list_size derivations form cur_best.
+      if (i < update_list_size){
+        if(PRINT_LIST)cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
+        cur_best.push_back( MakeHypothesisInfo(d->feature_values, sentscore, d->yield));
+      }
+
+      all_hyp.push_back(MakeHypothesisInfo(d->feature_values, sentscore,d->yield));   //store all hyp to extract oracle best and worst
+    }
+
+    if(pseudo_doc){
+      //update pseudo-doc stats with the first entry of cur_best
+      string details, details2;
+
+      ScoreP sent_stats = ds[sent_id]->ScoreCandidate(cur_best[0]->hyp);
+      // Without approx_score nothing has created the pseudo-document statistics yet; start
+      // them from zero rather than dereferencing a null ScoreP.
+      if(!corpus_bleu_stats){ corpus_bleu_stats = sent_stats->GetZero();}
+      corpus_bleu_stats->ScoreDetails(&details2);
+      corpus_bleu_stats->PlusEquals(*sent_stats);
+
+      sent_stats->ScoreDetails(&details);
+
+      // Decay step: the statistics are rebuilt from zero via PlusEquals(..., PSEUDO_SCALE).
+      sent_stats = corpus_bleu_stats;
+      corpus_bleu_stats = sent_stats->GetZero();
+      corpus_bleu_stats->PlusEquals(*sent_stats, PSEUDO_SCALE);
+
+      corpus_src_length = PSEUDO_SCALE * (corpus_src_length + curr_src_length);
+      cerr << "CORP S " << corpus_src_length << " " << curr_src_length << "\n" << details << "\n " << details2 << endl;
+    }
+
+    //figure out how many hyps we can keep maximum
+    int temp_update_size = update_list_size;
+    if (all_hyp.size() < update_list_size){ temp_update_size = all_hyp.size();}
+
+    //sort all hyps by sentscore (bleu)
+    sort(all_hyp.begin(),all_hyp.end(),HypothesisCompareB);
+
+    if(PRINT_LIST){  cerr << "Sorting " << endl; for(int u=0;u!=all_hyp.size();u++)	cerr << all_hyp[u]->mt_metric << " " << all_hyp[u]->features.dot(dense_weights_g) << endl; }
+
+    // With hope_select != 1 the list simply stays in metric order.
+    if(hope_select == 1)
+      {
+        //find hope hypothesis using model + bleu
+        if (PRINT_LIST) cerr << "HOPE " << endl;
+        for(int u=0;u!=all_hyp.size();u++)
+          {
+            double t_score = all_hyp[u]->features.dot(dense_weights_g);
+            all_hyp[u]->hope = all_hyp[u]->mt_metric + t_score;
+            if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " S:" << t_score << endl;
+          }
+
+        //sort hyps by hope score
+        sort(all_hyp.begin(),all_hyp.end(),HopeCompareB);
+      }
+
+    //assign cur_good the sorted list
+    cur_good.insert(cur_good.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
+    if(PRINT_LIST) { cerr << "GOOD" << endl;  for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;}
+
+    // The first entry of the good list is the oracle that fear scores are measured against.
+    shared_ptr<HypothesisInfo>& oracleN = cur_good[0];
+    if(fear_select == 1){
+      //compute fear hyps
+      if (PRINT_LIST) cerr << "FEAR " << endl;
+      double hope_score = oracleN->features.dot(dense_weights_g);
+      if (PRINT_LIST) cerr << "hope score " << hope_score << endl;
+      for(int u=0;u!=all_hyp.size();u++)
+        {
+          double t_score = all_hyp[u]->features.dot(dense_weights_g);
+
+          all_hyp[u]->fear = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric - hope_score + t_score; //relative loss
+          all_hyp[u]->oracle_loss = -1*all_hyp[u]->mt_metric + 1*oracleN->mt_metric;
+          all_hyp[u]->oracle_feat_diff = oracleN->features - all_hyp[u]->features;
+
+          // NOTE(review): for the oracle itself this stores a shared_ptr to its own object,
+          // i.e. a reference cycle that keeps it alive after the lists are cleared.
+          all_hyp[u]->oracleN=oracleN;
+          if (PRINT_LIST) cerr << all_hyp[u]->mt_metric << " H:" << all_hyp[u]->hope << " F:" << all_hyp[u]->fear << endl;
+        }
+
+      sort(all_hyp.begin(),all_hyp.end(),FearCompareB);
+
+      cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
+    }
+    else if(fear_select == 2) //select fear based on cost
+      {
+        // Take the tail of the list in its current order, last entry first.
+        cur_bad.insert(cur_bad.begin(), all_hyp.end()-temp_update_size, all_hyp.end());
+        reverse(cur_bad.begin(),cur_bad.end());
+      }
+    else //pred-based, fear_select = 3
+      {
+        // Highest model score under dense_weights_g first.
+        sort(all_hyp.begin(),all_hyp.end(),FearComparePred);
+        cur_bad.insert(cur_bad.begin(), all_hyp.begin(), all_hyp.begin()+temp_update_size);
+      }
+
+    if(PRINT_LIST){ cerr<< "BAD"<<endl; for(int u=0;u!=cur_bad.size();u++) cerr << cur_bad[u]->mt_metric << " H:" << cur_bad[u]->hope << " F:" << cur_bad[u]->fear << endl;}
+
+    // Logged for every sentence, independent of PRINT_LIST.
+    cerr << "GOOD (BEST): " << cur_good[0]->mt_metric << endl;
+    cerr << " CUR: " << cur_best[0]->mt_metric << endl;
+    cerr << " BAD (WORST): " << cur_bad[0]->mt_metric << endl;
+  }
+};
+
+// Reads the file fname line by line and appends each line to *c.
+// A final line without a trailing newline is kept as well.
+// NOTE(review): the only call visible in main() is commented out.
+void ReadTrainingCorpus(const string& fname, vector<string>* c) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+
+  // getline() reports failure only when no characters could be extracted, so the loop ends
+  // exactly at the end of the input.
+  string line;
+  while (getline(in, line))
+    c->push_back(line);
+}
+
+// Scores the translations written by an earlier run against the references.
+// Reads od/run.raw.init when cur_pass is 0 and od/run.raw.<cur_pass-1>.B otherwise, one translation per
+// line (empty lines are scored too). The statistics of line i, scored with ds[i], are appended to *c;
+// the score of the accumulated statistics is logged. Lines beyond ds.size() are ignored with a warning.
+void ReadPastTranslationForScore(const int cur_pass, vector<ScoreP>* c, DocScorer& ds, const string& od)
+{
+  cerr << "Reading BLEU gain file ";
+  string fname;
+  if (cur_pass == 0)
+    fname = od + "/run.raw.init";
+  else
+    fname = od + "/run.raw." + boost::lexical_cast<std::string>(cur_pass - 1) + ".B";
+  cerr << fname << "\n";
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  ScoreP acc;
+  string line;
+  int lc = 0;
+  while (in) {
+    getline(in, line);
+    if (line.empty() && !in) break;  // end of input: a failed getline leaves line empty
+    if (lc >= static_cast<int>(ds.size())) {  // ds[lc] would index past the references
+      cerr << "WARNING: " << fname << " has more lines than there are references; ignoring the rest" << endl;
+      break;
+    }
+    vector<WordID> sent;
+    TD::ConvertSentence(line, &sent);
+    ScoreP sentscore = ds[lc]->ScoreCandidate(sent);
+    c->push_back(sentscore);
+    if (!acc) { acc = sentscore->GetZero(); }
+    acc->PlusEquals(*sentscore);
+    ++lc;
+  }
+  assert(lc > 0);
+  if (!acc) return;  // only reachable with NDEBUG: nothing was read, so there is nothing to report
+  float score = acc->ComputeScore();
+  string details;
+  acc->ScoreDetails(&details);
+  cerr << "INIT RUN " << details << score << endl;
+}
+
+
+int main(int argc, char** argv) {
+  register_feature_functions();
+  SetSilent(true);  // turn off verbose decoder output
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+
+  if (conf.count("random_seed"))
+    rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    rng.reset(new MT19937);
+  
+  vector<string> corpus;
+  //ReadTrainingCorpus(conf["source"].as<string>(), &corpus);
+
+  const string metric_name = conf["mt_metric"].as<string>();
+  optimizer = conf["optimizer"].as<int>();
+  fear_select = conf["fear"].as<int>();
+  hope_select = conf["hope"].as<int>();
+  mt_metric_scale = conf["mt_metric_scale"].as<double>();
+  approx_score = conf.count("approx_score");
+  no_reweight = conf.count("no_reweight");
+  no_select = conf.count("no_select");
+  update_list_size = conf["update_k_best"].as<int>();
+  unique_kbest = conf.count("unique_k_best");
+  pseudo_doc = true;
+
+  const string weights_dir = conf["weights_output"].as<string>();
+  const string output_dir = conf["output_dir"].as<string>();
+  ScoreType type = ScoreTypeFromString(metric_name);
+
+  //establish metric used for tuning
+  if (type == TER) {
+    invert_score = true;
+    // approx_score = false;
+  } else {
+    invert_score = false;
+  }
+
+  //load references
+  DocScorer ds(type, conf["reference"].as<vector<string> >(), "");
+  cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl;
+  vector<ScoreP> corpus_bleu_sent_stats;
+  
+  //check training pass,if >0, then use previous iterations corpus bleu stats
+  cur_pass = conf["passes"].as<int>();
+  if(cur_pass > 0)
+    {
+      ReadPastTranslationForScore(cur_pass, &corpus_bleu_sent_stats, ds, output_dir);
+    }
+  /*  if (ds.size() != corpus.size()) {
+    cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
+    return 1;
+    }*/
+  cerr << "Optimizing with " << optimizer << endl;
+  // load initial weights
+  /*Weights weights;
+  weights.InitFromFile(conf["input_weights"].as<string>());
+  SparseVector<double> lambdas;
+  weights.InitSparseVector(&lambdas);
+  */
+
+  
+  
+  ReadFile ini_rf(conf["decoder_config"].as<string>());
+  Decoder decoder(ini_rf.stream());
+
+  vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+  
+  SparseVector<weight_t> lambdas;
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights);
+  Weights::InitSparseVector(dense_weights, &lambdas);
+
+  const string input = decoder.GetConf()["input"].as<string>();
+  //const bool show_feature_dictionary = decoder.GetConf().count("show_feature_dictionary");
+  if (!SILENT) cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl;
+  ReadFile in_read(input);
+  istream *in = in_read.stream();
+  assert(*in);  
+  string buf;
+  
+  const double max_step_size = conf["max_step_size"].as<double>();
+
+
+  //  assert(corpus.size() > 0);
+  vector<GoodBadOracle> oracles(ds.size());
+
+  TrainingObserver observer(conf["k_best_size"].as<int>(), ds, &oracles, &corpus_bleu_sent_stats);
+
+  int cur_sent = 0;
+  int lcount = 0;
+  double objective=0;
+  double tot_loss = 0;
+  int dots = 0;
+  //  int cur_pass = 1;
+  //  vector<double> dense_weights;
+  SparseVector<double> tot;
+  SparseVector<double> final_tot;
+  //  tot += lambdas;          // initial weights
+  //  lcount++;                // count for initial weights
+
+  //string msg = "# MIRA tuned weights";
+  // while (cur_pass <= max_iteration) {
+    SparseVector<double> old_lambdas = lambdas;
+    tot.clear();
+    tot += lambdas;
+    cerr << "PASS " << cur_pass << " " << endl << lambdas << endl; 
+    ScoreP acc, acc_h, acc_f;
+    
+    while(*in) {
+      getline(*in, buf);
+      if (buf.empty()) continue;
+      //for (cur_sent = 0; cur_sent < corpus.size(); cur_sent++) {
+      
+      cerr << "SENT: " << cur_sent << endl;
+      //TODO: allow batch updating
+      //dense_weights.clear();
+      //weights.InitFromVector(lambdas);
+      //weights.InitVector(&dense_weights);
+      //decoder.SetWeights(dense_weights);  
+      lambdas.init_vector(&dense_weights);
+      dense_weights_g = dense_weights;
+      decoder.SetId(cur_sent);
+      decoder.Decode(buf, &observer);  // decode the sentence, calling Notify to get the hope,fear, and model best hyps. 
+      
+      cur_sent = observer.GetCurrentSent();
+      const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis();
+      const HypothesisInfo& cur_good = *oracles[cur_sent].good[0];
+      const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0];
+
+      vector<shared_ptr<HypothesisInfo> >& cur_good_v = oracles[cur_sent].good;
+      vector<shared_ptr<HypothesisInfo> >& cur_bad_v = oracles[cur_sent].bad;
+      vector<shared_ptr<HypothesisInfo> > cur_best_v = observer.GetCurrentBest();
+
+      tot_loss += cur_hyp.mt_metric;
+      
+      //score hyps to be able to compute corpus level bleu after we finish this iteration through the corpus
+      ScoreP sentscore = ds[cur_sent]->ScoreCandidate(cur_hyp.hyp);
+      if (!acc) { acc = sentscore->GetZero(); }
+      acc->PlusEquals(*sentscore);
+
+      ScoreP hope_sentscore = ds[cur_sent]->ScoreCandidate(cur_good.hyp);
+      if (!acc_h) { acc_h = hope_sentscore->GetZero(); }
+      acc_h->PlusEquals(*hope_sentscore);
+
+      ScoreP fear_sentscore = ds[cur_sent]->ScoreCandidate(cur_bad.hyp);
+      if (!acc_f) { acc_f = fear_sentscore->GetZero(); }
+      acc_f->PlusEquals(*fear_sentscore);
+      
+      if(optimizer == 4) { //single dual coordinate update, cur_good selected on BLEU score only (not model+BLEU)
+	//	if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) {
+      
+	  double margin = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights);
+	  double mt_loss = (cur_good.mt_metric - cur_bad.mt_metric);
+	  const double loss = margin +  mt_loss;
+	  cerr << "LOSS: " << loss << " Margin:" << margin << " BLEUL:" << mt_loss << " " << cur_bad.features.dot(dense_weights) << " " << cur_good.features.dot(dense_weights) <<endl;
+	  //	  if (loss > 0.0) {
+	    SparseVector<double> diff = cur_good.features;
+	    diff -= cur_bad.features;	    
+
+	    double diffsqnorm = diff.l2norm_sq();
+	    double delta;
+	    if (diffsqnorm > 0)
+	      delta = loss / (diffsqnorm);
+	    else
+	      delta = 0;
+	    
+	    //double step_size = loss / diff.l2norm_sq();
+	    cerr << loss << " " << delta << " " << diff << endl;
+	    if (delta > max_step_size) delta = max_step_size;
+	    lambdas += (cur_good.features * delta);
+	    lambdas -= (cur_bad.features * delta);
+	    //cerr << "L: " << lambdas << endl;
+	    //	  }
+	    //	  }
+      }
+      else if(optimizer == 1) //sgd - nonadapted step size
+	{
+	   
+	  lambdas += (cur_good.features) * max_step_size;
+	  lambdas -= (cur_bad.features) * max_step_size;
+	}
+      //cerr << "L: " << lambdas << endl;
+      else if(optimizer == 5) //full mira with n-best list of constraints from oracle, fear, best
+	{
+	  vector<shared_ptr<HypothesisInfo> > cur_constraint;
+	  cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end());
+	  cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end());
+	  cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end());
+
+	  bool optimize_again;
+	  vector<shared_ptr<HypothesisInfo> > cur_pair;
+	  //SMO 
+	  for(int u=0;u!=cur_constraint.size();u++)	
+	    cur_constraint[u]->alpha =0;	      
+	  
+	  cur_constraint[0]->alpha =1; //set oracle to alpha=1
+
+	  cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl;
+	  int smo_iter = 10, smo_iter2 = 10;
+	  int iter, iter2 =0;
+	  bool DEBUG_SMO = false;
+	  while (iter2 < smo_iter2)
+	    {
+	      iter =0;
+	      while (iter < smo_iter)
+		{
+		  optimize_again = true;
+		  for (int i = 0; i< cur_constraint.size(); i++)
+		    for (int j = i+1; j< cur_constraint.size(); j++)
+		      {
+			if(DEBUG_SMO) cerr << "start " << i << " " << j <<  endl;
+			cur_pair.clear();
+			cur_pair.push_back(cur_constraint[j]);
+			cur_pair.push_back(cur_constraint[i]);
+			double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights);
+			
+			if (delta == 0) optimize_again = false;
+			//			cur_pair[0]->alpha += delta;
+			//	cur_pair[1]->alpha -= delta;
+			cur_constraint[j]->alpha += delta;
+			cur_constraint[i]->alpha -= delta;
+			double step_size = delta * max_step_size;
+			/*lambdas += (cur_pair[1]->features) * step_size;
+			lambdas -= (cur_pair[0]->features) * step_size;*/
+			lambdas += (cur_constraint[i]->features) * step_size;
+			lambdas -= (cur_constraint[j]->features) * step_size;
+			if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << i << " " << j << " " <<  delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha <<  endl;		
+			
+			//reload weights based on update
+			/*dense_weights.clear();
+			weights.InitFromVector(lambdas);
+			weights.InitVector(&dense_weights);*/
+		      }
+		  iter++;
+		  
+		  if(!optimize_again)
+		    { 
+		      iter = 100;
+		      cerr << "Optimization stopped, delta =0" << endl;
+		    }
+		  
+		  
+		}
+	      iter2++;
+	    }
+
+	  
+	}
+      else if(optimizer == 2 || optimizer == 3) //1-fear and cutting plane mira
+	  {
+	    bool DEBUG_SMO= true;
+	    vector<shared_ptr<HypothesisInfo> > cur_constraint;
+	    cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set
+	    bool optimize_again = true;
+	    int cut_plane_calls = 0;
+	    while (optimize_again)
+	      { 
+		if(DEBUG_SMO) cerr<< "optimize again: " << optimize_again << endl;
+		if(optimizer == 2){ //1-fear
+		  cur_constraint.push_back(cur_bad_v[0]);
+
+		  //check if we have a violation
+		  if(!(cur_constraint[1]->fear > cur_constraint[0]->fear + SMO_EPSILON))
+		    {
+		      optimize_again = false;
+		      cerr << "Constraint not violated" << endl;
+		    }
+		}
+		else
+		  { //cutting plane to add constraints
+		    if(DEBUG_SMO) cerr<< "Cutting Plane " << cut_plane_calls << " with " << lambdas << endl;
+		    optimize_again = false;
+		    cut_plane_calls++;
+		    CuttingPlane(&cur_constraint, &optimize_again, oracles[cur_sent].bad, dense_weights);
+		    if (cut_plane_calls >= MAX_SMO) optimize_again = false;
+		  }
+
+		if(optimize_again)
+		  {
+		    //SMO 
+		    for(int u=0;u!=cur_constraint.size();u++)	
+		      { 
+			cur_constraint[u]->alpha =0;
+			//cur_good_v[0]->alpha = 1; cur_bad_v[0]->alpha = 0;
+		      }
+		    cur_constraint[0]->alpha = 1;
+		    cerr <<"Optimizing with " << cur_constraint.size() << " constraints" << endl;
+		    int smo_iter = MAX_SMO;
+		    int iter =0;
+		    while (iter < smo_iter)
+		      {			
+			//select pair to optimize from constraint set
+			vector<shared_ptr<HypothesisInfo> > cur_pair = SelectPair(&cur_constraint);
+			
+			if(cur_pair.empty()){iter=MAX_SMO; cerr << "Undefined pair " << endl; continue;} //pair is undefined so we are done with this smo 
+
+			//double num = cur_good_v[0]->fear - cur_bad_v[0]->fear;
+			/*double loss = cur_good_v[0]->oracle_loss - cur_bad_v[0]->oracle_loss;
+			  double margin = cur_good_v[0]->oracle_feat_diff.dot(dense_weights) - cur_bad_v[0]->oracle_feat_diff.dot(dense_weights);
+			  double num = loss - margin;
+			  SparseVector<double> diff = cur_good_v[0]->features;
+			  diff -= cur_bad_v[0]->features;
+			  double delta = num / (diff.l2norm_sq() * max_step_size);
+			  delta = max(-cur_good_v[0]->alpha, min(delta, cur_bad_v[0]->alpha));
+			  cur_good_v[0]->alpha += delta;
+			  cur_bad_v[0]->alpha -= delta;
+			  double step_size = delta * max_step_size;
+			  lambdas += (cur_bad_v[0]->features) * step_size;
+			  lambdas -= (cur_good_v[0]->features) * step_size;
+			*/
+			
+			double delta = ComputeDelta(&cur_pair,max_step_size, dense_weights);
+
+			cur_pair[0]->alpha += delta;
+			cur_pair[1]->alpha -= delta;
+			double step_size = delta * max_step_size;
+			/*			lambdas += (cur_pair[1]->oracle_feat_diff) * step_size;
+						lambdas -= (cur_pair[0]->oracle_feat_diff) * step_size;*/
+			
+			cerr << "step " << step_size << endl;
+			double alpha_sum=0;
+			SparseVector<double> temp_lambdas = lambdas;
+			
+			for(int u=0;u!=cur_constraint.size();u++)	
+			  { 
+			    cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << endl;
+			    temp_lambdas += (cur_constraint[u]->oracleN->features-cur_constraint[u]->features) * cur_constraint[u]->alpha * step_size;
+			    alpha_sum += cur_constraint[u]->alpha;
+			  }
+			cerr << "Alpha sum " << alpha_sum << " " << temp_lambdas << endl;
+						
+			lambdas += (cur_pair[1]->features) * step_size;
+			lambdas -= (cur_pair[0]->features) * step_size;
+			cerr << " Lambdas " << lambdas << endl;
+			//reload weights based on update
+			dense_weights.clear();
+			//weights.InitFromVector(lambdas);
+			//weights.InitVector(&dense_weights);
+			lambdas.init_vector(&dense_weights);
+			dense_weights_g = dense_weights;
+			iter++;
+					
+			if(DEBUG_SMO) cerr << "SMO opt " << iter << " " << delta << " " << cur_pair[0]->alpha << " " << cur_pair[1]->alpha <<  endl;		
+			//		cerr << "SMO opt " << iter << " " << delta << " " << cur_good_v[0]->alpha << " " << cur_bad_v[0]->alpha <<  endl;
+			if(no_select) //don't use selection heuristic to determine when to stop SMO, rather just when delta =0 
+			  if (delta == 0) iter = MAX_SMO;
+			
+			//only perform one dual coordinate ascent step
+			if(optimizer == 2) 
+			  {
+			    optimize_again = false;
+			    iter = MAX_SMO;
+			  }		
+			
+		      }
+		    if(optimizer == 3)
+		      {
+			if(!no_reweight)
+			  {
+			    if(DEBUG_SMO) cerr<< "Decoding with new weights -- now orac are " << oracles[cur_sent].good.size() << endl;
+			    Hypergraph hg = observer.GetCurrentForest();
+			    hg.Reweight(dense_weights);
+			    //observer.UpdateOracles(cur_sent, hg);
+			    if(unique_kbest)
+                              observer.UpdateOracles<KBest::FilterUnique>(cur_sent, hg);
+                            else
+                              observer.UpdateOracles<KBest::NoFilter<std::vector<WordID> > >(cur_sent, hg);
+
+			    
+			  }
+		      }
+		  }
+		
+		
+	      }
+	   
+	    //print objective after this sentence
+	    double lambda_change = (lambdas - old_lambdas).l2norm_sq();
+	    double max_fear = cur_constraint[cur_constraint.size()-1]->fear;
+	    double temp_objective = 0.5 * lambda_change;// + max_step_size * max_fear;
+
+	    for(int u=0;u!=cur_constraint.size();u++)	
+	      { 
+		cerr << cur_constraint[u]->alpha << " " << cur_constraint[u]->hope << " " << cur_constraint[u]->fear << endl;
+		temp_objective += cur_constraint[u]->alpha * cur_constraint[u]->fear;
+	      }
+	    objective += temp_objective;
+	    
+	    cerr << "SENT OBJ: " << temp_objective << " NEW OBJ: " << objective << endl;
+	  }
+      
+    
+      if ((cur_sent * 40 / ds.size()) > dots) { ++dots; cerr << '.'; }
+      tot += lambdas;
+      ++lcount;
+      cur_sent++;
+      
+      cout << TD::GetString(cur_good_v[0]->hyp) << " ||| " << TD::GetString(cur_best_v[0]->hyp) << " ||| " << TD::GetString(cur_bad_v[0]->hyp) << endl;
+
+      //clear good/bad lists from oracles for this sentences  - you want to keep them around for things
+      
+      //      oracles[cur_sent].good.clear();
+      //oracles[cur_sent].bad.clear();
+    }
+
+    cerr << "FINAL OBJECTIVE: "<< objective << endl;
+    final_tot += tot;
+    cerr << "Translated " << lcount << " sentences " << endl;
+    cerr << " [AVG METRIC LAST PASS=" << (tot_loss / lcount) << "]\n";
+    tot_loss = 0;
+    /*
+      float corpus_score = acc->ComputeScore();
+      string corpus_details;
+      acc->ScoreDetails(&corpus_details);
+      cerr << "MODEL " << corpus_details << endl;
+      cout << corpus_score << endl;
+      
+      corpus_score = acc_h->ComputeScore();
+      acc_h->ScoreDetails(&corpus_details);
+      cerr << "HOPE " << corpus_details << endl;
+      cout << corpus_score << endl;
+      
+      corpus_score = acc_f->ComputeScore();
+      acc_f->ScoreDetails(&corpus_details);
+      cerr << "FEAR " << corpus_details << endl;
+      cout << corpus_score << endl;
+    */
+    int node_id = rng->next() * 100000;
+    cerr << " Writing weights to " << node_id << endl;
+    Weights::ShowLargestFeatures(dense_weights);
+    dots = 0;
+    ostringstream os;
+    os << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << ".gz";
+    string msg = "# MIRA tuned weights ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount);
+    //Weights.InitFromVector(lambdas);
+    lambdas.init_vector(&dense_weights);
+    Weights::WriteToFile(os.str(), dense_weights, true, &msg);
+
+    SparseVector<double> x = tot;
+    x /= lcount;
+    ostringstream sa;
+    string msga = "# MIRA tuned weights AVERAGED ||| " + boost::lexical_cast<std::string>(node_id) + " ||| " + boost::lexical_cast<std::string>(lcount);
+    sa << weights_dir << "/weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "." << node_id << "-avg.gz";
+    //Weights ww;
+    //ww.InitFromVector(x);
+    x.init_vector(&dense_weights);
+    Weights::WriteToFile(sa.str(), dense_weights, true, &msga);
+
+    //assign averaged lambdas to initialize next iteration
+    //lambdas = x;
+
+    /*    double lambda_change = (old_lambdas - lambdas).l2norm_sq();
+    cerr << "Change in lambda " << lambda_change << endl;
+    
+    if ( lambda_change < EPSILON)
+      {
+	cur_pass = max_iteration;
+	cerr << "Weights converged - breaking" << endl;
+      }
+            
+    ++cur_pass;
+    */
+    
+    //} iteration while loop
+ 
+    /* cerr << endl;
+  weights.WriteToFile("weights.mira-final.gz", true, &msg);
+  final_tot /= (lcount + 1);//max_iteration);
+  tot /= (corpus.size() + 1);
+  weights.InitFromVector(final_tot);
+  cerr << tot << "||||" << final_tot << endl;
+  msg = "# MIRA tuned weights (averaged vector)";
+  weights.WriteToFile("weights.mira-final-avg.gz", true, &msg);
+    */
+  cerr << "Optimization complete.\\AVERAGED WEIGHTS: weights.mira-final-avg.gz\n";
+  return 0;
+}
+
diff --git a/training/mira/run_mira.pl b/training/mira/run_mira.pl
new file mode 100755
index 00000000..f4d61407
--- /dev/null
+++ b/training/mira/run_mira.pl
@@ -0,0 +1,548 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0));
+push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+
+require "libcall.pl";
+
+
+# ---- Defaults and tool locations -------------------------------------------
+# Everything below is file-scoped state; most of it can be overridden by the
+# GetOptions call that follows. Helper tools are located relative to the
+# directory this script lives in ($SCRIPT_DIR).
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+
+my $iteration = 0.0;
+my $max_iterations = 6;
+my $metric = "ibm_bleu";
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $decode_nodes = 1;   # number of decode nodes
+my $pmem = "1g";
+my $dir;
+
+# Scorer and job-distribution wrappers used when building shell commands.
+my $SCORER = $FAST_SCORE;
+my $local_server = "$bin_dir/local_parallelize.pl";
+my $parallelize = "$bin_dir/../dpmert/parallelize.pl";
+my $libcall = "$bin_dir/../dpmert/libcall.pl";
+my $sentserver = "$bin_dir/../dpmert/sentserver";
+my $sentclient = "$bin_dir/../dpmert/sentclient";
+my $run_local_server = 0;
+my $run_local = 0;
+my $usefork;
+my $pass_suffix = '';
+
+# Default trainer binary; --decoder replaces it after option parsing.
+my $cdec ="$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv";
+
+#my $cdec ="$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mirav5"; #"$bin_dir/kbest_mira_rmmv2"; #"$bin_dir/kbest_mira_lv";
+die "Can't find decoder in $cdec" unless -x $cdec;
+my $decoder = $cdec;
+my $decoderOpt;
+# Values forwarded to the trainer command line in the main loop
+# (-b, -k, -s, -o, -C, -h, -f respectively for update/kbest size, metric
+# scale, optimizer, step size, hope and fear selection).
+my $update_size=250;
+my $approx_score;
+my $kbest_size=250;
+my $metric_scale=1;
+my $optimizer=2;
+my $disable_clean = 0;
+my $use_make;  # use make to parallelize line search
+my $density_prune;
+my $cpbin=1;
+my $help = 0;
+my $epsilon = 0.0001;
+my $step_size = 0.01;
+my $gpref;
+my $unique_kbest;
+my $freeze;
+my $latent;
+my $sample_max;
+my $hopes=1;
+my $fears=1;
+
+# Pick a random base port in 15000..49999; it is handed to the parallelizer
+# via --baseport in the main loop.
+my $range = 35000;
+my $minimum = 15000;
+my $portn = int(rand($range)) + $minimum;
+
+
+# Process command-line options
+# Abbreviations are disabled, so every option must be spelled out in full.
+# Exactly one positional argument (the decoder ini file) must remain after
+# parsing; a parse failure, a wrong argument count, or --help prints the help
+# text and exits.
+# NOTE(review): --local_server is spelled with an underscore while every other
+# multi-word option uses a hyphen.
+# NOTE(review): --approx-score, --freeze, --epsilon, --cpbin and --pass-suffix
+# are accepted here but their variables are not read again in this script.
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+        "decoder=s" => \$decoderOpt,
+        "decode-nodes=i" => \$decode_nodes,
+        "density-prune=f" => \$density_prune,
+        "dont-clean" => \$disable_clean,
+        "pass-suffix=s" => \$pass_suffix,
+        "use-fork" => \$usefork,
+        "epsilon=s" => \$epsilon,
+        "help" => \$help,
+        "local" => \$run_local,
+	"local_server" => \$run_local_server,
+        "use-make=i" => \$use_make,
+        "max-iterations=i" => \$max_iterations,
+        "pmem=s" => \$pmem,
+        "cpbin!" => \$cpbin,
+        "ref-files=s" => \$refFiles,
+        "metric=s" => \$metric,
+        "source-file=s" => \$srcFile,
+        "weights=s" => \$initialWeights,
+	"optimizer=i" => \$optimizer,
+	"metric-scale=i" => \$metric_scale,
+	"kbest-size=i" => \$kbest_size,
+	"update-size=i" => \$update_size,
+	"step-size=f" => \$step_size,
+	"hope-select=i" => \$hopes,
+	"fear-select=i" => \$fears,
+	"approx-score" => \$approx_score,
+	"unique-kbest" => \$unique_kbest,
+	"latent" => \$latent,
+	"sample-max=i" => \$sample_max,
+        "grammar-prefix=s" => \$gpref,
+	"freeze" => \$freeze,
+        "workdir=s" => \$dir,
+	) == 0 || @ARGV!=1 || $help) {
+        print_help();
+        exit;
+}
+
+# The single remaining positional argument is the decoder ini file.
+$iniFile = $ARGV[0];
+
+# Forward declarations for subs defined further down the file.
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+chomp(my $host = check_output("hostname"));
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# Reference files become a string of "-r <file>" arguments.
+my $refs_comma_sep = get_comma_sep_refs('r', $refFiles);
+
+# Working directory: defaults to "mira" and is always made absolute.
+$dir = "mira" unless $dir;
+if ($dir !~ m{^/}) {
+        chomp(my $basedir = check_output("pwd"));
+        $dir = "$basedir/$dir";
+}
+
+# An explicit --decoder wins over the built-in default binary.
+$decoder = $decoderOpt if $decoderOpt;
+
+# Initializations and helper functions
+srand;
+
+# Bookkeeping consumed by cleanup().
+my @childpids = ();
+my @cleanupcmds = ();
+
+# Kill every recorded child process, run every registered cleanup command,
+# then terminate the script with exit status 1. Also installed as the handler
+# for INT/TERM/HUP.
+sub cleanup {
+        print STDERR "Cleanup...\n";
+        foreach my $child (@childpids) {
+                my $kill_cmd = "kill $child";
+                unchecked_call($kill_cmd);
+        }
+        foreach my $extra (@cleanupcmds) {
+                unchecked_call("$extra");
+        }
+        exit 1;
+}
+
+# Always call cleanup, no matter how we exit
+# NOTE(review): this glob assignment runs at run time, after the rest of the
+# file has been compiled; confirm against perlsub whether `exit` calls in this
+# file are actually routed through it.
+*CORE::GLOBAL::exit =
+    sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+
+# The working copy of the ini file is named after the decoder binary.
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+# Only existence is verified here; the file is copied into the working
+# directory later. The previous unchecked two-argument open(INI, ...) was
+# dropped because nothing in this script ever read from or closed that handle.
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+# modbin($bindir, \$path1, \$path2, ...)
+# Ensures $bindir exists, copies each referenced file into it (cp -p), and
+# rewrites every referenced variable to point at the copy.
+sub modbin {
+    local $_;
+    my ($target_dir, @path_refs) = @_;
+    check_call("mkdir -p $target_dir");
+    die "couldn't make bindir $target_dir" unless -d $target_dir;
+    foreach my $path_ref (@path_refs) {
+        my $original = $$path_ref;
+        $$path_ref = "$target_dir/" . basename($original);
+        check_call("cp -p $original $$path_ref");
+    }
+}
+# dirsize($path)
+# Returns the number of entries in directory $path, not counting "." and "..".
+# A directory that cannot be opened is reported as empty (0).
+#
+# The previous version evaluated readdir in scalar context, which yields the
+# next entry *name* rather than a count, so the result was never a usable
+# size; it also left the directory handle open.
+sub dirsize {
+    my ($path) = @_;
+    opendir(my $dh, $path) or return 0;
+    my @entries = grep { $_ ne '.' && $_ ne '..' } readdir($dh);
+    closedir($dh);
+    return scalar(@entries);
+}
+
+
+
+
+# Prepare the working directory: refuse to reuse one that already holds
+# results, otherwise create it, save a script that re-runs this exact command
+# line, and copy the initial weights in as weights.0.
+if (-e $dir && dirsize($dir)>1 && -e "$dir/weights" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs
+    die "ERROR: working dir $dir already exists\n\n";
+} else {
+    -e $dir || mkdir $dir;
+    mkdir "$dir/scripts";
+    my $cmdfile="$dir/rerun-mira.sh";
+    # Fail loudly instead of printing to an unopened handle.
+    open CMD,'>',$cmdfile or die "Can't write $cmdfile: $!";
+    print CMD "cd ",&getcwd,"\n";
+    my $cline=&cmdline."\n";
+    print CMD $cline;
+    close CMD;
+    print STDERR $cline;
+    chmod(0755,$cmdfile);
+    unless (defined $initialWeights && -e $initialWeights) {
+        # The option is registered as "weights=s", so name it correctly here.
+        print STDERR "Please specify an initial weights file with --weights\n";
+        print_help();
+        exit;
+    }
+    check_call("cp $initialWeights $dir/weights.0");
+    die "Can't find weights.0" unless (-e "$dir/weights.0");
+}
+write_config(*STDERR);
+
+# Generate initial files and values
+# From here on the script works with its own copies inside $dir: the ini file
+# and a <seg>-wrapped version of the dev source.
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc, $gpref);
+$srcFile = $newsrc;
+
+# Number of dev sentences; the main loop checks the trainer output against it.
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+$devSize++ while <F>;
+close F;
+
+my $lastPScore = 0;
+my $lastWeightsFile;
+
+# main optimization loop
+#while (1){
+# One pass per iteration: run the trainer over the dev set (locally or through
+# a parallelizing wrapper), verify it produced one output line per sentence,
+# score the hope/best/fear translations, then average the per-node weight
+# files into the weights for the next iteration.
+for (my $opt_iter=0; $opt_iter<$max_iterations; $opt_iter++) {
+
+    print STDERR "\n\nITERATION $opt_iter\n==========\n";
+    print STDERR "Using port $portn\n";
+
+    # iteration-specific files
+    my $runFile="$dir/run.raw.$opt_iter";
+    my $logdir="$dir/logs.$opt_iter";
+    my $decoderLog="$logdir/decoder.sentserver.log.$opt_iter";
+    my $weightdir="$dir/weights.pass$opt_iter/";
+    check_call("mkdir -p $logdir");
+    check_call("mkdir -p $weightdir");
+
+    #decode
+    print STDERR "RUNNING DECODER AT ";
+    print STDERR unchecked_output("date");
+    my $weightsFile="$dir/weights.$opt_iter";
+    print "ITER $iteration " ;
+    my $cur_pass = "-p 0$opt_iter";
+    my $decoder_cmd = "$decoder -c $iniFile -w $weightsFile $refs_comma_sep -m $metric -s $metric_scale -a -b $update_size -k $kbest_size -o $optimizer $cur_pass -O $weightdir -D $dir  -h $hopes -f $fears -C $step_size";
+    if($unique_kbest){
+        $decoder_cmd .= " -u";
+    }
+    if($latent){
+        $decoder_cmd .= " -l";
+    }
+    if($sample_max){
+        $decoder_cmd .= " -t $sample_max";
+    }
+    if ($density_prune) {
+        $decoder_cmd .= " --density_prune $density_prune";
+    }
+
+    # --use-fork is parsed as a boolean, so interpolating $usefork directly
+    # would put a bare "1" on the wrapper command line. Turn it back into the
+    # literal flag (or nothing) before building the command.
+    my $fork_flag = $usefork ? "--use-fork" : "";
+
+    my $pcmd;
+    if ($run_local) {
+        $pcmd = "cat $srcFile |";
+    } elsif ($use_make) {
+        # TODO: Throw error when decode_nodes is specified along with use_make
+        $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $use_make --";
+    } elsif ($run_local_server){
+        $pcmd = "cat $srcFile | $local_server $fork_flag -p $pmem -e $logdir -n $decode_nodes --";
+    }
+    else {
+        $pcmd = "cat $srcFile | $parallelize $fork_flag -p $pmem -e $logdir -j $decode_nodes --baseport $portn --";
+    }
+    my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+    print STDERR "COMMAND:\n$cmd\n";
+    check_bash_call($cmd);
+
+    # The output may lag behind on a distributed filesystem, so re-count the
+    # lines a few times before concluding the trainer failed.
+    my $retries = 0;
+    my $num_topbest;
+    while($retries < 5) {
+        $num_topbest = check_output("wc -l < $runFile");
+        print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+        if($devSize == $num_topbest) {
+            last;
+        } else {
+            print STDERR "Incorrect number of topbest. Waiting for distributed filesystem and retrying...\n";
+            sleep(3);
+        }
+        $retries++;
+    }
+    die "Dev set contains $devSize sentences, but we don't have topbest for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_topbest);
+
+
+    #score the output from this iteration
+    # Each output line is "hope ||| best ||| fear"; split it into three
+    # parallel files so each can be scored on its own.
+    open RUN, "<$runFile" or die "Can't read $runFile: $!";
+    open H, ">$runFile.H" or die;
+    open F, ">$runFile.F" or die;
+    open B, ">$runFile.B" or die;
+    while(<RUN>) {
+        chomp();
+        (my $hope,my $best,my $fear) = split(/ \|\|\| /);
+        print H "$hope \n";
+        print B "$best \n";
+        print F "$fear \n";
+    }
+    close RUN;
+    close F; close B; close H;
+
+    my $dec_score = check_output("cat $runFile.B | $SCORER $refs_comma_sep -l $metric");
+    my $dec_score_h = check_output("cat $runFile.H | $SCORER $refs_comma_sep -l $metric");
+    my $dec_score_f = check_output("cat $runFile.F | $SCORER $refs_comma_sep -l $metric");
+    chomp $dec_score; chomp $dec_score_h; chomp $dec_score_f;
+    print STDERR "DECODER SCORE: $dec_score HOPE: $dec_score_h FEAR: $dec_score_f\n";
+
+    # save space
+    check_call("gzip -f $runFile");
+    check_call("gzip -f $decoderLog");
+
+    my $nextIter = $opt_iter + 1;
+    my $newWeightsFile = "$dir/weights.$nextIter";
+    $lastWeightsFile = "$dir/weights.$opt_iter";
+
+    # Combine the per-node weight files of this pass into next pass's input.
+    average_weights("$weightdir/weights.mira-pass*.*[0-9].gz", $newWeightsFile, $logdir);
+#    check_call("cp $lastW $newWeightsFile");
+#    if ($icc < 2) {
+#        print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
+#        last;
+#    }
+    system("gzip -f $logdir/kbes*");
+    print STDERR "\n==========\n";
+    $iteration++;
+}
+#find 
+#my $cmd = `grep SCORE /fs/clip-galep5/lexical_tm/log.runmira.nist.20 | cat -n | sort -k +2 | tail -1`;
+#$cmd =~ m/([0-9]+)/;
+#$lastWeightsFile = "$dir/weights.$1";
+#check_call("ln -s $lastWeightsFile $dir/weights.tuned");
+# Announce the result on STDERR for humans and emit just the path on STDOUT
+# so a calling script can capture it.
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT $lastWeightsFile, "\n";
+
+# get_lines($path)
+# Returns the number of lines in the file at $path; dies if it cannot be read.
+# Uses a lexical handle that is closed before returning (the bareword handle
+# used previously was left open) and leaves the caller's $_ untouched.
+sub get_lines {
+  my ($fn) = @_;
+  open(my $fh, '<', $fn) or die "Couldn't read $fn: $!";
+  local $_;
+  my $lc = 0;
+  $lc++ while <$fh>;
+  close $fh;
+  return $lc;
+}
+
+# get_comma_sep_refs($flag, $pattern)
+# Lets the shell expand $pattern (via echo) and returns the resulting names as
+# "-$flag name1 -$flag name2 ...". Despite the name, the output is separated by
+# spaces and flags, not commas.
+sub get_comma_sep_refs {
+  my $flag = shift;
+  my $pattern = shift;
+  my $expanded = check_output("echo $pattern");
+  chomp $expanded;
+  my $separator = " -$flag ";
+  return "-$flag " . join($separator, split(/\s+/, $expanded));
+}
+
+
+# read_weights_file($file)
+# Reads a weights file of "<name> <value>" lines, skipping comment (#) and
+# blank lines, and returns the values joined by single spaces in file order.
+# Lines that do not match the two-field pattern only produce a warning.
+sub read_weights_file {
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;
+  while(<F>) {
+    next if /^#/;
+    next if /^\s*$/;
+    chomp;
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;
+      my $w = $2;
+      # NOTE(review): $pm is never updated after initialisation, so this
+      # compares every feature name numerically against -1 only — presumably
+      # a leftover from a format with numeric feature ids; confirm intent.
+      die "Weights out of order: $m <= $pm" unless $m > $pm;
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+# write_config($fh)
+# Prints a blank line followed by a labelled summary of the current run
+# settings to the given filehandle, one "LABEL: value" row per setting.
+sub write_config {
+    my ($out) = @_;
+    my $cleanup = $disable_clean ? "no" : "yes";
+
+    my @rows = (
+        [ "DECODER:          ", $decoder ],
+        [ "INI FILE:         ", $iniFile ],
+        [ "WORKING DIR:      ", $dir ],
+        [ "SOURCE (DEV):     ", $srcFile ],
+        [ "REFS (DEV):       ", $refFiles ],
+        [ "EVAL METRIC:      ", $metric ],
+        [ "START ITERATION:  ", $iteration ],
+        [ "MAX ITERATIONS:   ", $max_iterations ],
+        [ "DECODE NODES:     ", $decode_nodes ],
+        [ "HEAD NODE:        ", $host ],
+        [ "PMEM (DECODING):  ", $pmem ],
+        [ "CLEANUP:          ", $cleanup ],
+        [ "INITIAL WEIGHTS:  ", $initialWeights ],
+        [ "GRAMMAR PREFIX:   ", $gpref ],
+    );
+
+    print $out "\n";
+    foreach my $row (@rows) {
+        my ($label, $value) = @$row;
+        print $out "$label$value\n";
+    }
+}
+
+# update_weights_file($path, \@names, \@values)
+# Writes one "name value" line per feature to $path. Dies if the two arrays
+# differ in length or the file cannot be opened for writing.
+sub update_weights_file {
+  my ($neww, $rfn, $rpts) = @_;
+  my $num_feats = scalar @$rfn;
+  my $num_pts = scalar @$rpts;
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open G, ">$neww" or die;
+  foreach my $idx (0 .. $num_feats - 1) {
+    my ($name, $value) = ($rfn->[$idx], $rpts->[$idx]);
+    print G "$name $value\n";
+  }
+  close G;
+}
+
+# enseg($src, $newsrc, $grammarpref)
+# Copies $src to $newsrc, making sure every line is wrapped in a <seg> tag:
+#   * lines that already start with <seg must carry a numeric id attribute
+#     and are copied unchanged (otherwise the script dies);
+#   * other lines are wrapped as <seg id="N">...</seg>, where N is the
+#     zero-based line number, with grammar="$grammarpref.N.gz" added when a
+#     grammar prefix is given.
+# Dies if the source cannot be read or the destination cannot be written;
+# previously both opens were unchecked and a bad path silently produced an
+# empty output file.
+sub enseg {
+    my ($src, $newsrc, $grammarpref) = @_;
+
+    die "No source file given (use --source-file)\n"
+        unless defined $src && length $src;
+    # Two-argument open is kept for the source so existing invocations that
+    # rely on its special forms keep working.
+    open(my $in, $src) or die "Can't read source file $src: $!";
+    open(my $out, '>', $newsrc) or die "Can't write $newsrc: $!";
+    my $i=0;
+    while (my $line=<$in>){
+        chomp $line;
+        if ($line =~ /^\s*<seg/i) {
+            if($line =~ /id="[0-9]+"/) {
+                print $out "$line\n";
+            } else {
+                die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+            }
+        }
+        elsif (defined $grammarpref) {
+            print $out "<seg id=\"$i\" grammar=\"$grammarpref.$i.gz\">$line</seg>\n";
+        }
+        else {
+            print $out "<seg id=\"$i\">$line</seg>\n";
+        }
+        $i++;
+    }
+    close $in;
+    close $out or die "Can't finish writing $newsrc: $!";
+}
+
+# print_help()
+# Prints a usage summary to STDOUT. The option list mirrors the GetOptions
+# specification used by this script; defaults shown are the initial values of
+# the corresponding variables.
+sub print_help {
+    print "Usage: $0 [options] <decoder.ini>\n";
+    print "\n";
+    print "Required:\n";
+    print "  --weights <file>         initial weights file\n";
+    print "  --source-file <file>     dev source sentences\n";
+    print "  --ref-files <pattern>    reference file(s); expanded by the shell\n";
+    print "\n";
+    print "Options:\n";
+    print "  --workdir <dir>          working directory (default: mira)\n";
+    print "  --decoder <path>         trainer binary to run\n";
+    print "  --max-iterations <n>     number of passes (default: 6)\n";
+    print "  --metric <name>          evaluation metric (default: ibm_bleu)\n";
+    print "  --metric-scale <n>       (default: 1)\n";
+    print "  --optimizer <n>          (default: 2)\n";
+    print "  --step-size <x>          (default: 0.01)\n";
+    print "  --kbest-size <n>         (default: 250)\n";
+    print "  --update-size <n>        (default: 250)\n";
+    print "  --hope-select <n>        (default: 1)\n";
+    print "  --fear-select <n>        (default: 1)\n";
+    print "  --unique-kbest, --latent, --approx-score, --freeze\n";
+    print "  --sample-max <n>, --density-prune <x>, --epsilon <x>\n";
+    print "  --grammar-prefix <str>   per-sentence grammar prefix\n";
+    print "  --decode-nodes <n>       (default: 1)\n";
+    print "  --pmem <size>            (default: 1g)\n";
+    print "  --local, --local_server, --use-fork, --use-make <n>\n";
+    print "  --pass-suffix <str>, --[no]cpbin, --dont-clean\n";
+    print "  --help                   show this message\n";
+}
+
+# Returns the original invocation (script name plus the arguments saved at
+# startup) as one space-separated string, without any shell quoting.
+sub cmdline {
+    my @parts = ($0, @ORIG_ARGV);
+    return join(' ', @parts);
+}
+
+# Characters that force an argument to be quoted, and the subset that must be
+# backslash-escaped inside double quotes.
+#
+# The "$" in the first class has to be written as "\$": an unescaped "$!"
+# inside a pattern is interpolated as the current errno string, which removed
+# "$" and "!" from the class and could add the letters of an error message to
+# it (the likely cause of arguments being quoted "sometimes").
+# NOTE: these are initialised when execution reaches this point of the file,
+# so the helpers below must not be called before then.
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}\$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+# escape_shell($arg)
+# Returns $arg unchanged when it contains no shell-special characters;
+# otherwise returns it wrapped in double quotes with \ " $ ` ! escaped.
+# Returns undef for an undefined argument.
+sub escape_shell {
+    my ($arg)=@_;
+    return undef unless defined $arg;
+    if ($arg =~ /$is_shell_special/) {
+        $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+        return "\"$arg\"";
+    }
+    return $arg;
+}
+
+# Returns a shell-escaped copy of every argument (trailing newline removed
+# first); the caller's values are not modified.
+sub escaped_shell_args {
+    my @quoted;
+    foreach my $raw (@_) {
+        my $copy = $raw;
+        chomp $copy;
+        push @quoted, escape_shell($copy);
+    }
+    return @quoted;
+}
+
+# Shell-escapes every argument and joins the results with single spaces.
+sub escaped_shell_args_str {
+    my @quoted = escaped_shell_args(@_);
+    return join(' ', @quoted);
+}
+
+# Like cmdline(), but with every original argument shell-escaped; the script
+# name itself is left as-is.
+sub escaped_cmdline {
+    my $quoted_args = escaped_shell_args_str(@ORIG_ARGV);
+    return join(' ', $0, $quoted_args);
+}
+
+# average_weights($path, $out, $logpath)
+# Builds a weighted average of all gzipped weight files matching the glob
+# $path and writes it to $out as "feature value" lines.
+#
+# Each input file is expected to start with a comment line of the form
+#   "# ... ||| <id> ||| <count>"
+# and <count> is used as that file's multiplier: every feature value in the
+# file is scaled by it, and the sums are divided by the total of all counts.
+# Files are unzipped for reading and re-zipped afterwards. $logpath is
+# accepted for call compatibility but not used.
+#
+# Dies if $out cannot be written or if features were read but the total count
+# is zero. (The output open used to be followed by "or next" outside of any
+# loop in this sub, and the output handle was never closed.)
+sub average_weights {
+    my ($path, $out, $logpath) = @_;
+    print "AVERAGE $path $out\n";
+    my %feature_weights= ();
+    my $total_mult =0;
+    # Give a shared filesystem time to show the files written by other nodes.
+    sleep(10);
+    foreach my $file (glob "$path")
+    {
+        my $cmd = "gzip -d $file";
+        # Strip only the trailing extension, not a ".gz" elsewhere in the path.
+        $file =~ s/\.gz$//;
+        check_bash_call($cmd);
+        my $mult = 0;
+        print "FILE $file \n";
+        open(my $score_fh, '<', $file) or next;
+        while (my $line = <$score_fh>) {
+            if ($line !~ m/^\#/)
+            {
+                my @s = split(" ",$line);
+                $feature_weights{$s[0]}+= $mult * $s[1];
+            }
+            else
+            {
+                my ($msg, $ran, $count) = split(/ \|\|\| /, $line);
+                # Ignore comment lines that do not carry a count, so they
+                # cannot reset the multiplier read from the header.
+                $mult = $count if defined $count;
+                print "RAN $ran $mult\n";
+            }
+        }
+        $total_mult += $mult;
+
+        close $score_fh;
+        $cmd = "gzip $file"; check_bash_call($cmd);
+    }
+
+    die "Cannot average weights from $path: total sentence count is zero\n"
+        if %feature_weights && !$total_mult;
+
+#print out new averaged weights
+    open(my $out_fh, '>', $out) or die "Can't write averaged weights to $out: $!";
+    for my $f ( keys %feature_weights ) {
+        print "$f $feature_weights{$f} $total_mult\n";
+        my $ave = $feature_weights{$f} / $total_mult;
+
+        print "Printing $f $ave ||| ";
+        print $out_fh "$f $ave\n";
+    }
+    close $out_fh or die "Can't finish writing $out: $!";
+}