diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-16 01:56:34 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-16 01:56:34 +0000 |
commit | d7d59c4bb81262f1dfece384ec68fa2c25096843 (patch) | |
tree | 5521dc624dc23adeb3bc9d9c8f8fecc7feb57724 | |
parent | ff323448416bbfa691a9697ddf3b30a0398fa08a (diff) |
oracle directions
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@276 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/cdec.cc | 29 | ||||
-rwxr-xr-x | decoder/ff_fsa.h | 5 | ||||
-rw-r--r-- | decoder/logval.h | 6 | ||||
-rwxr-xr-x | decoder/oracle_bleu.h | 79 | ||||
-rw-r--r-- | decoder/sparse_vector.h | 39 | ||||
-rwxr-xr-x | decoder/value_array.h | 12 | ||||
-rw-r--r-- | decoder/viterbi.cc | 2 | ||||
-rw-r--r-- | decoder/viterbi.h | 2 | ||||
-rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 75 |
9 files changed, 174 insertions, 75 deletions
diff --git a/decoder/cdec.cc b/decoder/cdec.cc index e616f1bb..75c907b1 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -308,7 +308,7 @@ bool prelm_weights_string(po::variables_map const& conf,string &s) } -void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_features,FeatureWeights *weights=0) { +void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_features,WeightVector *weights=0) { cerr << viterbi_stats(forest,name,true,show_tree); if (show_features) { cerr << name<<" features: "; @@ -601,33 +601,14 @@ int main(int argc, char** argv) { vector<WordID> trans; ViterbiESentence(forest, &trans); + /*Oracle Rescoring*/ if(get_oracle_forest) { - Timer t("Forest Oracle rescoring:"); - - oracle.DumpKBest(conf,"model",sent_id, forest, 10, true); - - Translation best(forest); - { - Hypergraph oracle_forest; - oracle.Rescore(smeta,forest,&oracle_forest,feature_weights,1.0); - forest.swap(oracle_forest); - } - Translation oracle_trans(forest); - + Oracles o=oracles.ComputeOracles(smeta,forest,feature_weights,&cerr,10,conf["forest_output"].as<std::string>()); cerr << " +Oracle BLEU forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; cerr << " +Oracle BLEU (paths): " << forest.NumberOfPaths() << endl; - oracle_trans.Print(cerr," +Oracle BLEU"); - //compute kbest for oracle - oracle.DumpKBest(conf,"oracle",sent_id, forest, 10, true); - - //reweight the model with -1 for the BLEU feature to compute k-best list for negative examples - oracle.ReweightBleu(&forest,-1.0); - Translation neg_trans(forest); - neg_trans.Print(cerr," -Oracle BLEU"); - //compute kbest for negative - oracle.DumpKBest(conf,"negative",sent_id, forest, 10, true); - + o.hope.Print(cerr," +Oracle BLEU"); + o.fear.Print(cerr," -Oracle BLEU"); //Add 1-best translation (trans) to psuedo-doc vectors oracle.IncludeLastScore(&cerr); } diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index cd56f1a5..2ffd6ef8 100755 --- 
a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -1,9 +1,14 @@ #ifndef FF_FSA_H #define FF_FSA_H +#include <stdint.h> //C99 #include <string> #include "ff.h" #include "sparse_vector.h" +#include "value_array.h" + +typedef ValueArray<uint8_t> Bytes; + /* */ diff --git a/decoder/logval.h b/decoder/logval.h index 9aaba557..c8c342a3 100644 --- a/decoder/logval.h +++ b/decoder/logval.h @@ -58,6 +58,12 @@ class LogVal { return *this += b; } + // LogVal(fabs(log(x)),x.s_) + friend LogVal abslog(LogVal x) { + if (x.v_<0) x.v_=-x.v_; + return x; + } + LogVal& poweq(const T& power) { #if LOGVAL_CHECK_NEG if (s_) { diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h index 5fef53fd..550f438f 100755 --- a/decoder/oracle_bleu.h +++ b/decoder/oracle_bleu.h @@ -37,7 +37,31 @@ struct Translation { out<<pre<<"Viterbi: "<<TD::GetString(sentence)<<"\n"; out<<pre<<"features: "<<features<<std::endl; } + bool is_null() { + return features.size()==0 /* && sentence.size()==0 */; + } + +}; + +struct Oracles { + bool is_null() { + return model.is_null() /* && fear.is_null() && hope.is_null() */; + } + Translation model,fear,hope; + // feature 0 will be the error rate in fear and hope + // move toward hope + WeightVector ModelHopeGradient() { + WeightVector r=hope-model; + r[0]=0; + return r; + } + // move toward hope from fear + WeightVector FearHopeGradient() { + WeightVector r=hope-fear; + r[0]=0; + return r; + } }; @@ -53,6 +77,7 @@ struct OracleBleu { opts->add_options() ("references,R", value<Refs >(), "Translation reference files") ("oracle_loss", value<string>(), "IBM_BLEU_3 (default), IBM_BLEU etc") + ("bleu_weight", value<double>()->default_value(1.), "weight to give the hope/fear loss function vs. 
model score") ; } int order; @@ -66,17 +91,20 @@ struct OracleBleu { double doc_src_length; void set_oracle_doc_size(int size) { oracle_doc_size=size; - scale_oracle= 1-1./oracle_doc_size;\ + scale_oracle= 1-1./oracle_doc_size; doc_src_length=0; } OracleBleu(int doc_size=10) { set_oracle_doc_size(doc_size); } - boost::shared_ptr<Score> doc_score,sentscore; // made from factory, so we delete them + typedef boost::shared_ptr<Score> ScoreP; + ScoreP doc_score,sentscore; // made from factory, so we delete them + double bleu_weight; void UseConf(boost::program_options::variables_map const& conf) { using namespace std; + bleu_weight=conf["bleu_weight"].as<double>(); set_loss(conf["oracle_loss"].as<string>()); set_refs(conf["references"].as<Refs>()); } @@ -108,21 +136,48 @@ struct OracleBleu { ViterbiFSentence(forest,&srcsent); SentenceMetadata sm(sent_id,Lattice()); //TODO: make reference from refs? sm.SetSourceLength(srcsent.size()); + smeta.SetScore(doc_score.get()); + smeta.SetDocScorer(&ds); + smeta.SetDocLen(doc_src_length); return sm; } - void Rescore(SentenceMetadata & smeta,Hypergraph const& forest,Hypergraph *dest_forest,WeightVector const& feature_weights,double bleu_weight=1.0) { - Translation model_trans(forest); - sentscore.reset(ds[smeta.GetSentenceID()]->ScoreCandidate(model_trans.sentence)); + Oracles ComputeOracles(SentenceMetadata & smeta,Hypergraph const& forest,WeightVector const& feature_weights,std::ostream *log=0,unsigned kbest=0,std::string const& forest_output="") { + Oracles r; + int sent_id=smeta.GetSentenceID(); + r.model=Translation(forest); + + if (kbest) DumpKBest("model",sent_id, forest, kbest, true, forest_output); + { + Timer t("Forest Oracle rescoring:"); + Hypergraph oracle_forest; + Rescore(smeta,forest,&oracle_forest,feature_weights,bleu_weight,log); + forest.swap(oracle_forest); + } + r.hope=Translation(forest); + if (kbest) DumpKBest("oracle",sent_id, forest, kbest, true, forest_output); + ReweightBleu(&forest,-bleu_weight); 
+ r.fear=Translation(forest); + if (kbest) DumpKBest("negative",sent_id, forest, kbest, true, forest_output); + return r; + } + + ScoreP Score(Sentence const& sentence,int sent_id) { + return ds[sent_id]->ScoreCandidate(sentence); + } + ScoreP Score(Hypergraph const& forest,int sent_id) { + return Score(Translation(forest).sentence,sent_id); + } + + void Rescore(SentenceMetadata & smeta,Hypergraph const& forest,Hypergraph *dest_forest,WeightVector const& feature_weights,double bleu_weight=1.0,std::ostream *log=&std::cerr) { + // the sentence bleu stats will get added to doc only if you call IncludeLastScore + sentscore=Score(forest,smeta.GetSentenceID()); if (!doc_score) { doc_score.reset(sentscore->GetOne()); } tmp_src_length = smeta.GetSourceLength(); //TODO: where does this come from? - smeta.SetScore(doc_score.get()); - smeta.SetDocLen(doc_src_length); - smeta.SetDocScorer(&ds); using namespace std; - ModelSet oracle_models(FeatureWeights(bleu_weight,1),vector<FeatureFunction const*>(1,pff.get())); + ModelSet oracle_models(WeightVector(bleu_weight,1),vector<FeatureFunction const*>(1,pff.get())); const IntersectionConfiguration inter_conf_oracle(0, 0); - cerr << "Going to call Apply Model " << endl; + if (log) *log << "Going to call Apply Model " << endl; ApplyModelSet(forest, smeta, oracle_models, @@ -190,10 +245,10 @@ struct OracleBleu { } } - void DumpKBest(boost::program_options::variables_map const& conf,std::string const& suffix,const int sent_id, const Hypergraph& forest, const int k, const bool unique) +void DumpKBest(boost::program_options::variables_map const& conf,std::string const& suffix,const int sent_id, const Hypergraph& forest, const int k, const bool unique, std::string const& forest_output) { std::ostringstream kbest_string_stream; - kbest_string_stream << conf["forest_output"].as<std::string>() << "/kbest_"<<suffix<< "." + kbest_string_stream << forest_output << "/kbest_"<<suffix<< "." 
<< sent_id; DumpKBest(sent_id, forest, k, unique, kbest_string_stream.str()); } diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index bda10974..c6c57150 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -20,10 +20,21 @@ public: SparseVector() {} explicit SparseVector(std::vector<T> const& v) { typename MapType::iterator p=values_.begin(); - for (unsigned i=0;i<v.size();++i) - p=values_.insert(p,typename MapType::value_type(i,v[i])); //faster + const T z=T(0); + for (unsigned i=0;i<v.size();++i) { + T const& t=v[i]; + if (t!=z) + p=values_.insert(p,typename MapType::value_type(i,t)); //hint makes insertion faster + } + + } + + void set_new_value(int index, T const& val) { + assert(values_.find(index)==values_.end()); + values_[index]=val; } + const T operator[](int index) const { typename MapType::const_iterator found = values_.find(index); if (found == values_.end()) @@ -265,9 +276,29 @@ private: MapType values_; }; +// doesn't support fast indexing directly +template <class T> +class SparseVectorList { + typedef std::vector<std::pair<int,T> > ListType; + typedef typename ListType::value_type pair_type; + typedef typename ListType::const_iterator const_iterator; + SparseVectorList() { } + explicit SparseVectorList(std::vector<T> const& v) { + const T z=T(0); + for (unsigned i=0;i<v.size();++i) { + T const& t=v[i]; + if (t!=z) + p.push_back(pair_type(i,t)); + } + p.resize(p.size()); + } +private: + ListType p; +}; + + typedef SparseVector<double> FeatureVector; -typedef std::vector<double> FeatureWeights; -typedef FeatureWeights WeightVector; +typedef SparseVector<double> WeightVector; template <typename T> SparseVector<T> operator+(const SparseVector<T>& a, const SparseVector<T>& b) { diff --git a/decoder/value_array.h b/decoder/value_array.h index bfdd1155..7401938a 100755 --- a/decoder/value_array.h +++ b/decoder/value_array.h @@ -1,12 +1,12 @@ #ifndef VALUE_ARRAY_H #define VALUE_ARRAY_H -# include <cstdlib> -# include <algorithm> -# 
include <new> -# include <boost/range.hpp> -# include <boost/utility/enable_if.hpp> -# include <boost/type_traits.hpp> +#include <cstdlib> +#include <algorithm> +#include <new> +#include <boost/range.hpp> +#include <boost/utility/enable_if.hpp> +#include <boost/type_traits.hpp> #ifdef USE_BOOST_SERIALIZE # include <boost/serialization/split_member.hpp> # include <boost/serialization/access.hpp> diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc index f11b77ec..7719de32 100644 --- a/decoder/viterbi.cc +++ b/decoder/viterbi.cc @@ -116,7 +116,7 @@ inline bool close_enough(double a,double b,double epsilon) return diff<=epsilon*fabs(a) || diff<=epsilon*fabs(b); } -FeatureVector ViterbiFeatures(Hypergraph const& hg,FeatureWeights const* weights,bool fatal_dotprod_disagreement) { +FeatureVector ViterbiFeatures(Hypergraph const& hg,WeightVector const* weights,bool fatal_dotprod_disagreement) { FeatureVector r; const prob_t p = Viterbi<FeatureVectorTraversal>(hg, &r); if (weights) { diff --git a/decoder/viterbi.h b/decoder/viterbi.h index 4697590b..388bff3c 100644 --- a/decoder/viterbi.h +++ b/decoder/viterbi.h @@ -205,6 +205,6 @@ int ViterbiELength(const Hypergraph& hg); int ViterbiPathLength(const Hypergraph& hg); /// if weights supplied, assert viterbi prob = features.dot(*weights) (exception if fatal, cerr warn if not). 
return features (sum over all edges in viterbi derivation) -FeatureVector ViterbiFeatures(Hypergraph const& hg,FeatureWeights const* weights=0,bool fatal_dotprod_disagreement=false); +FeatureVector ViterbiFeatures(Hypergraph const& hg,WeightVector const* weights=0,bool fatal_dotprod_disagreement=false); #endif diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index e9a5650b..677c0497 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -84,16 +84,16 @@ struct oracle_directions { OracleBleu::AddOptions(&opts); opts.add_options() ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") - ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file") ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("random_directions,d",po::value<unsigned>(&random_directions)->default_value(10),"Number of random directions to run the line optimizer in") ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") + ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") 
("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") - ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("fear_to_hope,f","for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") + ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") + ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") + ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); @@ -139,16 +139,20 @@ struct oracle_directions { oracle.UseConf(conf); include_primary=!conf.count("no_primary"); + old_to_hope=!conf.count("no_old_to_hope"); + if (conf.count("optimize_feature") > 0) optimize_features=conf["optimize_feature"].as<vector<string> >(); - fear_to_hope=conf.count("fear_to_hope"); - n_random=conf["random_directions"].as<unsigned int>(); - forest_repository=conf["forest_repository"].as<string>(); + + // po::value<X>(&var) takes care of below: +// fear_to_hope=conf.count("fear_to_hope"); +// n_random=conf["random_directions"].as<unsigned int>(); +// forest_repository=conf["forest_repository"].as<string>(); // dev_set_size=conf["dev_set_size"].as<unsigned int>(); - n_oracle=conf["oracle_directions"].as<unsigned>(); 
- oracle_batch=conf["oracle_batch"].as<unsigned>(); - max_similarity=conf["max_similarity"].as<double>(); - weights_file=conf["weights"].as<string>(); +// n_oracle=conf["oracle_directions"].as<unsigned>(); +// oracle_batch=conf["oracle_batch"].as<unsigned>(); +// max_similarity=conf["max_similarity"].as<double>(); +// weights_file=conf["weights"].as<string>(); Init(); } @@ -158,7 +162,7 @@ struct oracle_directions { unsigned n_oracle, oracle_batch; string forest_repository; unsigned dev_set_size; - vector<Dir> dirs; //best_to_hope_dirs + vector<Oracle> oracles; vector<int> fids; string forest_file(unsigned i) const { ostringstream o; @@ -178,6 +182,7 @@ struct oracle_directions { weights.InitSparseVector(&origin); fids.clear(); AddFeatureIds(features); + oracles.resize(dev_set_size); } Weights weights; @@ -189,26 +194,42 @@ struct oracle_directions { } - Dir const& operator[](unsigned i) { - Dir &dir=dirs[i]; - if (dir.empty()) { + //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive) + Oracle const& ComputeOracle(unsigned i) { + Oracle &o=oracles[i]; + if (o.is_null()) { ReadFile rf(forest_file(i)); - FeatureVector fear,hope,best; - //TODO: get hope/oracle from vlad. random for now. - LineOptimizer::RandomUnitVector(fids,&dir,&rng); + Hypergraph hg; + { + Timer t("Loading forest from JSON "+forest_file(i)); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + o=oracle.ComputeOracles(MakeMetadata(hg,i),hg,origin,&cerr); } - return dir; + return o; } + // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. 
oracle vectors are summed void AddOracleDirections() { MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); unsigned b=0; for(unsigned i=0;i<n_oracle;++i) { - directions.push_back(Dir()); - Dir &d=directions.back(); - for (unsigned j=0;j<oracle_batch;++j,++b) - d+=(*this)[(start_random || b>=dev_set_size)?rsg():b]; - d/=(double)oracle_batch; + Dir o2hope; + Dir fear2hope; + for (unsigned j=0;j<oracle_batch;++j,++b) { + Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b); + + o2hope+=o.ModelHopeGradient(); + if (fear_to_hope) + fear2hope+=o.FearHopeGradient(); + } + double N=(double)oracle_batch; + o2hope/=N; + directions.push_back(o2hope); + if (fear_to_hope) { + fear2hope/=N; + directions.push_back(fear2hope); + } } } }; |