Added oracle forest rescoring

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@254 ec762483-ff6d-05da-a07a-a48fb63a330f
author: vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-14 23:00:08 +0000
committer: vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-14 23:00:08 +0000
commit: 1350b8e8e465acc9d4d8d43d807cc6093e8f37b9 (patch)
tree: ddbf972363b1d51ecca6d27e1ef226391a4e7151
parent: dc6e2c9c453a76f0bb3dfbca4471e763cc8af1e7 (diff)
14 files changed, 662 insertions, 23 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 49aa45d0..e7b6abd8 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -74,6 +74,13 @@ libcdec_a_SOURCES = \
   ff_wordalign.cc \
   ff_csplit.cc \
   ff_tagger.cc \
+  ff_bleu.cc \
+  ../vest/scorer.cc \
+  ../vest/ter.cc \
+  ../vest/aer_scorer.cc \
+  ../vest/comb_scorer.cc \
+  ../vest/error_surface.cc \
+  ../vest/viterbi_envelope.cc \
   tromble_loss.cc \
   freqdict.cc \
   lexalign.cc \
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index b6cc6f66..5f06b0c8 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -32,6 +32,7 @@
 #include "inside_outside.h"
 #include "exp_semiring.h"
 #include "sentence_metadata.h"
+#include "../vest/scorer.h"
 
 using namespace std;
 using namespace std::tr1;
@@ -143,7 +144,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* confp) {
         ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
         ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
         ("crf_uniform_empirical", "If there are multple references use (i.e., lattice) a uniform distribution rather than posterior weighting a la EM")
-        ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
+    ("get_oracle_forest,OO", "Calculate rescored hypregraph using approximate BLEU scoring of rules")
+    ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
+    ("references,R", po::value<vector<string> >(), "Translation reference files")    
         ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
         ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
         ("forest_output,O",po::value<string>(),"Directory to write forests to")
@@ -258,16 +261,30 @@ void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) {
 }
 
 // TODO decoder output should probably be moved to another file
-void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) {
+void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique, const char *kbest_out_filename_, float doc_src_length, float tmp_src_length, const DocScorer &ds, Score* doc_score) {
 cerr << "In kbest\n";
+
+ ofstream kbest_out;
+ kbest_out.open(kbest_out_filename_);
+ cerr << "Output kbest to " << kbest_out_filename_;
+ 
+ //add length (f side) src length of this sentence to the psuedo-doc src length count
+ float curr_src_length = doc_src_length + tmp_src_length;
+
  if (unique) {
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k);
     for (int i = 0; i < k; ++i) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique>::Derivation* d =
         kbest.LazyKthBest(forest.nodes_.size() - 1, i);
       if (!d) break;
-      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
-           << d->feature_values << " ||| " << log(d->score) << endl;
+      //calculate score in context of psuedo-doc
+      Score* sentscore = ds[sent_id]->ScoreCandidate(d->yield);
+      sentscore->PlusEquals(*doc_score,float(1));
+      float bleu = curr_src_length * sentscore->ComputeScore();
+      kbest_out << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
+		<< d->feature_values << " ||| " << log(d->score) << " ||| " << bleu << endl;
+      // cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
+      //     << d->feature_values << " ||| " << log(d->score) << endl;
     }
   } else {
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k);
@@ -498,6 +515,48 @@ int main(int argc, char** argv) {
   const bool kbest = conf.count("k_best");
   const bool unique_kbest = conf.count("unique_k_best");
   const bool crf_uniform_empirical = conf.count("crf_uniform_empirical");
+  const bool get_oracle_forest = conf.count("get_oracle_forest");
+
+  /*Oracle Extraction Prep*/
+  vector<const FeatureFunction*> oracle_model_ffs;
+  vector<double> oracle_feature_weights;
+  shared_ptr<FeatureFunction> oracle_pff;
+  if(get_oracle_forest)      {
+    
+    /*Add feature for oracle rescoring */
+    string ff, param;
+    ff="BLEUModel";
+    //pass the location of the references file via param to BLEUModel
+    for(int kk=0;kk < conf["references"].as<vector<string> >().size();kk++)
+      {
+	param =  param + " " + conf["references"].as<vector<string> >()[kk];
+      }       
+    cerr << "Feature: " << ff << "->" << param << endl;
+    oracle_pff = global_ff_registry->Create(ff,param);
+    if (!oracle_pff) { exit(1); }
+    oracle_model_ffs.push_back(oracle_pff.get());	
+    oracle_feature_weights.push_back(1.0);
+
+  }
+
+  ModelSet oracle_models(oracle_feature_weights, oracle_model_ffs); 
+
+  const string loss_function3 = "IBM_BLEU_3";
+  ScoreType type3 = ScoreTypeFromString(loss_function3);
+  const DocScorer ds(type3, conf["references"].as<vector<string> >(), "");
+  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function3 << endl;
+  
+    
+  std::ostringstream kbest_string_stream;
+  Score* doc_score=NULL;
+  float doc_src_length=0;
+  float tmp_src_length=0; 
+  int oracle_doc_size= 10;   //used for scaling/weighting oracle doc
+  float scale_oracle=  1-float(1)/oracle_doc_size;
+
+  /*End Oracle Extraction Prep*/
+
+
   shared_ptr<WriteFile> extract_file;
   if (conf.count("extract_rules"))
     extract_file.reset(new WriteFile(str("extract_rules",conf)));
@@ -610,6 +669,87 @@ int main(int argc, char** argv) {
 
     maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen);
 
+    vector<WordID> trans;
+    ViterbiESentence(forest, &trans);
+    
+    /*Oracle Rescoring*/
+    if(get_oracle_forest)
+      {
+	Timer t("Forest Oracle rescoring:");
+	vector<WordID> model_trans;
+	model_trans = trans;
+
+	trans=model_trans;
+	Score* sentscore = ds[sent_id]->ScoreCandidate(model_trans);
+	//initilize psuedo-doc vector to 1 counts
+	if (!doc_score) { doc_score = sentscore->GetOne(); }
+	double bleu_scale_ = doc_src_length * doc_score->ComputeScore();
+	tmp_src_length = smeta.GetSourceLength();
+	smeta.SetScore(doc_score);
+	smeta.SetDocLen(doc_src_length);
+	smeta.SetDocScorer(&ds); 
+	
+	feature_weights[0]=1.0;	
+
+	kbest_string_stream << conf["forest_output"].as<string>() << "/kbest_model" << "." << sent_id;
+	DumpKBest(sent_id, forest, 10, true, kbest_string_stream.str().c_str(), doc_src_length, tmp_src_length, ds, doc_score);
+	kbest_string_stream.str("");
+
+
+	forest.SortInEdgesByEdgeWeights();
+	Hypergraph lm_forest;
+	const IntersectionConfiguration inter_conf_oracle(0, 0);
+	cerr << "Going to call Apply Model " << endl;
+	ApplyModelSet(forest,
+		      smeta,
+		      oracle_models,
+		      inter_conf_oracle,
+		      &lm_forest);
+	
+	forest.swap(lm_forest);
+	forest.Reweight(feature_weights);
+	forest.SortInEdgesByEdgeWeights();
+	vector<WordID> oracle_trans;
+
+	ViterbiESentence(forest, &oracle_trans);
+	cerr << "  +Oracle BLEU forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
+	cerr << "  +Oracle BLEU (paths): " << forest.NumberOfPaths() << endl;
+	cerr << "  +Oracle BLEU Viterbi: " << TD::GetString(oracle_trans) << endl;  
+      
+	//compute kbest for oracle
+	kbest_string_stream << conf["forest_output"].as<string>() <<"/kbest_oracle" << "." << sent_id;
+	DumpKBest(sent_id, forest, 10, true, kbest_string_stream.str().c_str(), doc_src_length, tmp_src_length, ds, doc_score);
+	kbest_string_stream.str("");
+	
+	
+	//reweight the model with -1 for the BLEU feature to compute k-best list for negative examples
+	feature_weights[0]=-1.0;
+	forest.Reweight(feature_weights);
+	forest.SortInEdgesByEdgeWeights();
+	vector<WordID> neg_trans;
+	ViterbiESentence(forest, &neg_trans);
+	cerr << "  -Oracle BLEU forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
+	cerr << "  -Oracle BLEU (paths): " << forest.NumberOfPaths() << endl;
+	cerr << "  -Oracle BLEU Viterbi: " << TD::GetString(neg_trans) << endl;  
+	
+	//compute kbest for negative
+	kbest_string_stream << conf["forest_output"].as<string>() << "/kbest_negative" << "." << sent_id;
+	DumpKBest(sent_id, forest, 10, true, kbest_string_stream.str().c_str(), doc_src_length, tmp_src_length,ds,  doc_score);
+	kbest_string_stream.str("");
+		
+	//Add 1-best translation (trans) to psuedo-doc vectors
+	doc_score->PlusEquals(*sentscore, scale_oracle);
+	delete sentscore;
+	
+	doc_src_length = (doc_src_length + tmp_src_length) * scale_oracle;
+	
+	
+	string details;
+	doc_score->ScoreDetails(&details);
+	cerr << "SCALED SCORE: " << bleu_scale_ << "DOC BLEU " << doc_score->ComputeScore() << " " <<details << endl;
+      }
+
+
     if (conf.count("forest_output") && !has_ref) {
       ForestWriter writer(str("forest_output",conf), sent_id);
       if (FileExists(writer.fname_)) {
@@ -632,11 +772,9 @@ int main(int argc, char** argv) {
     if (sample_max_trans) {
       MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as<int>() : 0);
     } else {
-      vector<WordID> trans;
-      ViterbiESentence(forest, &trans);
-
+      
       if (kbest) {
-        DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest);
+        DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,"", doc_src_length, tmp_src_length, ds,  doc_score);
       } else if (csplit_output_plf) {
         cout << HypergraphIO::AsPLF(forest, false) << endl;
       } else {
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 077956a8..c91780e2 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -7,6 +7,7 @@
 #include "ff_tagger.h"
 #include "ff_factory.h"
 #include "ff_ruleshape.h"
+#include "ff_bleu.h"
 
 boost::shared_ptr<FFRegistry> global_ff_registry;
 
@@ -20,6 +21,7 @@ void register_feature_functions() {
   global_ff_registry->Register(new FFFactory<WordPenalty>);
   global_ff_registry->Register(new FFFactory<SourceWordPenalty>);
   global_ff_registry->Register(new FFFactory<ArityPenalty>);
+  global_ff_registry->Register("BLEUModel", new FFFactory<BLEUModel>);
   global_ff_registry->Register("RuleShape", new FFFactory<RuleShapeFeatures>);
   global_ff_registry->Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
   global_ff_registry->Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);
diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc
new file mode 100644
index 00000000..4a13f89e
--- /dev/null
+++ b/decoder/ff_bleu.cc
@@ -0,0 +1,285 @@
+#include "ff_bleu.h"
+
+#include <sstream>
+#include <unistd.h>
+
+#include <boost/shared_ptr.hpp>
+
+#include "tdict.h"
+#include "Vocab.h"
+#include "Ngram.h"
+#include "hg.h"
+#include "stringlib.h"
+#include "sentence_metadata.h"
+#include "../vest/scorer.h"
+
+using namespace std;
+
+// Implementation behind BLEUModel: scores hypergraph edges with an approximate
+// BLEU computed against the current pseudo-document.  A dynamic-programming
+// state is an array of WordIDs whose last byte (offset state_size_) stores how
+// many of those WordIDs are in use.
+class BLEUModelImpl {
+ public:
+  // `order` is the maximum n-gram length; it determines the size of a state.
+  explicit BLEUModelImpl(int order) :
+      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
+      floor_(-100.0),
+      kSTART(TD::Convert("<s>")),
+      kSTOP(TD::Convert("</s>")),
+      kUNKNOWN(TD::Convert("<unk>")),
+      kNONE(-1),
+      kSTAR(TD::Convert("<{STAR}>")) {}
+
+  // Identical to the one-argument constructor; the file name `f` is ignored.
+  BLEUModelImpl(int order, const string& f) :
+      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
+      floor_(-100.0),
+      kSTART(TD::Convert("<s>")),
+      kSTOP(TD::Convert("</s>")),
+      kUNKNOWN(TD::Convert("<unk>")),
+      kNONE(-1),
+      kSTAR(TD::Convert("<{STAR}>")) {}
+
+  virtual ~BLEUModelImpl() {}
+
+  // Number of WordIDs stored in `state`, read from its trailing size byte.
+  inline int StateSize(const void* state) const {
+    return *(static_cast<const char*>(state) + state_size_);
+  }
+
+  // Records in the trailing size byte that `state` holds `size` WordIDs.
+  inline void SetStateSize(int size, void* state) const {
+    *(static_cast<char*>(state) + state_size_) = size;
+  }
+
+  // Placeholder; does nothing.
+  void GetRefToNgram()
+  {}
+
+  // Renders the words held in `state` as "[ w1 w2 ... ]" for debugging.
+  string DebugStateToString(const void* state) const {
+    const int len = StateSize(state);
+    const int* astate = reinterpret_cast<const int*>(state);
+    string res = "[";
+    for (int i = 0; i < len; ++i) {
+      res += " ";
+      res += TD::Convert(astate[i]);
+    }
+    res += " ]";
+    return res;
+  }
+
+  // Walks buffer_ from index i down to 0, tracking the nearest kSTAR or
+  // non-positive entry.  Nothing is accumulated at present (the probability
+  // lookup is disabled), so the returned value is always 0.
+  inline double ProbNoRemnant(int i, int len) {
+    int edge = len;
+    bool flag = true;
+    double sum = 0.0;
+    while (i >= 0) {
+      if (buffer_[i] == kSTAR) {
+        edge = i;
+        flag = false;
+      } else if (buffer_[i] <= 0) {
+        edge = i;
+        flag = true;
+      } else {
+        if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART)))
+          CalcPhrase(buffer_[i], &buffer_[i+1]);  // result is currently discarded
+      }
+      --i;
+    }
+    return sum;
+  }
+
+  // Feature value for the goal node.  Fills buffer_ with kSTOP at index 0, the
+  // words of `state` above it, kSTART at the top and a kNONE terminator, then
+  // hands the buffer to ProbNoRemnant.
+  double FinalTraversalCost(const void* state) {
+    const int slen = StateSize(state);
+    const int len = slen + 2;
+    buffer_.resize(len + 1);
+    buffer_[len] = kNONE;
+    buffer_[len-1] = kSTART;
+    const int* astate = reinterpret_cast<const int*>(state);
+    int i = len - 2;
+    for (int j = 0; j < slen; ++j,--i)
+      buffer_[i] = astate[j];
+    buffer_[i] = kSTOP;
+    assert(i == 0);
+    return ProbNoRemnant(len - 1, len);
+  }
+
+  // Returns `word` followed by the entries starting at `context`, up to (not
+  // including) the first non-positive one.  `context` must therefore point
+  // into a buffer that is terminated by a value <= 0 such as kNONE.
+  vector<WordID> CalcPhrase(int word, int* context) {
+    vector<WordID> vs;
+    vs.push_back(word);
+    while (*context > 0) {
+      vs.push_back(*context);
+      ++context;
+    }
+    return vs;
+  }
+
+  // Computes the approximate-BLEU feature for applying `rule` on top of the
+  // antecedent states and, when `vstate` is non-NULL, writes the resulting
+  // state into it.  The sentence scorer, the pseudo-document score and the
+  // pseudo-document length are all read from `smeta`.
+  // Returns the feature value; with a NULL `vstate` that is the only effect
+  // apart from overwriting the scratch buffer.
+  double LookupWords(const TRule& rule, const vector<const void*>& ant_states, void* vstate, const SentenceMetadata& smeta) {
+    // rule.ELength() - rule.Arity() (presumably the target terminals) plus the words of every antecedent state.
+    int len = rule.ELength() - rule.Arity();
+    for (int i = 0; i < ant_states.size(); ++i)
+      len += StateSize(ant_states[i]);
+    buffer_.resize(len + 1);
+    buffer_[len] = kNONE;
+
+    // Fill buffer_ from the top index downwards: positive entries of the rule
+    // are copied as they are, every entry < 1 is replaced by the words of the
+    // antecedent state it indexes.
+    int i = len - 1;
+    const vector<WordID>& e = rule.e();
+    for (int j = 0; j < e.size(); ++j) {
+      if (e[j] < 1) {
+        const int* astate = reinterpret_cast<const int*>(ant_states[-e[j]]);
+        const int slen = StateSize(astate);
+        for (int k = 0; k < slen; ++k)
+          buffer_[i--] = astate[k];
+      } else {
+        buffer_[i--] = e[j];
+      }
+    }
+
+    // Copy the topmost words into the new state until order_ - 1 of them have
+    // been taken or a kSTAR is met.
+    // edge ends up as the lowest index holding kSTAR (len if there is none).
+    int* remnant = reinterpret_cast<int*>(vstate);
+    int j = 0;
+    int edge = len;
+    for (i = len - 1; i >= 0; --i) {
+      if (buffer_[i] == kSTAR) {
+        edge = i;
+      } else if (edge - i < order_ && edge == len && remnant) {
+        remnant[j++] = buffer_[i];
+      }
+    }
+
+    // Only the phrase that starts at buffer_[0] is scored, so it is built once
+    // here rather than once per buffer position.
+    vector<WordID> vs;
+    if (len > 0) vs = CalcPhrase(buffer_[0], &buffer_[1]);
+
+    // NOTE(review): the Score is assumed to be heap-allocated and owned by the
+    // caller, like the ScoreCandidate result that main() deletes - confirm.
+    Score* node_score = smeta.GetDocScorer()[smeta.GetSentenceID()]->ScoreCCandidate(vs);
+    const Score* base_score = &smeta.GetScore();
+    const int src_length = smeta.GetSourceLength();
+    node_score->PlusPartialEquals(*base_score, rule.EWords(), rule.FWords(), src_length);
+    const float oracledoc_factor = (src_length + smeta.GetDocLen()) / src_length;
+
+    // value = rule.FWords() * (sentence + pseudo-document length) / sentence length * score.
+    // Alternative reading of the paper: (rule.FWords() + smeta.GetDocLen()) * score.
+    // TODO: might need to reverse the -1/+1 of the oracle/neg examples
+    const double approx_bleu = (rule.FWords() * oracledoc_factor) * node_score->ComputeScore();
+    delete node_score;  // nothing else keeps this pointer
+
+    // Without an output state only the feature value is wanted.
+    if (!remnant) return approx_bleu;
+
+    // A kSTAR was seen or the buffer holds at least order_ words: write a
+    // kSTAR marker followed by the lowest (at most order_ - 1) buffer entries.
+    if (edge != len || len >= order_) {
+      remnant[j++] = kSTAR;
+      if (order_-1 < edge) edge = order_-1;
+      for (int k = edge-1; k >= 0; --k)
+        remnant[j++] = buffer_[k];
+    }
+
+    // j is the number of WordIDs written into the new state.
+    SetStateSize(j, vstate);
+    return approx_bleu;
+  }
+
+  // Bytes needed for a state: (order-1)*2+1 WordIDs plus the size byte.
+  static int OrderToStateSize(int order) {
+    return ((order-1) * 2 + 1) * sizeof(WordID) + 1;
+  }
+
+ protected:
+  // n-gram model object; constructed but never queried in this class.
+  Ngram ngram_;
+  // Scratch space: the target words of the current item, first word at the
+  // highest index, with kNONE stored one past the last word.
+  vector<WordID> buffer_;
+  // Maximum n-gram order.
+  const int order_;
+  // Byte offset of the size byte inside a state (total state size - 1).
+  const int state_size_;
+  // Not read by any method of this class.
+  const double floor_;
+
+ public:
+  // Sentence boundary markers.
+  const WordID kSTART;
+  const WordID kSTOP;
+  // Unknown-word token; not referenced by any method of this class.
+  const WordID kUNKNOWN;
+  // Terminator (-1) written after the last used slot of buffer_.
+  const WordID kNONE;
+  // Marker stored between the two halves of a state.
+  const WordID kSTAR;
+};
+
+// Builds the BLEU feature.  `param` is only checked for containing at least
+// one whitespace-separated token; its contents are not read here.
+BLEUModel::BLEUModel(const string& param) :
+  fid_(0) {  // the partial BLEU score is stored under feature id 0
+  vector<string> tokens;
+  if (SplitOnWhitespace(param, &tokens) < 1) {
+    cerr << "BLEUModel requires a filename, minimally!\n";
+    abort();
+  }
+  const int kOrder = 3;  // n-gram order; also determines the state size
+  SetStateSize(BLEUModelImpl::OrderToStateSize(kOrder));
+  // No file name is ever extracted from `param`, so an empty one is passed.
+  pimpl_ = new BLEUModelImpl(kOrder, string());
+}
+
+BLEUModel::~BLEUModel() {
+  delete pimpl_;  // allocated with new in the constructor
+}
+
+string BLEUModel::DebugStateToString(const void* state) const{
+  return pimpl_->DebugStateToString(state);  // formatting is done by BLEUModelImpl
+}
+
+// Scores one edge: stores the approximate BLEU of edge.rule_ (given the
+// antecedent states) under fid_ in `features` and lets the implementation
+// write the outgoing state into `state`.  `estimated_features` is untouched.
+//   smeta       sentence-level data handed through to LookupWords
+//   edge        the hypergraph edge whose rule is scored
+//   ant_states  states of the edge's antecedent nodes
+//   state       receives the state of the resulting node
+void BLEUModel::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                          const Hypergraph::Edge& edge,
+                                          const vector<const void*>& ant_states,
+                                          SparseVector<double>* features,
+                                          SparseVector<double>* estimated_features,
+                                          void* state) const {
+  (void) estimated_features;  // this model sets no estimated features
+
+  // No trace line is written from here: this is presumably invoked once per
+  // edge of the forest, and a flushed stderr write on each call would swamp
+  // the log and slow rescoring down.
+  features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state, smeta));
+}
+
+void BLEUModel::FinalTraversalFeatures(const void* ant_state,
+                                           SparseVector<double>* features) const {
+  // Store the value the implementation computes for the final state under fid_.
+  features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state));
+}
diff --git a/decoder/ff_bleu.h b/decoder/ff_bleu.h
new file mode 100644
index 00000000..fb127241
--- /dev/null
+++ b/decoder/ff_bleu.h
@@ -0,0 +1,32 @@
+#ifndef _BLEU_FF_H_
+#define _BLEU_FF_H_
+
+#include <vector>
+#include <string>
+
+#include "hg.h"
+#include "ff.h"
+#include "config.h"
+
+class BLEUModelImpl;
+
+class BLEUModel : public FeatureFunction {
+ public:
+  // param: whitespace-separated tokens; at least one is required (see the .cc)
+  BLEUModel(const std::string& param);
+  ~BLEUModel();
+  virtual void FinalTraversalFeatures(const void* context,
+                                      SparseVector<double>* features) const;
+  std::string DebugStateToString(const void* state) const;
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const;
+ private:
+  const int fid_;  // id of the feature the BLEU value is stored under
+  mutable BLEUModelImpl* pimpl_;  // owned; released in ~BLEUModel()
+};
+#endif
diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h
index ef9eb388..21be9b21 100644
--- a/decoder/sentence_metadata.h
+++ b/decoder/sentence_metadata.h
@@ -3,6 +3,7 @@
 
 #include <cassert>
 #include "lattice.h"
+#include "../vest/scorer.h"
 
 struct SentenceMetadata {
   SentenceMetadata(int id, const Lattice& ref) :
@@ -30,10 +31,22 @@ struct SentenceMetadata {
   // this will be empty if the translator accepts non FS input!
   const Lattice& GetSourceLattice() const { return src_lattice_; }
 
+  // access to document level scores for MIRA vector computation
+  void SetScore(Score *s){app_score=s;}
+  void SetDocScorer (const DocScorer *d){ds = d;}
+  void SetDocLen(double dl){doc_len = dl;}
+
+  const Score& GetScore() const { return *app_score; }
+  const DocScorer& GetDocScorer() const { return *ds; }
+  double GetDocLen() const {return doc_len;}
+
  private:
   const int sent_id_;
   // the following should be set, if possible, by the Translator
   int src_len_;
+  double doc_len;
+  const DocScorer* ds;
+  const Score* app_score;
  public:
   Lattice src_lattice_;  // this will only be set if inputs are finite state!
  private:
diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc
index 9c8a783a..d3f28804 100644
--- a/vest/aer_scorer.cc
+++ b/vest/aer_scorer.cc
@@ -15,15 +15,27 @@ class AERScore : public Score {
   AERScore() : num_matches(), num_predicted(), num_in_ref() {}
   AERScore(int m, int p, int r) :
     num_matches(m), num_predicted(p), num_in_ref(r) {}
-  virtual void PlusEquals(const Score& delta) {
+  virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
+  virtual void PlusEquals(const Score& delta, const float scale) {
     const AERScore& other = static_cast<const AERScore&>(delta);
     num_matches   += other.num_matches;
     num_predicted += other.num_predicted;
     num_in_ref    += other.num_in_ref;
   }
+ virtual void PlusEquals(const Score& delta) {
+    const AERScore& other = static_cast<const AERScore&>(delta);
+    num_matches   += other.num_matches;
+    num_predicted += other.num_predicted;
+    num_in_ref    += other.num_in_ref;
+  }
+
+
   virtual Score* GetZero() const {
     return new AERScore;
   }
+  virtual Score* GetOne() const {
+    return new AERScore;
+  }
   virtual void Subtract(const Score& rhs, Score* out) const {
     AERScore* res = static_cast<AERScore*>(out);
     const AERScore& other = static_cast<const AERScore&>(rhs);
@@ -37,6 +49,7 @@ class AERScore : public Score {
   float Recall() const {
     return static_cast<float>(num_matches) / num_in_ref;
   }
+  float ComputePartialScore() const { return 0.0;}
   virtual float ComputeScore() const {
     const float prec = Precision();
     const float rec = Recall();
@@ -82,6 +95,11 @@ static inline bool Safe(const Array2D<bool>& a, int i, int j) {
     return false;
 }
 
+// AER has no ScoreCCandidate implementation; callers always receive NULL.
+Score* AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const {
+  return NULL;
+}
+
 Score* AERScorer::ScoreCandidate(const vector<WordID>& shyp) const {
   boost::shared_ptr<Array2D<bool> > hyp =
     AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp));
diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h
index a0afea3b..d0df35d5 100644
--- a/vest/aer_scorer.h
+++ b/vest/aer_scorer.h
@@ -12,6 +12,7 @@ class AERScorer : public SentenceScorer {
   // is necessary.
   AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = "");
   Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;
   static Score* ScoreFromString(const std::string& in);
   const std::string* GetSource() const;
  private:
diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc
index 7b2187f4..3dd077a6 100644
--- a/vest/comb_scorer.cc
+++ b/vest/comb_scorer.cc
@@ -8,6 +8,7 @@ class BLEUTERCombinationScore : public Score {
   friend class BLEUTERCombinationScorer;
  public:
   ~BLEUTERCombinationScore();
+  float ComputePartialScore() const { return 0.0;}
   float ComputeScore() const {
     return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f;
   }
@@ -17,10 +18,25 @@ class BLEUTERCombinationScore : public Score {
       ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f);
     *details = buf;
   }
+  void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
+
+  void PlusEquals(const Score& delta, const float scale) {
+    bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale);
+    ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale);
+  }
   void PlusEquals(const Score& delta) {
     bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu);
     ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter);
   }
+
+
+
+  Score* GetOne() const {
+    BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
+    res->bleu = bleu->GetOne();
+    res->ter = ter->GetOne();
+    return res;    
+  }
   Score* GetZero() const {
     BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
     res->bleu = bleu->GetZero();
@@ -65,6 +81,11 @@ BLEUTERCombinationScorer::~BLEUTERCombinationScorer() {
   delete ter_;
 }
 
+// The BLEU/TER combination has no ScoreCCandidate implementation; always NULL.
+Score* BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const {
+  return NULL;
+}
+
 Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
   BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
   res->bleu = bleu_->ScoreCandidate(hyp);
diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h
index 70b1ec75..1a4f3324 100644
--- a/vest/comb_scorer.h
+++ b/vest/comb_scorer.h
@@ -8,6 +8,7 @@ class BLEUTERCombinationScorer : public SentenceScorer {
   BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs);
   ~BLEUTERCombinationScorer();
   Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;
   static Score* ScoreFromString(const std::string& in);
  private:
   SentenceScorer* bleu_;
diff --git a/vest/scorer.cc b/vest/scorer.cc
index 6c604ab8..524b15a5 100644
--- a/vest/scorer.cc
+++ b/vest/scorer.cc
@@ -35,6 +35,8 @@ ScoreType ScoreTypeFromString(const string& st) {
     return AER;
   if (sl == "bleu" || sl == "ibm_bleu")
     return IBM_BLEU;
+  if (sl == "ibm_bleu_3")
+    return IBM_BLEU_3;
   if (sl == "nist_bleu")
     return NIST_BLEU;
   if (sl == "koehn_bleu")
@@ -53,6 +55,7 @@ class SERScore : public Score {
   friend class SERScorer;
  public:
   SERScore() : correct(0), total(0) {}
+  float ComputePartialScore() const { return 0.0;}
   float ComputeScore() const {
     return static_cast<float>(correct) / static_cast<float>(total);
   }
@@ -61,11 +64,18 @@ class SERScore : public Score {
     os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';
     *details = os.str();
   }
-  void PlusEquals(const Score& delta) {
+  void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){}
+  
+  void PlusEquals(const Score& delta, const float scale) {
     correct += static_cast<const SERScore&>(delta).correct;
     total += static_cast<const SERScore&>(delta).total;
   }
+  void PlusEquals(const Score& delta) {
+    correct += static_cast<const SERScore&>(delta).correct;
+    total += static_cast<const SERScore&>(delta).total;
+    }
   Score* GetZero() const { return new SERScore; }
+  Score* GetOne() const { return new SERScore; }
   void Subtract(const Score& rhs, Score* res) const {
     SERScore* r = static_cast<SERScore*>(res);
     r->correct = correct - static_cast<const SERScore&>(rhs).correct;
@@ -84,6 +94,10 @@ class SERScore : public Score {
 class SERScorer : public SentenceScorer {
  public:
   SERScorer(const vector<vector<WordID> >& references) : refs_(references) {}
+  // SER has no ScoreCCandidate implementation; callers always receive NULL.
+  Score* ScoreCCandidate(const vector<WordID>& hyp) const {
+    return NULL;
+  }
   Score* ScoreCandidate(const vector<WordID>& hyp) const {
     SERScore* res = new SERScore;
     res->total = 1;
@@ -101,13 +115,20 @@ class SERScorer : public SentenceScorer {
 class BLEUScore : public Score {
   friend class BLEUScorerBase;
  public:
-  BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) {
+  BLEUScore(int n) : correct_ngram_hit_counts(float(0),float(n)), hyp_ngram_counts(float(0),float(n)) {
     ref_len = 0;
     hyp_len = 0; }
+  BLEUScore(int n, int k) :  correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) {
+    ref_len = k;
+    hyp_len = k; }  
   float ComputeScore() const;
+  float ComputePartialScore() const;
   void ScoreDetails(string* details) const;
   void PlusEquals(const Score& delta);
+  void PlusEquals(const Score& delta, const float scale);
+  void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len);
   Score* GetZero() const;
+  Score* GetOne() const;
   void Subtract(const Score& rhs, Score* res) const;
   void Encode(string* out) const;
   bool IsAdditiveIdentity() const {
@@ -119,10 +140,11 @@ class BLEUScore : public Score {
   }
  private:
   float ComputeScore(vector<float>* precs, float* bp) const;
-  valarray<int> correct_ngram_hit_counts;
-  valarray<int> hyp_ngram_counts;
+  float ComputePartialScore(vector<float>* prec, float* bp) const;
+  valarray<float> correct_ngram_hit_counts;
+  valarray<float> hyp_ngram_counts;
   float ref_len;
-  int hyp_len;
+  float hyp_len;
 };
 
 class BLEUScorerBase : public SentenceScorer {
@@ -131,6 +153,7 @@ class BLEUScorerBase : public SentenceScorer {
              int n
              );
   Score* ScoreCandidate(const vector<WordID>& hyp) const;
+  Score* ScoreCCandidate(const vector<WordID>& hyp) const;
   static Score* ScoreFromString(const string& in);
 
  protected:
@@ -171,8 +194,10 @@ class BLEUScorerBase : public SentenceScorer {
   }
 
   void ComputeNgramStats(const vector<WordID>& sent,
-       valarray<int>* correct,
-       valarray<int>* hyp) const {
+			 valarray<float>* correct,
+			 valarray<float>* hyp,
+			 bool clip_counts)
+    const {
     assert(correct->size() == n_);
     assert(hyp->size() == n_);
     vector<WordID> ngram(n_);
@@ -186,10 +211,15 @@ class BLEUScorerBase : public SentenceScorer {
       for (int i=1; i<=k; ++i) {
 	ngram.push_back(sent[j + i - 1]);
         pair<int,int>& p = ngrams_[ngram];
-        if (p.second < p.first) {
-          ++p.second;
-          (*correct)[i-1]++;
-        }
+	if(clip_counts){
+	  if (p.second < p.first) {
+	    ++p.second;
+	    (*correct)[i-1]++;
+	  }}
+	else {
+	  ++p.second;
+	  (*correct)[i-1]++;
+	}
 	// if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams:
 	if (!p.first) {
 	  for (; i<=k; ++i)
@@ -284,7 +314,8 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,
       const vector<vector<WordID> >& refs,
       const string& src) {
   switch (type) {
-    case IBM_BLEU: return new IBM_BLEUScorer(refs, 4);
+  case IBM_BLEU: return new IBM_BLEUScorer(refs, 4);
+  case IBM_BLEU_3 : return new IBM_BLEUScorer(refs,3);
     case NIST_BLEU: return new NIST_BLEUScorer(refs, 4);
     case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4);
     case AER: return new AERScorer(refs, src);
@@ -299,6 +330,7 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,
 Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) {
   switch (type) {
     case IBM_BLEU:
+  case IBM_BLEU_3:
     case NIST_BLEU:
     case Koehn_BLEU:
       return BLEUScorerBase::ScoreFromString(in);
@@ -423,6 +455,36 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
   return exp(log_bleu);
 }
 
+
+// Compute the scaled BLEU-style score used for oracle retrieval.
+//   precs  if non-NULL, cleared and then filled with the precision of every
+//          n-gram order that has a positive hypothesis count
+//   bp     if non-NULL, receives the brevity penalty exp(lbp)
+// Returns exp(mean log-precision + lbp), or 0 when no n-gram order has a
+// positive hypothesis count.
+float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const {
+  float log_bleu = 0;
+  if (precs) precs->clear();
+  int count = 0;
+  for (int i = 0; i < hyp_ngram_counts.size(); ++i) {
+    if (hyp_ngram_counts[i] > 0) {
+      // With IEEE floats log(0) is -inf when nothing matched for this order;
+      // exp() maps that back to a precision (and overall score) of 0.
+      float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]);
+      if (precs) precs->push_back(exp(lprec));
+      log_bleu += lprec;
+      ++count;
+    }
+  }
+  // The brevity penalty only applies when the hypothesis is shorter than the
+  // reference; otherwise lbp stays 0 and the penalty is exp(0) == 1.
+  float lbp = 0.0;
+  if (hyp_len < ref_len)
+    lbp = (hyp_len - ref_len) / hyp_len;
+  if (bp) *bp = exp(lbp);
+  // With no usable n-gram order the mean below would be 0/0 (NaN), and a NaN
+  // score breaks every later comparison; report a score of 0 instead.
+  if (count == 0) return 0.0f;
+  log_bleu /= static_cast<float>(count);
+  log_bleu += lbp;
+  return exp(log_bleu);
+}
+
+// Convenience overload: forwards to the two-argument ComputePartialScore with
+// both output pointers NULL, so only the score itself is produced.
+float BLEUScore::ComputePartialScore() const {
+  vector<float>* const no_precisions = NULL;
+  float* const no_brevity_penalty = NULL;
+  return ComputePartialScore(no_precisions, no_brevity_penalty);
+}
+
 // Convenience overload: forwards to the two-argument ComputeScore with both
 // output pointers NULL, so only the score itself is produced.
 float BLEUScore::ComputeScore() const {
   vector<float>* const no_precisions = NULL;
   float* const no_brevity_penalty = NULL;
   return ComputeScore(no_precisions, no_brevity_penalty);
 }
@@ -444,10 +506,37 @@ void BLEUScore::PlusEquals(const Score& delta) {
   hyp_len += d.hyp_len;
 }
 
+// Adds the statistics in delta to this score and then multiplies every
+// accumulated statistic (n-gram hit counts, hypothesis n-gram counts,
+// reference length, hypothesis length) by scale.
+// delta is downcast to BLEUScore without a runtime check, so it must really
+// be one.
+void BLEUScore::PlusEquals(const Score& delta, const float scale) {
+  const BLEUScore& other = static_cast<const BLEUScore&>(delta);
+
+  // Accumulate first, then rescale the running totals: (old + delta) * scale.
+  correct_ngram_hit_counts += other.correct_ngram_hit_counts;
+  correct_ngram_hit_counts *= scale;
+
+  hyp_ngram_counts += other.hyp_ngram_counts;
+  hyp_ngram_counts *= scale;
+
+  ref_len = scale * (ref_len + other.ref_len);
+  hyp_len = scale * (hyp_len + other.hyp_len);
+}
+
+// Folds delta into this score for partial (oracle) scoring.
+//   oracle_e_cover  replaces the accumulated hypothesis length before
+//                   delta's hypothesis length is added
+//   oracle_f_cover  numerator of the factor applied to the reference length
+//   src_len         denominator of that factor (not checked against zero)
+// delta is downcast to BLEUScore without a runtime check.
+void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){
+  const BLEUScore& other = static_cast<const BLEUScore&>(delta);
+  correct_ngram_hit_counts += other.correct_ngram_hit_counts;
+  hyp_ngram_counts += other.hyp_ngram_counts;
+
+  // Scale the reference length according to the size of the input sentence
+  // covered by this rule, then add delta's reference length.
+  const float covered_fraction = static_cast<float>(oracle_f_cover) / src_len;
+  ref_len *= covered_fraction;
+  ref_len += other.ref_len;
+
+  // The previous hypothesis length is discarded, not accumulated.
+  hyp_len = oracle_e_cover + other.hyp_len;
+}
+
+
 // Returns a heap-allocated BLEUScore constructed from the size of
 // hyp_ngram_counts.
 // NOTE(review): the constructor is outside this view; presumably it leaves
 // all statistics at zero — confirm.
 Score* BLEUScore::GetZero() const {
   const size_t ngram_orders = hyp_ngram_counts.size();
   return new BLEUScore(ngram_orders);
 }
 
+// Returns a heap-allocated BLEUScore constructed from the size of
+// hyp_ngram_counts and the value 1.
+// NOTE(review): the two-argument constructor is outside this view;
+// presumably the 1 is the initial value of the statistics — confirm.
+Score* BLEUScore::GetOne() const {
+  const size_t ngram_orders = hyp_ngram_counts.size();
+  return new BLEUScore(ngram_orders, 1);
+}
+
 void BLEUScore::Encode(string* out) const {
   ostringstream os;
   const int n = correct_ngram_hit_counts.size();
@@ -470,12 +559,24 @@ Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const {
   BLEUScore* bs = new BLEUScore(n_);
   for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)
     i->second.second = 0;
-  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts);
+  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true);
   bs->ref_len = ComputeRefLength(hyp);
   bs->hyp_len = hyp.size();
   return bs;
 }
 
+// Scores hyp with unclipped n-gram matching: ComputeNgramStats is invoked
+// with clip_counts == false. Returns a heap-allocated BLEUScore holding the
+// n-gram statistics, the reference length from ComputeRefLength(hyp) and the
+// hypothesis length hyp.size().
+Score* BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const {
+  BLEUScore* unclipped = new BLEUScore(n_);
+
+  // Reset the running match counter (second element of each ngrams_ entry)
+  // before counting this hypothesis.
+  NGramCountMap::iterator entry = ngrams_.begin();
+  while (entry != ngrams_.end()) {
+    entry->second.second = 0;
+    ++entry;
+  }
+
+  ComputeNgramStats(hyp,
+                    &unclipped->correct_ngram_hit_counts,
+                    &unclipped->hyp_ngram_counts,
+                    false);
+  unclipped->ref_len = ComputeRefLength(hyp);
+  unclipped->hyp_len = hyp.size();
+  return unclipped;
+}
+
+
 DocScorer::~DocScorer() {
   for (int i=0; i < scorers_.size(); ++i)
     delete scorers_[i];
diff --git a/vest/scorer.h b/vest/scorer.h
index 83d4db4c..7ce688c4 100644
--- a/vest/scorer.h
+++ b/vest/scorer.h
@@ -10,17 +10,21 @@ class ViterbiEnvelope;
 class ErrorSurface;
 class Hypergraph;  // needed for alignment
 
-enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER };
+enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 };
 ScoreType ScoreTypeFromString(const std::string& st);
 
 class Score {
  public:
   virtual ~Score();
   virtual float ComputeScore() const = 0;
+  virtual float ComputePartialScore() const =0;
   virtual void ScoreDetails(std::string* details) const = 0;
+  virtual void PlusEquals(const Score& rhs, const float scale) = 0;
   virtual void PlusEquals(const Score& rhs) = 0;
+  virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0;
   virtual void Subtract(const Score& rhs, Score* res) const = 0;
   virtual Score* GetZero() const = 0;
+  virtual Score* GetOne() const = 0;
   virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta
                                       // to another score results in no score change
 				      // under any circumstances
@@ -32,6 +36,7 @@ class SentenceScorer {
   virtual ~SentenceScorer();
   void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;
   virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0;
+  virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0;
   virtual const std::string* GetSource() const;
   static Score* CreateScoreFromString(const ScoreType type, const std::string& in);
   static SentenceScorer* CreateSentenceScorer(const ScoreType type,
diff --git a/vest/ter.cc b/vest/ter.cc
index ef66f3b7..6e16e1cf 100644
--- a/vest/ter.cc
+++ b/vest/ter.cc
@@ -424,17 +424,26 @@ class TERScore : public Score {
   static const unsigned kDUMMY_LAST_ENTRY = 5;
 
  // Default constructor: initialises stats with the arguments (0, kDUMMY_LAST_ENTRY).
  // NOTE(review): stats is declared outside this view; if it is a std::valarray
  // this yields kDUMMY_LAST_ENTRY elements, all equal to 0 — confirm.
  TERScore() : stats(0,kDUMMY_LAST_ENTRY) {}
+  // Partial (oracle) scoring stub required by the Score interface: for TER it
+  // always returns 0.
+  float ComputePartialScore() const {
+    return 0.0;
+  }
   // Returns (insertions + deletions + substitutions + shifts) divided by the
   // reference word count, all read from stats. A reference word count of 0
   // is not guarded against.
   float ComputeScore() const {
     const float edit_total = static_cast<float>(
         stats[kINSERTIONS] + stats[kDELETIONS] +
         stats[kSUBSTITUTIONS] + stats[kSHIFTS]);
     const float ref_words = static_cast<float>(stats[kREF_WORDCOUNT]);
     return edit_total / ref_words;
   }
   void ScoreDetails(string* details) const;
+  // Partial accumulation stub required by the Score interface: for TER it
+  // does nothing and leaves stats untouched.
+  void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) {
+  }
+  // Adds delta's statistics to this score. The scale argument exists to
+  // satisfy the Score interface and is not applied here.
+  // NOTE(review): BLEUScore::PlusEquals(delta, scale) multiplies the summed
+  // statistics by scale; confirm that ignoring it for TER is intended.
+  // delta is downcast to TERScore without a runtime check.
+  void PlusEquals(const Score& delta, const float scale) {
+    const TERScore& other = static_cast<const TERScore&>(delta);
+    stats += other.stats;
+  }
   // Adds delta's statistics element-wise to this score's stats.
   // delta is downcast to TERScore without a runtime check.
   void PlusEquals(const Score& delta) {
     const TERScore& other = static_cast<const TERScore&>(delta);
     stats += other.stats;
   }
+
   // Returns a heap-allocated, default-constructed TERScore.
   Score* GetZero() const {
     TERScore* const fresh = new TERScore();
     return fresh;
   }
+  // Returns a heap-allocated, default-constructed TERScore; for TER this is
+  // the same object GetZero() produces.
+  Score* GetOne() const {
+    TERScore* const fresh = new TERScore();
+    return fresh;
+  }
   // Stores (this->stats - rhs.stats) into res->stats.
   // Both rhs and res are downcast to TERScore without a runtime check, and
   // res must be non-NULL.
   void Subtract(const Score& rhs, Score* res) const {
     const TERScore& subtrahend = static_cast<const TERScore&>(rhs);
     TERScore* const out = static_cast<TERScore*>(res);
     out->stats = stats - subtrahend.stats;
   }
@@ -489,6 +498,11 @@ TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) {
     impl_[i] = new TERScorerImpl(refs[i]);
 }
 
+// Stub required by the SentenceScorer interface: for TER it ignores hyp and
+// always returns NULL, so callers must not dereference the result.
+Score* TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const {
+  return static_cast<Score*>(NULL);
+}
+
 Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
   float best_score = numeric_limits<float>::max();
   TERScore* res = new TERScore;
diff --git a/vest/ter.h b/vest/ter.h
index fe4ba36c..21007874 100644
--- a/vest/ter.h
+++ b/vest/ter.h
@@ -10,6 +10,7 @@ class TERScorer : public SentenceScorer {
   TERScorer(const std::vector<std::vector<WordID> >& references);
   ~TERScorer();
   Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;
   static Score* ScoreFromString(const std::string& data);
  private:
   std::vector<TERScorerImpl*> impl_;
author	vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-14 23:00:08 +0000
committer	vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-14 23:00:08 +0000
commit	1350b8e8e465acc9d4d8d43d807cc6093e8f37b9 (patch)
tree	ddbf972363b1d51ecca6d27e1ef226391a4e7151
parent	dc6e2c9c453a76f0bb3dfbca4471e763cc8af1e7 (diff)