diff options
| -rw-r--r-- | decoder/Makefile.am | 7 | ||||
| -rw-r--r-- | decoder/cdec.cc | 154 | ||||
| -rw-r--r-- | decoder/cdec_ff.cc | 2 | ||||
| -rw-r--r-- | decoder/ff_bleu.cc | 285 | ||||
| -rw-r--r-- | decoder/ff_bleu.h | 32 | ||||
| -rw-r--r-- | decoder/sentence_metadata.h | 13 | ||||
| -rw-r--r-- | vest/aer_scorer.cc | 20 | ||||
| -rw-r--r-- | vest/aer_scorer.h | 1 | ||||
| -rw-r--r-- | vest/comb_scorer.cc | 21 | ||||
| -rw-r--r-- | vest/comb_scorer.h | 1 | ||||
| -rw-r--r-- | vest/scorer.cc | 127 | ||||
| -rw-r--r-- | vest/scorer.h | 7 | ||||
| -rw-r--r-- | vest/ter.cc | 14 | ||||
| -rw-r--r-- | vest/ter.h | 1 | 
14 files changed, 662 insertions, 23 deletions
| diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 49aa45d0..e7b6abd8 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -74,6 +74,13 @@ libcdec_a_SOURCES = \    ff_wordalign.cc \    ff_csplit.cc \    ff_tagger.cc \ +  ff_bleu.cc \ +  ../vest/scorer.cc \ +  ../vest/ter.cc \ +  ../vest/aer_scorer.cc \ +  ../vest/comb_scorer.cc \ +  ../vest/error_surface.cc \ +  ../vest/viterbi_envelope.cc \    tromble_loss.cc \    freqdict.cc \    lexalign.cc \ diff --git a/decoder/cdec.cc b/decoder/cdec.cc index b6cc6f66..5f06b0c8 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -32,6 +32,7 @@  #include "inside_outside.h"  #include "exp_semiring.h"  #include "sentence_metadata.h" +#include "../vest/scorer.h"  using namespace std;  using namespace std::tr1; @@ -143,7 +144,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* confp) {          ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")          ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")          ("crf_uniform_empirical", "If there are multple references use (i.e., lattice) a uniform distribution rather than posterior weighting a la EM") -        ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") +    ("get_oracle_forest,OO", "Calculate rescored hypregraph using approximate BLEU scoring of rules") +    ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") +    ("references,R", po::value<vector<string> >(), "Translation reference files")              ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")          ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")          ("forest_output,O",po::value<string>(),"Directory to write forests to") @@ -258,16 +261,30 @@ void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) {  }  // TODO decoder output should probably be moved to another file -void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) { +void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique, const char *kbest_out_filename_, float doc_src_length, float tmp_src_length, const DocScorer &ds, Score* doc_score) {  cerr << "In kbest\n"; + + ofstream kbest_out; + kbest_out.open(kbest_out_filename_); + cerr << "Output kbest to " << kbest_out_filename_; +  + //add length (f side) src length of this sentence to the psuedo-doc src length count + float curr_src_length = doc_src_length + tmp_src_length; +   if (unique) {      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k);      for (int i = 0; i < k; ++i) {        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique>::Derivation* d =          kbest.LazyKthBest(forest.nodes_.size() - 1, i);        if (!d) break; -      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " -           << d->feature_values << " ||| " << log(d->score) << endl; +      //calculate score in context of psuedo-doc +      Score* sentscore = ds[sent_id]->ScoreCandidate(d->yield); +      sentscore->PlusEquals(*doc_score,float(1)); +      float bleu = curr_src_length * sentscore->ComputeScore(); +      kbest_out << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " +		<< d->feature_values << " ||| " << log(d->score) << " ||| " << bleu << endl; +      // cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " +      //     << d->feature_values << " ||| " << log(d->score) << endl;      }    } else {      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k); @@ -498,6 +515,48 @@ int main(int argc, char** argv) {    const bool kbest = conf.count("k_best");    const bool unique_kbest = conf.count("unique_k_best");    const bool crf_uniform_empirical = conf.count("crf_uniform_empirical"); +  const bool get_oracle_forest = conf.count("get_oracle_forest"); + +  /*Oracle Extraction Prep*/ +  vector<const FeatureFunction*> oracle_model_ffs; +  vector<double> oracle_feature_weights; +  shared_ptr<FeatureFunction> oracle_pff; +  if(get_oracle_forest)      { +     +    /*Add feature for oracle rescoring */ +    string ff, param; +    ff="BLEUModel"; +    //pass the location of the references file via param to BLEUModel +    for(int kk=0;kk < conf["references"].as<vector<string> >().size();kk++) +      { +	param =  param + " " + conf["references"].as<vector<string> >()[kk]; +      }        +    cerr << "Feature: " << ff << "->" << param << endl; +    oracle_pff = global_ff_registry->Create(ff,param); +    if (!oracle_pff) { exit(1); } +    oracle_model_ffs.push_back(oracle_pff.get());	 +    oracle_feature_weights.push_back(1.0); + +  } + +  ModelSet oracle_models(oracle_feature_weights, oracle_model_ffs);  + +  const string loss_function3 = "IBM_BLEU_3"; +  ScoreType type3 = ScoreTypeFromString(loss_function3); +  const DocScorer ds(type3, conf["references"].as<vector<string> >(), ""); +  cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function3 << endl; +   +     +  std::ostringstream kbest_string_stream; +  Score* doc_score=NULL; +  float doc_src_length=0; +  float tmp_src_length=0;  +  int oracle_doc_size= 10;   //used for scaling/weighting oracle doc +  float scale_oracle=  1-float(1)/oracle_doc_size; + +  /*End Oracle Extraction Prep*/ + +    shared_ptr<WriteFile> extract_file;    if (conf.count("extract_rules"))      extract_file.reset(new WriteFile(str("extract_rules",conf))); @@ -610,6 +669,87 @@ int main(int argc, char** argv) {      maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen); +    vector<WordID> trans; +    ViterbiESentence(forest, &trans); +     +    /*Oracle Rescoring*/ +    if(get_oracle_forest) +      { +	Timer t("Forest Oracle rescoring:"); +	vector<WordID> model_trans; +	model_trans = trans; + +	trans=model_trans; +	Score* sentscore = ds[sent_id]->ScoreCandidate(model_trans); +	//initilize psuedo-doc vector to 1 counts +	if (!doc_score) { doc_score = sentscore->GetOne(); } +	double bleu_scale_ = doc_src_length * doc_score->ComputeScore(); +	tmp_src_length = smeta.GetSourceLength(); +	smeta.SetScore(doc_score); +	smeta.SetDocLen(doc_src_length); +	smeta.SetDocScorer(&ds);  +	 +	feature_weights[0]=1.0;	 + +	kbest_string_stream << conf["forest_output"].as<string>() << "/kbest_model" << "." << sent_id; +	DumpKBest(sent_id, forest, 10, true, kbest_string_stream.str().c_str(), doc_src_length, tmp_src_length, ds, doc_score); +	kbest_string_stream.str(""); + + +	forest.SortInEdgesByEdgeWeights(); +	Hypergraph lm_forest; +	const IntersectionConfiguration inter_conf_oracle(0, 0); +	cerr << "Going to call Apply Model " << endl; +	ApplyModelSet(forest, +		      smeta, +		      oracle_models, +		      inter_conf_oracle, +		      &lm_forest); +	 +	forest.swap(lm_forest); +	forest.Reweight(feature_weights); +	forest.SortInEdgesByEdgeWeights(); +	vector<WordID> oracle_trans; + +	ViterbiESentence(forest, &oracle_trans); +	cerr << "  +Oracle BLEU forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; +	cerr << "  +Oracle BLEU (paths): " << forest.NumberOfPaths() << endl; +	cerr << "  +Oracle BLEU Viterbi: " << TD::GetString(oracle_trans) << endl;   +       +	//compute kbest for oracle +	kbest_string_stream << conf["forest_output"].as<string>() <<"/kbest_oracle" << "." << sent_id; +	DumpKBest(sent_id, forest, 10, true, kbest_string_stream.str().c_str(), doc_src_length, tmp_src_length, ds, doc_score); +	kbest_string_stream.str(""); +	 +	 +	//reweight the model with -1 for the BLEU feature to compute k-best list for negative examples +	feature_weights[0]=-1.0; +	forest.Reweight(feature_weights); +	forest.SortInEdgesByEdgeWeights(); +	vector<WordID> neg_trans; +	ViterbiESentence(forest, &neg_trans); +	cerr << "  -Oracle BLEU forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; +	cerr << "  -Oracle BLEU (paths): " << forest.NumberOfPaths() << endl; +	cerr << "  -Oracle BLEU Viterbi: " << TD::GetString(neg_trans) << endl;   +	 +	//compute kbest for negative +	kbest_string_stream << conf["forest_output"].as<string>() << "/kbest_negative" << "." << sent_id; +	DumpKBest(sent_id, forest, 10, true, kbest_string_stream.str().c_str(), doc_src_length, tmp_src_length,ds,  doc_score); +	kbest_string_stream.str(""); +		 +	//Add 1-best translation (trans) to psuedo-doc vectors +	doc_score->PlusEquals(*sentscore, scale_oracle); +	delete sentscore; +	 +	doc_src_length = (doc_src_length + tmp_src_length) * scale_oracle; +	 +	 +	string details; +	doc_score->ScoreDetails(&details); +	cerr << "SCALED SCORE: " << bleu_scale_ << "DOC BLEU " << doc_score->ComputeScore() << " " <<details << endl; +      } + +      if (conf.count("forest_output") && !has_ref) {        ForestWriter writer(str("forest_output",conf), sent_id);        if (FileExists(writer.fname_)) { @@ -632,11 +772,9 @@ int main(int argc, char** argv) {      if (sample_max_trans) {        MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as<int>() : 0);      } else { -      vector<WordID> trans; -      ViterbiESentence(forest, &trans); - +              if (kbest) { -        DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest); +        DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,"", doc_src_length, tmp_src_length, ds,  doc_score);        } else if (csplit_output_plf) {          cout << HypergraphIO::AsPLF(forest, false) << endl;        } else { diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 077956a8..c91780e2 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -7,6 +7,7 @@  #include "ff_tagger.h"  #include "ff_factory.h"  #include "ff_ruleshape.h" +#include "ff_bleu.h"  boost::shared_ptr<FFRegistry> global_ff_registry; @@ -20,6 +21,7 @@ void register_feature_functions() {    global_ff_registry->Register(new FFFactory<WordPenalty>);    global_ff_registry->Register(new FFFactory<SourceWordPenalty>);    global_ff_registry->Register(new FFFactory<ArityPenalty>); +  global_ff_registry->Register("BLEUModel", new FFFactory<BLEUModel>);    global_ff_registry->Register("RuleShape", new FFFactory<RuleShapeFeatures>);    global_ff_registry->Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);    global_ff_registry->Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>); diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc new file mode 100644 index 00000000..4a13f89e --- /dev/null +++ b/decoder/ff_bleu.cc @@ -0,0 +1,285 @@ +#include "ff_bleu.h" + +#include <sstream> +#include <unistd.h> + +#include <boost/shared_ptr.hpp> + +#include "tdict.h" +#include "Vocab.h" +#include "Ngram.h" +#include "hg.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "../vest/scorer.h" + +using namespace std; + +class BLEUModelImpl { + public: +  explicit BLEUModelImpl(int order) : +      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), +      floor_(-100.0), +      kSTART(TD::Convert("<s>")), +      kSTOP(TD::Convert("</s>")), +      kUNKNOWN(TD::Convert("<unk>")), +      kNONE(-1), +      kSTAR(TD::Convert("<{STAR}>")) {} + +  BLEUModelImpl(int order, const string& f) : +      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), +      floor_(-100.0), +      kSTART(TD::Convert("<s>")), +      kSTOP(TD::Convert("</s>")), +      kUNKNOWN(TD::Convert("<unk>")), +      kNONE(-1), +      kSTAR(TD::Convert("<{STAR}>")) {} +   + +  virtual ~BLEUModelImpl() { +      } + +  inline int StateSize(const void* state) const { +    return *(static_cast<const char*>(state) + state_size_); +  } + +  inline void SetStateSize(int size, void* state) const { +    *(static_cast<char*>(state) + state_size_) = size; +  } + +  void GetRefToNgram() +  {} +  +  string DebugStateToString(const void* state) const { +    int len = StateSize(state); +    const int* astate = reinterpret_cast<const int*>(state); +    string res = "["; +    for (int i = 0; i < len; ++i) { +      res += " "; +      res += TD::Convert(astate[i]); +    } +    res += " ]"; +    return res; +  } + +  inline double ProbNoRemnant(int i, int len) { +    int edge = len; +    bool flag = true; +    double sum = 0.0; +    while (i >= 0) { +      if (buffer_[i] == kSTAR) { +        edge = i; +        flag = false; +      } else if (buffer_[i] <= 0) { +        edge = i; +        flag = true; +      } else { +        if ((edge-i >= order_) || (flag && !(i == (len-1) && buffer_[i] == kSTART))) +	  {          //sum += LookupProbForBufferContents(i); +	    //cerr << "FT"; +	    CalcPhrase(buffer_[i], &buffer_[i+1]); +	  } +      } +      --i; +    } +    return sum; +  } + +  double FinalTraversalCost(const void* state) { +    int slen = StateSize(state); +    int len = slen + 2; +    // cerr << "residual len: " << len << endl; +    buffer_.resize(len + 1); +    buffer_[len] = kNONE; +    buffer_[len-1] = kSTART; +    const int* astate = reinterpret_cast<const int*>(state); +    int i = len - 2; +    for (int j = 0; j < slen; ++j,--i) +      buffer_[i] = astate[j]; +    buffer_[i] = kSTOP; +    assert(i == 0); +    return ProbNoRemnant(len - 1, len); +  } + +  vector<WordID> CalcPhrase(int word, int* context) { +     int i = order_; +    vector<WordID> vs; +    int c = 1; +    vs.push_back(word); +    // while (i > 1 && *context > 0) { +     while (*context > 0) { +      --i; +      vs.push_back(*context); +      ++context; +      ++c; +    } +     if(false){	cerr << "VS1( "; +	vector<WordID>::reverse_iterator rit; +	for ( rit=vs.rbegin() ; rit != vs.rend(); ++rit ) +	  cerr << " " << TD::Convert(*rit); +	cerr << ")\n";} +     +    return vs; +  } + + +  double LookupWords(const TRule& rule, const vector<const void*>& ant_states, void* vstate, const SentenceMetadata& smeta) { +    +    int len = rule.ELength() - rule.Arity(); +     +    for (int i = 0; i < ant_states.size(); ++i) +      len += StateSize(ant_states[i]); +    buffer_.resize(len + 1); +    buffer_[len] = kNONE; +    int i = len - 1; +    const vector<WordID>& e = rule.e(); + +    /*cerr << "RULE::" << rule.ELength() << " "; +    for (vector<WordID>::const_iterator i = e.begin(); i != e.end(); ++i) +      { +	const WordID& c = *i; +	if(c > 0) cerr << TD::Convert(c) << "--"; +	else cerr <<"N--"; +      } +    cerr << endl; +    */ + +    for (int j = 0; j < e.size(); ++j) { +      if (e[j] < 1) { +        const int* astate = reinterpret_cast<const int*>(ant_states[-e[j]]); +        int slen = StateSize(astate); +        for (int k = 0; k < slen; ++k) +          buffer_[i--] = astate[k]; +      } else { +        buffer_[i--] = e[j]; +      } +    } + +    double approx_bleu = 0.0; +    int* remnant = reinterpret_cast<int*>(vstate); +    int j = 0; +    i = len - 1; +    int edge = len; + + +    vector<WordID> vs; +    while (i >= 0) { +      vs = CalcPhrase(buffer_[i],&buffer_[i+1]); +      if (buffer_[i] == kSTAR) { +        edge = i; +      } else if (edge-i >= order_) { +	 +	vs = CalcPhrase(buffer_[i],&buffer_[i+1]); +       +      } else if (edge == len && remnant) { +        remnant[j++] = buffer_[i]; +      } +      --i; +    } + +    //calculate Bvector here +    /* cerr << "VS1( "; +    vector<WordID>::reverse_iterator rit; +    for ( rit=vs.rbegin() ; rit != vs.rend(); ++rit ) +      cerr << " " << TD::Convert(*rit); +    cerr << ")\n";  +    */ + +    Score *node_score = smeta.GetDocScorer()[smeta.GetSentenceID()]->ScoreCCandidate(vs); +    string details; +    node_score->ScoreDetails(&details); +    const Score *base_score= &smeta.GetScore(); +    //cerr << "SWBASE : " << base_score->ComputeScore() << details << " "; + +    int src_length = smeta.GetSourceLength();     +    node_score->PlusPartialEquals(*base_score, rule.EWords(), rule.FWords(), src_length ); +    float oracledoc_factor = (src_length + smeta.GetDocLen())/  src_length; + +    //how it seems to be done in code +    //TODO: might need to reverse the -1/+1 of the oracle/neg examples +    approx_bleu = ( rule.FWords() * oracledoc_factor  ) * node_score->ComputeScore(); +    //how I thought it was done from the paper +    //approx_bleu = ( rule.FWords()+ smeta.GetDocLen() ) * node_score->ComputeScore(); + +    if (!remnant){  return approx_bleu;} + +    if (edge != len || len >= order_) { +      remnant[j++] = kSTAR; +      if (order_-1 < edge) edge = order_-1; +      for (int i = edge-1; i >= 0; --i) +        remnant[j++] = buffer_[i]; +    } + +    SetStateSize(j, vstate); +    //cerr << "Return APPROX_BLEU: " << approx_bleu << " "<<  DebugStateToString(vstate) << endl; +    return approx_bleu; +  } + +  static int OrderToStateSize(int order) { +    return ((order-1) * 2 + 1) * sizeof(WordID) + 1; +  } + + protected: +  Ngram ngram_; +  vector<WordID> buffer_; +  const int order_; +  const int state_size_; +  const double floor_; + + public: +  const WordID kSTART; +  const WordID kSTOP; +  const WordID kUNKNOWN; +  const WordID kNONE; +  const WordID kSTAR; +}; + +BLEUModel::BLEUModel(const string& param) : +  fid_(0) { //The partial BLEU score is kept in feature id=0 +  vector<string> argv; +  int argc = SplitOnWhitespace(param, &argv); +  int order = 3; +  string filename; +  +  //loop over argv and load all references into vector of NgramMaps    +  if (argc < 1) { cerr << "BLEUModel requires a filename, minimally!\n"; abort(); } +   +   +  SetStateSize(BLEUModelImpl::OrderToStateSize(order)); +  pimpl_ = new BLEUModelImpl(order, filename); +} + +BLEUModel::~BLEUModel() { +  delete pimpl_; +} + +string BLEUModel::DebugStateToString(const void* state) const{ +  return pimpl_->DebugStateToString(state); +} + +void BLEUModel::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                          const Hypergraph::Edge& edge, +                                          const vector<const void*>& ant_states, +                                          SparseVector<double>* features, +                                          SparseVector<double>* estimated_features, +                                          void* state) const { + +  (void) smeta; +  /*cerr << "In BM calling set " << endl;   +  const Score *s=  &smeta.GetScore(); +  const int dl = smeta.GetDocLen(); +  cerr << "SCO " << s->ComputeScore() << endl; +  const DocScorer *ds = &smeta.GetDocScorer(); +  */ + +  cerr<< "Loading sentence " << smeta.GetSentenceID() << endl; +      //} +  features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state, smeta)); +  //cerr << "FID" << fid_ << " " << DebugStateToString(state) << endl; +} + +void BLEUModel::FinalTraversalFeatures(const void* ant_state, +                                           SparseVector<double>* features) const { + +  features->set_value(fid_, pimpl_->FinalTraversalCost(ant_state)); +} diff --git a/decoder/ff_bleu.h b/decoder/ff_bleu.h new file mode 100644 index 00000000..fb127241 --- /dev/null +++ b/decoder/ff_bleu.h @@ -0,0 +1,32 @@ +#ifndef _BLEU_FF_H_ +#define _BLEU_FF_H_ + +#include <vector> +#include <string> + +#include "hg.h" +#include "ff.h" +#include "config.h" + +class BLEUModelImpl; + +class BLEUModel : public FeatureFunction { + public: +  // param = "filename.lm [-o n]" +  BLEUModel(const std::string& param); +  ~BLEUModel(); +  virtual void FinalTraversalFeatures(const void* context, +                                      SparseVector<double>* features) const; +  std::string DebugStateToString(const void* state) const; + protected: +  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* out_context) const; + private: +  const int fid_; +  mutable BLEUModelImpl* pimpl_; +}; +#endif diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h index ef9eb388..21be9b21 100644 --- a/decoder/sentence_metadata.h +++ b/decoder/sentence_metadata.h @@ -3,6 +3,7 @@  #include <cassert>  #include "lattice.h" +#include "../vest/scorer.h"  struct SentenceMetadata {    SentenceMetadata(int id, const Lattice& ref) : @@ -30,10 +31,22 @@ struct SentenceMetadata {    // this will be empty if the translator accepts non FS input!    const Lattice& GetSourceLattice() const { return src_lattice_; } +  // access to document level scores for MIRA vector computation +  void SetScore(Score *s){app_score=s;} +  void SetDocScorer (const DocScorer *d){ds = d;} +  void SetDocLen(double dl){doc_len = dl;} + +  const Score& GetScore() const { return *app_score; } +  const DocScorer& GetDocScorer() const { return *ds; } +  double GetDocLen() const {return doc_len;} +   private:    const int sent_id_;    // the following should be set, if possible, by the Translator    int src_len_; +  double doc_len; +  const DocScorer* ds; +  const Score* app_score;   public:    Lattice src_lattice_;  // this will only be set if inputs are finite state!   private: diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc index 9c8a783a..d3f28804 100644 --- a/vest/aer_scorer.cc +++ b/vest/aer_scorer.cc @@ -15,15 +15,27 @@ class AERScore : public Score {    AERScore() : num_matches(), num_predicted(), num_in_ref() {}    AERScore(int m, int p, int r) :      num_matches(m), num_predicted(p), num_in_ref(r) {} -  virtual void PlusEquals(const Score& delta) { +  virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} +  virtual void PlusEquals(const Score& delta, const float scale) {      const AERScore& other = static_cast<const AERScore&>(delta);      num_matches   += other.num_matches;      num_predicted += other.num_predicted;      num_in_ref    += other.num_in_ref;    } + virtual void PlusEquals(const Score& delta) { +    const AERScore& other = static_cast<const AERScore&>(delta); +    num_matches   += other.num_matches; +    num_predicted += other.num_predicted; +    num_in_ref    += other.num_in_ref; +  } + +    virtual Score* GetZero() const {      return new AERScore;    } +  virtual Score* GetOne() const { +    return new AERScore; +  }    virtual void Subtract(const Score& rhs, Score* out) const {      AERScore* res = static_cast<AERScore*>(out);      const AERScore& other = static_cast<const AERScore&>(rhs); @@ -37,6 +49,7 @@ class AERScore : public Score {    float Recall() const {      return static_cast<float>(num_matches) / num_in_ref;    } +  float ComputePartialScore() const { return 0.0;}    virtual float ComputeScore() const {      const float prec = Precision();      const float rec = Recall(); @@ -82,6 +95,11 @@ static inline bool Safe(const Array2D<bool>& a, int i, int j) {      return false;  } +Score* AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const { +  Score* a = NULL; +  return a; +} +  Score* AERScorer::ScoreCandidate(const vector<WordID>& shyp) const {    boost::shared_ptr<Array2D<bool> > hyp =      AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h index a0afea3b..d0df35d5 100644 --- a/vest/aer_scorer.h +++ b/vest/aer_scorer.h @@ -12,6 +12,7 @@ class AERScorer : public SentenceScorer {    // is necessary.    AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = "");    Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;    static Score* ScoreFromString(const std::string& in);    const std::string* GetSource() const;   private: diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc index 7b2187f4..3dd077a6 100644 --- a/vest/comb_scorer.cc +++ b/vest/comb_scorer.cc @@ -8,6 +8,7 @@ class BLEUTERCombinationScore : public Score {    friend class BLEUTERCombinationScorer;   public:    ~BLEUTERCombinationScore(); +  float ComputePartialScore() const { return 0.0;}    float ComputeScore() const {      return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f;    } @@ -17,10 +18,25 @@ class BLEUTERCombinationScore : public Score {        ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f);      *details = buf;    } +  void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + +  void PlusEquals(const Score& delta, const float scale) { +    bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale); +    ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale); +  }    void PlusEquals(const Score& delta) {      bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu);      ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter);    } + + + +  Score* GetOne() const { +    BLEUTERCombinationScore* res = new BLEUTERCombinationScore; +    res->bleu = bleu->GetOne(); +    res->ter = ter->GetOne(); +    return res;     +  }    Score* GetZero() const {      BLEUTERCombinationScore* res = new BLEUTERCombinationScore;      res->bleu = bleu->GetZero(); @@ -65,6 +81,11 @@ BLEUTERCombinationScorer::~BLEUTERCombinationScorer() {    delete ter_;  } +Score* BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const { +  Score* a = NULL; +  return a; +} +  Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {    BLEUTERCombinationScore* res = new BLEUTERCombinationScore;    res->bleu = bleu_->ScoreCandidate(hyp); diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h index 70b1ec75..1a4f3324 100644 --- a/vest/comb_scorer.h +++ b/vest/comb_scorer.h @@ -8,6 +8,7 @@ class BLEUTERCombinationScorer : public SentenceScorer {    BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs);    ~BLEUTERCombinationScorer();    Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;    static Score* ScoreFromString(const std::string& in);   private:    SentenceScorer* bleu_; diff --git a/vest/scorer.cc b/vest/scorer.cc index 6c604ab8..524b15a5 100644 --- a/vest/scorer.cc +++ b/vest/scorer.cc @@ -35,6 +35,8 @@ ScoreType ScoreTypeFromString(const string& st) {      return AER;    if (sl == "bleu" || sl == "ibm_bleu")      return IBM_BLEU; +  if (sl == "ibm_bleu_3") +    return IBM_BLEU_3;    if (sl == "nist_bleu")      return NIST_BLEU;    if (sl == "koehn_bleu") @@ -53,6 +55,7 @@ class SERScore : public Score {    friend class SERScorer;   public:    SERScore() : correct(0), total(0) {} +  float ComputePartialScore() const { return 0.0;}    float ComputeScore() const {      return static_cast<float>(correct) / static_cast<float>(total);    } @@ -61,11 +64,18 @@ class SERScore : public Score {      os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';      *details = os.str();    } -  void PlusEquals(const Score& delta) { +  void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){} +   +  void PlusEquals(const Score& delta, const float scale) {      correct += static_cast<const SERScore&>(delta).correct;      total += static_cast<const SERScore&>(delta).total;    } +  void PlusEquals(const Score& delta) { +    correct += static_cast<const SERScore&>(delta).correct; +    total += static_cast<const SERScore&>(delta).total; +    }    Score* GetZero() const { return new SERScore; } +  Score* GetOne() const { return new SERScore; }    void Subtract(const Score& rhs, Score* res) const {      SERScore* r = static_cast<SERScore*>(res);      r->correct = correct - static_cast<const SERScore&>(rhs).correct; @@ -84,6 +94,10 @@ class SERScore : public Score {  class SERScorer : public SentenceScorer {   public:    SERScorer(const vector<vector<WordID> >& references) : refs_(references) {} +  Score* ScoreCCandidate(const vector<WordID>& hyp) const { +    Score* a = NULL; +    return a; +  }    Score* ScoreCandidate(const vector<WordID>& hyp) const {      SERScore* res = new SERScore;      res->total = 1; @@ -101,13 +115,20 @@ class SERScorer : public SentenceScorer {  class BLEUScore : public Score {    friend class BLEUScorerBase;   public: -  BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) { +  BLEUScore(int n) : correct_ngram_hit_counts(float(0),float(n)), hyp_ngram_counts(float(0),float(n)) {      ref_len = 0;      hyp_len = 0; } +  BLEUScore(int n, int k) :  correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) { +    ref_len = k; +    hyp_len = k; }      float ComputeScore() const; +  float ComputePartialScore() const;    void ScoreDetails(string* details) const;    void PlusEquals(const Score& delta); +  void PlusEquals(const Score& delta, const float scale); +  void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len);    Score* GetZero() const; +  Score* GetOne() const;    void Subtract(const Score& rhs, Score* res) const;    void Encode(string* out) const;    bool IsAdditiveIdentity() const { @@ -119,10 +140,11 @@ class BLEUScore : public Score {    }   private:    float ComputeScore(vector<float>* precs, float* bp) const; -  valarray<int> correct_ngram_hit_counts; -  valarray<int> hyp_ngram_counts; +  float ComputePartialScore(vector<float>* prec, float* bp) const; +  valarray<float> correct_ngram_hit_counts; +  valarray<float> hyp_ngram_counts;    float ref_len; -  int hyp_len; +  float hyp_len;  };  class BLEUScorerBase : public SentenceScorer { @@ -131,6 +153,7 @@ class BLEUScorerBase : public SentenceScorer {               int n               );    Score* ScoreCandidate(const vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const vector<WordID>& hyp) const;    static Score* ScoreFromString(const string& in);   protected: @@ -171,8 +194,10 @@ class BLEUScorerBase : public SentenceScorer {    }    void ComputeNgramStats(const vector<WordID>& sent, -       valarray<int>* correct, -       valarray<int>* hyp) const { +			 valarray<float>* correct, +			 valarray<float>* hyp, +			 bool clip_counts) +    const {      assert(correct->size() == n_);      assert(hyp->size() == n_);      vector<WordID> ngram(n_); @@ -186,10 +211,15 @@ class BLEUScorerBase : public SentenceScorer {        for (int i=1; i<=k; ++i) {  	ngram.push_back(sent[j + i - 1]);          pair<int,int>& p = ngrams_[ngram]; -        if (p.second < p.first) { -          ++p.second; -          (*correct)[i-1]++; -        } +	if(clip_counts){ +	  if (p.second < p.first) { +	    ++p.second; +	    (*correct)[i-1]++; +	  }} +	else { +	  ++p.second; +	  (*correct)[i-1]++; +	}  	// if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams:  	if (!p.first) {  	  for (; i<=k; ++i) @@ -284,7 +314,8 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,        const vector<vector<WordID> >& refs,        const string& src) {    switch (type) { -    case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); +  case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); +  case IBM_BLEU_3 : return new IBM_BLEUScorer(refs,3);      case NIST_BLEU: return new NIST_BLEUScorer(refs, 4);      case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4);      case AER: return new AERScorer(refs, src); @@ -299,6 +330,7 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,  Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) {    switch (type) {      case IBM_BLEU: +  case IBM_BLEU_3:      case NIST_BLEU:      case Koehn_BLEU:        return BLEUScorerBase::ScoreFromString(in); @@ -423,6 +455,36 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {    return exp(log_bleu);  } + +//comptue scaled score for oracle retrieval +float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const { +  // cerr << "Then here " << endl; +  float log_bleu = 0; +  if (precs) precs->clear(); +  int count = 0; +  for (int i = 0; i < hyp_ngram_counts.size(); ++i) { +    //  cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl; +    if (hyp_ngram_counts[i] > 0) { +      float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); +      if (precs) precs->push_back(exp(lprec)); +      log_bleu += lprec; +      ++count; +    } +  } +  log_bleu /= static_cast<float>(count); +  float lbp = 0.0; +  if (hyp_len < ref_len) +    lbp = (hyp_len - ref_len) / hyp_len; +  log_bleu += lbp; +  if (bp) *bp = exp(lbp); +  return exp(log_bleu); +} + +float BLEUScore::ComputePartialScore() const { +  // cerr << "In here first " << endl; +  return ComputePartialScore(NULL, NULL); +} +  float BLEUScore::ComputeScore() const {    return ComputeScore(NULL, NULL);  } @@ -444,10 +506,37 @@ void BLEUScore::PlusEquals(const Score& delta) {    hyp_len += d.hyp_len;  } +void BLEUScore::PlusEquals(const Score& delta, const float scale) { +  const BLEUScore& d = static_cast<const BLEUScore&>(delta); +  correct_ngram_hit_counts = (correct_ngram_hit_counts + d.correct_ngram_hit_counts) * scale; +  hyp_ngram_counts = ( hyp_ngram_counts + d.hyp_ngram_counts) * scale; +  ref_len = (ref_len + d.ref_len) * scale; +  hyp_len = ( hyp_len + d.hyp_len) * scale; + +} + +void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){ +  const BLEUScore& d = static_cast<const BLEUScore&>(delta); +  correct_ngram_hit_counts += d.correct_ngram_hit_counts; +  hyp_ngram_counts += d.hyp_ngram_counts; +  //scale the reference length according to the size of the input sentence covered by this rule +   +  ref_len *= (float)oracle_f_cover / src_len; +  ref_len += d.ref_len; +   +  hyp_len = oracle_e_cover; +  hyp_len += d.hyp_len; +} + +  Score* BLEUScore::GetZero() const {    return new BLEUScore(hyp_ngram_counts.size());  } +Score* BLEUScore::GetOne() const { +  return new BLEUScore(hyp_ngram_counts.size(),1); +} +  void BLEUScore::Encode(string* out) const {    ostringstream os;    const int n = correct_ngram_hit_counts.size(); @@ -470,12 +559,24 @@ Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const {    BLEUScore* bs = new BLEUScore(n_);    for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)      i->second.second = 0; -  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts); +  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true);    bs->ref_len = ComputeRefLength(hyp);    bs->hyp_len = hyp.size();    return bs;  } +Score* BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const { +  BLEUScore* bs = new BLEUScore(n_); +  for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) +    i->second.second = 0; +  bool clip = false; +  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip); +  bs->ref_len = ComputeRefLength(hyp); +  bs->hyp_len = hyp.size(); +  return bs; +} + +  DocScorer::~DocScorer() {    for (int i=0; i < scorers_.size(); ++i)      delete scorers_[i]; diff --git a/vest/scorer.h b/vest/scorer.h index 83d4db4c..7ce688c4 100644 --- a/vest/scorer.h +++ b/vest/scorer.h @@ -10,17 +10,21 @@ class ViterbiEnvelope;  class ErrorSurface;  class Hypergraph;  // needed for alignment -enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER }; +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 };  ScoreType ScoreTypeFromString(const std::string& st);  class Score {   public:    virtual ~Score();    virtual float ComputeScore() const = 0; +  virtual float ComputePartialScore() const =0;    virtual void ScoreDetails(std::string* details) const = 0; +  virtual void PlusEquals(const Score& rhs, const float scale) = 0;    virtual void PlusEquals(const Score& rhs) = 0; +  virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0;    virtual void Subtract(const Score& rhs, Score* res) const = 0;    virtual Score* GetZero() const = 0; +  virtual Score* GetOne() const = 0;    virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta                                        // to another score results in no score change  				      // under any circumstances @@ -32,6 +36,7 @@ class SentenceScorer {    virtual ~SentenceScorer();    void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;    virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0; +  virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0;    virtual const std::string* GetSource() const;    static Score* CreateScoreFromString(const ScoreType type, const std::string& in);    static SentenceScorer* CreateSentenceScorer(const ScoreType type, diff --git a/vest/ter.cc b/vest/ter.cc index ef66f3b7..6e16e1cf 100644 --- a/vest/ter.cc +++ b/vest/ter.cc @@ -424,17 +424,26 @@ class TERScore : public Score {    static const unsigned kDUMMY_LAST_ENTRY = 5;   TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} +  float ComputePartialScore() const { return 0.0;}    float ComputeScore() const {      float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]);      return edits / static_cast<float>(stats[kREF_WORDCOUNT]);    }    void ScoreDetails(string* details) const; +  void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} +  void PlusEquals(const Score& delta, const float scale) { +    stats += static_cast<const TERScore&>(delta).stats; +  }    void PlusEquals(const Score& delta) {      stats += static_cast<const TERScore&>(delta).stats;    } +    Score* GetZero() const {      return new TERScore;    } +  Score* GetOne() const { +    return new TERScore; +  }    void Subtract(const Score& rhs, Score* res) const {      static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats;    } @@ -489,6 +498,11 @@ TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) {      impl_[i] = new TERScorerImpl(refs[i]);  } +Score* TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const { +  Score* a = NULL; +  return a; +} +  Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {    float best_score = numeric_limits<float>::max();    TERScore* res = new TERScore; @@ -10,6 +10,7 @@ class TERScorer : public SentenceScorer {    TERScorer(const std::vector<std::vector<WordID> >& references);    ~TERScorer();    Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;    static Score* ScoreFromString(const std::string& data);   private:    std::vector<TERScorerImpl*> impl_; | 
