diff options
| author | vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-14 23:00:08 +0000 | 
|---|---|---|
| committer | vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-14 23:00:08 +0000 | 
| commit | 2775fc13d1e8d3ad45c8ddf94226397403e0e373 (patch) | |
| tree | 487fe0f9e717e6d444a448142d7b91e75e6873a1 /vest | |
| parent | 8f97e6b03114761870f0c72f18f0928fac28d0f9 (diff) | |
Added oracle forest rescoring
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@254 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
| -rw-r--r-- | vest/aer_scorer.cc | 20 | ||||
| -rw-r--r-- | vest/aer_scorer.h | 1 | ||||
| -rw-r--r-- | vest/comb_scorer.cc | 21 | ||||
| -rw-r--r-- | vest/comb_scorer.h | 1 | ||||
| -rw-r--r-- | vest/scorer.cc | 127 | ||||
| -rw-r--r-- | vest/scorer.h | 7 | ||||
| -rw-r--r-- | vest/ter.cc | 14 | ||||
| -rw-r--r-- | vest/ter.h | 1 | 
8 files changed, 177 insertions, 15 deletions
| diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc index 9c8a783a..d3f28804 100644 --- a/vest/aer_scorer.cc +++ b/vest/aer_scorer.cc @@ -15,15 +15,27 @@ class AERScore : public Score {    AERScore() : num_matches(), num_predicted(), num_in_ref() {}    AERScore(int m, int p, int r) :      num_matches(m), num_predicted(p), num_in_ref(r) {} -  virtual void PlusEquals(const Score& delta) { +  virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} +  virtual void PlusEquals(const Score& delta, const float scale) {      const AERScore& other = static_cast<const AERScore&>(delta);      num_matches   += other.num_matches;      num_predicted += other.num_predicted;      num_in_ref    += other.num_in_ref;    } + virtual void PlusEquals(const Score& delta) { +    const AERScore& other = static_cast<const AERScore&>(delta); +    num_matches   += other.num_matches; +    num_predicted += other.num_predicted; +    num_in_ref    += other.num_in_ref; +  } + +    virtual Score* GetZero() const {      return new AERScore;    } +  virtual Score* GetOne() const { +    return new AERScore; +  }    virtual void Subtract(const Score& rhs, Score* out) const {      AERScore* res = static_cast<AERScore*>(out);      const AERScore& other = static_cast<const AERScore&>(rhs); @@ -37,6 +49,7 @@ class AERScore : public Score {    float Recall() const {      return static_cast<float>(num_matches) / num_in_ref;    } +  float ComputePartialScore() const { return 0.0;}    virtual float ComputeScore() const {      const float prec = Precision();      const float rec = Recall(); @@ -82,6 +95,11 @@ static inline bool Safe(const Array2D<bool>& a, int i, int j) {      return false;  } +Score* AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const { +  Score* a = NULL; +  return a; +} +  Score* AERScorer::ScoreCandidate(const vector<WordID>& shyp) const {    boost::shared_ptr<Array2D<bool> > hyp =      AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h index a0afea3b..d0df35d5 100644 --- a/vest/aer_scorer.h +++ b/vest/aer_scorer.h @@ -12,6 +12,7 @@ class AERScorer : public SentenceScorer {    // is necessary.    AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = "");    Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;    static Score* ScoreFromString(const std::string& in);    const std::string* GetSource() const;   private: diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc index 7b2187f4..3dd077a6 100644 --- a/vest/comb_scorer.cc +++ b/vest/comb_scorer.cc @@ -8,6 +8,7 @@ class BLEUTERCombinationScore : public Score {    friend class BLEUTERCombinationScorer;   public:    ~BLEUTERCombinationScore(); +  float ComputePartialScore() const { return 0.0;}    float ComputeScore() const {      return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f;    } @@ -17,10 +18,25 @@ class BLEUTERCombinationScore : public Score {        ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f);      *details = buf;    } +  void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + +  void PlusEquals(const Score& delta, const float scale) { +    bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale); +    ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale); +  }    void PlusEquals(const Score& delta) {      bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu);      ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter);    } + + + +  Score* GetOne() const { +    BLEUTERCombinationScore* res = new BLEUTERCombinationScore; +    res->bleu = bleu->GetOne(); +    res->ter = ter->GetOne(); +    return res;     +  }    Score* GetZero() const {      BLEUTERCombinationScore* res = new BLEUTERCombinationScore;      res->bleu = bleu->GetZero(); @@ -65,6 +81,11 @@ BLEUTERCombinationScorer::~BLEUTERCombinationScorer() {    delete ter_;  } +Score* BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const { +  Score* a = NULL; +  return a; +} +  Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {    BLEUTERCombinationScore* res = new BLEUTERCombinationScore;    res->bleu = bleu_->ScoreCandidate(hyp); diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h index 70b1ec75..1a4f3324 100644 --- a/vest/comb_scorer.h +++ b/vest/comb_scorer.h @@ -8,6 +8,7 @@ class BLEUTERCombinationScorer : public SentenceScorer {    BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs);    ~BLEUTERCombinationScorer();    Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;    static Score* ScoreFromString(const std::string& in);   private:    SentenceScorer* bleu_; diff --git a/vest/scorer.cc b/vest/scorer.cc index 6c604ab8..524b15a5 100644 --- a/vest/scorer.cc +++ b/vest/scorer.cc @@ -35,6 +35,8 @@ ScoreType ScoreTypeFromString(const string& st) {      return AER;    if (sl == "bleu" || sl == "ibm_bleu")      return IBM_BLEU; +  if (sl == "ibm_bleu_3") +    return IBM_BLEU_3;    if (sl == "nist_bleu")      return NIST_BLEU;    if (sl == "koehn_bleu") @@ -53,6 +55,7 @@ class SERScore : public Score {    friend class SERScorer;   public:    SERScore() : correct(0), total(0) {} +  float ComputePartialScore() const { return 0.0;}    float ComputeScore() const {      return static_cast<float>(correct) / static_cast<float>(total);    } @@ -61,11 +64,18 @@ class SERScore : public Score {      os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';      *details = os.str();    } -  void PlusEquals(const Score& delta) { +  void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){} +   +  void PlusEquals(const Score& delta, const float scale) {      correct += static_cast<const SERScore&>(delta).correct;      total += static_cast<const SERScore&>(delta).total;    } +  void PlusEquals(const Score& delta) { +    correct += static_cast<const SERScore&>(delta).correct; +    total += static_cast<const SERScore&>(delta).total; +    }    Score* GetZero() const { return new SERScore; } +  Score* GetOne() const { return new SERScore; }    void Subtract(const Score& rhs, Score* res) const {      SERScore* r = static_cast<SERScore*>(res);      r->correct = correct - static_cast<const SERScore&>(rhs).correct; @@ -84,6 +94,10 @@ class SERScore : public Score {  class SERScorer : public SentenceScorer {   public:    SERScorer(const vector<vector<WordID> >& references) : refs_(references) {} +  Score* ScoreCCandidate(const vector<WordID>& hyp) const { +    Score* a = NULL; +    return a; +  }    Score* ScoreCandidate(const vector<WordID>& hyp) const {      SERScore* res = new SERScore;      res->total = 1; @@ -101,13 +115,20 @@ class SERScorer : public SentenceScorer {  class BLEUScore : public Score {    friend class BLEUScorerBase;   public: -  BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) { +  BLEUScore(int n) : correct_ngram_hit_counts(float(0),float(n)), hyp_ngram_counts(float(0),float(n)) {      ref_len = 0;      hyp_len = 0; } +  BLEUScore(int n, int k) :  correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) { +    ref_len = k; +    hyp_len = k; }      float ComputeScore() const; +  float ComputePartialScore() const;    void ScoreDetails(string* details) const;    void PlusEquals(const Score& delta); +  void PlusEquals(const Score& delta, const float scale); +  void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len);    Score* GetZero() const; +  Score* GetOne() const;    void Subtract(const Score& rhs, Score* res) const;    void Encode(string* out) const;    bool IsAdditiveIdentity() const { @@ -119,10 +140,11 @@ class BLEUScore : public Score {    }   private:    float ComputeScore(vector<float>* precs, float* bp) const; -  valarray<int> correct_ngram_hit_counts; -  valarray<int> hyp_ngram_counts; +  float ComputePartialScore(vector<float>* prec, float* bp) const; +  valarray<float> correct_ngram_hit_counts; +  valarray<float> hyp_ngram_counts;    float ref_len; -  int hyp_len; +  float hyp_len;  };  class BLEUScorerBase : public SentenceScorer { @@ -131,6 +153,7 @@ class BLEUScorerBase : public SentenceScorer {               int n               );    Score* ScoreCandidate(const vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const vector<WordID>& hyp) const;    static Score* ScoreFromString(const string& in);   protected: @@ -171,8 +194,10 @@ class BLEUScorerBase : public SentenceScorer {    }    void ComputeNgramStats(const vector<WordID>& sent, -       valarray<int>* correct, -       valarray<int>* hyp) const { +			 valarray<float>* correct, +			 valarray<float>* hyp, +			 bool clip_counts) +    const {      assert(correct->size() == n_);      assert(hyp->size() == n_);      vector<WordID> ngram(n_); @@ -186,10 +211,15 @@ class BLEUScorerBase : public SentenceScorer {        for (int i=1; i<=k; ++i) {  	ngram.push_back(sent[j + i - 1]);          pair<int,int>& p = ngrams_[ngram]; -        if (p.second < p.first) { -          ++p.second; -          (*correct)[i-1]++; -        } +	if(clip_counts){ +	  if (p.second < p.first) { +	    ++p.second; +	    (*correct)[i-1]++; +	  }} +	else { +	  ++p.second; +	  (*correct)[i-1]++; +	}  	// if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams:  	if (!p.first) {  	  for (; i<=k; ++i) @@ -284,7 +314,8 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,        const vector<vector<WordID> >& refs,        const string& src) {    switch (type) { -    case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); +  case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); +  case IBM_BLEU_3 : return new IBM_BLEUScorer(refs,3);      case NIST_BLEU: return new NIST_BLEUScorer(refs, 4);      case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4);      case AER: return new AERScorer(refs, src); @@ -299,6 +330,7 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,  Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) {    switch (type) {      case IBM_BLEU: +  case IBM_BLEU_3:      case NIST_BLEU:      case Koehn_BLEU:        return BLEUScorerBase::ScoreFromString(in); @@ -423,6 +455,36 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {    return exp(log_bleu);  } + +//comptue scaled score for oracle retrieval +float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const { +  // cerr << "Then here " << endl; +  float log_bleu = 0; +  if (precs) precs->clear(); +  int count = 0; +  for (int i = 0; i < hyp_ngram_counts.size(); ++i) { +    //  cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl; +    if (hyp_ngram_counts[i] > 0) { +      float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); +      if (precs) precs->push_back(exp(lprec)); +      log_bleu += lprec; +      ++count; +    } +  } +  log_bleu /= static_cast<float>(count); +  float lbp = 0.0; +  if (hyp_len < ref_len) +    lbp = (hyp_len - ref_len) / hyp_len; +  log_bleu += lbp; +  if (bp) *bp = exp(lbp); +  return exp(log_bleu); +} + +float BLEUScore::ComputePartialScore() const { +  // cerr << "In here first " << endl; +  return ComputePartialScore(NULL, NULL); +} +  float BLEUScore::ComputeScore() const {    return ComputeScore(NULL, NULL);  } @@ -444,10 +506,37 @@ void BLEUScore::PlusEquals(const Score& delta) {    hyp_len += d.hyp_len;  } +void BLEUScore::PlusEquals(const Score& delta, const float scale) { +  const BLEUScore& d = static_cast<const BLEUScore&>(delta); +  correct_ngram_hit_counts = (correct_ngram_hit_counts + d.correct_ngram_hit_counts) * scale; +  hyp_ngram_counts = ( hyp_ngram_counts + d.hyp_ngram_counts) * scale; +  ref_len = (ref_len + d.ref_len) * scale; +  hyp_len = ( hyp_len + d.hyp_len) * scale; + +} + +void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){ +  const BLEUScore& d = static_cast<const BLEUScore&>(delta); +  correct_ngram_hit_counts += d.correct_ngram_hit_counts; +  hyp_ngram_counts += d.hyp_ngram_counts; +  //scale the reference length according to the size of the input sentence covered by this rule +   +  ref_len *= (float)oracle_f_cover / src_len; +  ref_len += d.ref_len; +   +  hyp_len = oracle_e_cover; +  hyp_len += d.hyp_len; +} + +  Score* BLEUScore::GetZero() const {    return new BLEUScore(hyp_ngram_counts.size());  } +Score* BLEUScore::GetOne() const { +  return new BLEUScore(hyp_ngram_counts.size(),1); +} +  void BLEUScore::Encode(string* out) const {    ostringstream os;    const int n = correct_ngram_hit_counts.size(); @@ -470,12 +559,24 @@ Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const {    BLEUScore* bs = new BLEUScore(n_);    for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)      i->second.second = 0; -  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts); +  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true);    bs->ref_len = ComputeRefLength(hyp);    bs->hyp_len = hyp.size();    return bs;  } +Score* BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const { +  BLEUScore* bs = new BLEUScore(n_); +  for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) +    i->second.second = 0; +  bool clip = false; +  ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip); +  bs->ref_len = ComputeRefLength(hyp); +  bs->hyp_len = hyp.size(); +  return bs; +} + +  DocScorer::~DocScorer() {    for (int i=0; i < scorers_.size(); ++i)      delete scorers_[i]; diff --git a/vest/scorer.h b/vest/scorer.h index 83d4db4c..7ce688c4 100644 --- a/vest/scorer.h +++ b/vest/scorer.h @@ -10,17 +10,21 @@ class ViterbiEnvelope;  class ErrorSurface;  class Hypergraph;  // needed for alignment -enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER }; +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 };  ScoreType ScoreTypeFromString(const std::string& st);  class Score {   public:    virtual ~Score();    virtual float ComputeScore() const = 0; +  virtual float ComputePartialScore() const =0;    virtual void ScoreDetails(std::string* details) const = 0; +  virtual void PlusEquals(const Score& rhs, const float scale) = 0;    virtual void PlusEquals(const Score& rhs) = 0; +  virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0;    virtual void Subtract(const Score& rhs, Score* res) const = 0;    virtual Score* GetZero() const = 0; +  virtual Score* GetOne() const = 0;    virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta                                        // to another score results in no score change  				      // under any circumstances @@ -32,6 +36,7 @@ class SentenceScorer {    virtual ~SentenceScorer();    void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;    virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0; +  virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0;    virtual const std::string* GetSource() const;    static Score* CreateScoreFromString(const ScoreType type, const std::string& in);    static SentenceScorer* CreateSentenceScorer(const ScoreType type, diff --git a/vest/ter.cc b/vest/ter.cc index ef66f3b7..6e16e1cf 100644 --- a/vest/ter.cc +++ b/vest/ter.cc @@ -424,17 +424,26 @@ class TERScore : public Score {    static const unsigned kDUMMY_LAST_ENTRY = 5;   TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} +  float ComputePartialScore() const { return 0.0;}    float ComputeScore() const {      float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]);      return edits / static_cast<float>(stats[kREF_WORDCOUNT]);    }    void ScoreDetails(string* details) const; +  void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} +  void PlusEquals(const Score& delta, const float scale) { +    stats += static_cast<const TERScore&>(delta).stats; +  }    void PlusEquals(const Score& delta) {      stats += static_cast<const TERScore&>(delta).stats;    } +    Score* GetZero() const {      return new TERScore;    } +  Score* GetOne() const { +    return new TERScore; +  }    void Subtract(const Score& rhs, Score* res) const {      static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats;    } @@ -489,6 +498,11 @@ TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) {      impl_[i] = new TERScorerImpl(refs[i]);  } +Score* TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const { +  Score* a = NULL; +  return a; +} +  Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {    float best_score = numeric_limits<float>::max();    TERScore* res = new TERScore; @@ -10,6 +10,7 @@ class TERScorer : public SentenceScorer {    TERScorer(const std::vector<std::vector<WordID> >& references);    ~TERScorer();    Score* ScoreCandidate(const std::vector<WordID>& hyp) const; +  Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;    static Score* ScoreFromString(const std::string& data);   private:    std::vector<TERScorerImpl*> impl_; | 
