diff options
author | vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-14 23:00:08 +0000 |
---|---|---|
committer | vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-14 23:00:08 +0000 |
commit | 2775fc13d1e8d3ad45c8ddf94226397403e0e373 (patch) | |
tree | 487fe0f9e717e6d444a448142d7b91e75e6873a1 /vest | |
parent | 8f97e6b03114761870f0c72f18f0928fac28d0f9 (diff) |
Added oracle forest rescoring
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@254 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
-rw-r--r-- | vest/aer_scorer.cc | 20 | ||||
-rw-r--r-- | vest/aer_scorer.h | 1 | ||||
-rw-r--r-- | vest/comb_scorer.cc | 21 | ||||
-rw-r--r-- | vest/comb_scorer.h | 1 | ||||
-rw-r--r-- | vest/scorer.cc | 127 | ||||
-rw-r--r-- | vest/scorer.h | 7 | ||||
-rw-r--r-- | vest/ter.cc | 14 | ||||
-rw-r--r-- | vest/ter.h | 1 |
8 files changed, 177 insertions, 15 deletions
diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc index 9c8a783a..d3f28804 100644 --- a/vest/aer_scorer.cc +++ b/vest/aer_scorer.cc @@ -15,15 +15,27 @@ class AERScore : public Score { AERScore() : num_matches(), num_predicted(), num_in_ref() {} AERScore(int m, int p, int r) : num_matches(m), num_predicted(p), num_in_ref(r) {} - virtual void PlusEquals(const Score& delta) { + virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + virtual void PlusEquals(const Score& delta, const float scale) { const AERScore& other = static_cast<const AERScore&>(delta); num_matches += other.num_matches; num_predicted += other.num_predicted; num_in_ref += other.num_in_ref; } + virtual void PlusEquals(const Score& delta) { + const AERScore& other = static_cast<const AERScore&>(delta); + num_matches += other.num_matches; + num_predicted += other.num_predicted; + num_in_ref += other.num_in_ref; + } + + virtual Score* GetZero() const { return new AERScore; } + virtual Score* GetOne() const { + return new AERScore; + } virtual void Subtract(const Score& rhs, Score* out) const { AERScore* res = static_cast<AERScore*>(out); const AERScore& other = static_cast<const AERScore&>(rhs); @@ -37,6 +49,7 @@ class AERScore : public Score { float Recall() const { return static_cast<float>(num_matches) / num_in_ref; } + float ComputePartialScore() const { return 0.0;} virtual float ComputeScore() const { const float prec = Precision(); const float rec = Recall(); @@ -82,6 +95,11 @@ static inline bool Safe(const Array2D<bool>& a, int i, int j) { return false; } +Score* AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const { + Score* a = NULL; + return a; +} + Score* AERScorer::ScoreCandidate(const vector<WordID>& shyp) const { boost::shared_ptr<Array2D<bool> > hyp = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h index a0afea3b..d0df35d5 100644 --- a/vest/aer_scorer.h +++ b/vest/aer_scorer.h @@ -12,6 +12,7 @@ class AERScorer : public SentenceScorer { // is necessary. AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = ""); Score* ScoreCandidate(const std::vector<WordID>& hyp) const; + Score* ScoreCCandidate(const std::vector<WordID>& hyp) const; static Score* ScoreFromString(const std::string& in); const std::string* GetSource() const; private: diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc index 7b2187f4..3dd077a6 100644 --- a/vest/comb_scorer.cc +++ b/vest/comb_scorer.cc @@ -8,6 +8,7 @@ class BLEUTERCombinationScore : public Score { friend class BLEUTERCombinationScorer; public: ~BLEUTERCombinationScore(); + float ComputePartialScore() const { return 0.0;} float ComputeScore() const { return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f; } @@ -17,10 +18,25 @@ class BLEUTERCombinationScore : public Score { ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f); *details = buf; } + void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + + void PlusEquals(const Score& delta, const float scale) { + bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale); + ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale); + } void PlusEquals(const Score& delta) { bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu); ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter); } + + + + Score* GetOne() const { + BLEUTERCombinationScore* res = new BLEUTERCombinationScore; + res->bleu = bleu->GetOne(); + res->ter = ter->GetOne(); + return res; + } Score* GetZero() const { BLEUTERCombinationScore* res = new BLEUTERCombinationScore; res->bleu = bleu->GetZero(); @@ -65,6 +81,11 @@ BLEUTERCombinationScorer::~BLEUTERCombinationScorer() { delete ter_; } +Score* BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const { + Score* a = NULL; + return a; +} + Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { BLEUTERCombinationScore* res = new BLEUTERCombinationScore; res->bleu = bleu_->ScoreCandidate(hyp); diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h index 70b1ec75..1a4f3324 100644 --- a/vest/comb_scorer.h +++ b/vest/comb_scorer.h @@ -8,6 +8,7 @@ class BLEUTERCombinationScorer : public SentenceScorer { BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs); ~BLEUTERCombinationScorer(); Score* ScoreCandidate(const std::vector<WordID>& hyp) const; + Score* ScoreCCandidate(const std::vector<WordID>& hyp) const; static Score* ScoreFromString(const std::string& in); private: SentenceScorer* bleu_; diff --git a/vest/scorer.cc b/vest/scorer.cc index 6c604ab8..524b15a5 100644 --- a/vest/scorer.cc +++ b/vest/scorer.cc @@ -35,6 +35,8 @@ ScoreType ScoreTypeFromString(const string& st) { return AER; if (sl == "bleu" || sl == "ibm_bleu") return IBM_BLEU; + if (sl == "ibm_bleu_3") + return IBM_BLEU_3; if (sl == "nist_bleu") return NIST_BLEU; if (sl == "koehn_bleu") @@ -53,6 +55,7 @@ class SERScore : public Score { friend class SERScorer; public: SERScore() : correct(0), total(0) {} + float ComputePartialScore() const { return 0.0;} float ComputeScore() const { return static_cast<float>(correct) / static_cast<float>(total); } @@ -61,11 +64,18 @@ class SERScore : public Score { os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; *details = os.str(); } - void PlusEquals(const Score& delta) { + void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){} + + void PlusEquals(const Score& delta, const float scale) { correct += static_cast<const SERScore&>(delta).correct; total += static_cast<const SERScore&>(delta).total; } + void PlusEquals(const Score& delta) { + correct += static_cast<const SERScore&>(delta).correct; + total += static_cast<const SERScore&>(delta).total; + } Score* GetZero() const { return new SERScore; } + Score* GetOne() const { return new SERScore; } void Subtract(const Score& rhs, Score* res) const { SERScore* r = static_cast<SERScore*>(res); r->correct = correct - static_cast<const SERScore&>(rhs).correct; @@ -84,6 +94,10 @@ class SERScore : public Score { class SERScorer : public SentenceScorer { public: SERScorer(const vector<vector<WordID> >& references) : refs_(references) {} + Score* ScoreCCandidate(const vector<WordID>& hyp) const { + Score* a = NULL; + return a; + } Score* ScoreCandidate(const vector<WordID>& hyp) const { SERScore* res = new SERScore; res->total = 1; @@ -101,13 +115,20 @@ class SERScorer : public SentenceScorer { class BLEUScore : public Score { friend class BLEUScorerBase; public: - BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) { + BLEUScore(int n) : correct_ngram_hit_counts(float(0),float(n)), hyp_ngram_counts(float(0),float(n)) { ref_len = 0; hyp_len = 0; } + BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) { + ref_len = k; + hyp_len = k; } float ComputeScore() const; + float ComputePartialScore() const; void ScoreDetails(string* details) const; void PlusEquals(const Score& delta); + void PlusEquals(const Score& delta, const float scale); + void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len); Score* GetZero() const; + Score* GetOne() const; void Subtract(const Score& rhs, Score* res) const; void Encode(string* out) const; bool IsAdditiveIdentity() const { @@ -119,10 +140,11 @@ class BLEUScore : public Score { } private: float ComputeScore(vector<float>* precs, float* bp) const; - valarray<int> correct_ngram_hit_counts; - valarray<int> hyp_ngram_counts; + float ComputePartialScore(vector<float>* prec, float* bp) const; + valarray<float> correct_ngram_hit_counts; + valarray<float> hyp_ngram_counts; float ref_len; - int hyp_len; + float hyp_len; }; class BLEUScorerBase : public SentenceScorer { @@ -131,6 +153,7 @@ class BLEUScorerBase : public SentenceScorer { int n ); Score* ScoreCandidate(const vector<WordID>& hyp) const; + Score* ScoreCCandidate(const vector<WordID>& hyp) const; static Score* ScoreFromString(const string& in); protected: @@ -171,8 +194,10 @@ class BLEUScorerBase : public SentenceScorer { } void ComputeNgramStats(const vector<WordID>& sent, - valarray<int>* correct, - valarray<int>* hyp) const { + valarray<float>* correct, + valarray<float>* hyp, + bool clip_counts) + const { assert(correct->size() == n_); assert(hyp->size() == n_); vector<WordID> ngram(n_); @@ -186,10 +211,15 @@ class BLEUScorerBase : public SentenceScorer { for (int i=1; i<=k; ++i) { ngram.push_back(sent[j + i - 1]); pair<int,int>& p = ngrams_[ngram]; - if (p.second < p.first) { - ++p.second; - (*correct)[i-1]++; - } + if(clip_counts){ + if (p.second < p.first) { + ++p.second; + (*correct)[i-1]++; + }} + else { + ++p.second; + (*correct)[i-1]++; + } // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams: if (!p.first) { for (; i<=k; ++i) @@ -284,7 +314,8 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type, const vector<vector<WordID> >& refs, const string& src) { switch (type) { - case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); + case IBM_BLEU: return new IBM_BLEUScorer(refs, 4); + case IBM_BLEU_3 : return new IBM_BLEUScorer(refs,3); case NIST_BLEU: return new NIST_BLEUScorer(refs, 4); case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4); case AER: return new AERScorer(refs, src); @@ -299,6 +330,7 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type, Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) { switch (type) { case IBM_BLEU: + case IBM_BLEU_3: case NIST_BLEU: case Koehn_BLEU: return BLEUScorerBase::ScoreFromString(in); @@ -423,6 +455,36 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const { return exp(log_bleu); } + +//comptue scaled score for oracle retrieval +float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const { + // cerr << "Then here " << endl; + float log_bleu = 0; + if (precs) precs->clear(); + int count = 0; + for (int i = 0; i < hyp_ngram_counts.size(); ++i) { + // cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl; + if (hyp_ngram_counts[i] > 0) { + float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); + if (precs) precs->push_back(exp(lprec)); + log_bleu += lprec; + ++count; + } + } + log_bleu /= static_cast<float>(count); + float lbp = 0.0; + if (hyp_len < ref_len) + lbp = (hyp_len - ref_len) / hyp_len; + log_bleu += lbp; + if (bp) *bp = exp(lbp); + return exp(log_bleu); +} + +float BLEUScore::ComputePartialScore() const { + // cerr << "In here first " << endl; + return ComputePartialScore(NULL, NULL); +} + float BLEUScore::ComputeScore() const { return ComputeScore(NULL, NULL); } @@ -444,10 +506,37 @@ void BLEUScore::PlusEquals(const Score& delta) { hyp_len += d.hyp_len; } +void BLEUScore::PlusEquals(const Score& delta, const float scale) { + const BLEUScore& d = static_cast<const BLEUScore&>(delta); + correct_ngram_hit_counts = (correct_ngram_hit_counts + d.correct_ngram_hit_counts) * scale; + hyp_ngram_counts = ( hyp_ngram_counts + d.hyp_ngram_counts) * scale; + ref_len = (ref_len + d.ref_len) * scale; + hyp_len = ( hyp_len + d.hyp_len) * scale; + +} + +void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){ + const BLEUScore& d = static_cast<const BLEUScore&>(delta); + correct_ngram_hit_counts += d.correct_ngram_hit_counts; + hyp_ngram_counts += d.hyp_ngram_counts; + //scale the reference length according to the size of the input sentence covered by this rule + + ref_len *= (float)oracle_f_cover / src_len; + ref_len += d.ref_len; + + hyp_len = oracle_e_cover; + hyp_len += d.hyp_len; +} + + Score* BLEUScore::GetZero() const { return new BLEUScore(hyp_ngram_counts.size()); } +Score* BLEUScore::GetOne() const { + return new BLEUScore(hyp_ngram_counts.size(),1); +} + void BLEUScore::Encode(string* out) const { ostringstream os; const int n = correct_ngram_hit_counts.size(); @@ -470,12 +559,24 @@ Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const { BLEUScore* bs = new BLEUScore(n_); for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) i->second.second = 0; - ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts); + ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true); bs->ref_len = ComputeRefLength(hyp); bs->hyp_len = hyp.size(); return bs; } +Score* BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const { + BLEUScore* bs = new BLEUScore(n_); + for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) + i->second.second = 0; + bool clip = false; + ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip); + bs->ref_len = ComputeRefLength(hyp); + bs->hyp_len = hyp.size(); + return bs; +} + + DocScorer::~DocScorer() { for (int i=0; i < scorers_.size(); ++i) delete scorers_[i]; diff --git a/vest/scorer.h b/vest/scorer.h index 83d4db4c..7ce688c4 100644 --- a/vest/scorer.h +++ b/vest/scorer.h @@ -10,17 +10,21 @@ class ViterbiEnvelope; class ErrorSurface; class Hypergraph; // needed for alignment -enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER }; +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 }; ScoreType ScoreTypeFromString(const std::string& st); class Score { public: virtual ~Score(); virtual float ComputeScore() const = 0; + virtual float ComputePartialScore() const =0; virtual void ScoreDetails(std::string* details) const = 0; + virtual void PlusEquals(const Score& rhs, const float scale) = 0; virtual void PlusEquals(const Score& rhs) = 0; + virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0; virtual void Subtract(const Score& rhs, Score* res) const = 0; virtual Score* GetZero() const = 0; + virtual Score* GetOne() const = 0; virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta // to another score results in no score change // under any circumstances @@ -32,6 +36,7 @@ class SentenceScorer { virtual ~SentenceScorer(); void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0; + virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0; virtual const std::string* GetSource() const; static Score* CreateScoreFromString(const ScoreType type, const std::string& in); static SentenceScorer* CreateSentenceScorer(const ScoreType type, diff --git a/vest/ter.cc b/vest/ter.cc index ef66f3b7..6e16e1cf 100644 --- a/vest/ter.cc +++ b/vest/ter.cc @@ -424,17 +424,26 @@ class TERScore : public Score { static const unsigned kDUMMY_LAST_ENTRY = 5; TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} + float ComputePartialScore() const { return 0.0;} float ComputeScore() const { float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); return edits / static_cast<float>(stats[kREF_WORDCOUNT]); } void ScoreDetails(string* details) const; + void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + void PlusEquals(const Score& delta, const float scale) { + stats += static_cast<const TERScore&>(delta).stats; + } void PlusEquals(const Score& delta) { stats += static_cast<const TERScore&>(delta).stats; } + Score* GetZero() const { return new TERScore; } + Score* GetOne() const { + return new TERScore; + } void Subtract(const Score& rhs, Score* res) const { static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats; } @@ -489,6 +498,11 @@ TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) { impl_[i] = new TERScorerImpl(refs[i]); } +Score* TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const { + Score* a = NULL; + return a; +} + Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const { float best_score = numeric_limits<float>::max(); TERScore* res = new TERScore; @@ -10,6 +10,7 @@ class TERScorer : public SentenceScorer { TERScorer(const std::vector<std::vector<WordID> >& references); ~TERScorer(); Score* ScoreCandidate(const std::vector<WordID>& hyp) const; + Score* ScoreCCandidate(const std::vector<WordID>& hyp) const; static Score* ScoreFromString(const std::string& data); private: std::vector<TERScorerImpl*> impl_; |