summary | refs | log | tree | commit | diff
path: root/vest
diff options
context:
space:
mode:
author vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-14 23:00:08 +0000
committer vladimir.eidelman <vladimir.eidelman@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-14 23:00:08 +0000
commit 2775fc13d1e8d3ad45c8ddf94226397403e0e373 (patch)
tree 487fe0f9e717e6d444a448142d7b91e75e6873a1 /vest
parent 8f97e6b03114761870f0c72f18f0928fac28d0f9 (diff)
Added oracle forest rescoring
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@254 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
-rw-r--r-- vest/aer_scorer.cc 20
-rw-r--r-- vest/aer_scorer.h 1
-rw-r--r-- vest/comb_scorer.cc 21
-rw-r--r-- vest/comb_scorer.h 1
-rw-r--r-- vest/scorer.cc 127
-rw-r--r-- vest/scorer.h 7
-rw-r--r-- vest/ter.cc 14
-rw-r--r-- vest/ter.h 1
8 files changed, 177 insertions, 15 deletions
diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc
index 9c8a783a..d3f28804 100644
--- a/vest/aer_scorer.cc
+++ b/vest/aer_scorer.cc
@@ -15,15 +15,27 @@ class AERScore : public Score {
AERScore() : num_matches(), num_predicted(), num_in_ref() {}
AERScore(int m, int p, int r) :
num_matches(m), num_predicted(p), num_in_ref(r) {}
- virtual void PlusEquals(const Score& delta) {
+ virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
+ virtual void PlusEquals(const Score& delta, const float scale) {
const AERScore& other = static_cast<const AERScore&>(delta);
num_matches += other.num_matches;
num_predicted += other.num_predicted;
num_in_ref += other.num_in_ref;
}
+ virtual void PlusEquals(const Score& delta) {
+ const AERScore& other = static_cast<const AERScore&>(delta);
+ num_matches += other.num_matches;
+ num_predicted += other.num_predicted;
+ num_in_ref += other.num_in_ref;
+ }
+
+
virtual Score* GetZero() const {
return new AERScore;
}
+ virtual Score* GetOne() const {
+ return new AERScore;
+ }
virtual void Subtract(const Score& rhs, Score* out) const {
AERScore* res = static_cast<AERScore*>(out);
const AERScore& other = static_cast<const AERScore&>(rhs);
@@ -37,6 +49,7 @@ class AERScore : public Score {
float Recall() const {
return static_cast<float>(num_matches) / num_in_ref;
}
+ float ComputePartialScore() const { return 0.0;}
virtual float ComputeScore() const {
const float prec = Precision();
const float rec = Recall();
@@ -82,6 +95,11 @@ static inline bool Safe(const Array2D<bool>& a, int i, int j) {
return false;
}
+Score* AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const {
+ Score* a = NULL;
+ return a;
+}
+
Score* AERScorer::ScoreCandidate(const vector<WordID>& shyp) const {
boost::shared_ptr<Array2D<bool> > hyp =
AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp));
diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h
index a0afea3b..d0df35d5 100644
--- a/vest/aer_scorer.h
+++ b/vest/aer_scorer.h
@@ -12,6 +12,7 @@ class AERScorer : public SentenceScorer {
// is necessary.
AERScorer(const std::vector<std::vector<WordID> >& refs, const std::string& src = "");
Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+ Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;
static Score* ScoreFromString(const std::string& in);
const std::string* GetSource() const;
private:
diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc
index 7b2187f4..3dd077a6 100644
--- a/vest/comb_scorer.cc
+++ b/vest/comb_scorer.cc
@@ -8,6 +8,7 @@ class BLEUTERCombinationScore : public Score {
friend class BLEUTERCombinationScorer;
public:
~BLEUTERCombinationScore();
+ float ComputePartialScore() const { return 0.0;}
float ComputeScore() const {
return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f;
}
@@ -17,10 +18,25 @@ class BLEUTERCombinationScore : public Score {
ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f);
*details = buf;
}
+ void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
+
+ void PlusEquals(const Score& delta, const float scale) {
+ bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu, scale);
+ ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter, scale);
+ }
void PlusEquals(const Score& delta) {
bleu->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).bleu);
ter->PlusEquals(*static_cast<const BLEUTERCombinationScore&>(delta).ter);
}
+
+
+
+ Score* GetOne() const {
+ BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
+ res->bleu = bleu->GetOne();
+ res->ter = ter->GetOne();
+ return res;
+ }
Score* GetZero() const {
BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
res->bleu = bleu->GetZero();
@@ -65,6 +81,11 @@ BLEUTERCombinationScorer::~BLEUTERCombinationScorer() {
delete ter_;
}
+Score* BLEUTERCombinationScorer::ScoreCCandidate(const vector<WordID>& hyp) const {
+ Score* a = NULL;
+ return a;
+}
+
Score* BLEUTERCombinationScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
BLEUTERCombinationScore* res = new BLEUTERCombinationScore;
res->bleu = bleu_->ScoreCandidate(hyp);
diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h
index 70b1ec75..1a4f3324 100644
--- a/vest/comb_scorer.h
+++ b/vest/comb_scorer.h
@@ -8,6 +8,7 @@ class BLEUTERCombinationScorer : public SentenceScorer {
BLEUTERCombinationScorer(const std::vector<std::vector<WordID> >& refs);
~BLEUTERCombinationScorer();
Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+ Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;
static Score* ScoreFromString(const std::string& in);
private:
SentenceScorer* bleu_;
diff --git a/vest/scorer.cc b/vest/scorer.cc
index 6c604ab8..524b15a5 100644
--- a/vest/scorer.cc
+++ b/vest/scorer.cc
@@ -35,6 +35,8 @@ ScoreType ScoreTypeFromString(const string& st) {
return AER;
if (sl == "bleu" || sl == "ibm_bleu")
return IBM_BLEU;
+ if (sl == "ibm_bleu_3")
+ return IBM_BLEU_3;
if (sl == "nist_bleu")
return NIST_BLEU;
if (sl == "koehn_bleu")
@@ -53,6 +55,7 @@ class SERScore : public Score {
friend class SERScorer;
public:
SERScore() : correct(0), total(0) {}
+ float ComputePartialScore() const { return 0.0;}
float ComputeScore() const {
return static_cast<float>(correct) / static_cast<float>(total);
}
@@ -61,11 +64,18 @@ class SERScore : public Score {
os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')';
*details = os.str();
}
- void PlusEquals(const Score& delta) {
+ void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){}
+
+ void PlusEquals(const Score& delta, const float scale) {
correct += static_cast<const SERScore&>(delta).correct;
total += static_cast<const SERScore&>(delta).total;
}
+ void PlusEquals(const Score& delta) {
+ correct += static_cast<const SERScore&>(delta).correct;
+ total += static_cast<const SERScore&>(delta).total;
+ }
Score* GetZero() const { return new SERScore; }
+ Score* GetOne() const { return new SERScore; }
void Subtract(const Score& rhs, Score* res) const {
SERScore* r = static_cast<SERScore*>(res);
r->correct = correct - static_cast<const SERScore&>(rhs).correct;
@@ -84,6 +94,10 @@ class SERScore : public Score {
class SERScorer : public SentenceScorer {
public:
SERScorer(const vector<vector<WordID> >& references) : refs_(references) {}
+ Score* ScoreCCandidate(const vector<WordID>& hyp) const {
+ Score* a = NULL;
+ return a;
+ }
Score* ScoreCandidate(const vector<WordID>& hyp) const {
SERScore* res = new SERScore;
res->total = 1;
@@ -101,13 +115,20 @@ class SERScorer : public SentenceScorer {
class BLEUScore : public Score {
friend class BLEUScorerBase;
public:
- BLEUScore(int n) : correct_ngram_hit_counts(0,n), hyp_ngram_counts(0,n) {
+ BLEUScore(int n) : correct_ngram_hit_counts(float(0),float(n)), hyp_ngram_counts(float(0),float(n)) {
ref_len = 0;
hyp_len = 0; }
+ BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),float(n)), hyp_ngram_counts(float(k),float(n)) {
+ ref_len = k;
+ hyp_len = k; }
float ComputeScore() const;
+ float ComputePartialScore() const;
void ScoreDetails(string* details) const;
void PlusEquals(const Score& delta);
+ void PlusEquals(const Score& delta, const float scale);
+ void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len);
Score* GetZero() const;
+ Score* GetOne() const;
void Subtract(const Score& rhs, Score* res) const;
void Encode(string* out) const;
bool IsAdditiveIdentity() const {
@@ -119,10 +140,11 @@ class BLEUScore : public Score {
}
private:
float ComputeScore(vector<float>* precs, float* bp) const;
- valarray<int> correct_ngram_hit_counts;
- valarray<int> hyp_ngram_counts;
+ float ComputePartialScore(vector<float>* prec, float* bp) const;
+ valarray<float> correct_ngram_hit_counts;
+ valarray<float> hyp_ngram_counts;
float ref_len;
- int hyp_len;
+ float hyp_len;
};
class BLEUScorerBase : public SentenceScorer {
@@ -131,6 +153,7 @@ class BLEUScorerBase : public SentenceScorer {
int n
);
Score* ScoreCandidate(const vector<WordID>& hyp) const;
+ Score* ScoreCCandidate(const vector<WordID>& hyp) const;
static Score* ScoreFromString(const string& in);
protected:
@@ -171,8 +194,10 @@ class BLEUScorerBase : public SentenceScorer {
}
void ComputeNgramStats(const vector<WordID>& sent,
- valarray<int>* correct,
- valarray<int>* hyp) const {
+ valarray<float>* correct,
+ valarray<float>* hyp,
+ bool clip_counts)
+ const {
assert(correct->size() == n_);
assert(hyp->size() == n_);
vector<WordID> ngram(n_);
@@ -186,10 +211,15 @@ class BLEUScorerBase : public SentenceScorer {
for (int i=1; i<=k; ++i) {
ngram.push_back(sent[j + i - 1]);
pair<int,int>& p = ngrams_[ngram];
- if (p.second < p.first) {
- ++p.second;
- (*correct)[i-1]++;
- }
+ if(clip_counts){
+ if (p.second < p.first) {
+ ++p.second;
+ (*correct)[i-1]++;
+ }}
+ else {
+ ++p.second;
+ (*correct)[i-1]++;
+ }
// if the 1-gram isn't found, we don't need to try to match any 2-, 3-, ... grams:
if (!p.first) {
for (; i<=k; ++i)
@@ -284,7 +314,8 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,
const vector<vector<WordID> >& refs,
const string& src) {
switch (type) {
- case IBM_BLEU: return new IBM_BLEUScorer(refs, 4);
+ case IBM_BLEU: return new IBM_BLEUScorer(refs, 4);
+ case IBM_BLEU_3 : return new IBM_BLEUScorer(refs,3);
case NIST_BLEU: return new NIST_BLEUScorer(refs, 4);
case Koehn_BLEU: return new Koehn_BLEUScorer(refs, 4);
case AER: return new AERScorer(refs, src);
@@ -299,6 +330,7 @@ SentenceScorer* SentenceScorer::CreateSentenceScorer(const ScoreType type,
Score* SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) {
switch (type) {
case IBM_BLEU:
+ case IBM_BLEU_3:
case NIST_BLEU:
case Koehn_BLEU:
return BLEUScorerBase::ScoreFromString(in);
@@ -423,6 +455,36 @@ float BLEUScore::ComputeScore(vector<float>* precs, float* bp) const {
return exp(log_bleu);
}
+
+//compute scaled score for oracle retrieval
+float BLEUScore::ComputePartialScore(vector<float>* precs, float* bp) const {
+ // cerr << "Then here " << endl;
+ float log_bleu = 0;
+ if (precs) precs->clear();
+ int count = 0;
+ for (int i = 0; i < hyp_ngram_counts.size(); ++i) {
+ // cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl;
+ if (hyp_ngram_counts[i] > 0) {
+ float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]);
+ if (precs) precs->push_back(exp(lprec));
+ log_bleu += lprec;
+ ++count;
+ }
+ }
+ log_bleu /= static_cast<float>(count);
+ float lbp = 0.0;
+ if (hyp_len < ref_len)
+ lbp = (hyp_len - ref_len) / hyp_len;
+ log_bleu += lbp;
+ if (bp) *bp = exp(lbp);
+ return exp(log_bleu);
+}
+
+float BLEUScore::ComputePartialScore() const {
+ // cerr << "In here first " << endl;
+ return ComputePartialScore(NULL, NULL);
+}
+
float BLEUScore::ComputeScore() const {
return ComputeScore(NULL, NULL);
}
@@ -444,10 +506,37 @@ void BLEUScore::PlusEquals(const Score& delta) {
hyp_len += d.hyp_len;
}
+void BLEUScore::PlusEquals(const Score& delta, const float scale) {
+ const BLEUScore& d = static_cast<const BLEUScore&>(delta);
+ correct_ngram_hit_counts = (correct_ngram_hit_counts + d.correct_ngram_hit_counts) * scale;
+ hyp_ngram_counts = ( hyp_ngram_counts + d.hyp_ngram_counts) * scale;
+ ref_len = (ref_len + d.ref_len) * scale;
+ hyp_len = ( hyp_len + d.hyp_len) * scale;
+
+}
+
+void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){
+ const BLEUScore& d = static_cast<const BLEUScore&>(delta);
+ correct_ngram_hit_counts += d.correct_ngram_hit_counts;
+ hyp_ngram_counts += d.hyp_ngram_counts;
+ //scale the reference length according to the size of the input sentence covered by this rule
+
+ ref_len *= (float)oracle_f_cover / src_len;
+ ref_len += d.ref_len;
+
+ hyp_len = oracle_e_cover;
+ hyp_len += d.hyp_len;
+}
+
+
Score* BLEUScore::GetZero() const {
return new BLEUScore(hyp_ngram_counts.size());
}
+Score* BLEUScore::GetOne() const {
+ return new BLEUScore(hyp_ngram_counts.size(),1);
+}
+
void BLEUScore::Encode(string* out) const {
ostringstream os;
const int n = correct_ngram_hit_counts.size();
@@ -470,12 +559,24 @@ Score* BLEUScorerBase::ScoreCandidate(const vector<WordID>& hyp) const {
BLEUScore* bs = new BLEUScore(n_);
for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)
i->second.second = 0;
- ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts);
+ ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true);
bs->ref_len = ComputeRefLength(hyp);
bs->hyp_len = hyp.size();
return bs;
}
+Score* BLEUScorerBase::ScoreCCandidate(const vector<WordID>& hyp) const {
+ BLEUScore* bs = new BLEUScore(n_);
+ for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i)
+ i->second.second = 0;
+ bool clip = false;
+ ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip);
+ bs->ref_len = ComputeRefLength(hyp);
+ bs->hyp_len = hyp.size();
+ return bs;
+}
+
+
DocScorer::~DocScorer() {
for (int i=0; i < scorers_.size(); ++i)
delete scorers_[i];
diff --git a/vest/scorer.h b/vest/scorer.h
index 83d4db4c..7ce688c4 100644
--- a/vest/scorer.h
+++ b/vest/scorer.h
@@ -10,17 +10,21 @@ class ViterbiEnvelope;
class ErrorSurface;
class Hypergraph; // needed for alignment
-enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER };
+enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 };
ScoreType ScoreTypeFromString(const std::string& st);
class Score {
public:
virtual ~Score();
virtual float ComputeScore() const = 0;
+ virtual float ComputePartialScore() const =0;
virtual void ScoreDetails(std::string* details) const = 0;
+ virtual void PlusEquals(const Score& rhs, const float scale) = 0;
virtual void PlusEquals(const Score& rhs) = 0;
+ virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0;
virtual void Subtract(const Score& rhs, Score* res) const = 0;
virtual Score* GetZero() const = 0;
+ virtual Score* GetOne() const = 0;
virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta
// to another score results in no score change
// under any circumstances
@@ -32,6 +36,7 @@ class SentenceScorer {
virtual ~SentenceScorer();
void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;
virtual Score* ScoreCandidate(const std::vector<WordID>& hyp) const = 0;
+ virtual Score* ScoreCCandidate(const std::vector<WordID>& hyp) const =0;
virtual const std::string* GetSource() const;
static Score* CreateScoreFromString(const ScoreType type, const std::string& in);
static SentenceScorer* CreateSentenceScorer(const ScoreType type,
diff --git a/vest/ter.cc b/vest/ter.cc
index ef66f3b7..6e16e1cf 100644
--- a/vest/ter.cc
+++ b/vest/ter.cc
@@ -424,17 +424,26 @@ class TERScore : public Score {
static const unsigned kDUMMY_LAST_ENTRY = 5;
TERScore() : stats(0,kDUMMY_LAST_ENTRY) {}
+ float ComputePartialScore() const { return 0.0;}
float ComputeScore() const {
float edits = static_cast<float>(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]);
return edits / static_cast<float>(stats[kREF_WORDCOUNT]);
}
void ScoreDetails(string* details) const;
+ void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){}
+ void PlusEquals(const Score& delta, const float scale) {
+ stats += static_cast<const TERScore&>(delta).stats;
+ }
void PlusEquals(const Score& delta) {
stats += static_cast<const TERScore&>(delta).stats;
}
+
Score* GetZero() const {
return new TERScore;
}
+ Score* GetOne() const {
+ return new TERScore;
+ }
void Subtract(const Score& rhs, Score* res) const {
static_cast<TERScore*>(res)->stats = stats - static_cast<const TERScore&>(rhs).stats;
}
@@ -489,6 +498,11 @@ TERScorer::TERScorer(const vector<vector<WordID> >& refs) : impl_(refs.size()) {
impl_[i] = new TERScorerImpl(refs[i]);
}
+Score* TERScorer::ScoreCCandidate(const vector<WordID>& hyp) const {
+ Score* a = NULL;
+ return a;
+}
+
Score* TERScorer::ScoreCandidate(const std::vector<WordID>& hyp) const {
float best_score = numeric_limits<float>::max();
TERScore* res = new TERScore;
diff --git a/vest/ter.h b/vest/ter.h
index fe4ba36c..21007874 100644
--- a/vest/ter.h
+++ b/vest/ter.h
@@ -10,6 +10,7 @@ class TERScorer : public SentenceScorer {
TERScorer(const std::vector<std::vector<WordID> >& references);
~TERScorer();
Score* ScoreCandidate(const std::vector<WordID>& hyp) const;
+ Score* ScoreCCandidate(const std::vector<WordID>& hyp) const;
static Score* ScoreFromString(const std::string& data);
private:
std::vector<TERScorerImpl*> impl_;