From 79d19204f64deda0762ebbedaa55d686bb0f6d7c Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 16 Oct 2015 10:24:09 +0200
Subject: dtrain score.h: refactoring, 'summation' bleu
---
training/dtrain/score.h | 308 +++++++++++++++++++++++++++++-------------------
1 file changed, 188 insertions(+), 120 deletions(-)
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
index ca3da39b..a9c60b64 100644
--- a/training/dtrain/score.h
+++ b/training/dtrain/score.h
@@ -9,66 +9,77 @@ namespace dtrain
struct NgramCounts
{
size_t N_;
- map clipped_;
- map sum_;
+ map clipped;
+ map sum;
NgramCounts() {}
- NgramCounts(const size_t N) : N_(N) { Zero(); }
+ NgramCounts(const size_t N) : N_(N) { zero(); }
inline void
operator+=(const NgramCounts& rhs)
{
- if (rhs.N_ > N_) Resize(rhs.N_);
+ if (rhs.N_ > N_) resize(rhs.N_);
for (size_t i = 0; i < N_; i++) {
- this->clipped_[i] += rhs.clipped_.find(i)->second;
- this->sum_[i] += rhs.sum_.find(i)->second;
+ this->clipped[i] += rhs.clipped.find(i)->second;
+ this->sum[i] += rhs.sum.find(i)->second;
}
}
inline void
operator*=(const weight_t rhs)
{
- for (unsigned i = 0; i < N_; i++) {
- this->clipped_[i] *= rhs;
- this->sum_[i] *= rhs;
+ for (size_t i=0; i<N_; i++) {
+ this->clipped[i] *= rhs;
+ this->sum[i] *= rhs;
}
}
inline void
- Add(const size_t count, const size_t ref_count, const size_t i)
+ add(const size_t count,
+ const size_t count_ref,
+ const size_t i)
{
assert(i < N_);
- if (count > ref_count) {
- clipped_[i] += ref_count;
+ if (count > count_ref) {
+ clipped[i] += count_ref;
} else {
- clipped_[i] += count;
+ clipped[i] += count;
}
- sum_[i] += count;
+ sum[i] += count;
}
inline void
- Zero()
+ zero()
{
- for (size_t i = 0; i < N_; i++) {
- clipped_[i] = 0.;
- sum_[i] = 0.;
+ for (size_t i=0; i N_) {
for (size_t i = N_; i < N; i++) {
- clipped_[i] = 0.;
- sum_[i] = 0.;
+ clipped[i] = 0.;
+ sum[i] = 0.;
}
} else { // N < N_
for (size_t i = N_-1; i > N-1; i--) {
- clipped_.erase(i);
- sum_.erase(i);
+ clipped.erase(i);
+ sum.erase(i);
}
}
N_ = N;
@@ -78,37 +89,38 @@ struct NgramCounts
typedef map, size_t> Ngrams;
inline Ngrams
-MakeNgrams(const vector& s, const size_t N)
+ngrams(const vector& vw,
+ const size_t N)
{
- Ngrams ngrams;
+ Ngrams r;
vector ng;
- for (size_t i = 0; i < s.size(); i++) {
+ for (size_t i=0; i& hyp,
- const vector& ref,
- const size_t N)
+ngram_counts(const vector& hyp,
+ const vector& ngrams_ref,
+ const size_t N)
{
- Ngrams hyp_ngrams = MakeNgrams(hyp, N);
+ Ngrams ngrams_hyp = ngrams(hyp, N);
NgramCounts counts(N);
Ngrams::iterator it, ti;
- for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
+ for (it = ngrams_hyp.begin(); it != ngrams_hyp.end(); it++) {
size_t max_ref_count = 0;
- for (auto r: ref) {
+ for (auto r: ngrams_ref) {
ti = r.find(it->first);
if (ti != r.end())
max_ref_count = max(max_ref_count, ti->second);
}
- counts.Add(it->second, min(it->second, max_ref_count), it->first.size()-1);
+ counts.add(it->second, min(it->second, max_ref_count), it->first.size()-1);
}
return counts;
@@ -128,9 +140,9 @@ class Scorer
}
inline bool
- Init(const vector& hyp,
- const vector& ref_ngs,
- const vector& ref_ls,
+ init(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths,
size_t& hl,
size_t& rl,
size_t& M,
@@ -138,10 +150,12 @@ class Scorer
NgramCounts& counts)
{
hl = hyp.size();
- if (hl == 0) return false;
- rl = BestMatchLength(hl, ref_ls);
- if (rl == 0) return false;
- counts = MakeNgramCounts(hyp, ref_ngs, N_);
+ if (hl == 0)
+ return false;
+ rl = best_match_length(hl, reference_lengths);
+ if (rl == 0)
+ return false;
+ counts = ngram_counts(hyp, reference_ngrams, N_);
if (rl < N_) {
M = rl;
for (size_t i = 0; i < M; i++) v.push_back(1/((weight_t)M));
@@ -154,7 +168,8 @@ class Scorer
}
inline weight_t
- BrevityPenalty(const size_t hl, const size_t rl)
+ brevity_penalty(const size_t hl,
+ const size_t rl)
{
if (hl > rl)
return 1;
@@ -163,16 +178,16 @@ class Scorer
}
inline size_t
- BestMatchLength(const size_t hl,
- const vector& ref_ls)
+ best_match_length(const size_t hl,
+ const vector& reference_lengths)
{
size_t m;
- if (ref_ls.size() == 1) {
- m = ref_ls.front();
+ if (reference_lengths.size() == 1) {
+ m = reference_lengths.front();
} else {
size_t i = 0, best_idx = 0;
size_t best = numeric_limits::max();
- for (auto l: ref_ls) {
+ for (auto l: reference_lengths) {
size_t d = abs(hl-l);
if (d < best) {
best_idx = i;
@@ -180,61 +195,63 @@ class Scorer
}
i += 1;
}
- m = ref_ls[best_idx];
+ m = reference_lengths[best_idx];
}
return m;
}
virtual weight_t
- Score(const vector&,
+ score(const vector&,
const vector&,
const vector&) = 0;
void
- UpdateContext(const vector& /*hyp*/,
- const vector& /*ref_ngs*/,
- const vector& /*ref_ls*/,
- weight_t /*decay*/) {}
+ update_context(const vector& /*hyp*/,
+ const vector& /*reference_ngrams*/,
+ const vector& /*reference_lengths*/,
+ weight_t /*decay*/) {}
};
/*
- * 'fixed' per-sentence BLEU
- * simply add 1 to reference length for calculation of BP
+ * ['fixed'] per-sentence BLEU: add 'fix' (1) to the reference length when
+ * calculating the brevity penalty (BP), to avoid short translations
+ * NOTE(review): score() passes rl+1, not rl+fix, to brevity_penalty() - confirm
*
* as in "Optimizing for Sentence-Level BLEU+1
* Yields Short Translations"
* (Nakov et al. '12)
*
*/
-class PerSentenceBleuScorer : public Scorer
+class NakovBleuScorer : public Scorer
{
+ weight_t fix;
+
public:
- PerSentenceBleuScorer(size_t n) : Scorer(n) {}
+ NakovBleuScorer(size_t n, weight_t fix) : Scorer(n), fix(fix) {}
weight_t
- Score(const vector& hyp,
- const vector& ref_ngs,
- const vector& ref_ls)
+ score(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths)
{
size_t hl, rl, M;
vector v;
NgramCounts counts;
- if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts))
+ if (!init(hyp, reference_ngrams, reference_lengths, hl, rl, M, v, counts))
return 0.;
weight_t sum=0, add=0;
- for (size_t i = 0; i < M; i++) {
- if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
+ for (size_t i=0; i 0) add = 1;
- sum += v[i] * log(((weight_t)counts.clipped_[i] + add)
- / ((counts.sum_[i] + add)));
+ sum += v[i] * log(((weight_t)counts.clipped[i] + add)
+ / ((counts.sum[i] + add)));
}
- return BrevityPenalty(hl, rl+1) * exp(sum);
+ return brevity_penalty(hl, rl+1) * exp(sum);
}
};
-
/*
* BLEU
* 0 if for one n \in {1..N} count is 0
@@ -244,29 +261,28 @@ class PerSentenceBleuScorer : public Scorer
* (Papineni et al. '02)
*
*/
-
-class BleuScorer : public Scorer
+class PapineniBleuScorer : public Scorer
{
public:
- BleuScorer(size_t n) : Scorer(n) {}
+ PapineniBleuScorer(size_t n) : Scorer(n) {}
weight_t
- Score(const vector& hyp,
- const vector& ref_ngs,
- const vector& ref_ls)
+ score(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths)
{
size_t hl, rl, M;
vector v;
NgramCounts counts;
- if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts))
+ if (!init(hyp, reference_ngrams, reference_lengths, hl, rl, M, v, counts))
return 0.;
weight_t sum = 0;
- for (size_t i = 0; i < M; i++) {
- if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.;
- sum += v[i] * log((weight_t)counts.clipped_[i]/counts.sum_[i]);
+ for (size_t i=0; i& hyp,
- const vector& ref_ngs,
- const vector& ref_ls)
+ score(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths)
{
size_t hl, rl, M;
vector v;
NgramCounts counts;
- if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts))
+ if (!init(hyp, reference_ngrams, reference_lengths, hl, rl, M, v, counts))
return 0.;
weight_t sum=0, add=0;
- for (size_t i = 0; i < M; i++) {
- if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
+ for (size_t i=0; i& hyp,
- const vector& ref_ngs,
- const vector& ref_ls)
+ score(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths)
{
- size_t hl=hyp.size(), rl=BestMatchLength(hl, ref_ls);
+ size_t hl=hyp.size(), rl=best_match_length(hl, reference_lengths);
if (hl == 0 || rl == 0) return 0.;
- NgramCounts counts = MakeNgramCounts(hyp, ref_ngs, N_);
+ NgramCounts counts = ngram_counts(hyp, reference_ngrams, N_);
size_t M = N_;
if (rl < N_) M = rl;
weight_t sum = 0.;
vector i_bleu;
- for (size_t i=0; i < M; i++)
+ for (size_t i=0; i& hyp,
- const vector& ref_ngs,
- const vector& ref_ls)
+ score(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths)
{
size_t hl, rl, M;
vector v;
NgramCounts counts;
- if (!Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts))
+ if (!init(hyp, reference_ngrams, reference_lengths, hl, rl, M, v, counts))
return 0.;
counts += context;
weight_t sum = 0;
for (size_t i = 0; i < M; i++) {
- if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.;
- sum += v[i] * log((weight_t)counts.clipped_[i]/counts.sum_[i]);
+ if (counts.sum[i]==0 || counts.clipped[i]==0) return 0.;
+ sum += v[i] * log((weight_t)counts.clipped[i] / counts.sum[i]);
}
- return BrevityPenalty(hyp_sz_sum+hl, ref_sz_sum+rl) * exp(sum);
+ return brevity_penalty(hyp_sz_sum+hl, ref_sz_sum+rl) * exp(sum);
}
void
- UpdateContext(const vector& hyp,
- const vector& ref_ngs,
- const vector& ref_ls,
- weight_t decay=0.9)
+ update_context(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths,
+ weight_t decay=0.9)
{
size_t hl, rl, M;
vector v;
NgramCounts counts;
- Init(hyp, ref_ngs, ref_ls, hl, rl, M, v, counts);
+ init(hyp, reference_ngrams, reference_lengths, hl, rl, M, v, counts);
context += counts;
context *= decay;
@@ -413,6 +431,56 @@ class ApproxBleuScorer : public Scorer
}
};
+/*
+ * 'sum' BLEU
+ *
+ * Simply sums up the n-gram precisions
+ */
+class SumBleuScorer : public Scorer
+{
+ public:
+ SumBleuScorer(size_t n) : Scorer(n) {}
+
+ weight_t
+ score(const vector& hyp,
+ const vector& reference_ngrams,
+ const vector& reference_lengths)
+ {
+ size_t hl, rl, M;
+ vector v;
+ NgramCounts counts;
+ if (!init(hyp, reference_ngrams, reference_lengths, hl, rl, M, v, counts))
+ return 0.;
+ weight_t sum = 0.;
+ size_t j = 1;
+ for (size_t i=0; i