summaryrefslogtreecommitdiff
path: root/dtrain/score.cc
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2011-08-03 01:29:52 +0200
committerPatrick Simianer <p@simianer.de>2011-09-23 19:13:57 +0200
commitd0c482c1d69a5c26f7d1bc27cf5b3a252716cb2e (patch)
tree5af960943f0b4f83b10cdf149146aa78deda5c43 /dtrain/score.cc
parent06829982fb0c03a5b0bbd95ee04de5a0019c5263 (diff)
refactoring, cleaning up
Diffstat (limited to 'dtrain/score.cc')
-rw-r--r--dtrain/score.cc166
1 files changed, 166 insertions, 0 deletions
diff --git a/dtrain/score.cc b/dtrain/score.cc
new file mode 100644
index 00000000..72e6db71
--- /dev/null
+++ b/dtrain/score.cc
@@ -0,0 +1,166 @@
+#include "score.h"
+
+
+namespace dtrain
+{
+
+
+/******************************************************************************
+ * NGRAMS
+ *
+ *
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+ Ngrams ngrams;
+ vector<WordID> ng;
+ for ( size_t i = 0; i < s.size(); i++ ) {
+ ng.clear();
+ for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+ ng.push_back( s[j] );
+ ngrams[ng]++;
+ }
+ }
+ return ngrams;
+}
+
+
+/*
+ * ngram_matches
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+ Ngrams hyp_ngrams = make_ngrams( hyp, N );
+ Ngrams ref_ngrams = make_ngrams( ref, N );
+ NgramCounts counts( N );
+ Ngrams::iterator it;
+ Ngrams::iterator ti;
+ for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+ ti = ref_ngrams.find( it->first );
+ if ( ti != ref_ngrams.end() ) {
+ counts.add( it->second, ti->second, it->first.size() - 1 );
+ } else {
+ counts.add( it->second, 0, it->first.size() - 1 );
+ }
+ }
+ return counts;
+}
+
+
+/******************************************************************************
+ * SCORES
+ *
+ *
+ * brevity_penaly
+ *
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+ if ( hyp_len > ref_len ) return 1;
+ return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
+ * 0 if for N one of the counts = 0
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if ( ref_len < N ) N = ref_len;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+ sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+ }
+ return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
+ * page TODO
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if ( ref_len < N ) N = ref_len;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ float add = 0;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( i == 1 ) add = 1;
+ sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+ }
+ return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ float j = 1;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
+ sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+ j++;
+ }
+ return brevity_penaly( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
+{
+ return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+} // namespace
+