Diffstat (limited to 'dtrain/dcommon.cc')
-rw-r--r-- | dtrain/dcommon.cc | 311 |
1 file changed, 311 insertions, 0 deletions
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
new file mode 100644
index 00000000..a6bdc92c
--- /dev/null
+++ b/dtrain/dcommon.cc
@@ -0,0 +1,311 @@
+#include "dcommon.h"
+
+
+
+/*
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+  Ngrams ngrams;
+  vector<WordID> ng;
+  for ( size_t i = 0; i < s.size(); i++ ) {
+    ng.clear();
+    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+      ng.push_back( s[j] );
+      ngrams[ng]++;
+    }
+  }
+  return ngrams;
+}
+
+
+
+
+
+/*
+ * make_ngram_counts
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+  Ngrams hyp_ngrams = make_ngrams( hyp, N );
+  Ngrams ref_ngrams = make_ngrams( ref, N );
+  NgramCounts counts( N );
+  Ngrams::iterator it;
+  Ngrams::iterator ti;
+  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+    ti = ref_ngrams.find( it->first );
+    if ( ti != ref_ngrams.end() ) {
+      counts.add( it->second, ti->second, it->first.size() - 1 );
+    } else {
+      counts.add( it->second, 0, it->first.size() - 1 );
+    }
+  }
+  return counts;
+}
+
+
+/*
+ * brevity_penaly
+ *
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+  if ( hyp_len > ref_len ) return 1;
+  return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
+ * returns 0 if any of the N clipped counts is 0
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+      size_t N, vector<float> weights )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation" (Lin & Och '04)
+ * page TODO
+ * 0 iff there is no unigram match (only the higher orders are smoothed)
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             size_t N, vector<float> weights )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if ( ref_len < N ) N = ref_len;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float add = 0;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( i == 1 ) add = 1;
+    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+  }
+  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights )
+{
+  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  float N_ = (float)N;
+  if ( weights.empty() )
+  {
+    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+  }
+  double sum = 0;
+  float j = 1;
+  for ( size_t i = 0; i < N; i++ ) {
+    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) continue;
+    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+    j++;
+  }
+  return brevity_penaly( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+             const size_t N, vector<float> weights )
+{
+  return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert( const vector<string>& strs, vector<WordID>& ids )
+{
+  vector<string>::const_iterator it;
+  for ( it = strs.begin(); it != strs.end(); it++ ) {
+    ids.push_back( TD::Convert( *it ) );
+  }
+}
+
+
+
+
+
+/*
+ * test_ngrams
+ *
+ */
+void
+test_ngrams()
+{
+  cout << "Testing ngrams..." << endl << endl;
+  size_t N = 5;
+  cout << "N = " << N << endl;
+  vector<int> a; // hyp
+  vector<int> b; // ref
+  cout << "a ";
+  for ( size_t i = 1; i <= 8; i++ ) {
+    cout << i << " ";
+    a.push_back( i );
+  }
+  cout << endl << "b ";
+  for ( size_t i = 1; i <= 4; i++ ) {
+    cout << i << " ";
+    b.push_back( i );
+  }
+  cout << endl << endl;
+  NgramCounts c = make_ngram_counts( a, b, N );
+  assert( c.clipped[N-1] == 0 );
+  assert( c.sum[N-1] == 4 );
+  c.print();
+  c += c;
+  cout << endl;
+  c.print();
+  cout << endl;
+}
+
+
+/*
+ * approx_equal
+ *
+ */
+double
+approx_equal( double x, double y )
+{
+  const double EPSILON = 1E-5;
+  if ( x == 0 ) return fabs( y ) <= EPSILON;
+  if ( y == 0 ) return fabs( x ) <= EPSILON;
+  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
+
+
+/*
+ * test_metrics
+ *
+ */
+void
+test_metrics()
+{
+  cout << "Testing metrics..." << endl << endl;
+  using namespace boost::assign;
+  vector<string> a, b;
+  vector<double> expect_vanilla, expect_smooth, expect_stupid;
+  a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
+  b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
+  expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
+  expect_smooth  += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
+  expect_stupid  += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
+  vector<string> aa, bb;
+  vector<WordID> aai, bbi;
+  double vanilla, smooth, stupid;
+  size_t N = 4;
+  cout << "N = " << N << endl << endl;
+  for ( size_t i = 0; i < a.size(); i++ ) {
+    cout << " hyp: " << a[i] << endl;
+    cout << " ref: " << b[i] << endl;
+    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+    boost::split( aa, a[i], boost::is_any_of(" ") );
+    boost::split( bb, b[i], boost::is_any_of(" ") );
+    register_and_convert( aa, aai );
+    register_and_convert( bb, bbi );
+    NgramCounts counts = make_ngram_counts( aai, bbi, N );
+    vanilla = bleu( counts, aa.size(), bb.size(), N );
+    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N );
+    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N );
+    assert( approx_equal(vanilla, expect_vanilla[i]) );
+    assert( approx_equal(smooth, expect_smooth[i]) );
+    assert( approx_equal(stupid, expect_stupid[i]) );
+    cout << setw(14) << "bleu = " << vanilla << endl;
+    cout << setw(14) << "smooth bleu = " << smooth << endl;
+    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+  }
+  cout << endl;
+}
+
+/*
+ * test_SetWeights
+ *
+ */
+void
+test_SetWeights()
+{
+  cout << "Testing Weights::SetWeight..." << endl << endl;
+  Weights weights;
+  SparseVector<double> lambdas;
+  weights.InitSparseVector( &lambdas );
+  weights.SetWeight( &lambdas, "test", 0 );
+  weights.SetWeight( &lambdas, "test1", 1 );
+  WordID fid = FD::Convert( "test2" );
+  weights.SetWeight( &lambdas, fid, 2 );
+  string fn = "weights-test";
+  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
+  assert( FD::NumFeats() == 4 );
+  weights.WriteToFile( fn, true );
+  cout << endl;
+}
+
+
+/*
+ * run_tests
+ *
+ */
+void
+run_tests()
+{
+  cout << endl;
+  test_ngrams();
+  cout << endl;
+  test_metrics();
+  cout << endl;
+  test_SetWeights();
+  exit(0);
+}
+
+
+void
+print_FD()
+{
+  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i) << endl;
+}
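A note on the "max. 0.9375" bound in smooth_bleu: with N = 4, a matched order j contributes at most exp( w_j * log 1 ) / 2^(N-j+1) = 2^-(N-j+1), and the brevity penalty is 1 for equal lengths, so the score is bounded by

    \sum_{j=1}^{4} 2^{-(4-j+1)} = 1/16 + 1/8 + 1/4 + 1/2 = 0.9375

which is exactly the value expect_smooth asserts for the identical pair "a a a a" / "a a a a" in test_metrics().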
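For reference, here is a minimal driver sketch (hypothetical, not part of this commit) showing how the pieces chain together: tokenize with boost::split, intern tokens via register_and_convert(), accumulate clipped n-gram counts with make_ngram_counts(), then score. It assumes dcommon.h declares these functions with an empty-vector default for the trailing weights parameter (as the four-argument calls in test_metrics() imply) and pulls in the std and boost names the file itself uses unqualified.

// hypothetical driver, assuming dcommon.h provides the declarations above
#include "dcommon.h"

int main()
{
  // hyp/ref pair taken from the last test case in test_metrics()
  string hyp_str = "a b a", ref_str = "a b b";
  vector<string> hyp_toks, ref_toks;
  boost::split( hyp_toks, hyp_str, boost::is_any_of(" ") );
  boost::split( ref_toks, ref_str, boost::is_any_of(" ") );
  vector<WordID> hyp, ref;
  register_and_convert( hyp_toks, hyp );
  register_and_convert( ref_toks, ref );
  size_t N = 4;
  NgramCounts counts = make_ngram_counts( hyp, ref, N );
  // vanilla BLEU is 0 here (no trigram match); the smoothed variants are not
  cout << "bleu        = " << bleu( counts, hyp.size(), ref.size(), N ) << endl;        // 0
  cout << "smooth bleu = " << smooth_bleu( counts, hyp.size(), ref.size(), N ) << endl; // ~.161587
  cout << "stupid bleu = " << stupid_bleu( counts, hyp.size(), ref.size(), N ) << endl; // ~.605707
  return 0;
}

The expected outputs are the values test_metrics() asserts for this pair, so the sketch doubles as a sanity check against the test data.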