From 2e605eb2745e56619b16fdbcb8095e0a6543ab27 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 3 Aug 2011 01:29:52 +0200 Subject: refactoring, cleaning up --- dtrain/dcommon.cc | 330 ------------------------------------------------------ 1 file changed, 330 deletions(-) delete mode 100644 dtrain/dcommon.cc (limited to 'dtrain/dcommon.cc') diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc deleted file mode 100644 index 6657bed6..00000000 --- a/dtrain/dcommon.cc +++ /dev/null @@ -1,330 +0,0 @@ -#include "dcommon.h" - - - - -/****************************************************************************** - * NGRAMS - * - * - * make_ngrams - * - */ -typedef map, size_t> Ngrams; -Ngrams -make_ngrams( vector& s, size_t N ) -{ - Ngrams ngrams; - vector ng; - for ( size_t i = 0; i < s.size(); i++ ) { - ng.clear(); - for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { - ng.push_back( s[j] ); - ngrams[ng]++; - } - } - return ngrams; -} - - -/* - * ngram_matches - * - */ -NgramCounts -make_ngram_counts( vector hyp, vector ref, size_t N ) -{ - Ngrams hyp_ngrams = make_ngrams( hyp, N ); - Ngrams ref_ngrams = make_ngrams( ref, N ); - NgramCounts counts( N ); - Ngrams::iterator it; - Ngrams::iterator ti; - for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { - ti = ref_ngrams.find( it->first ); - if ( ti != ref_ngrams.end() ) { - counts.add( it->second, ti->second, it->first.size() - 1 ); - } else { - counts.add( it->second, 0, it->first.size() - 1 ); - } - } - return counts; -} - - - - -/****************************************************************************** - * SCORES - * - * - * brevity_penaly - * - */ -double -brevity_penaly( const size_t hyp_len, const size_t ref_len ) -{ - if ( hyp_len > ref_len ) return 1; - return exp( 1 - (double)ref_len/(double)hyp_len ); -} - - -/* - * bleu - * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) - * page TODO - * 0 if for N one of the counts = 0 - */ -double -bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector weights ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - if ( ref_len < N ) N = ref_len; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; - sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); - } - return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * stupid_bleu - * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) - * page TODO - * 0 iff no 1gram match - */ -double -stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector weights ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - if ( ref_len < N ) N = ref_len; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - float add = 0; - for ( size_t i = 0; i < N; i++ ) { - if ( i == 1 ) add = 1; - sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); - } - return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * smooth_bleu - * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) - * page TODO - * max. 0.9375 - */ -double -smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector weights ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - float j = 1; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); - j++; - } - return brevity_penaly( hyp_len, ref_len ) * sum; -} - - -/* - * approx_bleu - * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) - * page TODO - * - */ -double -approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector weights ) -{ - return bleu( counts, hyp_len, ref_len, N, weights ); -} - - - - -/****************************************************************************** - * UTILS - * - * - * register_and_convert - * - */ -void -register_and_convert(const vector& strs, vector& ids) -{ - vector::const_iterator it; - for ( it = strs.begin(); it < strs.end(); it++ ) { - ids.push_back( TD::Convert( *it ) ); - } -} - - -/* - * approx_equal - * - */ -double -approx_equal( double x, double y ) -{ - const double EPSILON = 1E-5; - if ( x == 0 ) return fabs( y ) <= EPSILON; - if ( y == 0 ) return fabs( x ) <= EPSILON; - return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * print_FD - * - */ -void -print_FD() -{ - for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - - - - -/****************************************************************************** - * TESTS - * - * - * test_ngrams - * - */ -void -test_ngrams() -{ - cout << "Testing ngrams..." << endl << endl; - size_t N = 5; - cout << "N = " << N << endl; - vector a; // hyp - vector b; // ref - cout << "a "; - for (size_t i = 1; i <= 8; i++) { - cout << i << " "; - a.push_back(i); - } - cout << endl << "b "; - for (size_t i = 1; i <= 4; i++) { - cout << i << " "; - b.push_back(i); - } - cout << endl << endl; - NgramCounts c = make_ngram_counts( a, b, N ); - assert( c.clipped[N-1] == 0 ); - assert( c.sum[N-1] == 4 ); - c.print(); - c += c; - cout << endl; - c.print(); - cout << endl; -} - - -/* - * test_metrics - * - */ -void -test_metrics() -{ - cout << "Testing metrics..." << endl << endl; - using namespace boost::assign; - vector a, b; - vector expect_vanilla, expect_smooth, expect_stupid; - a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp - b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref - expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; - expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; - expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; - vector aa, bb; - vector aai, bbi; - double vanilla, smooth, stupid; - size_t N = 4; - cout << "N = " << N << endl << endl; - for ( size_t i = 0; i < a.size(); i++ ) { - cout << " hyp: " << a[i] << endl; - cout << " ref: " << b[i] << endl; - aa.clear(); bb.clear(); aai.clear(); bbi.clear(); - boost::split( aa, a[i], boost::is_any_of(" ") ); - boost::split( bb, b[i], boost::is_any_of(" ") ); - register_and_convert( aa, aai ); - register_and_convert( bb, bbi ); - NgramCounts counts = make_ngram_counts( aai, bbi, N ); - vanilla = bleu( counts, aa.size(), bb.size(), N); - smooth = smooth_bleu( counts, aa.size(), bb.size(), N); - stupid = stupid_bleu( counts, aa.size(), bb.size(), N); - assert( approx_equal(vanilla, expect_vanilla[i]) ); - assert( approx_equal(smooth, expect_smooth[i]) ); - assert( approx_equal(stupid, expect_stupid[i]) ); - cout << setw(14) << "bleu = " << vanilla << endl; - cout << setw(14) << "smooth bleu = " << smooth << endl; - cout << setw(14) << "stupid bleu = " << stupid << endl << endl; - } - cout << endl; -} - - -/* - * test_SetWeights - * - */ -void -test_SetWeights() -{ - cout << "Testing Weights::SetWeight..." << endl << endl; - Weights weights; - SparseVector lambdas; - weights.InitSparseVector( &lambdas ); - weights.SetWeight( &lambdas, "test", 0 ); - weights.SetWeight( &lambdas, "test1", 1 ); - WordID fid = FD::Convert( "test2" ); - weights.SetWeight( &lambdas, fid, 2 ); - string fn = "weights-test"; - cout << "FD::NumFeats() " << FD::NumFeats() << endl; - assert( FD::NumFeats() == 4 ); - weights.WriteToFile( fn, true ); - cout << endl; -} - - -/* - * run_tests - * - */ -void -run_tests() -{ - cout << endl; - test_ngrams(); - cout << endl; - test_metrics(); - cout << endl; - test_SetWeights(); - exit(0); -} - -- cgit v1.2.3