diff options
| author | Patrick Simianer <p@simianer.de> | 2011-07-29 00:48:04 +0200 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2011-09-23 19:13:57 +0200 | 
| commit | b732e625ffcf59da8440db577183110488f5c4b7 (patch) | |
| tree | 863b59dea0ffd927621751da5c53298924c2ee2f /dtrain/dcommon.cc | |
| parent | 05c41075d0018ca6142f7ba593742fbadfecdf65 (diff) | |
first cut for sofia-ml, little change in utils/dict.h, coarse refactoring
Diffstat (limited to 'dtrain/dcommon.cc')
| -rw-r--r-- | dtrain/dcommon.cc | 311 | 
1 files changed, 311 insertions, 0 deletions
| diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc new file mode 100644 index 00000000..a6bdc92c --- /dev/null +++ b/dtrain/dcommon.cc @@ -0,0 +1,311 @@ +#include "dcommon.h" + + + +/* + * make_ngrams + * + */ +typedef map<vector<WordID>, size_t> Ngrams; +Ngrams +make_ngrams( vector<WordID>& s, size_t N ) +{ +  Ngrams ngrams; +  vector<WordID> ng; +  for ( size_t i = 0; i < s.size(); i++ ) { +    ng.clear(); +    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { +      ng.push_back( s[j] ); +      ngrams[ng]++; +    } +  } +  return ngrams; +} + + + + + +/* + * ngram_matches + * + */ +NgramCounts +make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ) +{ +  Ngrams hyp_ngrams = make_ngrams( hyp, N ); +  Ngrams ref_ngrams = make_ngrams( ref, N ); +  NgramCounts counts( N ); +  Ngrams::iterator it; +  Ngrams::iterator ti; +  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { +    ti = ref_ngrams.find( it->first ); +    if ( ti != ref_ngrams.end() ) { +      counts.add( it->second, ti->second, it->first.size() - 1 ); +    } else { +      counts.add( it->second, 0, it->first.size() - 1 ); +    } +  } +  return counts; +} + + +/* + * brevity_penaly + * + */ +double +brevity_penaly( const size_t hyp_len, const size_t ref_len ) +{ +  if ( hyp_len > ref_len ) return 1; +  return exp( 1 - (double)ref_len/(double)hyp_len ); +} + + +/* + * bleu + * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) + * page TODO + * 0 if for N one of the counts = 0 + */ +double +bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +      size_t N, vector<float> weights  ) +{ +  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  if ( ref_len < N ) N = ref_len; +  float N_ = (float)N; +  if ( weights.empty() ) +  { +    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +  } +  double sum = 0; +  for ( size_t i = 0; i < N; i++ ) { +    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; +    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); +  } +  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * stupid_bleu + * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) + * page TODO + * 0 iff no 1gram match + */ +double +stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +             size_t N, vector<float> weights  ) +{ +  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  if ( ref_len < N ) N = ref_len; +  float N_ = (float)N; +  if ( weights.empty() ) +  { +    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +  } +  double sum = 0; +  float add = 0; +  for ( size_t i = 0; i < N; i++ ) { +    if ( i == 1 ) add = 1; +    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); +  } +  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * smooth_bleu + * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) + * page TODO + * max. 0.9375 + */ +double +smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +             const size_t N, vector<float> weights  ) +{ +  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  float N_ = (float)N; +  if ( weights.empty() ) +  { +    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +  } +  double sum = 0; +  float j = 1; +  for ( size_t i = 0; i < N; i++ ) { +    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; +    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); +    j++; +  } +  return brevity_penaly( hyp_len, ref_len ) * sum; +} + + +/* + * approx_bleu + * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) + * page TODO + * + */ +double +approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +     const size_t N, vector<float> weights ) +{ +  return bleu( counts, hyp_len, ref_len, N, weights ); +} + + +/* + * register_and_convert + * + */ +void +register_and_convert(const vector<string>& strs, vector<WordID>& ids) +{ +  vector<string>::const_iterator it; +  for ( it = strs.begin(); it < strs.end(); it++ ) { +    ids.push_back( TD::Convert( *it ) ); +  } +} + + + + +/* + * + * + */ +void +test_ngrams() +{ +  cout << "Testing ngrams..." << endl << endl; +  size_t N = 5; +  cout << "N = " << N << endl; +  vector<int> a; // hyp +  vector<int> b; // ref +  cout << "a "; +  for (size_t i = 1; i <= 8; i++) { +    cout << i << " "; +    a.push_back(i); +  } +  cout << endl << "b "; +  for (size_t i = 1; i <= 4; i++) { +    cout << i << " "; +    b.push_back(i); +  } +  cout << endl << endl; +  NgramCounts c = make_ngram_counts( a, b, N ); +  assert( c.clipped[N-1] == 0 ); +  assert( c.sum[N-1] == 4 ); +  c.print(); +  c += c; +  cout << endl; +  c.print(); +  cout << endl; +} + + +/* + * + * + */ +double +approx_equal( double x, double y ) +{ +  const double EPSILON = 1E-5; +  if ( x == 0 ) return fabs( y ) <= EPSILON; +  if ( y == 0 ) return fabs( x ) <= EPSILON; +  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; +} + + +/* + * + * + */ +void +test_metrics() +{ +  cout << "Testing metrics..." << endl << endl; +  using namespace boost::assign; +  vector<string> a, b; +  vector<double> expect_vanilla, expect_smooth, expect_stupid; +  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp +  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref +  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0; +  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587; +  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707; +  vector<string> aa, bb; +  vector<WordID> aai, bbi; +  double vanilla, smooth, stupid; +  size_t N = 4; +  cout << "N = " << N << endl << endl; +  for ( size_t i = 0; i < a.size(); i++ ) { +    cout << " hyp: " << a[i] << endl; +    cout << " ref: " << b[i] << endl; +    aa.clear(); bb.clear(); aai.clear(); bbi.clear(); +    boost::split( aa, a[i], boost::is_any_of(" ") ); +    boost::split( bb, b[i], boost::is_any_of(" ") ); +    register_and_convert( aa, aai ); +    register_and_convert( bb, bbi ); +    NgramCounts counts = make_ngram_counts( aai, bbi, N ); +    vanilla =        bleu( counts, aa.size(), bb.size(), N); +    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N); +    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N); +    assert( approx_equal(vanilla, expect_vanilla[i]) ); +    assert( approx_equal(smooth, expect_smooth[i]) ); +    assert( approx_equal(stupid, expect_stupid[i]) ); +    cout << setw(14) << "bleu = "      << vanilla << endl; +    cout << setw(14) << "smooth bleu = " << smooth << endl; +    cout << setw(14) << "stupid bleu = " << stupid << endl << endl; +  } +  cout << endl; +} + +/* + * + * + */ +void +test_SetWeights() +{ +  cout << "Testing Weights::SetWeight..." << endl << endl; +  Weights weights; +  SparseVector<double> lambdas; +  weights.InitSparseVector( &lambdas ); +  weights.SetWeight( &lambdas, "test", 0 ); +  weights.SetWeight( &lambdas, "test1", 1 ); +  WordID fid = FD::Convert( "test2" ); +  weights.SetWeight( &lambdas, fid, 2 ); +  string fn = "weights-test"; +  cout << "FD::NumFeats() " << FD::NumFeats() << endl; +  assert( FD::NumFeats() == 4 ); +  weights.WriteToFile( fn, true ); +  cout << endl; +} + + +/* + * + * + */ +void +run_tests() +{ +  cout << endl; +  test_ngrams(); +  cout << endl; +  test_metrics(); +  cout << endl; +  test_SetWeights(); +  exit(0); +} + + +void +print_FD() +{ +  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; +} + | 
