diff options
| -rw-r--r-- | dtrain/Makefile.am | 2 | ||||
| -rw-r--r-- | dtrain/dtrain.cc | 10 | ||||
| -rw-r--r-- | dtrain/hgsampler.cc (renamed from dtrain/sample_hg.cc) | 2 | ||||
| -rw-r--r-- | dtrain/hgsampler.h (renamed from dtrain/sample_hg.h) | 17 | ||||
| -rw-r--r-- | dtrain/ksampler.h | 2 | ||||
| -rw-r--r-- | dtrain/pairsampling.h | 35 | ||||
| -rw-r--r-- | dtrain/score.cc | 165 | ||||
| -rw-r--r-- | dtrain/score.h | 53 | 
8 files changed, 133 insertions, 153 deletions
| diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index 9b5df8bf..12084a70 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,7 +1,7 @@  # TODO I'm sure I can leave something out.  bin_PROGRAMS = dtrain -dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc +dtrain_SOURCES = dtrain.cc score.cc hgsampler.cc  dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams  AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 01821b30..01119997 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -347,10 +347,9 @@ main( int argc, char** argv )              cand_len = kb->sents[i].size();          }          NgramCounts counts_tmp = global_counts + counts; -        // TODO as param -        score = 0.9 * scorer( counts_tmp, -                              global_ref_len, -                              global_hyp_len + cand_len, N, bleu_weights ); +        score = .9*scorer( counts_tmp, +                        global_ref_len, +                        global_hyp_len + cand_len, N, bleu_weights );        } else {          // other scorers          cand_len = kb->sents[i].size(); @@ -381,7 +380,8 @@ main( int argc, char** argv )      if ( !noup ) {        TrainingInstances pairs; -      sample_all( kb, pairs ); +      sample_all_pairs(kb, pairs); +      //sample_rand_pairs( kb, pairs, &rng );        for ( TrainingInstances::iterator ti = pairs.begin();              ti != pairs.end(); ti++ ) { diff --git a/dtrain/sample_hg.cc b/dtrain/hgsampler.cc index 33872fb8..7a00a3d3 100644 --- a/dtrain/sample_hg.cc +++ b/dtrain/hgsampler.cc @@ -1,4 +1,4 @@ -#include "sample_hg.h" +#include "hgsampler.h"  #include <queue> diff --git a/dtrain/sample_hg.h b/dtrain/hgsampler.h index 932fd369..b840c07f 100644 --- a/dtrain/sample_hg.h +++ b/dtrain/hgsampler.h @@ -1,5 +1,6 @@ -#ifndef _SAMPLE_HG_H_ -#define _SAMPLE_HG_H_ +#ifndef _DTRAIN_HGSAMPLER_H_ +#define _DTRAIN_HGSAMPLER_H_ +  #include <vector>  #include "sparse_vector.h" @@ -9,16 +10,20 @@  class Hypergraph;  struct HypergraphSampler { +    struct Hypothesis {      std::vector<WordID> words;      SparseVector<double> fmap;      prob_t model_score;    }; -  static void sample_hypotheses(const Hypergraph& hg, -                                unsigned n, -                                MT19937* rng, -                                std::vector<Hypothesis>* hypos); +  static void +  sample_hypotheses(const Hypergraph& hg, +                    unsigned n, +                    MT19937* rng, +                    std::vector<Hypothesis>* hypos);  }; +  #endif + diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index a28b69c9..914e9723 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -2,7 +2,7 @@  #define _DTRAIN_KSAMPLER_H_  #include "kbest.h" -#include "sample_hg.h" +#include "hgsampler.h"  #include "sampler.h"  namespace dtrain diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 502901af..9774ba4a 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -1,9 +1,8 @@ -#ifndef _DTRAIN_SAMPLE_H_ -#define _DTRAIN_SAMPLE_H_ - +#ifndef _DTRAIN_PAIRSAMPLING_H_ +#define _DTRAIN_PAIRSAMPLING_H_  #include "kbestget.h" - +#include "sampler.h" // cdec MT19937  namespace dtrain  { @@ -11,19 +10,18 @@ namespace dtrain  struct TPair  { -  SparseVector<double> first, second; -  size_t first_rank, second_rank; -  double first_score, second_score; +  SparseVector<double> first,       second; +  size_t               first_rank,  second_rank; +  double               first_score, second_score;  };  typedef vector<TPair> TrainingInstances; -  void -sample_all( KBestList* kb, TrainingInstances &training ) +sample_all_pairs(KBestList* kb, TrainingInstances &training)  { -  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { -    for ( size_t j = i+1; j < kb->GetSize(); j++ ) { +  for (size_t i = 0; i < kb->GetSize()-1; i++) { +    for (size_t j = i+1; j < kb->GetSize(); j++) {        TPair p;        p.first = kb->feats[i];        p.second = kb->feats[j]; @@ -31,18 +29,18 @@ sample_all( KBestList* kb, TrainingInstances &training )        p.second_rank = j;        p.first_score = kb->scores[i];        p.second_score = kb->scores[j]; -      training.push_back( p ); +      training.push_back(p);      }    }  }  void -sample_rand( KBestList* kb, TrainingInstances &training ) +sample_rand_pairs(KBestList* kb, TrainingInstances &training, MT19937* prng)  { -  srand( time(NULL) ); -  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { -    for ( size_t j = i+1; j < kb->GetSize(); j++ ) { -      if ( rand() % 2 ) { +  srand(time(NULL)); +  for (size_t i = 0; i < kb->GetSize()-1; i++) { +    for (size_t j = i+1; j < kb->GetSize(); j++) { +      if (prng->next() < .5) {          TPair p;          p.first = kb->feats[i];          p.second = kb->feats[j]; @@ -50,10 +48,11 @@ sample_rand( KBestList* kb, TrainingInstances &training )          p.second_rank = j;          p.first_score = kb->scores[i];          p.second_score = kb->scores[j]; -        training.push_back( p ); +        training.push_back(p);        }      }    } +  cout << training.size() << " sampled" << endl;  } diff --git a/dtrain/score.cc b/dtrain/score.cc index 1e98c11d..d08e87f3 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -1,166 +1,149 @@  #include "score.h" -  namespace dtrain  { -/****************************************************************************** - * NGRAMS - * - * - * make_ngrams - * - */ -typedef map<vector<WordID>, size_t> Ngrams;  Ngrams -make_ngrams( vector<WordID>& s, size_t N ) +make_ngrams(vector<WordID>& s, size_t N)  {    Ngrams ngrams;    vector<WordID> ng; -  for ( size_t i = 0; i < s.size(); i++ ) { +  for (size_t i = 0; i < s.size(); i++) {      ng.clear(); -    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { -      ng.push_back( s[j] ); +    for (size_t j = i; j < min(i+N, s.size()); j++) { +      ng.push_back(s[j]);        ngrams[ng]++;      }    }    return ngrams;  } - -/* - * ngram_matches - * - */  NgramCounts -make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ) +make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)  { -  Ngrams hyp_ngrams = make_ngrams( hyp, N ); -  Ngrams ref_ngrams = make_ngrams( ref, N ); -  NgramCounts counts( N ); +  Ngrams hyp_ngrams = make_ngrams(hyp, N); +  Ngrams ref_ngrams = make_ngrams(ref, N); +  NgramCounts counts(N);    Ngrams::iterator it;    Ngrams::iterator ti; -  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { -    ti = ref_ngrams.find( it->first ); -    if ( ti != ref_ngrams.end() ) { -      counts.add( it->second, ti->second, it->first.size() - 1 ); +  for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) { +    ti = ref_ngrams.find(it->first); +    if (ti != ref_ngrams.end()) { +      counts.add(it->second, ti->second, it->first.size() - 1);      } else { -      counts.add( it->second, 0, it->first.size() - 1 ); +      counts.add(it->second, 0, it->first.size() - 1);      }    }    return counts;  } - -/****************************************************************************** - * SCORERS - * +/* + * bleu   * - * brevity_penaly + * as in "BLEU: a Method for Automatic Evaluation + *        of Machine Translation" + * (Papineni et al. '02)   * + * NOTE: 0 if one n in {1..N} has 0 count   */  double -brevity_penaly( const size_t hyp_len, const size_t ref_len ) +brevity_penaly(const size_t hyp_len, const size_t ref_len)  { -  if ( hyp_len > ref_len ) return 1; -  return exp( 1 - (double)ref_len/(double)hyp_len ); +  if (hyp_len > ref_len) return 1; +  return exp(1 - (double)ref_len/(double)hyp_len);  } - - -/* - * bleu - * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) - * page TODO - * 0 if for N one of the counts = 0 - */  double -bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -      size_t N, vector<float> weights  ) +bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +      size_t N, vector<float> weights )  { -  if ( hyp_len == 0 || ref_len == 0 ) return 0; -  if ( ref_len < N ) N = ref_len; +  if (hyp_len == 0 || ref_len == 0) return 0; +  if (ref_len < N) N = ref_len;    float N_ = (float)N; -  if ( weights.empty() ) +  if (weights.empty())    { -    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);    }    double sum = 0; -  for ( size_t i = 0; i < N; i++ ) { -    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; -    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); +  for (size_t i = 0; i < N; i++) { +    if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; +    sum += weights[i] * log((double)counts.clipped[i] / (double)counts.sum[i]);    } -  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +  return brevity_penaly(hyp_len, ref_len) * exp(sum);  } -  /* - * stupid_bleu - * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) - * page TODO - * 0 iff no 1gram match + * 'stupid' bleu + * + * as in "ORANGE: a Method for Evaluating + *        Automatic Evaluation Metrics + *        for Machine Translation" + * (Lin & Och '04) + * + * NOTE: 0 iff no 1gram match   */  double -stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -             size_t N, vector<float> weights  ) +stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +             size_t N, vector<float> weights )  { -  if ( hyp_len == 0 || ref_len == 0 ) return 0; -  if ( ref_len < N ) N = ref_len; +  if (hyp_len == 0 || ref_len == 0) return 0; +  if (ref_len < N) N = ref_len;    float N_ = (float)N; -  if ( weights.empty() ) +  if (weights.empty())    { -    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);    }    double sum = 0;    float add = 0; -  for ( size_t i = 0; i < N; i++ ) { -    if ( i == 1 ) add = 1; -    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); +  for (size_t i = 0; i < N; i++) { +    if (i == 1) add = 1; +    sum += weights[i] * log(((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add));    } -  return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +  return brevity_penaly(hyp_len, ref_len) * exp(sum);  } -  /* - * smooth_bleu - * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) - * page TODO - * max. 0.9375 + * smooth bleu + * + * as in "An End-to-End Discriminative Approach + *        to Machine Translation" + * (Liang et al. '06) + * + * NOTE: max is 0.9375   */  double -smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -             const size_t N, vector<float> weights  ) +smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +            const size_t N, vector<float> weights )  { -  if ( hyp_len == 0 || ref_len == 0 ) return 0; +  if (hyp_len == 0 || ref_len == 0) return 0;    float N_ = (float)N; -  if ( weights.empty() ) +  if (weights.empty())    { -    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); +    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);    }    double sum = 0;    float j = 1; -  for ( size_t i = 0; i < N; i++ ) { -    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; -    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); +  for (size_t i = 0; i < N; i++) { +    if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; +    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow(2, N_-j+1);      j++;    } -  return brevity_penaly( hyp_len, ref_len ) * sum; +  return brevity_penaly(hyp_len, ref_len) * sum;  } -  /* - * approx_bleu - * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) - * CHIANG, RESNIK, synt struct features - * .9* - * page TODO + * approx. bleu   * + * as in "Online Large-Margin Training of Syntactic + *        and Structural Translation Features" + * (Chiang et al. '08)   */  double -approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, -     const size_t N, vector<float> weights ) +approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, +            const size_t N, vector<float> weights)  { -  return bleu( counts, hyp_len, ref_len, N, weights ); +  return brevity_penaly(hyp_len, ref_len)  +         * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);  } diff --git a/dtrain/score.h b/dtrain/score.h index e88387c5..0afb6237 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -1,29 +1,23 @@  #ifndef _DTRAIN_SCORE_H_  #define _DTRAIN_SCORE_H_ -  #include <iostream>  #include <vector>  #include <map>  #include <cassert>  #include <cmath> -#include "wordid.h" +#include "wordid.h" // cdec  using namespace std; -  namespace dtrain  { -/* - * NgramCounts - * - */  struct NgramCounts  { -  NgramCounts( const size_t N ) : N_( N ) { +  NgramCounts(const size_t N) : N_(N) {      reset();    }     size_t N_; @@ -31,17 +25,17 @@ struct NgramCounts    map<size_t, size_t> sum;    void -  operator+=( const NgramCounts& rhs ) +  operator+=(const NgramCounts& rhs)    { -    assert( N_ == rhs.N_ ); -    for ( size_t i = 0; i < N_; i++ ) { +    assert(N_ == rhs.N_); +    for (size_t i = 0; i < N_; i++) {        this->clipped[i] += rhs.clipped.find(i)->second;        this->sum[i] += rhs.sum.find(i)->second;      }    }    const NgramCounts -  operator+( const NgramCounts &other ) const +  operator+(const NgramCounts &other) const    {      NgramCounts result = *this;      result += other; @@ -49,10 +43,10 @@ struct NgramCounts    }    void -  add( size_t count, size_t ref_count, size_t i ) +  add(size_t count, size_t ref_count, size_t i)    { -    assert( i < N_ ); -    if ( count > ref_count ) { +    assert(i < N_); +    if (count > ref_count) {        clipped[i] += ref_count;        sum[i] += count;      } else { @@ -65,7 +59,7 @@ struct NgramCounts    reset()    {      size_t i; -    for ( i = 0; i < N_; i++ ) { +    for (i = 0; i < N_; i++) {        clipped[i] = 0;        sum[i] = 0;      } @@ -74,27 +68,26 @@ struct NgramCounts    void    print()    { -    for ( size_t i = 0; i < N_; i++ ) { +    for (size_t i = 0; i < N_; i++) {        cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;        cout << i+1 << "grams:\t\t\t" << sum[i] << endl;      }    }  }; -  typedef map<vector<WordID>, size_t> Ngrams; -Ngrams make_ngrams( vector<WordID>& s, size_t N ); -NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ); - -double brevity_penaly( const size_t hyp_len, const size_t ref_len ); -double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, -             vector<float> weights = vector<float>() ); -double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, -                    vector<float> weights = vector<float>() ); -double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, -                    vector<float> weights = vector<float>() ); -double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, -                    vector<float> weights = vector<float>() ); +Ngrams make_ngrams(vector<WordID>& s, size_t N); +NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N); + +double brevity_penaly(const size_t hyp_len, const size_t ref_len); +double bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, +             vector<float> weights = vector<float>()); +double stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, +                    vector<float> weights = vector<float>()); +double smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, +                    vector<float> weights = vector<float>()); +double approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, +                    vector<float> weights = vector<float>());  } // namespace | 
