Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dtrain.cc                 |  41
-rw-r--r--  dtrain/kbestget.h                |  20
-rw-r--r--  dtrain/ksampler.h                |   8
-rw-r--r--  dtrain/pairsampling.h            |   4
-rw-r--r--  dtrain/score.cc                  | 132
-rw-r--r--  dtrain/score.h                   | 120
-rw-r--r--  dtrain/test/example/cdec.ini     |   2
-rw-r--r--  dtrain/test/example/dtrain.ini   |   4
-rw-r--r--  dtrain/test/example/weights.gz   | bin 248 -> 12001 bytes
9 files changed, 213 insertions, 118 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 44090242..35e6cc46 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -106,7 +106,7 @@ main(int argc, char** argv)
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
-  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+  /*score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
   if (scorer_str == "bleu") {
     scorer = &bleu;
   } else if (scorer_str == "stupid_bleu") {
@@ -122,9 +122,11 @@ main(int argc, char** argv)
   NgramCounts global_counts(N); // counts for 1 best translations
   unsigned global_hyp_len = 0;  // sum hypothesis lengths
   unsigned global_ref_len = 0;  // sum reference lengths
-  // ^^^ global_* for approx_bleu
+  // ^^^ global_* for approx_bleu*/
   vector<score_t> bleu_weights; // we leave this empty -> 1/N
-  if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
+  //if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
+  StupidBleuScorer scorer;
+  scorer.Init(N, bleu_weights);
 
   // init weights
   Weights weights;
@@ -240,7 +242,6 @@ main(int argc, char** argv)
       // handling input
       strsplit(in, in_split, '\t', 4);
       // getting reference
-      ref_ids.clear();
       vector<string> ref_tok;
       strsplit(in_split[2], ref_tok, ' ');
       register_and_convert(ref_tok, ref_ids);
@@ -279,43 +280,23 @@ main(int argc, char** argv)
     // (local) scoring
     if (t > 0) ref_ids = ref_ids_buf[ii];
-    score_t score = 0.;
     for (unsigned i = 0; i < samples->size(); i++) {
-      NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N);
-      if (scorer_str == "approx_bleu") {
-        unsigned hyp_len = 0;
-        if (i == 0) { // 'context of 1best translations'
-          global_counts  += counts;
-          global_hyp_len += (*samples)[i].w.size();
-          global_ref_len += ref_ids.size();
-          counts.reset();
-        } else {
-            hyp_len = (*samples)[i].w.size();
-        }
-        NgramCounts _c = global_counts + counts;
-        score = .9 * scorer(_c,
-                            global_ref_len,
-                            global_hyp_len + hyp_len, N, bleu_weights);
-      } else {
-        score = scorer(counts,
-                       ref_ids.size(),
-                       (*samples)[i].w.size(), N, bleu_weights);
-      }
-
-      (*samples)[i].score = (score);
+        //cout << ii << " " << i << endl;
+        cout << _p9;
+      (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii);
       if (i == 0) {
-        score_sum += score;
+        score_sum += (*samples)[i].score;
         model_sum += (*samples)[i].model;
       }
       if (verbose) {
         if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl;
         cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'";
-        cerr << " [SCORE=" << score << ",model="<< (*samples)[i].model << "]" << endl;
+        cerr << " [SCORE=" << (*samples)[i].score << ",model="<< (*samples)[i].model << "]" << endl;
         cerr << (*samples)[i].f << endl;
       }
-    } // sample/scoring loop
+    }
 
     if (verbose) cerr << endl;
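[annotation] The net effect of the dtrain.cc hunks: metric dispatch moves from a per-call
function pointer plus per-metric branching to a scorer object that is initialized once.
Condensed from the hunks above (an excerpt, not standalone code; the pair-sampling and
update logic around the loop is elided):

    // before the loop; the string dispatch on scorer_str is commented
    // out for now, so stupid BLEU is effectively hard-coded:
    StupidBleuScorer scorer;
    scorer.Init(N, bleu_weights); // empty weights -> uniform 1/N

    // per input ii, every sample i goes through the common interface:
    for (unsigned i = 0; i < samples->size(); i++)
      (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii);
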
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 935998a0..2a2c6073 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -1,11 +1,24 @@
 #ifndef _DTRAIN_KBESTGET_H_
 #define _DTRAIN_KBESTGET_H_
 
-#include "kbest.h"
+
+#include <vector>
+#include <string>
+
+using namespace std;
+
+#include "kbest.h" // cdec
+#include "verbose.h"
+#include "viterbi.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
 
 namespace dtrain
 {
 
+typedef double score_t; // float
+
 struct ScoredHyp
 {
@@ -13,11 +26,12 @@ struct ScoredHyp
   SparseVector<double> f;
   score_t model;
   score_t score;
+  unsigned rank;
 };
 
 struct HypSampler : public DecoderObserver
 {
-  virtual vector<ScoredHyp>* GetSamples() {}
+  virtual vector<ScoredHyp>* GetSamples()=0;
 };
 
 struct KBestGetter : public HypSampler
@@ -62,6 +76,7 @@ struct KBestGetter : public HypSampler
       h.w = d->yield;
       h.f = d->feature_values;
       h.model = log(d->score);
+      h.rank = i;
       s_.push_back(h);
     }
   }
@@ -79,6 +94,7 @@ struct KBestGetter : public HypSampler
       h.w = d->yield;
       h.f = d->feature_values;
       h.model = log(d->score);
+      h.rank = i;
       s_.push_back(h);
     }
   }
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 17b0ba56..767dc42e 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -1,7 +1,13 @@
 #ifndef _DTRAIN_KSAMPLER_H_
 #define _DTRAIN_KSAMPLER_H_
+#include "kbestget.h"
 #include "hgsampler.h"
+#include <vector>
+#include <string>
+
+using namespace std;
+
 #include "kbest.h" // cdec
 #include "sampler.h"
@@ -14,6 +20,7 @@ struct KSampler : public HypSampler
   const unsigned k_;
   vector<ScoredHyp> s_;
   MT19937* prng_;
+  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
 
   explicit KSampler(const unsigned k, MT19937* prng) :
     k_(k), prng_(prng) {}
@@ -35,6 +42,7 @@ struct KSampler : public HypSampler
       h.w = samples[i].words;
       h.f = samples[i].fmap;
       h.model = log(samples[i].model_score);
+      h.rank = i;
       s_.push_back(h);
     }
   }
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 9546a945..4a6d93d1 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -2,6 +2,10 @@
 #define _DTRAIN_PAIRSAMPLING_H_
 
 #include "kbestget.h"
+#include "score.h"
+#include <vector>
+#include <string>
+using namespace std;
 #include "sampler.h" // cdec, MT19937
 
 namespace dtrain
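[annotation] One detail in the kbestget.h hunk worth flagging: the old default
`virtual vector<ScoredHyp>* GetSamples() {}` is a non-void function that falls off the
end, i.e. undefined behavior the moment anything calls it through the base class; making
it pure virtual turns that into a compile-time error. A minimal self-contained sketch of
the contract (mock names Hyp/Sampler/KBest; the real types are ScoredHyp, HypSampler,
KBestGetter and KSampler):

    #include <vector>

    struct Hyp { unsigned rank; };

    struct Sampler {
      // Pure virtual: every concrete sampler must implement GetSamples();
      // there is no silent, garbage-returning default anymore.
      virtual std::vector<Hyp>* GetSamples() = 0;
      virtual ~Sampler() {}
    };

    struct KBest : Sampler {
      std::vector<Hyp> s_;
      std::vector<Hyp>* GetSamples() { return &s_; } // required by = 0
    };
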
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 52644250..9b22508b 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -4,40 +4,6 @@ namespace dtrain
 {
 
-Ngrams
-make_ngrams(vector<WordID>& s, unsigned N)
-{
-  Ngrams ngrams;
-  vector<WordID> ng;
-  for (size_t i = 0; i < s.size(); i++) {
-    ng.clear();
-    for (unsigned j = i; j < min(i+N, s.size()); j++) {
-      ng.push_back(s[j]);
-      ngrams[ng]++;
-    }
-  }
-  return ngrams;
-}
-
-NgramCounts
-make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
-{
-  Ngrams hyp_ngrams = make_ngrams(hyp, N);
-  Ngrams ref_ngrams = make_ngrams(ref, N);
-  NgramCounts counts(N);
-  Ngrams::iterator it;
-  Ngrams::iterator ti;
-  for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
-    ti = ref_ngrams.find(it->first);
-    if (ti != ref_ngrams.end()) {
-      counts.add(it->second, ti->second, it->first.size() - 1);
-    } else {
-      counts.add(it->second, 0, it->first.size() - 1);
-    }
-  }
-  return counts;
-}
-
 /*
  * bleu
  *
@@ -48,26 +14,28 @@ make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
  * NOTE: 0 if one n in {1..N} has 0 count
  */
 score_t
-brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
-{
-  if (hyp_len > ref_len) return 1;
-  return exp(1 - (score_t)ref_len/hyp_len);
-}
-
-score_t
-bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-      unsigned N, vector<score_t> weights )
+BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
-  if (ref_len < N) N = ref_len;
-  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
   score_t sum = 0;
-  for (unsigned i = 0; i < N; i++) {
+  for (unsigned i = 0; i < M; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += weights[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
+    sum += w_[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
 
+score_t
+BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+{
+  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
+  if (hyp_len == 0 || ref_len == 0) return 0;
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  return Bleu(counts, hyp_len, ref_len);
+}
+
 /*
  * 'stupid' bleu
  *
@@ -79,18 +47,31 @@ bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
  * NOTE: 0 iff no 1gram match
 */
 score_t
-stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-             unsigned N, vector<score_t> weights )
+StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
 {
+  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
   if (hyp_len == 0 || ref_len == 0) return 0;
-  if (ref_len < N) N = ref_len;
-  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
   score_t sum = 0, add = 0;
-  for (unsigned i = 0; i < N; i++) {
+  for (unsigned i = 0; i < M; i++) {
     if (i == 1) add = 1;
-    sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
+    //cout << ((score_t)counts.clipped[i] + add) << "/" << counts.sum[i] +add << "." << endl;
+    //cout << "w_[i] " << w_[i] << endl;
+    sum += w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
+    //cout << "sum += "<< w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))) << endl;
   }
-  return brevity_penaly(hyp_len, ref_len) * exp(sum);
+  /*cout << ref_ids << endl;
+  cout << hyp.w << endl;
+  cout << "ref_len " << ref_len << endl;
+  cout << "hyp_len " << hyp_len << endl;
+  cout << "bp " << brevity_penaly(hyp_len, ref_len) << endl;
+  cout << "exp(sum) " << exp(sum) << endl;
+  counts.Print();
+  cout << brevity_penaly(hyp_len, ref_len) * exp(sum) << endl;
+  cout << "---" << endl;*/
+  return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
 
 /*
@@ -103,16 +84,16 @@ stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
  * NOTE: max is 0.9375
 */
 score_t
-smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-            const unsigned N, vector<score_t> weights )
+SmoothBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
 {
+  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size();
   if (hyp_len == 0 || ref_len == 0) return 0;
-  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
   score_t sum = 0;
   unsigned j = 1;
-  for (unsigned i = 0; i < N; i++) {
+  for (unsigned i = 0; i < N_; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N-j+1);
+    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N_-j+1);
     j++;
   }
   return brevity_penaly(hyp_len, ref_len) * sum;
@@ -125,14 +106,39 @@ smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
  *        and Structural Translation Features"
  * (Chiang et al. '08)
 */
-score_t
-approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
-            const unsigned N, vector<score_t> weights)
+/*void
+ApproxBleuScorer::Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
+{
+  glob_onebest_counts += counts;
+  glob_hyp_len += hyp_len;
+  glob_ref_len += ref_len;
+}
+
+void
+ApproxBleuScorer::Reset()
 {
-  return brevity_penaly(hyp_len, ref_len)
-           * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
+  glob_onebest_counts.Zero();
+  glob_hyp_len = 0;
+  glob_ref_len = 0;
 }
 
+score_t
+ApproxBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)
+{
+  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_);
+  if (id == 0) reset();
+  unsigned hyp_len = 0, ref_len = 0;
+  if (hyp.rank == 0) { // 'context of 1best translations'
+    scorer->prep(counts, hyp.w.size(), ref_ids.size());
+    counts.reset();
+  } else {
+    hyp_len = hyp.w.size();
+    ref_len = ref_ids.size();
+  }
+  return 0.9 * BleuScorer::Bleu(glob_onebest_counts + counts,
+                                glob_hyp_len + hyp_len, glob_ref_len + ref_len);
+}*/
+
 } // namespace
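[annotation] Restated from the score.cc hunks above, nothing beyond the code: with c_i
and s_i the clipped and total i-gram counts from make_ngram_counts, h = hyp_len,
r = ref_len, M = min(N, r), and uniform weights w_i = 1/N, the three active scorers
compute (in LaTeX):

    % shared brevity penalty (LocalScorer::brevity_penaly)
    \mathrm{BP}(h,r) = \begin{cases} 1 & \text{if } h > r \\ \exp(1 - r/h) & \text{otherwise} \end{cases}

    % BleuScorer::Bleu -- 0 if any c_i or s_i for i = 1..M is 0
    \mathrm{BLEU} = \mathrm{BP}(h,r)\cdot\exp\Big(\sum_{i=1}^{M} w_i \log\frac{c_i}{s_i}\Big)

    % StupidBleuScorer::Score -- add-one smoothing for orders i >= 2,
    % hence 0 iff there is no 1-gram match
    \mathrm{StupidBLEU} = \mathrm{BP}(h,r)\cdot\exp\Big(\sum_{i=1}^{M} w_i \log\frac{c_i + \mathbb{1}[i \ge 2]}{s_i + \mathbb{1}[i \ge 2]}\Big)

    % SmoothBleuScorer::Score -- orders with zero counts are skipped;
    % k = 1, 2, ... indexes the surviving orders i_k in increasing order,
    % so for N = 4 the sum is capped at 1/16 + 1/8 + 1/4 + 1/2 = 0.9375
    % (the "max is 0.9375" NOTE above)
    \mathrm{SmoothBLEU} = \mathrm{BP}(h,r)\cdot\sum_{k} \frac{(c_{i_k}/s_{i_k})^{w_{i_k}}}{2^{\,N-k+1}}
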
diff --git a/dtrain/score.h b/dtrain/score.h
index 3e5d82a9..f87d708c 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -7,6 +7,8 @@
 #include <cassert>
 #include <cmath>
 
+#include "kbestget.h"
+
 #include "wordid.h" // cdec
 
 using namespace std;
@@ -15,15 +17,13 @@ namespace dtrain
 {
 
-typedef double score_t; // float
-
 struct NgramCounts
 {
   unsigned N_;
   map<unsigned, unsigned> clipped;
   map<unsigned, unsigned> sum;
 
-  NgramCounts(const unsigned N) : N_(N) { reset(); }
+  NgramCounts(const unsigned N) : N_(N) { Zero(); }
 
   void
   operator+=(const NgramCounts& rhs)
@@ -44,20 +44,19 @@ struct NgramCounts
   }
 
   void
-  add(unsigned count, unsigned ref_count, unsigned i)
+  Add(unsigned count, unsigned ref_count, unsigned i)
   {
     assert(i < N_);
     if (count > ref_count) {
       clipped[i] += ref_count;
-      sum[i] += count;
     } else {
       clipped[i] += count;
-      sum[i] += count;
     }
+    sum[i] += count;
   }
 
   void
-  reset()
+  Zero()
   {
     unsigned i;
     for (i = 0; i < N_; i++) {
@@ -67,7 +66,7 @@ struct NgramCounts
   }
 
   void
-  print()
+  Print()
   {
     for (unsigned i = 0; i < N_; i++) {
       cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
@@ -78,18 +77,99 @@ struct NgramCounts
 
 typedef map<vector<WordID>, unsigned> Ngrams;
 
-Ngrams make_ngrams(vector<WordID>& s, unsigned N);
-NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N);
-
-score_t brevity_penaly(const unsigned hyp_len, const unsigned ref_len);
-score_t bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
-             vector<score_t> weights = vector<score_t>());
-score_t stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, unsigned N,
-                    vector<score_t> weights = vector<score_t>());
-score_t smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
-                    vector<score_t> weights = vector<score_t>());
-score_t approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
-                    vector<score_t> weights = vector<score_t>());
+inline Ngrams
+make_ngrams(const vector<WordID>& s, const unsigned N)
+{
+  Ngrams ngrams;
+  vector<WordID> ng;
+  for (size_t i = 0; i < s.size(); i++) {
+    ng.clear();
+    for (unsigned j = i; j < min(i+N, s.size()); j++) {
+      ng.push_back(s[j]);
+      ngrams[ng]++;
+    }
+  }
+  return ngrams;
+}
+
+inline NgramCounts
+make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N)
+{
+  Ngrams hyp_ngrams = make_ngrams(hyp, N);
+  Ngrams ref_ngrams = make_ngrams(ref, N);
+  NgramCounts counts(N);
+  Ngrams::iterator it;
+  Ngrams::iterator ti;
+  for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
+    ti = ref_ngrams.find(it->first);
+    if (ti != ref_ngrams.end()) {
+      counts.Add(it->second, ti->second, it->first.size() - 1);
+    } else {
+      counts.Add(it->second, 0, it->first.size() - 1);
+    }
+  }
+  return counts;
+}
+
+struct LocalScorer
+{
+  unsigned N_;
+  vector<score_t> w_;
+
+  virtual score_t
+  Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)=0;
+
+  void
+  Init(unsigned N, vector<score_t> weights)
+  {
+    assert(N > 0);
+    N_ = N;
+    if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_);
+    else w_ = weights;
+  }
+
+  score_t
+  brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
+  {
+    if (hyp_len > ref_len) return 1;
+    return exp(1 - (score_t)ref_len/hyp_len);
+  }
+};
+
+struct BleuScorer : public LocalScorer
+{
+  score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+};
+
+struct StupidBleuScorer : public LocalScorer
+{
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+};
+
+struct SmoothBleuScorer : public LocalScorer
+{
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+};
+
+// FIXME
+/*struct ApproxBleuScorer : public LocalScorer
+{
+  NgramCounts glob_onebest_counts;
+  unsigned glob_hyp_len, glob_ref_len;
+
+  void Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
+  void Reset();
+  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id);
+
+  ApproxBleuScorer()
+  {
+    glob_onebest_counts.Zero();
+    glob_hyp_len = 0;
+    glob_ref_len = 0;
+  }
+};*/
+
 } // namespace
diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini
index 50379afe..31a205c7 100644
--- a/dtrain/test/example/cdec.ini
+++ b/dtrain/test/example/cdec.ini
@@ -4,4 +4,4 @@ cubepruning_pop_limit=30
 scfg_max_span_limit=15
 feature_function=WordPenalty
 feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
-#feature_function=RuleIdentityFeatures
+feature_function=RuleIdentityFeatures
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index fbddb915..df746e51 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,7 +1,7 @@
 decoder_config=test/example/cdec.ini
 k=100
-N=3
-epochs=1000
+N=4
+epochs=10
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
 output=test/example/weights.gz
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index e2e1ecce..e7baa367 100644
Binary files a/dtrain/test/example/weights.gz and b/dtrain/test/example/weights.gz differ
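[annotation] With the scorers declared in score.h, they can also be driven outside
dtrain.cc. A hypothetical, minimal usage sketch against the new interface (the WordID
values 1..4 are made up for illustration; in dtrain they come from register_and_convert,
and building it requires the cdec tree on the include path):

    #include "score.h" // dtrain; pulls in kbestget.h for ScoredHyp/score_t
    using namespace dtrain;

    int main()
    {
      StupidBleuScorer scorer;
      scorer.Init(4, vector<score_t>()); // N = 4, empty weights -> 1/N each

      ScoredHyp hyp;                     // only .w matters for scoring
      hyp.w.push_back(1); hyp.w.push_back(2); hyp.w.push_back(3);

      vector<WordID> ref;
      ref.push_back(1); ref.push_back(2); ref.push_back(4);

      // the id argument is only meaningful for the (currently
      // disabled) approx BLEU scorer
      score_t s = scorer.Score(hyp, ref, 0);
      return s > 0. ? 0 : 1;
    }
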
