diff options
Diffstat (limited to 'dtrain')
| -rw-r--r-- | dtrain/dtrain.cc | 77 | ||||
| -rw-r--r-- | dtrain/dtrain.h | 25 | ||||
| -rw-r--r-- | dtrain/kbestget.h | 46 | ||||
| -rw-r--r-- | dtrain/ksampler.h | 11 | ||||
| -rw-r--r-- | dtrain/pairsampling.h | 6 | ||||
| -rw-r--r-- | dtrain/score.cc | 37 | ||||
| -rw-r--r-- | dtrain/score.h | 44 | ||||
| -rw-r--r-- | dtrain/test/example/dtrain.ini | 8 | ||||
| -rw-r--r-- | dtrain/test/example/weights.gz | bin | 12001 -> 395 bytes | 
9 files changed, 98 insertions, 156 deletions
| diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 35e6cc46..622cd01e 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -95,38 +95,32 @@ main(int argc, char** argv)      cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;    Decoder decoder(ini_rf.stream()); -  MT19937 rng; // random number generator -  // setup decoder observer -  HypSampler* observer; -  if (sample_from == "kbest") { -    observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); -  } else { -    observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); -  } -    // scoring metric/scorer    string scorer_str = cfg["scorer"].as<string>(); -  /*score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); +  LocalScorer* scorer;    if (scorer_str == "bleu") { -    scorer = &bleu;    } else if (scorer_str == "stupid_bleu") { -    scorer = &stupid_bleu; +    scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);    } else if (scorer_str == "smooth_bleu") { -    scorer = &smooth_bleu; +      scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);    } else if (scorer_str == "approx_bleu") { -    scorer = &approx_bleu; +      scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); // FIXME    } else {      cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;      exit(1);    } -  NgramCounts global_counts(N); // counts for 1 best translations -  unsigned global_hyp_len = 0;    // sum hypothesis lengths -  unsigned global_ref_len = 0;    // sum reference lengths -  // ^^^ global_* for approx_bleu*/ -  vector<score_t> bleu_weights;   // we leave this empty -> 1/N  -  //if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; -  StupidBleuScorer scorer; -  scorer.Init(N, bleu_weights); +  vector<score_t> bleu_weights; +  scorer->Init(N, bleu_weights); +  if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; + +  // setup decoder observer +  MT19937 rng; // random number generator +  HypSampler* observer; +  if (sample_from == "kbest") +    observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); +  else +    observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); +  observer->SetScorer(scorer);    // init weights    Weights weights; @@ -240,10 +234,10 @@ main(int argc, char** argv)      vector<WordID> ref_ids;  // reference as vector<WordID>      if (t == 0) {        // handling input -      strsplit(in, in_split, '\t', 4); +      boost::split(in_split, in, boost::is_any_of("\t"));        // getting reference        vector<string> ref_tok; -      strsplit(in_split[2], ref_tok, ' '); +      boost::split(ref_tok, in_split[2], boost::is_any_of(" "));        register_and_convert(ref_tok, ref_ids);        ref_ids_buf.push_back(ref_ids);        // process and set grammar @@ -259,8 +253,9 @@ main(int argc, char** argv)        in_split[3] += "\n";        grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;        decoder.SetSentenceGrammarFromString(in_split[3]); -      // decode        src_str_buf.push_back(in_split[1]); +      // decode +      observer->SetRef(ref_ids);        decoder.Decode(in_split[1], observer);      } else {        // get buffered grammar @@ -273,32 +268,24 @@ main(int argc, char** argv)        }        decoder.SetSentenceGrammarFromString(grammar_str);        // decode +      observer->SetRef(ref_ids_buf[ii]);        decoder.Decode(src_str_buf[ii], observer);      } +    // get (scored) samples       vector<ScoredHyp>* samples = observer->GetSamples(); -    // (local) scoring -    if (t > 0) ref_ids = ref_ids_buf[ii]; -    for (unsigned i = 0; i < samples->size(); i++) { -        //cout << ii << " " << i << endl; - -        cout << _p9; -      (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii); -      if (i == 0) { -        score_sum += (*samples)[i].score; -        model_sum += (*samples)[i].model; -      } - -      if (verbose) { -        if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl; -        cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'"; -        cerr << " [SCORE=" << (*samples)[i].score << ",model="<< (*samples)[i].model << "]" << endl; -        cerr << (*samples)[i].f << endl; -      } +    if (verbose) { +      cout << "[ref: '"; +      if (t > 0) cout << ref_ids_buf[ii]; +      else cout << ref_ids; +      cout << endl; +      cout << _p5 << _np << "1best: " << "'" << (*samples)[0].w << "'" << endl; +      cout << "SCORE=" << (*samples)[0].score << ",model="<< (*samples)[0].model << endl; +      cout << "F{" << (*samples)[0].f << "} ]" << endl << endl;      } - -    if (verbose) cerr << endl; +    score_sum += (*samples)[0].score; +    model_sum += (*samples)[0].model;  //////////////////////////////////////////////////////////      // UPDATE WEIGHTS diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index ed75a297..0c27167d 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -7,14 +7,6 @@  #include <boost/algorithm/string.hpp>  #include <boost/program_options.hpp> -#include "verbose.h" -#include "viterbi.h" -#include "ff_register.h" -#include "decoder.h" -#include "weights.h" - -#include "score.h" -#include "kbestget.h"  #include "ksampler.h"  #include "pairsampling.h" @@ -31,27 +23,12 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids    for (it = strs.begin(); it < strs.end(); it++)      ids.push_back(TD::Convert(*it));  } +  inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }  inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }  inline ostream& _p2(ostream& out) { return out << setprecision(2); }  inline ostream& _p5(ostream& out) { return out << setprecision(5); }  inline ostream& _p9(ostream& out) { return out << setprecision(9); } -inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) {  -  stringstream ss(s); -  string t; -  unsigned i = 0; -  while(true) -  { -    if (parts > 0 && i == parts-1) { -      getline(ss, t); -      v.push_back(t); -      break; -    } -    if (!getline(ss, t, d)) break; -    v.push_back(t); -    i++; -  } -}  #endif diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index 2a2c6073..c0fd3f47 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -1,12 +1,6 @@  #ifndef _DTRAIN_KBESTGET_H_  #define _DTRAIN_KBESTGET_H_ - -#include <vector> -#include <string> - -using namespace std; -  #include "kbest.h" // cdec  #include "verbose.h"  #include "viterbi.h" @@ -14,11 +8,13 @@ using namespace std;  #include "decoder.h"  #include "weights.h" +using namespace std; +  namespace dtrain  { -typedef double score_t; // float +typedef double score_t; // float  struct ScoredHyp  { @@ -29,10 +25,44 @@ struct ScoredHyp    unsigned rank;  }; +struct LocalScorer +{ +  unsigned N_; +  vector<score_t> w_; + +  virtual score_t +  Score(vector<WordID>& hyp, vector<WordID>& ref)=0; + +  void +  Init(unsigned N, vector<score_t> weights) +  { +    assert(N > 0); +    N_ = N; +    if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); +    else w_ = weights; +  } + +  score_t +  brevity_penaly(const unsigned hyp_len, const unsigned ref_len) +  { +    if (hyp_len > ref_len) return 1; +    return exp(1 - (score_t)ref_len/hyp_len); +  } +}; +  struct HypSampler : public DecoderObserver  { +  LocalScorer* scorer_; +  vector<WordID>* ref_;    virtual vector<ScoredHyp>* GetSamples()=0; +  void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } +  void SetRef(vector<WordID>& ref) { ref_ = &ref; }   }; +///////////////////////////////////////////////////////////////////// +// wtf + + +  struct KBestGetter : public HypSampler  { @@ -77,6 +107,7 @@ struct KBestGetter : public HypSampler        h.f = d->feature_values;        h.model = log(d->score);        h.rank = i; +      h.score = scorer_->Score(h.w, *ref_);        s_.push_back(h);      }    } @@ -95,6 +126,7 @@ struct KBestGetter : public HypSampler        h.f = d->feature_values;        h.model = log(d->score);        h.rank = i; +      h.score = scorer_->Score(h.w, *ref_);        s_.push_back(h);      }    } diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index 767dc42e..7567f43a 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -1,15 +1,9 @@  #ifndef _DTRAIN_KSAMPLER_H_  #define _DTRAIN_KSAMPLER_H_ -#include "kbestget.h"  #include "hgsampler.h" -#include <vector> -#include <string> - -using namespace std; - -#include "kbest.h" // cdec -#include "sampler.h" +#include "kbestget.h" +#include "score.h"  namespace dtrain  { @@ -43,6 +37,7 @@ struct KSampler : public HypSampler        h.f = samples[i].fmap;        h.model = log(samples[i].model_score);         h.rank = i; +      h.score = scorer_->Score(h.w, *ref_);        s_.push_back(h);      }    } diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 4a6d93d1..6db0c045 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -1,12 +1,6 @@  #ifndef _DTRAIN_PAIRSAMPLING_H_  #define _DTRAIN_PAIRSAMPLING_H_ -#include "kbestget.h" -#include "score.h" -#include <vector> -#include <string> -using namespace std; -#include "sampler.h" // cdec, MT19937  namespace dtrain  { diff --git a/dtrain/score.cc b/dtrain/score.cc index 9b22508b..93c4e80b 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -22,17 +22,17 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref    score_t sum = 0;    for (unsigned i = 0; i < M; i++) {      if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; -    sum += w_[i] * log((score_t)counts.clipped[i] / counts.sum[i]); +    sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);    }    return brevity_penaly(hyp_len, ref_len) * exp(sum);  }  score_t -BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) +BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref)  { -  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size(); +  unsigned hyp_len = hyp.size(), ref_len = ref.size();    if (hyp_len == 0 || ref_len == 0) return 0; -  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); +  NgramCounts counts = make_ngram_counts(hyp, ref, N_);    return Bleu(counts, hyp_len, ref_len);  } @@ -47,30 +47,18 @@ BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)   * NOTE: 0 iff no 1gram match   */  score_t -StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) +StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref)  { -  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size(); +  unsigned hyp_len = hyp.size(), ref_len = ref.size();    if (hyp_len == 0 || ref_len == 0) return 0; -  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); +  NgramCounts counts = make_ngram_counts(hyp, ref, N_);    unsigned M = N_;    if (ref_len < N_) M = ref_len;    score_t sum = 0, add = 0;    for (unsigned i = 0; i < M; i++) {      if (i == 1) add = 1; -    //cout << ((score_t)counts.clipped[i] + add) << "/" << counts.sum[i] +add << "." << endl; -    //cout << "w_[i] " << w_[i] << endl; -    sum += w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))); -    //cout << "sum += "<< w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))) << endl; +    sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));    } -  /*cout << ref_ids << endl; -  cout << hyp.w << endl; -  cout << "ref_len " << ref_len << endl; -  cout << "hyp_len " << hyp_len << endl; -  cout << "bp " << brevity_penaly(hyp_len, ref_len) << endl; -  cout << "exp(sum) " << exp(sum) << endl; -  counts.Print(); -  cout << brevity_penaly(hyp_len, ref_len) * exp(sum) << endl; -  cout << "---" << endl;*/    return  brevity_penaly(hyp_len, ref_len) * exp(sum);  } @@ -84,21 +72,22 @@ StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)   * NOTE: max is 0.9375   */  score_t -SmoothBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) +SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref)  { -  unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size(); +  unsigned hyp_len = hyp.size(), ref_len = ref.size();    if (hyp_len == 0 || ref_len == 0) return 0; -  NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); +  NgramCounts counts = make_ngram_counts(hyp, ref, N_);    score_t sum = 0;    unsigned j = 1;    for (unsigned i = 0; i < N_; i++) {      if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; -    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N_-j+1); +    sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1);      j++;    }    return brevity_penaly(hyp_len, ref_len) * sum;  } +// FIXME  /*   * approx. bleu   * diff --git a/dtrain/score.h b/dtrain/score.h index f87d708c..9af56ef9 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -1,16 +1,8 @@  #ifndef _DTRAIN_SCORE_H_  #define _DTRAIN_SCORE_H_ -#include <iostream> -#include <vector> -#include <map> -#include <cassert> -#include <cmath> -  #include "kbestget.h" -#include "wordid.h" // cdec -  using namespace std;  namespace dtrain @@ -111,51 +103,28 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un    return counts;  } -struct LocalScorer -{ -  unsigned N_; -  vector<score_t> w_; - -  virtual score_t -  Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)=0; - -  void -  Init(unsigned N, vector<score_t> weights) -  { -    assert(N > 0); -    N_ = N; -    if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); -    else w_ = weights; -  } - -  score_t -  brevity_penaly(const unsigned hyp_len, const unsigned ref_len) -  { -    if (hyp_len > ref_len) return 1; -    return exp(1 - (score_t)ref_len/hyp_len); -  } -}; -  struct BleuScorer : public LocalScorer  {    score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); -  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); +  score_t Score(vector<WordID>& hyp, vector<WordID>& ref_ids);  };  struct StupidBleuScorer : public LocalScorer  { -  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); +  score_t Score(vector<WordID>& hyp, vector<WordID>& ref);  };  struct SmoothBleuScorer : public LocalScorer  { -  score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); +  score_t Score(vector<WordID>& hyp, vector<WordID>& ref);  };  // FIXME  /*struct ApproxBleuScorer : public LocalScorer  { -  NgramCounts glob_onebest_counts; +  bool prepped; + +  NgramCounts* glob_onebest_counts;    unsigned glob_hyp_len, glob_ref_len;    void Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); @@ -171,7 +140,6 @@ struct SmoothBleuScorer : public LocalScorer  };*/ -  } // namespace  #endif diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index df746e51..fd3a3841 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,11 +1,11 @@  decoder_config=test/example/cdec.ini  k=100  N=4 -epochs=10 +epochs=100  input=test/example/nc-1k.gz  scorer=stupid_bleu  output=test/example/weights.gz -stop_after=10 -sample_from=kbest -pair_sampling=all +stop_after=0 +sample_from=forest +pair_sampling=rand  print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gzBinary files differ index e7baa367..7960a05a 100644 --- a/dtrain/test/example/weights.gz +++ b/dtrain/test/example/weights.gz | 
