Diffstat (limited to 'dtrain')
 dtrain/Makefile.am             |  2
 dtrain/README.md               |  3
 dtrain/dtrain.cc               | 11
 dtrain/hstreaming/dtrain.ini   |  2
 dtrain/kbestget.h              | 12
 dtrain/ksampler.h              |  6
 dtrain/pairsampling.h          |  2
 dtrain/score.cc                | 58
 dtrain/score.h                 | 52
 dtrain/test/example/dtrain.ini |  6
 10 files changed, 87 insertions, 67 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 64fef489..f39d161e 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
 dtrain_SOURCES = dtrain.cc score.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README.md b/dtrain/README.md
index 2a24ec22..92d6ba0d 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -3,7 +3,8 @@ which is able to train the weights of very many (sparse) features.
 It was used here:
   "Joint Feature Selection in Distributed Stochastic
    Learning for Large-Scale Discriminative Training in
-   SMT" Simianer, Riezler, Dyer; ACL 2012
+   SMT"
+(Simianer, Riezler, Dyer; ACL 2012)
 
 
 Building
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index ea5b8835..3dee10f2 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -32,7 +32,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("l1_reg_strength",   po::value<weight_t>(),                                                  "l1 regularization strength")
     ("inc_correct",       po::value<bool>()->zero_tokens(),                      "include correctly ranked pairs into updates")
     ("fselect",           po::value<weight_t>()->default_value(-1),   "TODO select top x percent of features after each epoch")
-    ("approx_bleu_scale", po::value<score_t>()->default_value(0.9),                                 "scaling for approx. BLEU")
+    ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                                "discount for approx. BLEU")
 #ifdef DTRAIN_LOCAL
     ("refs,r",            po::value<string>(),                                                      "references in local mode")
 #endif
@@ -136,6 +136,7 @@ main(int argc, char** argv)
   const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();
   const string select_weights = cfg["select_weights"].as<string>();
   const float hi_lo = cfg["hi_lo"].as<float>();
+  const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
   bool average = false;
   if (select_weights == "avg")
     average = true;
@@ -161,7 +162,7 @@ main(int argc, char** argv)
   } else if (scorer_str == "smooth_bleu") {
     scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
   } else if (scorer_str == "approx_bleu") {
-    scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N));
+    scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
   } else {
     cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
     exit(1);
@@ -235,6 +236,8 @@ main(int argc, char** argv)
     cerr << setw(25) << "N " << N << endl;
     cerr << setw(25) << "T " << T << endl;
     cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
+    if (scorer_str == "approx_bleu")
+      cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl;
     cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
     if (sample_from == "kbest")
       cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
@@ -242,7 +245,7 @@ main(int argc, char** argv)
     cerr << setw(25) << "gamma " << gamma << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     if (pair_sampling == "XYX")
-      cerr << setw(25) << "hi lo " << "'" << hi_lo << "'" << endl;
+      cerr << setw(25) << "hi lo " << hi_lo << endl;
     cerr << setw(25) << "pair threshold " << pair_threshold << endl;
     cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
     if (cfg.count("l1_reg"))
@@ -261,7 +264,7 @@ main(int argc, char** argv)
       cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
     if (cfg.count("stop-after"))
       cerr << setw(25) << "stop_after " << stop_after << endl;
-    if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
+    if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
   }
 
 
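The dtrain.cc hunks above turn the previously hard-coded 0.9 scaling into a configurable discount, approx_bleu_d, parsed with boost::program_options and handed to the ApproxBleuScorer constructor. The following is a compilable miniature of that plumbing, not dtrain's actual surrounding code; the score_t typedef is a stand-in for dtrain's own:

  #include <boost/program_options.hpp>
  #include <iostream>

  namespace po = boost::program_options;
  typedef double score_t; // stand-in for dtrain's typedef

  int main(int argc, char** argv)
  {
    po::options_description opts("dtrain options (excerpt)");
    opts.add_options()
      ("approx_bleu_d", po::value<score_t>()->default_value(0.9),
       "discount for approx. BLEU");
    po::variables_map cfg;
    po::store(po::parse_command_line(argc, argv, opts), cfg);
    po::notify(cfg);
    // same access pattern as in main() above
    const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
    std::cout << "approx. B discount " << approx_bleu_d << std::endl;
    // ... would then construct: new ApproxBleuScorer(N, approx_bleu_d)
    return 0;
  }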
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 05535299..a2c219a1 100644
--- a/dtrain/hstreaming/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
@@ -10,6 +10,6 @@ gamma=0
 scorer=stupid_bleu
 sample_from=kbest
 filter=uniq
-pair_sampling=108010
+pair_sampling=XYX
 pair_threshold=0
 select_weights=last
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index bcd82610..77d4a139 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -2,6 +2,8 @@
 #define _DTRAIN_KBESTGET_H_
 
 #include "kbest.h" // cdec
+#include "sentence_metadata.h"
+
 #include "verbose.h"
 #include "viterbi.h"
 #include "ff_register.h"
@@ -32,7 +34,7 @@ struct LocalScorer
   vector<score_t> w_;
 
   virtual score_t
-  Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank)=0;
+  Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
 
   void Reset() {} // only for approx bleu
 
@@ -71,13 +73,15 @@ struct KBestGetter : public HypSampler
   const unsigned k_;
   const string filter_type_;
   vector<ScoredHyp> s_;
+  unsigned src_len_;
 
   KBestGetter(const unsigned k, const string filter_type) :
     k_(k), filter_type_(filter_type) {}
 
   virtual void
-  NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
   {
+    src_len_ = smeta.GetSourceLength();
     KBestScored(*hg);
   }
 
@@ -109,7 +113,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i);
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
     }
   }
@@ -128,7 +132,7 @@ struct KBestGetter : public HypSampler
       h.f = d->feature_values;
       h.model = log(d->score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i);
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
     }
   }
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index eb4813ab..0783f98b 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -15,13 +15,15 @@ struct KSampler : public HypSampler
   vector<ScoredHyp> s_;
   MT19937* prng_;
   score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+  unsigned src_len_;
 
   explicit KSampler(const unsigned k, MT19937* prng) :
     k_(k), prng_(prng) {}
 
   virtual void
-  NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
+  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
   {
+    src_len_ = smeta.GetSourceLength();
     ScoredSamples(*hg);
   }
 
@@ -37,7 +39,7 @@ struct KSampler : public HypSampler
       h.f = samples[i].fmap;
       h.model = log(samples[i].model_score);
       h.rank = i;
-      h.score = scorer_->Score(h.w, *ref_, i);
+      h.score = scorer_->Score(h.w, *ref_, i, src_len_);
       s_.push_back(h);
     }
   }
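Both samplers above now follow the same pattern: NotifyTranslationForest stops ignoring its SentenceMetadata argument, caches the source-sentence length once per input, and forwards it through the widened Score() signature. A minimal stand-alone sketch of that flow, with stub types in place of cdec's real interfaces:

  #include <vector>

  typedef int WordID;   // stand-ins for cdec's types
  typedef double score_t;

  struct StubSentenceMetadata { unsigned GetSourceLength() const { return 17; } };

  struct StubScorer {
    score_t Score(std::vector<WordID>& /*hyp*/, std::vector<WordID>& /*ref*/,
                  const unsigned /*rank*/, const unsigned src_len)
    { return src_len ? 1. : 0.; } // only approx. BLEU actually uses src_len
  };

  struct StubSampler {
    unsigned src_len_;
    StubScorer* scorer_;
    void NotifyTranslationForest(const StubSentenceMetadata& smeta)
    { src_len_ = smeta.GetSourceLength(); } // cached once per input
    score_t ScoreHyp(std::vector<WordID>& hyp, std::vector<WordID>& ref, unsigned rank)
    { return scorer_->Score(hyp, ref, rank, src_len_); } // forwarded to the metric
  };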
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 66ca1706..56702b86 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -44,7 +44,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
 {
   sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score);
   unsigned sz = s->size();
-  unsigned sep = sz * hi_lo;
+  unsigned sep = round(sz*hi_lo);
   for (unsigned i = 0; i < sep; i++) {
     for (unsigned j = sep; j < sz; j++) {
       if ((*s)[i].rank < (*s)[j].rank) {
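The one-line partXYX fix above replaces truncation with rounding when the sorted k-best list is split into 'hi' and 'lo' partitions. The difference shows up for short lists, where sz*hi_lo falls below 1 and truncation empties the top partition. A toy check (hi_lo=0.1 as in the example config further down; std::round mirrors the diff's round()):

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const float hi_lo = 0.1f;
    const unsigned sizes[] = {5, 15, 100};
    for (unsigned t = 0; t < 3; t++) {
      unsigned sz = sizes[t];
      unsigned sep_trunc = sz * hi_lo;             // old behavior: implicit floor
      unsigned sep_round = std::round(sz * hi_lo); // new behavior
      std::printf("sz=%3u  floor=%2u  round=%2u\n", sz, sep_trunc, sep_round);
    }
    // sz=5 and sz=15 differ (floor gives 0 and 1, round gives 1 and 2);
    // with sep=0 the shown loop over the 'hi' partition would produce no pairs.
    return 0;
  }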
diff --git a/dtrain/score.cc b/dtrain/score.cc
index d964b4da..d0f9e8a0 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -16,23 +16,23 @@ namespace dtrain
 score_t
 BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
 {
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0;
   for (unsigned i = 0; i < M; i++) {
-    if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.;
+    sum += w_[i] * log((score_t)counts.clipped_[i]/counts.sum_[i]);
   }
   return brevity_penalty(hyp_len, ref_len) * exp(sum);
 }
 
 score_t
 BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                  const unsigned /*rank*/)
+                  const unsigned /*rank*/, const unsigned /*src_len*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
   return Bleu(counts, hyp_len, ref_len);
 }
@@ -49,18 +49,18 @@ BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  */
 score_t
 StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned /*rank*/)
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
   score_t sum = 0, add = 0;
   for (unsigned i = 0; i < M; i++) {
-    if (i == 0 && (counts.clipped[i] == 0 || counts.sum[i] == 0)) return 0;
+    if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
     if (i == 1) add = 1;
-    sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));
+    sum += w_[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
   }
   return  brevity_penalty(hyp_len, ref_len) * exp(sum);
 }
@@ -76,10 +76,10 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  */
 score_t
 SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned /*rank*/)
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0.;
   NgramCounts counts = make_ngram_counts(hyp, ref, N_);
   unsigned M = N_;
   if (ref_len < N_) M = ref_len;
@@ -87,10 +87,10 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
   vector<score_t> i_bleu;
   for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.);
   for (unsigned i = 0; i < M; i++) {
-    if (counts.clipped[i] == 0 || counts.sum[i] == 0) {
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) {
       break;
     } else {
-      score_t i_ng = log((score_t)counts.clipped[i]/counts.sum[i]);
+      score_t i_ng = log((score_t)counts.clipped_[i]/counts.sum_[i]);
       for (unsigned j = i; j < M; j++) {
         i_bleu[j] += (1/((score_t)j+1)) * i_ng;
       }
@@ -107,29 +107,29 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  *        and Structural Translation Features"
  * (Chiang et al. '08)
  *
- * NOTE: needs some code in dtrain.cc
+ * NOTE: needs some more code in dtrain.cc
  */
 score_t
 ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
-                        const unsigned rank)
+                        const unsigned rank, const unsigned src_len)
 {
   unsigned hyp_len = hyp.size(), ref_len = ref.size();
-  if (hyp_len == 0 || ref_len == 0) return 0;
-  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
-  NgramCounts tmp(N_);
+  if (ref_len == 0) return 0.;
+  score_t score = 0.;
+  NgramCounts counts(N_);
+  if (hyp_len > 0) {
+    counts = make_ngram_counts(hyp, ref, N_);
+    NgramCounts tmp = glob_onebest_counts_ + counts;
+    score = Bleu(tmp, hyp_len, ref_len);
+  }
   if (rank == 0) { // 'context of 1best translations'
-    glob_onebest_counts += counts;
-    glob_hyp_len += hyp_len;
-    glob_ref_len += ref_len;
-    hyp_len = glob_hyp_len;
-    ref_len = glob_ref_len;
-    tmp = glob_onebest_counts;
-  } else {
-    hyp_len = hyp.size();
-    ref_len = ref.size();
-    tmp = glob_onebest_counts + counts;
+    glob_onebest_counts_ += counts;
+    glob_onebest_counts_ *= discount_;
+    glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_len);
+    glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len);
+    glob_src_len_ = discount_ * (glob_src_len_ + src_len);
   }
-  return 0.9 * Bleu(tmp, hyp_len, ref_len); // TODO param
+  return (score_t)glob_src_len_ * score;
 }
 
 } // namespace
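ApproxBleuScorer now maintains an exponentially discounted 'context of 1-best translations': on every 1-best hypothesis the accumulated counts and lengths are updated as history = discount * (history + current), so each additional update multiplies an old sentence's contribution by another factor of the discount, and the returned score is scaled by the discounted accumulated source length (glob_src_len_). A toy trace of that recurrence (lengths purely illustrative; the diff stores these in unsigned members, so the discounted values are truncated there, while this sketch keeps double to make the arithmetic visible):

  #include <cstdio>

  int main()
  {
    const double d = 0.9;       // approx_bleu_d
    double glob_hyp_len = 0.;   // plays the role of glob_hyp_len_
    const unsigned hyp_lens[] = {20, 25, 18}; // hypothetical 1-best lengths
    for (unsigned i = 0; i < 3; i++) {
      glob_hyp_len = d * (glob_hyp_len + hyp_lens[i]); // same update as above
      std::printf("after 1-best %u: %.3f\n", i + 1, glob_hyp_len);
    }
    // prints 18.000, 38.700, 51.030
    return 0;
  }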
diff --git a/dtrain/score.h b/dtrain/score.h
index 5aceb81f..d0e79f65 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -12,8 +12,8 @@ namespace dtrain
 struct NgramCounts
 {
   unsigned N_;
-  map<unsigned, unsigned> clipped;
-  map<unsigned, unsigned> sum;
+  map<unsigned, score_t> clipped_;
+  map<unsigned, score_t> sum_;
 
   NgramCounts(const unsigned N) : N_(N) { Zero(); }
 
@@ -22,8 +22,8 @@ struct NgramCounts
   {
     assert(N_ == rhs.N_);
     for (unsigned i = 0; i < N_; i++) {
-      this->clipped[i] += rhs.clipped.find(i)->second;
-      this->sum[i] += rhs.sum.find(i)->second;
+      this->clipped_[i] += rhs.clipped_.find(i)->second;
+      this->sum_[i] += rhs.sum_.find(i)->second;
     }
   }
 
@@ -36,15 +36,24 @@ struct NgramCounts
   }
 
   inline void
+  operator*=(const score_t rhs)
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      this->clipped_[i] *= rhs;
+      this->sum_[i] *= rhs;
+    }
+  }
+
+  inline void
   Add(const unsigned count, const unsigned ref_count, const unsigned i)
   {
     assert(i < N_);
     if (count > ref_count) {
-      clipped[i] += ref_count;
+      clipped_[i] += ref_count;
     } else {
-      clipped[i] += count;
+      clipped_[i] += count;
     }
-    sum[i] += count;
+    sum_[i] += count;
   }
 
   inline void
@@ -52,8 +61,8 @@ struct NgramCounts
   {
     unsigned i;
     for (i = 0; i < N_; i++) {
-      clipped[i] = 0;
-      sum[i] = 0;
+      clipped_[i] = 0;
+      sum_[i] = 0;
     }
   }
 
@@ -61,8 +70,8 @@ struct NgramCounts
   Print()
   {
     for (unsigned i = 0; i < N_; i++) {
-      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
-      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+      cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
+      cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
     }
   }
 };
@@ -106,35 +115,36 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
 struct BleuScorer : public LocalScorer
 {
   score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
 struct StupidBleuScorer : public LocalScorer
 {
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
 struct SmoothBleuScorer : public LocalScorer
 {
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
 struct ApproxBleuScorer : public BleuScorer
 {
-  NgramCounts glob_onebest_counts;
-  unsigned glob_hyp_len, glob_ref_len;
+  NgramCounts glob_onebest_counts_;
+  unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_;
+  score_t discount_;
 
-  ApproxBleuScorer(unsigned N) : glob_onebest_counts(NgramCounts(N))
+  ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d)
   {
-    glob_hyp_len = glob_ref_len = 0;
+    glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0;
   }
 
   inline void Reset() {
-    glob_onebest_counts.Zero();
-    glob_hyp_len = glob_ref_len = 0;
+    glob_onebest_counts_.Zero();
+    glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
   }
 
-  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
 };
 
 } // namespace
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index cd2c75e7..2ad44688 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -4,18 +4,18 @@ decoder_config=test/example/cdec.ini # config for cdec
 # weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
 tmp=/tmp
-stop_after=100 # stop epoch after 100 inputs
+stop_after=20 # stop epoch after 20 inputs
 # interesting stuff
 epochs=3                # run over input 3 times
 k=100                   # use 100best lists
 N=4                     # optimize (approx) BLEU4
-scorer=approx_bleu      # use 'stupid' BLEU+1
+scorer=stupid_bleu      # use 'stupid' BLEU+1
 learning_rate=0.0001    # learning rate
 gamma=0                 # use SVM reg
 sample_from=kbest       # use kbest lists (as opposed to forest)
 filter=uniq             # only unique entries in kbest (surface form)
 pair_sampling=XYX
-hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10
+hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
 pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
 select_weights=VOID     # don't output weights
 
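With clipped_ and sum_ now holding score_t, NgramCounts supports all the arithmetic the discounted scorer needs: operator+= to fold in 1-best counts, the new operator*= to decay them, and operator+ (defined outside the shown hunks, as used in ApproxBleuScorer::Score above) to merge the history with a hypothesis' counts. A usage sketch, assuming dtrain's score.h is on the include path; ngram_counts_demo is illustrative only:

  #include "score.h" // dtrain
  using namespace dtrain;

  void ngram_counts_demo()
  {
    NgramCounts hist(4), cur(4);     // zero-initialized 4-gram statistics
    cur.Add(3, 5, 0);                // 3 matching unigrams, 5 in the reference:
                                     // clipped_[0] += 3, sum_[0] += 3
    NgramCounts merged = hist + cur; // history + current hypothesis counts
    hist += cur;                     // accumulate the 1-best...
    hist *= 0.9;                     // ...then discount the whole history
    merged.Print();                  // dumps per-order clipped/total counts
  }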
