From 62c805c90c5347b844f92574e240db5c65578e12 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 31 May 2012 14:33:59 +0200 Subject: new scorer, stuff --- dtrain/README.md | 3 ++- dtrain/dtrain.cc | 75 +++++++++++++++++++++++++++------------------------ dtrain/pairsampling.h | 6 ++--- dtrain/score.cc | 35 +++++++++++++++++++++++- dtrain/score.h | 49 ++++++++++++++++++++++++++++++--- 5 files changed, 125 insertions(+), 43 deletions(-) diff --git a/dtrain/README.md b/dtrain/README.md index 350c7423..7aefcc55 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -41,7 +41,7 @@ DTRAIN_LOCAL. Next ---- -+ approx. Bleu? ++ approx. Bleu? proper lc_bleu (init with X) + turn off inclusion + (dtrain|decoder) more meta-parameters testing + feature selection directly in dtrain @@ -57,6 +57,7 @@ Next + resharding [nfold cross val.] + bigger LM, feats (target side Ng., word alignments etc.) + merge kbest lists ++ proper eval, pairwise ranking, forced transl Legal ----- diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 717d47a2..88413a1d 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -6,38 +6,39 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) { po::options_description ini("Configuration File Options"); ini.add_options() - ("input", po::value()->default_value("-"), "input file") - ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") - ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") - ("decoder_config", po::value(), "configuration file for cdec") - ("print_weights", po::value(), "weights to print on each iteration") - ("stop_after", po::value()->default_value(0), "stop after X input sentences") - ("tmp", po::value()->default_value("/tmp"), "temp dir to use") - ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") - ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") - ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") - ("k", po::value()->default_value(100), "how many translations to sample") - ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") - ("filter", po::value()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") - ("pair_sampling", po::value()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") - ("hi_lo", po::value()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") - ("pair_threshold", po::value()->default_value(0.), "bleu [0,1] threshold to filter pairs") - ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") - ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_") - ("learning_rate", po::value()->default_value(0.0001), "learning rate") - ("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)") - ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") - ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") - ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") - ("l1_reg_strength", po::value(), "l1 regularization strength") - ("fselect", po::value()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch") - ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") - ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") - ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") + ("input", po::value()->default_value("-"), "input file") + ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") + ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value(), "configuration file for cdec") + ("print_weights", po::value(), "weights to print on each iteration") + ("stop_after", po::value()->default_value(0), "stop after X input sentences") + ("tmp", po::value()->default_value("/tmp"), "temp dir to use") + ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") + ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") + ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") + ("k", po::value()->default_value(100), "how many translations to sample") + ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") + ("filter", po::value()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") + ("pair_sampling", po::value()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") + ("hi_lo", po::value()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") + ("pair_threshold", po::value()->default_value(0.), "bleu [0,1] threshold to filter pairs") + ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") + ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_") + ("learning_rate", po::value()->default_value(0.0001), "learning rate") + ("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)") + ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") + ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") + ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") + ("l1_reg_strength", po::value(), "l1 regularization strength") + ("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO + ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") + ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") + ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") + ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.") #ifdef DTRAIN_LOCAL - ("refs,r", po::value(), "references in local mode") + ("refs,r", po::value(), "references in local mode") #endif - ("noup", po::value()->zero_tokens(), "do not update weights"); + ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() ("config,c", po::value(), "dtrain config file") @@ -135,6 +136,7 @@ main(int argc, char** argv) const string select_weights = cfg["select_weights"].as(); const float hi_lo = cfg["hi_lo"].as(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as(); + const unsigned max_pairs = cfg["max_pairs"].as(); weight_t loss_margin = cfg["loss_margin"].as(); if (loss_margin > 9998.) loss_margin = std::numeric_limits::max(); bool scale_bleu_diff = false; @@ -167,6 +169,8 @@ main(int argc, char** argv) scorer = dynamic_cast(new SmoothSingleBleuScorer); } else if (scorer_str == "approx_bleu") { scorer = dynamic_cast(new ApproxBleuScorer(N, approx_bleu_d)); + } else if (scorer_str == "lc_bleu") { + scorer = dynamic_cast(new LinearBleuScorer(N)); } else { cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl; exit(1); @@ -257,6 +261,7 @@ main(int argc, char** argv) cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as() << "'" << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; + cerr << "max pairs " << max_pairs << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; #ifdef DTRAIN_LOCAL @@ -421,17 +426,17 @@ main(int argc, char** argv) // get pairs vector > pairs; if (pair_sampling == "all") - all_pairs(samples, pairs, pair_threshold); + all_pairs(samples, pairs, pair_threshold, max_pairs); if (pair_sampling == "XYX") - partXYX(samples, pairs, pair_threshold, hi_lo); + partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo); if (pair_sampling == "PRO") - PROsampling(samples, pairs, pair_threshold); + PROsampling(samples, pairs, pair_threshold, max_pairs); npairs += pairs.size(); for (vector >::iterator it = pairs.begin(); it != pairs.end(); it++) { #ifdef DTRAIN_FASTER_PERCEPTRON - bool rank_error = true; // pair filtering already did this for us + bool rank_error = true; // pair sampling already did this for us rank_errors++; score_t margin = std::numeric_limits::max(); #else @@ -498,7 +503,7 @@ main(int argc, char** argv) if (average) w_average += lambdas; - if (scorer_str == "approx_bleu") scorer->Reset(); + if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); if (t == 0) { in_sz = ii; // remember size of input (# lines) diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 32006a41..71c8ae59 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -19,7 +19,7 @@ cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) } inline void -all_pairs(vector* s, vector >& training, score_t threshold, float _unused=1) +all_pairs(vector* s, vector >& training, score_t threshold, unsigned max, float _unused=1) { sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); @@ -44,7 +44,7 @@ all_pairs(vector* s, vector >& training, sc */ inline void -partXYX(vector* s, vector >& training, score_t threshold, float hi_lo) +partXYX(vector* s, vector >& training, score_t threshold, unsigned max, float hi_lo) { unsigned sz = s->size(); if (sz < 2) return; @@ -104,7 +104,7 @@ _PRO_cmp_pair_by_diff_d(pair a, pair b return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); } inline void -PROsampling(vector* s, vector >& training, score_t threshold, float _unused=1) +PROsampling(vector* s, vector >& training, score_t threshold, unsigned max, float _unused=1) { unsigned max_count = 5000, count = 0, sz = s->size(); bool b = false; diff --git a/dtrain/score.cc b/dtrain/score.cc index b331dc4f..5c356c0f 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -122,12 +122,13 @@ SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, unsigned j = 1; for (unsigned i = 0; i < M; i++) { if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; - sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-j+1); + sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2., N_-j+1); j++; } return brevity_penalty(hyp_len, ref_len) * sum; } + /* * approx. bleu * @@ -160,6 +161,38 @@ ApproxBleuScorer::Score(vector& hyp, vector& ref, return (score_t)glob_src_len_ * score; } +/* + * Linear (Corpus) Bleu + * + * as in "Lattice Minimum Bayes-Risk Decoding + * for Statistical Machine Translation" + * (Tromble et al. '08) + * + */ +score_t +LinearBleuScorer::Score(vector& hyp, vector& ref, + const unsigned rank, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (ref_len == 0) return 0.; + unsigned M = N_; + if (ref_len < N_) M = ref_len; + NgramCounts counts(M); + if (hyp_len > 0) + counts = make_ngram_counts(hyp, ref, M); + score_t ret = 0.; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break; + ret += counts.sum_[i]/onebest_counts_.sum_[i]; + } + ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret; + if (rank == 0) { + onebest_len_ += hyp_len; + onebest_counts_ += counts; + } + return ret; +} + } // namespace diff --git a/dtrain/score.h b/dtrain/score.h index d4fba22c..c5be2829 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -20,7 +20,7 @@ struct NgramCounts inline void operator+=(const NgramCounts& rhs) { - assert(N_ == rhs.N_); + if (rhs.N_ > N_) Resize(rhs.N_); for (unsigned i = 0; i < N_; i++) { this->clipped_[i] += rhs.clipped_.find(i)->second; this->sum_[i] += rhs.sum_.find(i)->second; @@ -59,13 +59,21 @@ struct NgramCounts inline void Zero() { - unsigned i; - for (i = 0; i < N_; i++) { + for (unsigned i = 0; i < N_; i++) { clipped_[i] = 0.; sum_[i] = 0.; } } + inline void + One() + { + for (unsigned i = 0; i < N_; i++) { + clipped_[i] = 1.; + sum_[i] = 1.; + } + } + inline void Print() { @@ -74,6 +82,23 @@ struct NgramCounts cout << i+1 << "grams:\t\t\t" << sum_[i] << endl; } } + + inline void Resize(unsigned N) + { + if (N == N_) return; + else if (N > N_) { + for (unsigned i = N_; i < N; i++) { + clipped_[i] = 0.; + sum_[i] = 0.; + } + } else { // N < N_ + for (unsigned i = N_-1; i > N-1; i--) { + clipped_.erase(i); + sum_.erase(i); + } + } + N_ = N; + } }; typedef map, unsigned> Ngrams; @@ -152,6 +177,24 @@ struct ApproxBleuScorer : public BleuScorer score_t Score(vector& hyp, vector& ref, const unsigned rank, const unsigned src_len); }; +struct LinearBleuScorer : public BleuScorer +{ + unsigned onebest_len_; + NgramCounts onebest_counts_; + + LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N) + { + onebest_counts_.One(); + } + + score_t Score(vector& hyp, vector& ref, const unsigned rank, const unsigned /*src_len*/); + + inline void Reset() { + onebest_len_ = 1; + onebest_counts_.One(); + } +}; + } // namespace -- cgit v1.2.3