Diffstat (limited to 'dtrain')
-rw-r--r--   dtrain/Makefile.am                    |   2
-rw-r--r--   dtrain/README.md                      |  10
-rw-r--r--   dtrain/dtrain.cc                      |  97
-rw-r--r--   dtrain/dtrain.h                       |  14
-rw-r--r--   dtrain/ksampler.h                     |   7
-rw-r--r--   dtrain/pairsampling.h                 |  49
-rw-r--r--   dtrain/score.cc                       | 117
-rw-r--r--   dtrain/score.h                        |  64
-rw-r--r--   dtrain/test/example/README            |   4
-rw-r--r--   dtrain/test/example/dtrain.ini        |   3
-rw-r--r--   dtrain/test/example/expected-output   | 125
11 files changed, 421 insertions, 71 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index f39d161e..64fef489 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
 
 dtrain_SOURCES = dtrain.cc score.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README.md b/dtrain/README.md
index 9580df6d..7edabbf1 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -39,16 +39,6 @@ For an example of local usage (with the 'distributed' format)
 the see test/example/ . This expects dtrain to be built without
 DTRAIN_LOCAL.
 
-Next
-----
-+ (dtrain|decoder) more meta-parameters testing
-+ feature selection directly in dtrain
-+ feature template: target side rule ngrams
-+ sa-extract -> leave-one-out for grammar of training set?
-+ make svm doable; no subgradient?
-+ reranking while sgd?
-+ try PRO, mira emulations
-
 Legal
 -----
 Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index d9bce843..b3e62914 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,37 +6,39 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("input",             po::value<string>()->default_value("-"),                                      "input file")
-    ("output",            po::value<string>()->default_value("-"),             "output weights file, '-' for STDOUT")
-    ("input_weights",     po::value<string>(),                   "input weights file (e.g. from previous iteration)")
-    ("decoder_config",    po::value<string>(),                                         "configuration file for cdec")
-    ("print_weights",     po::value<string>(),                                  "weights to print on each iteration")
-    ("stop_after",        po::value<unsigned>()->default_value(0),                    "stop after X input sentences")
-    ("tmp",               po::value<string>()->default_value("/tmp"),                              "temp dir to use")
-    ("keep",              po::value<bool>()->zero_tokens(),                  "keep weights files for each iteration")
-    ("hstreaming",        po::value<string>(),                      "run in hadoop streaming mode, arg is a task id")
-    ("epochs",            po::value<unsigned>()->default_value(10),                  "# of iterations T (per shard)")
-    ("k",                 po::value<unsigned>()->default_value(100),               "how many translations to sample")
-    ("sample_from",       po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
-    ("filter",            po::value<string>()->default_value("uniq"),             "filter kbest list: 'not', 'uniq'")
-    ("pair_sampling",     po::value<string>()->default_value("XYX"),    "how to sample pairs: 'all', 'XYX' or 'PRO'")
-    ("hi_lo",             po::value<float>()->default_value(0.1),      "hi and lo (X) for XYX (default 0.1), <= 0.5")
-    ("pair_threshold",    po::value<score_t>()->default_value(0.),            "bleu [0,1] threshold to filter pairs")
-    ("N",                 po::value<unsigned>()->default_value(4),                             "N for Ngrams (BLEU)")
-    ("scorer",            po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_")
-    ("learning_rate",     po::value<weight_t>()->default_value(0.0001),                              "learning rate")
-    ("gamma",             po::value<weight_t>()->default_value(0.),               "gamma for SVM (0 for perceptron)")
-    ("select_weights",    po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
-    ("rescale",           po::value<bool>()->zero_tokens(),                 "rescale weight vector after each input")
-    ("l1_reg",            po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
-    ("l1_reg_strength",   po::value<weight_t>(),                                        "l1 regularization strength")
-    ("fselect",           po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch")
-    ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                      "discount for approx. BLEU")
-    ("scale_bleu_diff",   po::value<bool>()->zero_tokens(),         "learning rate <- bleu diff of a misranked pair")
+    ("input",             po::value<string>()->default_value("-"),                                      "input file")
+    ("output",            po::value<string>()->default_value("-"),             "output weights file, '-' for STDOUT")
+    ("input_weights",     po::value<string>(),                   "input weights file (e.g. from previous iteration)")
+    ("decoder_config",    po::value<string>(),                                         "configuration file for cdec")
+    ("print_weights",     po::value<string>(),                                  "weights to print on each iteration")
+    ("stop_after",        po::value<unsigned>()->default_value(0),                    "stop after X input sentences")
+    ("tmp",               po::value<string>()->default_value("/tmp"),                              "temp dir to use")
+    ("keep",              po::value<bool>()->zero_tokens(),                  "keep weights files for each iteration")
+    ("hstreaming",        po::value<string>(),                      "run in hadoop streaming mode, arg is a task id")
+    ("epochs",            po::value<unsigned>()->default_value(10),                  "# of iterations T (per shard)")
+    ("k",                 po::value<unsigned>()->default_value(100),               "how many translations to sample")
+    ("sample_from",       po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
+    ("filter",            po::value<string>()->default_value("uniq"),             "filter kbest list: 'not', 'uniq'")
+    ("pair_sampling",     po::value<string>()->default_value("XYX"),    "how to sample pairs: 'all', 'XYX' or 'PRO'")
+    ("hi_lo",             po::value<float>()->default_value(0.1),      "hi and lo (X) for XYX (default 0.1), <= 0.5")
+    ("pair_threshold",    po::value<score_t>()->default_value(0.),            "bleu [0,1] threshold to filter pairs")
+    ("N",                 po::value<unsigned>()->default_value(4),                             "N for Ngrams (BLEU)")
+    ("scorer",            po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_")
+    ("learning_rate",     po::value<weight_t>()->default_value(0.0001),                              "learning rate")
+    ("gamma",             po::value<weight_t>()->default_value(0.),               "gamma for SVM (0 for perceptron)")
+    ("select_weights",    po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
+    ("rescale",           po::value<bool>()->zero_tokens(),                 "rescale weight vector after each input")
+    ("l1_reg",            po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+    ("l1_reg_strength",   po::value<weight_t>(),                                        "l1 regularization strength")
+    ("fselect",           po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO
+    ("approx_bleu_d",     po::value<score_t>()->default_value(0.9),                      "discount for approx. BLEU")
+    ("scale_bleu_diff",   po::value<bool>()->zero_tokens(),         "learning rate <- bleu diff of a misranked pair")
+    ("loss_margin",       po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
+    ("max_pairs",         po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
 #ifdef DTRAIN_LOCAL
-    ("refs,r",            po::value<string>(),                                            "references in local mode")
+    ("refs,r",            po::value<string>(),                                            "references in local mode")
 #endif
-    ("noup",              po::value<bool>()->zero_tokens(),                                  "do not update weights");
+    ("noup",              po::value<bool>()->zero_tokens(),                                  "do not update weights");
   po::options_description cl("Command Line Options");
   cl.add_options()
     ("config,c",         po::value<string>(),              "dtrain config file")
@@ -134,6 +136,9 @@ main(int argc, char** argv)
   const string select_weights = cfg["select_weights"].as<string>();
   const float hi_lo = cfg["hi_lo"].as<float>();
   const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+  const unsigned max_pairs = cfg["max_pairs"].as<unsigned>();
+  weight_t loss_margin = cfg["loss_margin"].as<weight_t>();
+  if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max();
   bool scale_bleu_diff = false;
   if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
   bool average = false;
@@ -160,8 +165,16 @@ main(int argc, char** argv)
     scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
   } else if (scorer_str == "smooth_bleu") {
     scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+  } else if (scorer_str == "sum_bleu") {
+    scorer = dynamic_cast<SumBleuScorer*>(new SumBleuScorer);
+  } else if (scorer_str == "sumexp_bleu") {
+    scorer = dynamic_cast<SumExpBleuScorer*>(new SumExpBleuScorer);
+  } else if (scorer_str == "sumwhatever_bleu") {
+    scorer = dynamic_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer);
   } else if (scorer_str == "approx_bleu") {
     scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
+  } else if (scorer_str == "lc_bleu") {
+    scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N));
   } else {
     cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
     exit(1);
@@ -220,7 +233,7 @@ main(int argc, char** argv)
   grammar_buf_out.open(grammar_buf_fn.c_str());
 #endif
 
-  unsigned in_sz = UINT_MAX; // input index, input size
+  unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
   vector<pair<score_t, score_t> > all_scores;
   score_t max_score = 0.;
   unsigned best_it = 0;
@@ -242,6 +255,7 @@ main(int argc, char** argv)
     if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
     else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
     cerr << setw(25) << "gamma " << gamma << endl;
+    cerr << setw(25) << "loss margin " << loss_margin << endl;
     cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
     if (pair_sampling == "XYX")
       cerr << setw(25) << "hi lo " << hi_lo << endl;
@@ -251,6 +265,7 @@ main(int argc, char** argv)
       cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
     if (rescale)
       cerr << setw(25) << "rescale " << rescale << endl;
+    cerr << setw(25) << "max pairs " << max_pairs << endl;
     cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
     cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
 #ifdef DTRAIN_LOCAL
@@ -415,21 +430,27 @@ main(int argc, char** argv)
       // get pairs
       vector<pair<ScoredHyp,ScoredHyp> > pairs;
       if (pair_sampling == "all")
-        all_pairs(samples, pairs, pair_threshold);
+        all_pairs(samples, pairs, pair_threshold, max_pairs);
       if (pair_sampling == "XYX")
-        partXYX(samples, pairs, pair_threshold, hi_lo);
+        partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo);
       if (pair_sampling == "PRO")
-        PROsampling(samples, pairs, pair_threshold);
+        PROsampling(samples, pairs, pair_threshold, max_pairs);
       npairs += pairs.size();
 
       for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
            it != pairs.end(); it++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+        bool rank_error = true; // pair sampling already did this for us
+        rank_errors++;
+        score_t margin = std::numeric_limits<float>::max();
+#else
         bool rank_error = it->first.model <= it->second.model;
         if (rank_error) rank_errors++;
-        score_t margin = fabs(it->first.model - it->second.model);
-        if (!rank_error && margin < 1) margin_violations++;
+        score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model));
+        if (!rank_error && margin < loss_margin) margin_violations++;
+#endif
         if (scale_bleu_diff) eta = it->first.score - it->second.score;
-        if (rank_error || (gamma && margin<1)) {
+        if (rank_error || margin < loss_margin) {
           SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
           lambdas.plus_eq_v_times_s(diff_vec, eta);
           if (gamma)
@@ -486,7 +507,7 @@ main(int argc, char** argv)
 
   if (average) w_average += lambdas;
 
-  if (scorer_str == "approx_bleu") scorer->Reset();
+  if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset();
 
   if (t == 0) {
     in_sz = ii; // remember size of input (# lines)
@@ -534,8 +555,10 @@ main(int argc, char** argv)
     cerr << _np << npairs/(float)in_sz << endl;
     cerr << "        avg # rank err: ";
     cerr << rank_errors/(float)in_sz << endl;
+#ifndef DTRAIN_FASTER_PERCEPTRON
     cerr << "     avg # margin viol: ";
     cerr << margin_violations/(float)in_sz << endl;
+#endif
     cerr << "    non0 feature count: " <<  nonz << endl;
     cerr << "           avg list sz: " << list_sz/(float)in_sz << endl;
     cerr << "           avg f count: " << f_count/(float)list_sz << endl;
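[Note: The dtrain.cc hunks above replace the old hard-coded "margin < 1" test with the configurable loss_margin and measure the margin between absolute model scores. A compilable, self-contained sketch of the resulting pairwise update; FeatureVector, Pair and update() are simplified stand-ins for SparseVector<weight_t>/ScoredHyp, not the real dtrain API:]

    // Sketch of the margin-perceptron update from the patch above (simplified types).
    #include <cmath>
    #include <cstddef>
    #include <map>
    #include <string>
    #include <vector>

    typedef std::map<std::string, double> FeatureVector; // stand-in for SparseVector<weight_t>

    struct Pair {
      FeatureVector f_good, f_bad;  // features of the higher-/lower-BLEU hypothesis
      double model_good, model_bad; // model scores
      double bleu_good, bleu_bad;   // metric scores
    };

    void
    update(FeatureVector& lambdas, const std::vector<Pair>& pairs,
           double eta, double loss_margin, bool scale_bleu_diff)
    {
      for (size_t p = 0; p < pairs.size(); p++) {
        // rank error: the model prefers the hypothesis with lower BLEU
        bool rank_error = pairs[p].model_good <= pairs[p].model_bad;
        double margin = std::fabs(std::fabs(pairs[p].model_good)
                                  - std::fabs(pairs[p].model_bad));
        if (scale_bleu_diff) eta = pairs[p].bleu_good - pairs[p].bleu_bad;
        if (rank_error || margin < loss_margin) {
          // lambdas += eta * (f_good - f_bad)
          for (FeatureVector::const_iterator it = pairs[p].f_good.begin();
               it != pairs[p].f_good.end(); ++it)
            lambdas[it->first] += eta * it->second;
          for (FeatureVector::const_iterator it = pairs[p].f_bad.begin();
               it != pairs[p].f_bad.end(); ++it)
            lambdas[it->first] -= eta * it->second;
        }
      }
    }

[With loss_margin = 0 this reduces to a plain perceptron step on misranked pairs; scale_bleu_diff swaps the fixed learning rate for the pair's BLEU difference.]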
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 94d149ce..7e084a79 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -1,6 +1,14 @@
 #ifndef _DTRAIN_H_
 #define _DTRAIN_H_
 
+#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
+                                // DO NOT USE WITH SVM!
+#define DTRAIN_LOCAL
+#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
+#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+#define DTRAIN_SCALE 100000
+
+
 #include <iomanip>
 #include <climits>
 #include <string.h>
@@ -13,11 +21,7 @@
 
 #include "filelib.h"
 
-#undef DTRAIN_LOCAL
-#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
-#define DTRAIN_SCALE 100000
 
 using namespace std;
 using namespace dtrain;
@@ -32,7 +36,7 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
 
 inline string gettmpf(const string path, const string infix)
 {
-  char fn[1024];
+  char fn[path.size() + infix.size() + 8];
   strcpy(fn, path.c_str());
   strcat(fn, "/");
   strcat(fn, infix.c_str());
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index f52fb649..bc2f56cd 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -8,6 +8,11 @@
 namespace dtrain
 {
 
+bool
+cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b)
+{
+  return a.model > b.model;
+}
 
 struct KSampler : public HypSampler
 {
@@ -44,6 +49,8 @@ struct KSampler : public HypSampler
       sz_++;
       f_count_ += h.f.size();
     }
+    sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
+    for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
   }
 };
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index bac132c6..84be1efb 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -19,10 +19,12 @@ cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
 }
 
 inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
 {
   sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sz = s->size();
+  bool b = false;
+  unsigned count = 0;
   for (unsigned i = 0; i < sz-1; i++) {
     for (unsigned j = i+1; j < sz; j++) {
       if (threshold > 0) {
@@ -32,7 +34,12 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
         if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
+      if (++count == max) {
+        b = true;
+        break;
+      }
     }
+    if (b) break;
   }
 }
 
@@ -44,13 +51,22 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
  */
 inline void
-partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float hi_lo)
 {
-  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sz = s->size();
+  if (sz < 2) return;
+  sort(s->begin(), s->end(), cmp_hyp_by_score_d);
   unsigned sep = round(sz*hi_lo);
-  for (unsigned i = 0; i < sep; i++) {
-    for (unsigned j = sep; j < sz; j++) {
+  unsigned sep_hi = sep;
+  if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
+  else sep_hi = 1;
+  bool b = false;
+  unsigned count = 0;
+  for (unsigned i = 0; i < sep_hi; i++) {
+    for (unsigned j = sep_hi; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      if ((*s)[i].model <= (*s)[j].model) {
+#endif
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -58,10 +74,23 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
+      if (++count == max) {
+        b = true;
+        break;
+      }
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      }
+#endif
     }
+    if (b) break;
   }
-  for (unsigned i = sep; i < sz-sep; i++) {
-    for (unsigned j = sz-sep; j < sz; j++) {
+  unsigned sep_lo = sz-sep;
+  while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
+  for (unsigned i = sep_hi; i < sz-sep_lo; i++) {
+    for (unsigned j = sz-sep_lo; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      if ((*s)[i].model <= (*s)[j].model) {
+#endif
       if (threshold > 0) {
         if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
           training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -69,6 +98,10 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
         if ((*s)[i].score != (*s)[j].score)
           training.push_back(make_pair((*s)[i], (*s)[j]));
       }
+      if (++count == max) return;
+#ifdef DTRAIN_FASTER_PERCEPTRON
+      }
+#endif
     }
   }
 }
 
@@ -86,7 +119,7 @@ _PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b
   return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
 }
 inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
 {
   unsigned max_count = 5000, count = 0, sz = s->size();
   bool b = false;
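[Note: a worked reading of partXYX above, using the shipped defaults (k = 100 samples, hi_lo = 0.1): sep = round(100 * 0.1) = 10, so the first loop pairs each of the top 10 hypotheses (sorted by BLEU) with the 90 below them, and the second loop pairs the middle block with the bottom 10 (the "10 vs 90 and 80 vs 10" of the example config). The new sep_hi/sep_lo adjustments slide each boundary across runs of tied BLEU scores so that equally-scored hypotheses never face each other, the count/max logic stops sampling once max_pairs pairs have been collected, and with DTRAIN_FASTER_PERCEPTRON defined only pairs the current model actually misranks ((*s)[i].model <= (*s)[j].model) are kept, which is why dtrain.cc can count every kept pair as a rank error in that build.]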
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 7b1f6be4..4a7cac6e 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -80,7 +80,7 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  *        to Machine Translation"
  * (Liang et al. '06)
  *
- * NOTE: max is 0.9375
+ * NOTE: max is 0.9375 (with N=4)
  */
 score_t
 SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
@@ -103,7 +103,83 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
         i_bleu[j] += (1/((score_t)j+1)) * i_ng;
       }
     }
-    sum += exp(i_bleu[i])/(pow(2.0, static_cast<double>(N_-i)));
+    sum += exp(i_bleu[i])/(pow(2.0, N_-i));
   }
   return brevity_penalty(hyp_len, ref_len) * sum;
 }
+
+/*
+ * 'sum' bleu
+ *
+ * sum up Ngram precisions
+ */
+score_t
+SumBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  if (hyp_len == 0 || ref_len == 0) return 0.;
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
+  score_t sum = 0.;
+  unsigned j = 1;
+  for (unsigned i = 0; i < M; i++) {
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+    sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2., N_-j+1);
+    j++;
+  }
+  return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * 'sum' (exp) bleu
+ *
+ * sum up exp(Ngram precisions)
+ */
+score_t
+SumExpBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  if (hyp_len == 0 || ref_len == 0) return 0.;
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
+  score_t sum = 0.;
+  unsigned j = 1;
+  for (unsigned i = 0; i < M; i++) {
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+    sum += exp(((score_t)counts.clipped_[i]/counts.sum_[i]))/pow(2., N_-j+1);
+    j++;
+  }
+  return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * 'sum' (whatever) bleu
+ *
+ * sum up exp(weight * log(Ngram precisions))
+ */
+score_t
+SumWhateverBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  if (hyp_len == 0 || ref_len == 0) return 0.;
+  NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+  unsigned M = N_;
+  vector<score_t> v = w_;
+  if (ref_len < N_) {
+    M = ref_len;
+    for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+  }
+  score_t sum = 0.;
+  unsigned j = 1;
+  for (unsigned i = 0; i < M; i++) {
+    if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+    sum += exp(v[i] * log(((score_t)counts.clipped_[i]/counts.sum_[i])))/pow(2., N_-j+1);
+    j++;
+  }
+  return brevity_penalty(hyp_len, ref_len) * sum;
+}
@@ -115,7 +191,8 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
  *        and Structural Translation Features"
  * (Chiang et al. '08)
  *
- * NOTE: needs some more code in dtrain.cc
+ * NOTE: Needs some more code in dtrain.cc .
+ *       No scaling by src len.
  */
 score_t
 ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
@@ -137,7 +214,39 @@ ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
     glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len);
     glob_src_len_ = discount_ * (glob_src_len_ + src_len);
   }
-  return (score_t)glob_src_len_ * score;
+  return score;
+}
+
+/*
+ * Linear (Corpus) Bleu
+ *
+ * as in "Lattice Minimum Bayes-Risk Decoding
+ *        for Statistical Machine Translation"
+ * (Tromble et al. '08)
+ *
+ */
+score_t
+LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+                        const unsigned rank, const unsigned /*src_len*/)
+{
+  unsigned hyp_len = hyp.size(), ref_len = ref.size();
+  if (ref_len == 0) return 0.;
+  unsigned M = N_;
+  if (ref_len < N_) M = ref_len;
+  NgramCounts counts(M);
+  if (hyp_len > 0)
+    counts = make_ngram_counts(hyp, ref, M);
+  score_t ret = 0.;
+  for (unsigned i = 0; i < M; i++) {
+    if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break;
+    ret += counts.sum_[i]/onebest_counts_.sum_[i];
+  }
+  ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret;
+  if (rank == 0) {
+    onebest_len_ += hyp_len;
+    onebest_counts_ += counts;
+  }
+  return ret;
 }
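[Note: in equation form, the three new 'sum' scorers above differ only in how the clipped n-gram precision p_n = clipped_n / sum_n enters a weighted sum; with M = min(N, |ref|) and BP the brevity penalty, a transcription of the loops above (not a formula from the cited papers):]

    \mathrm{SumBleu}         = \mathrm{BP} \cdot \sum_{n=1}^{M} 2^{-(N-n+1)}\, p_n
    \mathrm{SumExpBleu}      = \mathrm{BP} \cdot \sum_{n=1}^{M} 2^{-(N-n+1)}\, e^{p_n}
    \mathrm{SumWhateverBleu} = \mathrm{BP} \cdot \sum_{n=1}^{M} 2^{-(N-n+1)}\, e^{w_n \log p_n}

[Each loop breaks at the first order n with a zero count, and the 2^{-(N-n+1)} weights grow with n, so higher-order precisions dominate: weight 1/2 at n = N, down to 2^{-N} for unigrams.]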
diff --git a/dtrain/score.h b/dtrain/score.h
index eb8ad912..f317c903 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -20,7 +20,7 @@ struct NgramCounts
   inline void
   operator+=(const NgramCounts& rhs)
   {
-    assert(N_ == rhs.N_);
+    if (rhs.N_ > N_) Resize(rhs.N_);
     for (unsigned i = 0; i < N_; i++) {
       this->clipped_[i] += rhs.clipped_.find(i)->second;
       this->sum_[i] += rhs.sum_.find(i)->second;
@@ -59,14 +59,22 @@ struct NgramCounts
   inline void
   Zero()
   {
-    unsigned i;
-    for (i = 0; i < N_; i++) {
+    for (unsigned i = 0; i < N_; i++) {
       clipped_[i] = 0.;
       sum_[i] = 0.;
     }
   }
 
   inline void
+  One()
+  {
+    for (unsigned i = 0; i < N_; i++) {
+      clipped_[i] = 1.;
+      sum_[i] = 1.;
+    }
+  }
+
+  inline void
   Print()
   {
     for (unsigned i = 0; i < N_; i++) {
@@ -74,6 +82,23 @@ struct NgramCounts
       cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
     }
   }
+
+  inline void Resize(unsigned N)
+  {
+    if (N == N_) return;
+    else if (N > N_) {
+      for (unsigned i = N_; i < N; i++) {
+        clipped_[i] = 0.;
+        sum_[i] = 0.;
+      }
+    } else { // N < N_
+      for (unsigned i = N_-1; i > N-1; i--) {
+        clipped_.erase(i);
+        sum_.erase(i);
+      }
+    }
+    N_ = N;
+  }
 };
 
 typedef map<vector<WordID>, unsigned> Ngrams;
@@ -128,6 +153,21 @@ struct SmoothBleuScorer : public LocalScorer
   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
 };
 
+struct SumBleuScorer : public LocalScorer
+{
+   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumExpBleuScorer : public LocalScorer
+{
+   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumWhateverBleuScorer : public LocalScorer
+{
+   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
 struct ApproxBleuScorer : public BleuScorer
 {
   NgramCounts glob_onebest_counts_;
@@ -147,6 +187,24 @@ struct ApproxBleuScorer : public BleuScorer
   score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
 };
 
+struct LinearBleuScorer : public BleuScorer
+{
+  unsigned onebest_len_;
+  NgramCounts onebest_counts_;
+
+  LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N)
+  {
+    onebest_counts_.One();
+  }
+
+  score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+
+  inline void Reset() {
+    onebest_len_ = 1;
+    onebest_counts_.One();
+  }
+};
+
 } // namespace
diff --git a/dtrain/test/example/README b/dtrain/test/example/README
index b3ea5f06..6937b11b 100644
--- a/dtrain/test/example/README
+++ b/dtrain/test/example/README
@@ -1,8 +1,8 @@
 Small example of input format for distributed training.
 Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini .
 
-For this to work, disable '#define DTRAIN_LOCAL' from dtrain.h
+For this to work, undef 'DTRAIN_LOCAL' in dtrain.h
 and recompile.
 
-Data is here: http://simianer.de/dtrain
+Data is here: http://simianer.de/#dtrain
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index f87ee9cf..c8ac7c3f 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec
 # weights for these features will be printed on each iteration
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
 tmp=/tmp
-stop_after=10 # stop epoch after 20 inputs
+stop_after=10 # stop epoch after 10 inputs
 
 # interesting stuff
 epochs=3                # run over input 3 times
@@ -19,3 +19,4 @@ filter=uniq             # only unique entries in kbest (surface form)
 pair_sampling=XYX
 hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10 here
 pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0)
+loss_margin=0
diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output
new file mode 100644
index 00000000..25d2c069
--- /dev/null
+++ b/dtrain/test/example/expected-output
@@ -0,0 +1,125 @@
+                cdec cfg 'test/example/cdec.ini'
+feature: WordPenalty (no config parameters)
+State is 0 bytes for feature WordPenalty
+feature: KLanguageModel (with config parameters 'test/example/nc-wmt11.en.srilm.gz')
+Loading the LM will be faster if you build a binary file.
+Reading test/example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Loaded 5-gram KLM from test/example/nc-wmt11.en.srilm.gz (MapSize=49581)
+State is 98 bytes for feature KLanguageModel test/example/nc-wmt11.en.srilm.gz
+feature: RuleIdentityFeatures (no config parameters)
+State is 0 bytes for feature RuleIdentityFeatures
+feature: RuleNgramFeatures (no config parameters)
+State is 0 bytes for feature RuleNgramFeatures
+feature: RuleShape (no config parameters)
+  Example feature: Shape_S00000_T00000
+State is 0 bytes for feature RuleShape
+Seeding random number sequence to 1072059181
+
+dtrain
+Parameters:
+                       k 100
+                       N 4
+                       T 3
+                  scorer 'stupid_bleu'
+             sample from 'kbest'
+                  filter 'uniq'
+           learning rate 0.0001
+                   gamma 0
+             loss margin 0
+                   pairs 'XYX'
+                   hi lo 0.1
+          pair threshold 0
+          select weights 'VOID'
+                  l1 reg 0 'none'
+                cdec cfg 'test/example/cdec.ini'
+                   input 'test/example/nc-wmt11.1k.gz'
+                  output '-'
+              stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 3.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+              Glue = -0.0293
+       WordPenalty = +0.049075
+     LanguageModel = +0.24345
+ LanguageModel_OOV = -0.2029
+     PhraseModel_0 = +0.0084102
+     PhraseModel_1 = +0.021729
+     PhraseModel_2 = +0.014922
+     PhraseModel_3 = +0.104
+     PhraseModel_4 = -0.14308
+     PhraseModel_5 = +0.0247
+     PhraseModel_6 = -0.012
+       PassThrough = -0.2161
+        ---
+       1best avg score: 0.16872 (+0.16872)
+ 1best avg model score: -1.8276 (-1.8276)
+           avg # pairs: 1121.1
+        avg # rank err: 555.6
+     avg # margin viol: 0
+    non0 feature count: 277
+           avg list sz: 77.2
+           avg f count: 90.96
+(time 0.1 min, 0.6 s/S)
+
+Iteration #2 of 3.
+ . 10
+WEIGHTS
+              Glue = -0.3526
+       WordPenalty = +0.067576
+     LanguageModel = +1.155
+ LanguageModel_OOV = -0.2728
+     PhraseModel_0 = -0.025529
+     PhraseModel_1 = +0.095869
+     PhraseModel_2 = +0.094567
+     PhraseModel_3 = +0.12482
+     PhraseModel_4 = -0.36533
+     PhraseModel_5 = +0.1068
+     PhraseModel_6 = -0.1517
+       PassThrough = -0.286
+        ---
+       1best avg score: 0.18394 (+0.015221)
+ 1best avg model score: 3.205 (+5.0326)
+           avg # pairs: 1168.3
+        avg # rank err: 594.8
+     avg # margin viol: 0
+    non0 feature count: 543
+           avg list sz: 77.5
+           avg f count: 85.916
+(time 0.083 min, 0.5 s/S)
+
+Iteration #3 of 3.
+ . 10
+WEIGHTS
+              Glue = -0.392
+       WordPenalty = +0.071963
+     LanguageModel = +0.81266
+ LanguageModel_OOV = -0.4177
+     PhraseModel_0 = -0.2649
+     PhraseModel_1 = -0.17931
+     PhraseModel_2 = +0.038261
+     PhraseModel_3 = +0.20261
+     PhraseModel_4 = -0.42621
+     PhraseModel_5 = +0.3198
+     PhraseModel_6 = -0.1437
+       PassThrough = -0.4309
+        ---
+       1best avg score: 0.2962 (+0.11225)
+ 1best avg model score: -36.274 (-39.479)
+           avg # pairs: 1109.6
+        avg # rank err: 515.9
+     avg # margin viol: 0
+    non0 feature count: 741
+           avg list sz: 77
+           avg f count: 88.982
+(time 0.083 min, 0.5 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 3 [SCORE 'stupid_bleu'=0.2962].
+This took 0.26667 min.
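[Note: to reproduce the expected-output above: as test/example/README says, undef DTRAIN_LOCAL in dtrain.h (which this commit turns on by default) and recompile, then run ./dtrain -c test/example/dtrain.ini from cdec/dtrain/. Everything up to the 'Seeding random number sequence' line is cdec/KenLM startup output, and the seed itself may differ between runs.]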
