diff options
| -rw-r--r-- | dtrain/dtrain.cc | 76 | ||||
| -rw-r--r-- | dtrain/dtrain.h | 8 | ||||
| -rw-r--r-- | dtrain/pairsampling.h | 17 | ||||
| -rw-r--r-- | dtrain/score.cc | 2 | ||||
| -rw-r--r-- | dtrain/test/example/README | 6 | ||||
| -rw-r--r-- | dtrain/test/example/dtrain.ini | 9 | 
6 files changed, 68 insertions, 50 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index cf913765..ea5b8835 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -6,35 +6,37 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)  {    po::options_description ini("Configuration File Options");    ini.add_options() -    ("input",           po::value<string>()->default_value("-"),                                                "input file") -    ("output",          po::value<string>()->default_value("-"),                       "output weights file, '-' for STDOUT") -    ("input_weights",   po::value<string>(),                             "input weights file (e.g. from previous iteration)") -    ("decoder_config",  po::value<string>(),                                                   "configuration file for cdec") -    ("print_weights",   po::value<string>(),                                            "weights to print on each iteration") -    ("stop_after",      po::value<unsigned>()->default_value(0),                              "stop after X input sentences") -    ("tmp",             po::value<string>()->default_value("/tmp"),                                        "temp dir to use") -    ("keep",            po::value<bool>()->zero_tokens(),                            "keep weights files for each iteration") -    ("hstreaming",      po::value<string>(),                                "run in hadoop streaming mode, arg is a task id") -    ("epochs",          po::value<unsigned>()->default_value(10),                            "# of iterations T (per shard)") -    ("k",               po::value<unsigned>()->default_value(100),                         "how many translations to sample") -    ("sample_from",     po::value<string>()->default_value("kbest"),  "where to sample translations from: 'kbest', 'forest'") -    ("filter",          po::value<string>()->default_value("uniq"),                       "filter kbest list: 'not', 'uniq'") -    ("pair_sampling",   po::value<string>()->default_value("108010"),        "how to sample pairs: 'all', '108010' or 'PRO'") -    ("pair_threshold",  po::value<score_t>()->default_value(0),                       "bleu [0,1] threshold to filter pairs") -    ("N",               po::value<unsigned>()->default_value(4),                                       "N for Ngrams (BLEU)") -    ("scorer",          po::value<string>()->default_value("stupid_bleu"),        "scoring: bleu, stupid_, smooth_, approx_") -    ("learning_rate",   po::value<weight_t>()->default_value(0.0001),                                        "learning rate") -    ("gamma",           po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)") -    ("select_weights",  po::value<string>()->default_value("last"),  "output best, last, avg weights ('VOID' to throw away)") -    ("rescale",         po::value<bool>()->zero_tokens(),                           "rescale weight vector after each input") -    ("l1_reg",          po::value<string>()->default_value("none"),   "apply l1 regularization as in 'Tsuroka et al' (2010)") -    ("l1_reg_strength", po::value<weight_t>(),                                                  "l1 regularization strength") -    ("inc_correct",     po::value<bool>()->zero_tokens(),                      "include correctly ranked pairs into updates") -    ("fselect",         po::value<weight_t>()->default_value(-1),   "TODO select top x percent of features after each epoch") +    ("input",             po::value<string>()->default_value("-"),                                                "input file") +    ("output",            po::value<string>()->default_value("-"),                       "output weights file, '-' for STDOUT") +    ("input_weights",     po::value<string>(),                             "input weights file (e.g. from previous iteration)") +    ("decoder_config",    po::value<string>(),                                                   "configuration file for cdec") +    ("print_weights",     po::value<string>(),                                            "weights to print on each iteration") +    ("stop_after",        po::value<unsigned>()->default_value(0),                              "stop after X input sentences") +    ("tmp",               po::value<string>()->default_value("/tmp"),                                        "temp dir to use") +    ("keep",              po::value<bool>()->zero_tokens(),                            "keep weights files for each iteration") +    ("hstreaming",        po::value<string>(),                                "run in hadoop streaming mode, arg is a task id") +    ("epochs",            po::value<unsigned>()->default_value(10),                            "# of iterations T (per shard)") +    ("k",                 po::value<unsigned>()->default_value(100),                         "how many translations to sample") +    ("sample_from",       po::value<string>()->default_value("kbest"),  "where to sample translations from: 'kbest', 'forest'") +    ("filter",            po::value<string>()->default_value("uniq"),                       "filter kbest list: 'not', 'uniq'") +    ("pair_sampling",     po::value<string>()->default_value("XYX"),              "how to sample pairs: 'all', 'XYX' or 'PRO'") +    ("hi_lo",             po::value<float>()->default_value(0.1),                "hi and lo (X) for XYX (default 0.1), <= 0.5") +    ("pair_threshold",    po::value<score_t>()->default_value(0),                       "bleu [0,1] threshold to filter pairs") +    ("N",                 po::value<unsigned>()->default_value(4),                                       "N for Ngrams (BLEU)") +    ("scorer",            po::value<string>()->default_value("stupid_bleu"),        "scoring: bleu, stupid_, smooth_, approx_") +    ("learning_rate",     po::value<weight_t>()->default_value(0.0001),                                        "learning rate") +    ("gamma",             po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)") +    ("select_weights",    po::value<string>()->default_value("last"),  "output best, last, avg weights ('VOID' to throw away)") +    ("rescale",           po::value<bool>()->zero_tokens(),                           "rescale weight vector after each input") +    ("l1_reg",            po::value<string>()->default_value("none"),   "apply l1 regularization as in 'Tsuroka et al' (2010)") +    ("l1_reg_strength",   po::value<weight_t>(),                                                  "l1 regularization strength") +    ("inc_correct",       po::value<bool>()->zero_tokens(),                      "include correctly ranked pairs into updates") +    ("fselect",           po::value<weight_t>()->default_value(-1),   "TODO select top x percent of features after each epoch") +    ("approx_bleu_scale", po::value<score_t>()->default_value(0.9),                                 "scaling for approx. BLEU")  #ifdef DTRAIN_LOCAL -    ("refs,r",         po::value<string>(),                                                       "references in local mode") +    ("refs,r",            po::value<string>(),                                                      "references in local mode")  #endif -    ("noup",           po::value<bool>()->zero_tokens(),                                             "do not update weights"); +    ("noup",              po::value<bool>()->zero_tokens(),                                            "do not update weights");    po::options_description cl("Command Line Options");    cl.add_options()      ("config,c",         po::value<string>(),              "dtrain config file") @@ -71,11 +73,18 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)      cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl;      return false;    } -  if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "108010" && +  if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" &&          (*cfg)["pair_sampling"].as<string>() != "PRO") {      cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl;      return false;    } +  if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") { +    cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; +  } +  if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) { +    cerr << "hi_lo must lie in [0.01, 0.5]" << endl; +    return false; +  }    if ((*cfg)["pair_threshold"].as<score_t>() < 0) {      cerr << "The threshold must be >= 0!" << endl;      return false; @@ -126,6 +135,7 @@ main(int argc, char** argv)    const string pair_sampling = cfg["pair_sampling"].as<string>();    const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();    const string select_weights = cfg["select_weights"].as<string>(); +  const float hi_lo = cfg["hi_lo"].as<float>();    bool average = false;    if (select_weights == "avg")      average = true; @@ -231,6 +241,8 @@ main(int argc, char** argv)      cerr << setw(25) << "learning rate " << eta << endl;      cerr << setw(25) << "gamma " << gamma << endl;      cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; +    if (pair_sampling == "XYX") +      cerr << setw(25) << "hi lo " << "'" << hi_lo << "'" << endl;      cerr << setw(25) << "pair threshold " << pair_threshold << endl;      cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;      if (cfg.count("l1_reg")) @@ -400,10 +412,10 @@ main(int argc, char** argv)        vector<pair<ScoredHyp,ScoredHyp> > pairs;        if (pair_sampling == "all")          all_pairs(samples, pairs, pair_threshold); -      if (pair_sampling == "108010") -        part108010(samples, pairs, pair_threshold); +      if (pair_sampling == "XYX") +        partXYX(samples, pairs, pair_threshold, hi_lo);        if (pair_sampling == "PRO") -        PROsampling(samples, pairs); +        PROsampling(samples, pairs, pair_threshold);        npairs += pairs.size();        pair_count += 2*pairs.size(); @@ -456,7 +468,7 @@ main(int argc, char** argv)            }          }        } else if (l1cumul) { -        weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input +        weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input          for (unsigned d = 0; d < lambdas.size(); d++) {            if (lambdas.nonzero(d)) {              weight_t v = lambdas.get(d); diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index ac13995a..7b03d258 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -13,7 +13,7 @@  #include "filelib.h" -#define DTRAIN_LOCAL +//#define DTRAIN_LOCAL  #define DTRAIN_DOTS 10 // after how many inputs to display a '.'  #define DTRAIN_GRAMMAR_DELIM "########EOS########" @@ -23,13 +23,15 @@ using namespace std;  using namespace dtrain;  namespace po = boost::program_options; -inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) { +inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) +{    vector<string>::const_iterator it;    for (it = strs.begin(); it < strs.end(); it++)      ids.push_back(TD::Convert(*it));  } -inline string gettmpf(const string path, const string infix) { +inline string gettmpf(const string path, const string infix) +{    char fn[1024];    strcpy(fn, path.c_str());    strcat(fn, "/"); diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 93c0630a..66ca1706 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -13,7 +13,7 @@ accept_pair(score_t a, score_t b, score_t threshold)  }  inline void -all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold) +all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1)  {    for (unsigned i = 0; i < s->size()-1; i++) {      for (unsigned j = i+1; j < s->size(); j++) { @@ -35,19 +35,16 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc   *  cmp middle 80% to low 10%   */  bool -_108010_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b) +_XYX_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b)  {    return a.score < b.score;  }  inline void -part108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold) +partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)  { -  sort(s->begin(), s->end(), _108010_cmp_hyp_by_score); +  sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score);    unsigned sz = s->size(); -  unsigned slice = 10; -  unsigned sep = sz%slice; -  cout << "sep " << sep <<endl; -  if (sep == 0) sep = sz/slice; +  unsigned sep = sz * hi_lo;    for (unsigned i = 0; i < sep; i++) {      for (unsigned j = sep; j < sz; j++) {        if ((*s)[i].rank < (*s)[j].rank) { @@ -80,7 +77,7 @@ part108010(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, s   * pair sampling as in   * 'Tuning as Ranking' (Hopkins & May, 2011)   *     count = 5000 - * threshold = 5% BLEU + * threshold = 5% BLEU (0.05 for param 3)   *       cut = top 50   */  bool @@ -90,7 +87,7 @@ _PRO_cmp_pair_by_diff(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)    return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));  }  inline void -PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold=0.05) +PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused = 1)  {    unsigned max_count = 5000, count = 0;    bool b = false; diff --git a/dtrain/score.cc b/dtrain/score.cc index ec844437..d964b4da 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -129,7 +129,7 @@ ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,      ref_len = ref.size();      tmp = glob_onebest_counts + counts;    } -  return 0.9 * Bleu(tmp, hyp_len, ref_len); +  return 0.9 * Bleu(tmp, hyp_len, ref_len); // TODO param  } diff --git a/dtrain/test/example/README b/dtrain/test/example/README new file mode 100644 index 00000000..e5a5de59 --- /dev/null +++ b/dtrain/test/example/README @@ -0,0 +1,6 @@ +Small example of input format for distributed training. +Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . + +For this to work, disable '#define DTRAIN_LOCAL' from dtrain.h +and recompile. + diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index b59250f3..cd2c75e7 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,5 +1,5 @@  input=test/example/nc-wmt11.1k.gz    # use '-' for STDIN -output=weights.gz                    # a weights file (add .gz for gzip compression) or STDOUT '-' +output=-                             # a weights file (add .gz for gzip compression) or STDOUT '-'  decoder_config=test/example/cdec.ini # config for cdec  # weights for these features will be printed on each iteration  print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough @@ -10,11 +10,12 @@ stop_after=100 # stop epoch after 100 inputs  epochs=3                # run over input 3 times  k=100                   # use 100best lists  N=4                     # optimize (approx) BLEU4 -scorer=stupid_bleu      # use 'stupid' BLEU+1 +scorer=approx_bleu      # use 'stupid' BLEU+1  learning_rate=0.0001    # learning rate  gamma=0                 # use SVM reg  sample_from=kbest       # use kbest lists (as opposed to forest)  filter=uniq             # only unique entries in kbest (surface form) -pair_sampling=108010    # 10 vs 80 vs 10 and 80 vs 10 +pair_sampling=XYX +hi_lo=0.1               # 10 vs 80 vs 10 and 80 vs 10  pair_threshold=0        # minimum distance in BLEU (this will still only use pairs with diff > 0) -select_weights=last     # just output last weights +select_weights=VOID     # don't output weights  | 
