From 01110e92e7429df7882879e026b28aa9c89c724d Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 26 Apr 2012 21:39:11 +0200 Subject: made pair sampling configurable --- dtrain/dtrain.cc | 76 ++++++++++++++++++++++++------------------ dtrain/dtrain.h | 8 +++-- dtrain/pairsampling.h | 17 ++++------ dtrain/score.cc | 2 +- dtrain/test/example/README | 6 ++++ dtrain/test/example/dtrain.ini | 9 ++--- 6 files changed, 68 insertions(+), 50 deletions(-) create mode 100644 dtrain/test/example/README diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index cf913765..ea5b8835 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -6,35 +6,37 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) { po::options_description ini("Configuration File Options"); ini.add_options() - ("input", po::value()->default_value("-"), "input file") - ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") - ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") - ("decoder_config", po::value(), "configuration file for cdec") - ("print_weights", po::value(), "weights to print on each iteration") - ("stop_after", po::value()->default_value(0), "stop after X input sentences") - ("tmp", po::value()->default_value("/tmp"), "temp dir to use") - ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") - ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") - ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") - ("k", po::value()->default_value(100), "how many translations to sample") - ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") - ("filter", po::value()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") - ("pair_sampling", po::value()->default_value("108010"), "how to sample pairs: 'all', '108010' or 'PRO'") - ("pair_threshold", po::value()->default_value(0), "bleu [0,1] threshold to filter pairs") - ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") - ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_") - ("learning_rate", po::value()->default_value(0.0001), "learning rate") - ("gamma", po::value()->default_value(0), "gamma for SVM (0 for perceptron)") - ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") - ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") - ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") - ("l1_reg_strength", po::value(), "l1 regularization strength") - ("inc_correct", po::value()->zero_tokens(), "include correctly ranked pairs into updates") - ("fselect", po::value()->default_value(-1), "TODO select top x percent of features after each epoch") + ("input", po::value()->default_value("-"), "input file") + ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") + ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value(), "configuration file for cdec") + ("print_weights", po::value(), "weights to print on each iteration") + ("stop_after", po::value()->default_value(0), "stop after X input sentences") + ("tmp", po::value()->default_value("/tmp"), "temp dir to use") + ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") + ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") + ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") + ("k", po::value()->default_value(100), "how many translations to sample") + ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") + ("filter", po::value()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") + ("pair_sampling", po::value()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") + ("hi_lo", po::value()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") + ("pair_threshold", po::value()->default_value(0), "bleu [0,1] threshold to filter pairs") + ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") + ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_") + ("learning_rate", po::value()->default_value(0.0001), "learning rate") + ("gamma", po::value()->default_value(0), "gamma for SVM (0 for perceptron)") + ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") + ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") + ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") + ("l1_reg_strength", po::value(), "l1 regularization strength") + ("inc_correct", po::value()->zero_tokens(), "include correctly ranked pairs into updates") + ("fselect", po::value()->default_value(-1), "TODO select top x percent of features after each epoch") + ("approx_bleu_scale", po::value()->default_value(0.9), "scaling for approx. BLEU") #ifdef DTRAIN_LOCAL - ("refs,r", po::value(), "references in local mode") + ("refs,r", po::value(), "references in local mode") #endif - ("noup", po::value()->zero_tokens(), "do not update weights"); + ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() ("config,c", po::value(), "dtrain config file") @@ -71,11 +73,18 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as() << "', use 'uniq' or 'not'." << endl; return false; } - if ((*cfg)["pair_sampling"].as() != "all" && (*cfg)["pair_sampling"].as() != "108010" && + if ((*cfg)["pair_sampling"].as() != "all" && (*cfg)["pair_sampling"].as() != "XYX" && (*cfg)["pair_sampling"].as() != "PRO") { cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as() << "'." << endl; return false; } + if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as() != "XYX") { + cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; + } + if((*cfg)["hi_lo"].as() > 0.5 || (*cfg)["hi_lo"].as() < 0.01) { + cerr << "hi_lo must lie in [0.01, 0.5]" << endl; + return false; + } if ((*cfg)["pair_threshold"].as() < 0) { cerr << "The threshold must be >= 0!" << endl; return false; @@ -126,6 +135,7 @@ main(int argc, char** argv) const string pair_sampling = cfg["pair_sampling"].as(); const score_t pair_threshold = cfg["pair_threshold"].as(); const string select_weights = cfg["select_weights"].as(); + const float hi_lo = cfg["hi_lo"].as(); bool average = false; if (select_weights == "avg") average = true; @@ -231,6 +241,8 @@ main(int argc, char** argv) cerr << setw(25) << "learning rate " << eta << endl; cerr << setw(25) << "gamma " << gamma << endl; cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; + if (pair_sampling == "XYX") + cerr << setw(25) << "hi lo " << "'" << hi_lo << "'" << endl; cerr << setw(25) << "pair threshold " << pair_threshold << endl; cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl; if (cfg.count("l1_reg")) @@ -400,10 +412,10 @@ main(int argc, char** argv) vector > pairs; if (pair_sampling == "all") all_pairs(samples, pairs, pair_threshold); - if (pair_sampling == "108010") - part108010(samples, pairs, pair_threshold); + if (pair_sampling == "XYX") + partXYX(samples, pairs, pair_threshold, hi_lo); if (pair_sampling == "PRO") - PROsampling(samples, pairs); + PROsampling(samples, pairs, pair_threshold); npairs += pairs.size(); pair_count += 2*pairs.size(); @@ -456,7 +468,7 @@ main(int argc, char** argv) } } } else if (l1cumul) { - weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input + weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input for (unsigned d = 0; d < lambdas.size(); d++) { if (lambdas.nonzero(d)) { weight_t v = lambdas.get(d); diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index ac13995a..7b03d258 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -13,7 +13,7 @@ #include "filelib.h" -#define DTRAIN_LOCAL +//#define DTRAIN_LOCAL #define DTRAIN_DOTS 10 // after how many inputs to display a '.' #define DTRAIN_GRAMMAR_DELIM "########EOS########" @@ -23,13 +23,15 @@ using namespace std; using namespace dtrain; namespace po = boost::program_options; -inline void register_and_convert(const vector& strs, vector& ids) { +inline void register_and_convert(const vector& strs, vector& ids) +{ vector::const_iterator it; for (it = strs.begin(); it < strs.end(); it++) ids.push_back(TD::Convert(*it)); } -inline string gettmpf(const string path, const string infix) { +inline string gettmpf(const string path, const string infix) +{ char fn[1024]; strcpy(fn, path.c_str()); strcat(fn, "/"); diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 93c0630a..66ca1706 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -13,7 +13,7 @@ accept_pair(score_t a, score_t b, score_t threshold) } inline void -all_pairs(vector* s, vector >& training, score_t threshold) +all_pairs(vector* s, vector >& training, score_t threshold, float _unused = 1) { for (unsigned i = 0; i < s->size()-1; i++) { for (unsigned j = i+1; j < s->size(); j++) { @@ -35,19 +35,16 @@ all_pairs(vector* s, vector >& training, sc * cmp middle 80% to low 10% */ bool -_108010_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b) +_XYX_cmp_hyp_by_score(ScoredHyp a, ScoredHyp b) { return a.score < b.score; } inline void -part108010(vector* s, vector >& training, score_t threshold) +partXYX(vector* s, vector >& training, score_t threshold, float hi_lo) { - sort(s->begin(), s->end(), _108010_cmp_hyp_by_score); + sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score); unsigned sz = s->size(); - unsigned slice = 10; - unsigned sep = sz%slice; - cout << "sep " << sep <* s, vector >& training, s * pair sampling as in * 'Tuning as Ranking' (Hopkins & May, 2011) * count = 5000 - * threshold = 5% BLEU + * threshold = 5% BLEU (0.05 for param 3) * cut = top 50 */ bool @@ -90,7 +87,7 @@ _PRO_cmp_pair_by_diff(pair a, pair b) return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); } inline void -PROsampling(vector* s, vector >& training, score_t threshold=0.05) +PROsampling(vector* s, vector >& training, score_t threshold, float _unused = 1) { unsigned max_count = 5000, count = 0; bool b = false; diff --git a/dtrain/score.cc b/dtrain/score.cc index ec844437..d964b4da 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -129,7 +129,7 @@ ApproxBleuScorer::Score(vector& hyp, vector& ref, ref_len = ref.size(); tmp = glob_onebest_counts + counts; } - return 0.9 * Bleu(tmp, hyp_len, ref_len); + return 0.9 * Bleu(tmp, hyp_len, ref_len); // TODO param } diff --git a/dtrain/test/example/README b/dtrain/test/example/README new file mode 100644 index 00000000..e5a5de59 --- /dev/null +++ b/dtrain/test/example/README @@ -0,0 +1,6 @@ +Small example of input format for distributed training. +Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . + +For this to work, disable '#define DTRAIN_LOCAL' from dtrain.h +and recompile. + diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index b59250f3..cd2c75e7 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,5 +1,5 @@ input=test/example/nc-wmt11.1k.gz # use '-' for STDIN -output=weights.gz # a weights file (add .gz for gzip compression) or STDOUT '-' +output=- # a weights file (add .gz for gzip compression) or STDOUT '-' decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough @@ -10,11 +10,12 @@ stop_after=100 # stop epoch after 100 inputs epochs=3 # run over input 3 times k=100 # use 100best lists N=4 # optimize (approx) BLEU4 -scorer=stupid_bleu # use 'stupid' BLEU+1 +scorer=approx_bleu # use 'stupid' BLEU+1 learning_rate=0.0001 # learning rate gamma=0 # use SVM reg sample_from=kbest # use kbest lists (as opposed to forest) filter=uniq # only unique entries in kbest (surface form) -pair_sampling=108010 # 10 vs 80 vs 10 and 80 vs 10 +pair_sampling=XYX +hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) -select_weights=last # just output last weights +select_weights=VOID # don't output weights -- cgit v1.2.3