diff options
-rw-r--r-- | dtrain/README.md | 82 | ||||
-rw-r--r-- | dtrain/dtrain.cc | 52 | ||||
-rw-r--r-- | dtrain/dtrain.h | 6 | ||||
-rw-r--r-- | dtrain/test/example/dtrain.ini | 6 |
4 files changed, 120 insertions, 26 deletions
diff --git a/dtrain/README.md b/dtrain/README.md index b1dbf481..58c6dddc 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -45,6 +45,29 @@ Uncertain, known bugs, problems * input/grammar caching (strings -> WordIDs) * look at forest sampling... * devtest loo or not? why loo grammars larger? (sort psgs | uniq -> grammar) +* lower beam size to be faster? +* why is <unk> -100 in lm so good? +* noise helps? + +random notes +------------ +* learning rate tuned with perceptron +* aer correlation with bleu? +* dtrain (perc) used for some tests because no optimizer instability +* http://www.ark.cs.cmu.edu/cdyer/dtrain/ +* repeat as often as max needed by any learner! +* don't compare lms with diff vocab (stupid backoff paper) +* what does mira/pro optimize? + +features +-------- +* baseline features (take whatever cdec implements for VEST) +* rule identifiers (feature name = rule as string) +* rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data) +* target ngrams (from nonterminals in rule rhs) +* source-target unigrams (from word alignments used in rule extraction, if they are?) +* lhs, rhs, rule length features +* all other features depend on syntax annotation. FIXME, todo ----------- @@ -52,12 +75,26 @@ FIXME, todo * mapred count shard sents * mapred stats for learning curve (output weights per iter for eval on devtest) * 250 forest sampling is real bad, bug? -* metric reporter of bleu for each shard +* metric reporter of bleu for each shard (reporters, status?) + to draw learning curves for all shards in 1 plot * kenlm not portable (i7-2620M vs Intel(R) Xeon(R) CPU E5620 @ 2.40GHz) * mapred chaining? hamake? * make our sigtest work with cdec -* l1l2 red -* tsuroke? +* l1l2 red (tsuroke)? +* epsilon stopping criterion +* normalize weight vector to get proper model scores for forest sampling +* 108010 with gap(s), and/or fix (same score in diff groups) +* 108010: combine model score + bleu +* visualize weight vector +* *100 runs stats +* correlation of *_bleu to ibm_bleu +* ep: open lm, cutoff @1 +* tune regs +* 3x3 4x4 5x5 .. 10x10 until standard dev ok +* avg weight vector for dtrain? (mira non-avg) +* repeat lm choose with mira/pro +* shuffle training data + Data ---- @@ -116,6 +153,8 @@ lm? lm oov weight pos? -100 no tuning, -100 prob for unk EXPECT: nounk tuning with dtrain EXPECT: open + => + lmtest on cs.giza.loo??? [2] cs? @@ -167,3 +206,40 @@ variables to control [pro] + +-------- +In PRO, a continually growing list of candidates is maintained for +each sentence by concatenating k-best lists from each decoding run, +and the training pairs are sampled from them. This is done to ensure +that the optimizer doesn't forget about bad places in the parameter +space that it visited previously (since some training samples will be +selected from that space). Something like your approach should work +well though, provided you don't overfit to the sentence pair you're +looking at in each iteration. So I guess the question is: what are you +doing in step 2 exactly? A complete optimization? Taking one step? The +other thing is, do you maintain n-best hypotheses from previous +iterations? + +-------- +good grammar? => ability to overfit + berkeley vs giza + not LOO + NO optimizer instability + 20+ iterations + approx_bleu-4 + train on dev => test on dev + train on devtest => test on devtest + dev on dev better? + devtest on devtest better? + (train/test on loo? => lower!) + (test on others => real bad) + + +loo vs non-loo? => generalization + (cs vs non-cs?) + giza||berkeley + LOO + non LOO + 2 fold cross validation + train on dev, test on devtest + train on devtest, test on dev + as above ^^^ diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 4668ad66..2fe7afd7 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -24,6 +24,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)") ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)") + ("keep_w", po::value<bool>()->zero_tokens(), "protocol weights for each iteration") #ifdef DTRAIN_LOCAL ("refs,r", po::value<string>(), "references in local mode") #endif @@ -92,7 +93,12 @@ main(int argc, char** argv) bool hstreaming = false; if (cfg.count("hstreaming")) { hstreaming = true; + quiet = true; + cerr.precision(17); } + bool keep_w = false; + if (cfg.count("keep_w")) keep_w = true; + const unsigned k = cfg["k"].as<unsigned>(); const unsigned N = cfg["N"].as<unsigned>(); const unsigned T = cfg["epochs"].as<unsigned>(); @@ -104,7 +110,7 @@ main(int argc, char** argv) vector<string> print_weights; if (cfg.count("print_weights")) boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" ")); - + // setup decoder register_feature_functions(); SetSilent(true); @@ -151,6 +157,7 @@ main(int argc, char** argv) weight_t eta = cfg["learning_rate"].as<weight_t>(); weight_t gamma = cfg["gamma"].as<weight_t>(); + // output string output_fn = cfg["output"].as<string>(); // input string input_fn = cfg["input"].as<string>(); @@ -158,9 +165,9 @@ main(int argc, char** argv) // buffer input for t > 0 vector<string> src_str_buf; // source strings (decoder takes only strings) vector<vector<WordID> > ref_ids_buf; // references as WordID vecs - vector<string> weights_files; // remember weights for each iteration // where temp files go string tmp_path = cfg["tmp"].as<string>(); + vector<string> w_tmp_files; // used for protocol_w #ifdef DTRAIN_LOCAL string refs_fn = cfg["refs"].as<string>(); ReadFile refs(refs_fn); @@ -169,7 +176,7 @@ main(int argc, char** argv) ogzstream grammar_buf_out; grammar_buf_out.open(grammar_buf_fn.c_str()); #endif - + unsigned in_sz = UINT_MAX; // input index, input size vector<pair<score_t, score_t> > all_scores; score_t max_score = 0.; @@ -206,6 +213,8 @@ main(int argc, char** argv) for (unsigned t = 0; t < T; t++) // T epochs { + if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl; + time_t start, end; time(&start); #ifndef DTRAIN_LOCAL @@ -231,7 +240,7 @@ main(int argc, char** argv) if (stop_after > 0 && stop_after == ii && !next) stop = true; // produce some pretty output - if (!hstreaming && !quiet && !verbose) { + if (!quiet && !verbose) { if (ii == 0) cerr << " "; if ((ii+1) % (DTRAIN_DOTS) == 0) { cerr << "."; @@ -375,10 +384,12 @@ main(int argc, char** argv) ++ii; - if (hstreaming) cerr << "reporter:counter:dtrain,sid," << ii << endl; + if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl; } // input loop + if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl; + if (scorer_str == "approx_bleu") scorer->Reset(); if (t == 0) { @@ -404,6 +415,11 @@ main(int argc, char** argv) score_diff = score_avg; model_diff = model_avg; } + if (hstreaming) { + cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl; + cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl; + } + if (!quiet) { cerr << _p5 << _p << "WEIGHTS" << endl; for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) { @@ -439,12 +455,10 @@ main(int argc, char** argv) if (noup) break; // write weights to file - if (select_weights == "best") { - string infix = "dtrain-weights-" + boost::lexical_cast<string>(t); + if (select_weights == "best" || keep_w) { lambdas.init_vector(&dense_weights); - string w_fn = gettmpf(tmp_path, infix, "gz"); + string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz"; Weights::WriteToFile(w_fn, dense_weights, true); - weights_files.push_back(w_fn); } } // outer loop @@ -467,18 +481,19 @@ main(int argc, char** argv) } else if (select_weights == "VOID") { // do nothing with the weights } else { // best if (output_fn != "-") { - CopyFile(weights_files[best_it], output_fn); // always gzipped + CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn); } else { - ReadFile bestw(weights_files[best_it]); + ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz"); string o; cout.precision(17); cout << _np; while(getline(*bestw, o)) cout << o << endl; } - for (vector<string>::iterator it = weights_files.begin(); it != weights_files.end(); ++it) { - unlink(it->c_str()); - it->erase(it->end()-3, it->end()); - unlink(it->c_str()); + if (!keep_w) { + for (unsigned i = 0; i < T; i++) { + string s = "weights." + boost::lexical_cast<string>(i) + ".gz"; + unlink(s.c_str()); + } } } if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl; @@ -491,6 +506,13 @@ main(int argc, char** argv) cerr << _p2 << "This took " << overall_time/60. << " min." << endl; } + if (keep_w) { + cout << endl << "Weight files per iteration:" << endl; + for (unsigned i = 0; i < w_tmp_files.size(); i++) { + cout << w_tmp_files[i] << endl; + } + } + return 0; } diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 6c9decf4..6742f343 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -30,17 +30,13 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids ids.push_back(TD::Convert(*it)); } -inline string gettmpf(const string path, const string infix, const string suffix="") { +inline string gettmpf(const string path, const string infix) { char fn[1024]; strcpy(fn, path.c_str()); strcat(fn, "/"); strcat(fn, infix.c_str()); strcat(fn, "-XXXXXX"); mkstemp(fn); - if (suffix != "") { // we will get 2 files - strcat(fn, "."); - strcat(fn, suffix.c_str()); - } return string(fn); } diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index 40f8e03f..e8a20759 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,13 +1,13 @@ decoder_config=test/example/cdec.ini k=100 N=3 -gamma=0.001 +gamma=0 epochs=20 input=test/example/nc-1k-tabs.gz scorer=stupid_bleu output=weights.gz -#stop_after=100 -sample_from=forest +stop_after=10 +sample_from=kbest pair_sampling=108010 select_weights=VOID print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough |