| author    | Patrick Simianer <p@simianer.de>                 | 2011-11-10 23:07:57 +0100 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Patrick Simianer <p@simianer.de>                 | 2011-11-10 23:07:57 +0100 |
| commit    | b7e58c8f9c96417d2530be21bd00662b343d6bcd (patch) |                           |
| tree      | cbcb890e9f3e76bd8c602af279db82b159fbf0f1         |                           |
| parent    | 27498c35e5be9da1e05f48b3b67a425301bf9fd4 (diff)  |                           |
some more reporting in hstreaming, keep weights option
| -rw-r--r-- | dtrain/README.md               | 82 |
| -rw-r--r-- | dtrain/dtrain.cc               | 52 |
| -rw-r--r-- | dtrain/dtrain.h                |  6 |
| -rw-r--r-- | dtrain/test/example/dtrain.ini |  6 |
4 files changed, 120 insertions, 26 deletions
diff --git a/dtrain/README.md b/dtrain/README.md
index b1dbf481..58c6dddc 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -45,6 +45,29 @@ Uncertain, known bugs, problems
 * input/grammar caching (strings -> WordIDs)
 * look at forest sampling...
 * devtest loo or not? why loo grammars larger? (sort psgs | uniq -> grammar)
+* lower beam size to be faster?
+* why is <unk> -100 in lm so good?
+* noise helps?
+
+random notes
+------------
+* learning rate tuned with perceptron
+* aer correlation with bleu?
+* dtrain (perc) used for some tests because no optimizer instability
+* http://www.ark.cs.cmu.edu/cdyer/dtrain/
+* repeat as often as max needed by any learner!
+* don't compare lms with diff vocab (stupid backoff paper)
+* what does mira/pro optimize?
+
+features
+--------
+* baseline features (take whatever cdec implements for VEST)
+* rule identifiers (feature name = rule as string)
+* rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data)
+* target ngrams (from nonterminals in rule rhs)
+* source-target unigrams (from word alignments used in rule extraction, if they are?)
+* lhs, rhs, rule length features
+* all other features depend on syntax annotation.
 
 FIXME, todo
 -----------
@@ -52,12 +75,26 @@ FIXME, todo
 * mapred count shard sents
 * mapred stats for learning curve (output weights per iter for eval on devtest)
 * 250 forest sampling is real bad, bug?
-* metric reporter of bleu for each shard
+* metric reporter of bleu for each shard (reporters, status?)
+  to draw learning curves for all shards in 1 plot
 * kenlm not portable (i7-2620M vs Intel(R) Xeon(R) CPU E5620 @ 2.40GHz)
 * mapred chaining? hamake?
 * make our sigtest work with cdec
-* l1l2 red
-* tsuroke?
+* l1l2 red (tsuroke)?
+* epsilon stopping criterion
+* normalize weight vector to get proper model scores for forest sampling
+* 108010 with gap(s), and/or fix (same score in diff groups)
+* 108010: combine model score + bleu
+* visualize weight vector
+* *100 runs stats
+* correlation of *_bleu to ibm_bleu
+* ep: open lm, cutoff @1
+* tune regs
+* 3x3 4x4 5x5 .. 10x10 until standard dev ok
+* avg weight vector for dtrain? (mira non-avg)
+* repeat lm choose with mira/pro
+* shuffle training data
+
 
 Data
 ----
@@ -116,6 +153,8 @@ lm?
  lm oov weight pos? -100
  no tuning, -100 prob for unk EXPECT: nounk
  tuning with dtrain EXPECT: open
+ =>
+ lmtest on cs.giza.loo???
 [2]
 cs?
@@ -167,3 +206,40 @@ variables to control
 [pro]
+
+--------
+In PRO, a continually growing list of candidates is maintained for
+each sentence by concatenating k-best lists from each decoding run,
+and the training pairs are sampled from them. This is done to ensure
+that the optimizer doesn't forget about bad places in the parameter
+space that it visited previously (since some training samples will be
+selected from that space). Something like your approach should work
+well though, provided you don't overfit to the sentence pair you're
+looking at in each iteration. So I guess the question is: what are you
+doing in step 2 exactly? A complete optimization? Taking one step? The
+other thing is, do you maintain n-best hypotheses from previous
+iterations?
+
+--------
+good grammar? => ability to overfit
+ berkeley vs giza
+ not LOO
+ NO optimizer instability
+ 20+ iterations
+ approx_bleu-4
+ train on dev => test on dev
+ train on devtest => test on devtest
+ dev on dev better?
+ devtest on devtest better?
+ (train/test on loo? => lower!)
+ (test on others => real bad)
+
+
+loo vs non-loo? => generalization
+ (cs vs non-cs?)
+ giza||berkeley
+  LOO
+  non LOO
+ 2 fold cross validation
+  train on dev, test on devtest
+  train on devtest, test on dev
+  as above ^^^
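The PRO note quoted at the end of the README diff above describes keeping a per-sentence candidate pool that grows by concatenating k-best lists across decoding runs, and sampling training pairs from that pool. A minimal sketch of the idea follows; it is not part of this commit or of dtrain's code, and the `Cand` type, the `metric` field and the sampling threshold are placeholders:

```cpp
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>

// One hypothesis with its metric score (e.g. a sentence-level BLEU approximation).
struct Cand {
  std::string hyp;
  double metric;  // higher is better
};

// Per-sentence pool that keeps growing across decoding runs, so earlier
// (possibly bad) regions of the parameter space stay represented in training.
typedef std::vector<Cand> Pool;

// Append a freshly decoded k-best list to the sentence's pool.
void add_kbest(Pool& pool, const std::vector<Cand>& kbest) {
  pool.insert(pool.end(), kbest.begin(), kbest.end());
}

// Draw up to n random (better, worse) pairs whose metric difference exceeds min_diff.
std::vector<std::pair<Cand, Cand> >
sample_pairs(const Pool& pool, unsigned n, double min_diff) {
  std::vector<std::pair<Cand, Cand> > pairs;
  if (pool.size() < 2) return pairs;
  for (unsigned i = 0; i < n; ++i) {
    const Cand& a = pool[std::rand() % pool.size()];
    const Cand& b = pool[std::rand() % pool.size()];
    if (a.metric - b.metric > min_diff) pairs.push_back(std::make_pair(a, b));
    else if (b.metric - a.metric > min_diff) pairs.push_back(std::make_pair(b, a));
  }
  return pairs;
}
```

In Hopkins & May's PRO the sampled pairs are additionally filtered down to those with the largest metric difference before the classifier update; the sketch leaves that step out.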
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 4668ad66..2fe7afd7 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -24,6 +24,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
     ("gamma",          po::value<weight_t>()->default_value(0),                          "gamma for SVM (0 for perceptron)")
     ("tmp",            po::value<string>()->default_value("/tmp"),                                        "temp dir to use")
     ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)")
+    ("keep_w",         po::value<bool>()->zero_tokens(),                              "protocol weights for each iteration")
 #ifdef DTRAIN_LOCAL
     ("refs,r",         po::value<string>(),                                                      "references in local mode")
 #endif
@@ -92,7 +93,12 @@ main(int argc, char** argv)
   bool hstreaming = false;
   if (cfg.count("hstreaming")) {
     hstreaming = true;
+    quiet = true;
+    cerr.precision(17);
   }
+  bool keep_w = false;
+  if (cfg.count("keep_w")) keep_w = true;
+
   const unsigned k = cfg["k"].as<unsigned>();
   const unsigned N = cfg["N"].as<unsigned>();
   const unsigned T = cfg["epochs"].as<unsigned>();
@@ -104,7 +110,7 @@ main(int argc, char** argv)
   vector<string> print_weights;
   if (cfg.count("print_weights"))
     boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
-
+
   // setup decoder
   register_feature_functions();
   SetSilent(true);
@@ -151,6 +157,7 @@ main(int argc, char** argv)
   weight_t eta = cfg["learning_rate"].as<weight_t>();
   weight_t gamma = cfg["gamma"].as<weight_t>();
 
+  // output
   string output_fn = cfg["output"].as<string>();
   // input
   string input_fn = cfg["input"].as<string>();
@@ -158,9 +165,9 @@ main(int argc, char** argv)
   // buffer input for t > 0
   vector<string> src_str_buf;          // source strings (decoder takes only strings)
   vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
-  vector<string> weights_files;        // remember weights for each iteration
   // where temp files go
   string tmp_path = cfg["tmp"].as<string>();
+  vector<string> w_tmp_files; // used for protocol_w
 #ifdef DTRAIN_LOCAL
   string refs_fn = cfg["refs"].as<string>();
   ReadFile refs(refs_fn);
@@ -169,7 +176,7 @@ main(int argc, char** argv)
   ogzstream grammar_buf_out;
   grammar_buf_out.open(grammar_buf_fn.c_str());
 #endif
-
+
   unsigned in_sz = UINT_MAX; // input index, input size
   vector<pair<score_t, score_t> > all_scores;
   score_t max_score = 0.;
@@ -206,6 +213,8 @@ main(int argc, char** argv)
   for (unsigned t = 0; t < T; t++) // T epochs
   {
+  if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
+
   time_t start, end;
   time(&start);
 #ifndef DTRAIN_LOCAL
@@ -231,7 +240,7 @@ main(int argc, char** argv)
     if (stop_after > 0 && stop_after == ii && !next) stop = true;
 
     // produce some pretty output
-    if (!hstreaming && !quiet && !verbose) {
+    if (!quiet && !verbose) {
       if (ii == 0) cerr << " ";
       if ((ii+1) % (DTRAIN_DOTS) == 0) {
         cerr << ".";
@@ -375,10 +384,12 @@ main(int argc, char** argv)
 
     ++ii;
 
-    if (hstreaming) cerr << "reporter:counter:dtrain,sid," << ii << endl;
+    if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl;
 
   } // input loop
 
+  if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl;
+
   if (scorer_str == "approx_bleu") scorer->Reset();
 
   if (t == 0) {
@@ -404,6 +415,11 @@ main(int argc, char** argv)
     score_diff = score_avg;
     model_diff = model_avg;
   }
+  if (hstreaming) {
+    cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl;
+    cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl;
+  }
+
   if (!quiet) {
     cerr << _p5 << _p << "WEIGHTS" << endl;
     for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
@@ -439,12 +455,10 @@ main(int argc, char** argv)
   if (noup) break;
 
   // write weights to file
-  if (select_weights == "best") {
-    string infix = "dtrain-weights-" + boost::lexical_cast<string>(t);
+  if (select_weights == "best" || keep_w) {
     lambdas.init_vector(&dense_weights);
-    string w_fn = gettmpf(tmp_path, infix, "gz");
+    string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz";
     Weights::WriteToFile(w_fn, dense_weights, true);
-    weights_files.push_back(w_fn);
   }
 
   } // outer loop
@@ -467,18 +481,19 @@ main(int argc, char** argv)
     } else if (select_weights == "VOID") { // do nothing with the weights
     } else { // best
       if (output_fn != "-") {
-        CopyFile(weights_files[best_it], output_fn);  // always gzipped
+        CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn);
       } else {
-        ReadFile bestw(weights_files[best_it]);
+        ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz");
         string o;
         cout.precision(17);
         cout << _np;
         while(getline(*bestw, o)) cout << o << endl;
       }
-      for (vector<string>::iterator it = weights_files.begin(); it != weights_files.end(); ++it) {
-        unlink(it->c_str());
-        it->erase(it->end()-3, it->end());
-        unlink(it->c_str());
+      if (!keep_w) {
+        for (unsigned i = 0; i < T; i++) {
+          string s = "weights." + boost::lexical_cast<string>(i) + ".gz";
+          unlink(s.c_str());
+        }
       }
     }
     if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
@@ -491,6 +506,13 @@ main(int argc, char** argv)
     cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
   }
+  if (keep_w) {
+    cout << endl << "Weight files per iteration:" << endl;
+    for (unsigned i = 0; i < w_tmp_files.size(); i++) {
+      cout << w_tmp_files[i] << endl;
+    }
+  }
+
   return 0;
 }
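The hstreaming reporting added above relies on Hadoop streaming's reporter protocol: a streaming task can write lines of the form `reporter:counter:<group>,<counter>,<amount>` or `reporter:status:<message>` to stderr, and the framework turns them into job counters and task status updates. A small helper in that spirit (illustrative only, not part of this patch; the function names are made up):

```cpp
#include <iostream>
#include <string>

// Hadoop streaming scans a task's stderr for specially formatted lines:
//   reporter:counter:<group>,<counter>,<amount>  -> adds <amount> to a job counter
//   reporter:status:<message>                    -> sets the task's status text
void report_counter(const std::string& group, const std::string& counter, long amount) {
  std::cerr << "reporter:counter:" << group << "," << counter << "," << amount << std::endl;
}

void report_status(const std::string& msg) {
  std::cerr << "reporter:status:" << msg << std::endl;
}

int main() {
  // Roughly the shape of what the patched dtrain.cc emits:
  report_status("Iteration #1 of 20");   // once per epoch
  report_counter("dtrain", "count", 1);  // once per processed input
  return 0;
}
```

Counter values are integral on the Hadoop side, which is worth keeping in mind for the per-iteration score and model averages that the patch routes through the same counter channel.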
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 6c9decf4..6742f343 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -30,17 +30,13 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
     ids.push_back(TD::Convert(*it));
 }
 
-inline string gettmpf(const string path, const string infix, const string suffix="") {
+inline string gettmpf(const string path, const string infix) {
   char fn[1024];
   strcpy(fn, path.c_str());
   strcat(fn, "/");
   strcat(fn, infix.c_str());
   strcat(fn, "-XXXXXX");
   mkstemp(fn);
-  if (suffix != "") { // we will get 2 files
-    strcat(fn, ".");
-    strcat(fn, suffix.c_str());
-  }
   return string(fn);
 }
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 40f8e03f..e8a20759 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,13 +1,13 @@
 decoder_config=test/example/cdec.ini
 k=100
 N=3
-gamma=0.001
+gamma=0
 epochs=20
 input=test/example/nc-1k-tabs.gz
 scorer=stupid_bleu
 output=weights.gz
-#stop_after=100
-sample_from=forest
+stop_after=10
+sample_from=kbest
 pair_sampling=108010
 select_weights=VOID
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
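One reading of the dtrain.ini change above: gamma drops from 0.001 to 0, which according to the option's own help string ("gamma for SVM (0 for perceptron)") turns the update into a plain perceptron, while sample_from=kbest and stop_after=10 presumably make the example quicker to run than forest sampling over the full input. A minimal sketch of a pairwise update with such an optional gamma term, using a plain map as a stand-in for dtrain's sparse weight vector (dtrain's exact update may differ):

```cpp
#include <map>
#include <string>

typedef std::map<std::string, double> SparseVec;  // stand-in for a sparse feature/weight vector

// Dot product between weights and a sparse feature vector.
double dot(const SparseVec& w, const SparseVec& f) {
  double s = 0.0;
  for (SparseVec::const_iterator it = f.begin(); it != f.end(); ++it) {
    SparseVec::const_iterator wi = w.find(it->first);
    if (wi != w.end()) s += wi->second * it->second;
  }
  return s;
}

// Pairwise update on a (better, worse) candidate pair: if the model does not
// already rank the better candidate higher, step toward its features.
// gamma == 0 gives a plain perceptron step; gamma > 0 adds an SVM-style
// shrinking of the weights (one common way to realize a margin/regularizer).
void pair_update(SparseVec& w, const SparseVec& better, const SparseVec& worse,
                 double eta, double gamma) {
  if (dot(w, better) - dot(w, worse) <= 0.0) {
    for (SparseVec::const_iterator it = better.begin(); it != better.end(); ++it)
      w[it->first] += eta * it->second;
    for (SparseVec::const_iterator it = worse.begin(); it != worse.end(); ++it)
      w[it->first] -= eta * it->second;
  }
  if (gamma > 0.0)
    for (SparseVec::iterator it = w.begin(); it != w.end(); ++it)
      it->second *= (1.0 - eta * gamma);
}
```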
