summary | refs | log | tree | commit | diff
path: root/dtrain
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2011-11-10 23:07:57 +0100
committer Patrick Simianer <p@simianer.de> 2011-11-10 23:07:57 +0100
commit b7e58c8f9c96417d2530be21bd00662b343d6bcd (patch)
tree cbcb890e9f3e76bd8c602af279db82b159fbf0f1 /dtrain
parent 27498c35e5be9da1e05f48b3b67a425301bf9fd4 (diff)
some more reporting in hstreaming, keep weights option
Diffstat (limited to 'dtrain')
-rw-r--r-- dtrain/README.md 82
-rw-r--r-- dtrain/dtrain.cc 52
-rw-r--r-- dtrain/dtrain.h 6
-rw-r--r-- dtrain/test/example/dtrain.ini 6
4 files changed, 120 insertions, 26 deletions
diff --git a/dtrain/README.md b/dtrain/README.md
index b1dbf481..58c6dddc 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -45,6 +45,29 @@ Uncertain, known bugs, problems
* input/grammar caching (strings -> WordIDs)
* look at forest sampling...
* devtest loo or not? why loo grammars larger? (sort psgs | uniq -> grammar)
+* lower beam size to be faster?
+* why is <unk> -100 in lm so good?
+* noise helps?
+
+random notes
+------------
+* learning rate tuned with perceptron
+* aer correlation with bleu?
+* dtrain (perc) used for some tests because no optimizer instability
+* http://www.ark.cs.cmu.edu/cdyer/dtrain/
+* repeat as often as max needed by any learner!
+* don't compare lms with diff vocab (stupid backoff paper)
+* what does mira/pro optimize?
+
+features
+--------
+* baseline features (take whatever cdec implements for VEST)
+* rule identifiers (feature name = rule as string)
+* rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data)
+* target ngrams (from nonterminals in rule rhs)
+* source-target unigrams (from word alignments used in rule extraction, if they are?)
+* lhs, rhs, rule length features
+* all other features depend on syntax annotation.
FIXME, todo
-----------
@@ -52,12 +75,26 @@ FIXME, todo
* mapred count shard sents
* mapred stats for learning curve (output weights per iter for eval on devtest)
* 250 forest sampling is real bad, bug?
-* metric reporter of bleu for each shard
+* metric reporter of bleu for each shard (reporters, status?)
+ to draw learning curves for all shards in 1 plot
* kenlm not portable (i7-2620M vs Intel(R) Xeon(R) CPU E5620 @ 2.40GHz)
* mapred chaining? hamake?
* make our sigtest work with cdec
-* l1l2 red
-* tsuroke?
+* l1l2 red (tsuroke)?
+* epsilon stopping criterion
+* normalize weight vector to get proper model scores for forest sampling
+* 108010 with gap(s), and/or fix (same score in diff groups)
+* 108010: combine model score + bleu
+* visualize weight vector
+* *100 runs stats
+* correlation of *_bleu to ibm_bleu
+* ep: open lm, cutoff @1
+* tune regs
+* 3x3 4x4 5x5 .. 10x10 until standard dev ok
+* avg weight vector for dtrain? (mira non-avg)
+* repeat lm choose with mira/pro
+* shuffle training data
+
Data
----
@@ -116,6 +153,8 @@ lm?
lm oov weight pos? -100
no tuning, -100 prob for unk EXPECT: nounk
tuning with dtrain EXPECT: open
+ =>
+ lmtest on cs.giza.loo???
[2]
cs?
@@ -167,3 +206,40 @@ variables to control
[pro]
+
+--------
+In PRO, a continually growing list of candidates is maintained for
+each sentence by concatenating k-best lists from each decoding run,
+and the training pairs are sampled from them. This is done to ensure
+that the optimizer doesn't forget about bad places in the parameter
+space that it visited previously (since some training samples will be
+selected from that space). Something like your approach should work
+well though, provided you don't overfit to the sentence pair you're
+looking at in each iteration. So I guess the question is: what are you
+doing in step 2 exactly? A complete optimization? Taking one step? The
+other thing is, do you maintain n-best hypotheses from previous
+iterations?
+
+--------
+good grammar? => ability to overfit
+ berkeley vs giza
+ not LOO
+ NO optimizer instability
+ 20+ iterations
+ approx_bleu-4
+ train on dev => test on dev
+ train on devtest => test on devtest
+ dev on dev better?
+ devtest on devtest better?
+ (train/test on loo? => lower!)
+ (test on others => real bad)
+
+
+loo vs non-loo? => generalization
+ (cs vs non-cs?)
+ giza||berkeley
+ LOO + non LOO
+ 2 fold cross validation
+ train on dev, test on devtest
+ train on devtest, test on dev
+ as above ^^^
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 4668ad66..2fe7afd7 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -24,6 +24,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("gamma", po::value<weight_t>()->default_value(0), "gamma for SVM (0 for perceptron)")
("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)")
+ ("keep_w", po::value<bool>()->zero_tokens(), "protocol weights for each iteration")
#ifdef DTRAIN_LOCAL
("refs,r", po::value<string>(), "references in local mode")
#endif
@@ -92,7 +93,12 @@ main(int argc, char** argv)
bool hstreaming = false;
if (cfg.count("hstreaming")) {
hstreaming = true;
+ quiet = true;
+ cerr.precision(17);
}
+ bool keep_w = false;
+ if (cfg.count("keep_w")) keep_w = true;
+
const unsigned k = cfg["k"].as<unsigned>();
const unsigned N = cfg["N"].as<unsigned>();
const unsigned T = cfg["epochs"].as<unsigned>();
@@ -104,7 +110,7 @@ main(int argc, char** argv)
vector<string> print_weights;
if (cfg.count("print_weights"))
boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
-
+
// setup decoder
register_feature_functions();
SetSilent(true);
@@ -151,6 +157,7 @@ main(int argc, char** argv)
weight_t eta = cfg["learning_rate"].as<weight_t>();
weight_t gamma = cfg["gamma"].as<weight_t>();
+ // output
string output_fn = cfg["output"].as<string>();
// input
string input_fn = cfg["input"].as<string>();
@@ -158,9 +165,9 @@ main(int argc, char** argv)
// buffer input for t > 0
vector<string> src_str_buf; // source strings (decoder takes only strings)
vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
- vector<string> weights_files; // remember weights for each iteration
// where temp files go
string tmp_path = cfg["tmp"].as<string>();
+ vector<string> w_tmp_files; // used for protocol_w
#ifdef DTRAIN_LOCAL
string refs_fn = cfg["refs"].as<string>();
ReadFile refs(refs_fn);
@@ -169,7 +176,7 @@ main(int argc, char** argv)
ogzstream grammar_buf_out;
grammar_buf_out.open(grammar_buf_fn.c_str());
#endif
-
+
unsigned in_sz = UINT_MAX; // input index, input size
vector<pair<score_t, score_t> > all_scores;
score_t max_score = 0.;
@@ -206,6 +213,8 @@ main(int argc, char** argv)
for (unsigned t = 0; t < T; t++) // T epochs
{
+ if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl;
+
time_t start, end;
time(&start);
#ifndef DTRAIN_LOCAL
@@ -231,7 +240,7 @@ main(int argc, char** argv)
if (stop_after > 0 && stop_after == ii && !next) stop = true;
// produce some pretty output
- if (!hstreaming && !quiet && !verbose) {
+ if (!quiet && !verbose) {
if (ii == 0) cerr << " ";
if ((ii+1) % (DTRAIN_DOTS) == 0) {
cerr << ".";
@@ -375,10 +384,12 @@ main(int argc, char** argv)
++ii;
- if (hstreaming) cerr << "reporter:counter:dtrain,sid," << ii << endl;
+ if (hstreaming) cerr << "reporter:counter:dtrain,count,1" << endl;
} // input loop
+ if (hstreaming && t == 0) cerr << "reporter:counter:dtrain,|input|," << ii+1 << endl;
+
if (scorer_str == "approx_bleu") scorer->Reset();
if (t == 0) {
@@ -404,6 +415,11 @@ main(int argc, char** argv)
score_diff = score_avg;
model_diff = model_avg;
}
+ if (hstreaming) {
+ cerr << "reporter:counter:dtrain,score avg it " << t+1 << "," << score_avg << endl;
+ cerr << "reporter:counter:dtrain,model avg it " << t+1 << "," << model_avg << endl;
+ }
+
if (!quiet) {
cerr << _p5 << _p << "WEIGHTS" << endl;
for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
@@ -439,12 +455,10 @@ main(int argc, char** argv)
if (noup) break;
// write weights to file
- if (select_weights == "best") {
- string infix = "dtrain-weights-" + boost::lexical_cast<string>(t);
+ if (select_weights == "best" || keep_w) {
lambdas.init_vector(&dense_weights);
- string w_fn = gettmpf(tmp_path, infix, "gz");
+ string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz";
Weights::WriteToFile(w_fn, dense_weights, true);
- weights_files.push_back(w_fn);
}
} // outer loop
@@ -467,18 +481,19 @@ main(int argc, char** argv)
} else if (select_weights == "VOID") { // do nothing with the weights
} else { // best
if (output_fn != "-") {
- CopyFile(weights_files[best_it], output_fn); // always gzipped
+ CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn);
} else {
- ReadFile bestw(weights_files[best_it]);
+ ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz");
string o;
cout.precision(17);
cout << _np;
while(getline(*bestw, o)) cout << o << endl;
}
- for (vector<string>::iterator it = weights_files.begin(); it != weights_files.end(); ++it) {
- unlink(it->c_str());
- it->erase(it->end()-3, it->end());
- unlink(it->c_str());
+ if (!keep_w) {
+ for (unsigned i = 0; i < T; i++) {
+ string s = "weights." + boost::lexical_cast<string>(i) + ".gz";
+ unlink(s.c_str());
+ }
}
}
if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl;
@@ -491,6 +506,13 @@ main(int argc, char** argv)
cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
}
+ if (keep_w) {
+ cout << endl << "Weight files per iteration:" << endl;
+ for (unsigned i = 0; i < w_tmp_files.size(); i++) {
+ cout << w_tmp_files[i] << endl;
+ }
+ }
+
return 0;
}
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 6c9decf4..6742f343 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -30,17 +30,13 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
ids.push_back(TD::Convert(*it));
}
-inline string gettmpf(const string path, const string infix, const string suffix="") {
+inline string gettmpf(const string path, const string infix) {
char fn[1024];
strcpy(fn, path.c_str());
strcat(fn, "/");
strcat(fn, infix.c_str());
strcat(fn, "-XXXXXX");
mkstemp(fn);
- if (suffix != "") { // we will get 2 files
- strcat(fn, ".");
- strcat(fn, suffix.c_str());
- }
return string(fn);
}
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 40f8e03f..e8a20759 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,13 +1,13 @@
decoder_config=test/example/cdec.ini
k=100
N=3
-gamma=0.001
+gamma=0
epochs=20
input=test/example/nc-1k-tabs.gz
scorer=stupid_bleu
output=weights.gz
-#stop_after=100
-sample_from=forest
+stop_after=10
+sample_from=kbest
pair_sampling=108010
select_weights=VOID
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough