From 8700cf32a68c546904f624ab6cd5b112fb3652af Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Tue, 1 Nov 2011 19:51:18 +0100 Subject: README again --- dtrain/README.md | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) (limited to 'dtrain') diff --git a/dtrain/README.md b/dtrain/README.md index ea9997ee..b1dbf481 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -33,6 +33,8 @@ Ideas * use separate *TEST SET* for each shard * *REDUCE* training set (50k?) * *SYNTAX* features (CD) +* distribute *DEV* set to all nodes, avg + Uncertain, known bugs, problems ------------------------------- @@ -46,25 +48,31 @@ Uncertain, known bugs, problems FIXME, todo ----------- -* merge dtrain part-X files, for better blocks +* merge dtrain part-X files, for better blocks (how to do this with 4.5tb ep) * mapred count shard sents +* mapred stats for learning curve (output weights per iter for eval on devtest) * 250 forest sampling is real bad, bug? * metric reporter of bleu for each shard * kenlm not portable (i7-2620M vs Intel(R) Xeon(R) CPU E5620 @ 2.40GHz) * mapred chaining? hamake? +* make our sigtest work with cdec +* l1l2 red +* tsuroke? Data ----
-nc-v6.de-en peg -nc-v6.de-en.loo peg -nc-v6.de-en.giza.loo peg -nc-v6.de-en.symgiza.loo peg -nv-v6.de-en.cs peg -nc-v6.de-en.cs.loo peg +nc-v6.de-en apegd +nc-v6.de-en.loo apegd +nc-v6.de-en.giza apegd +nc-v6.de-en.giza.loo apegd +nc-v6.de-en.cs.giza apegd +nc-v6.de-en.cs.giza.loo apegd +nv-v6.de-en.cs apegd +nc-v6.de-en.cs.loo apegd -- -ep-v6.de-en.cs pe -ep-v6.de-en.cs.loo p +ep-v6.de-en.cs apegd +ep-v6.de-en.cs.loo apegd a: alignment:, p: prep, e: extract, g: grammar, d: dtrain @@ -82,7 +90,7 @@ Experiments lm stats oov on dev/devtest/test - perplex on train/dev/devtest/test] + perplex on train/dev/devtest/test?] [0] which word alignment? @@ -96,6 +104,7 @@ which word alignment? run dtrain for 100 iterations w/o all other feats (lm, wp, ...) +Glue measure ibm bleu on exact same sents + ep -> berkeleyaligner ??? (mb per sent, rules per sent) [1] lm? @@ -126,6 +135,7 @@ stability dtrain: 100 [undecided] +do we even need loo for ep? pro metaparam (max) iter regularization @@ -142,4 +152,18 @@ features to try SpanFeatures -> http://www.cs.cmu.edu/~cdyer/wmt11-sysdesc.pdf ArityPenalty -> Arity=0 Arity=1 and Arity=2 +--- +variables to control + +[alignment] + +[lm] + +[vest] + +[mira] + +[dtrain] + +[pro] -- cgit v1.2.3