From 0c28b8dc375722c631486377217c6c8a6a362b5a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sat, 29 Oct 2011 19:56:55 +0200 Subject: README --- dtrain/README.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++--------- dtrain/dtrain.cc | 2 +- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/dtrain/README.md b/dtrain/README.md index bc96ed18..3d09393c 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -45,7 +45,10 @@ FIXME ----- merge dtrain part-* files mapred count shard sents - +250 forest sampling is real bad, bug? +kenlm not portable (i7-2620M vs Intel(R) Xeon(R) CPU E5620 @ 2.40GHz) +metric reporter of bleu for each shard +mapred chaining? hamake? Data ---- @@ -66,16 +69,57 @@ p: prep, e: extract, g: grammar, d: dtrain Experiments ----------- -features - TODO +grammar stats + oov on dev/devtest/test + size + #rules (uniq) + time for building + ep: 1.5 days on 278 slots (30 nodes) + nc: ~2 hours ^^^ + +lm stats + oov on dev/devtest/test + perplex on train/dev/devtest/test + +which word alignment? + berkeleyaligner + giza++ as of Sep 24 2011, mgizapp 0.6.3 + symgiza as of Oct 1 2011 + --- + randomly sample 100 from train.loo + run mira/dtrain for 50/60 iterations + w/o lm, wp + measure ibm_bleu on exact same sents + +stability + mira: 100 + pro: 100 + vest: 100 + dtrain: 100 + + +pro metaparam + (max) iter + regularization + +mira metaparam + (max) iter: 10 (nc???) vs 15 (ep???) -"lm open better than lm closed when tuned" +lm? + 3-4-5 + open + unk + nounk (-100 for unk) + -- + tune or not??? + lm oov weight pos? 
-mira100-10 -mira100-17 +features to try + NgramFeatures -> target side ngrams + RuleIdentityFeatures + RuleNgramFeatures -> source side ngrams from rule + RuleShape -> relative orientation of X's and terminals + SpanFeatures -> http://www.cs.cmu.edu/~cdyer/wmt11-sysdesc.pdf + ArityPenalty -> Arity=0 Arity=1 and Arity=2 -baselines - mira - pro - vest diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 277d4e14..c4f6607d 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -25,7 +25,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") ("select_weights", po::value<string>()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)") #ifdef DTRAIN_LOCAL - ("refs,r", po::value<string>(), "references for local mode") + ("refs,r", po::value<string>(), "references in local mode") #endif ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); -- cgit v1.2.3