From f99ba621e5bd10b069b453d11b3b4981dc482b6c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 13 Nov 2011 22:12:46 +0100 Subject: new reducer: active on all tasks --- dtrain/README.md | 38 +++++++++++++++++++++++++++---- dtrain/hstreaming/nc-wmt11.en.srilm.3.gz | Bin 12173238 -> 0 bytes dtrain/hstreaming/red-all.rb | 26 +++++++++++++++++++++ dtrain/hstreaming/red-avg.rb | 9 ++++---- dtrain/hstreaming/red-test | 1 + 5 files changed, 65 insertions(+), 9 deletions(-) delete mode 100644 dtrain/hstreaming/nc-wmt11.en.srilm.3.gz create mode 100755 dtrain/hstreaming/red-all.rb diff --git a/dtrain/README.md b/dtrain/README.md index faedf8a7..46f783b0 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -34,6 +34,7 @@ Ideas * *REDUCE* training set (50k?) * *SYNTAX* features (CD) * distribute *DEV* set to all nodes, avg +* *PARAPHRASES* for better approx BLEU? Uncertain, known bugs, problems @@ -47,10 +48,11 @@ Uncertain, known bugs, problems * devtest loo or not? why loo grammars larger? (sort psgs | uniq -> grammar) * lower beam size to be faster? * why is -100 in lm so good? -* noise helps? +* noise helps for discriminative training? * what does srilm do with -unk but nothing mapped to unk ( unigram)? => this: http://www-speech.sri.com/pipermail/srilm-user/2007q4/000543.html -* mira translation sampling? +* mira translation sampling? => done +* does AER correlate with BLEU? random notes ------------ @@ -61,16 +63,25 @@ random notes * repeat as often as max needed by any learner! * don't compare lms with diff vocab (stupid backoff paper) * what does mira/pro optimize? +* early stopping +* 10-20k rules per sent normal +* shard size 500 -> 2k +* giza vs. berkeleyaligner: giza less noise? +* compound splitting -> more rules? +* loo => ref can't be reached? (jackknifing) +* prune singletons -> less noise? (do I do this?) 
+* random sample: take 100 at random features -------- * baseline features (take whatever cdec implements for VEST) * rule identifiers (feature name = rule as string) -* rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data) -* target ngrams (from nonterminals in rule rhs) +* rule discounts (taken from frequency i or frequency interval [i,j] of rule in extraction from parallel training data) bins +* target ngrams (from nonterminals in rule rhs), with gaps? * source-target unigrams (from word alignments used in rule extraction, if they are?) * lhs, rhs, rule length features * all other features depend on syntax annotation. +* word alignment FIXME, todo ----------- @@ -93,10 +104,15 @@ FIXME, todo * correlation of *_bleu to ibm_bleu * ep: open lm, cutoff @1 * tune regs -* 3x3 4x4 5x5 .. 10x10 until standard dev ok +* 3x3 4x4 5x5 .. 10x10 until standard dev ok, moving avg * avg weight vector for dtrain? (mira non-avg) * repeat lm choose with mira/pro * shuffle training data +* learning rate dynamic (Duh? Tsuruoka?) +* divide updates by ? +* mira: 5/10/15, pro: (5)/10/20/30 (on devtest!) +* sample pairs like in pro +* mira forest sampling Data @@ -146,6 +162,8 @@ which word alignment? measure ibm bleu on exact same sents ep -> berkeleyaligner ??? (mb per sent, rules per sent) +*100 -> triples, quadruples + [1] lm? 3-4-5 @@ -194,6 +212,16 @@ features to try SpanFeatures -> http://www.cs.cmu.edu/~cdyer/wmt11-sysdesc.pdf ArityPenalty -> Arity=0 Arity=1 and Arity=2 +--- +shard size: 500-2k +iterations, re-iterate (shuffle w): 10 +gamma, eta +SVM, perceptron +reducer: avg (feats/shard), l1l2, active on all shards +sentence sampling: forest +pair sampling: all, rand, 108010 (sort), PRO +out of domain test? 
+ --- variables to control diff --git a/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz deleted file mode 100644 index 5a50f8fb..00000000 Binary files a/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz and /dev/null differ diff --git a/dtrain/hstreaming/red-all.rb b/dtrain/hstreaming/red-all.rb new file mode 100755 index 00000000..bbc65945 --- /dev/null +++ b/dtrain/hstreaming/red-all.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby1.9.1 + + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1 +end + +puts "# dtrain reducer: active on all" +shard_count = w["__SHARD_COUNT__"] +puts "shard count #{shard_count}" +w.each_key { |k| + if k == shard_count_key then next end + if c[k] == shard_count then puts "#{k}\t#{w[k]/shard_count}" end +} + diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb index 048128f5..771f4c0e 100755 --- a/dtrain/hstreaming/red-avg.rb +++ b/dtrain/hstreaming/red-avg.rb @@ -1,10 +1,11 @@ #!/usr/bin/env ruby1.9.1 -STDIN.set_encoding 'utf-8' - shard_count_key = "__SHARD_COUNT__" +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + w = {} c = {} w.default = 0 @@ -12,11 +13,11 @@ c.default = 0 while line = STDIN.gets key, val = line.split /\t/ w[key] += val.to_f - c[key] += 1.0 + c[key] += 1 end +puts "# dtrain reducer: average" shard_count = w["__SHARD_COUNT__"] - w.each_key { |k| if k == shard_count_key then next end puts "#{k}\t#{w[k]/shard_count}" diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test index b86e7894..a2a0edb1 100644 --- a/dtrain/hstreaming/red-test +++ b/dtrain/hstreaming/red-test @@ -4,4 +4,5 @@ c 3.5 a 1 b 2 c 3.5 +d 1 __SHARD_COUNT__ 2 -- cgit v1.2.3