diff options
author | Patrick Simianer <p@simianer.de> | 2011-09-27 00:14:27 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-09-27 00:14:27 +0200 |
commit | c04c964b2a3c4e0f0c5a85acad5404fa6b7fb976 (patch) | |
tree | e7e35f0e1193d61c82140b6d06ab469f15367dc3 /dtrain/hstreaming | |
parent | 63fe5ea76c52f76a924f1d3df9f6bff6a2c0d93d (diff) | |
parent | 40c884b3d74a1779be80974fc6fc926b0812813c (diff) |
more streaming
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-x | dtrain/hstreaming/avgweights.rb | 27 | ||||
-rw-r--r-- | dtrain/hstreaming/cdec.ini | 4 | ||||
-rw-r--r-- | dtrain/hstreaming/dtrain.ini | 9 | ||||
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 11 | ||||
-rw-r--r-- | dtrain/hstreaming/nc-wmt11.en.srilm.3.gz | bin | 0 -> 12173238 bytes | |||
-rwxr-xr-x | dtrain/hstreaming/red-avg.rb | 26 |
6 files changed, 39 insertions, 38 deletions
diff --git a/dtrain/hstreaming/avgweights.rb b/dtrain/hstreaming/avgweights.rb deleted file mode 100755 index d5cfaa4d..00000000 --- a/dtrain/hstreaming/avgweights.rb +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -c = {} -w.default = 0 -c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini index cc0586d6..bea54afe 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/dtrain/hstreaming/cdec.ini @@ -1,7 +1,7 @@ formalism=scfg add_pass_through_rules=true -feature_function=WordPenalty cubepruning_pop_limit=30 +scfg_max_span_limit=15 +feature_function=WordPenalty feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz feature_function=RuleIdentityFeatures -scfg_max_span_limit=15 diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini index c7ce7f44..bb594653 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini @@ -1,7 +1,10 @@ decoder_config=cdec.ini -kbest=100 -ngrams=4 +k=100 +N=4 epochs=10 input=- -scorer=stupid_bleu output=- +scorer=stupid_bleu +sample_from=forest +pair_sampling=rand +tmp=/var/hadoop/mapred/local diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 2cf3f50a..9da0a6c3 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -5,16 +5,15 @@ JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m -OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m +OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m-NEW $HSTREAMING \ - -mapper "dtrain.sh" \ - -reducer "avgweights.rb" \ + -mapper "dtrain -c dtrain.ini --hstreaming" \ + -reducer "red-avg.rb" \ -input $IN \ -output $OUT \ - -file avgweights.rb \ - -file dtrain.sh \ - -file dtrain \ + -file red-avg.rb \ + -file ../dtrain \ -file dtrain.ini \ -file cdec.ini \ -file nc-wmt11.en.srilm.3.gz \ diff --git a/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz Binary files differnew file mode 100644 index 00000000..5a50f8fb --- /dev/null +++ b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb new file mode 100755 index 00000000..11dc0d71 --- /dev/null +++ b/dtrain/hstreaming/red-avg.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby1.9.1 + + +STDIN.set_encoding 'utf-8' + +shard_count_key = "__SHARD_COUNT__" + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1.0 +end + +shard_count = w["__SHARD_COUNT__"] + +w.each_key { |k| + if k == shard_count_key then next end + puts "#{k}\t{w[k]/shard_count}" +} + +puts "#{shard_count_key}\t#{w[shard_count_key]}" + |