diff options
author | Patrick Simianer <p@simianer.de> | 2011-09-23 20:53:15 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-09-23 20:53:15 +0200 |
commit | 9bde56ed23b4b97f8193f9f8f582f18086ff17c1 (patch) | |
tree | 83bd5687f2069405537f7f8fbdfbe208a634ca54 /dtrain/hstreaming | |
parent | 4433886ac335e6db7ded081b5ef673490ee27718 (diff) |
begin refactoring
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-x | dtrain/hstreaming/avgweights.rb | 27 | ||||
-rw-r--r-- | dtrain/hstreaming/cdec.ini | 8 | ||||
-rw-r--r-- | dtrain/hstreaming/dtrain.ini | 10 | ||||
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 23 |
4 files changed, 68 insertions, 0 deletions
```diff
diff --git a/dtrain/hstreaming/avgweights.rb b/dtrain/hstreaming/avgweights.rb
new file mode 100755
index 00000000..d5cfaa4d
--- /dev/null
+++ b/dtrain/hstreaming/avgweights.rb
@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby1.9.1
+
+
+STDIN.set_encoding 'utf-8'
+
+#shard_count_key = "__SHARD_COUNT__"
+
+w = {}
+c = {}
+w.default = 0
+c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ c[key] += 1.0
+end
+
+#shard_count = w["__SHARD_COUNT__"]
+
+w.each_key { |k|
+ #if k == shard_count_key then next end
+ #if k == "__bias" then next end
+ puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}"
+}
+
+#puts "#{shard_count_key}\t#{w[shard_count_key]}"
+
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
new file mode 100644
index 00000000..0d32f0b7
--- /dev/null
+++ b/dtrain/hstreaming/cdec.ini
@@ -0,0 +1,8 @@
+formalism=scfg
+add_pass_through_rules=true
+feature_function=WordPenalty
+cubepruning_pop_limit=30
+feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
+feature_function=RuleIdentityFeatures
+scfg_max_span_limit=15
+
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
new file mode 100644
index 00000000..079d7d69
--- /dev/null
+++ b/dtrain/hstreaming/dtrain.ini
@@ -0,0 +1,10 @@
+decoder_config=cdec.ini
+kbest=100
+ngrams=4
+epochs=10
+input=-
+scorer=stupid_bleu
+output=-
+#stop_after=100
+#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
+
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
new file mode 100755
index 00000000..2cf3f50a
--- /dev/null
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
+OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
+
+$HSTREAMING \
+ -mapper "dtrain.sh" \
+ -reducer "avgweights.rb" \
+ -input $IN \
+ -output $OUT \
+ -file avgweights.rb \
+ -file dtrain.sh \
+ -file dtrain \
+ -file dtrain.ini \
+ -file cdec.ini \
+ -file nc-wmt11.en.srilm.3.gz \
+ -jobconf mapred.reduce.tasks=1 \
+ -jobconf mapred.max.map.failures.percent=100
+
```