diff options
Diffstat (limited to 'dtrain/hstreaming')
-rw-r--r-- | dtrain/hstreaming/cdec.ini | 7 | ||||
-rw-r--r-- | dtrain/hstreaming/dtrain.ini | 10 | ||||
-rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 8 | ||||
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 26 | ||||
-rw-r--r-- | dtrain/hstreaming/nc-wmt11.en.srilm.3.gz | bin | 0 -> 12173238 bytes | |||
-rwxr-xr-x | dtrain/hstreaming/red-avg.rb | 24 | ||||
-rw-r--r-- | dtrain/hstreaming/red-test | 7 |
7 files changed, 82 insertions, 0 deletions
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
new file mode 100644
index 00000000..bea54afe
--- /dev/null
+++ b/dtrain/hstreaming/cdec.ini
@@ -0,0 +1,7 @@
+formalism=scfg
+add_pass_through_rules=true
+cubepruning_pop_limit=30
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
+feature_function=RuleIdentityFeatures
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
new file mode 100644
index 00000000..708bbe46
--- /dev/null
+++ b/dtrain/hstreaming/dtrain.ini
@@ -0,0 +1,10 @@
+decoder_config=cdec.ini
+k=100
+N=4
+epochs=10
+input=-
+output=-
+scorer=stupid_bleu
+sample_from=forest
+pair_sampling=all
+tmp=/var/hadoop/mapred/local
diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh
new file mode 100755
index 00000000..6d34012a
--- /dev/null
+++ b/dtrain/hstreaming/dtrain.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+pushd .
+cd ..
+ID=$(basename $(pwd))
+popd
+./dtrain -c dtrain.ini --hstreaming $ID
+
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
new file mode 100755
index 00000000..788c9fd1
--- /dev/null
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+ID=
+EXP=test
+
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+IN=nc-v6.de-en/nc-v6.de-en-dtrain.1500m
+OUT=nc-v6.de-en/nc-v6.de-en-dtrain.1500m-weights
+
+$HSTREAMING \
+ -mapper "dtrain -c dtrain.ini --hstreaming" \
+ -reducer "red-avg.rb" \
+ -input $IN \
+ -output $OUT \
+ -file red-avg.rb \
+ -file ../dtrain \
+ -file dtrain.ini \
+ -file cdec.ini \
+ -file nc-wmt11.en.srilm.3.gz \
+ -jobconf mapred.reduce.tasks=1 \
+ -jobconf mapred.max.map.failures.percent=100 \
+ -jobconf mapred.job.name="dtrain $ID $EXP"
+
diff --git a/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz
Binary files differ
new file mode 100644
index 00000000..5a50f8fb
--- /dev/null
+++ b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz
diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb
new file mode 100755
index 00000000..048128f5
--- /dev/null
+++ b/dtrain/hstreaming/red-avg.rb
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby1.9.1
+
+
+STDIN.set_encoding 'utf-8'
+
+shard_count_key = "__SHARD_COUNT__"
+
+w = {}
+c = {}
+w.default = 0
+c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ c[key] += 1.0
+end
+
+shard_count = w["__SHARD_COUNT__"]
+
+w.each_key { |k|
+ if k == shard_count_key then next end
+ puts "#{k}\t#{w[k]/shard_count}"
+}
+
diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test
new file mode 100644
index 00000000..b86e7894
--- /dev/null
+++ b/dtrain/hstreaming/red-test
@@ -0,0 +1,7 @@
+a 1
+b 2
+c 3.5
+a 1
+b 2
+c 3.5
+__SHARD_COUNT__ 2