diff options
Diffstat (limited to 'dtrain/hstreaming/hadoop-streaming-job.sh')
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 23 |
1 files changed, 14 insertions, 9 deletions
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 4c0238f3..90c2b790 100755
--- a/dtrain/hstreaming/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -1,26 +1,31 @@
-#!/bin/bash
+#!/bin/sh
 
-EXP=test
+EXP=a_simple_test
 
+# change these vars to fit your hadoop installation
 HADOOP_HOME=/usr/lib/hadoop-0.20
 JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
 HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+# ^^^
 
- IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2
-OUT=out/$EXP-weights
+ IN=input_on_hdfs
+OUT=output_weights_on_hdfs
 
+# you can remove the -reducer line if you want to
+# do feature selection/averaging locally (e.g. to
+# keep weights of the iterations)
 $HSTREAMING \
   -mapper "dtrain.sh" \
-  -reducer "red-avg.rb" \
+  -reducer "lplp.rb l2 select_k 100000" \
   -input $IN \
   -output $OUT \
   -file dtrain.sh \
-  -file red-avg.rb \
-  -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \
+  -file lplp.rb \
+  -file ../dtrain \
   -file dtrain.ini \
   -file cdec.ini \
-  -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \
-  -jobconf mapred.reduce.tasks=1 \
+  -file ../test/example/nc-wmt11.en.srilm.gz \
+  -jobconf mapred.reduce.tasks=30 \
   -jobconf mapred.max.map.failures.percent=0 \
   -jobconf mapred.job.name="dtrain $EXP"
 