diff options
| field | value | date |
|---|---|---|
| author | Patrick Simianer <p@simianer.de> | 2012-03-13 09:15:46 +0100 |
| committer | Patrick Simianer <p@simianer.de> | 2012-03-13 09:15:46 +0100 |
| commit | 867bca3e5fa0cdd63bf032e5859fb5092d9a4ca1 (patch) | |
| tree | 8504e2343429e8063ff645a6f7e30dd02df02c2a /dtrain/hstreaming | |
| parent | 07ab389e9f4034faebc2ad50146456223f5ab6bf (diff) | |
polish
Diffstat (limited to 'dtrain/hstreaming')
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | dtrain/hstreaming/avg.rb | 2 |
| -rw-r--r-- | dtrain/hstreaming/cdec.ini | 2 |
| -rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 23 |
| -rw-r--r-- | dtrain/hstreaming/rule_count/red.rb | 2 |
| -rw-r--r-- | dtrain/hstreaming/rule_count/rulecount.rb | 2 |
5 files changed, 20 insertions, 11 deletions
```diff
diff --git a/dtrain/hstreaming/avg.rb b/dtrain/hstreaming/avg.rb
index e0899144..91d4e29a 100755
--- a/dtrain/hstreaming/avg.rb
+++ b/dtrain/hstreaming/avg.rb
@@ -1,4 +1,4 @@
-# avg.rb
+#!/usr/bin/env ruby
 
 shard_count_key = "__SHARD_COUNT__"
 
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
index ce1e1ae2..61f13e86 100644
--- a/dtrain/hstreaming/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
@@ -4,7 +4,7 @@ scfg_max_span_limit=15
 intersection_strategy=cube_pruning
 cubepruning_pop_limit=200
 feature_function=WordPenalty
-feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+feature_function=KLanguageModel nc-wmt11.en.srilm.gz
 #feature_function=ArityPenalty
 #feature_function=CMR2008ReorderingFeatures
 #feature_function=InputIndicator
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 4c0238f3..90c2b790 100755
--- a/dtrain/hstreaming/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -1,26 +1,31 @@
-#!/bin/bash
+#!/bin/sh
 
-EXP=test
+EXP=a_simple_test
 
+# change these vars to fit your hadoop installation
 HADOOP_HOME=/usr/lib/hadoop-0.20
 JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
 HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+# ^^^
 
- IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2
-OUT=out/$EXP-weights
+ IN=input_on_hdfs
+OUT=output_weights_on_hdfs
 
+# you can remove the -reducer line if you want to
+# do feature selection/averaging locally (e.g. to
+# keep weights of the iterations)
 $HSTREAMING \
  -mapper "dtrain.sh" \
- -reducer "red-avg.rb" \
+ -reducer "lplp.rb l2 select_k 100000" \
  -input $IN \
  -output $OUT \
  -file dtrain.sh \
- -file red-avg.rb \
- -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \
+ -file lplp.rb \
+ -file ../dtrain \
  -file dtrain.ini \
  -file cdec.ini \
- -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \
- -jobconf mapred.reduce.tasks=1 \
+ -file ../test/example/nc-wmt11.en.srilm.gz \
+ -jobconf mapred.reduce.tasks=30 \
  -jobconf mapred.max.map.failures.percent=0 \
  -jobconf mapred.job.name="dtrain $EXP"
 
diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb
index 8f9109cc..874ae7ac 100644
--- a/dtrain/hstreaming/rule_count/red.rb
+++ b/dtrain/hstreaming/rule_count/red.rb
@@ -1,3 +1,5 @@
+#!/usr/bin/env ruby
+
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb
index 035bdf06..67361fa4 100644
--- a/dtrain/hstreaming/rule_count/rulecount.rb
+++ b/dtrain/hstreaming/rule_count/rulecount.rb
@@ -1,3 +1,5 @@
+#!/usr/bin/env ruby
+
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
```