summary refs log tree commit diff
path: root/dtrain/hstreaming
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2012-03-13 09:15:46 +0100
committer Patrick Simianer <p@simianer.de> 2012-03-13 09:15:46 +0100
commit 10a232656a0c882b3b955d2bcfac138ce11e8a2e (patch)
tree 134e2637908cd85b3548d68ac8590f3aad8d1c49 /dtrain/hstreaming
parent e77078e31cd75f0e5983d332b990809a3644b0fb (diff)
polish
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-x dtrain/hstreaming/avg.rb 2
-rw-r--r-- dtrain/hstreaming/cdec.ini 2
-rwxr-xr-x dtrain/hstreaming/hadoop-streaming-job.sh 23
-rw-r--r-- dtrain/hstreaming/rule_count/red.rb 2
-rw-r--r-- dtrain/hstreaming/rule_count/rulecount.rb 2
5 files changed, 20 insertions, 11 deletions
diff --git a/dtrain/hstreaming/avg.rb b/dtrain/hstreaming/avg.rb
index e0899144..91d4e29a 100755
--- a/dtrain/hstreaming/avg.rb
+++ b/dtrain/hstreaming/avg.rb
@@ -1,4 +1,4 @@
-# avg.rb
+#!/usr/bin/env ruby
shard_count_key = "__SHARD_COUNT__"
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
index ce1e1ae2..61f13e86 100644
--- a/dtrain/hstreaming/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
@@ -4,7 +4,7 @@ scfg_max_span_limit=15
intersection_strategy=cube_pruning
cubepruning_pop_limit=200
feature_function=WordPenalty
-feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+feature_function=KLanguageModel nc-wmt11.en.srilm.gz
#feature_function=ArityPenalty
#feature_function=CMR2008ReorderingFeatures
#feature_function=InputIndicator
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 4c0238f3..90c2b790 100755
--- a/dtrain/hstreaming/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -1,26 +1,31 @@
-#!/bin/bash
+#!/bin/sh
-EXP=test
+EXP=a_simple_test
+# change these vars to fit your hadoop installation
HADOOP_HOME=/usr/lib/hadoop-0.20
JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+# ^^^
- IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2
-OUT=out/$EXP-weights
+ IN=input_on_hdfs
+OUT=output_weights_on_hdfs
+# you can remove the -reducer line if you want to
+# do feature selection/averaging locally (e.g. to
+# keep weights of the iterations)
$HSTREAMING \
-mapper "dtrain.sh" \
- -reducer "red-avg.rb" \
+ -reducer "lplp.rb l2 select_k 100000" \
-input $IN \
-output $OUT \
-file dtrain.sh \
- -file red-avg.rb \
- -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \
+ -file lplp.rb \
+ -file ../dtrain \
-file dtrain.ini \
-file cdec.ini \
- -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \
- -jobconf mapred.reduce.tasks=1 \
+ -file ../test/example/nc-wmt11.en.srilm.gz \
+ -jobconf mapred.reduce.tasks=30 \
-jobconf mapred.max.map.failures.percent=0 \
-jobconf mapred.job.name="dtrain $EXP"
diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb
index 8f9109cc..874ae7ac 100644
--- a/dtrain/hstreaming/rule_count/red.rb
+++ b/dtrain/hstreaming/rule_count/red.rb
@@ -1,3 +1,5 @@
+#!/usr/bin/env ruby
+
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb
index 035bdf06..67361fa4 100644
--- a/dtrain/hstreaming/rule_count/rulecount.rb
+++ b/dtrain/hstreaming/rule_count/rulecount.rb
@@ -1,3 +1,5 @@
+#!/usr/bin/env ruby
+
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'