summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2011-11-28 11:34:41 +0100
committer: Patrick Simianer <p@simianer.de> 2011-11-28 11:34:41 +0100
commit: b6a8fdc981daa3b01826a3be4c2355ee5cd61f0a (patch)
tree: 7565cbd3cded132a15961571f6ed820bd5e2566c
parent: dd9ed2de7d5f9e3e665e532d73660aa5276680df (diff)
hstreaming stuff
-rw-r--r-- dtrain/hstreaming/cdec.ini 17
-rw-r--r-- dtrain/hstreaming/dtrain.ini 17
-rwxr-xr-x dtrain/hstreaming/dtrain.sh 4
-rwxr-xr-x dtrain/hstreaming/hadoop-streaming-job.sh 16
-rwxr-xr-x dtrain/hstreaming/red-avg.rb 10
-rw-r--r-- dtrain/hstreaming/red-test 1
-rw-r--r-- dtrain/test/example/cdec.ini 27
7 files changed, 56 insertions, 36 deletions
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
index 5afa89a9..ce1e1ae2 100644
--- a/dtrain/hstreaming/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
@@ -4,5 +4,18 @@ scfg_max_span_limit=15
intersection_strategy=cube_pruning
cubepruning_pop_limit=200
feature_function=WordPenalty
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
+feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 708bbe46..118a27c5 100644
--- a/dtrain/hstreaming/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
@@ -1,10 +1,15 @@
+input=-
+output=-
decoder_config=cdec.ini
+tmp=/var/hadoop/mapred/local/
+epochs=10
k=100
N=4
-epochs=10
-input=-
-output=-
+learning_rate=0.0001
+gamma=0.00001
scorer=stupid_bleu
-sample_from=forest
-pair_sampling=all
-tmp=/var/hadoop/mapred/local
+sample_from=kbest
+filter=uniq
+pair_sampling=108010
+pair_threshold=0
+select_weights=last
diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh
index b6847591..ea0276dd 100755
--- a/dtrain/hstreaming/dtrain.sh
+++ b/dtrain/hstreaming/dtrain.sh
@@ -1,8 +1,8 @@
#!/bin/bash
-pushd .
+pushd . &>/dev/null
cd ..
ID=$(basename $(pwd)) # attempt_...
-popd
+popd &>/dev/null
./dtrain -c dtrain.ini --hstreaming $ID
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 788c9fd1..4c0238f3 100755
--- a/dtrain/hstreaming/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -1,26 +1,26 @@
#!/bin/bash
-ID=
EXP=test
HADOOP_HOME=/usr/lib/hadoop-0.20
JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-IN=nc-v6.de-en/nc-v6.de-en-dtrain.1500m
-OUT=nc-v6.de-en/nc-v6.de-en-dtrain.1500m-weights
+ IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2
+OUT=out/$EXP-weights
$HSTREAMING \
- -mapper "dtrain -c dtrain.ini --hstreaming" \
+ -mapper "dtrain.sh" \
-reducer "red-avg.rb" \
-input $IN \
-output $OUT \
+ -file dtrain.sh \
-file red-avg.rb \
- -file ../dtrain \
+ -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \
-file dtrain.ini \
-file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
+ -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \
-jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100 \
- -jobconf mapred.job.name="dtrain $ID $EXP"
+ -jobconf mapred.max.map.failures.percent=0 \
+ -jobconf mapred.job.name="dtrain $EXP"
diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb
index 771f4c0e..c0b69eb4 100755
--- a/dtrain/hstreaming/red-avg.rb
+++ b/dtrain/hstreaming/red-avg.rb
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby1.9.1
-
shard_count_key = "__SHARD_COUNT__"
STDIN.set_encoding 'utf-8'
@@ -8,7 +7,7 @@ STDOUT.set_encoding 'utf-8'
w = {}
c = {}
-w.default = 0
+w.default = 0.
c.default = 0
while line = STDIN.gets
key, val = line.split /\t/
@@ -19,7 +18,10 @@ end
puts "# dtrain reducer: average"
shard_count = w["__SHARD_COUNT__"]
w.each_key { |k|
- if k == shard_count_key then next end
- puts "#{k}\t#{w[k]/shard_count}"
+ if k == shard_count_key
+ puts "# shard count: #{shard_count.to_i}"
+ else
+ puts "#{k}\t#{w[k]/shard_count}\t# #{c[k]}"
+ end
}
diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test
index a2a0edb1..2623d697 100644
--- a/dtrain/hstreaming/red-test
+++ b/dtrain/hstreaming/red-test
@@ -5,4 +5,5 @@ a 1
b 2
c 3.5
d 1
+e 2
__SHARD_COUNT__ 2
diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini
index 14c1199b..d88779fa 100644
--- a/dtrain/test/example/cdec.ini
+++ b/dtrain/test/example/cdec.ini
@@ -5,18 +5,17 @@ intersection_strategy=cube_pruning
cubepruning_pop_limit=30
feature_function=WordPenalty
feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
-feature_function=RuleIdentityFeatures
-# these also work with scfg translator
-#feature_function=SpanFeatures
-#feature_function=SourceWordPenalty
-#feature_function=SourceSpanSizeFeatures
-#feature_function=RuleShape
-#feature_function=RuleNgramFeatures
-#feature_function=OutputIndicator
-#feature_function=NonLatinCount
-#feature_function=NgramFeatures
-#feature_function=NewJump
-#feature_function=LexNullJump
-#feature_function=InputIndicator
-#feature_function=CMR2008ReorderingFeatures
#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures