summary | refs | log | tree | commit | diff
path: root/dtrain/hstreaming
diff options
context:
space:
mode:
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-x dtrain/hstreaming/avg.rb 2
-rw-r--r-- dtrain/hstreaming/cdec.ini 3
-rw-r--r-- dtrain/hstreaming/dtrain.ini 4
-rwxr-xr-x dtrain/hstreaming/dtrain.sh 3
-rwxr-xr-x dtrain/hstreaming/hadoop-streaming-job.sh 7
-rwxr-xr-x dtrain/hstreaming/lplp.rb 2
-rwxr-xr-x dtrain/hstreaming/rule_count/map.sh 4
-rw-r--r-- dtrain/hstreaming/rule_count/red.rb 24
-rw-r--r-- dtrain/hstreaming/rule_count/rulecount.rb 13
-rw-r--r-- dtrain/hstreaming/rule_count/test 8
10 files changed, 11 insertions, 59 deletions
diff --git a/dtrain/hstreaming/avg.rb b/dtrain/hstreaming/avg.rb
index 5deb62e4..2599c732 100755
--- a/dtrain/hstreaming/avg.rb
+++ b/dtrain/hstreaming/avg.rb
@@ -1,4 +1,5 @@
#!/usr/bin/env ruby
+# first arg may be an int of custom shard count
shard_count_key = "__SHARD_COUNT__"
@@ -22,7 +23,6 @@ else
end
w.each_key { |k|
if k == shard_count_key
- #puts "# shard count: #{shard_count.to_i}"
next
else
puts "#{k}\t#{w[k]/shard_count}"
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
index 61f13e86..d4f5cecd 100644
--- a/dtrain/hstreaming/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
@@ -2,11 +2,12 @@ formalism=scfg
add_pass_through_rules=true
scfg_max_span_limit=15
intersection_strategy=cube_pruning
-cubepruning_pop_limit=200
+cubepruning_pop_limit=30
feature_function=WordPenalty
feature_function=KLanguageModel nc-wmt11.en.srilm.gz
#feature_function=ArityPenalty
#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
#feature_function=InputIndicator
#feature_function=LexNullJump
#feature_function=NewJump
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 118a27c5..05535299 100644
--- a/dtrain/hstreaming/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
@@ -2,11 +2,11 @@ input=-
output=-
decoder_config=cdec.ini
tmp=/var/hadoop/mapred/local/
-epochs=10
+epochs=1
k=100
N=4
learning_rate=0.0001
-gamma=0.00001
+gamma=0
scorer=stupid_bleu
sample_from=kbest
filter=uniq
diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh
index ea0276dd..877ff94c 100755
--- a/dtrain/hstreaming/dtrain.sh
+++ b/dtrain/hstreaming/dtrain.sh
@@ -1,8 +1,9 @@
#!/bin/bash
+# script to run dtrain with a task id
pushd . &>/dev/null
cd ..
ID=$(basename $(pwd)) # attempt_...
popd &>/dev/null
-./dtrain -c dtrain.ini --hstreaming $ID
+./dtrain -c dtrain.ini --hstreaming $ID
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 90c2b790..92419956 100755
--- a/dtrain/hstreaming/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -6,17 +6,16 @@ EXP=a_simple_test
HADOOP_HOME=/usr/lib/hadoop-0.20
JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-# ^^^
IN=input_on_hdfs
OUT=output_weights_on_hdfs
-# you can remove the -reducer line if you want to
+# you can set -reducer to NONE if you want to
# do feature selection/averaging locally (e.g. to
-# keep weights of the iterations)
+# keep weights of all epochs)
$HSTREAMING \
-mapper "dtrain.sh" \
- -reducer "lplp.rb l2 select_k 100000" \
+ -reducer "ruby lplp.rb l2 select_k 100000" \
-input $IN \
-output $OUT \
-file dtrain.sh \
diff --git a/dtrain/hstreaming/lplp.rb b/dtrain/hstreaming/lplp.rb
index 57353adb..f0cd58c5 100755
--- a/dtrain/hstreaming/lplp.rb
+++ b/dtrain/hstreaming/lplp.rb
@@ -29,7 +29,7 @@ end
# selection
def select_k(weights, norm_fun, n, k=10000)
weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
- puts "#{p[0]}\t#{mean(p[1], n)}"
+ puts "#{p[0]}\t#{mean(p[1], n)}"
k -= 1
if k == 0 then break end
}
diff --git a/dtrain/hstreaming/rule_count/map.sh b/dtrain/hstreaming/rule_count/map.sh
deleted file mode 100755
index ae75fece..00000000
--- a/dtrain/hstreaming/rule_count/map.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-ruby rulecount.rb | sort | ruby red.rb
-
diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb
deleted file mode 100644
index 874ae7ac..00000000
--- a/dtrain/hstreaming/rule_count/red.rb
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env ruby
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-def output(key, val)
- puts "#{key}\t#{val}"
-end
-
-prev_key = nil
-sum = 0
-while line = STDIN.gets
- key, val = line.strip.split /\t/
- if key != prev_key && sum > 0
- output prev_key, sum
- prev_key = key
- sum = 0
- elsif !prev_key
- prev_key = key
- end
- sum += val.to_i
-end
-output prev_key, sum
-
diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb
deleted file mode 100644
index 67361fa4..00000000
--- a/dtrain/hstreaming/rule_count/rulecount.rb
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env ruby
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-while line = STDIN.gets
- a = line.strip.chomp.split "\t"
- a[3..a.size].each { |r|
- id = r.split("|||")[0..2].join("|||").to_s.strip.gsub("\s", "_")
- puts "#{id}\t1"
- }
-end
-
diff --git a/dtrain/hstreaming/rule_count/test b/dtrain/hstreaming/rule_count/test
deleted file mode 100644
index acd00a5e..00000000
--- a/dtrain/hstreaming/rule_count/test
+++ /dev/null
@@ -1,8 +0,0 @@
-a 1
-a 1
-a 1
-b 1
-b 1
-c 1
-d 1
-a 1