diff options
Diffstat (limited to 'dtrain/hstreaming')
-rwxr-xr-x | dtrain/hstreaming/avg.rb | 2 | ||||
-rw-r--r-- | dtrain/hstreaming/cdec.ini | 3 | ||||
-rw-r--r-- | dtrain/hstreaming/dtrain.ini | 4 | ||||
-rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 3 | ||||
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 7 | ||||
-rwxr-xr-x | dtrain/hstreaming/lplp.rb | 2 | ||||
-rwxr-xr-x | dtrain/hstreaming/rule_count/map.sh | 4 | ||||
-rw-r--r-- | dtrain/hstreaming/rule_count/red.rb | 24 | ||||
-rw-r--r-- | dtrain/hstreaming/rule_count/rulecount.rb | 13 | ||||
-rw-r--r-- | dtrain/hstreaming/rule_count/test | 8 |
10 files changed, 11 insertions, 59 deletions
diff --git a/dtrain/hstreaming/avg.rb b/dtrain/hstreaming/avg.rb index 5deb62e4..2599c732 100755 --- a/dtrain/hstreaming/avg.rb +++ b/dtrain/hstreaming/avg.rb @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# first arg may be an int of custom shard count shard_count_key = "__SHARD_COUNT__" @@ -22,7 +23,6 @@ else end w.each_key { |k| if k == shard_count_key - #puts "# shard count: #{shard_count.to_i}" next else puts "#{k}\t#{w[k]/shard_count}" diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini index 61f13e86..d4f5cecd 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/dtrain/hstreaming/cdec.ini @@ -2,11 +2,12 @@ formalism=scfg add_pass_through_rules=true scfg_max_span_limit=15 intersection_strategy=cube_pruning -cubepruning_pop_limit=200 +cubepruning_pop_limit=30 feature_function=WordPenalty feature_function=KLanguageModel nc-wmt11.en.srilm.gz #feature_function=ArityPenalty #feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf #feature_function=InputIndicator #feature_function=LexNullJump #feature_function=NewJump diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini index 118a27c5..05535299 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini @@ -2,11 +2,11 @@ input=- output=- decoder_config=cdec.ini tmp=/var/hadoop/mapred/local/ -epochs=10 +epochs=1 k=100 N=4 learning_rate=0.0001 -gamma=0.00001 +gamma=0 scorer=stupid_bleu sample_from=kbest filter=uniq diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh index ea0276dd..877ff94c 100755 --- a/dtrain/hstreaming/dtrain.sh +++ b/dtrain/hstreaming/dtrain.sh @@ -1,8 +1,9 @@ #!/bin/bash +# script to run dtrain with a task id pushd . &>/dev/null cd .. ID=$(basename $(pwd)) # attempt_... popd &>/dev/null -./dtrain -c dtrain.ini --hstreaming $ID +./dtrain -c dtrain.ini --hstreaming $ID diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 90c2b790..92419956 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -6,17 +6,16 @@ EXP=a_simple_test HADOOP_HOME=/usr/lib/hadoop-0.20 JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" -# ^^^ IN=input_on_hdfs OUT=output_weights_on_hdfs -# you can remove the -reducer line if you want to +# you can -reducer to NONE if you want to # do feature selection/averaging locally (e.g. to -# keep weights of the iterations) +# keep weights of all epochs) $HSTREAMING \ -mapper "dtrain.sh" \ - -reducer "lplp.rb l2 select_k 100000" \ + -reducer "ruby lplp.rb l2 select_k 100000" \ -input $IN \ -output $OUT \ -file dtrain.sh \ diff --git a/dtrain/hstreaming/lplp.rb b/dtrain/hstreaming/lplp.rb index 57353adb..f0cd58c5 100755 --- a/dtrain/hstreaming/lplp.rb +++ b/dtrain/hstreaming/lplp.rb @@ -29,7 +29,7 @@ end # selection def select_k(weights, norm_fun, n, k=10000) weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| - puts "#{p[0]}\t#{mean(p[1], n)}" + puts "#{p[0]}\t#{mean(p[1], n)}" k -= 1 if k == 0 then break end } diff --git a/dtrain/hstreaming/rule_count/map.sh b/dtrain/hstreaming/rule_count/map.sh deleted file mode 100755 index ae75fece..00000000 --- a/dtrain/hstreaming/rule_count/map.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -ruby rulecount.rb | sort | ruby red.rb - diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb deleted file mode 100644 index 874ae7ac..00000000 --- a/dtrain/hstreaming/rule_count/red.rb +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env ruby - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -def output(key, val) - puts "#{key}\t#{val}" -end - -prev_key = nil -sum = 0 -while line = STDIN.gets - key, val = line.strip.split /\t/ - if key != prev_key && sum > 0 - output prev_key, sum - prev_key = key - sum = 0 - elsif !prev_key - prev_key = key - end - sum += val.to_i -end -output prev_key, sum - diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb deleted file mode 100644 index 67361fa4..00000000 --- a/dtrain/hstreaming/rule_count/rulecount.rb +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env ruby - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -while line = STDIN.gets - a = line.strip.chomp.split "\t" - a[3..a.size].each { |r| - id = r.split("|||")[0..2].join("|||").to_s.strip.gsub("\s", "_") - puts "#{id}\t1" - } -end - diff --git a/dtrain/hstreaming/rule_count/test b/dtrain/hstreaming/rule_count/test deleted file mode 100644 index acd00a5e..00000000 --- a/dtrain/hstreaming/rule_count/test +++ /dev/null @@ -1,8 +0,0 @@ -a 1 -a 1 -a 1 -b 1 -b 1 -c 1 -d 1 -a 1 |