diff options
Diffstat (limited to 'dtrain/hstreaming')
-rw-r--r-- | dtrain/hstreaming/cdec.ini | 21 | ||||
-rw-r--r-- | dtrain/hstreaming/dtrain.ini | 15 | ||||
-rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 8 | ||||
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 26 | ||||
-rwxr-xr-x | dtrain/hstreaming/lplp.rb | 101 | ||||
-rwxr-xr-x | dtrain/hstreaming/red-all.rb | 26 | ||||
-rwxr-xr-x | dtrain/hstreaming/red-avg.rb | 27 | ||||
-rw-r--r-- | dtrain/hstreaming/red-test | 9 |
8 files changed, 233 insertions, 0 deletions
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini new file mode 100644 index 00000000..ce1e1ae2 --- /dev/null +++ b/dtrain/hstreaming/cdec.ini @@ -0,0 +1,21 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +feature_function=WordPenalty +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini new file mode 100644 index 00000000..118a27c5 --- /dev/null +++ b/dtrain/hstreaming/dtrain.ini @@ -0,0 +1,15 @@ +input=- +output=- +decoder_config=cdec.ini +tmp=/var/hadoop/mapred/local/ +epochs=10 +k=100 +N=4 +learning_rate=0.0001 +gamma=0.00001 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=108010 +pair_threshold=0 +select_weights=last diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh new file mode 100755 index 00000000..ea0276dd --- /dev/null +++ b/dtrain/hstreaming/dtrain.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +pushd . &>/dev/null +cd .. +ID=$(basename $(pwd)) # attempt_... +popd &>/dev/null +./dtrain -c dtrain.ini --hstreaming $ID + diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh new file mode 100755 index 00000000..4c0238f3 --- /dev/null +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +EXP=test + +HADOOP_HOME=/usr/lib/hadoop-0.20 +JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar +HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" + + IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2 +OUT=out/$EXP-weights + +$HSTREAMING \ + -mapper "dtrain.sh" \ + -reducer "red-avg.rb" \ + -input $IN \ + -output $OUT \ + -file dtrain.sh \ + -file red-avg.rb \ + -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \ + -file dtrain.ini \ + -file cdec.ini \ + -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \ + -jobconf mapred.reduce.tasks=1 \ + -jobconf mapred.max.map.failures.percent=0 \ + -jobconf mapred.job.name="dtrain $EXP" + diff --git a/dtrain/hstreaming/lplp.rb b/dtrain/hstreaming/lplp.rb new file mode 100755 index 00000000..edb93e77 --- /dev/null +++ b/dtrain/hstreaming/lplp.rb @@ -0,0 +1,101 @@ +# lplp.rb + +# norms +def l0(feature_column, n) + if feature_column.size == n then return 1 else return 0 end +end + +def l1(feature_column, n=-1) + return feature_column.reduce { |sum, i| i.abs } +end + +def l2(feature_column, n=-1) + return Math.sqrt feature_column.reduce { |sum, i| i**2 } +end + +def linfty(feature_column, n=-1) + return feature_column.map { |i| i.abs }.max +end + +# stats +def M(feature_column, n) + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] +end + +def mean(feature_column, n) + return feature_column.reduce { |sum, i| sum+i } / n +end + +# selection +def select_k(weights, normfn, n, k=10000) + weights.sort{|a,b| normfn.call(b[1], n) <=> normfn.call(a[1], n)}.each { |p| + puts "#{p[0]}\t#{mean(p[1], n)}" + k -= 1 + if k == 0 then break end + } +end + +def cut(weights, normfn, n, epsilon=0.0001) + weights.each { |k,v| + if normfn.call(v).abs > epsilon + puts "#{k}\t#{mean(v, n)}" + end + } +end + + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +shard_count = 0 +while line = STDIN.gets + key, val = line.split /\t/ + if k = shard_count_key + shard_count += 1 + next + end + if w.has_key? key + w[key].push val + else + w[key] = [val] + end +end + +select_k(w, method(:l1), shard_count, 100000) + +def _test() +puts +w = {} +w["a"] = [1, 2, 3] +w["b"] = [1, 2] +w["c"] = [66] +w["d"] = [10, 20, 30] +n = 3 +puts w.to_s +puts +puts "select_k" +puts "l0 expect ad" +select_k(w, method(:l0), n, 2) +puts "l1 expect c" +select_k(w, method(:l1), n, 1) +puts "l2 expect d" +select_k(w, method(:l2), n, 1) +puts +puts "cut" +puts "l1 expect cd" +cut(w, method(:l1), n, 7) +puts +puts "M" +a = [1,3,4,5,6] +puts a.to_s +puts M(a, 7) +puts "that's because we add missing 0s" +puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s +puts +end + +#_test() + diff --git a/dtrain/hstreaming/red-all.rb b/dtrain/hstreaming/red-all.rb new file mode 100755 index 00000000..bbc65945 --- /dev/null +++ b/dtrain/hstreaming/red-all.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby1.9.1 + + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1 +end + +puts "# dtrain reducer: active on all" +shard_count = w["__SHARD_COUNT__"] +puts "shard count #{shard_count}" +w.each_key { |k| + if k == shard_count_key then next end + if c[k] == shard_count then puts "#{k}\t#{w[k]/shard_count}" end +} + diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb new file mode 100755 index 00000000..9326ffbe --- /dev/null +++ b/dtrain/hstreaming/red-avg.rb @@ -0,0 +1,27 @@ +#!/usr/bin/env ruby1.9.1 + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1 +end + +puts "# dtrain reducer: average" +shard_count = w["__SHARD_COUNT__"] +w.each_key { |k| + if k == shard_count_key + puts "# shard count: #{shard_count.to_i}" + else + puts "#{k}\t#{w[k]/shard_count}\t# #{c[k]}" + end +} + diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test new file mode 100644 index 00000000..2623d697 --- /dev/null +++ b/dtrain/hstreaming/red-test @@ -0,0 +1,9 @@ +a 1 +b 2 +c 3.5 +a 1 +b 2 +c 3.5 +d 1 +e 2 +__SHARD_COUNT__ 2 |