diff options
Diffstat (limited to 'dtrain/hstreaming')
| -rwxr-xr-x | dtrain/hstreaming/avg.rb | 2 | ||||
| -rw-r--r-- | dtrain/hstreaming/cdec.ini | 3 | ||||
| -rw-r--r-- | dtrain/hstreaming/dtrain.ini | 4 | ||||
| -rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 3 | ||||
| -rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 7 | ||||
| -rwxr-xr-x | dtrain/hstreaming/lplp.rb | 2 | ||||
| -rwxr-xr-x | dtrain/hstreaming/rule_count/map.sh | 4 | ||||
| -rw-r--r-- | dtrain/hstreaming/rule_count/red.rb | 24 | ||||
| -rw-r--r-- | dtrain/hstreaming/rule_count/rulecount.rb | 13 | ||||
| -rw-r--r-- | dtrain/hstreaming/rule_count/test | 8 | 
10 files changed, 11 insertions, 59 deletions
| diff --git a/dtrain/hstreaming/avg.rb b/dtrain/hstreaming/avg.rb index 5deb62e4..2599c732 100755 --- a/dtrain/hstreaming/avg.rb +++ b/dtrain/hstreaming/avg.rb @@ -1,4 +1,5 @@  #!/usr/bin/env ruby +# first arg may be an int of custom shard count  shard_count_key = "__SHARD_COUNT__" @@ -22,7 +23,6 @@ else  end  w.each_key { |k|    if k == shard_count_key -    #puts "# shard count: #{shard_count.to_i}"      next    else      puts "#{k}\t#{w[k]/shard_count}" diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini index 61f13e86..d4f5cecd 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/dtrain/hstreaming/cdec.ini @@ -2,11 +2,12 @@ formalism=scfg  add_pass_through_rules=true  scfg_max_span_limit=15  intersection_strategy=cube_pruning -cubepruning_pop_limit=200 +cubepruning_pop_limit=30  feature_function=WordPenalty  feature_function=KLanguageModel nc-wmt11.en.srilm.gz  #feature_function=ArityPenalty  #feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf  #feature_function=InputIndicator  #feature_function=LexNullJump  #feature_function=NewJump diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini index 118a27c5..05535299 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini @@ -2,11 +2,11 @@ input=-  output=-  decoder_config=cdec.ini  tmp=/var/hadoop/mapred/local/ -epochs=10 +epochs=1  k=100  N=4  learning_rate=0.0001 -gamma=0.00001 +gamma=0  scorer=stupid_bleu  sample_from=kbest  filter=uniq diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh index ea0276dd..877ff94c 100755 --- a/dtrain/hstreaming/dtrain.sh +++ b/dtrain/hstreaming/dtrain.sh @@ -1,8 +1,9 @@  #!/bin/bash +# script to run dtrain with a task id  pushd . &>/dev/null  cd ..  ID=$(basename $(pwd)) # attempt_...  popd &>/dev/null -./dtrain -c dtrain.ini --hstreaming $ID  +./dtrain -c dtrain.ini --hstreaming $ID diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 90c2b790..92419956 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -6,17 +6,16 @@ EXP=a_simple_test  HADOOP_HOME=/usr/lib/hadoop-0.20  JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar  HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" -# ^^^   IN=input_on_hdfs  OUT=output_weights_on_hdfs -# you can remove the -reducer line if you want to +# you can -reducer to NONE if you want to  # do feature selection/averaging locally (e.g. to -# keep weights of the iterations) +# keep weights of all epochs)  $HSTREAMING \      -mapper "dtrain.sh" \ -    -reducer "lplp.rb l2 select_k 100000" \ +    -reducer "ruby lplp.rb l2 select_k 100000" \      -input $IN \      -output $OUT \      -file dtrain.sh \ diff --git a/dtrain/hstreaming/lplp.rb b/dtrain/hstreaming/lplp.rb index 57353adb..f0cd58c5 100755 --- a/dtrain/hstreaming/lplp.rb +++ b/dtrain/hstreaming/lplp.rb @@ -29,7 +29,7 @@ end  # selection  def select_k(weights, norm_fun, n, k=10000)    weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| -    puts "#{p[0]}\t#{mean(p[1], n)}"  +    puts "#{p[0]}\t#{mean(p[1], n)}"      k -= 1      if k == 0 then break end    } diff --git a/dtrain/hstreaming/rule_count/map.sh b/dtrain/hstreaming/rule_count/map.sh deleted file mode 100755 index ae75fece..00000000 --- a/dtrain/hstreaming/rule_count/map.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -ruby rulecount.rb | sort | ruby red.rb - diff --git a/dtrain/hstreaming/rule_count/red.rb b/dtrain/hstreaming/rule_count/red.rb deleted file mode 100644 index 874ae7ac..00000000 --- a/dtrain/hstreaming/rule_count/red.rb +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env ruby - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -def output(key, val) -  puts "#{key}\t#{val}" -end - -prev_key = nil -sum = 0 -while line = STDIN.gets -   key, val = line.strip.split /\t/ -   if key != prev_key && sum > 0 -      output prev_key, sum -      prev_key = key -      sum = 0 -   elsif !prev_key -      prev_key = key -   end -   sum += val.to_i -end -output prev_key, sum - diff --git a/dtrain/hstreaming/rule_count/rulecount.rb b/dtrain/hstreaming/rule_count/rulecount.rb deleted file mode 100644 index 67361fa4..00000000 --- a/dtrain/hstreaming/rule_count/rulecount.rb +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env ruby - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -while line = STDIN.gets -  a = line.strip.chomp.split "\t" -  a[3..a.size].each { |r| -    id = r.split("|||")[0..2].join("|||").to_s.strip.gsub("\s", "_") -    puts "#{id}\t1" -  } -end - diff --git a/dtrain/hstreaming/rule_count/test b/dtrain/hstreaming/rule_count/test deleted file mode 100644 index acd00a5e..00000000 --- a/dtrain/hstreaming/rule_count/test +++ /dev/null @@ -1,8 +0,0 @@ -a	1 -a	1 -a	1 -b	1 -b	1 -c	1 -d	1 -a	1 | 
