diff options
Diffstat (limited to 'dtrain')
| -rw-r--r-- | dtrain/hstreaming/cdec.ini | 17 | ||||
| -rw-r--r-- | dtrain/hstreaming/dtrain.ini | 17 | ||||
| -rwxr-xr-x | dtrain/hstreaming/dtrain.sh | 4 | ||||
| -rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh | 16 | ||||
| -rwxr-xr-x | dtrain/hstreaming/red-avg.rb | 10 | ||||
| -rw-r--r-- | dtrain/hstreaming/red-test | 1 | ||||
| -rw-r--r-- | dtrain/test/example/cdec.ini | 27 | 
7 files changed, 56 insertions, 36 deletions
| diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini index 5afa89a9..ce1e1ae2 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/dtrain/hstreaming/cdec.ini @@ -4,5 +4,18 @@ scfg_max_span_limit=15  intersection_strategy=cube_pruning  cubepruning_pop_limit=200  feature_function=WordPenalty -feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz -feature_function=RuleIdentityFeatures +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini index 708bbe46..118a27c5 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini @@ -1,10 +1,15 @@ +input=- +output=-  decoder_config=cdec.ini +tmp=/var/hadoop/mapred/local/ +epochs=10  k=100  N=4 -epochs=10 -input=- -output=- +learning_rate=0.0001 +gamma=0.00001  scorer=stupid_bleu -sample_from=forest -pair_sampling=all -tmp=/var/hadoop/mapred/local +sample_from=kbest +filter=uniq +pair_sampling=108010 +pair_threshold=0 +select_weights=last diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh index b6847591..ea0276dd 100755 --- a/dtrain/hstreaming/dtrain.sh +++ b/dtrain/hstreaming/dtrain.sh @@ -1,8 +1,8 @@  #!/bin/bash -pushd . +pushd . &>/dev/null  cd ..  ID=$(basename $(pwd)) # attempt_... -popd +popd &>/dev/null  ./dtrain -c dtrain.ini --hstreaming $ID  diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 788c9fd1..4c0238f3 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -1,26 +1,26 @@  #!/bin/bash -ID=  EXP=test  HADOOP_HOME=/usr/lib/hadoop-0.20  JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar  HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" -IN=nc-v6.de-en/nc-v6.de-en-dtrain.1500m -OUT=nc-v6.de-en/nc-v6.de-en-dtrain.1500m-weights + IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2 +OUT=out/$EXP-weights  $HSTREAMING \ -    -mapper "dtrain -c dtrain.ini --hstreaming" \ +    -mapper "dtrain.sh" \      -reducer "red-avg.rb" \      -input $IN \      -output $OUT \ +    -file dtrain.sh \      -file red-avg.rb \ -    -file ../dtrain \ +    -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \      -file dtrain.ini \      -file cdec.ini \ -    -file nc-wmt11.en.srilm.3.gz \ +    -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \      -jobconf mapred.reduce.tasks=1 \ -    -jobconf mapred.max.map.failures.percent=100 \ -    -jobconf mapred.job.name="dtrain $ID $EXP" +    -jobconf mapred.max.map.failures.percent=0 \ +    -jobconf mapred.job.name="dtrain $EXP" diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb index 771f4c0e..c0b69eb4 100755 --- a/dtrain/hstreaming/red-avg.rb +++ b/dtrain/hstreaming/red-avg.rb @@ -1,6 +1,5 @@  #!/usr/bin/env ruby1.9.1 -  shard_count_key = "__SHARD_COUNT__"  STDIN.set_encoding 'utf-8' @@ -8,7 +7,7 @@ STDOUT.set_encoding 'utf-8'  w = {}  c = {} -w.default = 0 +w.default = 0.  c.default = 0  while line = STDIN.gets    key, val = line.split /\t/ @@ -19,7 +18,10 @@ end  puts "# dtrain reducer: average"  shard_count = w["__SHARD_COUNT__"]  w.each_key { |k| -  if k == shard_count_key then next end -  puts "#{k}\t#{w[k]/shard_count}" +  if k == shard_count_key +    puts "# shard count: #{shard_count.to_i}" +  else +    puts "#{k}\t#{w[k]/shard_count}\t# #{c[k]}" +  end  } diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test index a2a0edb1..2623d697 100644 --- a/dtrain/hstreaming/red-test +++ b/dtrain/hstreaming/red-test @@ -5,4 +5,5 @@ a	1  b	2  c	3.5  d	1 +e	2  __SHARD_COUNT__	2 diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini index 14c1199b..d88779fa 100644 --- a/dtrain/test/example/cdec.ini +++ b/dtrain/test/example/cdec.ini @@ -5,18 +5,17 @@ intersection_strategy=cube_pruning  cubepruning_pop_limit=30  feature_function=WordPenalty  feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz -feature_function=RuleIdentityFeatures -# these also work with scfg translator -#feature_function=SpanFeatures -#feature_function=SourceWordPenalty -#feature_function=SourceSpanSizeFeatures -#feature_function=RuleShape -#feature_function=RuleNgramFeatures -#feature_function=OutputIndicator -#feature_function=NonLatinCount -#feature_function=NgramFeatures -#feature_function=NewJump -#feature_function=LexNullJump -#feature_function=InputIndicator -#feature_function=CMR2008ReorderingFeatures  #feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures | 
