From b6a8fdc981daa3b01826a3be4c2355ee5cd61f0a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 28 Nov 2011 11:34:41 +0100 Subject: hstreaming stuff --- dtrain/hstreaming/cdec.ini | 17 +++++++++++++++-- dtrain/hstreaming/dtrain.ini | 17 +++++++++++------ dtrain/hstreaming/dtrain.sh | 4 ++-- dtrain/hstreaming/hadoop-streaming-job.sh | 16 ++++++++-------- dtrain/hstreaming/red-avg.rb | 10 ++++++---- dtrain/hstreaming/red-test | 1 + 6 files changed, 43 insertions(+), 22 deletions(-) (limited to 'dtrain/hstreaming') diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini index 5afa89a9..ce1e1ae2 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/dtrain/hstreaming/cdec.ini @@ -4,5 +4,18 @@ scfg_max_span_limit=15 intersection_strategy=cube_pruning cubepruning_pop_limit=200 feature_function=WordPenalty -feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz -feature_function=RuleIdentityFeatures +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini index 708bbe46..118a27c5 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini @@ -1,10 +1,15 @@ +input=- +output=- decoder_config=cdec.ini +tmp=/var/hadoop/mapred/local/ +epochs=10 k=100 N=4 -epochs=10 -input=- -output=- +learning_rate=0.0001 +gamma=0.00001 scorer=stupid_bleu -sample_from=forest -pair_sampling=all -tmp=/var/hadoop/mapred/local +sample_from=kbest +filter=uniq +pair_sampling=108010 +pair_threshold=0 +select_weights=last diff --git a/dtrain/hstreaming/dtrain.sh b/dtrain/hstreaming/dtrain.sh index b6847591..ea0276dd 100755 --- a/dtrain/hstreaming/dtrain.sh +++ b/dtrain/hstreaming/dtrain.sh @@ -1,8 +1,8 @@ #!/bin/bash -pushd . +pushd . &>/dev/null cd .. ID=$(basename $(pwd)) # attempt_... -popd +popd &>/dev/null ./dtrain -c dtrain.ini --hstreaming $ID diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 788c9fd1..4c0238f3 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -1,26 +1,26 @@ #!/bin/bash -ID= EXP=test HADOOP_HOME=/usr/lib/hadoop-0.20 JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" -IN=nc-v6.de-en/nc-v6.de-en-dtrain.1500m -OUT=nc-v6.de-en/nc-v6.de-en-dtrain.1500m-weights + IN=nc-v6.de-en.cs.giza.loo/nc-v6.de-en.cs.giza.loo-dtrain1.sz2 +OUT=out/$EXP-weights $HSTREAMING \ - -mapper "dtrain -c dtrain.ini --hstreaming" \ + -mapper "dtrain.sh" \ -reducer "red-avg.rb" \ -input $IN \ -output $OUT \ + -file dtrain.sh \ -file red-avg.rb \ - -file ../dtrain \ + -file ~/exp/cdec-dtrain-ro/dtrain/dtrain \ -file dtrain.ini \ -file cdec.ini \ - -file nc-wmt11.en.srilm.3.gz \ + -file ~/exp/data/nc-v6.en.3.unk.probing.kenv5 \ -jobconf mapred.reduce.tasks=1 \ - -jobconf mapred.max.map.failures.percent=100 \ - -jobconf mapred.job.name="dtrain $ID $EXP" + -jobconf mapred.max.map.failures.percent=0 \ + -jobconf mapred.job.name="dtrain $EXP" diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb index 771f4c0e..c0b69eb4 100755 --- a/dtrain/hstreaming/red-avg.rb +++ b/dtrain/hstreaming/red-avg.rb @@ -1,6 +1,5 @@ #!/usr/bin/env ruby1.9.1 - shard_count_key = "__SHARD_COUNT__" STDIN.set_encoding 'utf-8' @@ -8,7 +7,7 @@ STDOUT.set_encoding 'utf-8' w = {} c = {} -w.default = 0 +w.default = 0. c.default = 0 while line = STDIN.gets key, val = line.split /\t/ @@ -19,7 +18,10 @@ end puts "# dtrain reducer: average" shard_count = w["__SHARD_COUNT__"] w.each_key { |k| - if k == shard_count_key then next end - puts "#{k}\t#{w[k]/shard_count}" + if k == shard_count_key + puts "# shard count: #{shard_count.to_i}" + else + puts "#{k}\t#{w[k]/shard_count}\t# #{c[k]}" + end } diff --git a/dtrain/hstreaming/red-test b/dtrain/hstreaming/red-test index a2a0edb1..2623d697 100644 --- a/dtrain/hstreaming/red-test +++ b/dtrain/hstreaming/red-test @@ -5,4 +5,5 @@ a 1 b 2 c 3.5 d 1 +e 2 __SHARD_COUNT__ 2 -- cgit v1.2.3