From 40c7d43f8cd5ccaef22f098937b985d4101368f9 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 27 Sep 2011 00:10:48 +0200 Subject: some updates for hadoop streaming --- dtrain/dtrain.cc | 2 +- dtrain/dtrain.h | 2 +- dtrain/hstreaming/avgweights.rb | 27 --------------------------- dtrain/hstreaming/cdec.ini | 4 ++-- dtrain/hstreaming/dtrain.ini | 8 +++++--- dtrain/hstreaming/hadoop-streaming-job.sh | 11 +++++------ dtrain/hstreaming/nc-wmt11.en.srilm.3.gz | Bin 0 -> 12173238 bytes dtrain/hstreaming/red-avg.rb | 26 ++++++++++++++++++++++++++ 8 files changed, 40 insertions(+), 40 deletions(-) delete mode 100755 dtrain/hstreaming/avgweights.rb create mode 100644 dtrain/hstreaming/nc-wmt11.en.srilm.3.gz create mode 100755 dtrain/hstreaming/red-avg.rb diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 622cd01e..9969a070 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -337,7 +337,7 @@ main(int argc, char** argv) ++ii; - if (hstreaming) cerr << "reporter:counter:dtrain,sid," << in_split[0] << endl; + if (hstreaming) cerr << "reporter:counter:dtrain,sid," << ii << endl; } // input loop diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 0c27167d..71dfbc7f 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -11,7 +11,7 @@ #include "pairsampling.h" #define DTRAIN_DOTS 100 // when to display a '.' 
-#define DTRAIN_TMP_DIR "/tmp" +#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" #define DTRAIN_GRAMMAR_DELIM "########EOS########" using namespace std; diff --git a/dtrain/hstreaming/avgweights.rb b/dtrain/hstreaming/avgweights.rb deleted file mode 100755 index d5cfaa4d..00000000 --- a/dtrain/hstreaming/avgweights.rb +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -c = {} -w.default = 0 -c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini index cc0586d6..bea54afe 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/dtrain/hstreaming/cdec.ini @@ -1,7 +1,7 @@ formalism=scfg add_pass_through_rules=true -feature_function=WordPenalty cubepruning_pop_limit=30 +scfg_max_span_limit=15 +feature_function=WordPenalty feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz feature_function=RuleIdentityFeatures -scfg_max_span_limit=15 diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini index c7ce7f44..14195bf6 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini @@ -1,7 +1,9 @@ decoder_config=cdec.ini -kbest=100 -ngrams=4 +k=100 +N=4 epochs=10 input=- -scorer=stupid_bleu output=- +scorer=stupid_bleu +sample_from=forest +pair_sampling=rand diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 2cf3f50a..9da0a6c3 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -5,16 +5,15 @@ JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar HSTREAMING="$HADOOP_HOME/bin/hadoop jar 
$HADOOP_HOME/$JAR" IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m -OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m +OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m-NEW $HSTREAMING \ - -mapper "dtrain.sh" \ - -reducer "avgweights.rb" \ + -mapper "dtrain -c dtrain.ini --hstreaming" \ + -reducer "red-avg.rb" \ -input $IN \ -output $OUT \ - -file avgweights.rb \ - -file dtrain.sh \ - -file dtrain \ + -file red-avg.rb \ + -file ../dtrain \ -file dtrain.ini \ -file cdec.ini \ -file nc-wmt11.en.srilm.3.gz \ diff --git a/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz new file mode 100644 index 00000000..5a50f8fb Binary files /dev/null and b/dtrain/hstreaming/nc-wmt11.en.srilm.3.gz differ diff --git a/dtrain/hstreaming/red-avg.rb b/dtrain/hstreaming/red-avg.rb new file mode 100755 index 00000000..11dc0d71 --- /dev/null +++ b/dtrain/hstreaming/red-avg.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby1.9.1 + + +STDIN.set_encoding 'utf-8' + +shard_count_key = "__SHARD_COUNT__" + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1.0 +end + +shard_count = w["__SHARD_COUNT__"] + +w.each_key { |k| + if k == shard_count_key then next end + puts "#{k}\t#{w[k]/shard_count}" +} + +puts "#{shard_count_key}\t#{w[shard_count_key]}" + -- cgit v1.2.3