From 1ad0eb820ee946e5a142567380fc0488c9a5d6de Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 11 Sep 2011 13:17:33 +0200 Subject: latest version from mtm6 --- dtrain/avgweights.rb | 27 +++++++++++++++++++++++++++ dtrain/common.h | 2 +- dtrain/dtrain.cc | 27 +++++++++++++++++++++++---- dtrain/job/avgweights.rb | 30 ++++++++++++++++++++++++++++++ dtrain/job/cdec.ini | 8 ++++++++ dtrain/job/dtrain.ini | 10 ++++++++++ dtrain/job/dtrain.sh | 6 ++++++ dtrain/job/hadoop-streaming-job.sh | 23 +++++++++++++++++++++++ dtrain/job2/avgweights.rb | 30 ++++++++++++++++++++++++++++++ dtrain/job2/cdec.ini | 8 ++++++++ dtrain/job2/dtrain.ini | 10 ++++++++++ dtrain/job2/dtrain.sh | 6 ++++++ dtrain/job2/hadoop-streaming-job.sh | 23 +++++++++++++++++++++++ dtrain/run.sh | 6 +++--- dtrain/test-reducer | 7 +++++++ dtrain/test/toy.dtrain.ini | 4 ++-- utils/sampler.h | 2 +- 17 files changed, 218 insertions(+), 11 deletions(-) create mode 100755 dtrain/avgweights.rb create mode 100755 dtrain/job/avgweights.rb create mode 100644 dtrain/job/cdec.ini create mode 100644 dtrain/job/dtrain.ini create mode 100755 dtrain/job/dtrain.sh create mode 100755 dtrain/job/hadoop-streaming-job.sh create mode 100755 dtrain/job2/avgweights.rb create mode 100644 dtrain/job2/cdec.ini create mode 100644 dtrain/job2/dtrain.ini create mode 100755 dtrain/job2/dtrain.sh create mode 100755 dtrain/job2/hadoop-streaming-job.sh create mode 100644 dtrain/test-reducer diff --git a/dtrain/avgweights.rb b/dtrain/avgweights.rb new file mode 100755 index 00000000..d5cfaa4d --- /dev/null +++ b/dtrain/avgweights.rb @@ -0,0 +1,27 @@ +#!/usr/bin/env ruby1.9.1 + + +STDIN.set_encoding 'utf-8' + +#shard_count_key = "__SHARD_COUNT__" + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1.0 +end + +#shard_count = w["__SHARD_COUNT__"] + +w.each_key { |k| + #if k == shard_count_key then next end + #if k == "__bias" then next end + puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}" +} + +#puts "#{shard_count_key}\t#{w[shard_count_key]}" + diff --git a/dtrain/common.h b/dtrain/common.h index 4ff975e1..49dc85b7 100644 --- a/dtrain/common.h +++ b/dtrain/common.h @@ -30,7 +30,7 @@ #define DTRAIN_DEFAULT_T 1 // iterations #define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer #define DTRAIN_DOTS 100 // when to display a '.' -#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD? +#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD? #define DTRAIN_GRAMMAR_DELIM "########EOS########" diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 35996d6d..f005008e 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -13,6 +13,8 @@ //#include using namespace boost::iostreams; +#include +#include #ifdef DTRAIN_DEBUG #include "tests.h" @@ -311,7 +313,7 @@ main( int argc, char** argv ) } if ( broken_grammar ) continue; grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __ - grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl; + grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; decoder.SetSentenceGrammarFromString( grammar_str ); // decode, kbest src_str_buf.push_back( in_split[1] ); @@ -323,7 +325,8 @@ main( int argc, char** argv ) while ( true ) { string g; getline( grammar_buf_in, g ); - if ( g == DTRAIN_GRAMMAR_DELIM ) break; + //if ( g == DTRAIN_GRAMMAR_DELIM ) break; + if (boost::starts_with(g, DTRAIN_GRAMMAR_DELIM)) break; grammar_str += g+"\n"; i += 1; } @@ -430,6 +433,7 @@ main( int argc, char** argv ) } ++sid; + cerr << "reporter:counter:dtrain,sent," << sid << endl; } // input loop @@ -446,6 +450,7 @@ main( int argc, char** argv ) avg_1best_score_diff = avg_1best_score; avg_1best_model_diff = avg_1best_model; } + if ( !quiet ) { cout << _prec5 << _pos << "WEIGHTS" << endl; for (vector::iterator it = wprint.begin(); it != wprint.end(); it++) { cout << setw(16) << *it << " = " << dense_weights[FD::Convert( *it )] << endl; @@ -456,6 +461,7 @@ main( int argc, char** argv ) cout << _pos << " (" << avg_1best_score_diff << ")" << endl; cout << _nopos << "avg model score: " << avg_1best_model; cout << _pos << " (" << avg_1best_model_diff << ")" << endl; + } vector remember_scores; remember_scores.push_back( avg_1best_score ); remember_scores.push_back( avg_1best_model ); @@ -478,7 +484,7 @@ main( int argc, char** argv ) cout << time_dif/(double)in_sz<< " s/S)" << endl; } - if ( t+1 != T ) cout << endl; + if ( t+1 != T && !quiet ) cout << endl; if ( noup ) break; @@ -486,8 +492,21 @@ main( int argc, char** argv ) unlink( grammar_buf_tmp_fn ); if ( !noup ) { + // TODO BEST ITER if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as() << "' ..."; - weights.WriteToFile( cfg["output"].as(), true ); + if ( cfg["output"].as() == "-" ) { + for ( SparseVector::const_iterator ti = lambdas.begin(); + ti != lambdas.end(); ++ti ) { + if ( ti->second == 0 ) continue; + //if ( ti->first == "__bias" ) continue; + cout << setprecision(9); + cout << _nopos << FD::Convert(ti->first) << "\t" << ti->second << endl; + //cout << "__SHARD_COUNT__\t1" << endl; + } + } else { + weights.InitFromVector( lambdas ); + weights.WriteToFile( cfg["output"].as(), true ); + } if ( !quiet ) cout << "done" << endl; } diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb new file mode 100755 index 00000000..e635aab4 --- /dev/null +++ b/dtrain/job/avgweights.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby1.9.1 + + +STDIN.set_encoding 'utf-8' + +#shard_count_key = "__SHARD_COUNT__" + +w = {} +#c = {} +w.default = 0 +#c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + #c[key] += 1.0 +end + +#shard_count = w["__SHARD_COUNT__"] + +num_map = 104.0 + +w.each_key { |k| + #if k == shard_count_key then next end + #if k == "__bias" then next end + puts "#{k}\t#{w[k]/num_map}" + #/c[k]}" #{w[k]/shard_count}" +} + +#puts "#{shard_count_key}\t#{w[shard_count_key]}" + diff --git a/dtrain/job/cdec.ini b/dtrain/job/cdec.ini new file mode 100644 index 00000000..0d32f0b7 --- /dev/null +++ b/dtrain/job/cdec.ini @@ -0,0 +1,8 @@ +formalism=scfg +add_pass_through_rules=true +feature_function=WordPenalty +cubepruning_pop_limit=30 +feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz +feature_function=RuleIdentityFeatures +scfg_max_span_limit=15 + diff --git a/dtrain/job/dtrain.ini b/dtrain/job/dtrain.ini new file mode 100644 index 00000000..079d7d69 --- /dev/null +++ b/dtrain/job/dtrain.ini @@ -0,0 +1,10 @@ +decoder_config=cdec.ini +kbest=100 +ngrams=4 +epochs=10 +input=- +scorer=stupid_bleu +output=- +#stop_after=100 +#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough + diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh new file mode 100755 index 00000000..75ec29ea --- /dev/null +++ b/dtrain/job/dtrain.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +./dtrain -q -c dtrain.ini + +exit 0 + diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/job/hadoop-streaming-job.sh new file mode 100755 index 00000000..2cf3f50a --- /dev/null +++ b/dtrain/job/hadoop-streaming-job.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +HADOOP_HOME=/usr/lib/hadoop-0.20 +JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar +HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" + +IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m +OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m + +$HSTREAMING \ + -mapper "dtrain.sh" \ + -reducer "avgweights.rb" \ + -input $IN \ + -output $OUT \ + -file avgweights.rb \ + -file dtrain.sh \ + -file dtrain \ + -file dtrain.ini \ + -file cdec.ini \ + -file nc-wmt11.en.srilm.3.gz \ + -jobconf mapred.reduce.tasks=1 \ + -jobconf mapred.max.map.failures.percent=100 + diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb new file mode 100755 index 00000000..31048f16 --- /dev/null +++ b/dtrain/job2/avgweights.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby1.9.1 + + +STDIN.set_encoding 'utf-8' + +#shard_count_key = "__SHARD_COUNT__" + +w = {} +#c = {} +w.default = 0 +#c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + #c[key] += 1.0 +end + +#shard_count = w["__SHARD_COUNT__"] + +num_map = 2107.0 + +w.each_key { |k| + #if k == shard_count_key then next end + #if k == "__bias" then next end + puts "#{k}\t#{w[k]/num_map}" + #/c[k]}" #{w[k]/shard_count}" +} + +#puts "#{shard_count_key}\t#{w[shard_count_key]}" + diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini new file mode 100644 index 00000000..0d32f0b7 --- /dev/null +++ b/dtrain/job2/cdec.ini @@ -0,0 +1,8 @@ +formalism=scfg +add_pass_through_rules=true +feature_function=WordPenalty +cubepruning_pop_limit=30 +feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz +feature_function=RuleIdentityFeatures +scfg_max_span_limit=15 + diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini new file mode 100644 index 00000000..ec005e46 --- /dev/null +++ b/dtrain/job2/dtrain.ini @@ -0,0 +1,10 @@ +decoder_config=cdec.ini +kbest=100 +ngrams=3 +epochs=10 +input=- +scorer=stupid_bleu +output=- +#stop_after=100 +#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough + diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh new file mode 100755 index 00000000..75ec29ea --- /dev/null +++ b/dtrain/job2/dtrain.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +./dtrain -q -c dtrain.ini + +exit 0 + diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh new file mode 100755 index 00000000..9ee70a33 --- /dev/null +++ b/dtrain/job2/hadoop-streaming-job.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +HADOOP_HOME=/usr/lib/hadoop-0.20 +JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar +HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" + +IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain +OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50 + +$HSTREAMING \ + -mapper "dtrain.sh" \ + -reducer "avgweights.rb" \ + -input $IN \ + -output $OUT \ + -file avgweights.rb \ + -file dtrain.sh \ + -file dtrain \ + -file dtrain.ini \ + -file cdec.ini \ + -file nc-wmt11.en.srilm.3.gz \ + -jobconf mapred.reduce.tasks=1 \ + -jobconf mapred.max.map.failures.percent=100 + diff --git a/dtrain/run.sh b/dtrain/run.sh index 97123dfa..72e56f3e 100755 --- a/dtrain/run.sh +++ b/dtrain/run.sh @@ -2,11 +2,11 @@ #INI=test/blunsom08.dtrain.ini #INI=test/nc-wmt11/dtrain.ini -INI=test/EXAMPLE/dtrain.ini +#INI=test/EXAMPLE/dtrain.ini #INI=test/EXAMPLE/dtrain.ruleids.ini -#INI=test/toy.dtrain.ini +INI=test/toy.dtrain.ini #INI=test/EXAMPLE/dtrain.cdecrid.ini -rm /tmp/dtrain-* +#rm /tmp/dtrain-* ./dtrain -c $INI $1 $2 $3 $4 diff --git a/dtrain/test-reducer b/dtrain/test-reducer new file mode 100644 index 00000000..b86e7894 --- /dev/null +++ b/dtrain/test-reducer @@ -0,0 +1,7 @@ +a 1 +b 2 +c 3.5 +a 1 +b 2 +c 3.5 +__SHARD_COUNT__ 2 diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini index e9ed0ce5..7272e655 100644 --- a/dtrain/test/toy.dtrain.ini +++ b/dtrain/test/toy.dtrain.ini @@ -2,9 +2,9 @@ decoder_config=test/cdec.ini kbest=4 ngrams=1 epochs=3 -input=test/toy.in +input=- #test/toy.in scorer=bleu -output=data/w/toy.gz +output=- #data/w/toy.gz #stop_after=1000 wprint=logp use_shell use_house PassThrough diff --git a/utils/sampler.h b/utils/sampler.h index a14f6e2f..8567e922 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -32,7 +32,7 @@ struct RandomNumberGenerator { std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl; seed = std::time(NULL); } - std::cerr << "Seeding random number sequence to " << seed << std::endl; + //std::cerr << "Seeding random number sequence to " << seed << std::endl; return seed; } -- cgit v1.2.3