author    Patrick Simianer <simianer@cl.uni-heidelberg.de>    2011-09-11 13:17:33 +0200
committer Patrick Simianer <p@simianer.de>                    2011-09-23 19:13:59 +0200
commit    1ad0eb820ee946e5a142567380fc0488c9a5d6de (patch)
tree      4378d4d617b883fe3431915b0c6e138e1d4324d7 /dtrain
parent    fe6acf199a5749f0a604a95e8d7af59bccc7505e (diff)
latest version from mtm6
Diffstat (limited to 'dtrain')
-rwxr-xr-x  dtrain/avgweights.rb                 27
-rw-r--r--  dtrain/common.h                       2
-rw-r--r--  dtrain/dtrain.cc                     27
-rwxr-xr-x  dtrain/job/avgweights.rb             30
-rw-r--r--  dtrain/job/cdec.ini                   8
-rw-r--r--  dtrain/job/dtrain.ini                10
-rwxr-xr-x  dtrain/job/dtrain.sh                  6
-rwxr-xr-x  dtrain/job/hadoop-streaming-job.sh   23
-rwxr-xr-x  dtrain/job2/avgweights.rb            30
-rw-r--r--  dtrain/job2/cdec.ini                  8
-rw-r--r--  dtrain/job2/dtrain.ini               10
-rwxr-xr-x  dtrain/job2/dtrain.sh                 6
-rwxr-xr-x  dtrain/job2/hadoop-streaming-job.sh  23
-rwxr-xr-x  dtrain/run.sh                         6
-rw-r--r--  dtrain/test-reducer                   7
-rw-r--r--  dtrain/test/toy.dtrain.ini            4
16 files changed, 217 insertions, 10 deletions
diff --git a/dtrain/avgweights.rb b/dtrain/avgweights.rb
new file mode 100755
index 00000000..d5cfaa4d
--- /dev/null
+++ b/dtrain/avgweights.rb
@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby1.9.1
+
+
+STDIN.set_encoding 'utf-8'
+
+#shard_count_key = "__SHARD_COUNT__"
+
+w = {}
+c = {}
+w.default = 0
+c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ c[key] += 1.0
+end
+
+#shard_count = w["__SHARD_COUNT__"]
+
+w.each_key { |k|
+ #if k == shard_count_key then next end
+ #if k == "__bias" then next end
+ puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}"
+}
+
+#puts "#{shard_count_key}\t#{w[shard_count_key]}"
+
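The reducer above sums each feature's weight over all input lines and divides by the number of lines seen for that key, i.e. it averages per-feature weights across mapper outputs. A minimal local check (a sketch; it assumes the two columns are tab-separated, as the split /\t/ call expects, and that an interpreter matching the ruby1.9.1 shebang is installed):

    printf 'Glue\t0.5\nGlue\t1.5\nWordPenalty\t-2.0\n' | ./avgweights.rb
    # expected output (tab-separated): Glue 1.0, WordPenalty -2.0
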
diff --git a/dtrain/common.h b/dtrain/common.h
index 4ff975e1..49dc85b7 100644
--- a/dtrain/common.h
+++ b/dtrain/common.h
@@ -30,7 +30,7 @@
#define DTRAIN_DEFAULT_T 1 // iterations
#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
#define DTRAIN_DOTS 100 // when to display a '.'
-#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD?
+#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD?
#define DTRAIN_GRAMMAR_DELIM "########EOS########"
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 35996d6d..f005008e 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -13,6 +13,8 @@
//#include <boost/iostreams/filter/bzip2.hpp>
using namespace boost::iostreams;
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/lexical_cast.hpp>
#ifdef DTRAIN_DEBUG
#include "tests.h"
@@ -311,7 +313,7 @@ main( int argc, char** argv )
}
if ( broken_grammar ) continue;
grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __
- grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;
+ grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
decoder.SetSentenceGrammarFromString( grammar_str );
// decode, kbest
src_str_buf.push_back( in_split[1] );
@@ -323,7 +325,8 @@ main( int argc, char** argv )
while ( true ) {
string g;
getline( grammar_buf_in, g );
- if ( g == DTRAIN_GRAMMAR_DELIM ) break;
+ //if ( g == DTRAIN_GRAMMAR_DELIM ) break;
+ if (boost::starts_with(g, DTRAIN_GRAMMAR_DELIM)) break;
grammar_str += g+"\n";
i += 1;
}
@@ -430,6 +433,7 @@ main( int argc, char** argv )
}
++sid;
+ cerr << "reporter:counter:dtrain,sent," << sid << endl;
} // input loop
@@ -446,6 +450,7 @@ main( int argc, char** argv )
avg_1best_score_diff = avg_1best_score;
avg_1best_model_diff = avg_1best_model;
}
+ if ( !quiet ) {
cout << _prec5 << _pos << "WEIGHTS" << endl;
for (vector<string>::iterator it = wprint.begin(); it != wprint.end(); it++) {
cout << setw(16) << *it << " = " << dense_weights[FD::Convert( *it )] << endl;
@@ -456,6 +461,7 @@ main( int argc, char** argv )
cout << _pos << " (" << avg_1best_score_diff << ")" << endl;
cout << _nopos << "avg model score: " << avg_1best_model;
cout << _pos << " (" << avg_1best_model_diff << ")" << endl;
+ }
vector<double> remember_scores;
remember_scores.push_back( avg_1best_score );
remember_scores.push_back( avg_1best_model );
@@ -478,7 +484,7 @@ main( int argc, char** argv )
cout << time_dif/(double)in_sz<< " s/S)" << endl;
}
- if ( t+1 != T ) cout << endl;
+ if ( t+1 != T && !quiet ) cout << endl;
if ( noup ) break;
@@ -486,8 +492,21 @@ main( int argc, char** argv )
unlink( grammar_buf_tmp_fn );
if ( !noup ) {
+ // TODO BEST ITER
if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";
- weights.WriteToFile( cfg["output"].as<string>(), true );
+ if ( cfg["output"].as<string>() == "-" ) {
+ for ( SparseVector<double>::const_iterator ti = lambdas.begin();
+ ti != lambdas.end(); ++ti ) {
+ if ( ti->second == 0 ) continue;
+ //if ( ti->first == "__bias" ) continue;
+ cout << setprecision(9);
+ cout << _nopos << FD::Convert(ti->first) << "\t" << ti->second << endl;
+ //cout << "__SHARD_COUNT__\t1" << endl;
+ }
+ } else {
+ weights.InitFromVector( lambdas );
+ weights.WriteToFile( cfg["output"].as<string>(), true );
+ }
if ( !quiet ) cout << "done" << endl;
}
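With output=- set, dtrain now writes its final sparse weights to stdout as one feature<TAB>weight line each, which is the format the avgweights.rb reducer consumes; the reporter:counter line on stderr keeps the Hadoop streaming framework informed of mapper progress. A rough local dry run of the map/average pipeline (a sketch; shard1 and shard2 are hypothetical input files in the same line format the streaming job reads, and cdec.ini plus the LM file it references must be reachable from the working directory):

    ./dtrain -q -c job/dtrain.ini < shard1  > weights.all
    ./dtrain -q -c job/dtrain.ini < shard2 >> weights.all
    ./avgweights.rb < weights.all > weights.avg   # per-key averaging variant
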
diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb
new file mode 100755
index 00000000..e635aab4
--- /dev/null
+++ b/dtrain/job/avgweights.rb
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby1.9.1
+
+
+STDIN.set_encoding 'utf-8'
+
+#shard_count_key = "__SHARD_COUNT__"
+
+w = {}
+#c = {}
+w.default = 0
+#c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ #c[key] += 1.0
+end
+
+#shard_count = w["__SHARD_COUNT__"]
+
+num_map = 104.0
+
+w.each_key { |k|
+ #if k == shard_count_key then next end
+ #if k == "__bias" then next end
+ puts "#{k}\t#{w[k]/num_map}"
+ #/c[k]}" #{w[k]/shard_count}"
+}
+
+#puts "#{shard_count_key}\t#{w[shard_count_key]}"
+
diff --git a/dtrain/job/cdec.ini b/dtrain/job/cdec.ini
new file mode 100644
index 00000000..0d32f0b7
--- /dev/null
+++ b/dtrain/job/cdec.ini
@@ -0,0 +1,8 @@
+formalism=scfg
+add_pass_through_rules=true
+feature_function=WordPenalty
+cubepruning_pop_limit=30
+feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
+feature_function=RuleIdentityFeatures
+scfg_max_span_limit=15
+
diff --git a/dtrain/job/dtrain.ini b/dtrain/job/dtrain.ini
new file mode 100644
index 00000000..079d7d69
--- /dev/null
+++ b/dtrain/job/dtrain.ini
@@ -0,0 +1,10 @@
+decoder_config=cdec.ini
+kbest=100
+ngrams=4
+epochs=10
+input=-
+scorer=stupid_bleu
+output=-
+#stop_after=100
+#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
+
diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh
new file mode 100755
index 00000000..75ec29ea
--- /dev/null
+++ b/dtrain/job/dtrain.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+./dtrain -q -c dtrain.ini
+
+exit 0
+
diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/job/hadoop-streaming-job.sh
new file mode 100755
index 00000000..2cf3f50a
--- /dev/null
+++ b/dtrain/job/hadoop-streaming-job.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
+OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
+
+$HSTREAMING \
+ -mapper "dtrain.sh" \
+ -reducer "avgweights.rb" \
+ -input $IN \
+ -output $OUT \
+ -file avgweights.rb \
+ -file dtrain.sh \
+ -file dtrain \
+ -file dtrain.ini \
+ -file cdec.ini \
+ -file nc-wmt11.en.srilm.3.gz \
+ -jobconf mapred.reduce.tasks=1 \
+ -jobconf mapred.max.map.failures.percent=100
+
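The job runs with a single reducer (mapred.reduce.tasks=1), so the averaged weights land in one part file under $OUT, and the reporter:counter:dtrain,sent lines the mappers print to stderr show up as a counter in the job tracker. Fetching the result from HDFS afterwards (a sketch, assuming the default streaming output layout):

    OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
    hadoop fs -getmerge $OUT weights.avg   # merge the reducer's part file(s) into a local file
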
diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb
new file mode 100755
index 00000000..31048f16
--- /dev/null
+++ b/dtrain/job2/avgweights.rb
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby1.9.1
+
+
+STDIN.set_encoding 'utf-8'
+
+#shard_count_key = "__SHARD_COUNT__"
+
+w = {}
+#c = {}
+w.default = 0
+#c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ #c[key] += 1.0
+end
+
+#shard_count = w["__SHARD_COUNT__"]
+
+num_map = 2107.0
+
+w.each_key { |k|
+ #if k == shard_count_key then next end
+ #if k == "__bias" then next end
+ puts "#{k}\t#{w[k]/num_map}"
+ #/c[k]}" #{w[k]/shard_count}"
+}
+
+#puts "#{shard_count_key}\t#{w[shard_count_key]}"
+
diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini
new file mode 100644
index 00000000..0d32f0b7
--- /dev/null
+++ b/dtrain/job2/cdec.ini
@@ -0,0 +1,8 @@
+formalism=scfg
+add_pass_through_rules=true
+feature_function=WordPenalty
+cubepruning_pop_limit=30
+feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
+feature_function=RuleIdentityFeatures
+scfg_max_span_limit=15
+
diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini
new file mode 100644
index 00000000..ec005e46
--- /dev/null
+++ b/dtrain/job2/dtrain.ini
@@ -0,0 +1,10 @@
+decoder_config=cdec.ini
+kbest=100
+ngrams=3
+epochs=10
+input=-
+scorer=stupid_bleu
+output=-
+#stop_after=100
+#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
+
diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh
new file mode 100755
index 00000000..75ec29ea
--- /dev/null
+++ b/dtrain/job2/dtrain.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+./dtrain -q -c dtrain.ini
+
+exit 0
+
diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh
new file mode 100755
index 00000000..9ee70a33
--- /dev/null
+++ b/dtrain/job2/hadoop-streaming-job.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain
+OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50
+
+$HSTREAMING \
+ -mapper "dtrain.sh" \
+ -reducer "avgweights.rb" \
+ -input $IN \
+ -output $OUT \
+ -file avgweights.rb \
+ -file dtrain.sh \
+ -file dtrain \
+ -file dtrain.ini \
+ -file cdec.ini \
+ -file nc-wmt11.en.srilm.3.gz \
+ -jobconf mapred.reduce.tasks=1 \
+ -jobconf mapred.max.map.failures.percent=100
+
diff --git a/dtrain/run.sh b/dtrain/run.sh
index 97123dfa..72e56f3e 100755
--- a/dtrain/run.sh
+++ b/dtrain/run.sh
@@ -2,11 +2,11 @@
#INI=test/blunsom08.dtrain.ini
#INI=test/nc-wmt11/dtrain.ini
-INI=test/EXAMPLE/dtrain.ini
+#INI=test/EXAMPLE/dtrain.ini
#INI=test/EXAMPLE/dtrain.ruleids.ini
-#INI=test/toy.dtrain.ini
+INI=test/toy.dtrain.ini
#INI=test/EXAMPLE/dtrain.cdecrid.ini
-rm /tmp/dtrain-*
+#rm /tmp/dtrain-*
./dtrain -c $INI $1 $2 $3 $4
diff --git a/dtrain/test-reducer b/dtrain/test-reducer
new file mode 100644
index 00000000..b86e7894
--- /dev/null
+++ b/dtrain/test-reducer
@@ -0,0 +1,7 @@
+a 1
+b 2
+c 3.5
+a 1
+b 2
+c 3.5
+__SHARD_COUNT__ 2
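test-reducer is a small fixture for the reducer scripts. Piped through dtrain/avgweights.rb, which divides by the per-key line count, it should yield a 1.0, b 2.0, c 3.5 and __SHARD_COUNT__ 2.0; the job/ and job2/ variants instead divide every sum by a hard-coded mapper count (104.0 and 2107.0). A sketch, assuming the fixture's two columns are tab-separated:

    ./avgweights.rb < test-reducer
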
diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini
index e9ed0ce5..7272e655 100644
--- a/dtrain/test/toy.dtrain.ini
+++ b/dtrain/test/toy.dtrain.ini
@@ -2,9 +2,9 @@ decoder_config=test/cdec.ini
kbest=4
ngrams=1
epochs=3
-input=test/toy.in
+input=- #test/toy.in
scorer=bleu
-output=data/w/toy.gz
+output=- #data/w/toy.gz
#stop_after=1000
wprint=logp use_shell use_house PassThrough
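
With input and output both switched to stdin/stdout, the toy setup can be smoke-tested locally through the updated run.sh (a sketch; it assumes test/toy.in, the file the ini pointed at before this change, still contains the toy input):

    cat test/toy.in | ./run.sh > toy.weights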