From 9bde56ed23b4b97f8193f9f8f582f18086ff17c1 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 23 Sep 2011 20:53:15 +0200
Subject: begin refactoring
---
dtrain/Makefile.am | 7 +-
dtrain/README | 1 +
dtrain/avgweights.rb | 27 ------
dtrain/common.h | 43 ---------
dtrain/dtest.cc | 94 --------------------
dtrain/dtrain.cc | 44 ++++------
dtrain/dtrain.h | 60 +++++++++++++
dtrain/hstreaming/avgweights.rb | 27 ++++++
dtrain/hstreaming/cdec.ini | 8 ++
dtrain/hstreaming/dtrain.ini | 10 +++
dtrain/hstreaming/hadoop-streaming-job.sh | 23 +++++
dtrain/job/avgweights.rb | 30 -------
dtrain/job/cdec.ini | 8 --
dtrain/job/dtrain.ini | 10 ---
dtrain/job/dtrain.sh | 6 --
dtrain/job/hadoop-streaming-job.sh | 23 -----
dtrain/job2/avgweights.rb | 30 -------
dtrain/job2/cdec.ini | 8 --
dtrain/job2/dtrain.ini | 10 ---
dtrain/job2/dtrain.sh | 6 --
dtrain/job2/hadoop-streaming-job.sh | 23 -----
dtrain/pairsampling.h | 64 ++++++++++++++
dtrain/run.sh | 12 ---
dtrain/sample.h | 64 --------------
dtrain/test-reducer | 7 --
dtrain/tests.cc | 141 ------------------------------
dtrain/tests.h | 26 ------
dtrain/updater.h | 107 -----------------------
dtrain/util.cc | 34 -------
dtrain/util.h | 28 ------
30 files changed, 212 insertions(+), 769 deletions(-)
delete mode 100755 dtrain/avgweights.rb
delete mode 100644 dtrain/common.h
delete mode 100644 dtrain/dtest.cc
create mode 100644 dtrain/dtrain.h
create mode 100755 dtrain/hstreaming/avgweights.rb
create mode 100644 dtrain/hstreaming/cdec.ini
create mode 100644 dtrain/hstreaming/dtrain.ini
create mode 100755 dtrain/hstreaming/hadoop-streaming-job.sh
delete mode 100755 dtrain/job/avgweights.rb
delete mode 100644 dtrain/job/cdec.ini
delete mode 100644 dtrain/job/dtrain.ini
delete mode 100755 dtrain/job/dtrain.sh
delete mode 100755 dtrain/job/hadoop-streaming-job.sh
delete mode 100755 dtrain/job2/avgweights.rb
delete mode 100644 dtrain/job2/cdec.ini
delete mode 100644 dtrain/job2/dtrain.ini
delete mode 100755 dtrain/job2/dtrain.sh
delete mode 100755 dtrain/job2/hadoop-streaming-job.sh
create mode 100644 dtrain/pairsampling.h
delete mode 100755 dtrain/run.sh
delete mode 100644 dtrain/sample.h
delete mode 100644 dtrain/test-reducer
delete mode 100644 dtrain/tests.cc
delete mode 100644 dtrain/tests.h
delete mode 100644 dtrain/updater.h
delete mode 100644 dtrain/util.cc
delete mode 100644 dtrain/util.h
(limited to 'dtrain')
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c08cd1ea..9b5df8bf 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,11 +1,8 @@
# TODO I'm sure I can leave something out.
-bin_PROGRAMS = dtrain dtest
+bin_PROGRAMS = dtrain
-dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc
+dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
-dtest_SOURCES = dtest.cc score.cc util.cc
-dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README b/dtrain/README
index b3f513be..0cc52acc 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -31,6 +31,7 @@ TODO
use separate TEST SET
KNOWN BUGS PROBLEMS
+ if size of candidate < N => 0 score
cdec kbest vs 1best (no -k param), rescoring? => ok(?)
no sparse vector in decoder => ok
? ok
diff --git a/dtrain/avgweights.rb b/dtrain/avgweights.rb
deleted file mode 100755
index d5cfaa4d..00000000
--- a/dtrain/avgweights.rb
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-c = {}
-w.default = 0
-c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/common.h b/dtrain/common.h
deleted file mode 100644
index 49dc85b7..00000000
--- a/dtrain/common.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _DTRAIN_COMMON_H_
-#define _DTRAIN_COMMON_H_
-
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-// cdec includes
-#include "sentence_metadata.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "weights.h"
-
-// boost includes
-#include
-#include
-
-// own headers
-#include "score.h"
-
-#define DTRAIN_DEFAULT_K 100 // k for kbest lists
-#define DTRAIN_DEFAULT_N 4 // N for ngrams (e.g. BLEU)
-#define DTRAIN_DEFAULT_T 1 // iterations
-#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
-#define DTRAIN_DOTS 100 // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD?
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
-
-
-using namespace std;
-using namespace dtrain;
-namespace po = boost::program_options;
-
-
-#endif
-
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
deleted file mode 100644
index 36c880a3..00000000
--- a/dtrain/dtest.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-
-
-/*
- * init
- *
- */
-bool
-init(int argc, char** argv, po::variables_map* conf)
-{
- int N;
- po::options_description opts( "Command Line Options" );
- opts.add_options()
- ( "decoder-config,c", po::value(), "configuration file for cdec" )
- ( "weights,w", po::value(), "weights file" )
- ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" );
- po::options_description cmdline_options;
- cmdline_options.add(opts);
- po::store( parse_command_line(argc, argv, cmdline_options), *conf );
- po::notify( *conf );
- if ( ! (conf->count("decoder-config") || conf->count("weights")) ) {
- cerr << cmdline_options << endl;
- return false;
- }
- return true;
-}
-
-
-/*
- * main
- *
- */
-int
-main(int argc, char** argv)
-{
- SetSilent( true );
- po::variables_map conf;
- if ( !init(argc, argv, &conf) ) return 1;
- register_feature_functions();
- size_t k = 1;
- ReadFile ini_rf( conf["decoder-config"].as() );
- Decoder decoder( ini_rf.stream() );
- KBestGetter observer( k, "no" );
- size_t N = conf["ngrams"].as();
-
- Weights weights;
- if ( conf.count("weights") ) weights.InitFromFile( conf["weights"].as() );
- vector w;
- weights.InitVector( &w );
- decoder.SetWeights( w );
-
- vector in_split, ref_strs;
- vector ref_ids;
- string in, psg;
- size_t sn = 0;
- double overall = 0.0;
- double overall1 = 0.0;
- double overall2 = 0.0;
- while( getline(cin, in) ) {
- in_split.clear();
- boost::split( in_split, in, boost::is_any_of("\t") );
- // grammar
- psg = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ); psg += "\n";
- decoder.SetSentenceGrammarFromString( psg );
- decoder.Decode( in_split[1], &observer );
- KBestList* kb = observer.GetKBest();
- // reference
- ref_strs.clear(); ref_ids.clear();
- boost::split( ref_strs, in_split[2], boost::is_any_of(" ") );
- register_and_convert( ref_strs, ref_ids );
- // scoring kbest
- double score = 0.0;
- double score1 = 0.0;
- double score2 = 0.0;
- NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], N );
- score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- cout << TD::GetString( kb->sents[0] ) << endl;
- overall += score;
- overall1 += score1;
- overall2 += score2;
- sn += 1;
- }
- cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
- cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
- cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
- cerr << endl;
-
- return 0;
-}
-
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index f005008e..01821b30 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,24 +1,19 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-#include "sample.h"
+#include "dtrain.h"
-#include "ksampler.h"
-// boost compression
-#include
-#include
-#include
-//#include
-//#include
-using namespace boost::iostreams;
-#include
-#include
-
-#ifdef DTRAIN_DEBUG
-#include "tests.h"
-#endif
+/*
+ * register_and_convert
+ * appends TD::Convert( s ) to ids for each s in strs, in order
+ */
+void
+register_and_convert(const vector& strs, vector& ids)
+{
+  const size_t n = strs.size();
+  for ( size_t i = 0; i < n; ++i ) {
+    ids.push_back( TD::Convert( strs[i] ) );
+  }
+}
/*
@@ -49,12 +44,7 @@ init(int argc, char** argv, po::variables_map* cfg)
clo.add_options()
( "config,c", po::value(), "dtrain config file" )
( "quiet,q", po::value()->zero_tokens(), "be quiet" )
- ( "verbose,v", po::value()->zero_tokens(), "be verbose" )
-#ifndef DTRAIN_DEBUG
- ;
-#else
- ( "test", "run tests and exit");
-#endif
+ ( "verbose,v", po::value()->zero_tokens(), "be verbose" );
po::options_description config_options, cmdline_options;
config_options.add(conff);
@@ -149,9 +139,9 @@ main( int argc, char** argv )
if ( !quiet )
cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl;
Decoder decoder( ini_rf.stream() );
- //KBestGetter observer( k, filter_type );
+ KBestGetter observer( k, filter_type );
MT19937 rng;
- KSampler observer( k, &rng );
+ //KSampler observer( k, &rng );
// scoring metric/scorer
string scorer_str = cfg["scorer"].as();
@@ -433,7 +423,7 @@ main( int argc, char** argv )
}
++sid;
- cerr << "reporter:counter:dtrain,sent," << sid << endl;
+ //cerr << "reporter:counter:dtrain,sent," << sid << endl;
} // input loop
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
new file mode 100644
index 00000000..3d319233
--- /dev/null
+++ b/dtrain/dtrain.h
@@ -0,0 +1,60 @@
+#ifndef _DTRAIN_COMMON_H_
+#define _DTRAIN_COMMON_H_
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+// cdec includes
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+
+// boost includes
+#include
+#include
+
+// own headers
+#include "score.h"
+
+#define DTRAIN_DEFAULT_K 100 // k for kbest lists
+#define DTRAIN_DEFAULT_N 4 // N for ngrams (e.g. BLEU)
+#define DTRAIN_DEFAULT_T 1 // iterations
+#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
+#define DTRAIN_DOTS 100 // when to display a '.'
+#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD?
+#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+
+
+#include "kbestget.h"
+#include "pairsampling.h"
+
+#include "ksampler.h"
+
+// boost compression
+#include
+#include
+#include
+//#include
+//#include
+using namespace boost::iostreams;
+
+#include
+#include
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+
+#endif
+
diff --git a/dtrain/hstreaming/avgweights.rb b/dtrain/hstreaming/avgweights.rb
new file mode 100755
index 00000000..d5cfaa4d
--- /dev/null
+++ b/dtrain/hstreaming/avgweights.rb
@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby1.9.1
+
+# Reads tab-separated "key<TAB>value" lines from STDIN and prints "key<TAB>mean value" per key.
+STDIN.set_encoding 'utf-8'
+
+#shard_count_key = "__SHARD_COUNT__"
+
+w = {} # per-key sum of the values read
+c = {} # per-key number of lines read
+w.default = 0
+c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ c[key] += 1.0
+end
+
+#shard_count = w["__SHARD_COUNT__"]
+
+w.each_key { |k|
+ #if k == shard_count_key then next end
+ #if k == "__bias" then next end
+ puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}"
+}
+
+#puts "#{shard_count_key}\t#{w[shard_count_key]}"
+
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
new file mode 100644
index 00000000..0d32f0b7
--- /dev/null
+++ b/dtrain/hstreaming/cdec.ini
@@ -0,0 +1,8 @@
+formalism=scfg
+add_pass_through_rules=true
+feature_function=WordPenalty
+cubepruning_pop_limit=30
+feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
+feature_function=RuleIdentityFeatures
+scfg_max_span_limit=15
+
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
new file mode 100644
index 00000000..079d7d69
--- /dev/null
+++ b/dtrain/hstreaming/dtrain.ini
@@ -0,0 +1,10 @@
+decoder_config=cdec.ini
+kbest=100
+ngrams=4
+epochs=10
+input=-
+scorer=stupid_bleu
+output=-
+#stop_after=100
+#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
+
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
new file mode 100755
index 00000000..2cf3f50a
--- /dev/null
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Submits a Hadoop streaming job with dtrain.sh as mapper and avgweights.rb as the only reducer.
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
+OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
+# NOTE(review): dtrain.sh is shipped below but is not added next to this script (job/dtrain.sh is removed) -- confirm it is present.
+$HSTREAMING \
+ -mapper "dtrain.sh" \
+ -reducer "avgweights.rb" \
+ -input $IN \
+ -output $OUT \
+ -file avgweights.rb \
+ -file dtrain.sh \
+ -file dtrain \
+ -file dtrain.ini \
+ -file cdec.ini \
+ -file nc-wmt11.en.srilm.3.gz \
+ -jobconf mapred.reduce.tasks=1 \
+ -jobconf mapred.max.map.failures.percent=100
+
diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb
deleted file mode 100755
index e635aab4..00000000
--- a/dtrain/job/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 104.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job/cdec.ini b/dtrain/job/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job/dtrain.ini b/dtrain/job/dtrain.ini
deleted file mode 100644
index 079d7d69..00000000
--- a/dtrain/job/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=4
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/job/hadoop-streaming-job.sh
deleted file mode 100755
index 2cf3f50a..00000000
--- a/dtrain/job/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
-
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "avgweights.rb" \
- -input $IN \
- -output $OUT \
- -file avgweights.rb \
- -file dtrain.sh \
- -file dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
- -jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100
-
diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb
deleted file mode 100755
index 31048f16..00000000
--- a/dtrain/job2/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 2107.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job2/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini
deleted file mode 100644
index ec005e46..00000000
--- a/dtrain/job2/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=3
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job2/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh
deleted file mode 100755
index 9ee70a33..00000000
--- a/dtrain/job2/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50
-
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "avgweights.rb" \
- -input $IN \
- -output $OUT \
- -file avgweights.rb \
- -file dtrain.sh \
- -file dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
- -jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100
-
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
new file mode 100644
index 00000000..502901af
--- /dev/null
+++ b/dtrain/pairsampling.h
@@ -0,0 +1,64 @@
+#ifndef _DTRAIN_SAMPLE_H_
+#define _DTRAIN_SAMPLE_H_
+
+
+#include "kbestget.h"
+
+
+namespace dtrain
+{
+
+// A pair of k-best entries, as filled in by sample_all() and sample_rand().
+struct TPair
+{
+ SparseVector first, second; // copies of kb->feats[i] and kb->feats[j]
+ size_t first_rank, second_rank; // the indices i and j into the k-best list
+ double first_score, second_score; // copies of kb->scores[i] and kb->scores[j]
+};
+
+typedef vector TrainingInstances;
+
+// Appends one TPair for every index pair (i, j), i < j, of kb to training; inline because it is defined in a header.
+inline void
+sample_all( KBestList* kb, TrainingInstances &training )
+{
+  for ( size_t i = 0; i+1 < kb->GetSize(); i++ ) { // i+1 <: GetSize()-1 wraps around for an empty list
+    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+      TPair p;
+      p.first = kb->feats[i];
+      p.second = kb->feats[j];
+      p.first_rank = i;
+      p.second_rank = j;
+      p.first_score = kb->scores[i];
+      p.second_score = kb->scores[j];
+      training.push_back( p );
+    }
+  }
+}
+// Like sample_all(), but appends a pair (i, j) only when rand() % 2 is nonzero; inline because it is defined in a header.
+inline void
+sample_rand( KBestList* kb, TrainingInstances &training )
+{
+  static bool seeded = false;
+  if ( !seeded ) { srand( time(NULL) ); seeded = true; } // seed once: reseeding on every call repeats the same draws within one second
+  for ( size_t i = 0; i+1 < kb->GetSize(); i++ ) { // i+1 <: GetSize()-1 wraps around for an empty list
+    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+      if ( rand() % 2 == 0 ) continue; // skip this pair
+      TPair p;
+      p.first = kb->feats[i];
+      p.second = kb->feats[j];
+      p.first_rank = i;
+      p.second_rank = j;
+      p.first_score = kb->scores[i];
+      p.second_score = kb->scores[j];
+      training.push_back( p );
+    }
+  }
+}
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/run.sh b/dtrain/run.sh
deleted file mode 100755
index 72e56f3e..00000000
--- a/dtrain/run.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh
-
-#INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ruleids.ini
-INI=test/toy.dtrain.ini
-#INI=test/EXAMPLE/dtrain.cdecrid.ini
-
-#rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4
-
diff --git a/dtrain/sample.h b/dtrain/sample.h
deleted file mode 100644
index 502901af..00000000
--- a/dtrain/sample.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef _DTRAIN_SAMPLE_H_
-#define _DTRAIN_SAMPLE_H_
-
-
-#include "kbestget.h"
-
-
-namespace dtrain
-{
-
-
-struct TPair
-{
- SparseVector first, second;
- size_t first_rank, second_rank;
- double first_score, second_score;
-};
-
-typedef vector TrainingInstances;
-
-
-void
-sample_all( KBestList* kb, TrainingInstances &training )
-{
- for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
- TPair p;
- p.first = kb->feats[i];
- p.second = kb->feats[j];
- p.first_rank = i;
- p.second_rank = j;
- p.first_score = kb->scores[i];
- p.second_score = kb->scores[j];
- training.push_back( p );
- }
- }
-}
-
-void
-sample_rand( KBestList* kb, TrainingInstances &training )
-{
- srand( time(NULL) );
- for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
- if ( rand() % 2 ) {
- TPair p;
- p.first = kb->feats[i];
- p.second = kb->feats[j];
- p.first_rank = i;
- p.second_rank = j;
- p.first_score = kb->scores[i];
- p.second_score = kb->scores[j];
- training.push_back( p );
- }
- }
- }
-}
-
-
-} // namespace
-
-
-#endif
-
diff --git a/dtrain/test-reducer b/dtrain/test-reducer
deleted file mode 100644
index b86e7894..00000000
--- a/dtrain/test-reducer
+++ /dev/null
@@ -1,7 +0,0 @@
-a 1
-b 2
-c 3.5
-a 1
-b 2
-c 3.5
-__SHARD_COUNT__ 2
diff --git a/dtrain/tests.cc b/dtrain/tests.cc
deleted file mode 100644
index 997eafbb..00000000
--- a/dtrain/tests.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "tests.h"
-
-
-namespace dtrain
-{
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
- cout << "Testing ngrams..." << endl << endl;
- size_t N = 5;
- cout << "N = " << N << endl;
- vector a; // hyp
- vector b; // ref
- cout << "a ";
- for (size_t i = 1; i <= 8; i++) {
- cout << i << " ";
- a.push_back(i);
- }
- cout << endl << "b ";
- for (size_t i = 1; i <= 4; i++) {
- cout << i << " ";
- b.push_back(i);
- }
- cout << endl << endl;
- NgramCounts c = make_ngram_counts( a, b, N );
- assert( c.clipped[N-1] == 0 );
- assert( c.sum[N-1] == 4 );
- c.print();
- c += c;
- cout << endl;
- c.print();
- cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
- cout << "Testing metrics..." << endl << endl;
- using namespace boost::assign;
- vector a, b;
- vector expect_vanilla, expect_smooth, expect_stupid;
- a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
- b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
- expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
- expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
- expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
- vector aa, bb;
- vector aai, bbi;
- double vanilla, smooth, stupid;
- size_t N = 4;
- cout << "N = " << N << endl << endl;
- for ( size_t i = 0; i < a.size(); i++ ) {
- cout << " hyp: " << a[i] << endl;
- cout << " ref: " << b[i] << endl;
- aa.clear(); bb.clear(); aai.clear(); bbi.clear();
- boost::split( aa, a[i], boost::is_any_of(" ") );
- boost::split( bb, b[i], boost::is_any_of(" ") );
- register_and_convert( aa, aai );
- register_and_convert( bb, bbi );
- NgramCounts counts = make_ngram_counts( aai, bbi, N );
- vanilla = bleu( counts, aa.size(), bb.size(), N);
- smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
- stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
- assert( approx_equal(vanilla, expect_vanilla[i]) );
- assert( approx_equal(smooth, expect_smooth[i]) );
- assert( approx_equal(stupid, expect_stupid[i]) );
- cout << setw(14) << "bleu = " << vanilla << endl;
- cout << setw(14) << "smooth bleu = " << smooth << endl;
- cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
- }
- cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
- cout << "Testing Weights::SetWeight..." << endl << endl;
- Weights weights;
- SparseVector lambdas;
- weights.InitSparseVector( &lambdas );
- weights.SetWeight( &lambdas, "test", 0 );
- weights.SetWeight( &lambdas, "test1", 1 );
- WordID fid = FD::Convert( "test2" );
- weights.SetWeight( &lambdas, fid, 2 );
- string fn = "weights-test";
- cout << "FD::NumFeats() " << FD::NumFeats() << endl;
- assert( FD::NumFeats() == 4 );
- weights.WriteToFile( fn, true );
- cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
- cout << endl;
- test_ngrams();
- cout << endl;
- test_metrics();
- cout << endl;
- test_SetWeights();
- exit(0);
-}
-
-
-} // namespace
-
diff --git a/dtrain/tests.h b/dtrain/tests.h
deleted file mode 100644
index 9853e3c3..00000000
--- a/dtrain/tests.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _DTRAIN_TESTS_H_
-#define _DTRAIN_TESTS_H_
-
-#include
-#include
-
-#include "common.h"
-#include "util.h"
-
-
-namespace dtrain
-{
-
-
-double approx_equal( double x, double y );
-void test_ngrams();
-void test_metrics();
-void test_SetWeights();
-void run_tests();
-
-
-} // namespace
-
-
-#endif
-
diff --git a/dtrain/updater.h b/dtrain/updater.h
deleted file mode 100644
index b54c25de..00000000
--- a/dtrain/updater.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef _DTRAIN_LEARNER_H_
-#define _DTRAIN_LEARNER_H_
-
-#include
-#include
-#include