author     Patrick Simianer <p@simianer.de>    2011-08-03 01:29:52 +0200
committer  Patrick Simianer <p@simianer.de>    2011-09-23 19:13:57 +0200
commit     2e605eb2745e56619b16fdbcb8095e0a6543ab27 (patch)
tree       03c122c3add26365eb8f3f84aec2a533d7222cab /dtrain
parent     b7568a8dad2720d5ea0418171e9b85229adbbcc5 (diff)
refactoring, cleaning up
Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/Makefile.am      |   4
-rw-r--r--  dtrain/common.h         |  37
-rw-r--r--  dtrain/dcommon.cc       | 330
-rw-r--r--  dtrain/dcommon.h        | 163
-rw-r--r--  dtrain/dtest.cc         |  47
-rw-r--r--  dtrain/dtrain.cc        |  86
-rw-r--r--  dtrain/kbestget.h       |  61
-rw-r--r--  dtrain/learner.h        | 133
-rw-r--r--  dtrain/score.cc         | 166
-rw-r--r--  dtrain/score.h          | 111
-rwxr-xr-x  dtrain/scripts/run.sh   |   4
-rwxr-xr-x  dtrain/scripts/test.sh  |   6
-rwxr-xr-x  dtrain/test.sh          |   4
-rw-r--r--  dtrain/tests.cc         | 141
-rw-r--r--  dtrain/tests.h          |  26
-rw-r--r--  dtrain/util.cc          |  34
-rw-r--r--  dtrain/util.h           |  28
17 files changed, 774 insertions, 607 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c3f14bb0..03e3ccf7 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,10 +1,10 @@
# TODO I'm sure I can leave something out.
bin_PROGRAMS = dtrain dtest
-dtrain_SOURCES = dtrain.cc dcommon.cc
+dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-dtest_SOURCES = dtest.cc dcommon.cc
+dtest_SOURCES = dtest.cc score.cc util.cc
dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/common.h b/dtrain/common.h
new file mode 100644
index 00000000..cf365d48
--- /dev/null
+++ b/dtrain/common.h
@@ -0,0 +1,37 @@
+#ifndef _DTRAIN_COMMON_H_
+#define _DTRAIN_COMMON_H_
+
+
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+
+#include "score.h"
+
+#define DTRAIN_DEFAULT_K 100
+#define DTRAIN_DEFAULT_N 4
+#define DTRAIN_DEFAULT_T 1
+
+#define DTRAIN_DOTOUT 100
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+
+#endif
+
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
deleted file mode 100644
index 6657bed6..00000000
--- a/dtrain/dcommon.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-#include "dcommon.h"
-
-
-
-
-/******************************************************************************
- * NGRAMS
- *
- *
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
-{
- Ngrams ngrams;
- vector<WordID> ng;
- for ( size_t i = 0; i < s.size(); i++ ) {
- ng.clear();
- for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
- ng.push_back( s[j] );
- ngrams[ng]++;
- }
- }
- return ngrams;
-}
-
-
-/*
- * ngram_matches
- *
- */
-NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
-{
- Ngrams hyp_ngrams = make_ngrams( hyp, N );
- Ngrams ref_ngrams = make_ngrams( ref, N );
- NgramCounts counts( N );
- Ngrams::iterator it;
- Ngrams::iterator ti;
- for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
- ti = ref_ngrams.find( it->first );
- if ( ti != ref_ngrams.end() ) {
- counts.add( it->second, ti->second, it->first.size() - 1 );
- } else {
- counts.add( it->second, 0, it->first.size() - 1 );
- }
- }
- return counts;
-}
-
-
-
-
-/******************************************************************************
- * SCORES
- *
- *
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
- if ( hyp_len > ref_len ) return 1;
- return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<float> weights )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
- sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
- }
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector<float> weights )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- float add = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( i == 1 ) add = 1;
- sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
- }
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<float> weights )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- float j = 1;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
- j++;
- }
- return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector<float> weights )
-{
- return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-
-
-/******************************************************************************
- * UTILS
- *
- *
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
- vector<string>::const_iterator it;
- for ( it = strs.begin(); it < strs.end(); it++ ) {
- ids.push_back( TD::Convert( *it ) );
- }
-}
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * print_FD
- *
- */
-void
-print_FD()
-{
- for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-
-
-/******************************************************************************
- * TESTS
- *
- *
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
- cout << "Testing ngrams..." << endl << endl;
- size_t N = 5;
- cout << "N = " << N << endl;
- vector<int> a; // hyp
- vector<int> b; // ref
- cout << "a ";
- for (size_t i = 1; i <= 8; i++) {
- cout << i << " ";
- a.push_back(i);
- }
- cout << endl << "b ";
- for (size_t i = 1; i <= 4; i++) {
- cout << i << " ";
- b.push_back(i);
- }
- cout << endl << endl;
- NgramCounts c = make_ngram_counts( a, b, N );
- assert( c.clipped[N-1] == 0 );
- assert( c.sum[N-1] == 4 );
- c.print();
- c += c;
- cout << endl;
- c.print();
- cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
- cout << "Testing metrics..." << endl << endl;
- using namespace boost::assign;
- vector<string> a, b;
- vector<double> expect_vanilla, expect_smooth, expect_stupid;
- a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
- b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
- expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
- expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
- expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
- vector<string> aa, bb;
- vector<WordID> aai, bbi;
- double vanilla, smooth, stupid;
- size_t N = 4;
- cout << "N = " << N << endl << endl;
- for ( size_t i = 0; i < a.size(); i++ ) {
- cout << " hyp: " << a[i] << endl;
- cout << " ref: " << b[i] << endl;
- aa.clear(); bb.clear(); aai.clear(); bbi.clear();
- boost::split( aa, a[i], boost::is_any_of(" ") );
- boost::split( bb, b[i], boost::is_any_of(" ") );
- register_and_convert( aa, aai );
- register_and_convert( bb, bbi );
- NgramCounts counts = make_ngram_counts( aai, bbi, N );
- vanilla = bleu( counts, aa.size(), bb.size(), N);
- smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
- stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
- assert( approx_equal(vanilla, expect_vanilla[i]) );
- assert( approx_equal(smooth, expect_smooth[i]) );
- assert( approx_equal(stupid, expect_stupid[i]) );
- cout << setw(14) << "bleu = " << vanilla << endl;
- cout << setw(14) << "smooth bleu = " << smooth << endl;
- cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
- }
- cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
- cout << "Testing Weights::SetWeight..." << endl << endl;
- Weights weights;
- SparseVector<double> lambdas;
- weights.InitSparseVector( &lambdas );
- weights.SetWeight( &lambdas, "test", 0 );
- weights.SetWeight( &lambdas, "test1", 1 );
- WordID fid = FD::Convert( "test2" );
- weights.SetWeight( &lambdas, fid, 2 );
- string fn = "weights-test";
- cout << "FD::NumFeats() " << FD::NumFeats() << endl;
- assert( FD::NumFeats() == 4 );
- weights.WriteToFile( fn, true );
- cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
- cout << endl;
- test_ngrams();
- cout << endl;
- test_metrics();
- cout << endl;
- test_SetWeights();
- exit(0);
-}
-
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
deleted file mode 100644
index 6df841bb..00000000
--- a/dtrain/dcommon.h
+++ /dev/null
@@ -1,163 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include "config.h"
-
-#include <boost/shared_ptr.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-
-
-
-struct ScorePair
-{
- ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
- double modelscore_, score_;
- double GetModelScore() { return modelscore_; }
- double GetScore() { return score_; }
-};
-typedef vector<ScorePair> Scores;
-
-
-/*
- * KBestGetter
- *
- */
-struct KBestList {
- vector<SparseVector<double> > feats;
- vector<vector<WordID> > sents;
- vector<double> scores;
-};
-struct KBestGetter : public DecoderObserver
-{
- KBestGetter( const size_t k ) : k_(k) {}
- const size_t k_;
- KBestList kb;
-
- virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
- {
- GetKBest(smeta.GetSentenceID(), *hg);
- }
-
- KBestList* GetKBest() { return &kb; }
-
- void
- GetKBest(int sent_id, const Hypergraph& forest)
- {
- kb.scores.clear();
- kb.sents.clear();
- kb.feats.clear();
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
- for ( size_t i = 0; i < k_; ++i ) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest( forest.nodes_.size() - 1, i );
- if (!d) break;
- kb.sents.push_back( d->yield);
- kb.feats.push_back( d->feature_values );
- kb.scores.push_back( d->score );
- }
- }
-};
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
- NgramCounts( const size_t N ) : N_( N ) {
- reset();
- }
- size_t N_;
- map<size_t, size_t> clipped;
- map<size_t, size_t> sum;
-
- void
- operator+=( const NgramCounts& rhs )
- {
- assert( N_ == rhs.N_ );
- for ( size_t i = 0; i < N_; i++ ) {
- this->clipped[i] += rhs.clipped.find(i)->second;
- this->sum[i] += rhs.sum.find(i)->second;
- }
- }
-
- void
- add( size_t count, size_t ref_count, size_t i )
- {
- assert( i < N_ );
- if ( count > ref_count ) {
- clipped[i] += ref_count;
- sum[i] += count;
- } else {
- clipped[i] += count;
- sum[i] += count;
- }
- }
-
- void
- reset()
- {
- size_t i;
- for ( i = 0; i < N_; i++ ) {
- clipped[i] = 0;
- sum[i] = 0;
- }
- }
-
- void
- print()
- {
- for ( size_t i = 0; i < N_; i++ ) {
- cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
- cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
- }
- }
-};
-
-
-
-
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams make_ngrams( vector<WordID>& s, size_t N );
-NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N );
-double brevity_penaly( const size_t hyp_len, const size_t ref_len );
-double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
-double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector<float> weights = vector<float>() );
-double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
-double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() );
-void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-void print_FD();
-void run_tests();
-void test_SetWeights();
-#include <boost/assign/std/vector.hpp>
-#include <iomanip>
-void test_metrics();
-double approx_equal( double x, double y );
-void test_ngrams();
-
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
index 5ae473e6..d1ff30c0 100644
--- a/dtrain/dtest.cc
+++ b/dtrain/dtest.cc
@@ -1,6 +1,6 @@
-#include "dcommon.h"
-
-
+#include "common.h"
+#include "kbestget.h"
+#include "util.h"
/*
@@ -14,10 +14,10 @@ init(int argc, char** argv, po::variables_map* conf)
bool q;
po::options_description opts( "Options" );
opts.add_options()
- ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
- ( "weights,w", po::value<string>(), "weights file")
- ( "ngrams,n", po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" )
- ( "quiet,q", po::value<bool>(&q)->default_value(true), "do not output translations" );
+ ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+ ( "weights,w", po::value<string>(), "weights file" )
+ ( "ngrams,n", po::value<int>(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" )
+ ( "quiet,q", po::value<bool>(&q)->default_value(true), "do not output translations" );
po::options_description cmdline_options;
cmdline_options.add(opts);
po::store( parse_command_line(argc, argv, cmdline_options), *conf );
@@ -57,17 +57,17 @@ main(int argc, char** argv)
vector<string> strs, ref_strs;
vector<WordID> ref_ids;
string in, psg;
- size_t sid = 0;
- double overall = 0.0;
+ size_t sn = 0;
+ double overall = 0.0;
double overall1 = 0.0;
double overall2 = 0.0;
- cerr << "(a dot equals 100 lines of input)" << endl;
+ cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl;
while( getline(cin, in) ) {
- if ( (sid+1) % 100 == 0 ) {
+ if ( (sn+1) % DTRAIN_DOTOUT == 0 ) {
cerr << ".";
- if ( (sid+1)%1000 == 0 ) cerr << endl;
+ if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl;
}
- //if ( sid > 5000 ) break;
+ //if ( sn > 5000 ) break;
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
// grammar
@@ -80,25 +80,22 @@ main(int argc, char** argv)
boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
register_and_convert( ref_strs, ref_ids );
// scoring kbest
- double score = 0.0;
+ double score = 0.0;
double score1 = 0.0;
double score2 = 0.0;
NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
- score = smooth_bleu( counts,
- ref_ids.size(),
- kb->sents[0].size(), N );
- score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ;
- score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- //if ( ! quiet )
- cout << TD::GetString( kb->sents[0] ) << endl;
+ score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ if ( ! quiet ) cout << TD::GetString( kb->sents[0] ) << endl;
overall += score;
overall1 += score1;
overall2 += score2;
- sid += 1;
+ sn += 1;
}
- cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl;
- cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl;
- cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl;
+ cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
+ cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
+ cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
cerr << endl;
return 0;
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 373458e8..16b83a70 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,6 +1,11 @@
-#include "dcommon.h"
+#include "common.h"
+#include "kbestget.h"
#include "learner.h"
+#include "util.h"
+#ifdef DTRAIN_DEBUG
+#include "tests.h"
+#endif
@@ -12,20 +17,33 @@ bool
init(int argc, char** argv, po::variables_map* conf)
{
po::options_description opts( "Options" );
+ size_t k, N, T;
+ // TODO scoring metric as parameter/in config
opts.add_options()
- ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
- ( "kbest,k", po::value<size_t>(), "k for kbest" )
- ( "ngrams,n", po::value<int>(), "n for Ngrams" )
- ( "filter,f", po::value<string>(), "filter kbest list" )
- ( "test", "run tests and exit");
+ ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+ ( "kbest,k", po::value<size_t>(&k)->default_value(DTRAIN_DEFAULT_K), "k for kbest" )
+ ( "ngrams,n", po::value<size_t>(&N)->default_value(DTRAIN_DEFAULT_N), "n for Ngrams" )
+ ( "filter,f", po::value<string>(), "filter kbest list" ) // FIXME
+ ( "epochs,t", po::value<size_t>(&T)->default_value(DTRAIN_DEFAULT_T), "# of iterations T" )
+#ifndef DTRAIN_DEBUG
+ ;
+#else
+ ( "test", "run tests and exit");
+#endif
po::options_description cmdline_options;
cmdline_options.add(opts);
po::store( parse_command_line(argc, argv, cmdline_options), *conf );
po::notify( *conf );
- if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
+ if ( ! conf->count("decoder-config") ) {
cerr << cmdline_options << endl;
return false;
}
+ #ifdef DTRAIN_DEBUG
+ if ( ! conf->count("test") ) {
+ cerr << cmdline_options << endl;
+ return false;
+ }
+ #endif
return true;
}
@@ -40,19 +58,21 @@ main(int argc, char** argv)
SetSilent(true);
po::variables_map conf;
if (!init(argc, argv, &conf)) return 1;
+#ifdef DTRAIN_DEBUG
if ( conf.count("test") ) run_tests();
+#endif
register_feature_functions();
size_t k = conf["kbest"].as<size_t>();
- ReadFile ini_rf(conf["decoder-config"].as<string>());
+ ReadFile ini_rf( conf["decoder-config"].as<string>() );
Decoder decoder(ini_rf.stream());
- KBestGetter observer(k);
- size_t N = 3; // TODO as parameter/in config
+ KBestGetter observer( k );
+ size_t N = conf["ngrams"].as<size_t>();
+ size_t T = conf["epochs"].as<size_t>();
- // TODO scoring metric as parameter/in config
// for approx. bleu
- NgramCounts global_counts(N);
- size_t global_hyp_len = 0;
- size_t global_ref_len = 0;
+ //NgramCounts global_counts( N );
+ //size_t global_hyp_len = 0;
+ //size_t global_ref_len = 0;
Weights weights;
SparseVector<double> lambdas;
@@ -62,20 +82,24 @@ main(int argc, char** argv)
vector<string> strs, ref_strs;
vector<WordID> ref_ids;
string in, psg;
- size_t sid = 0;
- cerr << "(1 dot equals 100 lines of input)" << endl;
+ size_t sn = 0;
+ cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl;
+
+ for ( size_t t = 0; t < T; t++ )
+ {
+
while( getline(cin, in) ) {
- if ( (sid+1) % 100 == 0 ) {
+ if ( (sn+1) % DTRAIN_DOTOUT == 0 ) {
cerr << ".";
- if ( (sid+1)%1000 == 0 ) cerr << endl;
+ if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl;
}
- //if ( sid > 5000 ) break;
+ //if ( sn > 5000 ) break;
// weights
dense_weights.clear();
weights.InitFromVector( lambdas );
weights.InitVector( &dense_weights );
decoder.SetWeights( dense_weights );
- // handling input..
+ // handling input
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
// grammar
@@ -89,11 +113,11 @@ main(int argc, char** argv)
register_and_convert( ref_strs, ref_ids );
// scoring kbest
double score = 0;
- size_t cand_len = 0;
+ //size_t cand_len = 0;
Scores scores;
for ( size_t i = 0; i < kb->sents.size(); i++ ) {
NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
- if ( i == 0) {
+ /*if ( i == 0 ) {
global_counts += counts;
global_hyp_len += kb->sents[i].size();
global_ref_len += ref_ids.size();
@@ -101,24 +125,28 @@ main(int argc, char** argv)
} else {
cand_len = kb->sents[i].size();
}
- //score = bleu( global_counts,
- // global_ref_len,
- // global_hyp_len + cand_len, N );
+ score = bleu( global_counts,
+ global_ref_len,
+ global_hyp_len + cand_len, N );*/
score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N );
ScorePair sp( kb->scores[i], score );
scores.push_back( sp );
- //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+ //cout << "'" << TD::GetString( ref_ids ) << "' vs '";
+ //cout << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
//cout << kb->feats[i] << endl;
}
// learner
SofiaLearner learner;
- learner.Init( sid, kb->feats, scores );
+ learner.Init( sn, kb->feats, scores );
learner.Update(lambdas);
//print_FD();
- sid += 1; // TODO does cdec count this already?
+ sn += 1;
}
+
+ } // outer loop
+
cerr << endl;
- weights.WriteToFile( "data/weights-final-normalx", true );
+ weights.WriteToFile( "data/weights-vanilla", false );
return 0;
}
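
Taken together, the per-sentence flow this file settles on is: decode to a k-best list, score every candidate against the reference with per-sentence BLEU, pair each model score with its metric score, and hand the batch to the learner. A condensed sketch of the loop body (the decode call and the grammar plumbing are not visible in the hunks above; names here follow the surrounding code and are assumed):

    weights.InitFromVector( lambdas );      // push current lambdas into the decoder
    weights.InitVector( &dense_weights );
    decoder.SetWeights( dense_weights );
    decoder.Decode( strs[0], &observer );   // fills the KBestGetter's list
    KBestList* kb = observer.GetKBest();
    Scores scores;
    for ( size_t i = 0; i < kb->sents.size(); i++ ) {
      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
      double score = bleu( counts, ref_ids.size(), kb->sents[i].size(), N );
      scores.push_back( ScorePair( kb->scores[i], score ) );
    }
    SofiaLearner learner;
    learner.Init( sn, kb->feats, scores );  // writes the sofia-ml training file
    learner.Update( lambdas );              // reads the learned weights back
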
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
new file mode 100644
index 00000000..6d93d3b7
--- /dev/null
+++ b/dtrain/kbestget.h
@@ -0,0 +1,61 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+
+namespace dtrain
+{
+
+
+/*
+ * KBestList
+ *
+ */
+struct KBestList {
+ vector<SparseVector<double> > feats;
+ vector<vector<WordID> > sents;
+ vector<double> scores;
+};
+
+
+/*
+ * KBestGetter
+ *
+ */
+struct KBestGetter : public DecoderObserver
+{
+ KBestGetter( const size_t k ) : k_(k) {}
+ const size_t k_;
+ KBestList kb;
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ {
+ GetKBest(smeta.GetSentenceID(), *hg);
+ }
+
+ KBestList* GetKBest() { return &kb; }
+
+ void
+ GetKBest(int sent_id, const Hypergraph& forest)
+ {
+ kb.scores.clear();
+ kb.sents.clear();
+ kb.feats.clear();
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
+ for ( size_t i = 0; i < k_; ++i ) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest( forest.nodes_.size() - 1, i );
+ if (!d) break;
+ kb.sents.push_back( d->yield);
+ kb.feats.push_back( d->feature_values );
+ kb.scores.push_back( d->score );
+ }
+ }
+};
+
+
+} // namespace
+
+
+#endif
+
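
KBestGetter plugs into cdec's observer interface: the decoder calls NotifyTranslationForest once per input, and GetKBest() then lazily extracts the k best derivations from the hypergraph, keeping the yield, the feature vector, and the model score of each. A minimal usage sketch (it assumes cdec's Decoder::Decode(const string&, DecoderObserver*) entry point, as used in dtrain.cc):

    KBestGetter observer( 100 );           // keep the 100 best derivations
    decoder.Decode( input, &observer );    // triggers NotifyTranslationForest
    KBestList* kb = observer.GetKBest();
    for ( size_t i = 0; i < kb->sents.size(); i++ )
      cout << kb->scores[i] << "\t" << TD::GetString( kb->sents[i] ) << endl;
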
diff --git a/dtrain/learner.h b/dtrain/learner.h
index a953284d..038749e2 100644
--- a/dtrain/learner.h
+++ b/dtrain/learner.h
@@ -1,71 +1,96 @@
-/*class Learnerx
+#ifndef _DTRAIN_LEARNER_H_
+#define _DTRAIN_LEARNER_H_
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include "sparse_vector.h"
+#include "score.h"
+
+
+namespace dtrain
+{
+
+
+class Learner
{
public:
- virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {};
- virtual void Update(SparseVector<double>& lambdas);
-};*/
+ virtual void Init( const vector<SparseVector<double> >& kbest, const Scores& scores,
+ const bool invert_score = false ) {};
+ virtual void Update( SparseVector<double>& lambdas ) {};
+};
-class SofiaLearner //: public Learnerx FIXME
+
+class SofiaLearner : public Learner
{
- // TODO bool invert_score
public:
- void
- Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores )
- {
- assert( kbest.size() == scores.size() );
- ofstream o;
- //unlink( "/tmp/sofia_ml_training_stupid" );
- o.open( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists
- int fid = 0;
- map<int,int>::iterator ff;
+ void
+ Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const FIXME*/ Scores& scores,
+ const bool invert_score = false )
+ {
+ assert( kbest.size() == scores.size() );
+ ofstream o;
+ unlink( "/tmp/sofia_ml_training" );
+ o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists
+ int fid = 0;
+ map<int,int>::iterator ff;
- for ( size_t k = 0; k < kbest.size(); ++k ) {
- map<int,double> m;
- SparseVector<double>::const_iterator it = kbest[k].begin();
- o << scores[k].GetScore();
- for ( ; it != kbest[k].end(); ++it) {
- ff = fmap.find( it->first );
- if ( ff == fmap.end() ) {
- fmap.insert( pair<int,int>(it->first, fid) );
- fmap1.insert( pair<int,int>(fid, it->first) );
- fid++;
+ double score;
+ for ( size_t k = 0; k < kbest.size(); ++k ) {
+ map<int,double> m;
+ SparseVector<double>::const_iterator it = kbest[k].begin();
+ score = scores[k].GetScore();
+ if ( invert_score ) score = -score;
+ o << score;
+ for ( ; it != kbest[k].end(); ++it ) {
+ ff = fmap.find( it->first );
+ if ( ff == fmap.end() ) {
+ fmap.insert( pair<int,int>(it->first, fid) );
+ fmap1.insert( pair<int,int>(fid, it->first) );
+ fid++;
+ }
+ m.insert( pair<int,double>(fmap[it->first], it->second) );
}
- m.insert(pair<int,double>(fmap[it->first], it->second));
- }
- map<int,double>::iterator ti = m.begin();
- for ( ; ti != m.end(); ++ti ) {
- o << " " << ti->first << ":" << ti->second;
+ map<int,double>::iterator ti = m.begin();
+ for ( ; ti != m.end(); ++ti ) {
+ o << " " << ti->first << ":" << ti->second;
+ }
+ o << endl;
}
- o << endl;
+ o.close();
}
- o.close();
- }
- void
- Update(SparseVector<double>& lambdas)
- {
- string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality ";
- std::stringstream out;
- out << fmap.size();
- call += out.str();
- call += " &>/dev/null";
- system ( call.c_str() );
- ifstream i;
- //unlink( "/tmp/sofia_ml_model_stupid" );
- i.open( "/tmp/sofia_ml_model_normalx", ios::in );
- string model;
- getline( i, model );
- vector<string> strs;
- boost::split( strs, model, boost::is_any_of(" ") );
- int j = 0;
- for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
- lambdas.set_value(fmap1[j], atof( it->c_str() ) );
- j++;
+ void
+ Update(SparseVector<double>& lambdas)
+ {
+ string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality ";
+ std::stringstream out;
+ out << fmap.size();
+ call += out.str();
+ call += " &>/dev/null";
+ system ( call.c_str() );
+ ifstream i;
+ unlink( "/tmp/sofia_ml_model" );
+ i.open( "/tmp/sofia_ml_model", ios::in );
+ string model;
+ getline( i, model );
+ vector<string> strs;
+ boost::split( strs, model, boost::is_any_of(" ") );
+ int j = 0;
+ for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
+ lambdas.set_value(fmap1[j], atof( it->c_str() ) );
+ j++;
+ }
}
- }
private:
map<int,int> fmap;
map<int,int> fmap1;
};
+
+} // namespace
+
+#endif
+
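
SofiaLearner drives the external sofia-ml binary rather than linking against it: Init() writes one line per k-best entry to /tmp/sofia_ml_training in sofia-ml's SVM-light-style input format (the metric score as label, followed by feature:value pairs, with cdec feature ids remapped to a dense range via fmap), and Update() reads the learned dense weight vector back and scatters it into lambdas through the inverse map fmap1. With hypothetical feature values, a training file might look like:

    0.4375 0:2.718 1:1 2:-0.5
    0.1875 0:1.414 2:0.25

Note the hard-coded call string: --loop_type stochastic and --lambda 100 are fixed, and the dimensionality is fmap.size() at the time of the call.
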
diff --git a/dtrain/score.cc b/dtrain/score.cc
new file mode 100644
index 00000000..72e6db71
--- /dev/null
+++ b/dtrain/score.cc
@@ -0,0 +1,166 @@
+#include "score.h"
+
+
+namespace dtrain
+{
+
+
+/******************************************************************************
+ * NGRAMS
+ *
+ *
+ * make_ngrams
+ *
+ */
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams
+make_ngrams( vector<WordID>& s, size_t N )
+{
+ Ngrams ngrams;
+ vector<WordID> ng;
+ for ( size_t i = 0; i < s.size(); i++ ) {
+ ng.clear();
+ for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
+ ng.push_back( s[j] );
+ ngrams[ng]++;
+ }
+ }
+ return ngrams;
+}
+
+
+/*
+ * ngram_matches
+ *
+ */
+NgramCounts
+make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+{
+ Ngrams hyp_ngrams = make_ngrams( hyp, N );
+ Ngrams ref_ngrams = make_ngrams( ref, N );
+ NgramCounts counts( N );
+ Ngrams::iterator it;
+ Ngrams::iterator ti;
+ for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
+ ti = ref_ngrams.find( it->first );
+ if ( ti != ref_ngrams.end() ) {
+ counts.add( it->second, ti->second, it->first.size() - 1 );
+ } else {
+ counts.add( it->second, 0, it->first.size() - 1 );
+ }
+ }
+ return counts;
+}
+
+
+/******************************************************************************
+ * SCORES
+ *
+ *
+ * brevity_penaly
+ *
+ */
+double
+brevity_penaly( const size_t hyp_len, const size_t ref_len )
+{
+ if ( hyp_len > ref_len ) return 1;
+ return exp( 1 - (double)ref_len/(double)hyp_len );
+}
+
+
+/*
+ * bleu
+ * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
+ * page TODO
+ * 0 if for N one of the counts = 0
+ */
+double
+bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if ( ref_len < N ) N = ref_len;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
+ sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+ }
+ return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * stupid_bleu
+ * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
+ * page TODO
+ * 0 iff no 1gram match
+ */
+double
+stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ if ( ref_len < N ) N = ref_len;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ float add = 0;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( i == 1 ) add = 1;
+ sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+ }
+ return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+}
+
+
+/*
+ * smooth_bleu
+ * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
+ * page TODO
+ * max. 0.9375
+ */
+double
+smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
+{
+ if ( hyp_len == 0 || ref_len == 0 ) return 0;
+ float N_ = (float)N;
+ if ( weights.empty() )
+ {
+ for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+ }
+ double sum = 0;
+ float j = 1;
+ for ( size_t i = 0; i < N; i++ ) {
+ if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
+ sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+ j++;
+ }
+ return brevity_penaly( hyp_len, ref_len ) * sum;
+}
+
+
+/*
+ * approx_bleu
+ * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
+ * page TODO
+ *
+ */
+double
+approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+ const size_t N, vector<float> weights )
+{
+ return bleu( counts, hyp_len, ref_len, N, weights );
+}
+
+
+} // namespace
+
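
A worked example, checked against the expectations hard-coded in tests.cc: for hyp "a b a" vs. ref "a b b" the counts are clipped = 2 / sum = 3 (unigrams), 1 / 2 (bigrams), 0 / 1 (trigrams), and since ref_len = 3 < N both bleu and stupid_bleu cap N at 3. Vanilla BLEU hits the trigram zero and returns 0; stupid_bleu smooths every order above unigrams with +1 and yields (2/3 * 2/3 * 1/2)^(1/3), about .605707. (The "max. 0.9375" noted for smooth_bleu is 1 - 2^-4: with all four n-gram precisions at 1, the terms sum to 1/16 + 1/8 + 1/4 + 1/2.) A sketch:

    vector<string> hyp_strs, ref_strs;
    string hyp_str( "a b a" ), ref_str( "a b b" );
    boost::split( hyp_strs, hyp_str, boost::is_any_of(" ") );
    boost::split( ref_strs, ref_str, boost::is_any_of(" ") );
    vector<WordID> hyp, ref;
    register_and_convert( hyp_strs, hyp );
    register_and_convert( ref_strs, ref );
    NgramCounts counts = make_ngram_counts( hyp, ref, 4 );
    double vanilla = bleu( counts, hyp.size(), ref.size(), 4 );        // 0
    double stupid  = stupid_bleu( counts, hyp.size(), ref.size(), 4 ); // ~ .605707
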
diff --git a/dtrain/score.h b/dtrain/score.h
new file mode 100644
index 00000000..e9130e18
--- /dev/null
+++ b/dtrain/score.h
@@ -0,0 +1,111 @@
+#ifndef _DTRAIN_SCORE_H_
+#define _DTRAIN_SCORE_H_
+
+
+#include <iostream>
+#include <vector>
+#include <map>
+#include <cassert>
+#include <cmath>
+
+#include "wordid.h"
+
+using namespace std;
+
+
+namespace dtrain
+{
+
+
+/*
+ * ScorePair
+ *
+ */
+struct ScorePair
+{
+ ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
+ double modelscore_, score_;
+ double GetModelScore() { return modelscore_; }
+ double GetScore() { return score_; }
+};
+
+typedef vector<ScorePair> Scores;
+
+
+/*
+ * NgramCounts
+ *
+ */
+struct NgramCounts
+{
+ NgramCounts( const size_t N ) : N_( N ) {
+ reset();
+ }
+ size_t N_;
+ map<size_t, size_t> clipped;
+ map<size_t, size_t> sum;
+
+ void
+ operator+=( const NgramCounts& rhs )
+ {
+ assert( N_ == rhs.N_ );
+ for ( size_t i = 0; i < N_; i++ ) {
+ this->clipped[i] += rhs.clipped.find(i)->second;
+ this->sum[i] += rhs.sum.find(i)->second;
+ }
+ }
+
+ void
+ add( size_t count, size_t ref_count, size_t i )
+ {
+ assert( i < N_ );
+ if ( count > ref_count ) {
+ clipped[i] += ref_count;
+ sum[i] += count;
+ } else {
+ clipped[i] += count;
+ sum[i] += count;
+ }
+ }
+
+ void
+ reset()
+ {
+ size_t i;
+ for ( i = 0; i < N_; i++ ) {
+ clipped[i] = 0;
+ sum[i] = 0;
+ }
+ }
+
+ void
+ print()
+ {
+ for ( size_t i = 0; i < N_; i++ ) {
+ cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
+ cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+ }
+ }
+};
+
+
+typedef map<vector<WordID>, size_t> Ngrams;
+Ngrams make_ngrams( vector<WordID>& s, size_t N );
+NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N );
+
+double brevity_penaly( const size_t hyp_len, const size_t ref_len );
+double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+ vector<float> weights = vector<float>() );
+double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N,
+ vector<float> weights = vector<float>() );
+double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+ vector<float> weights = vector<float>() );
+double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+ vector<float> weights = vector<float>() );
+
+
+} // namespace
+
+
+#endif
+
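
NgramCounts::add clips against the reference: for order i it accumulates min(count, ref_count) into clipped[i] but the raw hypothesis count into sum[i], so clipped[i]/sum[i] is the modified n-gram precision BLEU needs. E.g. for hyp "a a a a" vs. ref "a a":

    NgramCounts counts( 2 );
    counts.add( 4, 2, 0 ); // "a":   4x in hyp, 2x in ref -> clipped[0] = 2, sum[0] = 4
    counts.add( 3, 1, 1 ); // "a a": 3x in hyp, 1x in ref -> clipped[1] = 1, sum[1] = 3
    counts.print();
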
diff --git a/dtrain/scripts/run.sh b/dtrain/scripts/run.sh
new file mode 100755
index 00000000..f2b6d600
--- /dev/null
+++ b/dtrain/scripts/run.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+./dtrain -c ./data/cdec.ini -k 200 -n 3 -t 10 < ./data/in.blunsom08 #< data/in.toy
+
diff --git a/dtrain/scripts/test.sh b/dtrain/scripts/test.sh
new file mode 100755
index 00000000..3639dfe7
--- /dev/null
+++ b/dtrain/scripts/test.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+EXP=$1
+#head -5000
+cat ./data/in.blunsom08 | ./dtest -q false -c ./data/cdec.ini -w ./data/weights-$EXP 2> ./output/err.$EXP > ./output/out.$EXP
+
diff --git a/dtrain/test.sh b/dtrain/test.sh
deleted file mode 100755
index bc318ae7..00000000
--- a/dtrain/test.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy
-
diff --git a/dtrain/tests.cc b/dtrain/tests.cc
new file mode 100644
index 00000000..997eafbb
--- /dev/null
+++ b/dtrain/tests.cc
@@ -0,0 +1,141 @@
+#include "tests.h"
+
+
+namespace dtrain
+{
+
+
+/*
+ * approx_equal
+ *
+ */
+double
+approx_equal( double x, double y )
+{
+ const double EPSILON = 1E-5;
+ if ( x == 0 ) return fabs( y ) <= EPSILON;
+ if ( y == 0 ) return fabs( x ) <= EPSILON;
+ return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
+}
+
+
+/*
+ * test_ngrams
+ *
+ */
+void
+test_ngrams()
+{
+ cout << "Testing ngrams..." << endl << endl;
+ size_t N = 5;
+ cout << "N = " << N << endl;
+ vector<int> a; // hyp
+ vector<int> b; // ref
+ cout << "a ";
+ for (size_t i = 1; i <= 8; i++) {
+ cout << i << " ";
+ a.push_back(i);
+ }
+ cout << endl << "b ";
+ for (size_t i = 1; i <= 4; i++) {
+ cout << i << " ";
+ b.push_back(i);
+ }
+ cout << endl << endl;
+ NgramCounts c = make_ngram_counts( a, b, N );
+ assert( c.clipped[N-1] == 0 );
+ assert( c.sum[N-1] == 4 );
+ c.print();
+ c += c;
+ cout << endl;
+ c.print();
+ cout << endl;
+}
+
+
+/*
+ * test_metrics
+ *
+ */
+void
+test_metrics()
+{
+ cout << "Testing metrics..." << endl << endl;
+ using namespace boost::assign;
+ vector<string> a, b;
+ vector<double> expect_vanilla, expect_smooth, expect_stupid;
+ a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
+ b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
+ expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
+ expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
+ expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
+ vector<string> aa, bb;
+ vector<WordID> aai, bbi;
+ double vanilla, smooth, stupid;
+ size_t N = 4;
+ cout << "N = " << N << endl << endl;
+ for ( size_t i = 0; i < a.size(); i++ ) {
+ cout << " hyp: " << a[i] << endl;
+ cout << " ref: " << b[i] << endl;
+ aa.clear(); bb.clear(); aai.clear(); bbi.clear();
+ boost::split( aa, a[i], boost::is_any_of(" ") );
+ boost::split( bb, b[i], boost::is_any_of(" ") );
+ register_and_convert( aa, aai );
+ register_and_convert( bb, bbi );
+ NgramCounts counts = make_ngram_counts( aai, bbi, N );
+ vanilla = bleu( counts, aa.size(), bb.size(), N);
+ smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
+ stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
+ assert( approx_equal(vanilla, expect_vanilla[i]) );
+ assert( approx_equal(smooth, expect_smooth[i]) );
+ assert( approx_equal(stupid, expect_stupid[i]) );
+ cout << setw(14) << "bleu = " << vanilla << endl;
+ cout << setw(14) << "smooth bleu = " << smooth << endl;
+ cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
+ }
+ cout << endl;
+}
+
+
+/*
+ * test_SetWeights
+ *
+ */
+void
+test_SetWeights()
+{
+ cout << "Testing Weights::SetWeight..." << endl << endl;
+ Weights weights;
+ SparseVector<double> lambdas;
+ weights.InitSparseVector( &lambdas );
+ weights.SetWeight( &lambdas, "test", 0 );
+ weights.SetWeight( &lambdas, "test1", 1 );
+ WordID fid = FD::Convert( "test2" );
+ weights.SetWeight( &lambdas, fid, 2 );
+ string fn = "weights-test";
+ cout << "FD::NumFeats() " << FD::NumFeats() << endl;
+ assert( FD::NumFeats() == 4 );
+ weights.WriteToFile( fn, true );
+ cout << endl;
+}
+
+
+/*
+ * run_tests
+ *
+ */
+void
+run_tests()
+{
+ cout << endl;
+ test_ngrams();
+ cout << endl;
+ test_metrics();
+ cout << endl;
+ test_SetWeights();
+ exit(0);
+}
+
+
+} // namespace
+
diff --git a/dtrain/tests.h b/dtrain/tests.h
new file mode 100644
index 00000000..9853e3c3
--- /dev/null
+++ b/dtrain/tests.h
@@ -0,0 +1,26 @@
+#ifndef _DTRAIN_TESTS_H_
+#define _DTRAIN_TESTS_H_
+
+#include <iomanip>
+#include <boost/assign/std/vector.hpp>
+
+#include "common.h"
+#include "util.h"
+
+
+namespace dtrain
+{
+
+
+double approx_equal( double x, double y );
+void test_ngrams();
+void test_metrics();
+void test_SetWeights();
+void run_tests();
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/util.cc b/dtrain/util.cc
new file mode 100644
index 00000000..7b3bbe3d
--- /dev/null
+++ b/dtrain/util.cc
@@ -0,0 +1,34 @@
+#include "util.h"
+
+
+namespace dtrain
+{
+
+
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+ vector<string>::const_iterator it;
+ for ( it = strs.begin(); it < strs.end(); it++ ) {
+ ids.push_back( TD::Convert( *it ) );
+ }
+}
+
+
+/*
+ * print_FD
+ *
+ */
+void
+print_FD()
+{
+ for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
+}
+
+
+} // namespace
+
diff --git a/dtrain/util.h b/dtrain/util.h
new file mode 100644
index 00000000..6a548519
--- /dev/null
+++ b/dtrain/util.h
@@ -0,0 +1,28 @@
+#ifndef _DTRAIN_UTIL_H_
+#define _DTRAIN_UTIL_H_
+
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "fdict.h"
+#include "tdict.h"
+#include "wordid.h"
+
+using namespace std;
+
+
+namespace dtrain
+{
+
+
+void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
+void print_FD();
+
+
+} // namespace
+
+
+#endif
+