From 2e605eb2745e56619b16fdbcb8095e0a6543ab27 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Wed, 3 Aug 2011 01:29:52 +0200
Subject: refactoring, cleaning up
---
dtrain/Makefile.am | 4 +-
dtrain/common.h | 37 ++++++
dtrain/dcommon.cc | 330 -------------------------------------------------
dtrain/dcommon.h | 163 ------------------------
dtrain/dtest.cc | 47 ++++---
dtrain/dtrain.cc | 86 ++++++++-----
dtrain/kbestget.h | 61 +++++++++
dtrain/learner.h | 133 ++++++++++++--------
dtrain/score.cc | 166 +++++++++++++++++++++++++
dtrain/score.h | 111 +++++++++++++++++
dtrain/scripts/run.sh | 4 +
dtrain/scripts/test.sh | 6 +
dtrain/test.sh | 4 -
dtrain/tests.cc | 141 +++++++++++++++++++++
dtrain/tests.h | 26 ++++
dtrain/util.cc | 34 +++++
dtrain/util.h | 28 +++++
17 files changed, 774 insertions(+), 607 deletions(-)
create mode 100644 dtrain/common.h
delete mode 100644 dtrain/dcommon.cc
delete mode 100644 dtrain/dcommon.h
create mode 100644 dtrain/kbestget.h
create mode 100644 dtrain/score.cc
create mode 100644 dtrain/score.h
create mode 100755 dtrain/scripts/run.sh
create mode 100755 dtrain/scripts/test.sh
delete mode 100755 dtrain/test.sh
create mode 100644 dtrain/tests.cc
create mode 100644 dtrain/tests.h
create mode 100644 dtrain/util.cc
create mode 100644 dtrain/util.h
(limited to 'dtrain')
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c3f14bb0..03e3ccf7 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,10 +1,10 @@
# TODO I'm sure I can leave something out.
bin_PROGRAMS = dtrain dtest
-dtrain_SOURCES = dtrain.cc dcommon.cc
+dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-dtest_SOURCES = dtest.cc dcommon.cc
+dtest_SOURCES = dtest.cc score.cc util.cc
dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/common.h b/dtrain/common.h
new file mode 100644
index 00000000..cf365d48
--- /dev/null
+++ b/dtrain/common.h
@@ -0,0 +1,37 @@
+#ifndef _DTRAIN_COMMON_H_
+#define _DTRAIN_COMMON_H_
+
+
+#include
+#include
+#include
+#include
+#include
+
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+
+#include
+#include
+
+#include "score.h"
+
+#define DTRAIN_DEFAULT_K 100
+#define DTRAIN_DEFAULT_N 4
+#define DTRAIN_DEFAULT_T 1
+
+#define DTRAIN_DOTOUT 100
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+
+#endif
+
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc
deleted file mode 100644
index 6657bed6..00000000
--- a/dtrain/dcommon.cc
+++ /dev/null
@@ -1,330 +0,0 @@
-#include "dcommon.h"
-
-
-
-
-/******************************************************************************
- * NGRAMS
- *
- *
- * make_ngrams
- *
- */
-typedef map, size_t> Ngrams;
-Ngrams
-make_ngrams( vector& s, size_t N )
-{
- Ngrams ngrams;
- vector ng;
- for ( size_t i = 0; i < s.size(); i++ ) {
- ng.clear();
- for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
- ng.push_back( s[j] );
- ngrams[ng]++;
- }
- }
- return ngrams;
-}
-
-
-/*
- * ngram_matches
- *
- */
-NgramCounts
-make_ngram_counts( vector hyp, vector ref, size_t N )
-{
- Ngrams hyp_ngrams = make_ngrams( hyp, N );
- Ngrams ref_ngrams = make_ngrams( ref, N );
- NgramCounts counts( N );
- Ngrams::iterator it;
- Ngrams::iterator ti;
- for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
- ti = ref_ngrams.find( it->first );
- if ( ti != ref_ngrams.end() ) {
- counts.add( it->second, ti->second, it->first.size() - 1 );
- } else {
- counts.add( it->second, 0, it->first.size() - 1 );
- }
- }
- return counts;
-}
-
-
-
-
-/******************************************************************************
- * SCORES
- *
- *
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
- if ( hyp_len > ref_len ) return 1;
- return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector weights )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
- sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
- }
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- size_t N, vector weights )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- if ( ref_len < N ) N = ref_len;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- float add = 0;
- for ( size_t i = 0; i < N; i++ ) {
- if ( i == 1 ) add = 1;
- sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
- }
- return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector weights )
-{
- if ( hyp_len == 0 || ref_len == 0 ) return 0;
- float N_ = (float)N;
- if ( weights.empty() )
- {
- for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
- }
- double sum = 0;
- float j = 1;
- for ( size_t i = 0; i < N; i++ ) {
- if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
- j++;
- }
- return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
- const size_t N, vector weights )
-{
- return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-
-
-/******************************************************************************
- * UTILS
- *
- *
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector& strs, vector& ids)
-{
- vector::const_iterator it;
- for ( it = strs.begin(); it < strs.end(); it++ ) {
- ids.push_back( TD::Convert( *it ) );
- }
-}
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * print_FD
- *
- */
-void
-print_FD()
-{
- for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-
-
-/******************************************************************************
- * TESTS
- *
- *
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
- cout << "Testing ngrams..." << endl << endl;
- size_t N = 5;
- cout << "N = " << N << endl;
- vector a; // hyp
- vector b; // ref
- cout << "a ";
- for (size_t i = 1; i <= 8; i++) {
- cout << i << " ";
- a.push_back(i);
- }
- cout << endl << "b ";
- for (size_t i = 1; i <= 4; i++) {
- cout << i << " ";
- b.push_back(i);
- }
- cout << endl << endl;
- NgramCounts c = make_ngram_counts( a, b, N );
- assert( c.clipped[N-1] == 0 );
- assert( c.sum[N-1] == 4 );
- c.print();
- c += c;
- cout << endl;
- c.print();
- cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
- cout << "Testing metrics..." << endl << endl;
- using namespace boost::assign;
- vector a, b;
- vector expect_vanilla, expect_smooth, expect_stupid;
- a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
- b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
- expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
- expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
- expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
- vector aa, bb;
- vector aai, bbi;
- double vanilla, smooth, stupid;
- size_t N = 4;
- cout << "N = " << N << endl << endl;
- for ( size_t i = 0; i < a.size(); i++ ) {
- cout << " hyp: " << a[i] << endl;
- cout << " ref: " << b[i] << endl;
- aa.clear(); bb.clear(); aai.clear(); bbi.clear();
- boost::split( aa, a[i], boost::is_any_of(" ") );
- boost::split( bb, b[i], boost::is_any_of(" ") );
- register_and_convert( aa, aai );
- register_and_convert( bb, bbi );
- NgramCounts counts = make_ngram_counts( aai, bbi, N );
- vanilla = bleu( counts, aa.size(), bb.size(), N);
- smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
- stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
- assert( approx_equal(vanilla, expect_vanilla[i]) );
- assert( approx_equal(smooth, expect_smooth[i]) );
- assert( approx_equal(stupid, expect_stupid[i]) );
- cout << setw(14) << "bleu = " << vanilla << endl;
- cout << setw(14) << "smooth bleu = " << smooth << endl;
- cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
- }
- cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
- cout << "Testing Weights::SetWeight..." << endl << endl;
- Weights weights;
- SparseVector lambdas;
- weights.InitSparseVector( &lambdas );
- weights.SetWeight( &lambdas, "test", 0 );
- weights.SetWeight( &lambdas, "test1", 1 );
- WordID fid = FD::Convert( "test2" );
- weights.SetWeight( &lambdas, fid, 2 );
- string fn = "weights-test";
- cout << "FD::NumFeats() " << FD::NumFeats() << endl;
- assert( FD::NumFeats() == 4 );
- weights.WriteToFile( fn, true );
- cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
- cout << endl;
- test_ngrams();
- cout << endl;
- test_metrics();
- cout << endl;
- test_SetWeights();
- exit(0);
-}
-
diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h
deleted file mode 100644
index 6df841bb..00000000
--- a/dtrain/dcommon.h
+++ /dev/null
@@ -1,163 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-
-#include "config.h"
-
-#include
-#include
-#include
-#include
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-
-
-
-struct ScorePair
-{
- ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}
- double modelscore_, score_;
- double GetModelScore() { return modelscore_; }
- double GetScore() { return score_; }
-};
-typedef vector Scores;
-
-
-/*
- * KBestGetter
- *
- */
-struct KBestList {
- vector > feats;
- vector > sents;
- vector scores;
-};
-struct KBestGetter : public DecoderObserver
-{
- KBestGetter( const size_t k ) : k_(k) {}
- const size_t k_;
- KBestList kb;
-
- virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
- {
- GetKBest(smeta.GetSentenceID(), *hg);
- }
-
- KBestList* GetKBest() { return &kb; }
-
- void
- GetKBest(int sent_id, const Hypergraph& forest)
- {
- kb.scores.clear();
- kb.sents.clear();
- kb.feats.clear();
- KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ );
- for ( size_t i = 0; i < k_; ++i ) {
- const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest( forest.nodes_.size() - 1, i );
- if (!d) break;
- kb.sents.push_back( d->yield);
- kb.feats.push_back( d->feature_values );
- kb.scores.push_back( d->score );
- }
- }
-};
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
- NgramCounts( const size_t N ) : N_( N ) {
- reset();
- }
- size_t N_;
- map clipped;
- map sum;
-
- void
- operator+=( const NgramCounts& rhs )
- {
- assert( N_ == rhs.N_ );
- for ( size_t i = 0; i < N_; i++ ) {
- this->clipped[i] += rhs.clipped.find(i)->second;
- this->sum[i] += rhs.sum.find(i)->second;
- }
- }
-
- void
- add( size_t count, size_t ref_count, size_t i )
- {
- assert( i < N_ );
- if ( count > ref_count ) {
- clipped[i] += ref_count;
- sum[i] += count;
- } else {
- clipped[i] += count;
- sum[i] += count;
- }
- }
-
- void
- reset()
- {
- size_t i;
- for ( i = 0; i < N_; i++ ) {
- clipped[i] = 0;
- sum[i] = 0;
- }
- }
-
- void
- print()
- {
- for ( size_t i = 0; i < N_; i++ ) {
- cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
- cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
- }
- }
-};
-
-
-
-
-typedef map, size_t> Ngrams;
-Ngrams make_ngrams( vector& s, size_t N );
-NgramCounts make_ngram_counts( vector hyp, vector ref, size_t N );
-double brevity_penaly( const size_t hyp_len, const size_t ref_len );
-double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() );
-double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() );
-double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() );
-double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() );
-void register_and_convert(const vector& strs, vector& ids);
-void print_FD();
-void run_tests();
-void test_SetWeights();
-#include
-#include
-void test_metrics();
-double approx_equal( double x, double y );
-void test_ngrams();
-
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
index 5ae473e6..d1ff30c0 100644
--- a/dtrain/dtest.cc
+++ b/dtrain/dtest.cc
@@ -1,6 +1,6 @@
-#include "dcommon.h"
-
-
+#include "common.h"
+#include "kbestget.h"
+#include "util.h"
/*
@@ -14,10 +14,10 @@ init(int argc, char** argv, po::variables_map* conf)
bool q;
po::options_description opts( "Options" );
opts.add_options()
- ( "decoder-config,c", po::value(), "configuration file for cdec" )
- ( "weights,w", po::value(), "weights file")
- ( "ngrams,n", po::value(&N)->default_value(4), "N for Ngrams (default 5)" )
- ( "quiet,q", po::value(&q)->default_value(true), "do not output translations" );
+ ( "decoder-config,c", po::value(), "configuration file for cdec" )
+ ( "weights,w", po::value(), "weights file" )
+ ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" )
+ ( "quiet,q", po::value(&q)->default_value(true), "do not output translations" );
po::options_description cmdline_options;
cmdline_options.add(opts);
po::store( parse_command_line(argc, argv, cmdline_options), *conf );
@@ -57,17 +57,17 @@ main(int argc, char** argv)
vector strs, ref_strs;
vector ref_ids;
string in, psg;
- size_t sid = 0;
- double overall = 0.0;
+ size_t sn = 0;
+ double overall = 0.0;
double overall1 = 0.0;
double overall2 = 0.0;
- cerr << "(a dot equals 100 lines of input)" << endl;
+ cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl;
while( getline(cin, in) ) {
- if ( (sid+1) % 100 == 0 ) {
+ if ( (sn+1) % DTRAIN_DOTOUT == 0 ) {
cerr << ".";
- if ( (sid+1)%1000 == 0 ) cerr << endl;
+ if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl;
}
- //if ( sid > 5000 ) break;
+ //if ( sn > 5000 ) break;
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
// grammar
@@ -80,25 +80,22 @@ main(int argc, char** argv)
boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
register_and_convert( ref_strs, ref_ids );
// scoring kbest
- double score = 0.0;
+ double score = 0.0;
double score1 = 0.0;
double score2 = 0.0;
NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 );
- score = smooth_bleu( counts,
- ref_ids.size(),
- kb->sents[0].size(), N );
- score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ;
- score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- //if ( ! quiet )
- cout << TD::GetString( kb->sents[0] ) << endl;
+ score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
+ if ( ! quiet ) cout << TD::GetString( kb->sents[0] ) << endl;
overall += score;
overall1 += score1;
overall2 += score2;
- sid += 1;
+ sn += 1;
}
- cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl;
- cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl;
- cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl;
+ cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
+ cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
+ cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
cerr << endl;
return 0;
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 373458e8..16b83a70 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,6 +1,11 @@
-#include "dcommon.h"
+#include "common.h"
+#include "kbestget.h"
#include "learner.h"
+#include "util.h"
+#ifdef DTRAIN_DEBUG
+#include "tests.h"
+#endif
@@ -12,20 +17,33 @@ bool
init(int argc, char** argv, po::variables_map* conf)
{
po::options_description opts( "Options" );
+ size_t k, N, T;
+ // TODO scoring metric as parameter/in config
opts.add_options()
- ( "decoder-config,c", po::value(), "configuration file for cdec" )
- ( "kbest,k", po::value(), "k for kbest" )
- ( "ngrams,n", po::value(), "n for Ngrams" )
- ( "filter,f", po::value(), "filter kbest list" )
- ( "test", "run tests and exit");
+ ( "decoder-config,c", po::value(), "configuration file for cdec" )
+ ( "kbest,k", po::value(&k)->default_value(DTRAIN_DEFAULT_K), "k for kbest" )
+ ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "n for Ngrams" )
+ ( "filter,f", po::value(), "filter kbest list" ) // FIXME
+ ( "epochs,t", po::value(&T)->default_value(DTRAIN_DEFAULT_T), "# of iterations T" )
+#ifndef DTRAIN_DEBUG
+ ;
+#else
+ ( "test", "run tests and exit");
+#endif
po::options_description cmdline_options;
cmdline_options.add(opts);
po::store( parse_command_line(argc, argv, cmdline_options), *conf );
po::notify( *conf );
- if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
+ if ( ! conf->count("decoder-config") ) {
cerr << cmdline_options << endl;
return false;
}
+ #ifdef DTRAIN_DEBUG
+ if ( ! conf->count("test") ) {
+ cerr << cmdline_options << endl;
+ return false;
+ }
+ #endif
return true;
}
@@ -40,19 +58,21 @@ main(int argc, char** argv)
SetSilent(true);
po::variables_map conf;
if (!init(argc, argv, &conf)) return 1;
+#ifdef DTRAIN_DEBUG
if ( conf.count("test") ) run_tests();
+#endif
register_feature_functions();
size_t k = conf["kbest"].as();
- ReadFile ini_rf(conf["decoder-config"].as());
+ ReadFile ini_rf( conf["decoder-config"].as() );
Decoder decoder(ini_rf.stream());
- KBestGetter observer(k);
- size_t N = 3; // TODO as parameter/in config
+ KBestGetter observer( k );
+ size_t N = conf["ngrams"].as();
+ size_t T = conf["epochs"].as();
- // TODO scoring metric as parameter/in config
// for approx. bleu
- NgramCounts global_counts(N);
- size_t global_hyp_len = 0;
- size_t global_ref_len = 0;
+ //NgramCounts global_counts( N );
+ //size_t global_hyp_len = 0;
+ //size_t global_ref_len = 0;
Weights weights;
SparseVector lambdas;
@@ -62,20 +82,24 @@ main(int argc, char** argv)
vector strs, ref_strs;
vector ref_ids;
string in, psg;
- size_t sid = 0;
- cerr << "(1 dot equals 100 lines of input)" << endl;
+ size_t sn = 0;
+ cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl;
+
+ for ( size_t t = 0; t < T; t++ )
+ {
+
while( getline(cin, in) ) {
- if ( (sid+1) % 100 == 0 ) {
+ if ( (sn+1) % DTRAIN_DOTOUT == 0 ) {
cerr << ".";
- if ( (sid+1)%1000 == 0 ) cerr << endl;
+ if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl;
}
- //if ( sid > 5000 ) break;
+ //if ( sn > 5000 ) break;
// weights
dense_weights.clear();
weights.InitFromVector( lambdas );
weights.InitVector( &dense_weights );
decoder.SetWeights( dense_weights );
- // handling input..
+ // handling input
strs.clear();
boost::split( strs, in, boost::is_any_of("\t") );
// grammar
@@ -89,11 +113,11 @@ main(int argc, char** argv)
register_and_convert( ref_strs, ref_ids );
// scoring kbest
double score = 0;
- size_t cand_len = 0;
+ //size_t cand_len = 0;
Scores scores;
for ( size_t i = 0; i < kb->sents.size(); i++ ) {
NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
- if ( i == 0) {
+ /*if ( i == 0 ) {
global_counts += counts;
global_hyp_len += kb->sents[i].size();
global_ref_len += ref_ids.size();
@@ -101,24 +125,28 @@ main(int argc, char** argv)
} else {
cand_len = kb->sents[i].size();
}
- //score = bleu( global_counts,
- // global_ref_len,
- // global_hyp_len + cand_len, N );
+ score = bleu( global_counts,
+ global_ref_len,
+ global_hyp_len + cand_len, N );*/
score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N );
ScorePair sp( kb->scores[i], score );
scores.push_back( sp );
- //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+ //cout << "'" << TD::GetString( ref_ids ) << "' vs '";
+ //cout << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
//cout << kb->feats[i] << endl;
}
// learner
SofiaLearner learner;
- learner.Init( sid, kb->feats, scores );
+ learner.Init( sn, kb->feats, scores );
learner.Update(lambdas);
//print_FD();
- sid += 1; // TODO does cdec count this already?
+ sn += 1;
}
+
+ } // outer loop
+
cerr << endl;
- weights.WriteToFile( "data/weights-final-normalx", true );
+ weights.WriteToFile( "data/weights-vanilla", false );
return 0;
}
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
new file mode 100644
index 00000000..6d93d3b7
--- /dev/null
+++ b/dtrain/kbestget.h
@@ -0,0 +1,61 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+
+namespace dtrain
+{
+
+
+/*
+ * KBestList
+ * Three parallel vectors; KBestGetter::GetKBest appends one entry to each per extracted derivation.
+ */
+struct KBestList {
+ vector > feats; // one entry per derivation, taken from Derivation::feature_values
+ vector > sents; // one entry per derivation, taken from Derivation::yield
+ vector scores; // one entry per derivation, taken from Derivation::score
+};
+
+
+/*
+ * KBestGetter
+ * DecoderObserver that refills kb with up to k_ derivations whenever NotifyTranslationForest is invoked.
+ */
+struct KBestGetter : public DecoderObserver
+{
+ KBestGetter( const size_t k ) : k_(k) {}
+ const size_t k_; // upper bound on the number of derivations extracted per forest
+ KBestList kb; // cleared and refilled by every GetKBest(sent_id, forest) call
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) // forwards the forest to GetKBest
+ {
+ GetKBest(smeta.GetSentenceID(), *hg);
+ }
+
+ KBestList* GetKBest() { return &kb; } // points at the member kb, so contents change on the next extraction
+
+ void
+ GetKBest(int sent_id, const Hypergraph& forest) // NOTE: sent_id is not used anywhere in the body
+ {
+ kb.scores.clear(); // drop the previous sentence's results before collecting new ones
+ kb.sents.clear();
+ kb.feats.clear();
+ KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ );
+ for ( size_t i = 0; i < k_; ++i ) {
+ const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest( forest.nodes_.size() - 1, i ); // queried at the last node — presumably the goal node; confirm
+ if (!d) break; // null result ends extraction early, so kb may hold fewer than k_ entries
+ kb.sents.push_back( d->yield);
+ kb.feats.push_back( d->feature_values );
+ kb.scores.push_back( d->score );
+ }
+ }
+};
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/learner.h b/dtrain/learner.h
index a953284d..038749e2 100644
--- a/dtrain/learner.h
+++ b/dtrain/learner.h
@@ -1,71 +1,96 @@
-/*class Learnerx
+#ifndef _DTRAIN_LEARNER_H_
+#define _DTRAIN_LEARNER_H_
+
+#include
+#include
+#include