From 2e605eb2745e56619b16fdbcb8095e0a6543ab27 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 3 Aug 2011 01:29:52 +0200 Subject: refactoring, cleaning up --- dtrain/Makefile.am | 4 +- dtrain/common.h | 37 ++++++ dtrain/dcommon.cc | 330 ------------------------------------------------- dtrain/dcommon.h | 163 ------------------------ dtrain/dtest.cc | 47 ++++--- dtrain/dtrain.cc | 86 ++++++++----- dtrain/kbestget.h | 61 +++++++++ dtrain/learner.h | 133 ++++++++++++-------- dtrain/score.cc | 166 +++++++++++++++++++++++++ dtrain/score.h | 111 +++++++++++++++++ dtrain/scripts/run.sh | 4 + dtrain/scripts/test.sh | 6 + dtrain/test.sh | 4 - dtrain/tests.cc | 141 +++++++++++++++++++++ dtrain/tests.h | 26 ++++ dtrain/util.cc | 34 +++++ dtrain/util.h | 28 +++++ 17 files changed, 774 insertions(+), 607 deletions(-) create mode 100644 dtrain/common.h delete mode 100644 dtrain/dcommon.cc delete mode 100644 dtrain/dcommon.h create mode 100644 dtrain/kbestget.h create mode 100644 dtrain/score.cc create mode 100644 dtrain/score.h create mode 100755 dtrain/scripts/run.sh create mode 100755 dtrain/scripts/test.sh delete mode 100755 dtrain/test.sh create mode 100644 dtrain/tests.cc create mode 100644 dtrain/tests.h create mode 100644 dtrain/util.cc create mode 100644 dtrain/util.h (limited to 'dtrain') diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index c3f14bb0..03e3ccf7 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,10 +1,10 @@ # TODO I'm sure I can leave something out. bin_PROGRAMS = dtrain dtest -dtrain_SOURCES = dtrain.cc dcommon.cc +dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -dtest_SOURCES = dtest.cc dcommon.cc +dtest_SOURCES = dtest.cc score.cc util.cc dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/common.h b/dtrain/common.h new file mode 100644 index 00000000..cf365d48 --- /dev/null +++ b/dtrain/common.h @@ -0,0 +1,37 @@ +#ifndef _DTRAIN_COMMON_H_ +#define _DTRAIN_COMMON_H_ + + +#include +#include +#include +#include +#include + +#include "sentence_metadata.h" +#include "verbose.h" +#include "viterbi.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "weights.h" + +#include +#include + +#include "score.h" + +#define DTRAIN_DEFAULT_K 100 +#define DTRAIN_DEFAULT_N 4 +#define DTRAIN_DEFAULT_T 1 + +#define DTRAIN_DOTOUT 100 + + +using namespace std; +using namespace dtrain; +namespace po = boost::program_options; + + +#endif + diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc deleted file mode 100644 index 6657bed6..00000000 --- a/dtrain/dcommon.cc +++ /dev/null @@ -1,330 +0,0 @@ -#include "dcommon.h" - - - - -/****************************************************************************** - * NGRAMS - * - * - * make_ngrams - * - */ -typedef map, size_t> Ngrams; -Ngrams -make_ngrams( vector& s, size_t N ) -{ - Ngrams ngrams; - vector ng; - for ( size_t i = 0; i < s.size(); i++ ) { - ng.clear(); - for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { - ng.push_back( s[j] ); - ngrams[ng]++; - } - } - return ngrams; -} - - -/* - * ngram_matches - * - */ -NgramCounts -make_ngram_counts( vector hyp, vector ref, size_t N ) -{ - Ngrams hyp_ngrams = make_ngrams( hyp, N ); - Ngrams ref_ngrams = make_ngrams( ref, N ); - NgramCounts counts( N ); - Ngrams::iterator it; - Ngrams::iterator ti; - for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { - ti = ref_ngrams.find( it->first ); - if ( ti != ref_ngrams.end() ) { - counts.add( it->second, ti->second, it->first.size() - 1 ); - } else { - counts.add( it->second, 0, it->first.size() - 1 ); - } - } - return counts; -} - - - - -/****************************************************************************** - * SCORES - * - * - * brevity_penaly - * - */ -double -brevity_penaly( const size_t hyp_len, const size_t ref_len ) -{ - if ( hyp_len > ref_len ) return 1; - return exp( 1 - (double)ref_len/(double)hyp_len ); -} - - -/* - * bleu - * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) - * page TODO - * 0 if for N one of the counts = 0 - */ -double -bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector weights ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - if ( ref_len < N ) N = ref_len; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; - sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); - } - return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * stupid_bleu - * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) - * page TODO - * 0 iff no 1gram match - */ -double -stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector weights ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - if ( ref_len < N ) N = ref_len; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - float add = 0; - for ( size_t i = 0; i < N; i++ ) { - if ( i == 1 ) add = 1; - sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); - } - return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * smooth_bleu - * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) - * page TODO - * max. 0.9375 - */ -double -smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector weights ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - float j = 1; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); - j++; - } - return brevity_penaly( hyp_len, ref_len ) * sum; -} - - -/* - * approx_bleu - * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) - * page TODO - * - */ -double -approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector weights ) -{ - return bleu( counts, hyp_len, ref_len, N, weights ); -} - - - - -/****************************************************************************** - * UTILS - * - * - * register_and_convert - * - */ -void -register_and_convert(const vector& strs, vector& ids) -{ - vector::const_iterator it; - for ( it = strs.begin(); it < strs.end(); it++ ) { - ids.push_back( TD::Convert( *it ) ); - } -} - - -/* - * approx_equal - * - */ -double -approx_equal( double x, double y ) -{ - const double EPSILON = 1E-5; - if ( x == 0 ) return fabs( y ) <= EPSILON; - if ( y == 0 ) return fabs( x ) <= EPSILON; - return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * print_FD - * - */ -void -print_FD() -{ - for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - - - - -/****************************************************************************** - * TESTS - * - * - * test_ngrams - * - */ -void -test_ngrams() -{ - cout << "Testing ngrams..." << endl << endl; - size_t N = 5; - cout << "N = " << N << endl; - vector a; // hyp - vector b; // ref - cout << "a "; - for (size_t i = 1; i <= 8; i++) { - cout << i << " "; - a.push_back(i); - } - cout << endl << "b "; - for (size_t i = 1; i <= 4; i++) { - cout << i << " "; - b.push_back(i); - } - cout << endl << endl; - NgramCounts c = make_ngram_counts( a, b, N ); - assert( c.clipped[N-1] == 0 ); - assert( c.sum[N-1] == 4 ); - c.print(); - c += c; - cout << endl; - c.print(); - cout << endl; -} - - -/* - * test_metrics - * - */ -void -test_metrics() -{ - cout << "Testing metrics..." << endl << endl; - using namespace boost::assign; - vector a, b; - vector expect_vanilla, expect_smooth, expect_stupid; - a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp - b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref - expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; - expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; - expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; - vector aa, bb; - vector aai, bbi; - double vanilla, smooth, stupid; - size_t N = 4; - cout << "N = " << N << endl << endl; - for ( size_t i = 0; i < a.size(); i++ ) { - cout << " hyp: " << a[i] << endl; - cout << " ref: " << b[i] << endl; - aa.clear(); bb.clear(); aai.clear(); bbi.clear(); - boost::split( aa, a[i], boost::is_any_of(" ") ); - boost::split( bb, b[i], boost::is_any_of(" ") ); - register_and_convert( aa, aai ); - register_and_convert( bb, bbi ); - NgramCounts counts = make_ngram_counts( aai, bbi, N ); - vanilla = bleu( counts, aa.size(), bb.size(), N); - smooth = smooth_bleu( counts, aa.size(), bb.size(), N); - stupid = stupid_bleu( counts, aa.size(), bb.size(), N); - assert( approx_equal(vanilla, expect_vanilla[i]) ); - assert( approx_equal(smooth, expect_smooth[i]) ); - assert( approx_equal(stupid, expect_stupid[i]) ); - cout << setw(14) << "bleu = " << vanilla << endl; - cout << setw(14) << "smooth bleu = " << smooth << endl; - cout << setw(14) << "stupid bleu = " << stupid << endl << endl; - } - cout << endl; -} - - -/* - * test_SetWeights - * - */ -void -test_SetWeights() -{ - cout << "Testing Weights::SetWeight..." << endl << endl; - Weights weights; - SparseVector lambdas; - weights.InitSparseVector( &lambdas ); - weights.SetWeight( &lambdas, "test", 0 ); - weights.SetWeight( &lambdas, "test1", 1 ); - WordID fid = FD::Convert( "test2" ); - weights.SetWeight( &lambdas, fid, 2 ); - string fn = "weights-test"; - cout << "FD::NumFeats() " << FD::NumFeats() << endl; - assert( FD::NumFeats() == 4 ); - weights.WriteToFile( fn, true ); - cout << endl; -} - - -/* - * run_tests - * - */ -void -run_tests() -{ - cout << endl; - test_ngrams(); - cout << endl; - test_metrics(); - cout << endl; - test_SetWeights(); - exit(0); -} - diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h deleted file mode 100644 index 6df841bb..00000000 --- a/dtrain/dcommon.h +++ /dev/null @@ -1,163 +0,0 @@ -#include -#include -#include -#include -#include - -#include "config.h" - -#include -#include -#include -#include - -#include "sentence_metadata.h" -#include "scorer.h" -#include "verbose.h" -#include "viterbi.h" -#include "hg.h" -#include "prob.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -using namespace std; -namespace po = boost::program_options; - - - - -struct ScorePair -{ - ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} - double modelscore_, score_; - double GetModelScore() { return modelscore_; } - double GetScore() { return score_; } -}; -typedef vector Scores; - - -/* - * KBestGetter - * - */ -struct KBestList { - vector > feats; - vector > sents; - vector scores; -}; -struct KBestGetter : public DecoderObserver -{ - KBestGetter( const size_t k ) : k_(k) {} - const size_t k_; - KBestList kb; - - virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) - { - GetKBest(smeta.GetSentenceID(), *hg); - } - - KBestList* GetKBest() { return &kb; } - - void - GetKBest(int sent_id, const Hypergraph& forest) - { - kb.scores.clear(); - kb.sents.clear(); - kb.feats.clear(); - KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ ); - for ( size_t i = 0; i < k_; ++i ) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest( forest.nodes_.size() - 1, i ); - if (!d) break; - kb.sents.push_back( d->yield); - kb.feats.push_back( d->feature_values ); - kb.scores.push_back( d->score ); - } - } -}; - - -/* - * NgramCounts - * - */ -struct NgramCounts -{ - NgramCounts( const size_t N ) : N_( N ) { - reset(); - } - size_t N_; - map clipped; - map sum; - - void - operator+=( const NgramCounts& rhs ) - { - assert( N_ == rhs.N_ ); - for ( size_t i = 0; i < N_; i++ ) { - this->clipped[i] += rhs.clipped.find(i)->second; - this->sum[i] += rhs.sum.find(i)->second; - } - } - - void - add( size_t count, size_t ref_count, size_t i ) - { - assert( i < N_ ); - if ( count > ref_count ) { - clipped[i] += ref_count; - sum[i] += count; - } else { - clipped[i] += count; - sum[i] += count; - } - } - - void - reset() - { - size_t i; - for ( i = 0; i < N_; i++ ) { - clipped[i] = 0; - sum[i] = 0; - } - } - - void - print() - { - for ( size_t i = 0; i < N_; i++ ) { - cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; - cout << i+1 << "grams:\t\t\t" << sum[i] << endl; - } - } -}; - - - - -typedef map, size_t> Ngrams; -Ngrams make_ngrams( vector& s, size_t N ); -NgramCounts make_ngram_counts( vector hyp, vector ref, size_t N ); -double brevity_penaly( const size_t hyp_len, const size_t ref_len ); -double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); -double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); -double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() ); -double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() ); -void register_and_convert(const vector& strs, vector& ids); -void print_FD(); -void run_tests(); -void test_SetWeights(); -#include -#include -void test_metrics(); -double approx_equal( double x, double y ); -void test_ngrams(); - diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc index 5ae473e6..d1ff30c0 100644 --- a/dtrain/dtest.cc +++ b/dtrain/dtest.cc @@ -1,6 +1,6 @@ -#include "dcommon.h" - - +#include "common.h" +#include "kbestget.h" +#include "util.h" /* @@ -14,10 +14,10 @@ init(int argc, char** argv, po::variables_map* conf) bool q; po::options_description opts( "Options" ); opts.add_options() - ( "decoder-config,c", po::value(), "configuration file for cdec" ) - ( "weights,w", po::value(), "weights file") - ( "ngrams,n", po::value(&N)->default_value(4), "N for Ngrams (default 5)" ) - ( "quiet,q", po::value(&q)->default_value(true), "do not output translations" ); + ( "decoder-config,c", po::value(), "configuration file for cdec" ) + ( "weights,w", po::value(), "weights file" ) + ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" ) + ( "quiet,q", po::value(&q)->default_value(true), "do not output translations" ); po::options_description cmdline_options; cmdline_options.add(opts); po::store( parse_command_line(argc, argv, cmdline_options), *conf ); @@ -57,17 +57,17 @@ main(int argc, char** argv) vector strs, ref_strs; vector ref_ids; string in, psg; - size_t sid = 0; - double overall = 0.0; + size_t sn = 0; + double overall = 0.0; double overall1 = 0.0; double overall2 = 0.0; - cerr << "(a dot equals 100 lines of input)" << endl; + cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl; while( getline(cin, in) ) { - if ( (sid+1) % 100 == 0 ) { + if ( (sn+1) % DTRAIN_DOTOUT == 0 ) { cerr << "."; - if ( (sid+1)%1000 == 0 ) cerr << endl; + if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl; } - //if ( sid > 5000 ) break; + //if ( sn > 5000 ) break; strs.clear(); boost::split( strs, in, boost::is_any_of("\t") ); // grammar @@ -80,25 +80,22 @@ main(int argc, char** argv) boost::split( ref_strs, strs[1], boost::is_any_of(" ") ); register_and_convert( ref_strs, ref_ids ); // scoring kbest - double score = 0.0; + double score = 0.0; double score1 = 0.0; double score2 = 0.0; NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 ); - score = smooth_bleu( counts, - ref_ids.size(), - kb->sents[0].size(), N ); - score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ; - score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - //if ( ! quiet ) - cout << TD::GetString( kb->sents[0] ) << endl; + score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); + score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); + score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); + if ( ! quiet ) cout << TD::GetString( kb->sents[0] ) << endl; overall += score; overall1 += score1; overall2 += score2; - sid += 1; + sn += 1; } - cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl; - cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl; - cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl; + cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl; + cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl; + cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl; cerr << endl; return 0; diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 373458e8..16b83a70 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,6 +1,11 @@ -#include "dcommon.h" +#include "common.h" +#include "kbestget.h" #include "learner.h" +#include "util.h" +#ifdef DTRAIN_DEBUG +#include "tests.h" +#endif @@ -12,20 +17,33 @@ bool init(int argc, char** argv, po::variables_map* conf) { po::options_description opts( "Options" ); + size_t k, N, T; + // TODO scoring metric as parameter/in config opts.add_options() - ( "decoder-config,c", po::value(), "configuration file for cdec" ) - ( "kbest,k", po::value(), "k for kbest" ) - ( "ngrams,n", po::value(), "n for Ngrams" ) - ( "filter,f", po::value(), "filter kbest list" ) - ( "test", "run tests and exit"); + ( "decoder-config,c", po::value(), "configuration file for cdec" ) + ( "kbest,k", po::value(&k)->default_value(DTRAIN_DEFAULT_K), "k for kbest" ) + ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "n for Ngrams" ) + ( "filter,f", po::value(), "filter kbest list" ) // FIXME + ( "epochs,t", po::value(&T)->default_value(DTRAIN_DEFAULT_T), "# of iterations T" ) +#ifndef DTRAIN_DEBUG + ; +#else + ( "test", "run tests and exit"); +#endif po::options_description cmdline_options; cmdline_options.add(opts); po::store( parse_command_line(argc, argv, cmdline_options), *conf ); po::notify( *conf ); - if ( ! (conf->count("decoder-config") || conf->count("test")) ) { + if ( ! conf->count("decoder-config") ) { cerr << cmdline_options << endl; return false; } + #ifdef DTRAIN_DEBUG + if ( ! conf->count("test") ) { + cerr << cmdline_options << endl; + return false; + } + #endif return true; } @@ -40,19 +58,21 @@ main(int argc, char** argv) SetSilent(true); po::variables_map conf; if (!init(argc, argv, &conf)) return 1; +#ifdef DTRAIN_DEBUG if ( conf.count("test") ) run_tests(); +#endif register_feature_functions(); size_t k = conf["kbest"].as(); - ReadFile ini_rf(conf["decoder-config"].as()); + ReadFile ini_rf( conf["decoder-config"].as() ); Decoder decoder(ini_rf.stream()); - KBestGetter observer(k); - size_t N = 3; // TODO as parameter/in config + KBestGetter observer( k ); + size_t N = conf["ngrams"].as(); + size_t T = conf["epochs"].as(); - // TODO scoring metric as parameter/in config // for approx. bleu - NgramCounts global_counts(N); - size_t global_hyp_len = 0; - size_t global_ref_len = 0; + //NgramCounts global_counts( N ); + //size_t global_hyp_len = 0; + //size_t global_ref_len = 0; Weights weights; SparseVector lambdas; @@ -62,20 +82,24 @@ main(int argc, char** argv) vector strs, ref_strs; vector ref_ids; string in, psg; - size_t sid = 0; - cerr << "(1 dot equals 100 lines of input)" << endl; + size_t sn = 0; + cerr << "(A dot equals " << DTRAIN_DOTOUT << " lines of input.)" << endl; + + for ( size_t t = 0; t < T; t++ ) + { + while( getline(cin, in) ) { - if ( (sid+1) % 100 == 0 ) { + if ( (sn+1) % DTRAIN_DOTOUT == 0 ) { cerr << "."; - if ( (sid+1)%1000 == 0 ) cerr << endl; + if ( (sn+1) % (20*DTRAIN_DOTOUT) == 0 ) cerr << endl; } - //if ( sid > 5000 ) break; + //if ( sn > 5000 ) break; // weights dense_weights.clear(); weights.InitFromVector( lambdas ); weights.InitVector( &dense_weights ); decoder.SetWeights( dense_weights ); - // handling input.. + // handling input strs.clear(); boost::split( strs, in, boost::is_any_of("\t") ); // grammar @@ -89,11 +113,11 @@ main(int argc, char** argv) register_and_convert( ref_strs, ref_ids ); // scoring kbest double score = 0; - size_t cand_len = 0; + //size_t cand_len = 0; Scores scores; for ( size_t i = 0; i < kb->sents.size(); i++ ) { NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N ); - if ( i == 0) { + /*if ( i == 0 ) { global_counts += counts; global_hyp_len += kb->sents[i].size(); global_ref_len += ref_ids.size(); @@ -101,24 +125,28 @@ main(int argc, char** argv) } else { cand_len = kb->sents[i].size(); } - //score = bleu( global_counts, - // global_ref_len, - // global_hyp_len + cand_len, N ); + score = bleu( global_counts, + global_ref_len, + global_hyp_len + cand_len, N );*/ score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N ); ScorePair sp( kb->scores[i], score ); scores.push_back( sp ); - //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl; + //cout << "'" << TD::GetString( ref_ids ) << "' vs '"; + //cout << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl; //cout << kb->feats[i] << endl; } // learner SofiaLearner learner; - learner.Init( sid, kb->feats, scores ); + learner.Init( sn, kb->feats, scores ); learner.Update(lambdas); //print_FD(); - sid += 1; // TODO does cdec count this already? + sn += 1; } + + } // outer loop + cerr << endl; - weights.WriteToFile( "data/weights-final-normalx", true ); + weights.WriteToFile( "data/weights-vanilla", false ); return 0; } diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h new file mode 100644 index 00000000..6d93d3b7 --- /dev/null +++ b/dtrain/kbestget.h @@ -0,0 +1,61 @@ +#ifndef _DTRAIN_KBESTGET_H_ +#define _DTRAIN_KBESTGET_H_ + + +namespace dtrain +{ + + +/* + * KBestList + * + */ +struct KBestList { + vector > feats; + vector > sents; + vector scores; +}; + + +/* + * KBestGetter + * + */ +struct KBestGetter : public DecoderObserver +{ + KBestGetter( const size_t k ) : k_(k) {} + const size_t k_; + KBestList kb; + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + GetKBest(smeta.GetSentenceID(), *hg); + } + + KBestList* GetKBest() { return &kb; } + + void + GetKBest(int sent_id, const Hypergraph& forest) + { + kb.scores.clear(); + kb.sents.clear(); + kb.feats.clear(); + KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ ); + for ( size_t i = 0; i < k_; ++i ) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest( forest.nodes_.size() - 1, i ); + if (!d) break; + kb.sents.push_back( d->yield); + kb.feats.push_back( d->feature_values ); + kb.scores.push_back( d->score ); + } + } +}; + + +} // namespace + + +#endif + diff --git a/dtrain/learner.h b/dtrain/learner.h index a953284d..038749e2 100644 --- a/dtrain/learner.h +++ b/dtrain/learner.h @@ -1,71 +1,96 @@ -/*class Learnerx +#ifndef _DTRAIN_LEARNER_H_ +#define _DTRAIN_LEARNER_H_ + +#include +#include +#include + +#include "sparse_vector.h" +#include "score.h" + + +namespace dtrain +{ + + +class Learner { public: - virtual void Init(const vector >& kbest, const Scores& scores) {}; - virtual void Update(SparseVector& lambdas); -};*/ + virtual void Init( const vector >& kbest, const Scores& scores, + const bool invert_score = false ) {}; + virtual void Update( SparseVector& lambdas ) {}; +}; -class SofiaLearner //: public Learnerx FIXME + +class SofiaLearner : public Learner { - // TODO bool invert_score public: - void - Init( const size_t sid, const vector >& kbest, /*const*/ Scores& scores ) - { - assert( kbest.size() == scores.size() ); - ofstream o; - //unlink( "/tmp/sofia_ml_training_stupid" ); - o.open( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists - int fid = 0; - map::iterator ff; + void + Init( const size_t sid, const vector >& kbest, /*const FIXME*/ Scores& scores, + const bool invert_score = false ) + { + assert( kbest.size() == scores.size() ); + ofstream o; + unlink( "/tmp/sofia_ml_training" ); + o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists + int fid = 0; + map::iterator ff; - for ( size_t k = 0; k < kbest.size(); ++k ) { - map m; - SparseVector::const_iterator it = kbest[k].begin(); - o << scores[k].GetScore(); - for ( ; it != kbest[k].end(); ++it) { - ff = fmap.find( it->first ); - if ( ff == fmap.end() ) { - fmap.insert( pair(it->first, fid) ); - fmap1.insert( pair(fid, it->first) ); - fid++; + double score; + for ( size_t k = 0; k < kbest.size(); ++k ) { + map m; + SparseVector::const_iterator it = kbest[k].begin(); + score = scores[k].GetScore(); + if ( invert_score ) score = -score; + o << score; + for ( ; it != kbest[k].end(); ++it ) { + ff = fmap.find( it->first ); + if ( ff == fmap.end() ) { + fmap.insert( pair(it->first, fid) ); + fmap1.insert( pair(fid, it->first) ); + fid++; + } + m.insert( pair(fmap[it->first], it->second) ); } - m.insert(pair(fmap[it->first], it->second)); - } - map::iterator ti = m.begin(); - for ( ; ti != m.end(); ++ti ) { - o << " " << ti->first << ":" << ti->second; + map::iterator ti = m.begin(); + for ( ; ti != m.end(); ++ti ) { + o << " " << ti->first << ":" << ti->second; + } + o << endl; } - o << endl; + o.close(); } - o.close(); - } - void - Update(SparseVector& lambdas) - { - string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality "; - std::stringstream out; - out << fmap.size(); - call += out.str(); - call += " &>/dev/null"; - system ( call.c_str() ); - ifstream i; - //unlink( "/tmp/sofia_ml_model_stupid" ); - i.open( "/tmp/sofia_ml_model_normalx", ios::in ); - string model; - getline( i, model ); - vector strs; - boost::split( strs, model, boost::is_any_of(" ") ); - int j = 0; - for ( vector::iterator it = strs.begin(); it != strs.end(); ++it ) { - lambdas.set_value(fmap1[j], atof( it->c_str() ) ); - j++; + void + Update(SparseVector& lambdas) + { + string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality "; + std::stringstream out; + out << fmap.size(); + call += out.str(); + call += " &>/dev/null"; + system ( call.c_str() ); + ifstream i; + unlink( "/tmp/sofia_ml_model" ); + i.open( "/tmp/sofia_ml_model", ios::in ); + string model; + getline( i, model ); + vector strs; + boost::split( strs, model, boost::is_any_of(" ") ); + int j = 0; + for ( vector::iterator it = strs.begin(); it != strs.end(); ++it ) { + lambdas.set_value(fmap1[j], atof( it->c_str() ) ); + j++; + } } - } private: map fmap; map fmap1; }; + +} // namespace + +#endif + diff --git a/dtrain/score.cc b/dtrain/score.cc new file mode 100644 index 00000000..72e6db71 --- /dev/null +++ b/dtrain/score.cc @@ -0,0 +1,166 @@ +#include "score.h" + + +namespace dtrain +{ + + +/****************************************************************************** + * NGRAMS + * + * + * make_ngrams + * + */ +typedef map, size_t> Ngrams; +Ngrams +make_ngrams( vector& s, size_t N ) +{ + Ngrams ngrams; + vector ng; + for ( size_t i = 0; i < s.size(); i++ ) { + ng.clear(); + for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { + ng.push_back( s[j] ); + ngrams[ng]++; + } + } + return ngrams; +} + + +/* + * ngram_matches + * + */ +NgramCounts +make_ngram_counts( vector hyp, vector ref, size_t N ) +{ + Ngrams hyp_ngrams = make_ngrams( hyp, N ); + Ngrams ref_ngrams = make_ngrams( ref, N ); + NgramCounts counts( N ); + Ngrams::iterator it; + Ngrams::iterator ti; + for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { + ti = ref_ngrams.find( it->first ); + if ( ti != ref_ngrams.end() ) { + counts.add( it->second, ti->second, it->first.size() - 1 ); + } else { + counts.add( it->second, 0, it->first.size() - 1 ); + } + } + return counts; +} + + +/****************************************************************************** + * SCORES + * + * + * brevity_penaly + * + */ +double +brevity_penaly( const size_t hyp_len, const size_t ref_len ) +{ + if ( hyp_len > ref_len ) return 1; + return exp( 1 - (double)ref_len/(double)hyp_len ); +} + + +/* + * bleu + * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) + * page TODO + * 0 if for N one of the counts = 0 + */ +double +bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + size_t N, vector weights ) +{ + if ( hyp_len == 0 || ref_len == 0 ) return 0; + if ( ref_len < N ) N = ref_len; + float N_ = (float)N; + if ( weights.empty() ) + { + for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + } + double sum = 0; + for ( size_t i = 0; i < N; i++ ) { + if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; + sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); + } + return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * stupid_bleu + * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) + * page TODO + * 0 iff no 1gram match + */ +double +stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + size_t N, vector weights ) +{ + if ( hyp_len == 0 || ref_len == 0 ) return 0; + if ( ref_len < N ) N = ref_len; + float N_ = (float)N; + if ( weights.empty() ) + { + for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + } + double sum = 0; + float add = 0; + for ( size_t i = 0; i < N; i++ ) { + if ( i == 1 ) add = 1; + sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); + } + return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * smooth_bleu + * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) + * page TODO + * max. 0.9375 + */ +double +smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + const size_t N, vector weights ) +{ + if ( hyp_len == 0 || ref_len == 0 ) return 0; + float N_ = (float)N; + if ( weights.empty() ) + { + for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + } + double sum = 0; + float j = 1; + for ( size_t i = 0; i < N; i++ ) { + if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; + sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); + j++; + } + return brevity_penaly( hyp_len, ref_len ) * sum; +} + + +/* + * approx_bleu + * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) + * page TODO + * + */ +double +approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + const size_t N, vector weights ) +{ + return bleu( counts, hyp_len, ref_len, N, weights ); +} + + +} // namespace + diff --git a/dtrain/score.h b/dtrain/score.h new file mode 100644 index 00000000..e9130e18 --- /dev/null +++ b/dtrain/score.h @@ -0,0 +1,111 @@ +#ifndef _DTRAIN_SCORE_H_ +#define _DTRAIN_SCORE_H_ + + +#include +#include +#include +#include +#include + +#include "wordid.h" + +using namespace std; + + +namespace dtrain +{ + + +/* + * ScorePair + * + */ +struct ScorePair +{ + ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} + double modelscore_, score_; + double GetModelScore() { return modelscore_; } + double GetScore() { return score_; } +}; + +typedef vector Scores; + + +/* + * NgramCounts + * + */ +struct NgramCounts +{ + NgramCounts( const size_t N ) : N_( N ) { + reset(); + } + size_t N_; + map clipped; + map sum; + + void + operator+=( const NgramCounts& rhs ) + { + assert( N_ == rhs.N_ ); + for ( size_t i = 0; i < N_; i++ ) { + this->clipped[i] += rhs.clipped.find(i)->second; + this->sum[i] += rhs.sum.find(i)->second; + } + } + + void + add( size_t count, size_t ref_count, size_t i ) + { + assert( i < N_ ); + if ( count > ref_count ) { + clipped[i] += ref_count; + sum[i] += count; + } else { + clipped[i] += count; + sum[i] += count; + } + } + + void + reset() + { + size_t i; + for ( i = 0; i < N_; i++ ) { + clipped[i] = 0; + sum[i] = 0; + } + } + + void + print() + { + for ( size_t i = 0; i < N_; i++ ) { + cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; + cout << i+1 << "grams:\t\t\t" << sum[i] << endl; + } + } +}; + + +typedef map, size_t> Ngrams; +Ngrams make_ngrams( vector& s, size_t N ); +NgramCounts make_ngram_counts( vector hyp, vector ref, size_t N ); + +double brevity_penaly( const size_t hyp_len, const size_t ref_len ); +double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, + vector weights = vector() ); +double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, + vector weights = vector() ); +double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, + vector weights = vector() ); +double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, + vector weights = vector() ); + + +} // namespace + + +#endif + diff --git a/dtrain/scripts/run.sh b/dtrain/scripts/run.sh new file mode 100755 index 00000000..f2b6d600 --- /dev/null +++ b/dtrain/scripts/run.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +./dtrain -c ./data/cdec.ini -k 200 -n 3 -t 10 < ./data/in.blunsom08 #< data/in.toy + diff --git a/dtrain/scripts/test.sh b/dtrain/scripts/test.sh new file mode 100755 index 00000000..3639dfe7 --- /dev/null +++ b/dtrain/scripts/test.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +EXP=$1 +#head -5000 +cat ./data/in.blunsom08 | ./dtest -q false -c ./data/cdec.ini -w ./data/weights-$EXP 2> ./output/err.$EXP > ./output/out.$EXP + diff --git a/dtrain/test.sh b/dtrain/test.sh deleted file mode 100755 index bc318ae7..00000000 --- a/dtrain/test.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy - diff --git a/dtrain/tests.cc b/dtrain/tests.cc new file mode 100644 index 00000000..997eafbb --- /dev/null +++ b/dtrain/tests.cc @@ -0,0 +1,141 @@ +#include "tests.h" + + +namespace dtrain +{ + + +/* + * approx_equal + * + */ +double +approx_equal( double x, double y ) +{ + const double EPSILON = 1E-5; + if ( x == 0 ) return fabs( y ) <= EPSILON; + if ( y == 0 ) return fabs( x ) <= EPSILON; + return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; +} + + +/* + * test_ngrams + * + */ +void +test_ngrams() +{ + cout << "Testing ngrams..." << endl << endl; + size_t N = 5; + cout << "N = " << N << endl; + vector a; // hyp + vector b; // ref + cout << "a "; + for (size_t i = 1; i <= 8; i++) { + cout << i << " "; + a.push_back(i); + } + cout << endl << "b "; + for (size_t i = 1; i <= 4; i++) { + cout << i << " "; + b.push_back(i); + } + cout << endl << endl; + NgramCounts c = make_ngram_counts( a, b, N ); + assert( c.clipped[N-1] == 0 ); + assert( c.sum[N-1] == 4 ); + c.print(); + c += c; + cout << endl; + c.print(); + cout << endl; +} + + +/* + * test_metrics + * + */ +void +test_metrics() +{ + cout << "Testing metrics..." << endl << endl; + using namespace boost::assign; + vector a, b; + vector expect_vanilla, expect_smooth, expect_stupid; + a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp + b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref + expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; + expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; + expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; + vector aa, bb; + vector aai, bbi; + double vanilla, smooth, stupid; + size_t N = 4; + cout << "N = " << N << endl << endl; + for ( size_t i = 0; i < a.size(); i++ ) { + cout << " hyp: " << a[i] << endl; + cout << " ref: " << b[i] << endl; + aa.clear(); bb.clear(); aai.clear(); bbi.clear(); + boost::split( aa, a[i], boost::is_any_of(" ") ); + boost::split( bb, b[i], boost::is_any_of(" ") ); + register_and_convert( aa, aai ); + register_and_convert( bb, bbi ); + NgramCounts counts = make_ngram_counts( aai, bbi, N ); + vanilla = bleu( counts, aa.size(), bb.size(), N); + smooth = smooth_bleu( counts, aa.size(), bb.size(), N); + stupid = stupid_bleu( counts, aa.size(), bb.size(), N); + assert( approx_equal(vanilla, expect_vanilla[i]) ); + assert( approx_equal(smooth, expect_smooth[i]) ); + assert( approx_equal(stupid, expect_stupid[i]) ); + cout << setw(14) << "bleu = " << vanilla << endl; + cout << setw(14) << "smooth bleu = " << smooth << endl; + cout << setw(14) << "stupid bleu = " << stupid << endl << endl; + } + cout << endl; +} + + +/* + * test_SetWeights + * + */ +void +test_SetWeights() +{ + cout << "Testing Weights::SetWeight..." << endl << endl; + Weights weights; + SparseVector lambdas; + weights.InitSparseVector( &lambdas ); + weights.SetWeight( &lambdas, "test", 0 ); + weights.SetWeight( &lambdas, "test1", 1 ); + WordID fid = FD::Convert( "test2" ); + weights.SetWeight( &lambdas, fid, 2 ); + string fn = "weights-test"; + cout << "FD::NumFeats() " << FD::NumFeats() << endl; + assert( FD::NumFeats() == 4 ); + weights.WriteToFile( fn, true ); + cout << endl; +} + + +/* + * run_tests + * + */ +void +run_tests() +{ + cout << endl; + test_ngrams(); + cout << endl; + test_metrics(); + cout << endl; + test_SetWeights(); + exit(0); +} + + +} // namespace + diff --git a/dtrain/tests.h b/dtrain/tests.h new file mode 100644 index 00000000..9853e3c3 --- /dev/null +++ b/dtrain/tests.h @@ -0,0 +1,26 @@ +#ifndef _DTRAIN_TESTS_H_ +#define _DTRAIN_TESTS_H_ + +#include +#include + +#include "common.h" +#include "util.h" + + +namespace dtrain +{ + + +double approx_equal( double x, double y ); +void test_ngrams(); +void test_metrics(); +void test_SetWeights(); +void run_tests(); + + +} // namespace + + +#endif + diff --git a/dtrain/util.cc b/dtrain/util.cc new file mode 100644 index 00000000..7b3bbe3d --- /dev/null +++ b/dtrain/util.cc @@ -0,0 +1,34 @@ +#include "util.h" + + +namespace dtrain +{ + + +/* + * register_and_convert + * + */ +void +register_and_convert(const vector& strs, vector& ids) +{ + vector::const_iterator it; + for ( it = strs.begin(); it < strs.end(); it++ ) { + ids.push_back( TD::Convert( *it ) ); + } +} + + +/* + * print_FD + * + */ +void +print_FD() +{ + for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; +} + + +} // namespace + diff --git a/dtrain/util.h b/dtrain/util.h new file mode 100644 index 00000000..6a548519 --- /dev/null +++ b/dtrain/util.h @@ -0,0 +1,28 @@ +#ifndef _DTRAIN_UTIL_H_ +#define _DTRAIN_UTIL_H_ + + +#include +#include +#include + +#include "fdict.h" +#include "tdict.h" +#include "wordid.h" + +using namespace std; + + +namespace dtrain +{ + + +void register_and_convert(const vector& strs, vector& ids); +void print_FD(); + + +} // namespace + + +#endif + -- cgit v1.2.3