From d980ecbbcd35fba23313aa715046bc0f87a23afd Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 29 Jul 2011 00:48:04 +0200 Subject: first cut for sofia-ml, little change in utils/dict.h, coarse refactoring --- .gitignore | 3 - dtrain/Makefile.am | 11 +- dtrain/cdec.ini | 4 - dtrain/dcommon.cc | 311 ++++++++++++++++++++++++++++ dtrain/dcommon.h | 230 +++++++++++++++++++++ dtrain/dtest.cc | 95 +++++++++ dtrain/dtrain.cc | 595 ++++++----------------------------------------------- dtrain/dtrain.ini | 0 dtrain/in | 2 - dtrain/in.toy | 2 - dtrain/test.sh | 2 +- utils/dict.h | 5 +- 12 files changed, 707 insertions(+), 553 deletions(-) delete mode 100644 dtrain/cdec.ini create mode 100644 dtrain/dcommon.cc create mode 100644 dtrain/dcommon.h create mode 100644 dtrain/dtest.cc delete mode 100644 dtrain/dtrain.ini delete mode 100644 dtrain/in delete mode 100644 dtrain/in.toy diff --git a/.gitignore b/.gitignore index a983d493..95262a09 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,3 @@ training/mpi_em_optimize training/test_ngram utils/ts training/compute_cllh -dtrain/dtrain -dtrain/weights* - diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index daa20cf3..c3f14bb0 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,6 +1,11 @@ -bin_PROGRAMS = dtrain +# TODO I'm sure I can leave something out. +bin_PROGRAMS = dtrain dtest -dtrain_SOURCES = dtrain.cc -dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +dtrain_SOURCES = dtrain.cc dcommon.cc +dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz + +dtest_SOURCES = dtest.cc dcommon.cc +dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval + diff --git a/dtrain/cdec.ini b/dtrain/cdec.ini deleted file mode 100644 index 92a4a335..00000000 --- a/dtrain/cdec.ini +++ /dev/null @@ -1,4 +0,0 @@ -formalism=scfg -#feature_function=KLanguageModel europarl-v6.tok.lc.s-tag.en.arpa.kenlm.v4.mma -#k_best=2 -#add_pass_through_rules=true diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc new file mode 100644 index 00000000..a6bdc92c --- /dev/null +++ b/dtrain/dcommon.cc @@ -0,0 +1,311 @@ +#include "dcommon.h" + + + +/* + * make_ngrams + * + */ +typedef map, size_t> Ngrams; +Ngrams +make_ngrams( vector& s, size_t N ) +{ + Ngrams ngrams; + vector ng; + for ( size_t i = 0; i < s.size(); i++ ) { + ng.clear(); + for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { + ng.push_back( s[j] ); + ngrams[ng]++; + } + } + return ngrams; +} + + + + + +/* + * ngram_matches + * + */ +NgramCounts +make_ngram_counts( vector hyp, vector ref, size_t N ) +{ + Ngrams hyp_ngrams = make_ngrams( hyp, N ); + Ngrams ref_ngrams = make_ngrams( ref, N ); + NgramCounts counts( N ); + Ngrams::iterator it; + Ngrams::iterator ti; + for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { + ti = ref_ngrams.find( it->first ); + if ( ti != ref_ngrams.end() ) { + counts.add( it->second, ti->second, it->first.size() - 1 ); + } else { + counts.add( it->second, 0, it->first.size() - 1 ); + } + } + return counts; +} + + +/* + * brevity_penaly + * + */ +double +brevity_penaly( const size_t hyp_len, const size_t ref_len ) +{ + if ( hyp_len > ref_len ) return 1; + return exp( 1 - (double)ref_len/(double)hyp_len ); +} + + +/* + * bleu + * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) + * page TODO + * 0 if for N one of the counts = 0 + */ +double +bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + size_t N, vector weights ) +{ + if ( hyp_len == 0 || ref_len == 0 ) return 0; + if ( ref_len < N ) N = ref_len; + float N_ = (float)N; + if ( weights.empty() ) + { + for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + } + double sum = 0; + for ( size_t i = 0; i < N; i++ ) { + if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; + sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); + } + return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * stupid_bleu + * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) + * page TODO + * 0 iff no 1gram match + */ +double +stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + size_t N, vector weights ) +{ + if ( hyp_len == 0 || ref_len == 0 ) return 0; + if ( ref_len < N ) N = ref_len; + float N_ = (float)N; + if ( weights.empty() ) + { + for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + } + double sum = 0; + float add = 0; + for ( size_t i = 0; i < N; i++ ) { + if ( i == 1 ) add = 1; + sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); + } + return brevity_penaly( hyp_len, ref_len ) * exp( sum ); +} + + +/* + * smooth_bleu + * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) + * page TODO + * max. 0.9375 + */ +double +smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + const size_t N, vector weights ) +{ + if ( hyp_len == 0 || ref_len == 0 ) return 0; + float N_ = (float)N; + if ( weights.empty() ) + { + for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + } + double sum = 0; + float j = 1; + for ( size_t i = 0; i < N; i++ ) { + if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; + sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); + j++; + } + return brevity_penaly( hyp_len, ref_len ) * sum; +} + + +/* + * approx_bleu + * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) + * page TODO + * + */ +double +approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + const size_t N, vector weights ) +{ + return bleu( counts, hyp_len, ref_len, N, weights ); +} + + +/* + * register_and_convert + * + */ +void +register_and_convert(const vector& strs, vector& ids) +{ + vector::const_iterator it; + for ( it = strs.begin(); it < strs.end(); it++ ) { + ids.push_back( TD::Convert( *it ) ); + } +} + + + + +/* + * + * + */ +void +test_ngrams() +{ + cout << "Testing ngrams..." << endl << endl; + size_t N = 5; + cout << "N = " << N << endl; + vector a; // hyp + vector b; // ref + cout << "a "; + for (size_t i = 1; i <= 8; i++) { + cout << i << " "; + a.push_back(i); + } + cout << endl << "b "; + for (size_t i = 1; i <= 4; i++) { + cout << i << " "; + b.push_back(i); + } + cout << endl << endl; + NgramCounts c = make_ngram_counts( a, b, N ); + assert( c.clipped[N-1] == 0 ); + assert( c.sum[N-1] == 4 ); + c.print(); + c += c; + cout << endl; + c.print(); + cout << endl; +} + + +/* + * + * + */ +double +approx_equal( double x, double y ) +{ + const double EPSILON = 1E-5; + if ( x == 0 ) return fabs( y ) <= EPSILON; + if ( y == 0 ) return fabs( x ) <= EPSILON; + return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; +} + + +/* + * + * + */ +void +test_metrics() +{ + cout << "Testing metrics..." << endl << endl; + using namespace boost::assign; + vector a, b; + vector expect_vanilla, expect_smooth, expect_stupid; + a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp + b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref + expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; + expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; + expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; + vector aa, bb; + vector aai, bbi; + double vanilla, smooth, stupid; + size_t N = 4; + cout << "N = " << N << endl << endl; + for ( size_t i = 0; i < a.size(); i++ ) { + cout << " hyp: " << a[i] << endl; + cout << " ref: " << b[i] << endl; + aa.clear(); bb.clear(); aai.clear(); bbi.clear(); + boost::split( aa, a[i], boost::is_any_of(" ") ); + boost::split( bb, b[i], boost::is_any_of(" ") ); + register_and_convert( aa, aai ); + register_and_convert( bb, bbi ); + NgramCounts counts = make_ngram_counts( aai, bbi, N ); + vanilla = bleu( counts, aa.size(), bb.size(), N); + smooth = smooth_bleu( counts, aa.size(), bb.size(), N); + stupid = stupid_bleu( counts, aa.size(), bb.size(), N); + assert( approx_equal(vanilla, expect_vanilla[i]) ); + assert( approx_equal(smooth, expect_smooth[i]) ); + assert( approx_equal(stupid, expect_stupid[i]) ); + cout << setw(14) << "bleu = " << vanilla << endl; + cout << setw(14) << "smooth bleu = " << smooth << endl; + cout << setw(14) << "stupid bleu = " << stupid << endl << endl; + } + cout << endl; +} + +/* + * + * + */ +void +test_SetWeights() +{ + cout << "Testing Weights::SetWeight..." << endl << endl; + Weights weights; + SparseVector lambdas; + weights.InitSparseVector( &lambdas ); + weights.SetWeight( &lambdas, "test", 0 ); + weights.SetWeight( &lambdas, "test1", 1 ); + WordID fid = FD::Convert( "test2" ); + weights.SetWeight( &lambdas, fid, 2 ); + string fn = "weights-test"; + cout << "FD::NumFeats() " << FD::NumFeats() << endl; + assert( FD::NumFeats() == 4 ); + weights.WriteToFile( fn, true ); + cout << endl; +} + + +/* + * + * + */ +void +run_tests() +{ + cout << endl; + test_ngrams(); + cout << endl; + test_metrics(); + cout << endl; + test_SetWeights(); + exit(0); +} + + +void +print_FD() +{ + for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; +} + diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h new file mode 100644 index 00000000..ff796642 --- /dev/null +++ b/dtrain/dcommon.h @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include + +#include "config.h" + +#include +#include +#include +#include + +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +namespace po = boost::program_options; + + +struct ScorePair +{ + ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} + double modelscore_, score_; + double GetModelScore() { return modelscore_; } + double GetScore() { return score_; } +}; +typedef vector Scores; + + +/* + * KBestGetter + * + */ +struct KBestList { + vector > feats; + vector > sents; + vector scores; +}; +struct KBestGetter : public DecoderObserver +{ + KBestGetter( const size_t k ) : k_(k) {} + const size_t k_; + KBestList kb; + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + GetKBest(smeta.GetSentenceID(), *hg); + } + + KBestList* GetKBest() { return &kb; } + + void + GetKBest(int sent_id, const Hypergraph& forest) + { + kb.scores.clear(); + kb.sents.clear(); + kb.feats.clear(); + KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ ); + for ( size_t i = 0; i < k_; ++i ) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest( forest.nodes_.size() - 1, i ); + if (!d) break; + kb.sents.push_back( d->yield); + kb.feats.push_back( d->feature_values ); + kb.scores.push_back( d->score ); + } + } +}; + + +/* + * NgramCounts + * + */ +struct NgramCounts +{ + NgramCounts( const size_t N ) : N_( N ) { + reset(); + } + size_t N_; + map clipped; + map sum; + + void + operator+=( const NgramCounts& rhs ) + { + assert( N_ == rhs.N_ ); + for ( size_t i = 0; i < N_; i++ ) { + this->clipped[i] += rhs.clipped.find(i)->second; + this->sum[i] += rhs.sum.find(i)->second; + } + } + + void + add( size_t count, size_t ref_count, size_t i ) + { + assert( i < N_ ); + if ( count > ref_count ) { + clipped[i] += ref_count; + sum[i] += count; + } else { + clipped[i] += count; + sum[i] += count; + } + } + + void + reset() + { + size_t i; + for ( i = 0; i < N_; i++ ) { + clipped[i] = 0; + sum[i] = 0; + } + } + + void + print() + { + for ( size_t i = 0; i < N_; i++ ) { + cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; + cout << i+1 << "grams:\t\t\t" << sum[i] << endl; + } + } +}; + + +/*class Learnerx +{ + public: + virtual void Init(const vector >& kbest, const Scores& scores) {}; + virtual void Update(SparseVector& lambdas); +};*/ + +class SofiaLearner //: public Learnerx FIXME +{ + // TODO bool invert_score + public: + void + Init( const size_t sid, const vector >& kbest, /*const*/ Scores& scores ) + { + assert( kbest.size() == scores.size() ); + ofstream o; + unlink( "/tmo/sofia_ml_training" ); + o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists + int fid = 0; + map::iterator ff; + for ( size_t k = 0; k < kbest.size(); ++k ) { + SparseVector::const_iterator it = kbest[k].begin(); + o << scores[k].GetScore(); + for ( ; it != kbest[k].end(); ++it) { + ff = fmap.find( it->first ); + if ( ff == fmap.end() ) { + fmap.insert( pair(it->first, fid) ); + fmap1.insert( pair(fid, it->first) ); + fid++; + } + o << " "<< fmap[it->first] << ":" << it->second; + } + o << endl; + } + o.close(); + } + + void + Update(SparseVector& lambdas) + { + string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality "; + std::stringstream out; + out << fmap.size(); + call += out.str(); + call += " &>/dev/null"; + system ( call.c_str() ); + ifstream i; + unlink( "/tmo/sofia_ml_model" ); + i.open( "/tmp/sofia_ml_model", ios::in ); + string model; + getline( i, model ); + //cout << model << endl; + vector strs; + boost::split( strs, model, boost::is_any_of(" ") ); + int j = 0; + for ( vector::iterator it = strs.begin(); it != strs.end(); ++it ) { + lambdas.set_value(fmap1[j], atof( it->c_str() ) ); + j++; + } + + } + + private: + map fmap; + map fmap1; +}; + +typedef map, size_t> Ngrams; +Ngrams make_ngrams( vector& s, size_t N ); +NgramCounts make_ngram_counts( vector hyp, vector ref, size_t N ); +double brevity_penaly( const size_t hyp_len, const size_t ref_len ); +double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); +double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, vector weights = vector() ); +double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() ); +double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector weights = vector() ); +void register_and_convert(const vector& strs, vector& ids); + + + + +void print_FD(); +void run_tests(); +void test_SetWeights(); +#include +#include +void test_metrics(); +double approx_equal( double x, double y ); +void test_ngrams(); + diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc new file mode 100644 index 00000000..9975794f --- /dev/null +++ b/dtrain/dtest.cc @@ -0,0 +1,95 @@ +#include "dcommon.h" + + + + +/* + * init + * + */ +bool +init(int argc, char** argv, po::variables_map* conf) +{ + int N; + po::options_description opts( "Options" ); + opts.add_options() + ( "decoder-config,c", po::value(), "configuration file for cdec" ) + ( "weights,w", po::value(), "weights file") + ( "ngrams,n", po::value(&N)->default_value(4), "N for Ngrams (default 5)" ); + po::options_description cmdline_options; + cmdline_options.add(opts); + po::store( parse_command_line(argc, argv, cmdline_options), *conf ); + po::notify( *conf ); + if ( ! (conf->count("decoder-config") || conf->count("weights")) ) { + cerr << cmdline_options << endl; + return false; + } + return true; +} + + +/* + * main + * + */ +int +main(int argc, char** argv) +{ + SetSilent(true); + po::variables_map conf; + if (!init(argc, argv, &conf)) return 1; + register_feature_functions(); + size_t k = 1; + ReadFile ini_rf(conf["decoder-config"].as()); + Decoder decoder(ini_rf.stream()); + KBestGetter observer(k); + size_t N = conf["ngrams"].as(); + + Weights weights; + weights.InitFromFile(conf["weights"].as()); + vector w; + weights.InitVector(&w); + decoder.SetWeights(w); + + vector strs, ref_strs; + vector ref_ids; + string in, psg; + size_t sid = 0; + double overall = 0.0; + cerr << "(1 dot equals 100 lines of input)" << endl; + while( getline(cin, in) ) { + if ( (sid+1) % 100 == 0 ) { + cerr << "."; + if ( (sid+1)%1000 == 0 ) cerr << endl; + } + if ( sid > 5000 ) break; + strs.clear(); + boost::split( strs, in, boost::is_any_of("\t") ); + // grammar + psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n"; + decoder.SetSentenceGrammar( psg ); + decoder.Decode( strs[0], &observer ); + KBestList* kb = observer.GetKBest(); + // reference + ref_strs.clear(); ref_ids.clear(); + boost::split( ref_strs, strs[1], boost::is_any_of(" ") ); + register_and_convert( ref_strs, ref_ids ); + // scoring kbest + double score = 0; + Scores scores; + NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 ); + score = smooth_bleu( counts, + ref_ids.size(), + kb->sents[0].size(), N ); + ScorePair sp( kb->scores[0], score ); + scores.push_back( sp ); + //cout << TD::GetString( kb->sents[0] ) << endl; + overall += score; + sid += 1; + } + cout << "Average score: " << overall/(sid+1) << endl; + cerr << endl; + + return 0; +} + diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 8464a429..95fc81af 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,33 +1,6 @@ -#include -#include -#include -#include -#include +#include "dcommon.h" -#include "config.h" -#include -#include -#include -#include - -#include "sentence_metadata.h" -#include "scorer.h" -#include "verbose.h" -#include "viterbi.h" -#include "hg.h" -#include "prob.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -using namespace std; -namespace boostpo = boost::program_options; /* @@ -35,19 +8,19 @@ namespace boostpo = boost::program_options; * */ bool -init(int argc, char** argv, boostpo::variables_map* conf) +init(int argc, char** argv, po::variables_map* conf) { - boostpo::options_description opts( "Options" ); + po::options_description opts( "Options" ); opts.add_options() - ( "decoder-config,c", boostpo::value(), "configuration file for cdec" ) - ( "kbest,k", boostpo::value(), "k for kbest" ) - ( "ngrams,n", boostpo::value(), "n for Ngrams" ) - ( "filter,f", boostpo::value(), "filter kbest list" ) + ( "decoder-config,c", po::value(), "configuration file for cdec" ) + ( "kbest,k", po::value(), "k for kbest" ) + ( "ngrams,n", po::value(), "n for Ngrams" ) + ( "filter,f", po::value(), "filter kbest list" ) ( "test", "run tests and exit"); - boostpo::options_description cmdline_options; + po::options_description cmdline_options; cmdline_options.add(opts); - boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf ); - boostpo::notify( *conf ); + po::store( parse_command_line(argc, argv, cmdline_options), *conf ); + po::notify( *conf ); if ( ! (conf->count("decoder-config") || conf->count("test")) ) { cerr << cmdline_options << endl; return false; @@ -56,443 +29,6 @@ init(int argc, char** argv, boostpo::variables_map* conf) } -/* - * KBestGetter - * - */ -struct KBestList { - vector > feats; - vector > sents; - vector scores; -}; -struct KBestGetter : public DecoderObserver -{ - KBestGetter( const size_t k ) : k_(k) {} - const size_t k_; - KBestList kb; - - virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) - { - GetKBest(smeta.GetSentenceID(), *hg); - } - - KBestList* getkb() { return &kb; } - - void - GetKBest(int sent_id, const Hypergraph& forest) - { - kb.scores.clear(); - kb.sents.clear(); - kb.feats.clear(); - KBest::KBestDerivations, ESentenceTraversal> kbest( forest, k_ ); - for ( size_t i = 0; i < k_; ++i ) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest( forest.nodes_.size() - 1, i ); - if (!d) break; - kb.sents.push_back( d->yield); - kb.feats.push_back( d->feature_values ); - kb.scores.push_back( d->score ); - } - } -}; - - -/* - * write_training_data_for_sofia - * - */ -void -sofia_write_training_data() -{ - // TODO -} - - -/* - * call_sofia - * - */ -void -sofia_call() -{ - // TODO -} - - -/* - * sofia_model2weights - * - */ -void -sofia_read_model() -{ - // TODO -} - - -/* - * make_ngrams - * - */ -typedef map, size_t> Ngrams; -Ngrams -make_ngrams( vector& s, size_t N ) -{ - Ngrams ngrams; - vector ng; - for ( size_t i = 0; i < s.size(); i++ ) { - ng.clear(); - for ( size_t j = i; j < min( i+N, s.size() ); j++ ) { - ng.push_back( s[j] ); - ngrams[ng]++; - } - } - return ngrams; -} - - -/* - * NgramCounts - * - */ -struct NgramCounts -{ - NgramCounts( const size_t N ) : N_( N ) { - reset(); - } - size_t N_; - map clipped; - map sum; - - void - operator+=( const NgramCounts& rhs ) - { - assert( N_ == rhs.N_ ); - for ( size_t i = 0; i < N_; i++ ) { - this->clipped[i] += rhs.clipped.find(i)->second; - this->sum[i] += rhs.sum.find(i)->second; - } - } - - void - add( size_t count, size_t ref_count, size_t i ) - { - assert( i < N_ ); - if ( count > ref_count ) { - clipped[i] += ref_count; - sum[i] += count; - } else { - clipped[i] += count; - sum[i] += count; - } - } - - void - reset() - { - size_t i; - for ( i = 0; i < N_; i++ ) { - clipped[i] = 0; - sum[i] = 0; - } - } - - void - print() - { - for ( size_t i = 0; i < N_; i++ ) { - cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; - cout << i+1 << "grams:\t\t\t" << sum[i] << endl; - } - } -}; - - -/* - * ngram_matches - * - */ -NgramCounts -make_ngram_counts( vector hyp, vector ref, size_t N ) -{ - Ngrams hyp_ngrams = make_ngrams( hyp, N ); - Ngrams ref_ngrams = make_ngrams( ref, N ); - NgramCounts counts( N ); - Ngrams::iterator it; - Ngrams::iterator ti; - for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) { - ti = ref_ngrams.find( it->first ); - if ( ti != ref_ngrams.end() ) { - counts.add( it->second, ti->second, it->first.size() - 1 ); - } else { - counts.add( it->second, 0, it->first.size() - 1 ); - } - } - return counts; -} - - -/* - * brevity_penaly - * - */ -double -brevity_penaly( const size_t hyp_len, const size_t ref_len ) -{ - if ( hyp_len > ref_len ) return 1; - return exp( 1 - (double)ref_len/(double)hyp_len ); -} - - -/* - * bleu - * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02) - * page TODO - * 0 if for N one of the counts = 0 - */ -double -bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector weights = vector() ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - if ( ref_len < N ) N = ref_len; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0; - sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] ); - } - return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * stupid_bleu - * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04) - * page TODO - * 0 iff no 1gram match - */ -double -stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - size_t N, vector weights = vector() ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - if ( ref_len < N ) N = ref_len; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - float add = 0; - for ( size_t i = 0; i < N; i++ ) { - if ( i == 1 ) add = 1; - sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) ); - } - return brevity_penaly( hyp_len, ref_len ) * exp( sum ); -} - - -/* - * smooth_bleu - * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06) - * page TODO - * max. 0.9375 - */ -double -smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector weights = vector() ) -{ - if ( hyp_len == 0 || ref_len == 0 ) return 0; - float N_ = (float)N; - if ( weights.empty() ) - { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); - } - double sum = 0; - float j = 1; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); - j++; - } - return brevity_penaly( hyp_len, ref_len ) * sum; -} - - -/* - * approx_bleu - * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) - * page TODO - * - */ -double -approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector weights = vector() ) -{ - return bleu( counts, hyp_len, ref_len, N, weights ); -} - - -/* - * register_and_convert - * - */ -void -register_and_convert(const vector& strs, vector& ids) -{ - vector::const_iterator it; - for ( it = strs.begin(); it < strs.end(); it++ ) { - ids.push_back( TD::Convert( *it ) ); - } -} - - -/* - * - * - */ -void -test_ngrams() -{ - cout << "Testing ngrams..." << endl << endl; - size_t N = 5; - cout << "N = " << N << endl; - vector a; // hyp - vector b; // ref - cout << "a "; - for (size_t i = 1; i <= 8; i++) { - cout << i << " "; - a.push_back(i); - } - cout << endl << "b "; - for (size_t i = 1; i <= 4; i++) { - cout << i << " "; - b.push_back(i); - } - cout << endl << endl; - NgramCounts c = make_ngram_counts( a, b, N ); - assert( c.clipped[N-1] == 0 ); - assert( c.sum[N-1] == 4 ); - c.print(); - c += c; - cout << endl; - c.print(); - cout << endl; -} - - -/* - * - * - */ -double -approx_equal( double x, double y ) -{ - const double EPSILON = 1E-5; - if ( x == 0 ) return fabs( y ) <= EPSILON; - if ( y == 0 ) return fabs( x ) <= EPSILON; - return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * - * - */ -#include -#include -void -test_metrics() -{ - cout << "Testing metrics..." << endl << endl; - using namespace boost::assign; - vector a, b; - vector expect_vanilla, expect_smooth, expect_stupid; - a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp - b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref - expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; - expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; - expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; - vector aa, bb; - vector aai, bbi; - double vanilla, smooth, stupid; - size_t N = 4; - cout << "N = " << N << endl << endl; - for ( size_t i = 0; i < a.size(); i++ ) { - cout << " hyp: " << a[i] << endl; - cout << " ref: " << b[i] << endl; - aa.clear(); bb.clear(); aai.clear(); bbi.clear(); - boost::split( aa, a[i], boost::is_any_of(" ") ); - boost::split( bb, b[i], boost::is_any_of(" ") ); - register_and_convert( aa, aai ); - register_and_convert( bb, bbi ); - NgramCounts counts = make_ngram_counts( aai, bbi, N ); - vanilla = bleu( counts, aa.size(), bb.size(), N); - smooth = smooth_bleu( counts, aa.size(), bb.size(), N); - stupid = stupid_bleu( counts, aa.size(), bb.size(), N); - assert( approx_equal(vanilla, expect_vanilla[i]) ); - assert( approx_equal(smooth, expect_smooth[i]) ); - assert( approx_equal(stupid, expect_stupid[i]) ); - cout << setw(14) << "bleu = " << vanilla << endl; - cout << setw(14) << "smooth bleu = " << smooth << endl; - cout << setw(14) << "stupid bleu = " << stupid << endl << endl; - } - cout << endl; -} - -/* - * - * - */ -void -test_SetWeights() -{ - cout << "Testing Weights::SetWeight..." << endl << endl; - Weights weights; - SparseVector lambdas; - weights.InitSparseVector( &lambdas ); - weights.SetWeight( &lambdas, "test", 0 ); - weights.SetWeight( &lambdas, "test1", 1 ); - WordID fid = FD::Convert( "test2" ); - weights.SetWeight( &lambdas, fid, 2 ); - string fn = "weights-test"; - cout << "FD::NumFeats() " << FD::NumFeats() << endl; - assert( FD::NumFeats() == 4 ); - weights.WriteToFile( fn, true ); - cout << endl; -} - - -/* - * - * - */ -void -run_tests() -{ - cout << endl; - test_ngrams(); - cout << endl; - test_metrics(); - cout << endl; - test_SetWeights(); - exit(0); -} - - -void -print_FD() -{ - for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - - /* * main * @@ -500,8 +36,8 @@ print_FD() int main(int argc, char** argv) { - //SetSilent(true); - boostpo::variables_map conf; + SetSilent(true); + po::variables_map conf; if (!init(argc, argv, &conf)) return 1; if ( conf.count("test") ) run_tests(); register_feature_functions(); @@ -509,7 +45,9 @@ main(int argc, char** argv) ReadFile ini_rf(conf["decoder-config"].as()); Decoder decoder(ini_rf.stream()); KBestGetter observer(k); - + size_t N = 4; // TODO as parameter/in config + + // TODO scoring metric as parameter/in config // for approx. bleu //NgramCounts global_counts; //size_t global_hyp_len; @@ -523,82 +61,67 @@ main(int argc, char** argv) lambdas.set_value(FD::Convert("logp"), 0); - vector strs; + vector strs, ref_strs; + vector ref_ids; string in, psg; - size_t i = 0; + size_t sid = 0; + cerr << "(1 dot equals 100 lines of input)" << endl; while( getline(cin, in) ) { - if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl; - // why? why!? + //if ( !SILENT ) + // cerr << endl << endl << "Getting kbest for sentence #" << sid << endl; + if ( (sid+1) % 100 == 0 ) { + cerr << "."; + if ( (sid+1)%1000 == 0 ) cerr << endl; + } + if ( sid > 5000 ) break; + // weights dense_weights.clear(); weights.InitFromVector( lambdas ); weights.InitVector( &dense_weights ); decoder.SetWeights( dense_weights ); - //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl; + //if ( sid > 100 ) break; + // handling input.. strs.clear(); boost::split( strs, in, boost::is_any_of("\t") ); + // grammar psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" ); psg += "\n"; - //decoder.SetId(i); decoder.SetSentenceGrammar( psg ); decoder.Decode( strs[0], &observer ); - KBestList* kb = observer.getkb(); + KBestList* kb = observer.GetKBest(); + // reference + ref_strs.clear(); ref_ids.clear(); + boost::split( ref_strs, strs[1], boost::is_any_of(" ") ); + register_and_convert( ref_strs, ref_ids ); + // scoring kbest + double score = 0; + Scores scores; for ( size_t i = 0; i < k; i++ ) { - cout << i << " "; - for (size_t j = 0; j < kb->sents[i].size(); ++j ) { - cout << TD::Convert( kb->sents[i][j] ) << " "; - } - cout << kb->scores[i]; - cout << endl; + NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 ); + score = smooth_bleu( counts, + ref_ids.size(), + kb->sents[i].size(), N ); + ScorePair sp( kb->scores[i], score ); + scores.push_back( sp ); + //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl; + //cout << kb->feats[i] << endl; } - lambdas.set_value( FD::Convert("use_shell"), 1 ); - lambdas.set_value( FD::Convert("use_a"), 1 ); + //cout << "###" << endl; + SofiaLearner learner; + learner.Init( sid, kb->feats, scores ); + learner.Update(lambdas); + // initializing learner + // TODO + // updating weights + //lambdas.set_value( FD::Convert("use_shell"), 1 ); + //lambdas.set_value( FD::Convert("use_a"), 1 ); //print_FD(); + sid += 1; // TODO does cdec count this already? } - + weights.WriteToFile( "weights-final", true ); + + cerr << endl; return 0; } - // next: FMap, ->sofia, ->FMap, -> Weights - // learner gets all used features (binary! and dense (logprob is sum of logprobs!)) - // only for those feats with weight > 0 after learning - // see decoder line 548 - - -/* - * TODO - * iterate over training set, for t=1..T - * mapred impl - * mapper: main - * reducer: average weights, global NgramCounts for approx. bleu - * 1st cut: hadoop streaming? - * batch, non-batch in the mapper (what sofia gets, regenerated Kbest lists) - * filter kbest yes/no - * sofia: --eta_type explicit - * psg preparation source\tref\tpsg - * set reference for cdec? - * LM - * shared? - * startup? - * X reference(s) for *bleu!? - * kbest nicer (do not iterate twice)!? -> shared_ptr - * multipartite ranking - * weights! global, per sentence from global, featuremap - * const decl... - * sketch: batch/iter options - * weights.cc: why wv_? - * --weights cmd line (for iterations): script to call again/hadoop streaming? - * I do not need to remember features, cdec does - * resocre hg? - * do not use Decoder::Decode!? - * what happens if feature not in FD? 0??? - */ - -/* - * PROBLEMS - * cdec kbest vs 1best (no -k param) - * FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!? - * sparse vector instead of vector for weights in Decoder? - * PhraseModel_* features for psg!? (seem to be generated) - */ - diff --git a/dtrain/dtrain.ini b/dtrain/dtrain.ini deleted file mode 100644 index e69de29b..00000000 diff --git a/dtrain/in b/dtrain/in deleted file mode 100644 index 294d009b..00000000 --- a/dtrain/in +++ /dev/null @@ -1,2 +0,0 @@ -vorrichtung means [X] ||| vorrichtung ||| apparatus ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-101 ||| 0-0 -eintest test [X] ||| eintest ||| test ||| LogP=-200 ||| 0-0 __NEXT_RULE__ [X] ||| eintest ||| xxx ||| LogP=-101 ||| 0-0 diff --git a/dtrain/in.toy b/dtrain/in.toy deleted file mode 100644 index 71b736a6..00000000 --- a/dtrain/in.toy +++ /dev/null @@ -1,2 +0,0 @@ -ich sah ein kleines haus i saw a little shell [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-0.5 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-0.5 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 -ich fand ein grosses haus i found a little shell [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=-1000 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=-1 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=-0.9 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=-0.5 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=-1.5 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 diff --git a/dtrain/test.sh b/dtrain/test.sh index a0ebb420..ad45bd1e 100755 --- a/dtrain/test.sh +++ b/dtrain/test.sh @@ -1,4 +1,4 @@ #!/bin/sh -./dtrain -c cdec.ini -k 4 < in.toy +./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy diff --git a/utils/dict.h b/utils/dict.h index 75ea3def..33cca6cf 100644 --- a/utils/dict.h +++ b/utils/dict.h @@ -1,7 +1,7 @@ #ifndef DICT_H_ #define DICT_H_ - +#include #include #include @@ -73,7 +73,8 @@ class Dict { inline const std::string& Convert(const WordID& id) const { if (id == 0) return b0_; - assert(id <= (int)words_.size()); + //assert(id <= (int)words_.size()); + if (id < 0 || id > (int)words_.size()) return b0_; return words_[id-1]; } -- cgit v1.2.3