From d980ecbbcd35fba23313aa715046bc0f87a23afd Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 29 Jul 2011 00:48:04 +0200
Subject: first cut for sofia-ml, little change in utils/dict.h, coarse refactoring

---
 dtrain/dtrain.cc | 595 ++++++-------------------------------------------------
 1 file changed, 59 insertions(+), 536 deletions(-)

(limited to 'dtrain/dtrain.cc')

diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 8464a429..95fc81af 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,33 +1,6 @@
-#include <iostream>
-#include <string>
-#include <vector>
-#include <map>
-#include <cassert>
+#include "dcommon.h"
 
-#include "config.h"
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/algorithm/string/replace.hpp>
-
-#include "sentence_metadata.h"
-#include "scorer.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "hg.h"
-#include "prob.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-using namespace std;
-namespace boostpo = boost::program_options;
 
 
 /*
@@ -35,19 +8,19 @@ namespace boostpo = boost::program_options;
  * init
  *
  */
 bool
-init(int argc, char** argv, boostpo::variables_map* conf)
+init(int argc, char** argv, po::variables_map* conf)
 {
-  boostpo::options_description opts( "Options" );
+  po::options_description opts( "Options" );
   opts.add_options()
-    ( "decoder-config,c", boostpo::value<string>(), "configuration file for cdec" )
-    ( "kbest,k",          boostpo::value<size_t>(), "k for kbest" )
-    ( "ngrams,n",         boostpo::value<size_t>(), "n for Ngrams" )
-    ( "filter,f",         boostpo::value<string>(), "filter kbest list" )
+    ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
+    ( "kbest,k",          po::value<size_t>(), "k for kbest" )
+    ( "ngrams,n",         po::value<size_t>(), "n for Ngrams" )
+    ( "filter,f",         po::value<string>(), "filter kbest list" )
     ( "test", "run tests and exit");
-  boostpo::options_description cmdline_options;
+  po::options_description cmdline_options;
   cmdline_options.add(opts);
-  boostpo::store( parse_command_line(argc, argv, cmdline_options), *conf );
-  boostpo::notify( *conf );
+  po::store( parse_command_line(argc, argv, cmdline_options), *conf );
+  po::notify( *conf );
   if ( ! (conf->count("decoder-config") || conf->count("test")) ) {
     cerr << cmdline_options << endl;
     return false;
@@ -56,443 +29,6 @@ init(int argc, char** argv, boostpo::variables_map* conf)
 }
 
 
-/*
- * KBestGetter
- *
- */
-struct KBestList {
-  vector<SparseVector<double> > feats;
-  vector<vector<WordID> > sents;
-  vector<double> scores;
-};
-struct KBestGetter : public DecoderObserver
-{
-  KBestGetter( const size_t k ) : k_(k) {}
-  const size_t k_;
-  KBestList kb;
-
-  virtual void
-  NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
-  {
-    GetKBest(smeta.GetSentenceID(), *hg);
-  }
-
-  KBestList* getkb() { return &kb; }
-
-  void
-  GetKBest(int sent_id, const Hypergraph& forest)
-  {
-    kb.scores.clear();
-    kb.sents.clear();
-    kb.feats.clear();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );
-    for ( size_t i = 0; i < k_; ++i ) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest( forest.nodes_.size() - 1, i );
-      if (!d) break;
-      kb.sents.push_back( d->yield );
-      kb.feats.push_back( d->feature_values );
-      kb.scores.push_back( d->score );
-    }
-  }
-};
-
-
-/*
- * sofia_write_training_data
- *
- */
-void
-sofia_write_training_data()
-{
-  // TODO
-}
-
-
-/*
- * sofia_call
- *
- */
-void
-sofia_call()
-{
-  // TODO
-}
-
-
-/*
- * sofia_read_model
- *
- */
-void
-sofia_read_model()
-{
-  // TODO
-}
-
-
-/*
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
-Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
-{
-  Ngrams ngrams;
-  vector<WordID> ng;
-  for ( size_t i = 0; i < s.size(); i++ ) {
-    ng.clear();
-    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
-      ng.push_back( s[j] );
-      ngrams[ng]++;
-    }
-  }
-  return ngrams;
-}
-
-
-/*
- * NgramCounts
- *
- */
-struct NgramCounts
-{
-  NgramCounts( const size_t N ) : N_( N ) {
-    reset();
-  }
-  size_t N_;
-  map<size_t, size_t> clipped;
-  map<size_t, size_t> sum;
-
-  void
-  operator+=( const NgramCounts& rhs )
-  {
-    assert( N_ == rhs.N_ );
-    for ( size_t i = 0; i < N_; i++ ) {
-      this->clipped[i] += rhs.clipped.find(i)->second;
-      this->sum[i] += rhs.sum.find(i)->second;
-    }
-  }
-
-  void
-  add( size_t count, size_t ref_count, size_t i )
-  {
-    assert( i < N_ );
-    if ( count > ref_count ) {
-      clipped[i] += ref_count;
-      sum[i] += count;
-    } else {
-      clipped[i] += count;
-      sum[i] += count;
-    }
-  }
-
-  void
-  reset()
-  {
-    size_t i;
-    for ( i = 0; i < N_; i++ ) {
-      clipped[i] = 0;
-      sum[i] = 0;
-    }
-  }
-
-  void
-  print()
-  {
-    for ( size_t i = 0; i < N_; i++ ) {
-      cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
-      cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
-    }
-  }
-};
-
-
-/*
- * make_ngram_counts
- *
- */
-NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
-{
-  Ngrams hyp_ngrams = make_ngrams( hyp, N );
-  Ngrams ref_ngrams = make_ngrams( ref, N );
-  NgramCounts counts( N );
-  Ngrams::iterator it;
-  Ngrams::iterator ti;
-  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
-    ti = ref_ngrams.find( it->first );
-    if ( ti != ref_ngrams.end() ) {
-      counts.add( it->second, ti->second, it->first.size() - 1 );
-    } else {
-      counts.add( it->second, 0, it->first.size() - 1 );
-    }
-  }
-  return counts;
-}
-
-
-/*
- * brevity_penaly
- *
- */
-double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
-{
-  if ( hyp_len > ref_len ) return 1;
-  return exp( 1 - (double)ref_len/(double)hyp_len );
-}
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if one of the counts for n <= N is 0
- */
-double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
-    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation" (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
- */
-double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float add = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( i == 1 ) add = 1;
-    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
-  }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
-}
-
-
-/*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
- */
-double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights = vector<float>() )
-{
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  float N_ = (float)N;
-  if ( weights.empty() )
-  {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
-  }
-  double sum = 0;
-  float j = 1;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) continue;
-    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
-    j++;
-  }
-  return brevity_penaly( hyp_len, ref_len ) * sum;
-}
-
-
-/*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * page TODO
- *
- */
-double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights = vector<float>() )
-{
-  return bleu( counts, hyp_len, ref_len, N, weights );
-}
-
-
-/*
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
-  vector<string>::const_iterator it;
-  for ( it = strs.begin(); it < strs.end(); it++ ) {
-    ids.push_back( TD::Convert( *it ) );
-  }
-}
-
-
-/*
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
-  cout << "Testing ngrams..." << endl << endl;
-  size_t N = 5;
-  cout << "N = " << N << endl;
-  vector<WordID> a; // hyp
-  vector<WordID> b; // ref
-  cout << "a ";
-  for (size_t i = 1; i <= 8; i++) {
-    cout << i << " ";
-    a.push_back(i);
-  }
-  cout << endl << "b ";
-  for (size_t i = 1; i <= 4; i++) {
-    cout << i << " ";
-    b.push_back(i);
-  }
-  cout << endl << endl;
-  NgramCounts c = make_ngram_counts( a, b, N );
-  assert( c.clipped[N-1] == 0 );
-  assert( c.sum[N-1] == 4 );
-  c.print();
-  c += c;
-  cout << endl;
-  c.print();
-  cout << endl;
-}
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * test_metrics
- *
- */
-#include <boost/assign/std/vector.hpp>
-#include <iomanip>
-void
-test_metrics()
-{
-  cout << "Testing metrics..." << endl << endl;
-  using namespace boost::assign;
-  vector<string> a, b;
-  vector<double> expect_vanilla, expect_smooth, expect_stupid;
-  a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
-  b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
-  expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
-  expect_smooth  += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
-  expect_stupid  += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
-  vector<string> aa, bb;
-  vector<WordID> aai, bbi;
-  double vanilla, smooth, stupid;
-  size_t N = 4;
-  cout << "N = " << N << endl << endl;
-  for ( size_t i = 0; i < a.size(); i++ ) {
-    cout << " hyp: " << a[i] << endl;
-    cout << " ref: " << b[i] << endl;
-    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
-    boost::split( aa, a[i], boost::is_any_of(" ") );
-    boost::split( bb, b[i], boost::is_any_of(" ") );
-    register_and_convert( aa, aai );
-    register_and_convert( bb, bbi );
-    NgramCounts counts = make_ngram_counts( aai, bbi, N );
-    vanilla = bleu( counts, aa.size(), bb.size(), N );
-    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N );
-    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N );
-    assert( approx_equal(vanilla, expect_vanilla[i]) );
-    assert( approx_equal(smooth, expect_smooth[i]) );
-    assert( approx_equal(stupid, expect_stupid[i]) );
-    cout << setw(14) << "bleu = "        << vanilla << endl;
-    cout << setw(14) << "smooth bleu = " << smooth << endl;
-    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
-  }
-  cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
-  cout << "Testing Weights::SetWeight..." << endl << endl;
-  Weights weights;
-  SparseVector<double> lambdas;
-  weights.InitSparseVector( &lambdas );
-  weights.SetWeight( &lambdas, "test", 0 );
-  weights.SetWeight( &lambdas, "test1", 1 );
-  WordID fid = FD::Convert( "test2" );
-  weights.SetWeight( &lambdas, fid, 2 );
-  string fn = "weights-test";
-  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
-  assert( FD::NumFeats() == 4 );
-  weights.WriteToFile( fn, true );
-  cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
-  cout << endl;
-  test_ngrams();
-  cout << endl;
-  test_metrics();
-  cout << endl;
-  test_SetWeights();
-  exit(0);
-}
-
-
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i) << endl;
-}
-
-
 /*
  * main
  *
@@ -500,8 +36,8 @@ print_FD()
  */
 int
 main(int argc, char** argv)
 {
-  //SetSilent(true);
-  boostpo::variables_map conf;
+  SetSilent(true);
+  po::variables_map conf;
   if (!init(argc, argv, &conf)) return 1;
   if ( conf.count("test") ) run_tests();
   register_feature_functions();
@@ -509,7 +45,9 @@ main(int argc, char** argv)
   ReadFile ini_rf(conf["decoder-config"].as<string>());
   Decoder decoder(ini_rf.stream());
   KBestGetter observer(k);
-
+  size_t N = 4; // TODO as parameter/in config
+
+  // TODO scoring metric as parameter/in config
   // for approx. bleu
   //NgramCounts global_counts;
   //size_t global_hyp_len;
@@ -523,82 +61,67 @@ main(int argc, char** argv)
   lambdas.set_value(FD::Convert("logp"), 0);
 
-  vector<string> strs;
+  vector<string> strs, ref_strs;
+  vector<WordID> ref_ids;
   string in, psg;
-  size_t i = 0;
+  size_t sid = 0;
+  cerr << "(1 dot equals 100 lines of input)" << endl;
   while( getline(cin, in) ) {
-    if ( !SILENT ) cerr << endl << endl << "Getting kbest for sentence #" << i << endl;
-    // why? why!?
+    //if ( !SILENT )
+    //  cerr << endl << endl << "Getting kbest for sentence #" << sid << endl;
+    if ( (sid+1) % 100 == 0 ) {
+      cerr << ".";
+      if ( (sid+1) % 1000 == 0 ) cerr << endl;
+    }
+    if ( sid > 5000 ) break;
+    // weights
     dense_weights.clear();
     weights.InitFromVector( lambdas );
     weights.InitVector( &dense_weights );
     decoder.SetWeights( dense_weights );
-    //cout << "use_shell " << dense_weights[FD::Convert("use_shell")] << endl;
+    //if ( sid > 100 ) break;
+    // handling input
     strs.clear();
     boost::split( strs, in, boost::is_any_of("\t") );
+    // grammar
     psg = boost::replace_all_copy( strs[2], " __NEXT_RULE__ ", "\n" );
     psg += "\n";
-    //decoder.SetId(i);
     decoder.SetSentenceGrammar( psg );
     decoder.Decode( strs[0], &observer );
-    KBestList* kb = observer.getkb();
+    KBestList* kb = observer.GetKBest();
+    // reference
+    ref_strs.clear(); ref_ids.clear();
+    boost::split( ref_strs, strs[1], boost::is_any_of(" ") );
+    register_and_convert( ref_strs, ref_ids );
+    // scoring kbest
+    double score = 0;
+    Scores scores;
     for ( size_t i = 0; i < k; i++ ) {
-      cout << i << " ";
-      for (size_t j = 0; j < kb->sents[i].size(); ++j ) {
-        cout << TD::Convert( kb->sents[i][j] ) << " ";
-      }
-      cout << kb->scores[i];
-      cout << endl;
+      NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N );
+      score = smooth_bleu( counts,
+                           ref_ids.size(),
+                           kb->sents[i].size(), N );
+      ScorePair sp( kb->scores[i], score );
+      scores.push_back( sp );
+      //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl;
+      //cout << kb->feats[i] << endl;
     }
-    lambdas.set_value( FD::Convert("use_shell"), 1 );
-    lambdas.set_value( FD::Convert("use_a"), 1 );
+    //cout << "###" << endl;
+    // initializing learner
+    SofiaLearner learner;
+    learner.Init( sid, kb->feats, scores );
+    // updating weights
+    learner.Update( lambdas );
+    //lambdas.set_value( FD::Convert("use_shell"), 1 );
+    //lambdas.set_value( FD::Convert("use_a"), 1 );
     //print_FD();
+    sid += 1; // TODO does cdec count this already?
   }
-
+  weights.WriteToFile( "weights-final", true );
+
+  cerr << endl;
+
   return 0;
 }
-
-  // next: FMap, ->sofia, ->FMap, ->Weights
-  // learner gets all used features (binary! and dense (logprob is sum of logprobs!))
-  // only for those feats with weight > 0 after learning
-  // see decoder line 548
-
-
-/*
- * TODO
- * iterate over training set, for t=1..T
- *   mapred impl
- *     mapper: main
- *     reducer: average weights, global NgramCounts for approx. bleu
- *     1st cut: hadoop streaming?
- *   batch, non-batch in the mapper (what sofia gets, regenerated kbest lists)
- *   filter kbest yes/no
- *   sofia: --eta_type explicit
- * psg preparation: source\tref\tpsg
- * set reference for cdec?
- * LM
- *   shared?
- *   startup?
- * X reference(s) for *bleu!?
- * kbest nicer (do not iterate twice)!? -> shared_ptr
- * multipartite ranking
- * weights! global, per sentence from global, featuremap
- * const decl...
- * sketch: batch/iter options
- * weights.cc: why wv_?
- * --weights cmd line (for iterations): script to call again/hadoop streaming?
- * I do not need to remember features, cdec does
- * rescore hg?
- * do not use Decoder::Decode!?
- * what happens if feature not in FD? 0???
- */
-
-/*
- * PROBLEMS
- * cdec kbest vs 1best (no -k param)
- * FD, Weights::wv_ grow too large, see utils/weights.cc; decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc!?
- * sparse vector instead of vector<double> for weights in Decoder?
- * PhraseModel_* features for psg!? (seem to be generated)
- */
--
cgit v1.2.3
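
Note on the metric used above: the rewritten scoring loop feeds smooth_bleu (the removed in-file version now lives in dcommon.h, which is not part of this diff) to the learner. The following is a minimal, self-contained sketch of that computation, using plain strings in place of cdec's WordID/TD dictionary so it can be compiled and checked in isolation; all names here are illustrative, not dtrain's:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace std;

typedef map<vector<string>, size_t> Ngrams;

// collect all 1..N grams of s together with their counts
Ngrams make_ngrams( const vector<string>& s, size_t N )
{
  Ngrams ngrams;
  vector<string> ng;
  for ( size_t i = 0; i < s.size(); i++ ) {
    ng.clear();
    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
      ng.push_back( s[j] );
      ngrams[ng]++;
    }
  }
  return ngrams;
}

// smoothed BLEU as in the patch (Liang et al. '06): the n-gram
// precisions are combined additively, the j-th contributing term
// divided by 2^(N-j+1), so missing higher-order matches shrink the
// score instead of zeroing it (hence the "max. 0.9375" for N=4)
double smooth_bleu( const vector<string>& hyp, const vector<string>& ref, size_t N )
{
  if ( hyp.empty() || ref.empty() ) return 0;
  Ngrams hyp_ngrams = make_ngrams( hyp, N );
  Ngrams ref_ngrams = make_ngrams( ref, N );
  vector<double> clipped( N, 0 ), sum( N, 0 ); // indexed by n-1
  for ( Ngrams::iterator it = hyp_ngrams.begin(); it != hyp_ngrams.end(); ++it ) {
    size_t n = it->first.size() - 1;
    Ngrams::iterator ti = ref_ngrams.find( it->first );
    size_t ref_count = ( ti == ref_ngrams.end() ) ? 0 : ti->second;
    clipped[n] += min( it->second, ref_count ); // clip to reference count
    sum[n] += it->second;
  }
  double score = 0;
  size_t j = 1; // only orders with matches advance j
  for ( size_t i = 0; i < N; i++ ) {
    if ( clipped[i] == 0 || sum[i] == 0 ) continue;
    // uniform weights 1/N, i.e. each precision enters as p^(1/N)
    score += pow( clipped[i]/sum[i], 1.0/(double)N ) / pow( 2.0, (double)(N-j+1) );
    j++;
  }
  // brevity penalty exactly as in brevity_penaly() above
  double bp = hyp.size() > ref.size()
            ? 1.0 : exp( 1.0 - (double)ref.size()/(double)hyp.size() );
  return bp * score;
}

int main()
{
  vector<string> hyp, ref;
  hyp.push_back("a"); hyp.push_back("a");
  ref.push_back("a"); ref.push_back("a");
  cout << smooth_bleu( hyp, ref, 4 ) << endl; // prints 0.1875
  return 0;
}

For hyp = ref = "a a" and N = 4 this prints 0.1875, matching the expect_smooth entry for that pair in the removed test_metrics above.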
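
Note on the learner: SofiaLearner also lives in dcommon.h and its interface is not visible in this diff, so the following is only a plausible reading, suggested by the "multipartite ranking" item in the removed TODO list. A common way to hand a k-best list with (model features, metric score) pairs to an SGD package such as sofia-ml is to reduce it to pairwise preferences: for two hypotheses with different metric scores, the feature-difference vector becomes a binary training example. A hypothetical sketch; ScoredHyp, FeatVec and make_pairs are illustrative names, not dtrain's:

#include <map>
#include <utility>
#include <vector>

using namespace std;

typedef map<int, double> FeatVec; // sparse feature id -> value

struct ScoredHyp {
  FeatVec feats;  // model features of one kbest entry
  double metric;  // e.g. smooth_bleu against the reference
};

// reduce a kbest list to pairwise examples: label +1 if the first
// hypothesis has the better metric score, -1 otherwise; the example's
// features are the difference of the two hypotheses' feature vectors
void make_pairs( const vector<ScoredHyp>& kb,
                 vector<pair<int, FeatVec> >* out )
{
  for ( size_t i = 0; i < kb.size(); i++ ) {
    for ( size_t j = i+1; j < kb.size(); j++ ) {
      if ( kb[i].metric == kb[j].metric ) continue; // no preference
      int label = kb[i].metric > kb[j].metric ? +1 : -1;
      FeatVec diff = kb[i].feats;
      for ( FeatVec::const_iterator it = kb[j].feats.begin();
            it != kb[j].feats.end(); ++it )
        diff[it->first] -= it->second;
      out->push_back( make_pair( label, diff ) );
    }
  }
}

A linear model that separates these difference vectors ranks the kbest list consistently with the metric; folding such a model's weights back into lambdas is the kind of step learner.Update(lambdas) suggests.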