diff options
author | Patrick Simianer <p@simianer.de> | 2011-07-31 19:24:02 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-09-23 19:13:57 +0200 |
commit | 06829982fb0c03a5b0bbd95ee04de5a0019c5263 (patch) | |
tree | c48d2b25092ce08bc5557c0410dc1a93c8e5fa16 /dtrain | |
parent | d980ecbbcd35fba23313aa715046bc0f87a23afd (diff) |
bugfixing, begin refactoring
Diffstat (limited to 'dtrain')
-rw-r--r-- | dtrain/dcommon.cc | 79 |
-rw-r--r-- | dtrain/dcommon.h | 71 |
-rw-r--r-- | dtrain/dtest.cc | 33 |
-rw-r--r-- | dtrain/dtrain.cc | 48 |
-rw-r--r-- | dtrain/learner.h | 71 |
-rwxr-xr-x | dtrain/test.sh | 2 |
6 files changed, 168 insertions, 136 deletions
diff --git a/dtrain/dcommon.cc b/dtrain/dcommon.cc index a6bdc92c..6657bed6 100644 --- a/dtrain/dcommon.cc +++ b/dtrain/dcommon.cc @@ -2,7 +2,11 @@ -/* + +/****************************************************************************** + * NGRAMS + * + * * make_ngrams * */ @@ -23,9 +27,6 @@ make_ngrams( vector<WordID>& s, size_t N ) } - - - /* * ngram_matches * @@ -50,7 +51,12 @@ make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ) } -/* + + +/****************************************************************************** + * SCORES + * + * * brevity_penaly * */ @@ -156,7 +162,12 @@ approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, } -/* + + +/****************************************************************************** + * UTILS + * + * * register_and_convert * */ @@ -170,11 +181,39 @@ register_and_convert(const vector<string>& strs, vector<WordID>& ids) } +/* + * approx_equal + * + */ +double +approx_equal( double x, double y ) +{ + const double EPSILON = 1E-5; + if ( x == 0 ) return fabs( y ) <= EPSILON; + if ( y == 0 ) return fabs( x ) <= EPSILON; + return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; +} /* + * print_FD + * + */ +void +print_FD() +{ + for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; +} + + + + +/****************************************************************************** + * TESTS * * + * test_ngrams + * */ void test_ngrams() @@ -207,21 +246,7 @@ test_ngrams() /* - * - * - */ -double -approx_equal( double x, double y ) -{ - const double EPSILON = 1E-5; - if ( x == 0 ) return fabs( y ) <= EPSILON; - if ( y == 0 ) return fabs( x ) <= EPSILON; - return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * + * test_metrics * */ void @@ -263,8 +288,9 @@ test_metrics() cout << endl; } + /* - * + * test_SetWeights * */ void @@ -287,7 +313,7 @@ test_SetWeights() /* - * + * run_tests * */ void @@ -302,10 +328,3 @@ run_tests() exit(0); } - -void -print_FD() -{ - 
for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - diff --git a/dtrain/dcommon.h b/dtrain/dcommon.h index ff796642..6df841bb 100644 --- a/dtrain/dcommon.h +++ b/dtrain/dcommon.h @@ -30,6 +30,8 @@ using namespace std; namespace po = boost::program_options; + + struct ScorePair { ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {} @@ -139,72 +141,7 @@ struct NgramCounts }; -/*class Learnerx -{ - public: - virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {}; - virtual void Update(SparseVector<double>& lambdas); -};*/ - -class SofiaLearner //: public Learnerx FIXME -{ - // TODO bool invert_score - public: - void - Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores ) - { - assert( kbest.size() == scores.size() ); - ofstream o; - unlink( "/tmo/sofia_ml_training" ); - o.open( "/tmp/sofia_ml_training", ios::trunc ); // TODO randomize, filename exists - int fid = 0; - map<int,int>::iterator ff; - for ( size_t k = 0; k < kbest.size(); ++k ) { - SparseVector<double>::const_iterator it = kbest[k].begin(); - o << scores[k].GetScore(); - for ( ; it != kbest[k].end(); ++it) { - ff = fmap.find( it->first ); - if ( ff == fmap.end() ) { - fmap.insert( pair<int,int>(it->first, fid) ); - fmap1.insert( pair<int,int>(fid, it->first) ); - fid++; - } - o << " "<< fmap[it->first] << ":" << it->second; - } - o << endl; - } - o.close(); - } - - void - Update(SparseVector<double>& lambdas) - { - string call = "./sofia-ml --training_file /tmp/sofia_ml_training --model_out /tmp/sofia_ml_model --loop_type stochastic --lambda 100 --dimensionality "; - std::stringstream out; - out << fmap.size(); - call += out.str(); - call += " &>/dev/null"; - system ( call.c_str() ); - ifstream i; - unlink( "/tmo/sofia_ml_model" ); - i.open( "/tmp/sofia_ml_model", ios::in ); - string model; - getline( i, model ); - //cout << model << endl; - vector<string> strs; - 
boost::split( strs, model, boost::is_any_of(" ") ); - int j = 0; - for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) { - lambdas.set_value(fmap1[j], atof( it->c_str() ) ); - j++; - } - - } - private: - map<int,int> fmap; - map<int,int> fmap1; -}; typedef map<vector<WordID>, size_t> Ngrams; Ngrams make_ngrams( vector<WordID>& s, size_t N ); @@ -215,10 +152,6 @@ double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_ double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() ); double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, vector<float> weights = vector<float>() ); void register_and_convert(const vector<string>& strs, vector<WordID>& ids); - - - - void print_FD(); void run_tests(); void test_SetWeights(); diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc index 9975794f..5ae473e6 100644 --- a/dtrain/dtest.cc +++ b/dtrain/dtest.cc @@ -11,11 +11,13 @@ bool init(int argc, char** argv, po::variables_map* conf) { int N; + bool q; po::options_description opts( "Options" ); opts.add_options() - ( "decoder-config,c", po::value<string>(), "configuration file for cdec" ) - ( "weights,w", po::value<string>(), "weights file") - ( "ngrams,n", po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" ); + ( "decoder-config,c", po::value<string>(), "configuration file for cdec" ) + ( "weights,w", po::value<string>(), "weights file") + ( "ngrams,n", po::value<int>(&N)->default_value(4), "N for Ngrams (default 5)" ) + ( "quiet,q", po::value<bool>(&q)->default_value(true), "do not output translations" ); po::options_description cmdline_options; cmdline_options.add(opts); po::store( parse_command_line(argc, argv, cmdline_options), *conf ); @@ -44,6 +46,7 @@ main(int argc, char** argv) Decoder decoder(ini_rf.stream()); KBestGetter observer(k); size_t N = conf["ngrams"].as<int>(); + bool quiet = 
conf["quiet"].as<bool>(); Weights weights; weights.InitFromFile(conf["weights"].as<string>()); @@ -56,13 +59,15 @@ main(int argc, char** argv) string in, psg; size_t sid = 0; double overall = 0.0; - cerr << "(1 dot equals 100 lines of input)" << endl; + double overall1 = 0.0; + double overall2 = 0.0; + cerr << "(a dot equals 100 lines of input)" << endl; while( getline(cin, in) ) { if ( (sid+1) % 100 == 0 ) { cerr << "."; if ( (sid+1)%1000 == 0 ) cerr << endl; } - if ( sid > 5000 ) break; + //if ( sid > 5000 ) break; strs.clear(); boost::split( strs, in, boost::is_any_of("\t") ); // grammar @@ -75,19 +80,25 @@ main(int argc, char** argv) boost::split( ref_strs, strs[1], boost::is_any_of(" ") ); register_and_convert( ref_strs, ref_ids ); // scoring kbest - double score = 0; - Scores scores; + double score = 0.0; + double score1 = 0.0; + double score2 = 0.0; NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], 4 ); score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - ScorePair sp( kb->scores[0], score ); - scores.push_back( sp ); - //cout << TD::GetString( kb->sents[0] ) << endl; + score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N) ; + score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); + //if ( ! 
quiet ) + cout << TD::GetString( kb->sents[0] ) << endl; overall += score; + overall1 += score1; + overall2 += score2; sid += 1; } - cout << "Average score: " << overall/(sid+1) << endl; + cerr << "Average score (smooth): " << overall/(double)(sid+1) << endl; + cerr << "Average score (stupid): " << overall1/(double)(sid+1) << endl; + cerr << "Average score (normal): " << overall2/(double)(sid+1) << endl; cerr << endl; return 0; diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 95fc81af..373458e8 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,4 +1,5 @@ #include "dcommon.h" +#include "learner.h" @@ -45,41 +46,35 @@ main(int argc, char** argv) ReadFile ini_rf(conf["decoder-config"].as<string>()); Decoder decoder(ini_rf.stream()); KBestGetter observer(k); - size_t N = 4; // TODO as parameter/in config + size_t N = 3; // TODO as parameter/in config // TODO scoring metric as parameter/in config // for approx. bleu - //NgramCounts global_counts; - //size_t global_hyp_len; - //size_t global_ref_len; + NgramCounts global_counts(N); + size_t global_hyp_len = 0; + size_t global_ref_len = 0; Weights weights; SparseVector<double> lambdas; weights.InitSparseVector(&lambdas); vector<double> dense_weights; - lambdas.set_value(FD::Convert("logp"), 0); - - vector<string> strs, ref_strs; vector<WordID> ref_ids; string in, psg; size_t sid = 0; cerr << "(1 dot equals 100 lines of input)" << endl; while( getline(cin, in) ) { - //if ( !SILENT ) - // cerr << endl << endl << "Getting kbest for sentence #" << sid << endl; if ( (sid+1) % 100 == 0 ) { cerr << "."; if ( (sid+1)%1000 == 0 ) cerr << endl; } - if ( sid > 5000 ) break; + //if ( sid > 5000 ) break; // weights dense_weights.clear(); weights.InitFromVector( lambdas ); weights.InitVector( &dense_weights ); decoder.SetWeights( dense_weights ); - //if ( sid > 100 ) break; // handling input.. 
strs.clear(); boost::split( strs, in, boost::is_any_of("\t") ); @@ -94,33 +89,36 @@ main(int argc, char** argv) register_and_convert( ref_strs, ref_ids ); // scoring kbest double score = 0; + size_t cand_len = 0; Scores scores; - for ( size_t i = 0; i < k; i++ ) { - NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], 4 ); - score = smooth_bleu( counts, - ref_ids.size(), - kb->sents[i].size(), N ); + for ( size_t i = 0; i < kb->sents.size(); i++ ) { + NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N ); + if ( i == 0) { + global_counts += counts; + global_hyp_len += kb->sents[i].size(); + global_ref_len += ref_ids.size(); + cand_len = 0; + } else { + cand_len = kb->sents[i].size(); + } + //score = bleu( global_counts, + // global_ref_len, + // global_hyp_len + cand_len, N ); + score = bleu ( counts, ref_ids.size(), kb->sents[i].size(), N ); ScorePair sp( kb->scores[i], score ); scores.push_back( sp ); //cout << "'" << TD::GetString( ref_ids ) << "' vs '" << TD::GetString( kb->sents[i] ) << "' SCORE=" << score << endl; //cout << kb->feats[i] << endl; } - //cout << "###" << endl; + // learner SofiaLearner learner; learner.Init( sid, kb->feats, scores ); learner.Update(lambdas); - // initializing learner - // TODO - // updating weights - //lambdas.set_value( FD::Convert("use_shell"), 1 ); - //lambdas.set_value( FD::Convert("use_a"), 1 ); //print_FD(); sid += 1; // TODO does cdec count this already? 
} - - weights.WriteToFile( "weights-final", true ); - cerr << endl; + weights.WriteToFile( "data/weights-final-normalx", true ); return 0; } diff --git a/dtrain/learner.h b/dtrain/learner.h new file mode 100644 index 00000000..a953284d --- /dev/null +++ b/dtrain/learner.h @@ -0,0 +1,71 @@ +/*class Learnerx +{ + public: + virtual void Init(const vector<SparseVector<double> >& kbest, const Scores& scores) {}; + virtual void Update(SparseVector<double>& lambdas); +};*/ + +class SofiaLearner //: public Learnerx FIXME +{ + // TODO bool invert_score + public: + void + Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const*/ Scores& scores ) + { + assert( kbest.size() == scores.size() ); + ofstream o; + //unlink( "/tmp/sofia_ml_training_stupid" ); + o.open( "/tmp/sofia_ml_training_normalx", ios::trunc ); // TODO randomize, filename exists + int fid = 0; + map<int,int>::iterator ff; + + for ( size_t k = 0; k < kbest.size(); ++k ) { + map<int,double> m; + SparseVector<double>::const_iterator it = kbest[k].begin(); + o << scores[k].GetScore(); + for ( ; it != kbest[k].end(); ++it) { + ff = fmap.find( it->first ); + if ( ff == fmap.end() ) { + fmap.insert( pair<int,int>(it->first, fid) ); + fmap1.insert( pair<int,int>(fid, it->first) ); + fid++; + } + m.insert(pair<int,double>(fmap[it->first], it->second)); + } + map<int,double>::iterator ti = m.begin(); + for ( ; ti != m.end(); ++ti ) { + o << " " << ti->first << ":" << ti->second; + } + o << endl; + } + o.close(); + } + + void + Update(SparseVector<double>& lambdas) + { + string call = "./sofia-ml --training_file /tmp/sofia_ml_training_normalx --model_out /tmp/sofia_ml_model_normalx --loop_type stochastic --lambda 100 --dimensionality "; + std::stringstream out; + out << fmap.size(); + call += out.str(); + call += " &>/dev/null"; + system ( call.c_str() ); + ifstream i; + //unlink( "/tmp/sofia_ml_model_stupid" ); + i.open( "/tmp/sofia_ml_model_normalx", ios::in ); + string model; + getline( i, model ); 
+ vector<string> strs; + boost::split( strs, model, boost::is_any_of(" ") ); + int j = 0; + for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) { + lambdas.set_value(fmap1[j], atof( it->c_str() ) ); + j++; + } + } + + private: + map<int,int> fmap; + map<int,int> fmap1; +}; + diff --git a/dtrain/test.sh b/dtrain/test.sh index ad45bd1e..bc318ae7 100755 --- a/dtrain/test.sh +++ b/dtrain/test.sh @@ -1,4 +1,4 @@ #!/bin/sh -./dtrain -c data/cdec.ini -k 4 < data/in.blunsom08 #< data/in.toy +./dtrain -c data/cdec.ini -k 200 < data/in.blunsom08 #< data/in.toy |