From 2b3b084ba45bf4e2e2dc5152afed268b616ee308 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 9 Sep 2011 00:41:17 +0200 Subject: added svm, ksampler --- dtrain/Makefile.am | 2 +- dtrain/dtrain.cc | 140 ++++++++++++++++++++++++----------------- dtrain/kbestget.h | 6 +- dtrain/run.sh | 3 +- dtrain/sample.h | 64 +++++++++++-------- dtrain/test/EXAMPLE/cdec.ini | 2 +- dtrain/test/EXAMPLE/dtrain.ini | 6 +- dtrain/test/toy.dtrain.ini | 2 +- 8 files changed, 133 insertions(+), 92 deletions(-) (limited to 'dtrain') diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index afab00f2..c08cd1ea 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,7 +1,7 @@ # TODO I'm sure I can leave something out. bin_PROGRAMS = dtrain dtest -dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc +dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams dtest_SOURCES = dtest.cc score.cc util.cc diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 4554e417..d58478a8 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -3,6 +3,8 @@ #include "util.h" #include "sample.h" +#include "ksampler.h" + // boost compression #include #include @@ -11,6 +13,7 @@ //#include using namespace boost::iostreams; + #ifdef DTRAIN_DEBUG #include "tests.h" #endif @@ -101,6 +104,7 @@ ostream& _prec5( ostream& out ) { return out << setprecision(5); } int main( int argc, char** argv ) { + cout << setprecision( 5 ); // handle most parameters po::variables_map cfg; if ( ! init(argc, argv, &cfg) ) exit(1); // something is wrong @@ -143,7 +147,9 @@ main( int argc, char** argv ) if ( !quiet ) cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; Decoder decoder( ini_rf.stream() ); - KBestGetter observer( k, filter_type ); + //KBestGetter observer( k, filter_type ); + MT19937 rng; + KSampler observer( k, &rng ); // scoring metric/scorer string scorer_str = cfg["scorer"].as(); @@ -207,11 +213,13 @@ main( int argc, char** argv ) size_t cand_len = 0; double overall_time = 0.; - cout << setprecision( 5 ); - - // for the perceptron - double eta = 0.5; // TODO as parameter + // for the perceptron/SVM; TODO as params + double eta = 0.0005; + double gamma = 0.01; // -> SVM lambdas.add_value( FD::Convert("__bias"), 0 ); + + // for random sampling + srand ( time(NULL) ); for ( size_t t = 0; t < T; t++ ) // T epochs @@ -284,44 +292,44 @@ main( int argc, char** argv ) weights.InitVector( &dense_weights ); decoder.SetWeights( dense_weights ); - srand ( time(NULL) ); - - switch ( t ) { - case 0: - // handling input - in_split.clear(); - boost::split( in_split, in, boost::is_any_of("\t") ); - // in_split[0] is id - //cout << in_split[0] << endl; - // getting reference - ref_tok.clear(); ref_ids.clear(); - boost::split( ref_tok, in_split[2], boost::is_any_of(" ") ); - register_and_convert( ref_tok, ref_ids ); - ref_ids_buf.push_back( ref_ids ); - // process and set grammar - //grammar_buf << in_split[3] << endl; - grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __ - grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl; - decoder.SetSentenceGrammarFromString( grammar_str ); - // decode, kbest - src_str_buf.push_back( in_split[1] ); - decoder.Decode( in_split[1], &observer ); - break; - default: - // get buffered grammar - grammar_str.clear(); - int i = 1; - while ( true ) { - string g; - getline( grammar_buf_in, g ); - if ( g == DTRAIN_GRAMMAR_DELIM ) break; - grammar_str += g+"\n"; - i += 1; + if ( t == 0 ) { + // handling input + in_split.clear(); + boost::split( in_split, in, boost::is_any_of("\t") ); // in_split[0] is id + // getting reference + ref_tok.clear(); ref_ids.clear(); + boost::split( ref_tok, in_split[2], boost::is_any_of(" ") ); + register_and_convert( ref_tok, ref_ids ); + ref_ids_buf.push_back( ref_ids ); + // process and set grammar + bool broken_grammar = true; + for ( string::iterator ti = in_split[3].begin(); ti != in_split[3].end(); ti++ ) { + if ( !isspace(*ti) ) { + broken_grammar = false; + break; } - decoder.SetSentenceGrammarFromString( grammar_str ); - // decode, kbest - decoder.Decode( src_str_buf[sid], &observer ); - break; + } + if ( broken_grammar ) continue; + grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __ + grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl; + decoder.SetSentenceGrammarFromString( grammar_str ); + // decode, kbest + src_str_buf.push_back( in_split[1] ); + decoder.Decode( in_split[1], &observer ); + } else { + // get buffered grammar + grammar_str.clear(); + int i = 1; + while ( true ) { + string g; + getline( grammar_buf_in, g ); + if ( g == DTRAIN_GRAMMAR_DELIM ) break; + grammar_str += g+"\n"; + i += 1; + } + decoder.SetSentenceGrammarFromString( grammar_str ); + // decode, kbest + decoder.Decode( src_str_buf[sid], &observer ); } // get kbest list @@ -346,6 +354,7 @@ main( int argc, char** argv ) cand_len = kb->sents[i].size(); } NgramCounts counts_tmp = global_counts + counts; + // TODO as param score = 0.9 * scorer( counts_tmp, global_ref_len, global_hyp_len + cand_len, N, bleu_weights ); @@ -380,31 +389,48 @@ main( int argc, char** argv ) TrainingInstances pairs; - sample_all(kb, pairs); + sample_all_rand(kb, pairs); + cout << pairs.size() << endl; for ( TrainingInstances::iterator ti = pairs.begin(); ti != pairs.end(); ti++ ) { - // perceptron + SparseVector dv; - if ( ti->type == -1 ) { + if ( ti->first_score - ti->second_score < 0 ) { dv = ti->second - ti->first; - } else { - dv = ti->first - ti->second; - } - dv.add_value(FD::Convert("__bias"), -1); + //} else { + //dv = ti->first - ti->second; + //} + dv.add_value( FD::Convert("__bias"), -1 ); + + SparseVector reg; + reg = lambdas * ( 2 * gamma ); + dv -= reg; lambdas += dv * eta; - /*if ( verbose ) { - cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl; - cout << " i " << TD::GetString(kb->sents[ii]) << endl; - cout << " " << kb->feats[ii] << endl; - cout << " j " << TD::GetString(kb->sents[jj]) << endl; - cout << " " << kb->feats[jj] << endl; - cout << " dv " << dv << endl; + if ( verbose ) { + cout << "{{ f("<< ti->first_rank <<") > f(" << ti->second_rank << ") but g(i)="<< ti->first_score <<" < g(j)="<< ti->second_score << " so update" << endl; + cout << " i " << TD::GetString(kb->sents[ti->first_rank]) << endl; + cout << " " << kb->feats[ti->first_rank] << endl; + cout << " j " << TD::GetString(kb->sents[ti->second_rank]) << endl; + cout << " " << kb->feats[ti->second_rank] << endl; + cout << " diff vec: " << dv << endl; + cout << " lambdas after update: " << lambdas << endl; cout << "}}" << endl; - }*/ + } + + } else { + //if ( 0 ) { + SparseVector reg; + reg = lambdas * ( gamma * 2 ); + lambdas += reg * ( -eta ); + //} + } } + //double l2 = lambdas.l2norm(); + //if ( l2 ) lambdas /= lambdas.l2norm(); + } ++sid; diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index ae4588c9..cf466fe4 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -1,8 +1,10 @@ #ifndef _DTRAIN_KBESTGET_H_ #define _DTRAIN_KBESTGET_H_ + #include "kbest.h" + namespace dtrain { @@ -65,7 +67,7 @@ struct KBestGetter : public DecoderObserver if (!d) break; kb.sents.push_back( d->yield); kb.feats.push_back( d->feature_values ); - kb.model_scores.push_back( d->score ); + kb.model_scores.push_back( log(d->score) ); } } @@ -83,7 +85,7 @@ struct KBestGetter : public DecoderObserver if (!d) break; kb.sents.push_back( d->yield); kb.feats.push_back( d->feature_values ); - kb.model_scores.push_back( d->score ); + kb.model_scores.push_back( log(d->score) ); } } }; diff --git a/dtrain/run.sh b/dtrain/run.sh index b2012bcf..16575c25 100755 --- a/dtrain/run.sh +++ b/dtrain/run.sh @@ -3,7 +3,8 @@ #INI=test/blunsom08.dtrain.ini #INI=test/nc-wmt11/dtrain.ini #INI=test/EXAMPLE/dtrain.ini -INI=test/toy.dtrain.ini +INI=test/EXAMPLE/dtrain.ruleids.ini +#INI=test/toy.dtrain.ini rm /tmp/dtrain-* ./dtrain -c $INI $1 $2 $3 $4 diff --git a/dtrain/sample.h b/dtrain/sample.h index b9bc4461..b6aa9abd 100644 --- a/dtrain/sample.h +++ b/dtrain/sample.h @@ -1,3 +1,7 @@ +#ifndef _DTRAIN_SAMPLE_H_ +#define _DTRAIN_SAMPLE_H_ + + #include "kbestget.h" @@ -7,9 +11,9 @@ namespace dtrain struct TPair { - double type; - SparseVector first; - SparseVector second; + SparseVector first, second; + size_t first_rank, second_rank; + double first_score, second_score; }; typedef vector TrainingInstances; @@ -18,35 +22,43 @@ typedef vector TrainingInstances; void sample_all( KBestList* kb, TrainingInstances &training ) { - double type; for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { - for ( size_t j = i+1; j < kb->GetSize(); j++ ) { - if ( kb->scores[i] - kb->scores[j] < 0 ) { - type = -1; - } else { - type = 1; - } - TPair p; - p.type = type; - p.first = kb->feats[i]; - p.second = kb->feats[j]; - training.push_back( p ); - } - } -} - -/*void -sample_all_only_neg(, vector > pairs) -{ - + for ( size_t j = i+1; j < kb->GetSize(); j++ ) { + TPair p; + p.first = kb->feats[i]; + p.second = kb->feats[j]; + p.first_rank = i; + p.second_rank = j; + p.first_score = kb->scores[i]; + p.second_score = kb->scores[j]; + training.push_back( p ); + } + } } void -sample_random_pos() +sample_all_rand( KBestList* kb, TrainingInstances &training ) { - if ( rand() % 2 ) { // sample it? -}*/ + srand( time(NULL) ); + for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { + for ( size_t j = i+1; j < kb->GetSize(); j++ ) { + if ( rand() % 2 ) { + TPair p; + p.first = kb->feats[i]; + p.second = kb->feats[j]; + p.first_rank = i; + p.second_rank = j; + p.first_score = kb->scores[i]; + p.second_score = kb->scores[j]; + training.push_back( p ); + } + } + } +} } // namespace + +#endif + diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini index e57138b0..bda0d12c 100644 --- a/dtrain/test/EXAMPLE/cdec.ini +++ b/dtrain/test/EXAMPLE/cdec.ini @@ -2,6 +2,6 @@ formalism=scfg add_pass_through_rules=true feature_function=WordPenalty cubepruning_pop_limit=30 -feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz +feature_function=KLanguageModel /home/pks/src/cdec/dtrain/data/nc-wmt11.en.srilm.gz scfg_max_span_limit=15 diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini index ffafd0b8..7645921a 100644 --- a/dtrain/test/EXAMPLE/dtrain.ini +++ b/dtrain/test/EXAMPLE/dtrain.ini @@ -1,10 +1,10 @@ decoder_config=test/EXAMPLE/cdec.ini kbest=100 ngrams=3 -epochs=8 +epochs=3 input=test/EXAMPLE/dtrain.nc-1k -scorer=approx_bleu +scorer=stupid_bleu output=test/EXAMPLE/weights.gz -stop_after=1000 +stop_after=100 wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini index 35f76281..e9ed0ce5 100644 --- a/dtrain/test/toy.dtrain.ini +++ b/dtrain/test/toy.dtrain.ini @@ -4,7 +4,7 @@ ngrams=1 epochs=3 input=test/toy.in scorer=bleu -output=toy.gz +output=data/w/toy.gz #stop_after=1000 wprint=logp use_shell use_house PassThrough -- cgit v1.2.3