diff options
-rw-r--r-- | dtrain/dtrain.cc | 77 | ||||
-rw-r--r-- | dtrain/dtrain.h | 25 | ||||
-rw-r--r-- | dtrain/kbestget.h | 46 | ||||
-rw-r--r-- | dtrain/ksampler.h | 11 | ||||
-rw-r--r-- | dtrain/pairsampling.h | 6 | ||||
-rw-r--r-- | dtrain/score.cc | 37 | ||||
-rw-r--r-- | dtrain/score.h | 44 | ||||
-rw-r--r-- | dtrain/test/example/dtrain.ini | 8 | ||||
-rw-r--r-- | dtrain/test/example/weights.gz | bin | 12001 -> 395 bytes |
9 files changed, 98 insertions, 156 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 35e6cc46..622cd01e 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -95,38 +95,32 @@ main(int argc, char** argv) cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; Decoder decoder(ini_rf.stream()); - MT19937 rng; // random number generator - // setup decoder observer - HypSampler* observer; - if (sample_from == "kbest") { - observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); - } else { - observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); - } - // scoring metric/scorer string scorer_str = cfg["scorer"].as<string>(); - /*score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); + LocalScorer* scorer; if (scorer_str == "bleu") { - scorer = &bleu; } else if (scorer_str == "stupid_bleu") { - scorer = &stupid_bleu; + scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); } else if (scorer_str == "smooth_bleu") { - scorer = &smooth_bleu; + scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer); } else if (scorer_str == "approx_bleu") { - scorer = &approx_bleu; + scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); // FIXME } else { cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl; exit(1); } - NgramCounts global_counts(N); // counts for 1 best translations - unsigned global_hyp_len = 0; // sum hypothesis lengths - unsigned global_ref_len = 0; // sum reference lengths - // ^^^ global_* for approx_bleu*/ - vector<score_t> bleu_weights; // we leave this empty -> 1/N - //if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; - StupidBleuScorer scorer; - scorer.Init(N, bleu_weights); + vector<score_t> bleu_weights; + scorer->Init(N, bleu_weights); + if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; + + // setup decoder observer + MT19937 rng; // random number generator + HypSampler* observer; + if (sample_from == "kbest") + observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); + else + observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); + observer->SetScorer(scorer); // init weights Weights weights; @@ -240,10 +234,10 @@ main(int argc, char** argv) vector<WordID> ref_ids; // reference as vector<WordID> if (t == 0) { // handling input - strsplit(in, in_split, '\t', 4); + boost::split(in_split, in, boost::is_any_of("\t")); // getting reference vector<string> ref_tok; - strsplit(in_split[2], ref_tok, ' '); + boost::split(ref_tok, in_split[2], boost::is_any_of(" ")); register_and_convert(ref_tok, ref_ids); ref_ids_buf.push_back(ref_ids); // process and set grammar @@ -259,8 +253,9 @@ main(int argc, char** argv) in_split[3] += "\n"; grammar_buf_out << in_split[3] << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; decoder.SetSentenceGrammarFromString(in_split[3]); - // decode src_str_buf.push_back(in_split[1]); + // decode + observer->SetRef(ref_ids); decoder.Decode(in_split[1], observer); } else { // get buffered grammar @@ -273,32 +268,24 @@ main(int argc, char** argv) } decoder.SetSentenceGrammarFromString(grammar_str); // decode + observer->SetRef(ref_ids_buf[ii]); decoder.Decode(src_str_buf[ii], observer); } + // get (scored) samples vector<ScoredHyp>* samples = observer->GetSamples(); - // (local) scoring - if (t > 0) ref_ids = ref_ids_buf[ii]; - for (unsigned i = 0; i < samples->size(); i++) { - //cout << ii << " " << i << endl; - - cout << _p9; - (*samples)[i].score = scorer.Score((*samples)[i], ref_ids, ii); - if (i == 0) { - score_sum += (*samples)[i].score; - model_sum += (*samples)[i].model; - } - - if (verbose) { - if (i == 0) cerr << "'" << TD::GetString(ref_ids) << "' [ref]" << endl; - cerr << _p5 << _np << "[hyp " << i << "] " << "'" << TD::GetString((*samples)[i].w) << "'"; - cerr << " [SCORE=" << (*samples)[i].score << ",model="<< (*samples)[i].model << "]" << endl; - cerr << (*samples)[i].f << endl; - } + if (verbose) { + cout << "[ref: '"; + if (t > 0) cout << ref_ids_buf[ii]; + else cout << ref_ids; + cout << endl; + cout << _p5 << _np << "1best: " << "'" << (*samples)[0].w << "'" << endl; + cout << "SCORE=" << (*samples)[0].score << ",model="<< (*samples)[0].model << endl; + cout << "F{" << (*samples)[0].f << "} ]" << endl << endl; } - - if (verbose) cerr << endl; + score_sum += (*samples)[0].score; + model_sum += (*samples)[0].model; ////////////////////////////////////////////////////////// // UPDATE WEIGHTS diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index ed75a297..0c27167d 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -7,14 +7,6 @@ #include <boost/algorithm/string.hpp> #include <boost/program_options.hpp> -#include "verbose.h" -#include "viterbi.h" -#include "ff_register.h" -#include "decoder.h" -#include "weights.h" - -#include "score.h" -#include "kbestget.h" #include "ksampler.h" #include "pairsampling.h" @@ -31,27 +23,12 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids for (it = strs.begin(); it < strs.end(); it++) ids.push_back(TD::Convert(*it)); } + inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } inline ostream& _p2(ostream& out) { return out << setprecision(2); } inline ostream& _p5(ostream& out) { return out << setprecision(5); } inline ostream& _p9(ostream& out) { return out << setprecision(9); } -inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) { - stringstream ss(s); - string t; - unsigned i = 0; - while(true) - { - if (parts > 0 && i == parts-1) { - getline(ss, t); - v.push_back(t); - break; - } - if (!getline(ss, t, d)) break; - v.push_back(t); - i++; - } -} #endif diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index 2a2c6073..c0fd3f47 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -1,12 +1,6 @@ #ifndef _DTRAIN_KBESTGET_H_ #define _DTRAIN_KBESTGET_H_ - -#include <vector> -#include <string> - -using namespace std; - #include "kbest.h" // cdec #include "verbose.h" #include "viterbi.h" @@ -14,11 +8,13 @@ using namespace std; #include "decoder.h" #include "weights.h" +using namespace std; + namespace dtrain { -typedef double score_t; // float +typedef double score_t; // float struct ScoredHyp { @@ -29,10 +25,44 @@ struct ScoredHyp unsigned rank; }; +struct LocalScorer +{ + unsigned N_; + vector<score_t> w_; + + virtual score_t + Score(vector<WordID>& hyp, vector<WordID>& ref)=0; + + void + Init(unsigned N, vector<score_t> weights) + { + assert(N > 0); + N_ = N; + if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); + else w_ = weights; + } + + score_t + brevity_penaly(const unsigned hyp_len, const unsigned ref_len) + { + if (hyp_len > ref_len) return 1; + return exp(1 - (score_t)ref_len/hyp_len); + } +}; + struct HypSampler : public DecoderObserver { + LocalScorer* scorer_; + vector<WordID>* ref_; virtual vector<ScoredHyp>* GetSamples()=0; + void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } + void SetRef(vector<WordID>& ref) { ref_ = &ref; } }; +///////////////////////////////////////////////////////////////////// +// wtf + + + struct KBestGetter : public HypSampler { @@ -77,6 +107,7 @@ struct KBestGetter : public HypSampler h.f = d->feature_values; h.model = log(d->score); h.rank = i; + h.score = scorer_->Score(h.w, *ref_); s_.push_back(h); } } @@ -95,6 +126,7 @@ struct KBestGetter : public HypSampler h.f = d->feature_values; h.model = log(d->score); h.rank = i; + h.score = scorer_->Score(h.w, *ref_); s_.push_back(h); } } diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index 767dc42e..7567f43a 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -1,15 +1,9 @@ #ifndef _DTRAIN_KSAMPLER_H_ #define _DTRAIN_KSAMPLER_H_ -#include "kbestget.h" #include "hgsampler.h" -#include <vector> -#include <string> - -using namespace std; - -#include "kbest.h" // cdec -#include "sampler.h" +#include "kbestget.h" +#include "score.h" namespace dtrain { @@ -43,6 +37,7 @@ struct KSampler : public HypSampler h.f = samples[i].fmap; h.model = log(samples[i].model_score); h.rank = i; + h.score = scorer_->Score(h.w, *ref_); s_.push_back(h); } } diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 4a6d93d1..6db0c045 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -1,12 +1,6 @@ #ifndef _DTRAIN_PAIRSAMPLING_H_ #define _DTRAIN_PAIRSAMPLING_H_ -#include "kbestget.h" -#include "score.h" -#include <vector> -#include <string> -using namespace std; -#include "sampler.h" // cdec, MT19937 namespace dtrain { diff --git a/dtrain/score.cc b/dtrain/score.cc index 9b22508b..93c4e80b 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -22,17 +22,17 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref score_t sum = 0; for (unsigned i = 0; i < M; i++) { if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; - sum += w_[i] * log((score_t)counts.clipped[i] / counts.sum[i]); + sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]); } return brevity_penaly(hyp_len, ref_len) * exp(sum); } score_t -BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) +BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref) { - unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size(); + unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; - NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); + NgramCounts counts = make_ngram_counts(hyp, ref, N_); return Bleu(counts, hyp_len, ref_len); } @@ -47,30 +47,18 @@ BleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) * NOTE: 0 iff no 1gram match */ score_t -StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) +StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref) { - unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size(); + unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; - NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); + NgramCounts counts = make_ngram_counts(hyp, ref, N_); unsigned M = N_; if (ref_len < N_) M = ref_len; score_t sum = 0, add = 0; for (unsigned i = 0; i < M; i++) { if (i == 1) add = 1; - //cout << ((score_t)counts.clipped[i] + add) << "/" << counts.sum[i] +add << "." << endl; - //cout << "w_[i] " << w_[i] << endl; - sum += w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))); - //cout << "sum += "<< w_[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add))) << endl; + sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add))); } - /*cout << ref_ids << endl; - cout << hyp.w << endl; - cout << "ref_len " << ref_len << endl; - cout << "hyp_len " << hyp_len << endl; - cout << "bp " << brevity_penaly(hyp_len, ref_len) << endl; - cout << "exp(sum) " << exp(sum) << endl; - counts.Print(); - cout << brevity_penaly(hyp_len, ref_len) * exp(sum) << endl; - cout << "---" << endl;*/ return brevity_penaly(hyp_len, ref_len) * exp(sum); } @@ -84,21 +72,22 @@ StupidBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) * NOTE: max is 0.9375 */ score_t -SmoothBleuScorer::Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id) +SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref) { - unsigned hyp_len = hyp.w.size(), ref_len = ref_ids.size(); + unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; - NgramCounts counts = make_ngram_counts(hyp.w, ref_ids, N_); + NgramCounts counts = make_ngram_counts(hyp, ref, N_); score_t sum = 0; unsigned j = 1; for (unsigned i = 0; i < N_; i++) { if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N_-j+1); + sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1); j++; } return brevity_penaly(hyp_len, ref_len) * sum; } +// FIXME /* * approx. bleu * diff --git a/dtrain/score.h b/dtrain/score.h index f87d708c..9af56ef9 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -1,16 +1,8 @@ #ifndef _DTRAIN_SCORE_H_ #define _DTRAIN_SCORE_H_ -#include <iostream> -#include <vector> -#include <map> -#include <cassert> -#include <cmath> - #include "kbestget.h" -#include "wordid.h" // cdec - using namespace std; namespace dtrain @@ -111,51 +103,28 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un return counts; } -struct LocalScorer -{ - unsigned N_; - vector<score_t> w_; - - virtual score_t - Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id)=0; - - void - Init(unsigned N, vector<score_t> weights) - { - assert(N > 0); - N_ = N; - if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); - else w_ = weights; - } - - score_t - brevity_penaly(const unsigned hyp_len, const unsigned ref_len) - { - if (hyp_len > ref_len) return 1; - return exp(1 - (score_t)ref_len/hyp_len); - } -}; - struct BleuScorer : public LocalScorer { score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); - score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); + score_t Score(vector<WordID>& hyp, vector<WordID>& ref_ids); }; struct StupidBleuScorer : public LocalScorer { - score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); + score_t Score(vector<WordID>& hyp, vector<WordID>& ref); }; struct SmoothBleuScorer : public LocalScorer { - score_t Score(ScoredHyp& hyp, vector<WordID>& ref_ids, unsigned id); + score_t Score(vector<WordID>& hyp, vector<WordID>& ref); }; // FIXME /*struct ApproxBleuScorer : public LocalScorer { - NgramCounts glob_onebest_counts; + bool prepped; + + NgramCounts* glob_onebest_counts; unsigned glob_hyp_len, glob_ref_len; void Prep(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); @@ -171,7 +140,6 @@ struct SmoothBleuScorer : public LocalScorer };*/ - } // namespace #endif diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index df746e51..fd3a3841 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,11 +1,11 @@ decoder_config=test/example/cdec.ini k=100 N=4 -epochs=10 +epochs=100 input=test/example/nc-1k.gz scorer=stupid_bleu output=test/example/weights.gz -stop_after=10 -sample_from=kbest -pair_sampling=all +stop_after=0 +sample_from=forest +pair_sampling=rand print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz Binary files differindex e7baa367..7960a05a 100644 --- a/dtrain/test/example/weights.gz +++ b/dtrain/test/example/weights.gz |