-rw-r--r-- | dtrain/Makefile.am | 2
-rw-r--r-- | dtrain/dtrain.cc | 10
-rw-r--r-- | dtrain/hgsampler.cc (renamed from dtrain/sample_hg.cc) | 2
-rw-r--r-- | dtrain/hgsampler.h (renamed from dtrain/sample_hg.h) | 17
-rw-r--r-- | dtrain/ksampler.h | 2
-rw-r--r-- | dtrain/pairsampling.h | 35
-rw-r--r-- | dtrain/score.cc | 165
-rw-r--r-- | dtrain/score.h | 53
8 files changed, 133 insertions, 153 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 9b5df8bf..12084a70 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,7 +1,7 @@
 # TODO I'm sure I can leave something out.
 bin_PROGRAMS = dtrain
 
-dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc
+dtrain_SOURCES = dtrain.cc score.cc hgsampler.cc
 dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 01821b30..01119997 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -347,10 +347,9 @@ main( int argc, char** argv )
         cand_len = kb->sents[i].size();
       }
       NgramCounts counts_tmp = global_counts + counts;
-      // TODO as param
-      score = 0.9 * scorer( counts_tmp,
-                            global_ref_len,
-                            global_hyp_len + cand_len, N, bleu_weights );
+      score = .9*scorer( counts_tmp,
+                         global_ref_len,
+                         global_hyp_len + cand_len, N, bleu_weights );
     } else {
       // other scorers
       cand_len = kb->sents[i].size();
@@ -381,7 +380,8 @@ main( int argc, char** argv )
 
     if ( !noup ) {
       TrainingInstances pairs;
-      sample_all( kb, pairs );
+      sample_all_pairs(kb, pairs);
+      //sample_rand_pairs( kb, pairs, &rng );
       for ( TrainingInstances::iterator ti = pairs.begin();
             ti != pairs.end(); ti++ ) {
diff --git a/dtrain/sample_hg.cc b/dtrain/hgsampler.cc
index 33872fb8..7a00a3d3 100644
--- a/dtrain/sample_hg.cc
+++ b/dtrain/hgsampler.cc
@@ -1,4 +1,4 @@
-#include "sample_hg.h"
+#include "hgsampler.h"
 
 #include <queue>
 
diff --git a/dtrain/sample_hg.h b/dtrain/hgsampler.h
index 932fd369..b840c07f 100644
--- a/dtrain/sample_hg.h
+++ b/dtrain/hgsampler.h
@@ -1,5 +1,6 @@
-#ifndef _SAMPLE_HG_H_
-#define _SAMPLE_HG_H_
+#ifndef _DTRAIN_HGSAMPLER_H_
+#define _DTRAIN_HGSAMPLER_H_
+
 #include <vector>
 
 #include "sparse_vector.h"
@@ -9,16 +10,20 @@ class Hypergraph;
 
 struct HypergraphSampler {
+
   struct Hypothesis {
     std::vector<WordID> words;
     SparseVector<double> fmap;
     prob_t model_score;
   };
 
-  static void sample_hypotheses(const Hypergraph& hg,
-                                unsigned n,
-                                MT19937* rng,
-                                std::vector<Hypothesis>* hypos);
+  static void
+  sample_hypotheses(const Hypergraph& hg,
+                    unsigned n,
+                    MT19937* rng,
+                    std::vector<Hypothesis>* hypos);
 };
 
+
 #endif
+
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index a28b69c9..914e9723 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -2,7 +2,7 @@
 #define _DTRAIN_KSAMPLER_H_
 
 #include "kbest.h"
-#include "sample_hg.h"
+#include "hgsampler.h"
 #include "sampler.h"
 
 namespace dtrain
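For orientation between the renamed files, a minimal usage sketch of the sampler interface declared in hgsampler.h above (the decoder call that builds the Hypergraph is assumed and not shown; the function name sample_from_forest is invented for illustration):

#include <vector>
#include "hg.h"         // cdec Hypergraph
#include "sampler.h"    // cdec MT19937
#include "hgsampler.h"

// Draw 100 random derivations from a decoder forest; each Hypothesis
// carries the sampled words, its feature vector and its model score.
void sample_from_forest(const Hypergraph& hg, MT19937* rng)
{
  std::vector<HypergraphSampler::Hypothesis> hypos;
  HypergraphSampler::sample_hypotheses(hg, 100, rng, &hypos);
}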
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 502901af..9774ba4a 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -1,9 +1,8 @@
-#ifndef _DTRAIN_SAMPLE_H_
-#define _DTRAIN_SAMPLE_H_
-
+#ifndef _DTRAIN_PAIRSAMPLING_H_
+#define _DTRAIN_PAIRSAMPLING_H_
 
 #include "kbestget.h"
-
+#include "sampler.h" // cdec MT19937
 
 namespace dtrain
 {
@@ -11,19 +10,18 @@ namespace dtrain
 
 struct TPair
 {
-  SparseVector<double> first, second;
-  size_t first_rank, second_rank;
-  double first_score, second_score;
+  SparseVector<double> first, second;
+  size_t               first_rank, second_rank;
+  double               first_score, second_score;
 };
 typedef vector<TPair> TrainingInstances;
 
-
 void
-sample_all( KBestList* kb, TrainingInstances &training )
+sample_all_pairs(KBestList* kb, TrainingInstances &training)
 {
-  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
-    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+  for (size_t i = 0; i < kb->GetSize()-1; i++) {
+    for (size_t j = i+1; j < kb->GetSize(); j++) {
       TPair p;
       p.first = kb->feats[i];
       p.second = kb->feats[j];
@@ -31,18 +29,18 @@ sample_all( KBestList* kb, TrainingInstances &training )
       p.second_rank = j;
       p.first_score = kb->scores[i];
       p.second_score = kb->scores[j];
-      training.push_back( p );
+      training.push_back(p);
     }
   }
 }
 
 void
-sample_rand( KBestList* kb, TrainingInstances &training )
+sample_rand_pairs(KBestList* kb, TrainingInstances &training, MT19937* prng)
 {
-  srand( time(NULL) );
-  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
-    for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
-      if ( rand() % 2 ) {
+  srand(time(NULL));
+  for (size_t i = 0; i < kb->GetSize()-1; i++) {
+    for (size_t j = i+1; j < kb->GetSize(); j++) {
+      if (prng->next() < .5) {
         TPair p;
         p.first = kb->feats[i];
         p.second = kb->feats[j];
@@ -50,10 +48,11 @@ sample_rand( KBestList* kb, TrainingInstances &training )
         p.second_rank = j;
         p.first_score = kb->scores[i];
         p.second_score = kb->scores[j];
-        training.push_back( p );
+        training.push_back(p);
       }
     }
   }
+  cout << training.size() << " sampled" << endl;
 }
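For scale (the arithmetic is a note on the code above, not part of the commit): sample_all_pairs emits every rank pair of a k-best list of size n, while sample_rand_pairs keeps each pair independently with probability 1/2 via the prng->next() < .5 test, so it yields half as many in expectation:

    \binom{n}{2} = \frac{n(n-1)}{2} \ \text{(all pairs)},
    \qquad
    \frac{1}{2}\binom{n}{2} = \frac{n(n-1)}{4} \ \text{(random pairs, expected)}

For a 100-best list that is 4950 versus roughly 2475 training instances per sentence.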
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 1e98c11d..d08e87f3 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -1,166 +1,149 @@
 #include "score.h"
-
 namespace dtrain
 {
-/******************************************************************************
- * NGRAMS
- *
- *
- * make_ngrams
- *
- */
-typedef map<vector<WordID>, size_t> Ngrams;
 Ngrams
-make_ngrams( vector<WordID>& s, size_t N )
+make_ngrams(vector<WordID>& s, size_t N)
 {
   Ngrams ngrams;
   vector<WordID> ng;
-  for ( size_t i = 0; i < s.size(); i++ ) {
+  for (size_t i = 0; i < s.size(); i++) {
     ng.clear();
-    for ( size_t j = i; j < min( i+N, s.size() ); j++ ) {
-      ng.push_back( s[j] );
+    for (size_t j = i; j < min(i+N, s.size()); j++) {
+      ng.push_back(s[j]);
       ngrams[ng]++;
     }
   }
   return ngrams;
 }
-
-/*
- * ngram_matches
- *
- */
 NgramCounts
-make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N )
+make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
 {
-  Ngrams hyp_ngrams = make_ngrams( hyp, N );
-  Ngrams ref_ngrams = make_ngrams( ref, N );
-  NgramCounts counts( N );
+  Ngrams hyp_ngrams = make_ngrams(hyp, N);
+  Ngrams ref_ngrams = make_ngrams(ref, N);
+  NgramCounts counts(N);
   Ngrams::iterator it;
   Ngrams::iterator ti;
-  for ( it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++ ) {
-    ti = ref_ngrams.find( it->first );
-    if ( ti != ref_ngrams.end() ) {
-      counts.add( it->second, ti->second, it->first.size() - 1 );
+  for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
+    ti = ref_ngrams.find(it->first);
+    if (ti != ref_ngrams.end()) {
+      counts.add(it->second, ti->second, it->first.size() - 1);
     } else {
-      counts.add( it->second, 0, it->first.size() - 1 );
+      counts.add(it->second, 0, it->first.size() - 1);
     }
   }
   return counts;
 }
-
-/******************************************************************************
- * SCORERS
- *
+/*
+ * bleu
 *
- * brevity_penaly
+ * as in "BLEU: a Method for Automatic Evaluation
+ *        of Machine Translation"
+ *        (Papineni et al. '02)
 *
+ * NOTE: 0 if one n in {1..N} has 0 count
 */
 double
-brevity_penaly( const size_t hyp_len, const size_t ref_len )
+brevity_penaly(const size_t hyp_len, const size_t ref_len)
 {
-  if ( hyp_len > ref_len ) return 1;
-  return exp( 1 - (double)ref_len/(double)hyp_len );
+  if (hyp_len > ref_len) return 1;
+  return exp(1 - (double)ref_len/(double)hyp_len);
 }
-
-
-/*
- * bleu
- * as in "BLEU: a Method for Automatic Evaluation of Machine Translation" (Papineni et al. '02)
- * page TODO
- * 0 if for N one of the counts = 0
- */
 double
-bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<float> weights )
+bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+     size_t N, vector<float> weights )
 {
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
+  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (ref_len < N) N = ref_len;
   float N_ = (float)N;
-  if ( weights.empty() )
+  if (weights.empty())
   {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
   }
   double sum = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0 ) return 0;
-    sum += weights[i] * log( (double)counts.clipped[i] / (double)counts.sum[i] );
+  for (size_t i = 0; i < N; i++) {
+    if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
+    sum += weights[i] * log((double)counts.clipped[i] / (double)counts.sum[i]);
   }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+  return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
-
 /*
- * stupid_bleu
- * as in "ORANGE: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation (Lin & Och '04)
- * page TODO
- * 0 iff no 1gram match
+ * 'stupid' bleu
+ *
+ * as in "ORANGE: a Method for Evaluating
+ *        Automatic Evaluation Metrics
+ *        for Machine Translation"
+ *        (Lin & Och '04)
+ *
+ * NOTE: 0 iff no 1gram match
 */
 double
-stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<float> weights )
+stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+            size_t N, vector<float> weights )
 {
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
-  if ( ref_len < N ) N = ref_len;
+  if (hyp_len == 0 || ref_len == 0) return 0;
+  if (ref_len < N) N = ref_len;
   float N_ = (float)N;
-  if ( weights.empty() )
+  if (weights.empty())
   {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
   }
   double sum = 0;
   float add = 0;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( i == 1 ) add = 1;
-    sum += weights[i] * log( ((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add) );
+  for (size_t i = 0; i < N; i++) {
+    if (i == 1) add = 1;
+    sum += weights[i] * log(((double)counts.clipped[i] + add) / ((double)counts.sum[i] + add));
   }
-  return brevity_penaly( hyp_len, ref_len ) * exp( sum );
+  return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
-
 /*
- * smooth_bleu
- * as in "An End-to-End Discriminative Approach to Machine Translation" (Liang et al. '06)
- * page TODO
- * max. 0.9375
+ * smooth bleu
+ *
+ * as in "An End-to-End Discriminative Approach
+ *        to Machine Translation"
+ *        (Liang et al. '06)
+ *
+ * NOTE: max is 0.9375
 */
 double
-smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights )
+smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+            const size_t N, vector<float> weights )
 {
-  if ( hyp_len == 0 || ref_len == 0 ) return 0;
+  if (hyp_len == 0 || ref_len == 0) return 0;
   float N_ = (float)N;
-  if ( weights.empty() )
+  if (weights.empty())
   {
-    for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ );
+    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
  }
   double sum = 0;
   float j = 1;
-  for ( size_t i = 0; i < N; i++ ) {
-    if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 );
+  for (size_t i = 0; i < N; i++) {
+    if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
+    sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow(2, N_-j+1);
     j++;
   }
-  return brevity_penaly( hyp_len, ref_len ) * sum;
+  return brevity_penaly(hyp_len, ref_len) * sum;
 }
-
 /*
- * approx_bleu
- * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07)
- * CHIANG, RESNIK, synt struct features
- * .9*
- * page TODO
+ * approx. bleu
 *
+ * as in "Online Large-Margin Training of Syntactic
+ *        and Structural Translation Features"
+ *        (Chiang et al. '08)
 */
 double
-approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             const size_t N, vector<float> weights )
+approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
+            const size_t N, vector<float> weights)
 {
-  return bleu( counts, hyp_len, ref_len, N, weights );
+  return brevity_penaly(hyp_len, ref_len)
+         * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
 }
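Summarizing the scorers in score.cc above as formulas (a transcription of the code, not text from the commit), with p_n = clipped_n / sum_n the clipped n-gram precision and weights w_n defaulting to 1/N:

    \mathrm{BP} =
    \begin{cases}
      1 & \text{if } |hyp| > |ref| \\
      \exp(1 - |ref|/|hyp|) & \text{otherwise}
    \end{cases}
    \qquad
    \mathrm{BLEU} = \mathrm{BP} \cdot \exp\Big(\sum_{n=1}^{N} w_n \log p_n\Big)

Per the code: bleu returns 0 as soon as any order has a zero count; stupid_bleu uses p_n = (clipped_n + 1)/(sum_n + 1) for n >= 2, so a missing higher-order match no longer zeroes the score; smooth_bleu sums the per-order terms with extra factors 1/2^{N-n+1}, which caps it at 1 - 1/2^N = 0.9375 for N = 4; and approx_bleu now returns brevity_penaly(hyp_len, ref_len) * 0.9 * bleu(...), i.e. the brevity penalty is applied a second time on top of the one inside bleu.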
'06) + * + * NOTE: max is 0.9375 */ double -smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector<float> weights ) +smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + const size_t N, vector<float> weights ) { - if ( hyp_len == 0 || ref_len == 0 ) return 0; + if (hyp_len == 0 || ref_len == 0) return 0; float N_ = (float)N; - if ( weights.empty() ) + if (weights.empty()) { - for ( size_t i = 0; i < N; i++ ) weights.push_back( 1/N_ ); + for (size_t i = 0; i < N; i++) weights.push_back(1/N_); } double sum = 0; float j = 1; - for ( size_t i = 0; i < N; i++ ) { - if ( counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow( 2, N_-j+1 ); + for (size_t i = 0; i < N; i++) { + if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; + sum += exp((weights[i] * log((double)counts.clipped[i]/(double)counts.sum[i]))) / pow(2, N_-j+1); j++; } - return brevity_penaly( hyp_len, ref_len ) * sum; + return brevity_penaly(hyp_len, ref_len) * sum; } - /* - * approx_bleu - * as in "Online Large-Margin Training for Statistical Machine Translation" (Watanabe et al. '07) - * CHIANG, RESNIK, synt struct features - * .9* - * page TODO + * approx. bleu * + * as in "Online Large-Margin Training of Syntactic + * and Structural Translation Features" + * (Chiang et al. '08) */ double -approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, - const size_t N, vector<float> weights ) +approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, + const size_t N, vector<float> weights) { - return bleu( counts, hyp_len, ref_len, N, weights ); + return brevity_penaly(hyp_len, ref_len) + * 0.9 * bleu(counts, hyp_len, ref_len, N, weights); } diff --git a/dtrain/score.h b/dtrain/score.h index e88387c5..0afb6237 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -1,29 +1,23 @@ #ifndef _DTRAIN_SCORE_H_ #define _DTRAIN_SCORE_H_ - #include <iostream> #include <vector> #include <map> #include <cassert> #include <cmath> -#include "wordid.h" +#include "wordid.h" // cdec using namespace std; - namespace dtrain { -/* - * NgramCounts - * - */ struct NgramCounts { - NgramCounts( const size_t N ) : N_( N ) { + NgramCounts(const size_t N) : N_(N) { reset(); } size_t N_; @@ -31,17 +25,17 @@ struct NgramCounts map<size_t, size_t> sum; void - operator+=( const NgramCounts& rhs ) + operator+=(const NgramCounts& rhs) { - assert( N_ == rhs.N_ ); - for ( size_t i = 0; i < N_; i++ ) { + assert(N_ == rhs.N_); + for (size_t i = 0; i < N_; i++) { this->clipped[i] += rhs.clipped.find(i)->second; this->sum[i] += rhs.sum.find(i)->second; } } const NgramCounts - operator+( const NgramCounts &other ) const + operator+(const NgramCounts &other) const { NgramCounts result = *this; result += other; @@ -49,10 +43,10 @@ struct NgramCounts } void - add( size_t count, size_t ref_count, size_t i ) + add(size_t count, size_t ref_count, size_t i) { - assert( i < N_ ); - if ( count > ref_count ) { + assert(i < N_); + if (count > ref_count) { clipped[i] += ref_count; sum[i] += count; } else { @@ -65,7 +59,7 @@ struct NgramCounts reset() { size_t i; - for ( i = 0; i < N_; i++ ) { + for (i = 0; i < N_; i++) { clipped[i] = 0; sum[i] = 0; } @@ -74,27 +68,26 @@ struct NgramCounts void print() { - for ( size_t i = 0; i < N_; i++ ) { + for (size_t i = 0; i < N_; i++) { cout << i+1 << "grams (clipped):\t" << clipped[i] << endl; cout << i+1 << "grams:\t\t\t" << sum[i] << endl; 
} } }; - typedef map<vector<WordID>, size_t> Ngrams; -Ngrams make_ngrams( vector<WordID>& s, size_t N ); -NgramCounts make_ngram_counts( vector<WordID> hyp, vector<WordID> ref, size_t N ); - -double brevity_penaly( const size_t hyp_len, const size_t ref_len ); -double bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, - vector<float> weights = vector<float>() ); -double stupid_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, - vector<float> weights = vector<float>() ); -double smooth_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, - vector<float> weights = vector<float>() ); -double approx_bleu( NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, - vector<float> weights = vector<float>() ); +Ngrams make_ngrams(vector<WordID>& s, size_t N); +NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N); + +double brevity_penaly(const size_t hyp_len, const size_t ref_len); +double bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, + vector<float> weights = vector<float>()); +double stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N, + vector<float> weights = vector<float>()); +double smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, + vector<float> weights = vector<float>()); +double approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N, + vector<float> weights = vector<float>()); } // namespace |
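To see the whole scoring path end to end, a minimal sketch against the prototypes in score.h above (the token IDs are invented for illustration; in dtrain they come from cdec's vocabulary mapping):

#include <iostream>
#include <vector>
#include "score.h"

using namespace dtrain;

int main()
{
  // hypothesis and reference share the trigram (1 2 3) but differ in the last token
  std::vector<WordID> hyp, ref;
  int h[] = {1, 2, 3, 4}, r[] = {1, 2, 3, 5};
  hyp.assign(h, h + 4);
  ref.assign(r, r + 4);
  size_t N = 4;
  NgramCounts counts = make_ngram_counts(hyp, ref, N);
  // plain bleu is 0 here (no matching 4-gram, see the NOTE in score.cc);
  // stupid_bleu smooths over the zero count and stays positive
  std::cout << bleu(counts, hyp.size(), ref.size(), N) << " "
            << stupid_bleu(counts, hyp.size(), ref.size(), N) << std::endl;
  return 0;
}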