From 78a0ee61c2d2d846306b60a8ac862a2d649bcf59 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 15 May 2012 00:44:03 +0200 Subject: loss margin cfg, XYX improved, smooth bleu variant --- dtrain/dtrain.cc | 16 ++-- dtrain/ksampler.h | 7 ++ dtrain/pairsampling.h | 16 ++-- dtrain/score.cc | 22 +++++- dtrain/score.h | 5 ++ dtrain/test/example/dtrain.ini | 3 +- dtrain/test/example/expected-output | 143 ++++++++++++++++++------------------ 7 files changed, 129 insertions(+), 83 deletions(-) diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 864eb153..717d47a2 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -33,6 +33,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("fselect", po::value()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch") ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") + ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") #ifdef DTRAIN_LOCAL ("refs,r", po::value(), "references in local mode") #endif @@ -134,6 +135,8 @@ main(int argc, char** argv) const string select_weights = cfg["select_weights"].as(); const float hi_lo = cfg["hi_lo"].as(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as(); + weight_t loss_margin = cfg["loss_margin"].as(); + if (loss_margin > 9998.) loss_margin = std::numeric_limits::max(); bool scale_bleu_diff = false; if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; bool average = false; @@ -160,6 +163,8 @@ main(int argc, char** argv) scorer = dynamic_cast(new StupidBleuScorer); } else if (scorer_str == "smooth_bleu") { scorer = dynamic_cast(new SmoothBleuScorer); + } else if (scorer_str == "smooth_single_bleu") { + scorer = dynamic_cast(new SmoothSingleBleuScorer); } else if (scorer_str == "approx_bleu") { scorer = dynamic_cast(new ApproxBleuScorer(N, approx_bleu_d)); } else { @@ -220,7 +225,7 @@ main(int argc, char** argv) grammar_buf_out.open(grammar_buf_fn.c_str()); #endif - unsigned in_sz = UINT_MAX; // input index, input size + unsigned in_sz = std::numeric_limits::max(); // input index, input size vector > all_scores; score_t max_score = 0.; unsigned best_it = 0; @@ -242,6 +247,7 @@ main(int argc, char** argv) if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl; else cerr << setw(25) << "learning rate " << "bleu diff" << endl; cerr << setw(25) << "gamma " << gamma << endl; + cerr << setw(25) << "loss margin " << loss_margin << endl; cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; if (pair_sampling == "XYX") cerr << setw(25) << "hi lo " << hi_lo << endl; @@ -427,15 +433,15 @@ main(int argc, char** argv) #ifdef DTRAIN_FASTER_PERCEPTRON bool rank_error = true; // pair filtering already did this for us rank_errors++; - score_t margin = 2.; // compiler, could you get rid of the margin? + score_t margin = std::numeric_limits::max(); #else bool rank_error = it->first.model <= it->second.model; if (rank_error) rank_errors++; - score_t margin = fabs(it->first.model - it->second.model); - if (!rank_error && margin < 1.) margin_violations++; + score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model)); + if (!rank_error && margin < loss_margin) margin_violations++; #endif if (scale_bleu_diff) eta = it->first.score - it->second.score; - if (rank_error || (gamma && margin<1.)) { + if (rank_error || margin < loss_margin) { SparseVector diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta); if (gamma) diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index f52fb649..bc2f56cd 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -8,6 +8,11 @@ namespace dtrain { +bool +cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b) +{ + return a.model > b.model; +} struct KSampler : public HypSampler { @@ -44,6 +49,8 @@ struct KSampler : public HypSampler sz_++; f_count_ += h.f.size(); } + sort(s_.begin(), s_.end(), cmp_hyp_by_model_d); + for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i; } }; diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 5085738e..32006a41 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -46,11 +46,15 @@ all_pairs(vector* s, vector >& training, sc inline void partXYX(vector* s, vector >& training, score_t threshold, float hi_lo) { - sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); + if (sz < 2) return; + sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sep = round(sz*hi_lo); - for (unsigned i = 0; i < sep; i++) { - for (unsigned j = sep; j < sz; j++) { + unsigned sep_hi = sep; + if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; + else sep_hi = 1; + for (unsigned i = 0; i < sep_hi; i++) { + for (unsigned j = sep_hi; j < sz; j++) { #ifdef DTRAIN_FASTER_PERCEPTRON if ((*s)[i].model <= (*s)[j].model) { #endif @@ -66,8 +70,10 @@ partXYX(vector* s, vector >& training, scor #endif } } - for (unsigned i = sep; i < sz-sep; i++) { - for (unsigned j = sz-sep; j < sz; j++) { + unsigned sep_lo = sz-sep; + while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; + for (unsigned i = sep_hi; i < sz-sep_lo; i++) { + for (unsigned j = sz-sep_lo; j < sz; j++) { #ifdef DTRAIN_FASTER_PERCEPTRON if ((*s)[i].model <= (*s)[j].model) { #endif diff --git a/dtrain/score.cc b/dtrain/score.cc index 7b1f6be4..b331dc4f 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -103,7 +103,27 @@ SmoothBleuScorer::Score(vector& hyp, vector& ref, i_bleu[j] += (1/((score_t)j+1)) * i_ng; } } - sum += exp(i_bleu[i])/(pow(2.0, static_cast(N_-i))); + sum += exp(i_bleu[i])/(pow(2.0, N_-i)); + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +// variant of smooth_bleu; i-Bleu scores only single 'i' +score_t +SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-j+1); + j++; } return brevity_penalty(hyp_len, ref_len) * sum; } diff --git a/dtrain/score.h b/dtrain/score.h index eb8ad912..d4fba22c 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -128,6 +128,11 @@ struct SmoothBleuScorer : public LocalScorer score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); }; +struct SmoothSingleBleuScorer : public LocalScorer +{ + score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + struct ApproxBleuScorer : public BleuScorer { NgramCounts glob_onebest_counts_; diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index e43d6b34..c8ac7c3f 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -stop_after=20 # stop epoch after 20 inputs +stop_after=10 # stop epoch after 10 inputs # interesting stuff epochs=3 # run over input 3 times @@ -19,3 +19,4 @@ filter=uniq # only unique entries in kbest (surface form) pair_sampling=XYX hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) +loss_margin=0 diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output index 08733dd4..25d2c069 100644 --- a/dtrain/test/example/expected-output +++ b/dtrain/test/example/expected-output @@ -15,7 +15,7 @@ State is 0 bytes for feature RuleNgramFeatures feature: RuleShape (no config parameters) Example feature: Shape_S00000_T00000 State is 0 bytes for feature RuleShape -Seeding random number sequence to 380245307 +Seeding random number sequence to 1072059181 dtrain Parameters: @@ -27,6 +27,7 @@ Parameters: filter 'uniq' learning rate 0.0001 gamma 0 + loss margin 0 pairs 'XYX' hi lo 0.1 pair threshold 0 @@ -35,90 +36,90 @@ Parameters: cdec cfg 'test/example/cdec.ini' input 'test/example/nc-wmt11.1k.gz' output '-' - stop_after 20 + stop_after 10 (a dot represents 10 inputs) Iteration #1 of 3. - .. 20 -Stopping after 20 input sentences. + . 10 +Stopping after 10 input sentences. WEIGHTS - Glue = -0.1015 - WordPenalty = -0.0152 - LanguageModel = +0.21493 - LanguageModel_OOV = -0.3257 - PhraseModel_0 = -0.050844 - PhraseModel_1 = +0.25074 - PhraseModel_2 = +0.27944 - PhraseModel_3 = -0.038384 - PhraseModel_4 = -0.12041 - PhraseModel_5 = +0.1047 - PhraseModel_6 = -0.1289 - PassThrough = -0.3094 + Glue = -0.0293 + WordPenalty = +0.049075 + LanguageModel = +0.24345 + LanguageModel_OOV = -0.2029 + PhraseModel_0 = +0.0084102 + PhraseModel_1 = +0.021729 + PhraseModel_2 = +0.014922 + PhraseModel_3 = +0.104 + PhraseModel_4 = -0.14308 + PhraseModel_5 = +0.0247 + PhraseModel_6 = -0.012 + PassThrough = -0.2161 --- - 1best avg score: 0.17508 (+0.17508) - 1best avg model score: -1.2392 (-1.2392) - avg # pairs: 1329.8 - avg # rank err: 649.1 - avg # margin viol: 677.5 - non0 feature count: 874 - avg list sz: 88.6 - avg f count: 85.643 -(time 0.25 min, 0.75 s/S) + 1best avg score: 0.16872 (+0.16872) + 1best avg model score: -1.8276 (-1.8276) + avg # pairs: 1121.1 + avg # rank err: 555.6 + avg # margin viol: 0 + non0 feature count: 277 + avg list sz: 77.2 + avg f count: 90.96 +(time 0.1 min, 0.6 s/S) Iteration #2 of 3. - .. 20 + . 10 WEIGHTS - Glue = -0.0792 - WordPenalty = -0.056198 - LanguageModel = +0.31038 - LanguageModel_OOV = -0.4011 - PhraseModel_0 = +0.072188 - PhraseModel_1 = +0.11473 - PhraseModel_2 = +0.049774 - PhraseModel_3 = -0.18448 - PhraseModel_4 = -0.12092 - PhraseModel_5 = +0.1599 - PhraseModel_6 = -0.0606 - PassThrough = -0.3848 + Glue = -0.3526 + WordPenalty = +0.067576 + LanguageModel = +1.155 + LanguageModel_OOV = -0.2728 + PhraseModel_0 = -0.025529 + PhraseModel_1 = +0.095869 + PhraseModel_2 = +0.094567 + PhraseModel_3 = +0.12482 + PhraseModel_4 = -0.36533 + PhraseModel_5 = +0.1068 + PhraseModel_6 = -0.1517 + PassThrough = -0.286 --- - 1best avg score: 0.24015 (+0.065075) - 1best avg model score: -10.131 (-8.8914) - avg # pairs: 1324.7 - avg # rank err: 558.65 - avg # margin viol: 752.85 - non0 feature count: 1236 - avg list sz: 84.9 - avg f count: 88.306 -(time 0.22 min, 0.65 s/S) + 1best avg score: 0.18394 (+0.015221) + 1best avg model score: 3.205 (+5.0326) + avg # pairs: 1168.3 + avg # rank err: 594.8 + avg # margin viol: 0 + non0 feature count: 543 + avg list sz: 77.5 + avg f count: 85.916 +(time 0.083 min, 0.5 s/S) Iteration #3 of 3. - .. 20 + . 10 WEIGHTS - Glue = -0.051 - WordPenalty = -0.077956 - LanguageModel = +0.33699 - LanguageModel_OOV = -0.4726 - PhraseModel_0 = +0.040228 - PhraseModel_1 = +0.18 - PhraseModel_2 = +0.15618 - PhraseModel_3 = -0.098908 - PhraseModel_4 = -0.036555 - PhraseModel_5 = +0.1619 - PhraseModel_6 = +0.0078 - PassThrough = -0.4563 + Glue = -0.392 + WordPenalty = +0.071963 + LanguageModel = +0.81266 + LanguageModel_OOV = -0.4177 + PhraseModel_0 = -0.2649 + PhraseModel_1 = -0.17931 + PhraseModel_2 = +0.038261 + PhraseModel_3 = +0.20261 + PhraseModel_4 = -0.42621 + PhraseModel_5 = +0.3198 + PhraseModel_6 = -0.1437 + PassThrough = -0.4309 --- - 1best avg score: 0.25527 (+0.015113) - 1best avg model score: -13.906 (-3.7756) - avg # pairs: 1356.3 - avg # rank err: 562.1 - avg # margin viol: 757.35 - non0 feature count: 1482 - avg list sz: 86.65 - avg f count: 87.475 -(time 0.23 min, 0.7 s/S) + 1best avg score: 0.2962 (+0.11225) + 1best avg model score: -36.274 (-39.479) + avg # pairs: 1109.6 + avg # rank err: 515.9 + avg # margin viol: 0 + non0 feature count: 741 + avg list sz: 77 + avg f count: 88.982 +(time 0.083 min, 0.5 s/S) Writing weights file to '-' ... done --- -Best iteration: 3 [SCORE 'stupid_bleu'=0.25527]. -This took 0.7 min. +Best iteration: 3 [SCORE 'stupid_bleu'=0.2962]. +This took 0.26667 min. -- cgit v1.2.3