From 5a8ef8ba65b244837e9cedbd64793b82bf284f93 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 2 May 2012 15:02:59 +0200 Subject: small improvements --- dtrain/Makefile.am | 2 +- dtrain/dtrain.cc | 12 ++++++++++-- dtrain/dtrain.h | 2 +- dtrain/pairsampling.h | 15 +++++++++++++++ dtrain/test/example/README | 4 ++-- dtrain/test/example/dtrain.ini | 2 +- 6 files changed, 30 insertions(+), 7 deletions(-) (limited to 'dtrain') diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index f39d161e..64fef489 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain dtrain_SOURCES = dtrain.cc score.cc dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 8b1fc953..864eb153 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -424,12 +424,18 @@ main(int argc, char** argv) for (vector >::iterator it = pairs.begin(); it != pairs.end(); it++) { +#ifdef DTRAIN_FASTER_PERCEPTRON + bool rank_error = true; // pair filtering already did this for us + rank_errors++; + score_t margin = 2.; // compiler, could you get rid of the margin? +#else bool rank_error = it->first.model <= it->second.model; if (rank_error) rank_errors++; score_t margin = fabs(it->first.model - it->second.model); - if (!rank_error && margin < 1) margin_violations++; + if (!rank_error && margin < 1.) margin_violations++; +#endif if (scale_bleu_diff) eta = it->first.score - it->second.score; - if (rank_error || (gamma && margin<1)) { + if (rank_error || (gamma && margin<1.)) { SparseVector diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta); if (gamma) @@ -534,8 +540,10 @@ main(int argc, char** argv) cerr << _np << npairs/(float)in_sz << endl; cerr << " avg # rank err: "; cerr << rank_errors/(float)in_sz << endl; +#ifndef DTRAIN_FASTER_PERCEPTRON cerr << " avg # margin viol: "; cerr << margin_violations/(float)in_sz << endl; +#endif cerr << " non0 feature count: " << nonz << endl; cerr << " avg list sz: " << list_sz/(float)in_sz << endl; cerr << " avg f count: " << f_count/(float)list_sz << endl; diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 94d149ce..ac0345a4 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -32,7 +32,7 @@ inline void register_and_convert(const vector& strs, vector& ids inline string gettmpf(const string path, const string infix) { - char fn[1024]; + char fn[path.size() + infix.size() + 8]; strcpy(fn, path.c_str()); strcat(fn, "/"); strcat(fn, infix.c_str()); diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index bac132c6..52eeedd6 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -1,6 +1,9 @@ #ifndef _DTRAIN_PAIRSAMPLING_H_ #define _DTRAIN_PAIRSAMPLING_H_ +#define DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs + // DO NOT USE WITH SVM! + namespace dtrain { @@ -51,6 +54,9 @@ partXYX(vector* s, vector >& training, scor unsigned sep = round(sz*hi_lo); for (unsigned i = 0; i < sep; i++) { for (unsigned j = sep; j < sz; j++) { +#ifdef DTRAIN_FASTER_PERCEPTRON + if ((*s)[i].model <= (*s)[j].model) { +#endif if (threshold > 0) { if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) training.push_back(make_pair((*s)[i], (*s)[j])); @@ -58,10 +64,16 @@ partXYX(vector* s, vector >& training, scor if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } +#ifdef DTRAIN_FASTER_PERCEPTRON + } +#endif } } for (unsigned i = sep; i < sz-sep; i++) { for (unsigned j = sz-sep; j < sz; j++) { +#ifdef DTRAIN_FASTER_PERCEPTRON + if ((*s)[i].model <= (*s)[j].model) { +#endif if (threshold > 0) { if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) training.push_back(make_pair((*s)[i], (*s)[j])); @@ -69,6 +81,9 @@ partXYX(vector* s, vector >& training, scor if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } +#ifdef DTRAIN_FASTER_PERCEPTRON + } +#endif } } } diff --git a/dtrain/test/example/README b/dtrain/test/example/README index b3ea5f06..6937b11b 100644 --- a/dtrain/test/example/README +++ b/dtrain/test/example/README @@ -1,8 +1,8 @@ Small example of input format for distributed training. Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . -For this to work, disable '#define DTRAIN_LOCAL' from dtrain.h +For this to work, undef 'DTRAIN_LOCAL' in dtrain.h and recompile. -Data is here: http://simianer.de/dtrain +Data is here: http://simianer.de/#dtrain diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index f87ee9cf..e43d6b34 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -stop_after=10 # stop epoch after 20 inputs +stop_after=20 # stop epoch after 20 inputs # interesting stuff epochs=3 # run over input 3 times -- cgit v1.2.3 From eb3ee28dc0eb1d3e5ed01ba0df843be329ae450d Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 2 May 2012 15:33:58 +0200 Subject: defines --- dtrain/dtrain.h | 12 ++-- dtrain/pairsampling.h | 3 - dtrain/test/example/expected-output | 124 ++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 7 deletions(-) create mode 100644 dtrain/test/example/expected-output (limited to 'dtrain') diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index ac0345a4..d8dc14b6 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -1,6 +1,14 @@ #ifndef _DTRAIN_H_ #define _DTRAIN_H_ +#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs + // DO NOT USE WITH SVM! +#undef DTRAIN_LOCAL +#define DTRAIN_DOTS 10 // after how many inputs to display a '.' +#define DTRAIN_GRAMMAR_DELIM "########EOS########" +#define DTRAIN_SCALE 100000 + + #include #include #include @@ -13,11 +21,7 @@ #include "filelib.h" -#undef DTRAIN_LOCAL -#define DTRAIN_DOTS 10 // after how many inputs to display a '.' -#define DTRAIN_GRAMMAR_DELIM "########EOS########" -#define DTRAIN_SCALE 100000 using namespace std; using namespace dtrain; diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 52eeedd6..5085738e 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -1,9 +1,6 @@ #ifndef _DTRAIN_PAIRSAMPLING_H_ #define _DTRAIN_PAIRSAMPLING_H_ -#define DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs - // DO NOT USE WITH SVM! - namespace dtrain { diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output new file mode 100644 index 00000000..08733dd4 --- /dev/null +++ b/dtrain/test/example/expected-output @@ -0,0 +1,124 @@ + cdec cfg 'test/example/cdec.ini' +feature: WordPenalty (no config parameters) +State is 0 bytes for feature WordPenalty +feature: KLanguageModel (with config parameters 'test/example/nc-wmt11.en.srilm.gz') +Loading the LM will be faster if you build a binary file. +Reading test/example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Loaded 5-gram KLM from test/example/nc-wmt11.en.srilm.gz (MapSize=49581) +State is 98 bytes for feature KLanguageModel test/example/nc-wmt11.en.srilm.gz +feature: RuleIdentityFeatures (no config parameters) +State is 0 bytes for feature RuleIdentityFeatures +feature: RuleNgramFeatures (no config parameters) +State is 0 bytes for feature RuleNgramFeatures +feature: RuleShape (no config parameters) + Example feature: Shape_S00000_T00000 +State is 0 bytes for feature RuleShape +Seeding random number sequence to 380245307 + +dtrain +Parameters: + k 100 + N 4 + T 3 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'VOID' + l1 reg 0 'none' + cdec cfg 'test/example/cdec.ini' + input 'test/example/nc-wmt11.1k.gz' + output '-' + stop_after 20 +(a dot represents 10 inputs) +Iteration #1 of 3. + .. 20 +Stopping after 20 input sentences. +WEIGHTS + Glue = -0.1015 + WordPenalty = -0.0152 + LanguageModel = +0.21493 + LanguageModel_OOV = -0.3257 + PhraseModel_0 = -0.050844 + PhraseModel_1 = +0.25074 + PhraseModel_2 = +0.27944 + PhraseModel_3 = -0.038384 + PhraseModel_4 = -0.12041 + PhraseModel_5 = +0.1047 + PhraseModel_6 = -0.1289 + PassThrough = -0.3094 + --- + 1best avg score: 0.17508 (+0.17508) + 1best avg model score: -1.2392 (-1.2392) + avg # pairs: 1329.8 + avg # rank err: 649.1 + avg # margin viol: 677.5 + non0 feature count: 874 + avg list sz: 88.6 + avg f count: 85.643 +(time 0.25 min, 0.75 s/S) + +Iteration #2 of 3. + .. 20 +WEIGHTS + Glue = -0.0792 + WordPenalty = -0.056198 + LanguageModel = +0.31038 + LanguageModel_OOV = -0.4011 + PhraseModel_0 = +0.072188 + PhraseModel_1 = +0.11473 + PhraseModel_2 = +0.049774 + PhraseModel_3 = -0.18448 + PhraseModel_4 = -0.12092 + PhraseModel_5 = +0.1599 + PhraseModel_6 = -0.0606 + PassThrough = -0.3848 + --- + 1best avg score: 0.24015 (+0.065075) + 1best avg model score: -10.131 (-8.8914) + avg # pairs: 1324.7 + avg # rank err: 558.65 + avg # margin viol: 752.85 + non0 feature count: 1236 + avg list sz: 84.9 + avg f count: 88.306 +(time 0.22 min, 0.65 s/S) + +Iteration #3 of 3. + .. 20 +WEIGHTS + Glue = -0.051 + WordPenalty = -0.077956 + LanguageModel = +0.33699 + LanguageModel_OOV = -0.4726 + PhraseModel_0 = +0.040228 + PhraseModel_1 = +0.18 + PhraseModel_2 = +0.15618 + PhraseModel_3 = -0.098908 + PhraseModel_4 = -0.036555 + PhraseModel_5 = +0.1619 + PhraseModel_6 = +0.0078 + PassThrough = -0.4563 + --- + 1best avg score: 0.25527 (+0.015113) + 1best avg model score: -13.906 (-3.7756) + avg # pairs: 1356.3 + avg # rank err: 562.1 + avg # margin viol: 757.35 + non0 feature count: 1482 + avg list sz: 86.65 + avg f count: 87.475 +(time 0.23 min, 0.7 s/S) + +Writing weights file to '-' ... +done + +--- +Best iteration: 3 [SCORE 'stupid_bleu'=0.25527]. +This took 0.7 min. -- cgit v1.2.3 From 78a0ee61c2d2d846306b60a8ac862a2d649bcf59 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 15 May 2012 00:44:03 +0200 Subject: loss margin cfg, XYX improved, smooth bleu variant --- dtrain/dtrain.cc | 16 ++-- dtrain/ksampler.h | 7 ++ dtrain/pairsampling.h | 16 ++-- dtrain/score.cc | 22 +++++- dtrain/score.h | 5 ++ dtrain/test/example/dtrain.ini | 3 +- dtrain/test/example/expected-output | 143 ++++++++++++++++++------------------ 7 files changed, 129 insertions(+), 83 deletions(-) (limited to 'dtrain') diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 864eb153..717d47a2 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -33,6 +33,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("fselect", po::value()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch") ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") + ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") #ifdef DTRAIN_LOCAL ("refs,r", po::value(), "references in local mode") #endif @@ -134,6 +135,8 @@ main(int argc, char** argv) const string select_weights = cfg["select_weights"].as(); const float hi_lo = cfg["hi_lo"].as(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as(); + weight_t loss_margin = cfg["loss_margin"].as(); + if (loss_margin > 9998.) loss_margin = std::numeric_limits::max(); bool scale_bleu_diff = false; if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; bool average = false; @@ -160,6 +163,8 @@ main(int argc, char** argv) scorer = dynamic_cast(new StupidBleuScorer); } else if (scorer_str == "smooth_bleu") { scorer = dynamic_cast(new SmoothBleuScorer); + } else if (scorer_str == "smooth_single_bleu") { + scorer = dynamic_cast(new SmoothSingleBleuScorer); } else if (scorer_str == "approx_bleu") { scorer = dynamic_cast(new ApproxBleuScorer(N, approx_bleu_d)); } else { @@ -220,7 +225,7 @@ main(int argc, char** argv) grammar_buf_out.open(grammar_buf_fn.c_str()); #endif - unsigned in_sz = UINT_MAX; // input index, input size + unsigned in_sz = std::numeric_limits::max(); // input index, input size vector > all_scores; score_t max_score = 0.; unsigned best_it = 0; @@ -242,6 +247,7 @@ main(int argc, char** argv) if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl; else cerr << setw(25) << "learning rate " << "bleu diff" << endl; cerr << setw(25) << "gamma " << gamma << endl; + cerr << setw(25) << "loss margin " << loss_margin << endl; cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; if (pair_sampling == "XYX") cerr << setw(25) << "hi lo " << hi_lo << endl; @@ -427,15 +433,15 @@ main(int argc, char** argv) #ifdef DTRAIN_FASTER_PERCEPTRON bool rank_error = true; // pair filtering already did this for us rank_errors++; - score_t margin = 2.; // compiler, could you get rid of the margin? + score_t margin = std::numeric_limits::max(); #else bool rank_error = it->first.model <= it->second.model; if (rank_error) rank_errors++; - score_t margin = fabs(it->first.model - it->second.model); - if (!rank_error && margin < 1.) margin_violations++; + score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model)); + if (!rank_error && margin < loss_margin) margin_violations++; #endif if (scale_bleu_diff) eta = it->first.score - it->second.score; - if (rank_error || (gamma && margin<1.)) { + if (rank_error || margin < loss_margin) { SparseVector diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta); if (gamma) diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index f52fb649..bc2f56cd 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -8,6 +8,11 @@ namespace dtrain { +bool +cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b) +{ + return a.model > b.model; +} struct KSampler : public HypSampler { @@ -44,6 +49,8 @@ struct KSampler : public HypSampler sz_++; f_count_ += h.f.size(); } + sort(s_.begin(), s_.end(), cmp_hyp_by_model_d); + for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i; } }; diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 5085738e..32006a41 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -46,11 +46,15 @@ all_pairs(vector* s, vector >& training, sc inline void partXYX(vector* s, vector >& training, score_t threshold, float hi_lo) { - sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); + if (sz < 2) return; + sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sep = round(sz*hi_lo); - for (unsigned i = 0; i < sep; i++) { - for (unsigned j = sep; j < sz; j++) { + unsigned sep_hi = sep; + if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; + else sep_hi = 1; + for (unsigned i = 0; i < sep_hi; i++) { + for (unsigned j = sep_hi; j < sz; j++) { #ifdef DTRAIN_FASTER_PERCEPTRON if ((*s)[i].model <= (*s)[j].model) { #endif @@ -66,8 +70,10 @@ partXYX(vector* s, vector >& training, scor #endif } } - for (unsigned i = sep; i < sz-sep; i++) { - for (unsigned j = sz-sep; j < sz; j++) { + unsigned sep_lo = sz-sep; + while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; + for (unsigned i = sep_hi; i < sz-sep_lo; i++) { + for (unsigned j = sz-sep_lo; j < sz; j++) { #ifdef DTRAIN_FASTER_PERCEPTRON if ((*s)[i].model <= (*s)[j].model) { #endif diff --git a/dtrain/score.cc b/dtrain/score.cc index 7b1f6be4..b331dc4f 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -103,7 +103,27 @@ SmoothBleuScorer::Score(vector& hyp, vector& ref, i_bleu[j] += (1/((score_t)j+1)) * i_ng; } } - sum += exp(i_bleu[i])/(pow(2.0, static_cast(N_-i))); + sum += exp(i_bleu[i])/(pow(2.0, N_-i)); + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +// variant of smooth_bleu; i-Bleu scores only single 'i' +score_t +SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-j+1); + j++; } return brevity_penalty(hyp_len, ref_len) * sum; } diff --git a/dtrain/score.h b/dtrain/score.h index eb8ad912..d4fba22c 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -128,6 +128,11 @@ struct SmoothBleuScorer : public LocalScorer score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); }; +struct SmoothSingleBleuScorer : public LocalScorer +{ + score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + struct ApproxBleuScorer : public BleuScorer { NgramCounts glob_onebest_counts_; diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index e43d6b34..c8ac7c3f 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec # weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -stop_after=20 # stop epoch after 20 inputs +stop_after=10 # stop epoch after 10 inputs # interesting stuff epochs=3 # run over input 3 times @@ -19,3 +19,4 @@ filter=uniq # only unique entries in kbest (surface form) pair_sampling=XYX hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) +loss_margin=0 diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output index 08733dd4..25d2c069 100644 --- a/dtrain/test/example/expected-output +++ b/dtrain/test/example/expected-output @@ -15,7 +15,7 @@ State is 0 bytes for feature RuleNgramFeatures feature: RuleShape (no config parameters) Example feature: Shape_S00000_T00000 State is 0 bytes for feature RuleShape -Seeding random number sequence to 380245307 +Seeding random number sequence to 1072059181 dtrain Parameters: @@ -27,6 +27,7 @@ Parameters: filter 'uniq' learning rate 0.0001 gamma 0 + loss margin 0 pairs 'XYX' hi lo 0.1 pair threshold 0 @@ -35,90 +36,90 @@ Parameters: cdec cfg 'test/example/cdec.ini' input 'test/example/nc-wmt11.1k.gz' output '-' - stop_after 20 + stop_after 10 (a dot represents 10 inputs) Iteration #1 of 3. - .. 20 -Stopping after 20 input sentences. + . 10 +Stopping after 10 input sentences. WEIGHTS - Glue = -0.1015 - WordPenalty = -0.0152 - LanguageModel = +0.21493 - LanguageModel_OOV = -0.3257 - PhraseModel_0 = -0.050844 - PhraseModel_1 = +0.25074 - PhraseModel_2 = +0.27944 - PhraseModel_3 = -0.038384 - PhraseModel_4 = -0.12041 - PhraseModel_5 = +0.1047 - PhraseModel_6 = -0.1289 - PassThrough = -0.3094 + Glue = -0.0293 + WordPenalty = +0.049075 + LanguageModel = +0.24345 + LanguageModel_OOV = -0.2029 + PhraseModel_0 = +0.0084102 + PhraseModel_1 = +0.021729 + PhraseModel_2 = +0.014922 + PhraseModel_3 = +0.104 + PhraseModel_4 = -0.14308 + PhraseModel_5 = +0.0247 + PhraseModel_6 = -0.012 + PassThrough = -0.2161 --- - 1best avg score: 0.17508 (+0.17508) - 1best avg model score: -1.2392 (-1.2392) - avg # pairs: 1329.8 - avg # rank err: 649.1 - avg # margin viol: 677.5 - non0 feature count: 874 - avg list sz: 88.6 - avg f count: 85.643 -(time 0.25 min, 0.75 s/S) + 1best avg score: 0.16872 (+0.16872) + 1best avg model score: -1.8276 (-1.8276) + avg # pairs: 1121.1 + avg # rank err: 555.6 + avg # margin viol: 0 + non0 feature count: 277 + avg list sz: 77.2 + avg f count: 90.96 +(time 0.1 min, 0.6 s/S) Iteration #2 of 3. - .. 20 + . 10 WEIGHTS - Glue = -0.0792 - WordPenalty = -0.056198 - LanguageModel = +0.31038 - LanguageModel_OOV = -0.4011 - PhraseModel_0 = +0.072188 - PhraseModel_1 = +0.11473 - PhraseModel_2 = +0.049774 - PhraseModel_3 = -0.18448 - PhraseModel_4 = -0.12092 - PhraseModel_5 = +0.1599 - PhraseModel_6 = -0.0606 - PassThrough = -0.3848 + Glue = -0.3526 + WordPenalty = +0.067576 + LanguageModel = +1.155 + LanguageModel_OOV = -0.2728 + PhraseModel_0 = -0.025529 + PhraseModel_1 = +0.095869 + PhraseModel_2 = +0.094567 + PhraseModel_3 = +0.12482 + PhraseModel_4 = -0.36533 + PhraseModel_5 = +0.1068 + PhraseModel_6 = -0.1517 + PassThrough = -0.286 --- - 1best avg score: 0.24015 (+0.065075) - 1best avg model score: -10.131 (-8.8914) - avg # pairs: 1324.7 - avg # rank err: 558.65 - avg # margin viol: 752.85 - non0 feature count: 1236 - avg list sz: 84.9 - avg f count: 88.306 -(time 0.22 min, 0.65 s/S) + 1best avg score: 0.18394 (+0.015221) + 1best avg model score: 3.205 (+5.0326) + avg # pairs: 1168.3 + avg # rank err: 594.8 + avg # margin viol: 0 + non0 feature count: 543 + avg list sz: 77.5 + avg f count: 85.916 +(time 0.083 min, 0.5 s/S) Iteration #3 of 3. - .. 20 + . 10 WEIGHTS - Glue = -0.051 - WordPenalty = -0.077956 - LanguageModel = +0.33699 - LanguageModel_OOV = -0.4726 - PhraseModel_0 = +0.040228 - PhraseModel_1 = +0.18 - PhraseModel_2 = +0.15618 - PhraseModel_3 = -0.098908 - PhraseModel_4 = -0.036555 - PhraseModel_5 = +0.1619 - PhraseModel_6 = +0.0078 - PassThrough = -0.4563 + Glue = -0.392 + WordPenalty = +0.071963 + LanguageModel = +0.81266 + LanguageModel_OOV = -0.4177 + PhraseModel_0 = -0.2649 + PhraseModel_1 = -0.17931 + PhraseModel_2 = +0.038261 + PhraseModel_3 = +0.20261 + PhraseModel_4 = -0.42621 + PhraseModel_5 = +0.3198 + PhraseModel_6 = -0.1437 + PassThrough = -0.4309 --- - 1best avg score: 0.25527 (+0.015113) - 1best avg model score: -13.906 (-3.7756) - avg # pairs: 1356.3 - avg # rank err: 562.1 - avg # margin viol: 757.35 - non0 feature count: 1482 - avg list sz: 86.65 - avg f count: 87.475 -(time 0.23 min, 0.7 s/S) + 1best avg score: 0.2962 (+0.11225) + 1best avg model score: -36.274 (-39.479) + avg # pairs: 1109.6 + avg # rank err: 515.9 + avg # margin viol: 0 + non0 feature count: 741 + avg list sz: 77 + avg f count: 88.982 +(time 0.083 min, 0.5 s/S) Writing weights file to '-' ... done --- -Best iteration: 3 [SCORE 'stupid_bleu'=0.25527]. -This took 0.7 min. +Best iteration: 3 [SCORE 'stupid_bleu'=0.2962]. +This took 0.26667 min. -- cgit v1.2.3 From 8fb12f413a0974ee0e6fe0c04b3f760463cf9e30 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 21 May 2012 11:42:10 +0200 Subject: merging --- dtrain/README.md | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'dtrain') diff --git a/dtrain/README.md b/dtrain/README.md index 9580df6d..45f21ad5 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -48,6 +48,13 @@ Next + make svm doable; no subgradient? + reranking while sgd? + try PRO, mira emulations ++ sampling (MBR) ++ forest (on train)? ++ best BLEU transl (Sokolov)? ++ entire reg. path ++ resharding [nfold cross val.] ++ bigger LM, feats (target side Ng., word alignments etc.) ++ merge kbest lists Legal ----- -- cgit v1.2.3 From aadabfdf37dfd451485277cb77fad02f77b361c6 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 23 May 2012 09:46:27 +0200 Subject: README++ --- dtrain/README.md | 2 ++ 1 file changed, 2 insertions(+) (limited to 'dtrain') diff --git a/dtrain/README.md b/dtrain/README.md index 45f21ad5..350c7423 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -41,6 +41,8 @@ DTRAIN_LOCAL. Next ---- ++ approx. Bleu? ++ turn off inclusion + (dtrain|decoder) more meta-parameters testing + feature selection directly in dtrain + feature template: target side rule ngrams -- cgit v1.2.3 From 62c805c90c5347b844f92574e240db5c65578e12 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 31 May 2012 14:33:59 +0200 Subject: new scorer, stuff --- dtrain/README.md | 3 ++- dtrain/dtrain.cc | 75 +++++++++++++++++++++++++++------------------------ dtrain/pairsampling.h | 6 ++--- dtrain/score.cc | 35 +++++++++++++++++++++++- dtrain/score.h | 49 ++++++++++++++++++++++++++++++--- 5 files changed, 125 insertions(+), 43 deletions(-) (limited to 'dtrain') diff --git a/dtrain/README.md b/dtrain/README.md index 350c7423..7aefcc55 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -41,7 +41,7 @@ DTRAIN_LOCAL. Next ---- -+ approx. Bleu? ++ approx. Bleu? proper lc_bleu (init with X) + turn off inclusion + (dtrain|decoder) more meta-parameters testing + feature selection directly in dtrain @@ -57,6 +57,7 @@ Next + resharding [nfold cross val.] + bigger LM, feats (target side Ng., word alignments etc.) + merge kbest lists ++ proper eval, pairwise ranking, forced transl Legal ----- diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 717d47a2..88413a1d 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -6,38 +6,39 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) { po::options_description ini("Configuration File Options"); ini.add_options() - ("input", po::value()->default_value("-"), "input file") - ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") - ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") - ("decoder_config", po::value(), "configuration file for cdec") - ("print_weights", po::value(), "weights to print on each iteration") - ("stop_after", po::value()->default_value(0), "stop after X input sentences") - ("tmp", po::value()->default_value("/tmp"), "temp dir to use") - ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") - ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") - ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") - ("k", po::value()->default_value(100), "how many translations to sample") - ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") - ("filter", po::value()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") - ("pair_sampling", po::value()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") - ("hi_lo", po::value()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") - ("pair_threshold", po::value()->default_value(0.), "bleu [0,1] threshold to filter pairs") - ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") - ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_") - ("learning_rate", po::value()->default_value(0.0001), "learning rate") - ("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)") - ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") - ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") - ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") - ("l1_reg_strength", po::value(), "l1 regularization strength") - ("fselect", po::value()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch") - ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") - ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") - ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") + ("input", po::value()->default_value("-"), "input file") + ("output", po::value()->default_value("-"), "output weights file, '-' for STDOUT") + ("input_weights", po::value(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value(), "configuration file for cdec") + ("print_weights", po::value(), "weights to print on each iteration") + ("stop_after", po::value()->default_value(0), "stop after X input sentences") + ("tmp", po::value()->default_value("/tmp"), "temp dir to use") + ("keep", po::value()->zero_tokens(), "keep weights files for each iteration") + ("hstreaming", po::value(), "run in hadoop streaming mode, arg is a task id") + ("epochs", po::value()->default_value(10), "# of iterations T (per shard)") + ("k", po::value()->default_value(100), "how many translations to sample") + ("sample_from", po::value()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") + ("filter", po::value()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") + ("pair_sampling", po::value()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") + ("hi_lo", po::value()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") + ("pair_threshold", po::value()->default_value(0.), "bleu [0,1] threshold to filter pairs") + ("N", po::value()->default_value(4), "N for Ngrams (BLEU)") + ("scorer", po::value()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_") + ("learning_rate", po::value()->default_value(0.0001), "learning rate") + ("gamma", po::value()->default_value(0.), "gamma for SVM (0 for perceptron)") + ("select_weights", po::value()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") + ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") + ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") + ("l1_reg_strength", po::value(), "l1 regularization strength") + ("fselect", po::value()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO + ("approx_bleu_d", po::value()->default_value(0.9), "discount for approx. BLEU") + ("scale_bleu_diff", po::value()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") + ("loss_margin", po::value()->default_value(0.), "update if no error in pref pair but model scores this near") + ("max_pairs", po::value()->default_value(std::numeric_limits::max()), "max. # of pairs per Sent.") #ifdef DTRAIN_LOCAL - ("refs,r", po::value(), "references in local mode") + ("refs,r", po::value(), "references in local mode") #endif - ("noup", po::value()->zero_tokens(), "do not update weights"); + ("noup", po::value()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() ("config,c", po::value(), "dtrain config file") @@ -135,6 +136,7 @@ main(int argc, char** argv) const string select_weights = cfg["select_weights"].as(); const float hi_lo = cfg["hi_lo"].as(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as(); + const unsigned max_pairs = cfg["max_pairs"].as(); weight_t loss_margin = cfg["loss_margin"].as(); if (loss_margin > 9998.) loss_margin = std::numeric_limits::max(); bool scale_bleu_diff = false; @@ -167,6 +169,8 @@ main(int argc, char** argv) scorer = dynamic_cast(new SmoothSingleBleuScorer); } else if (scorer_str == "approx_bleu") { scorer = dynamic_cast(new ApproxBleuScorer(N, approx_bleu_d)); + } else if (scorer_str == "lc_bleu") { + scorer = dynamic_cast(new LinearBleuScorer(N)); } else { cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl; exit(1); @@ -257,6 +261,7 @@ main(int argc, char** argv) cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as() << "'" << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; + cerr << "max pairs " << max_pairs << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; #ifdef DTRAIN_LOCAL @@ -421,17 +426,17 @@ main(int argc, char** argv) // get pairs vector > pairs; if (pair_sampling == "all") - all_pairs(samples, pairs, pair_threshold); + all_pairs(samples, pairs, pair_threshold, max_pairs); if (pair_sampling == "XYX") - partXYX(samples, pairs, pair_threshold, hi_lo); + partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo); if (pair_sampling == "PRO") - PROsampling(samples, pairs, pair_threshold); + PROsampling(samples, pairs, pair_threshold, max_pairs); npairs += pairs.size(); for (vector >::iterator it = pairs.begin(); it != pairs.end(); it++) { #ifdef DTRAIN_FASTER_PERCEPTRON - bool rank_error = true; // pair filtering already did this for us + bool rank_error = true; // pair sampling already did this for us rank_errors++; score_t margin = std::numeric_limits::max(); #else @@ -498,7 +503,7 @@ main(int argc, char** argv) if (average) w_average += lambdas; - if (scorer_str == "approx_bleu") scorer->Reset(); + if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); if (t == 0) { in_sz = ii; // remember size of input (# lines) diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 32006a41..71c8ae59 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -19,7 +19,7 @@ cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) } inline void -all_pairs(vector* s, vector >& training, score_t threshold, float _unused=1) +all_pairs(vector* s, vector >& training, score_t threshold, unsigned max, float _unused=1) { sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); @@ -44,7 +44,7 @@ all_pairs(vector* s, vector >& training, sc */ inline void -partXYX(vector* s, vector >& training, score_t threshold, float hi_lo) +partXYX(vector* s, vector >& training, score_t threshold, unsigned max, float hi_lo) { unsigned sz = s->size(); if (sz < 2) return; @@ -104,7 +104,7 @@ _PRO_cmp_pair_by_diff_d(pair a, pair b return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); } inline void -PROsampling(vector* s, vector >& training, score_t threshold, float _unused=1) +PROsampling(vector* s, vector >& training, score_t threshold, unsigned max, float _unused=1) { unsigned max_count = 5000, count = 0, sz = s->size(); bool b = false; diff --git a/dtrain/score.cc b/dtrain/score.cc index b331dc4f..5c356c0f 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -122,12 +122,13 @@ SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, unsigned j = 1; for (unsigned i = 0; i < M; i++) { if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; - sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-j+1); + sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2., N_-j+1); j++; } return brevity_penalty(hyp_len, ref_len) * sum; } + /* * approx. bleu * @@ -160,6 +161,38 @@ ApproxBleuScorer::Score(vector& hyp, vector& ref, return (score_t)glob_src_len_ * score; } +/* + * Linear (Corpus) Bleu + * + * as in "Lattice Minimum Bayes-Risk Decoding + * for Statistical Machine Translation" + * (Tromble et al. '08) + * + */ +score_t +LinearBleuScorer::Score(vector& hyp, vector& ref, + const unsigned rank, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (ref_len == 0) return 0.; + unsigned M = N_; + if (ref_len < N_) M = ref_len; + NgramCounts counts(M); + if (hyp_len > 0) + counts = make_ngram_counts(hyp, ref, M); + score_t ret = 0.; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break; + ret += counts.sum_[i]/onebest_counts_.sum_[i]; + } + ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret; + if (rank == 0) { + onebest_len_ += hyp_len; + onebest_counts_ += counts; + } + return ret; +} + } // namespace diff --git a/dtrain/score.h b/dtrain/score.h index d4fba22c..c5be2829 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -20,7 +20,7 @@ struct NgramCounts inline void operator+=(const NgramCounts& rhs) { - assert(N_ == rhs.N_); + if (rhs.N_ > N_) Resize(rhs.N_); for (unsigned i = 0; i < N_; i++) { this->clipped_[i] += rhs.clipped_.find(i)->second; this->sum_[i] += rhs.sum_.find(i)->second; @@ -59,13 +59,21 @@ struct NgramCounts inline void Zero() { - unsigned i; - for (i = 0; i < N_; i++) { + for (unsigned i = 0; i < N_; i++) { clipped_[i] = 0.; sum_[i] = 0.; } } + inline void + One() + { + for (unsigned i = 0; i < N_; i++) { + clipped_[i] = 1.; + sum_[i] = 1.; + } + } + inline void Print() { @@ -74,6 +82,23 @@ struct NgramCounts cout << i+1 << "grams:\t\t\t" << sum_[i] << endl; } } + + inline void Resize(unsigned N) + { + if (N == N_) return; + else if (N > N_) { + for (unsigned i = N_; i < N; i++) { + clipped_[i] = 0.; + sum_[i] = 0.; + } + } else { // N < N_ + for (unsigned i = N_-1; i > N-1; i--) { + clipped_.erase(i); + sum_.erase(i); + } + } + N_ = N; + } }; typedef map, unsigned> Ngrams; @@ -152,6 +177,24 @@ struct ApproxBleuScorer : public BleuScorer score_t Score(vector& hyp, vector& ref, const unsigned rank, const unsigned src_len); }; +struct LinearBleuScorer : public BleuScorer +{ + unsigned onebest_len_; + NgramCounts onebest_counts_; + + LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N) + { + onebest_counts_.One(); + } + + score_t Score(vector& hyp, vector& ref, const unsigned rank, const unsigned /*src_len*/); + + inline void Reset() { + onebest_len_ = 1; + onebest_counts_.One(); + } +}; + } // namespace -- cgit v1.2.3 From d80b98fa834698b9aa660249ea439670a45ab64e Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 13 Jun 2012 14:52:23 +0200 Subject: max pairs arg --- dtrain/dtrain.cc | 2 +- dtrain/pairsampling.h | 15 +++++++++++++++ dtrain/score.cc | 6 ++++-- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'dtrain') diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 88413a1d..7f44d4cf 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -261,7 +261,7 @@ main(int argc, char** argv) cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as() << "'" << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; - cerr << "max pairs " << max_pairs << endl; + cerr << setw(25) << "max pairs " << max_pairs << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; #ifdef DTRAIN_LOCAL diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h index 71c8ae59..84be1efb 100644 --- a/dtrain/pairsampling.h +++ b/dtrain/pairsampling.h @@ -23,6 +23,8 @@ all_pairs(vector* s, vector >& training, sc { sort(s->begin(), s->end(), cmp_hyp_by_score_d); unsigned sz = s->size(); + bool b = false; + unsigned count = 0; for (unsigned i = 0; i < sz-1; i++) { for (unsigned j = i+1; j < sz; j++) { if (threshold > 0) { @@ -32,7 +34,12 @@ all_pairs(vector* s, vector >& training, sc if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } + if (++count == max) { + b = true; + break; + } } + if (b) break; } } @@ -53,6 +60,8 @@ partXYX(vector* s, vector >& training, scor unsigned sep_hi = sep; if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; else sep_hi = 1; + bool b = false; + unsigned count = 0; for (unsigned i = 0; i < sep_hi; i++) { for (unsigned j = sep_hi; j < sz; j++) { #ifdef DTRAIN_FASTER_PERCEPTRON @@ -65,10 +74,15 @@ partXYX(vector* s, vector >& training, scor if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } + if (++count == max) { + b = true; + break; + } #ifdef DTRAIN_FASTER_PERCEPTRON } #endif } + if (b) break; } unsigned sep_lo = sz-sep; while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; @@ -84,6 +98,7 @@ partXYX(vector* s, vector >& training, scor if ((*s)[i].score != (*s)[j].score) training.push_back(make_pair((*s)[i], (*s)[j])); } + if (++count == max) return; #ifdef DTRAIN_FASTER_PERCEPTRON } #endif diff --git a/dtrain/score.cc b/dtrain/score.cc index 5c356c0f..5bb0bcaa 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -128,6 +128,7 @@ SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, return brevity_penalty(hyp_len, ref_len) * sum; } +// TODO single variants! /* * approx. bleu @@ -136,7 +137,8 @@ SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, * and Structural Translation Features" * (Chiang et al. '08) * - * NOTE: needs some more code in dtrain.cc + * NOTE: Needs some more code in dtrain.cc . + * No scaling by src len. */ score_t ApproxBleuScorer::Score(vector& hyp, vector& ref, @@ -158,7 +160,7 @@ ApproxBleuScorer::Score(vector& hyp, vector& ref, glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len); glob_src_len_ = discount_ * (glob_src_len_ + src_len); } - return (score_t)glob_src_len_ * score; + return score; } /* -- cgit v1.2.3 From 1d1172a3f85ab7423ae9537cf3c73afdfe7dc693 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 14 Jun 2012 20:11:57 +0200 Subject: 'sum bleu' --- dtrain/README.md | 2 ++ dtrain/dtrain.cc | 8 ++++++-- dtrain/dtrain.h | 2 +- dtrain/score.cc | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- dtrain/score.h | 12 ++++++++++- 5 files changed, 78 insertions(+), 8 deletions(-) (limited to 'dtrain') diff --git a/dtrain/README.md b/dtrain/README.md index 7aefcc55..843874fa 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -58,6 +58,8 @@ Next + bigger LM, feats (target side Ng., word alignments etc.) + merge kbest lists + proper eval, pairwise ranking, forced transl ++ smmoth bleu variants X ++ MMERT exp Legal ----- diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 7f44d4cf..eea58393 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -165,8 +165,12 @@ main(int argc, char** argv) scorer = dynamic_cast(new StupidBleuScorer); } else if (scorer_str == "smooth_bleu") { scorer = dynamic_cast(new SmoothBleuScorer); - } else if (scorer_str == "smooth_single_bleu") { - scorer = dynamic_cast(new SmoothSingleBleuScorer); + } else if (scorer_str == "sum_bleu") { + scorer = dynamic_cast(new SumBleuScorer); + } else if (scorer_str == "sumexp_bleu") { + scorer = dynamic_cast(new SumExpBleuScorer); + } else if (scorer_str == "sumwhatever_bleu") { + scorer = dynamic_cast(new SumWhateverBleuScorer); } else if (scorer_str == "approx_bleu") { scorer = dynamic_cast(new ApproxBleuScorer(N, approx_bleu_d)); } else if (scorer_str == "lc_bleu") { diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index d8dc14b6..7e084a79 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -3,7 +3,7 @@ #undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs // DO NOT USE WITH SVM! -#undef DTRAIN_LOCAL +#define DTRAIN_LOCAL #define DTRAIN_DOTS 10 // after how many inputs to display a '.' #define DTRAIN_GRAMMAR_DELIM "########EOS########" #define DTRAIN_SCALE 100000 diff --git a/dtrain/score.cc b/dtrain/score.cc index 5bb0bcaa..4a7cac6e 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -80,7 +80,7 @@ StupidBleuScorer::Score(vector& hyp, vector& ref, * to Machine Translation" * (Liang et al. '06) * - * NOTE: max is 0.9375 + * NOTE: max is 0.9375 (with N=4) */ score_t SmoothBleuScorer::Score(vector& hyp, vector& ref, @@ -108,9 +108,13 @@ SmoothBleuScorer::Score(vector& hyp, vector& ref, return brevity_penalty(hyp_len, ref_len) * sum; } -// variant of smooth_bleu; i-Bleu scores only single 'i' +/* + * 'sum' bleu + * + * sum up Ngram precisions + */ score_t -SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, +SumBleuScorer::Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/) { unsigned hyp_len = hyp.size(), ref_len = ref.size(); @@ -128,7 +132,57 @@ SmoothSingleBleuScorer::Score(vector& hyp, vector& ref, return brevity_penalty(hyp_len, ref_len) * sum; } -// TODO single variants! +/* + * 'sum' (exp) bleu + * + * sum up exp(Ngram precisions) + */ +score_t +SumExpBleuScorer::Score(vector& hyp, vector& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += exp(((score_t)counts.clipped_[i]/counts.sum_[i]))/pow(2., N_-j+1); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' (whatever) bleu + * + * sum up exp(weight * log(Ngram precisions)) + */ +score_t +SumWhateverBleuScorer::Score(vector& hyp, vector& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + vector v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += exp(v[i] * log(((score_t)counts.clipped_[i]/counts.sum_[i])))/pow(2., N_-j+1); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} /* * approx. bleu diff --git a/dtrain/score.h b/dtrain/score.h index c5be2829..f317c903 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -153,7 +153,17 @@ struct SmoothBleuScorer : public LocalScorer score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); }; -struct SmoothSingleBleuScorer : public LocalScorer +struct SumBleuScorer : public LocalScorer +{ + score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumExpBleuScorer : public LocalScorer +{ + score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumWhateverBleuScorer : public LocalScorer { score_t Score(vector& hyp, vector& ref, const unsigned /*rank*/, const unsigned /*src_len*/); }; -- cgit v1.2.3 From ee1520c5095ea8648617a3658b20eedfd4dd2007 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 18 Jun 2012 17:26:33 +0200 Subject: extract_rules cdec param --- .gitignore | 1 + decoder/decoder.cc | 22 +++++++++++++++------- decoder/viterbi.cc | 12 +++++++++++- decoder/viterbi.h | 5 ++++- dtrain/README.md | 22 ---------------------- 5 files changed, 31 insertions(+), 31 deletions(-) (limited to 'dtrain') diff --git a/.gitignore b/.gitignore index 27c6a739..943e6dc5 100644 --- a/.gitignore +++ b/.gitignore @@ -128,6 +128,7 @@ decoder/rule_lexer.cc training/atools training/collapse_weights training/lbfgs_test +training/libtraining.a training/mr_optimize_reduce training/mr_em_adapted_reduce training/mr_em_map_adapter diff --git a/decoder/decoder.cc b/decoder/decoder.cc index cbb97a0d..333f0fb6 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -3,6 +3,7 @@ #include #include #include +#include #include "program_options.h" #include "stringlib.h" @@ -187,8 +188,8 @@ struct DecoderImpl { } void SetId(int next_sent_id) { sent_id = next_sent_id - 1; } - void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_deriv=false) { - cerr << viterbi_stats(forest,name,true,show_tree,show_deriv); + void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_deriv=false, bool extract_rules=false, boost::shared_ptr extract_file = boost::make_shared()) { + cerr << viterbi_stats(forest,name,true,show_tree,show_deriv,extract_rules, extract_file); cerr << endl; } @@ -424,7 +425,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("tagger_tagset,t", po::value(), "(Tagger) file containing tag set") ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") - ("extract_rules", po::value(), "Extract the rules used in translation (de-duped) to this file") + ("extract_rules", po::value(), "Extract the rules used in translation (not de-duped!) to a file in this directory") ("show_derivations", po::value(), "Directory to print the derivation structures to") ("graphviz","Show (constrained) translation forest in GraphViz format") ("max_translation_beam,x", po::value(), "Beam approximation to get max translation from the chart") @@ -570,6 +571,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream // cube pruning pop-limit: we may want to configure this on a per-pass basis pop_limit = conf["cubepruning_pop_limit"].as(); + if (conf.count("extract_rules")) { + if (!DirectoryExists(conf["extract_rules"].as())) + MkDirP(conf["extract_rules"].as()); + } + // determine the number of rescoring/pruning/weighting passes configured const int MAX_PASSES = 3; for (int pass = 0; pass < MAX_PASSES; ++pass) { @@ -712,9 +718,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream cfg_options.Validate(); #endif - if (conf.count("extract_rules")) - extract_file.reset(new WriteFile(str("extract_rules",conf))); - + if (conf.count("extract_rules")) { + stringstream ss; + ss << sent_id; + extract_file.reset(new WriteFile(str("extract_rules",conf)+"/"+ss.str())); + } combine_size = conf["combine_size"].as(); if (combine_size < 1) combine_size = 1; sent_id = -1; @@ -851,7 +859,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { #endif forest.swap(rescored_forest); forest.Reweight(cur_weights); - if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,oracle.show_derivation); + if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,oracle.show_derivation, conf.count("extract_rules"), extract_file); } if (conf.count("show_partition")) { diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc index 9d19914b..1b9c6665 100644 --- a/decoder/viterbi.cc +++ b/decoder/viterbi.cc @@ -5,11 +5,12 @@ #include #include "hg.h" + //#define DEBUG_VITERBI_SORT using namespace std; -std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool estring, bool etree,bool show_derivation) +std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool estring, bool etree,bool show_derivation, bool extract_rules, boost::shared_ptr extract_file) { ostringstream o; o << hg.stats(name); @@ -22,6 +23,9 @@ std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool es if (etree) { o<stream()); + } if (show_derivation) { o< edges; + Viterbi(hg, &edges); + for (unsigned i = 0; i < edges.size(); i++) + (*o) << edges[i]->rule_->AsString(true) << endl; +} string ViterbiETree(const Hypergraph& hg) { vector tmp; diff --git a/decoder/viterbi.h b/decoder/viterbi.h index 3092f6da..03e961a2 100644 --- a/decoder/viterbi.h +++ b/decoder/viterbi.h @@ -5,8 +5,10 @@ #include "prob.h" #include "hg.h" #include "tdict.h" +#include "filelib.h" +#include -std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest", bool estring=true, bool etree=false, bool derivation_tree=false); +std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest", bool estring=true, bool etree=false, bool derivation_tree=false, bool extract_rules=false, boost::shared_ptr extract_file = boost::make_shared()); /// computes for each hg node the best (according to WeightType/WeightFunction) derivation, and some homomorphism (bottom up expression tree applied through Traversal) of it. T is the "return type" of Traversal, which is called only once for the best edge for a node's result (i.e. result will start default constructed) //TODO: make T a typename inside Traversal and WeightType a typename inside WeightFunction? @@ -201,6 +203,7 @@ struct FeatureVectorTraversal { std::string JoshuaVisualizationString(const Hypergraph& hg); prob_t ViterbiESentence(const Hypergraph& hg, std::vector* result); std::string ViterbiETree(const Hypergraph& hg); +void ViterbiRules(const Hypergraph& hg, std::ostream* s); prob_t ViterbiFSentence(const Hypergraph& hg, std::vector* result); std::string ViterbiFTree(const Hypergraph& hg); int ViterbiELength(const Hypergraph& hg); diff --git a/dtrain/README.md b/dtrain/README.md index 843874fa..7edabbf1 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -39,28 +39,6 @@ For an example of local usage (with the 'distributed' format) the see test/example/ . This expects dtrain to be built without DTRAIN_LOCAL. -Next ----- -+ approx. Bleu? proper lc_bleu (init with X) -+ turn off inclusion -+ (dtrain|decoder) more meta-parameters testing -+ feature selection directly in dtrain -+ feature template: target side rule ngrams -+ sa-extract -> leave-one-out for grammar of training set? -+ make svm doable; no subgradient? -+ reranking while sgd? -+ try PRO, mira emulations -+ sampling (MBR) -+ forest (on train)? -+ best BLEU transl (Sokolov)? -+ entire reg. path -+ resharding [nfold cross val.] -+ bigger LM, feats (target side Ng., word alignments etc.) -+ merge kbest lists -+ proper eval, pairwise ranking, forced transl -+ smmoth bleu variants X -+ MMERT exp - Legal ----- Copyright (c) 2012 by Patrick Simianer -- cgit v1.2.3