author    Patrick Simianer <simianer@cl.uni-heidelberg.de>  2012-05-15 00:44:03 +0200
committer Patrick Simianer <simianer@cl.uni-heidelberg.de>  2012-05-15 00:44:03 +0200
commit    78a0ee61c2d2d846306b60a8ac862a2d649bcf59 (patch)
tree      7fe54c5f63c360a6d43428a57386ac934394dc51 /dtrain
parent    fb42639b433d2fc0c68f300666be1192dc3b4f59 (diff)
loss margin cfg, XYX improved, smooth bleu variant
Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dtrain.cc                     |  16
-rw-r--r--  dtrain/ksampler.h                    |   7
-rw-r--r--  dtrain/pairsampling.h                |  16
-rw-r--r--  dtrain/score.cc                      |  22
-rw-r--r--  dtrain/score.h                       |   5
-rw-r--r--  dtrain/test/example/dtrain.ini       |   3
-rw-r--r--  dtrain/test/example/expected-output  | 143
7 files changed, 129 insertions, 83 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 864eb153..717d47a2 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -33,6 +33,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch")
("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
+ ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
#ifdef DTRAIN_LOCAL
("refs,r", po::value<string>(), "references in local mode")
#endif
@@ -134,6 +135,8 @@ main(int argc, char** argv)
const string select_weights = cfg["select_weights"].as<string>();
const float hi_lo = cfg["hi_lo"].as<float>();
const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+ weight_t loss_margin = cfg["loss_margin"].as<weight_t>();
+ if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max();
bool scale_bleu_diff = false;
if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
bool average = false;
@@ -160,6 +163,8 @@ main(int argc, char** argv)
scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
} else if (scorer_str == "smooth_bleu") {
scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+ } else if (scorer_str == "smooth_single_bleu") {
+ scorer = dynamic_cast<SmoothSingleBleuScorer*>(new SmoothSingleBleuScorer);
} else if (scorer_str == "approx_bleu") {
scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
} else {
@@ -220,7 +225,7 @@ main(int argc, char** argv)
grammar_buf_out.open(grammar_buf_fn.c_str());
#endif
- unsigned in_sz = UINT_MAX; // input index, input size
+ unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
vector<pair<score_t, score_t> > all_scores;
score_t max_score = 0.;
unsigned best_it = 0;
@@ -242,6 +247,7 @@ main(int argc, char** argv)
if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
cerr << setw(25) << "gamma " << gamma << endl;
+ cerr << setw(25) << "loss margin " << loss_margin << endl;
cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
if (pair_sampling == "XYX")
cerr << setw(25) << "hi lo " << hi_lo << endl;
@@ -427,15 +433,15 @@ main(int argc, char** argv)
#ifdef DTRAIN_FASTER_PERCEPTRON
bool rank_error = true; // pair filtering already did this for us
rank_errors++;
- score_t margin = 2.; // compiler, could you get rid of the margin?
+ score_t margin = std::numeric_limits<float>::max();
#else
bool rank_error = it->first.model <= it->second.model;
if (rank_error) rank_errors++;
- score_t margin = fabs(it->first.model - it->second.model);
- if (!rank_error && margin < 1.) margin_violations++;
+ score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model));
+ if (!rank_error && margin < loss_margin) margin_violations++;
#endif
if (scale_bleu_diff) eta = it->first.score - it->second.score;
- if (rank_error || (gamma && margin<1.)) {
+ if (rank_error || margin < loss_margin) {
SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
lambdas.plus_eq_v_times_s(diff_vec, eta);
if (gamma)
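
Taken together, the dtrain.cc changes replace the hard-coded margin of 1 with the configurable loss_margin: a pair now triggers a perceptron update either when it is misranked or when it is ranked correctly but separated by less than loss_margin in model score (values above 9998 are mapped to FLT_MAX, so every pair updates, matching the DTRAIN_FASTER_PERCEPTRON path). A minimal sketch of the update rule, with simplified stand-in types rather than dtrain's ScoredHyp and SparseVector<weight_t>:

    #include <cmath>
    #include <map>
    #include <string>

    typedef std::map<std::string, double> FeatVec;  // stand-in for SparseVector<weight_t>

    struct Hyp { double model; FeatVec f; };        // model score + features

    // Update w by eta*(hi.f - lo.f) if hi (the hypothesis with the better
    // gold score) is misranked, or ranked correctly but by less than
    // loss_margin; the hunk above measures the gap between absolute model
    // scores, which is mirrored here.
    void margin_update(FeatVec& w, const Hyp& hi, const Hyp& lo,
                       double eta, double loss_margin)
    {
      bool rank_error = hi.model <= lo.model;
      double margin = std::fabs(std::fabs(hi.model) - std::fabs(lo.model));
      if (rank_error || margin < loss_margin) {
        for (FeatVec::const_iterator it = hi.f.begin(); it != hi.f.end(); ++it)
          w[it->first] += eta * it->second;
        for (FeatVec::const_iterator it = lo.f.begin(); it != lo.f.end(); ++it)
          w[it->first] -= eta * it->second;
      }
    }

With scale_bleu_diff enabled, eta would be the BLEU difference of the pair rather than a fixed learning rate.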
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index f52fb649..bc2f56cd 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -8,6 +8,11 @@
namespace dtrain
{
+bool
+cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b)
+{
+ return a.model > b.model;
+}
struct KSampler : public HypSampler
{
@@ -44,6 +49,8 @@ struct KSampler : public HypSampler
sz_++;
f_count_ += h.f.size();
}
+ sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
+ for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
}
};
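
The ksampler.h addition makes a random sample behave like a k-best list downstream: hypotheses are sorted by descending model score and their positions stored as ranks, so rank-based filtering and pair sampling see a consistent ordering. A self-contained sketch with a pared-down ScoredHyp:

    #include <algorithm>
    #include <vector>

    struct ScoredHyp { double model; unsigned rank; };  // pared-down

    // descending model score, as in cmp_hyp_by_model_d above
    bool cmp_hyp_by_model_d(const ScoredHyp& a, const ScoredHyp& b)
    {
      return a.model > b.model;
    }

    // sort a sample best-first and record each position as its rank (0 = best)
    void rank_by_model(std::vector<ScoredHyp>& s)
    {
      std::sort(s.begin(), s.end(), cmp_hyp_by_model_d);
      for (unsigned i = 0; i < s.size(); ++i) s[i].rank = i;
    }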
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 5085738e..32006a41 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -46,11 +46,15 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
inline void
partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
{
- sort(s->begin(), s->end(), cmp_hyp_by_score_d);
unsigned sz = s->size();
+ if (sz < 2) return;
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
unsigned sep = round(sz*hi_lo);
- for (unsigned i = 0; i < sep; i++) {
- for (unsigned j = sep; j < sz; j++) {
+ unsigned sep_hi = sep;
+ if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
+ else sep_hi = 1;
+ for (unsigned i = 0; i < sep_hi; i++) {
+ for (unsigned j = sep_hi; j < sz; j++) {
#ifdef DTRAIN_FASTER_PERCEPTRON
if ((*s)[i].model <= (*s)[j].model) {
#endif
@@ -66,8 +70,10 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
#endif
}
}
- for (unsigned i = sep; i < sz-sep; i++) {
- for (unsigned j = sz-sep; j < sz; j++) {
+ unsigned sep_lo = sz-sep;
+ while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
+ for (unsigned i = sep_hi; i < sz-sep_lo; i++) {
+ for (unsigned j = sz-sep_lo; j < sz; j++) {
#ifdef DTRAIN_FASTER_PERCEPTRON
if ((*s)[i].model <= (*s)[j].model) {
#endif
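
The partXYX changes add an early exit for lists with fewer than two hypotheses and make both cuts tie-aware, so hypotheses with identical scores never end up on opposite sides of a partition boundary. As I read the hunk, the boundary logic amounts to the sketch below (simplified types, index bookkeeping cleaned up); the caller then pairs the top slice against everything below it and the middle against the bottom slice:

    #include <cmath>
    #include <vector>

    struct ScoredHyp { double score; };  // pared-down

    // Tie-aware cut points for a list sorted by descending score; hi_lo is
    // the fraction in the top (and bottom) slice, e.g. 0.1. On return,
    // [0, sep_hi) is the top slice and [lo_start, sz) the bottom slice.
    void xyx_bounds(const std::vector<ScoredHyp>& s, float hi_lo,
                    unsigned& sep_hi, unsigned& lo_start)
    {
      unsigned sz = s.size();
      sep_hi = 0; lo_start = sz;          // empty slices by default
      if (sz < 2) return;                 // nothing to pair, as in the hunk
      unsigned sep = (unsigned)round(sz * hi_lo);
      sep_hi = sep;
      if (sz > 4) {                       // push the top cut down past tied scores
        while (sep_hi > 0 && sep_hi < sz && s[sep_hi-1].score == s[sep_hi].score)
          ++sep_hi;
      } else {
        sep_hi = 1;
      }
      lo_start = sz - sep;                // pull the bottom cut up past ties
      while (lo_start > 0 && lo_start < sz && s[lo_start-1].score == s[lo_start].score)
        --lo_start;
    }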
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 7b1f6be4..b331dc4f 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -103,7 +103,27 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
i_bleu[j] += (1/((score_t)j+1)) * i_ng;
}
}
- sum += exp(i_bleu[i])/(pow(2.0, static_cast<double>(N_-i)));
+ sum += exp(i_bleu[i])/(pow(2.0, N_-i));
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+// variant of smooth_bleu; i-Bleu scores only single 'i'
+score_t
+SmoothSingleBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-j+1);
+ j++;
}
return brevity_penalty(hyp_len, ref_len) * sum;
}
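
Where smooth_bleu averages cumulative i-BLEU scores over all orders, the new smooth_single_bleu sums one clipped n-gram precision per order, discounted by a power of two, and stops at the first order with no matches. A self-contained sketch of the scoring rule, assuming clipped[i] and total[i] hold the clipped matches and hypothesis n-gram counts for order i+1 (as dtrain's NgramCounts does) and using the standard BLEU brevity penalty:

    #include <cmath>
    #include <vector>

    double smooth_single_bleu(const std::vector<unsigned>& clipped,
                              const std::vector<unsigned>& total,
                              unsigned N, unsigned hyp_len, unsigned ref_len)
    {
      if (hyp_len == 0 || ref_len == 0) return 0.;
      unsigned M = (ref_len < N) ? ref_len : N;   // cap order at reference length
      double sum = 0.;
      unsigned j = 1;
      for (unsigned i = 0; i < M; ++i) {
        if (total[i] == 0 || clipped[i] == 0) break;  // stop at first empty order
        sum += ((double)clipped[i] / total[i]) / std::pow(2.0, double(N - j + 1));
        ++j;
      }
      double bp = (hyp_len >= ref_len) ? 1.0        // standard brevity penalty
                : std::exp(1.0 - (double)ref_len / hyp_len);
      return bp * sum;
    }

Since j runs in lockstep with i, the discount for order i+1 is 1/2^(N-i): unigram precision gets the smallest weight and the highest scored order the largest.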
diff --git a/dtrain/score.h b/dtrain/score.h
index eb8ad912..d4fba22c 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -128,6 +128,11 @@ struct SmoothBleuScorer : public LocalScorer
score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
};
+struct SmoothSingleBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
struct ApproxBleuScorer : public BleuScorer
{
NgramCounts glob_onebest_counts_;
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index e43d6b34..c8ac7c3f 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec
# weights for these features will be printed on each iteration
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
tmp=/tmp
-stop_after=20 # stop epoch after 20 inputs
+stop_after=10 # stop epoch after 10 inputs
# interesting stuff
epochs=3 # run over input 3 times
@@ -19,3 +19,4 @@ filter=uniq # only unique entries in kbest (surface form)
pair_sampling=XYX
hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here
pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0)
+loss_margin=0
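
With loss_margin=0, updates happen only on genuine rank errors, which is why 'avg # margin viol' stays at 0 throughout the expected output below. To also update on correctly ranked pairs that the model separates too narrowly, set a positive margin; a hypothetical setting:

    loss_margin=1.0  # also update when a correct pair's model-score gap is below 1.0
                     # values > 9998 are mapped to FLT_MAX, i.e. update on every pair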
diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output
index 08733dd4..25d2c069 100644
--- a/dtrain/test/example/expected-output
+++ b/dtrain/test/example/expected-output
@@ -15,7 +15,7 @@ State is 0 bytes for feature RuleNgramFeatures
feature: RuleShape (no config parameters)
Example feature: Shape_S00000_T00000
State is 0 bytes for feature RuleShape
-Seeding random number sequence to 380245307
+Seeding random number sequence to 1072059181
dtrain
Parameters:
@@ -27,6 +27,7 @@ Parameters:
filter 'uniq'
learning rate 0.0001
gamma 0
+ loss margin 0
pairs 'XYX'
hi lo 0.1
pair threshold 0
@@ -35,90 +36,90 @@ Parameters:
cdec cfg 'test/example/cdec.ini'
input 'test/example/nc-wmt11.1k.gz'
output '-'
- stop_after 20
+ stop_after 10
(a dot represents 10 inputs)
Iteration #1 of 3.
- .. 20
-Stopping after 20 input sentences.
+ . 10
+Stopping after 10 input sentences.
WEIGHTS
- Glue = -0.1015
- WordPenalty = -0.0152
- LanguageModel = +0.21493
- LanguageModel_OOV = -0.3257
- PhraseModel_0 = -0.050844
- PhraseModel_1 = +0.25074
- PhraseModel_2 = +0.27944
- PhraseModel_3 = -0.038384
- PhraseModel_4 = -0.12041
- PhraseModel_5 = +0.1047
- PhraseModel_6 = -0.1289
- PassThrough = -0.3094
+ Glue = -0.0293
+ WordPenalty = +0.049075
+ LanguageModel = +0.24345
+ LanguageModel_OOV = -0.2029
+ PhraseModel_0 = +0.0084102
+ PhraseModel_1 = +0.021729
+ PhraseModel_2 = +0.014922
+ PhraseModel_3 = +0.104
+ PhraseModel_4 = -0.14308
+ PhraseModel_5 = +0.0247
+ PhraseModel_6 = -0.012
+ PassThrough = -0.2161
---
- 1best avg score: 0.17508 (+0.17508)
- 1best avg model score: -1.2392 (-1.2392)
- avg # pairs: 1329.8
- avg # rank err: 649.1
- avg # margin viol: 677.5
- non0 feature count: 874
- avg list sz: 88.6
- avg f count: 85.643
-(time 0.25 min, 0.75 s/S)
+ 1best avg score: 0.16872 (+0.16872)
+ 1best avg model score: -1.8276 (-1.8276)
+ avg # pairs: 1121.1
+ avg # rank err: 555.6
+ avg # margin viol: 0
+ non0 feature count: 277
+ avg list sz: 77.2
+ avg f count: 90.96
+(time 0.1 min, 0.6 s/S)
Iteration #2 of 3.
- .. 20
+ . 10
WEIGHTS
- Glue = -0.0792
- WordPenalty = -0.056198
- LanguageModel = +0.31038
- LanguageModel_OOV = -0.4011
- PhraseModel_0 = +0.072188
- PhraseModel_1 = +0.11473
- PhraseModel_2 = +0.049774
- PhraseModel_3 = -0.18448
- PhraseModel_4 = -0.12092
- PhraseModel_5 = +0.1599
- PhraseModel_6 = -0.0606
- PassThrough = -0.3848
+ Glue = -0.3526
+ WordPenalty = +0.067576
+ LanguageModel = +1.155
+ LanguageModel_OOV = -0.2728
+ PhraseModel_0 = -0.025529
+ PhraseModel_1 = +0.095869
+ PhraseModel_2 = +0.094567
+ PhraseModel_3 = +0.12482
+ PhraseModel_4 = -0.36533
+ PhraseModel_5 = +0.1068
+ PhraseModel_6 = -0.1517
+ PassThrough = -0.286
---
- 1best avg score: 0.24015 (+0.065075)
- 1best avg model score: -10.131 (-8.8914)
- avg # pairs: 1324.7
- avg # rank err: 558.65
- avg # margin viol: 752.85
- non0 feature count: 1236
- avg list sz: 84.9
- avg f count: 88.306
-(time 0.22 min, 0.65 s/S)
+ 1best avg score: 0.18394 (+0.015221)
+ 1best avg model score: 3.205 (+5.0326)
+ avg # pairs: 1168.3
+ avg # rank err: 594.8
+ avg # margin viol: 0
+ non0 feature count: 543
+ avg list sz: 77.5
+ avg f count: 85.916
+(time 0.083 min, 0.5 s/S)
Iteration #3 of 3.
- .. 20
+ . 10
WEIGHTS
- Glue = -0.051
- WordPenalty = -0.077956
- LanguageModel = +0.33699
- LanguageModel_OOV = -0.4726
- PhraseModel_0 = +0.040228
- PhraseModel_1 = +0.18
- PhraseModel_2 = +0.15618
- PhraseModel_3 = -0.098908
- PhraseModel_4 = -0.036555
- PhraseModel_5 = +0.1619
- PhraseModel_6 = +0.0078
- PassThrough = -0.4563
+ Glue = -0.392
+ WordPenalty = +0.071963
+ LanguageModel = +0.81266
+ LanguageModel_OOV = -0.4177
+ PhraseModel_0 = -0.2649
+ PhraseModel_1 = -0.17931
+ PhraseModel_2 = +0.038261
+ PhraseModel_3 = +0.20261
+ PhraseModel_4 = -0.42621
+ PhraseModel_5 = +0.3198
+ PhraseModel_6 = -0.1437
+ PassThrough = -0.4309
---
- 1best avg score: 0.25527 (+0.015113)
- 1best avg model score: -13.906 (-3.7756)
- avg # pairs: 1356.3
- avg # rank err: 562.1
- avg # margin viol: 757.35
- non0 feature count: 1482
- avg list sz: 86.65
- avg f count: 87.475
-(time 0.23 min, 0.7 s/S)
+ 1best avg score: 0.2962 (+0.11225)
+ 1best avg model score: -36.274 (-39.479)
+ avg # pairs: 1109.6
+ avg # rank err: 515.9
+ avg # margin viol: 0
+ non0 feature count: 741
+ avg list sz: 77
+ avg f count: 88.982
+(time 0.083 min, 0.5 s/S)
Writing weights file to '-' ...
done
---
-Best iteration: 3 [SCORE 'stupid_bleu'=0.25527].
-This took 0.7 min.
+Best iteration: 3 [SCORE 'stupid_bleu'=0.2962].
+This took 0.26667 min.