summary | refs | log | tree | commit | diff
path: root/dtrain
diff options
context:
space:
mode:
author Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-04-27 01:54:47 +0200
committer Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-04-27 01:54:47 +0200
commit 0ac66e310d57f9aea5ddeea900c84df08abfe8c2 (patch)
tree 1d428ccbe1c63c90499e09e89d314f74fff11047 /dtrain
parent 01110e92e7429df7882879e026b28aa9c89c724d (diff)
fix approx. BLEU of (Chiang et al. '08)
Diffstat (limited to 'dtrain')
-rw-r--r-- dtrain/Makefile.am 2
-rw-r--r-- dtrain/README.md 3
-rw-r--r-- dtrain/dtrain.cc 11
-rw-r--r-- dtrain/hstreaming/dtrain.ini 2
-rw-r--r-- dtrain/kbestget.h 12
-rw-r--r-- dtrain/ksampler.h 6
-rw-r--r-- dtrain/pairsampling.h 2
-rw-r--r-- dtrain/score.cc 58
-rw-r--r-- dtrain/score.h 52
-rw-r--r-- dtrain/test/example/dtrain.ini 6
10 files changed, 87 insertions, 67 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 64fef489..f39d161e 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
dtrain_SOURCES = dtrain.cc score.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README.md b/dtrain/README.md
index 2a24ec22..92d6ba0d 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -3,7 +3,8 @@ which is able to train the weights of very many (sparse) features.
It was used here:
"Joint Feature Selection in Distributed Stochastic
Learning for Large-Scale Discriminative Training in
- SMT" Simianer, Riezler, Dyer; ACL 2012
+ SMT"
+(Simianer, Riezler, Dyer; ACL 2012)
Building
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index ea5b8835..3dee10f2 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -32,7 +32,7 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
("inc_correct", po::value<bool>()->zero_tokens(), "include correctly ranked pairs into updates")
("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent of features after each epoch")
- ("approx_bleu_scale", po::value<score_t>()->default_value(0.9), "scaling for approx. BLEU")
+ ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
#ifdef DTRAIN_LOCAL
("refs,r", po::value<string>(), "references in local mode")
#endif
@@ -136,6 +136,7 @@ main(int argc, char** argv)
const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();
const string select_weights = cfg["select_weights"].as<string>();
const float hi_lo = cfg["hi_lo"].as<float>();
+ const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
bool average = false;
if (select_weights == "avg")
average = true;
@@ -161,7 +162,7 @@ main(int argc, char** argv)
} else if (scorer_str == "smooth_bleu") {
scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
} else if (scorer_str == "approx_bleu") {
- scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N));
+ scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
} else {
cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
exit(1);
@@ -235,6 +236,8 @@ main(int argc, char** argv)
cerr << setw(25) << "N " << N << endl;
cerr << setw(25) << "T " << T << endl;
cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
+ if (scorer_str == "approx_bleu")
+ cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl;
cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
if (sample_from == "kbest")
cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
@@ -242,7 +245,7 @@ main(int argc, char** argv)
cerr << setw(25) << "gamma " << gamma << endl;
cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
if (pair_sampling == "XYX")
- cerr << setw(25) << "hi lo " << "'" << hi_lo << "'" << endl;
+ cerr << setw(25) << "hi lo " << hi_lo << endl;
cerr << setw(25) << "pair threshold " << pair_threshold << endl;
cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
if (cfg.count("l1_reg"))
@@ -261,7 +264,7 @@ main(int argc, char** argv)
cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
if (cfg.count("stop-after"))
cerr << setw(25) << "stop_after " << stop_after << endl;
- if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
+ if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
}
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 05535299..a2c219a1 100644
--- a/dtrain/hstreaming/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
@@ -10,6 +10,6 @@ gamma=0
scorer=stupid_bleu
sample_from=kbest
filter=uniq
-pair_sampling=108010
+pair_sampling=XYX
pair_threshold=0
select_weights=last
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index bcd82610..77d4a139 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -2,6 +2,8 @@
#define _DTRAIN_KBESTGET_H_
#include "kbest.h" // cdec
+#include "sentence_metadata.h"
+
#include "verbose.h"
#include "viterbi.h"
#include "ff_register.h"
@@ -32,7 +34,7 @@ struct LocalScorer
vector<score_t> w_;
virtual score_t
- Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank)=0;
+ Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
void Reset() {} // only for approx bleu
@@ -71,13 +73,15 @@ struct KBestGetter : public HypSampler
const unsigned k_;
const string filter_type_;
vector<ScoredHyp> s_;
+ unsigned src_len_;
KBestGetter(const unsigned k, const string filter_type) :
k_(k), filter_type_(filter_type) {}
virtual void
- NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
{
+ src_len_ = smeta.GetSourceLength();
KBestScored(*hg);
}
@@ -109,7 +113,7 @@ struct KBestGetter : public HypSampler
h.f = d->feature_values;
h.model = log(d->score);
h.rank = i;
- h.score = scorer_->Score(h.w, *ref_, i);
+ h.score = scorer_->Score(h.w, *ref_, i, src_len_);
s_.push_back(h);
}
}
@@ -128,7 +132,7 @@ struct KBestGetter : public HypSampler
h.f = d->feature_values;
h.model = log(d->score);
h.rank = i;
- h.score = scorer_->Score(h.w, *ref_, i);
+ h.score = scorer_->Score(h.w, *ref_, i, src_len_);
s_.push_back(h);
}
}
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index eb4813ab..0783f98b 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -15,13 +15,15 @@ struct KSampler : public HypSampler
vector<ScoredHyp> s_;
MT19937* prng_;
score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+ unsigned src_len_;
explicit KSampler(const unsigned k, MT19937* prng) :
k_(k), prng_(prng) {}
virtual void
- NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
{
+ src_len_ = smeta.GetSourceLength();
ScoredSamples(*hg);
}
@@ -37,7 +39,7 @@ struct KSampler : public HypSampler
h.f = samples[i].fmap;
h.model = log(samples[i].model_score);
h.rank = i;
- h.score = scorer_->Score(h.w, *ref_, i);
+ h.score = scorer_->Score(h.w, *ref_, i, src_len_);
s_.push_back(h);
}
}
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 66ca1706..56702b86 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -44,7 +44,7 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
{
sort(s->begin(), s->end(), _XYX_cmp_hyp_by_score);
unsigned sz = s->size();
- unsigned sep = sz * hi_lo;
+ unsigned sep = round(sz*hi_lo);
for (unsigned i = 0; i < sep; i++) {
for (unsigned j = sep; j < sz; j++) {
if ((*s)[i].rank < (*s)[j].rank) {
diff --git a/dtrain/score.cc b/dtrain/score.cc
index d964b4da..d0f9e8a0 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -16,23 +16,23 @@ namespace dtrain
score_t
BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
{
- if (hyp_len == 0 || ref_len == 0) return 0;
+ if (hyp_len == 0 || ref_len == 0) return 0.;
unsigned M = N_;
if (ref_len < N_) M = ref_len;
score_t sum = 0;
for (unsigned i = 0; i < M; i++) {
- if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
- sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.;
+ sum += w_[i] * log((score_t)counts.clipped_[i]/counts.sum_[i]);
}
return brevity_penalty(hyp_len, ref_len) * exp(sum);
}
score_t
BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned /*rank*/)
+ const unsigned /*rank*/, const unsigned /*src_len*/)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
- if (hyp_len == 0 || ref_len == 0) return 0;
+ if (hyp_len == 0 || ref_len == 0) return 0.;
NgramCounts counts = make_ngram_counts(hyp, ref, N_);
return Bleu(counts, hyp_len, ref_len);
}
@@ -49,18 +49,18 @@ BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
*/
score_t
StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned /*rank*/)
+ const unsigned /*rank*/, const unsigned /*src_len*/)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
- if (hyp_len == 0 || ref_len == 0) return 0;
+ if (hyp_len == 0 || ref_len == 0) return 0.;
NgramCounts counts = make_ngram_counts(hyp, ref, N_);
unsigned M = N_;
if (ref_len < N_) M = ref_len;
score_t sum = 0, add = 0;
for (unsigned i = 0; i < M; i++) {
- if (i == 0 && (counts.clipped[i] == 0 || counts.sum[i] == 0)) return 0;
+ if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
if (i == 1) add = 1;
- sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));
+ sum += w_[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
}
return brevity_penalty(hyp_len, ref_len) * exp(sum);
}
@@ -76,10 +76,10 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
*/
score_t
SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned /*rank*/)
+ const unsigned /*rank*/, const unsigned /*src_len*/)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
- if (hyp_len == 0 || ref_len == 0) return 0;
+ if (hyp_len == 0 || ref_len == 0) return 0.;
NgramCounts counts = make_ngram_counts(hyp, ref, N_);
unsigned M = N_;
if (ref_len < N_) M = ref_len;
@@ -87,10 +87,10 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
vector<score_t> i_bleu;
for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.);
for (unsigned i = 0; i < M; i++) {
- if (counts.clipped[i] == 0 || counts.sum[i] == 0) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) {
break;
} else {
- score_t i_ng = log((score_t)counts.clipped[i]/counts.sum[i]);
+ score_t i_ng = log((score_t)counts.clipped_[i]/counts.sum_[i]);
for (unsigned j = i; j < M; j++) {
i_bleu[j] += (1/((score_t)j+1)) * i_ng;
}
@@ -107,29 +107,29 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
* and Structural Translation Features"
* (Chiang et al. '08)
*
- * NOTE: needs some code in dtrain.cc
+ * NOTE: needs some more code in dtrain.cc
*/
score_t
ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned rank)
+ const unsigned rank, const unsigned src_len)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
- if (hyp_len == 0 || ref_len == 0) return 0;
- NgramCounts counts = make_ngram_counts(hyp, ref, N_);
- NgramCounts tmp(N_);
+ if (ref_len == 0) return 0.;
+ score_t score = 0.;
+ NgramCounts counts(N_);
+ if (hyp_len > 0) {
+ counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts tmp = glob_onebest_counts_ + counts;
+ score = Bleu(tmp, hyp_len, ref_len);
+ }
if (rank == 0) { // 'context of 1best translations'
- glob_onebest_counts += counts;
- glob_hyp_len += hyp_len;
- glob_ref_len += ref_len;
- hyp_len = glob_hyp_len;
- ref_len = glob_ref_len;
- tmp = glob_onebest_counts;
- } else {
- hyp_len = hyp.size();
- ref_len = ref.size();
- tmp = glob_onebest_counts + counts;
+ glob_onebest_counts_ += counts;
+ glob_onebest_counts_ *= discount_;
+ glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_len);
+ glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len);
+ glob_src_len_ = discount_ * (glob_src_len_ + src_len);
}
- return 0.9 * Bleu(tmp, hyp_len, ref_len); // TODO param
+ return (score_t)glob_src_len_ * score;
}
diff --git a/dtrain/score.h b/dtrain/score.h
index 5aceb81f..d0e79f65 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -12,8 +12,8 @@ namespace dtrain
struct NgramCounts
{
unsigned N_;
- map<unsigned, unsigned> clipped;
- map<unsigned, unsigned> sum;
+ map<unsigned, score_t> clipped_;
+ map<unsigned, score_t> sum_;
NgramCounts(const unsigned N) : N_(N) { Zero(); }
@@ -22,8 +22,8 @@ struct NgramCounts
{
assert(N_ == rhs.N_);
for (unsigned i = 0; i < N_; i++) {
- this->clipped[i] += rhs.clipped.find(i)->second;
- this->sum[i] += rhs.sum.find(i)->second;
+ this->clipped_[i] += rhs.clipped_.find(i)->second;
+ this->sum_[i] += rhs.sum_.find(i)->second;
}
}
@@ -36,15 +36,24 @@ struct NgramCounts
}
inline void
+ operator*=(const score_t rhs)
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ this->clipped_[i] *= rhs;
+ this->sum_[i] *= rhs;
+ }
+ }
+
+ inline void
Add(const unsigned count, const unsigned ref_count, const unsigned i)
{
assert(i < N_);
if (count > ref_count) {
- clipped[i] += ref_count;
+ clipped_[i] += ref_count;
} else {
- clipped[i] += count;
+ clipped_[i] += count;
}
- sum[i] += count;
+ sum_[i] += count;
}
inline void
@@ -52,8 +61,8 @@ struct NgramCounts
{
unsigned i;
for (i = 0; i < N_; i++) {
- clipped[i] = 0;
- sum[i] = 0;
+ clipped_[i] = 0;
+ sum_[i] = 0;
}
}
@@ -61,8 +70,8 @@ struct NgramCounts
Print()
{
for (unsigned i = 0; i < N_; i++) {
- cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
- cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
+ cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
+ cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
}
}
};
@@ -106,35 +115,36 @@ make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const un
struct BleuScorer : public LocalScorer
{
score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
- score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
};
struct StupidBleuScorer : public LocalScorer
{
- score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
};
struct SmoothBleuScorer : public LocalScorer
{
- score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
};
struct ApproxBleuScorer : public BleuScorer
{
- NgramCounts glob_onebest_counts;
- unsigned glob_hyp_len, glob_ref_len;
+ NgramCounts glob_onebest_counts_;
+ unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_;
+ score_t discount_;
- ApproxBleuScorer(unsigned N) : glob_onebest_counts(NgramCounts(N))
+ ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d)
{
- glob_hyp_len = glob_ref_len = 0;
+ glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0;
}
inline void Reset() {
- glob_onebest_counts.Zero();
- glob_hyp_len = glob_ref_len = 0;
+ glob_onebest_counts_.Zero();
+ glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
}
- score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
};
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index cd2c75e7..2ad44688 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -4,18 +4,18 @@ decoder_config=test/example/cdec.ini # config for cdec
# weights for these features will be printed on each iteration
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
tmp=/tmp
-stop_after=100 # stop epoch after 100 inputs
+stop_after=20 # stop epoch after 20 inputs
# interesting stuff
epochs=3 # run over input 3 times
k=100 # use 100best lists
N=4 # optimize (approx) BLEU4
-scorer=approx_bleu # use 'stupid' BLEU+1
+scorer=stupid_bleu # use 'stupid' BLEU+1
learning_rate=0.0001 # learning rate
gamma=0 # use SVM reg
sample_from=kbest # use kbest lists (as opposed to forest)
filter=uniq # only unique entries in kbest (surface form)
pair_sampling=XYX
-hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10
+hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here
pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0)
select_weights=VOID # don't output weights