summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dtrain/README.md3
-rw-r--r--dtrain/dtrain.cc75
-rw-r--r--dtrain/pairsampling.h6
-rw-r--r--dtrain/score.cc35
-rw-r--r--dtrain/score.h49
5 files changed, 125 insertions, 43 deletions
diff --git a/dtrain/README.md b/dtrain/README.md
index 350c7423..7aefcc55 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -41,7 +41,7 @@ DTRAIN_LOCAL.
Next
----
-+ approx. Bleu?
++ approx. Bleu? proper lc_bleu (init with X)
+ turn off inclusion
+ (dtrain|decoder) more meta-parameters testing
+ feature selection directly in dtrain
@@ -57,6 +57,7 @@ Next
+ resharding [nfold cross val.]
+ bigger LM, feats (target side Ng., word alignments etc.)
+ merge kbest lists
++ proper eval, pairwise ranking, forced transl
Legal
-----
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 717d47a2..88413a1d 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,38 +6,39 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("input", po::value<string>()->default_value("-"), "input file")
- ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
- ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
- ("decoder_config", po::value<string>(), "configuration file for cdec")
- ("print_weights", po::value<string>(), "weights to print on each iteration")
- ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
- ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
- ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
- ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
- ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
- ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
- ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
- ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
- ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'")
- ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5")
- ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs")
- ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
- ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_")
- ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
- ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
- ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
- ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
- ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
- ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
- ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch")
- ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
- ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
- ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
+ ("input", po::value<string>()->default_value("-"), "input file")
+ ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+ ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
+ ("decoder_config", po::value<string>(), "configuration file for cdec")
+ ("print_weights", po::value<string>(), "weights to print on each iteration")
+ ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
+ ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
+ ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
+ ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
+ ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
+ ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
+ ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
+ ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
+ ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'")
+ ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5")
+ ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs")
+ ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
+ ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_")
+ ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
+ ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
+ ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
+ ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
+ ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+ ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
+ ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO
+ ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
+ ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
+ ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
+ ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
#ifdef DTRAIN_LOCAL
- ("refs,r", po::value<string>(), "references in local mode")
+ ("refs,r", po::value<string>(), "references in local mode")
#endif
- ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
+ ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
("config,c", po::value<string>(), "dtrain config file")
@@ -135,6 +136,7 @@ main(int argc, char** argv)
const string select_weights = cfg["select_weights"].as<string>();
const float hi_lo = cfg["hi_lo"].as<float>();
const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+ const unsigned max_pairs = cfg["max_pairs"].as<unsigned>();
weight_t loss_margin = cfg["loss_margin"].as<weight_t>();
if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max();
bool scale_bleu_diff = false;
@@ -167,6 +169,8 @@ main(int argc, char** argv)
scorer = dynamic_cast<SmoothSingleBleuScorer*>(new SmoothSingleBleuScorer);
} else if (scorer_str == "approx_bleu") {
scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
+ } else if (scorer_str == "lc_bleu") {
+ scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N));
} else {
cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
exit(1);
@@ -257,6 +261,7 @@ main(int argc, char** argv)
cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
if (rescale)
cerr << setw(25) << "rescale " << rescale << endl;
+ cerr << "max pairs " << max_pairs << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
#ifdef DTRAIN_LOCAL
@@ -421,17 +426,17 @@ main(int argc, char** argv)
// get pairs
vector<pair<ScoredHyp,ScoredHyp> > pairs;
if (pair_sampling == "all")
- all_pairs(samples, pairs, pair_threshold);
+ all_pairs(samples, pairs, pair_threshold, max_pairs);
if (pair_sampling == "XYX")
- partXYX(samples, pairs, pair_threshold, hi_lo);
+ partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo);
if (pair_sampling == "PRO")
- PROsampling(samples, pairs, pair_threshold);
+ PROsampling(samples, pairs, pair_threshold, max_pairs);
npairs += pairs.size();
for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
it != pairs.end(); it++) {
#ifdef DTRAIN_FASTER_PERCEPTRON
- bool rank_error = true; // pair filtering already did this for us
+ bool rank_error = true; // pair sampling already did this for us
rank_errors++;
score_t margin = std::numeric_limits<float>::max();
#else
@@ -498,7 +503,7 @@ main(int argc, char** argv)
if (average) w_average += lambdas;
- if (scorer_str == "approx_bleu") scorer->Reset();
+ if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset();
if (t == 0) {
in_sz = ii; // remember size of input (# lines)
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 32006a41..71c8ae59 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -19,7 +19,7 @@ cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
}
inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
{
sort(s->begin(), s->end(), cmp_hyp_by_score_d);
unsigned sz = s->size();
@@ -44,7 +44,7 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
*/
inline void
-partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float hi_lo)
{
unsigned sz = s->size();
if (sz < 2) return;
@@ -104,7 +104,7 @@ _PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b
return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
}
inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
{
unsigned max_count = 5000, count = 0, sz = s->size();
bool b = false;
diff --git a/dtrain/score.cc b/dtrain/score.cc
index b331dc4f..5c356c0f 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -122,12 +122,13 @@ SmoothSingleBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
unsigned j = 1;
for (unsigned i = 0; i < M; i++) {
if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
- sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, N_-j+1);
+ sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2., N_-j+1);
j++;
}
return brevity_penalty(hyp_len, ref_len) * sum;
}
+
/*
* approx. bleu
*
@@ -160,6 +161,38 @@ ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
return (score_t)glob_src_len_ * score;
}
+/*
+ * Linear (Corpus) Bleu
+ *
+ * as in "Lattice Minimum Bayes-Risk Decoding
+ * for Statistical Machine Translation"
+ * (Tromble et al. '08)
+ *
+ */
+score_t
+LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned rank, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (ref_len == 0) return 0.;
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ NgramCounts counts(M);
+ if (hyp_len > 0)
+ counts = make_ngram_counts(hyp, ref, M);
+ score_t ret = 0.;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break;
+ ret += counts.sum_[i]/onebest_counts_.sum_[i];
+ }
+ ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret;
+ if (rank == 0) {
+ onebest_len_ += hyp_len;
+ onebest_counts_ += counts;
+ }
+ return ret;
+}
+
} // namespace
diff --git a/dtrain/score.h b/dtrain/score.h
index d4fba22c..c5be2829 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -20,7 +20,7 @@ struct NgramCounts
inline void
operator+=(const NgramCounts& rhs)
{
- assert(N_ == rhs.N_);
+ if (rhs.N_ > N_) Resize(rhs.N_);
for (unsigned i = 0; i < N_; i++) {
this->clipped_[i] += rhs.clipped_.find(i)->second;
this->sum_[i] += rhs.sum_.find(i)->second;
@@ -59,14 +59,22 @@ struct NgramCounts
inline void
Zero()
{
- unsigned i;
- for (i = 0; i < N_; i++) {
+ for (unsigned i = 0; i < N_; i++) {
clipped_[i] = 0.;
sum_[i] = 0.;
}
}
inline void
+ One()
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ clipped_[i] = 1.;
+ sum_[i] = 1.;
+ }
+ }
+
+ inline void
Print()
{
for (unsigned i = 0; i < N_; i++) {
@@ -74,6 +82,23 @@ struct NgramCounts
cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
}
}
+
+ inline void Resize(unsigned N)
+ {
+ if (N == N_) return;
+ else if (N > N_) {
+ for (unsigned i = N_; i < N; i++) {
+ clipped_[i] = 0.;
+ sum_[i] = 0.;
+ }
+ } else { // N < N_
+ for (unsigned i = N_-1; i > N-1; i--) {
+ clipped_.erase(i);
+ sum_.erase(i);
+ }
+ }
+ N_ = N;
+ }
};
typedef map<vector<WordID>, unsigned> Ngrams;
@@ -152,6 +177,24 @@ struct ApproxBleuScorer : public BleuScorer
score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
};
+struct LinearBleuScorer : public BleuScorer
+{
+ unsigned onebest_len_;
+ NgramCounts onebest_counts_;
+
+ LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N)
+ {
+ onebest_counts_.One();
+ }
+
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+
+ inline void Reset() {
+ onebest_len_ = 1;
+ onebest_counts_.One();
+ }
+};
+
} // namespace