From 288263494237aab8b7628b22b03f9b70ac93fb56 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 16 Oct 2015 10:27:13 +0200
Subject: dtrain
---
training/dtrain/README.md | 18 +++++
training/dtrain/dtrain.cc | 166 +++++++++++++++++++++++++++++++---------------
training/dtrain/dtrain.h | 110 ++++++++++++++++++------------
3 files changed, 197 insertions(+), 97 deletions(-)
(limited to 'training')
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
index 73a6a5a5..dc473568 100644
--- a/training/dtrain/README.md
+++ b/training/dtrain/README.md
@@ -16,6 +16,24 @@ Running
-------
Download runnable examples for all use cases from [1] and extract here.
+TODO
+----
+ * "stop_after" stop after X inputs
+ * "select_weights" average, best, last
+ * "rescale" rescale weight vector
+ * implement SVM objective?
+ * other variants of l1 regularization?
+ * l2 regularization?
+ * l1/l2 regularization?
+ * scale updates by bleu difference
+ * AdaGrad, per-coordinate learning rates
+ * batch update
+ * "repeat" iterate over k-best lists
+ * show k-best loss improvement
+ * "quiet"
+ * "verbose"
+ * fix output
+
Legal
-----
Copyright (c) 2012-2015 by Patrick Simianer
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index b39fff3e..e563f541 100644
--- a/training/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
@@ -13,45 +13,65 @@ main(int argc, char** argv)
if (!dtrain_init(argc, argv, &conf))
return 1;
const size_t k = conf["k"].as();
+ const bool unique_kbest = conf["unique_kbest"].as();
+ const bool forest_sample = conf["forest_sample"].as();
const string score_name = conf["score"].as();
+ const weight_t nakov_fix = conf["nakov_fix"].as();
+ const weight_t chiang_decay = conf["chiang_decay"].as();
const size_t N = conf["N"].as();
const size_t T = conf["iterations"].as();
const weight_t eta = conf["learning_rate"].as();
const weight_t margin = conf["margin"].as();
+ const weight_t cut = conf["cut"].as();
+ const bool adjust_cut = conf["adjust"].as();
+ const bool all_pairs = cut==0;
const bool average = conf["average"].as();
- const bool structured = conf["struct"].as();
+ const bool pro = conf["pro_sampling"].as();
+ const bool structured = conf["structured"].as();
+ const weight_t threshold = conf["threshold"].as();
+ const size_t max_up = conf["max_pairs"].as();
const weight_t l1_reg = conf["l1_reg"].as();
const bool keep = conf["keep"].as();
const bool noup = conf["disable_learning"].as();
const string output_fn = conf["output"].as();
- const string output_data_which = conf["output_data"].as();
- const bool output_data = output_data_which!="";
vector print_weights;
boost::split(print_weights, conf["print_weights"].as(),
boost::is_any_of(" "));
+ const string output_updates_fn = conf["output_updates"].as();
+ const bool output_updates = output_updates_fn!="";
+ const string output_raw_fn = conf["output_raw"].as();
+ const bool output_raw = output_raw_fn!="";
- // setup decoder and scorer
+ // setup decoder
register_feature_functions();
SetSilent(true);
ReadFile f(conf["decoder_conf"].as());
Decoder decoder(f.stream());
+
+ // setup scorer & observer
Scorer* scorer;
if (score_name == "nakov") {
- scorer = static_cast(new PerSentenceBleuScorer(N));
+ scorer = static_cast(new NakovBleuScorer(N, nakov_fix));
} else if (score_name == "papineni") {
- scorer = static_cast(new BleuScorer(N));
+ scorer = static_cast(new PapineniBleuScorer(N));
} else if (score_name == "lin") {
- scorer = static_cast\
- (new OriginalPerSentenceBleuScorer(N));
+ scorer = static_cast(new LinBleuScorer(N));
} else if (score_name == "liang") {
- scorer = static_cast\
- (new SmoothPerSentenceBleuScorer(N));
+ scorer = static_cast(new LiangBleuScorer(N));
} else if (score_name == "chiang") {
- scorer = static_cast(new ApproxBleuScorer(N));
+ scorer = static_cast(new ChiangBleuScorer(N));
+ } else if (score_name == "sum") {
+ scorer = static_cast(new SumBleuScorer(N));
} else {
assert(false);
}
- ScoredKbest* observer = new ScoredKbest(k, scorer);
+ HypSampler* observer;
+ if (forest_sample)
+ observer = new KSampler(k, scorer);
+ else if (unique_kbest)
+ observer = new KBestSampler(k, scorer);
+ else
+ observer = new KBestNoFilterSampler(k, scorer);
// weights
vector& decoder_weights = decoder.CurrentWeightVector();
@@ -65,22 +85,46 @@ main(int argc, char** argv)
string input_fn = conf["bitext"].as();
ReadFile input(input_fn);
vector buf; // decoder only accepts strings as input
- vector > buf_ngs; // compute ngrams and lengths of references
- vector > buf_ls; // just once
+ vector > buffered_ngrams; // compute ngrams and lengths of references
+ vector > buffered_lengths; // (just once)
size_t input_sz = 0;
- cerr << _p4;
+ cerr << setprecision(4);
// output configuration
cerr << "Parameters:" << endl;
cerr << setw(25) << "bitext " << "'" << input_fn << "'" << endl;
cerr << setw(25) << "k " << k << endl;
+ if (unique_kbest && !forest_sample)
+ cerr << setw(25) << "unique k-best " << unique_kbest << endl;
+ if (forest_sample)
+ cerr << setw(25) << "forest " << forest_sample << endl;
+ if (all_pairs)
+ cerr << setw(25) << "all pairs " << all_pairs << endl;
+ else if (pro)
+ cerr << setw(25) << "PRO " << pro << endl;
cerr << setw(25) << "score " << "'" << score_name << "'" << endl;
+ if (score_name == "nakov")
+ cerr << setw(25) << "nakov fix " << nakov_fix << endl;
+ if (score_name == "chiang")
+ cerr << setw(25) << "chiang decay " << chiang_decay << endl;
cerr << setw(25) << "N " << N << endl;
cerr << setw(25) << "T " << T << endl;
cerr << setw(25) << "learning rate " << eta << endl;
cerr << setw(25) << "margin " << margin << endl;
+ if (!structured) {
+ cerr << setw(25) << "cut " << cut << endl;
+ cerr << setw(25) << "adjust " << adjust_cut << endl;
+ } else {
+ cerr << setw(25) << "struct. obj " << structured << endl;
+ }
+ if (threshold > 0)
+ cerr << setw(25) << "threshold " << threshold << endl;
+ if (max_up != numeric_limits::max())
+ cerr << setw(25) << "max up. " << max_up << endl;
+ if (noup)
+ cerr << setw(25) << "no up. " << noup << endl;
cerr << setw(25) << "average " << average << endl;
- cerr << setw(25) << "l1 reg " << l1_reg << endl;
+ cerr << setw(25) << "l1 reg. " << l1_reg << endl;
cerr << setw(25) << "decoder conf " << "'"
<< conf["decoder_conf"].as() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
@@ -89,6 +133,8 @@ main(int argc, char** argv)
cerr << setw(25) << "weights in " << "'"
<< conf["input_weights"].as() << "'" << endl;
}
+ if (noup)
+ cerr << setw(25) << "no updates!" << endl;
cerr << "(1 dot per processed input)" << endl;
// meta
@@ -96,6 +142,13 @@ main(int argc, char** argv)
size_t best_iteration = 0;
time_t total_time = 0.;
+ // output
+ WriteFile raw_out;
+ if (output_raw) raw_out.Init(output_raw_fn);
+ WriteFile updates_out;
+ if (output_updates) updates_out.Init(output_raw_fn);
+
+
for (size_t t = 0; t < T; t++) // T iterations
{
@@ -120,16 +173,16 @@ main(int argc, char** argv)
boost::algorithm::split_regex(parts, in, boost::regex(" \\|\\|\\| "));
buf.push_back(parts[0]);
parts.erase(parts.begin());
- buf_ngs.push_back({});
- buf_ls.push_back({});
+ buffered_ngrams.push_back({});
+ buffered_lengths.push_back({});
for (auto s: parts) {
vector r;
vector toks;
boost::split(toks, s, boost::is_any_of(" "));
for (auto tok: toks)
r.push_back(TD::Convert(tok));
- buf_ngs.back().emplace_back(MakeNgrams(r, N));
- buf_ls.back().push_back(r.size());
+ buffered_ngrams.back().emplace_back(ngrams(r, N));
+ buffered_lengths.back().push_back(r.size());
}
}
} else {
@@ -155,50 +208,54 @@ main(int argc, char** argv)
// decode
if (t > 0 || i > 0)
lambdas.init_vector(&decoder_weights);
- observer->SetReference(buf_ngs[i], buf_ls[i]);
+ observer->reference_ngrams = &buffered_ngrams[i];
+ observer->reference_lengths = &buffered_lengths[i];
decoder.Decode(buf[i], observer);
- vector* samples = observer->GetSamples();
-
- // stats for 1best
- gold_sum += samples->front().gold;
- model_sum += samples->front().model;
- feature_count += observer->GetFeatureCount();
- list_sz += observer->GetSize();
-
- if (output_data) {
- if (output_data_which == "kbest") {
- OutputKbest(samples);
- } else if (output_data_which == "default") {
- OutputMultipartitePairs(samples, margin);
- } else if (output_data_which == "all") {
- OutputAllPairs(samples);
- }
- }
+ vector* sample = &(observer->sample);
+
+ // stats for 1-best
+ gold_sum += sample->front().gold;
+ model_sum += sample->front().model;
+ feature_count += observer->feature_count;
+ list_sz += observer->effective_size;
+
+ if (output_raw)
+ output_sample(sample);
- // get pairs and update
+ // update model
if (!noup) {
SparseVector updates;
if (structured)
- num_up += CollectUpdatesStruct(samples, updates);
+ num_up += update_structured(sample, updates, margin/*,
+ output_updates, updates_out.get()*/); // FIXME
+ else if (all_pairs)
+ num_up += updates_all(sample, updates, max_up, threshold/*,
+ output_updates, updates_out.get()*/); // FIXME
+ else if (pro)
+ num_up += updates_pro(sample, updates, cut, max_up, threshold/*,
+ output_updates, updates_out.get()*/); // FIXME
else
- num_up += CollectUpdates(samples, updates, margin);
+ num_up += updates_multipartite(sample, updates, cut, margin,
+ max_up, threshold, adjust_cut/*,
+ output_updates, updates_out.get()*/); // FIXME
SparseVector lambdas_copy;
if (l1_reg)
lambdas_copy = lambdas;
lambdas.plus_eq_v_times_s(updates, eta);
- // update context for approx. BLEU
+ // update context for Chiang's approx. BLEU
if (score_name == "chiang") {
- for (auto it: *samples) {
+ for (auto it: *sample) {
if (it.rank == 0) {
- scorer->UpdateContext(it.w, buf_ngs[i], buf_ls[i], 0.9);
+ scorer->update_context(it.w, buffered_ngrams[i],
+ buffered_lengths[i], chiang_decay);
break;
}
}
}
- // l1 regularization
+ // \ell_1 regularization
// NB: regularization is done after each sentence,
// not after every single pair!
if (l1_reg) {
@@ -234,19 +291,22 @@ main(int argc, char** argv)
// stats
weight_t gold_avg = gold_sum/(weight_t)input_sz;
- cerr << _p << "WEIGHTS" << endl;
+ cerr << setiosflags(ios::showpos) << "WEIGHTS" << endl;
for (auto name: print_weights)
- cerr << setw(18) << name << " = " << lambdas.get(FD::Convert(name)) << endl;
+ cerr << setw(18) << name << " = "
+ << lambdas.get(FD::Convert(name)) << endl;
cerr << " ---" << endl;
- cerr << _np << " 1best avg score: " << gold_avg*100;
- cerr << _p << " (" << (gold_avg-gold_prev)*100 << ")" << endl;
+ cerr << resetiosflags(ios::showpos)
+ << " 1best avg score: " << gold_avg*100;
+ cerr << setiosflags(ios::showpos) << " ("
+ << (gold_avg-gold_prev)*100 << ")" << endl;
cerr << " 1best avg model score: "
<< model_sum/(weight_t)input_sz << endl;
cerr << " avg # updates: ";
- cerr << _np << num_up/(float)input_sz << endl;
- cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl;
- cerr << " avg f count: " << feature_count/(float)list_sz << endl;
- cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
+ cerr << resetiosflags(ios::showpos) << num_up/(float)input_sz << endl;
+ cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl;
+ cerr << " avg f count: " << feature_count/(float)list_sz << endl;
+ cerr << " avg list sz: " << list_sz/(float)input_sz << endl;
if (gold_avg > best) {
best = gold_avg;
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 0bbb5c9b..18a7dbdc 100644
--- a/training/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
@@ -22,59 +22,90 @@ namespace po = boost::program_options;
namespace dtrain
{
-struct ScoredHyp
+struct Hyp
{
+ Hyp() {}
+ Hyp(vector w, SparseVector f, weight_t model, weight_t gold,
+ size_t rank) : w(w), f(f), model(model), gold(gold), rank(rank) {}
+
vector w;
SparseVector f;
weight_t model, gold;
size_t rank;
};
-inline void
-PrintWordIDVec(vector& v, ostream& os=cerr)
-{
- for (size_t i = 0; i < v.size(); i++) {
- os << TD::Convert(v[i]);
- if (i < v.size()-1) os << " ";
- }
-}
-
-inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
-inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
-inline ostream& _p4(ostream& out) { return out << setprecision(4); }
-
bool
-dtrain_init(int argc, char** argv, po::variables_map* conf)
+dtrain_init(int argc,
+ char** argv,
+ po::variables_map* conf)
{
po::options_description opts("Configuration File Options");
opts.add_options()
- ("bitext,b", po::value(), "bitext")
- ("decoder_conf,C", po::value(), "configuration file for decoder")
- ("iterations,T", po::value()->default_value(15), "number of iterations T (per shard)")
- ("k", po::value()->default_value(100), "size of kbest list")
- ("learning_rate,l", po::value()->default_value(0.00001), "learning rate")
- ("l1_reg,r", po::value()->default_value(0.), "l1 regularization strength")
- ("margin,m", po::value()->default_value(1.0), "margin for margin perceptron")
- ("score,s", po::value()->default_value("chiang"), "per-sentence BLEU approx.")
- ("N", po::value()->default_value(4), "N for BLEU approximation")
- ("input_weights,w", po::value(), "input weights file")
- ("average,a", po::bool_switch()->default_value(true), "output average weights")
- ("keep,K", po::bool_switch()->default_value(false), "output a weight file per iteration")
- ("struct,S", po::bool_switch()->default_value(false), "structured SGD with hope/fear")
- ("output,o", po::value()->default_value("-"), "output weights file, '-' for STDOUT")
- ("disable_learning,X", po::bool_switch()->default_value(false), "disable learning")
- ("output_data,D", po::value()->default_value(""), "output data to STDOUT; arg. is 'kbest', 'default' or 'all'")
+ ("bitext,b", po::value(),
+ "bitext, source and references in a single file [e ||| f]")
+ ("decoder_conf,C", po::value(),
+ "decoder configuration file")
+ ("iterations,T", po::value()->default_value(15),
+ "number of iterations T")
+ ("k", po::value()->default_value(100),
+ "sample size per input (e.g. size of k-best lists)")
+ ("unique_kbest", po::bool_switch()->default_value(true),
+ "unique k-best lists")
+ ("forest_sample", po::bool_switch()->default_value(false),
+ "sample k hyptheses from forest instead of using k-best list")
+ ("learning_rate,l", po::value()->default_value(0.00001),
+ "learning rate [only meaningful if margin>0 or input weights are given]")
+ ("l1_reg,r", po::value()->default_value(0.),
+ "l1 regularization strength [see Tsuruoka, Tsujii and Ananiadou (2009)]")
+ ("margin,m", po::value()->default_value(1.0),
+ "margin for margin perceptron [set =0 for standard perceptron]")
+ ("cut,u", po::value()->default_value(0.1),
+ "use top/bottom 10% (default) of k-best as 'good' and 'bad' for \
+pair sampling, 0 to use all pairs TODO")
+ ("adjust,A", po::bool_switch()->default_value(false),
+ "adjust cut for optimal pos. in k-best to cut")
+ ("score,s", po::value()->default_value("chiang"),
+ "per-sentence BLEU (approx.)")
+ ("nakov_fix", po::value()->default_value(1.0),
+ "add to reference length [see score.h]")
+ ("chiang_decay", po::value()->default_value(0.9),
+ "decaying factor for Chiang's approx. BLEU")
+ ("N", po::value()->default_value(4),
+ "N for BLEU approximation")
+ ("input_weights,w", po::value(),
+ "weights to initialize model")
+ ("average,a", po::bool_switch()->default_value(true),
+ "output average weights")
+ ("keep,K", po::bool_switch()->default_value(false),
+ "output a weight file per iteration [as weights.T.gz]")
+ ("structured,S", po::bool_switch()->default_value(false),
+ "structured prediction objective [hope/fear] w/ SGD")
+ ("pro_sampling", po::bool_switch()->default_value(false),
+ "updates from pairs selected as shown in Fig.4 of (Hopkins and May, 2011) [Gamma=max_pairs (default 5000), Xi=cut (default 50); threshold default 0.05]")
+ ("threshold", po::value()->default_value(0.),
+ "(min.) threshold in terms of gold score for pair selection")
+ ("max_pairs",
+ po::value()->default_value(numeric_limits::max()),
+ "max. number of updates/pairs")
+ ("output,o", po::value()->default_value("-"),
+ "output weights file, '-' for STDOUT")
+ ("disable_learning,X", po::bool_switch()->default_value(false),
+ "fix model")
+ ("output_updates,U", po::value()->default_value(""),
+ "output updates (diff. vectors) [to filename]")
+ ("output_raw,R", po::value()->default_value(""),
+ "output raw data (e.g. k-best lists) [to filename]")
("print_weights,P", po::value()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"),
- "list of weights to print after each iteration");
+ "list of weights to print after each iteration");
po::options_description clopts("Command Line Options");
clopts.add_options()
("conf,c", po::value(), "dtrain configuration file")
("help,h", po::bool_switch(), "display options");
opts.add(clopts);
po::store(parse_command_line(argc, argv, opts), *conf);
- cerr << "dtrain" << endl << endl;
+ cerr << "*dtrain*" << endl << endl;
if ((*conf)["help"].as()) {
- cerr << opts << endl;
+ cerr << setprecision(3) << opts << endl;
return false;
}
@@ -90,20 +121,11 @@ dtrain_init(int argc, char** argv, po::variables_map* conf)
return false;
}
if (!conf->count("bitext")) {
- cerr << "No input given." << endl;
+ cerr << "No input bitext." << endl;
cerr << opts << endl;
return false;
}
- if ((*conf)["output_data"].as() != "") {
- if ((*conf)["output_data"].as() != "kbest" &&
- (*conf)["output_data"].as() != "default" &&
- (*conf)["output_data"].as() != "all") {
- cerr << "Wrong 'output_data' argument: ";
- cerr << (*conf)["output_data"].as() << endl;
- return false;
- }
- }
return true;
}
--
cgit v1.2.3