diff options
-rw-r--r-- | training/dtrain/dtrain.cc | 111 | ||||
-rw-r--r-- | training/dtrain/dtrain.h | 15 | ||||
-rw-r--r-- | training/dtrain/update.h | 36 |
3 files changed, 125 insertions, 37 deletions
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 3e9902ab..53e8cd50 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -41,6 +41,13 @@ main(int argc, char** argv) const bool output_updates = output_updates_fn!=""; const string output_raw_fn = conf["output_raw"].as<string>(); const bool output_raw = output_raw_fn!=""; + const bool use_adadelta = conf["adadelta"].as<bool>(); + const weight_t adadelta_decay = conf["adadelta_decay"].as<weight_t>(); + const weight_t adadelta_eta = 0.000001; + const string adadelta_input = conf["adadelta_input"].as<string>(); + const string adadelta_output = conf["adadelta_output"].as<string>(); + const size_t max_input = conf["stop_after"].as<size_t>(); + const bool batch = conf["batch"].as<bool>(); // setup decoder register_feature_functions(); @@ -89,8 +96,8 @@ main(int argc, char** argv) vector<vector<size_t> > buffered_lengths; // (just once) size_t input_sz = 0; - cerr << setprecision(4); // output configuration + cerr << fixed << setprecision(4); cerr << "Parameters:" << endl; cerr << setw(25) << "bitext " << "'" << input_fn << "'" << endl; cerr << setw(25) << "k " << k << endl; @@ -109,10 +116,10 @@ main(int argc, char** argv) cerr << setw(25) << "chiang decay " << chiang_decay << endl; cerr << setw(25) << "N " << N << endl; cerr << setw(25) << "T " << T << endl; - cerr << setw(25) << "learning rate " << eta << endl; + cerr << scientific << setw(25) << "learning rate " << eta << endl; cerr << setw(25) << "margin " << margin << endl; if (!structured) { - cerr << setw(25) << "cut " << round(cut*100) << "%" << endl; + cerr << fixed << setw(25) << "cut " << round(cut*100) << "%" << endl; cerr << setw(25) << "adjust " << adjust_cut << endl; } else { cerr << setw(25) << "struct. obj " << structured << endl; @@ -124,7 +131,7 @@ main(int argc, char** argv) if (noup) cerr << setw(25) << "no up. " << noup << endl; cerr << setw(25) << "average " << average << endl; - cerr << setw(25) << "l1 reg. " << l1_reg << endl; + cerr << scientific << setw(25) << "l1 reg. " << l1_reg << endl; cerr << setw(25) << "decoder conf " << "'" << conf["decoder_conf"].as<string>() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; @@ -133,8 +140,17 @@ main(int argc, char** argv) cerr << setw(25) << "weights in " << "'" << conf["input_weights"].as<string>() << "'" << endl; } + cerr << setw(25) << "batch " << batch << endl; if (noup) cerr << setw(25) << "no updates!" << endl; + if (use_adadelta) { + cerr << setw(25) << "adadelta " << use_adadelta << endl; + cerr << setw(25) << " decay " << adadelta_decay << endl; + if (adadelta_input != "") + cerr << setw(25) << "-input " << adadelta_input << endl; + if (adadelta_output != "") + cerr << setw(25) << "-output " << adadelta_output << endl; + } cerr << "(1 dot per processed input)" << endl; // meta @@ -153,10 +169,23 @@ main(int argc, char** argv) *out_up << setprecision(numeric_limits<double>::digits10+1); } + // adadelta + SparseVector<weight_t> gradient_accum, update_accum; + if (use_adadelta && adadelta_input!="") { + vector<weight_t> grads_tmp; + Weights::InitFromFile(adadelta_input+".gradient", &grads_tmp); + Weights::InitSparseVector(grads_tmp, &gradient_accum); + vector<weight_t> update_tmp; + Weights::InitFromFile(adadelta_input+".update", &update_tmp); + Weights::InitSparseVector(update_tmp, &update_accum); + } for (size_t t = 0; t < T; t++) // T iterations { + // batch update + SparseVector<weight_t> batch_update; + time_t start, end; time(&start); weight_t gold_sum=0., model_sum=0.; @@ -194,6 +223,9 @@ main(int argc, char** argv) next = i<input_sz; } + if (max_input == i) + next = false; + // produce some pretty output if (next) { if (i%20 == 0) @@ -225,7 +257,7 @@ main(int argc, char** argv) list_sz += observer->effective_size; if (output_raw) - output_sample(sample, *out_raw, i); + output_sample(sample, out_raw, i); // update model if (!noup) { @@ -233,21 +265,46 @@ main(int argc, char** argv) SparseVector<weight_t> updates; if (structured) num_up += update_structured(sample, updates, margin, - output_updates, *out_up, i); + out_up, i); else if (all_pairs) num_up += updates_all(sample, updates, max_up, threshold, - output_updates, *out_up, i); + out_up, i); else if (pro) num_up += updates_pro(sample, updates, cut, max_up, threshold, - output_updates, *out_up, i); + out_up, i); else num_up += updates_multipartite(sample, updates, cut, margin, max_up, threshold, adjust_cut, - output_updates, *out_up, i); + out_up, i); + SparseVector<weight_t> lambdas_copy; if (l1_reg) lambdas_copy = lambdas; - lambdas.plus_eq_v_times_s(updates, eta); + + if (use_adadelta) { // adadelta update + SparseVector<weight_t> squared; + for (auto it: updates) + squared[it.first] = pow(it.second, 2.0); + gradient_accum *= adadelta_decay; + squared *= 1.0-adadelta_decay; + gradient_accum += squared; + SparseVector<weight_t> u = gradient_accum + update_accum; + for (auto it: u) + u[it.first] = -1.0*( + sqrt(update_accum[it.first]+adadelta_eta) + / + sqrt(gradient_accum[it.first]+adadelta_eta) + ) * updates[it.first]; + lambdas += u; + update_accum *= adadelta_decay; + for (auto it: u) + u[it.first] = pow(it.second, 2.0); + update_accum = update_accum + (u*(1.0-adadelta_decay)); + } else if (batch) { + batch_update += updates; + } else { // regular update + lambdas.plus_eq_v_times_s(updates, eta); + } // update context for Chiang's approx. BLEU if (score_name == "chiang") { @@ -290,23 +347,47 @@ main(int argc, char** argv) if (t == 0) input_sz = i; // remember size of input (# lines) + // batch + if (batch) { + batch_update /= (weight_t)num_up; + lambdas.plus_eq_v_times_s(batch_update, eta); + lambdas.init_vector(&decoder_weights); + } + // update average if (average) w_average += lambdas; + if (adadelta_output != "") { + WriteFile g(adadelta_output+".gradient.gz"); + for (auto it: gradient_accum) + *g << FD::Convert(it.first) << " " << it.second << endl; + WriteFile u(adadelta_output+".update.gz"); + for (auto it: update_accum) + *u << FD::Convert(it.first) << " " << it.second << endl; + } + // stats weight_t gold_avg = gold_sum/(weight_t)input_sz; - cerr << setiosflags(ios::showpos) << "WEIGHTS" << endl; - for (auto name: print_weights) + cerr << setiosflags(ios::showpos) << scientific << "WEIGHTS" << endl; + for (auto name: print_weights) { cerr << setw(18) << name << " = " - << lambdas.get(FD::Convert(name)) << endl; + << lambdas.get(FD::Convert(name)); + if (use_adadelta) { + weight_t rate = -1.0*(sqrt(update_accum[FD::Convert(name)]+adadelta_eta) + / sqrt(gradient_accum[FD::Convert(name)]+adadelta_eta)); + cerr << " {" << rate << "}"; + } + cerr << endl; + } cerr << " ---" << endl; cerr << resetiosflags(ios::showpos) << " 1best avg score: " << gold_avg*100; - cerr << setiosflags(ios::showpos) << " (" + cerr << setiosflags(ios::showpos) << fixed << " (" << (gold_avg-gold_prev)*100 << ")" << endl; - cerr << " 1best avg model score: " + cerr << scientific << " 1best avg model score: " << model_sum/(weight_t)input_sz << endl; + cerr << fixed; cerr << " avg # updates: "; cerr << resetiosflags(ios::showpos) << num_up/(float)input_sz << endl; cerr << " non-0 feature count: " << lambdas.num_nonzero() << endl; diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h index b07edfdf..ce5b2101 100644 --- a/training/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h @@ -57,11 +57,18 @@ dtrain_init(int argc, "learning rate [only meaningful if margin>0 or input weights are given]") ("l1_reg,r", po::value<weight_t>()->default_value(0.), "l1 regularization strength [see Tsuruoka, Tsujii and Ananiadou (2009)]") + ("adadelta,D", po::bool_switch()->default_value(false), + "use AdaDelta dynamic learning rates") + ("adadelta_decay", po::value<weight_t>()->default_value(0.9), + "decay for AdaDelta algorithm") + ("adadelta_input", po::value<string>()->default_value(""), + "input for AdaDelta's parameters, two files: file.gradient, and file.update") + ("adadelta_output", po::value<string>()->default_value(""), + "prefix for outputting AdaDelta's parameters") ("margin,m", po::value<weight_t>()->default_value(1.0), "margin for margin perceptron [set =0 for standard perceptron]") ("cut,u", po::value<weight_t>()->default_value(0.1), - "use top/bottom 10% (default) of k-best as 'good' and 'bad' for \ -pair sampling, 0 to use all pairs TODO") + "use top/bottom 10% (default) of k-best as 'good' and 'bad' for pair sampling, 0 to use all pairs TODO") ("adjust,A", po::bool_switch()->default_value(false), "adjust cut for optimal pos. in k-best to cut") ("score,s", po::value<string>()->default_value("nakov"), @@ -87,6 +94,8 @@ pair sampling, 0 to use all pairs TODO") ("max_pairs", po::value<size_t>()->default_value(numeric_limits<size_t>::max()), "max. number of updates/pairs") + ("batch,B", po::bool_switch()->default_value(false), + "perform batch updates") ("output,o", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") ("disable_learning,X", po::bool_switch()->default_value(false), @@ -95,6 +104,8 @@ pair sampling, 0 to use all pairs TODO") "output updates (diff. vectors) [to filename]") ("output_raw,R", po::value<string>()->default_value(""), "output raw data (e.g. k-best lists) [to filename]") + ("stop_after", po::value<size_t>()->default_value(numeric_limits<size_t>::max()), + "only look at this number of segments") ("print_weights,P", po::value<string>()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV"), "list of weights to print after each iteration"); po::options_description clopts("Command Line Options"); diff --git a/training/dtrain/update.h b/training/dtrain/update.h index f6aa9842..405a3f76 100644 --- a/training/dtrain/update.h +++ b/training/dtrain/update.h @@ -20,9 +20,8 @@ updates_multipartite(vector<Hyp>* sample, size_t max_up, weight_t threshold, bool adjust, - bool output=false, - ostream& os=cout, - size_t id=0) + WriteFile& output, + size_t id) { size_t up = 0; size_t sz = sample->size(); @@ -50,7 +49,7 @@ updates_multipartite(vector<Hyp>* sample, || (threshold && (first.gold-second.gold < threshold))) continue; if (output) - os << id << "\t" << first.f-second.f << endl; + *output << id << "\t" << first.f-second.f << endl; updates += first.f-second.f; if (++up==max_up) return up; @@ -70,7 +69,7 @@ updates_multipartite(vector<Hyp>* sample, || (threshold && (first.gold-second.gold < threshold))) continue; if (output) - os << id << "\t" << first.f-second.f << endl; + *output << id << "\t" << first.f-second.f << endl; updates += first.f-second.f; if (++up==max_up) break; @@ -91,9 +90,8 @@ updates_all(vector<Hyp>* sample, SparseVector<weight_t>& updates, size_t max_up, weight_t threshold, - bool output=false, - ostream& os=cout, - size_t id=0) + WriteFile output, + size_t id) { size_t up = 0; size_t sz = sample->size(); @@ -108,7 +106,7 @@ updates_all(vector<Hyp>* sample, || (threshold && (first.gold-second.gold < threshold))) continue; if (output) - os << id << "\t" << first.f-second.f << endl; + *output << id << "\t" << first.f-second.f << endl; updates += first.f-second.f; if (++up==max_up) break; @@ -127,9 +125,8 @@ inline size_t update_structured(vector<Hyp>* sample, SparseVector<weight_t>& updates, weight_t margin, - bool output=false, - ostream& os=cout, - size_t id=0) + WriteFile output, + size_t id) { // hope sort(sample->begin(), sample->end(), [](Hyp first, Hyp second) @@ -147,13 +144,13 @@ update_structured(vector<Hyp>* sample, if (hope.gold != fear.gold) { updates += hope.f - fear.f; if (output) - os << id << "\t" << hope.f << "\t" << fear.f << endl; + *output << id << "\t" << hope.f << "\t" << fear.f << endl; return 1; } if (output) - os << endl; + *output << endl; return 0; } @@ -172,9 +169,8 @@ updates_pro(vector<Hyp>* sample, size_t maxs, size_t max_up, weight_t threshold, - bool output=false, - ostream& os=cout, - size_t id=0) + WriteFile& output, + size_t id) { size_t sz = sample->size(), s; @@ -202,7 +198,7 @@ updates_pro(vector<Hyp>* sample, for (auto i: g) { if (output) - os << id << "\t" << i.first->f-i.second->f << endl; + *output << id << "\t" << i.first->f-i.second->f << endl; updates += i.first->f-i.second->f; } @@ -215,7 +211,7 @@ updates_pro(vector<Hyp>* sample, */ inline void output_sample(vector<Hyp>* sample, - ostream& os=cout, + WriteFile& output, size_t id=0, bool sorted=true) { @@ -227,7 +223,7 @@ output_sample(vector<Hyp>* sample, } size_t j = 0; for (auto k: *sample) { - os << id << "\t" << j << "\t" << k.gold << "\t" << k.model + *output << id << "\t" << j << "\t" << k.gold << "\t" << k.model << "\t" << k.f << endl; j++; } |