From d5b5e9e31ca3f222ba6cfe5e788a14a087c0b66d Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 27 Jan 2016 17:26:24 +0100 Subject: dtrain_net_interface: support for per coordinate learning rates --- training/dtrain/dtrain_net_interface.cc | 141 +++++++++++++++++++++++++------- training/dtrain/dtrain_net_interface.h | 45 ++++++++-- 2 files changed, 151 insertions(+), 35 deletions(-) (limited to 'training') diff --git a/training/dtrain/dtrain_net_interface.cc b/training/dtrain/dtrain_net_interface.cc index 77ccde55..340b9a86 100644 --- a/training/dtrain/dtrain_net_interface.cc +++ b/training/dtrain/dtrain_net_interface.cc @@ -18,8 +18,6 @@ main(int argc, char** argv) exit(1); // something is wrong const size_t k = conf["k"].as(); const size_t N = conf["N"].as(); - weight_t eta = conf["learning_rate"].as(); - weight_t eta_sparse = conf["learning_rate_sparse"].as(); const weight_t margin = conf["margin"].as(); const string master_addr = conf["addr"].as(); const string output_fn = conf["output"].as(); @@ -46,18 +44,37 @@ main(int argc, char** argv) Weights::InitSparseVector(decoder_weights, &original_lambdas); } + // learning rates + SparseVector learning_rates, original_learning_rates; + weight_t learning_rate_R, original_learning_rate_R; + weight_t learning_rate_RB, original_learning_rate_RB; + weight_t learning_rate_Shape, original_learning_rate_Shape; + vector l; + Weights::InitFromFile(conf["learning_rates"].as(), &l); + Weights::InitSparseVector(l, &learning_rates); + original_learning_rates = learning_rates; + learning_rate_R = conf["learning_rate_R"].as(); + original_learning_rate_R = learning_rate_R; + learning_rate_RB = conf["learning_rate_RB"].as(); + original_learning_rate_RB = learning_rate_RB; + learning_rate_Shape = conf["learning_rate_Shape"].as(); + original_learning_rate_Shape = learning_rate_Shape; + cerr << _p4; // output configuration cerr << "dtrain_net_interface" << endl << "Parameters:" << endl; cerr << setw(25) << "k " << k << endl; cerr << setw(25) << "N " << N << endl; - cerr << setw(25) << "eta " << eta << endl; - cerr << setw(25) << "eta (sparse) " << eta_sparse << endl; cerr << setw(25) << "margin " << margin << endl; cerr << setw(25) << "decoder conf " << "'" << conf["decoder_conf"].as() << "'" << endl; cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; - cerr << setw(25) << "debug " << "'" << debug_fn << "'" << endl; + cerr << setw(25) << "debug " << "'" << debug_fn << "'" << endl; + cerr << setw(25) << "learning rates " << "'" + << conf["learning_rates"].as() << "'" << endl; + cerr << setw(25) << "learning rate R " << learning_rate_R << endl; + cerr << setw(25) << "learning rate RB " << learning_rate_RB << endl; + cerr << setw(25) << "learning rate Shape " << learning_rate_Shape << endl; // setup socket nn::socket sock(AF_SP, NN_PAIR); @@ -89,23 +106,62 @@ main(int argc, char** argv) const string in(buf, buf+sz); nn::freemsg(buf); cerr << "[dtrain] got input '" << in << "'" << endl; - if (boost::starts_with(in, "set_learning_rate")) { // set learning rate + if (boost::starts_with(in, "set_learning_rates")) { // set learning rates stringstream ss(in); - string x; weight_t w; - ss >> x; ss >> w; - cerr << "[dtrain] setting (dense) learning rate to " << w << " (was: " << eta << ")" << endl; - eta = w; + string _,name; weight_t w; + ss >> _; ss >> name; ss >> w; + weight_t before = 0; + ostringstream long_name; + if (name == "R") { + before = learning_rate_R; + learning_rate_R = w; + long_name << "rule id feature group"; + } else if (name == "RB") { + before = learning_rate_RB; + learning_rate_RB = w; + long_name << "rule bigram feature group"; + } else if (name == "Shape") { + before = learning_rate_Shape; + learning_rate_Shape = w; + long_name << "rule shape feature group"; + } else { + unsigned fid = FD::Convert(name); + before = learning_rates[fid]; + learning_rates[fid] = w; + long_name << "feature '" << name << "'"; + } + ostringstream o; + o << "set learning rate for " << long_name.str() << " to " << w + << " (was: " << before << ")" << endl; + string s = o.str(); + cerr << "[dtrain] " << s; + cerr << "[dtrain] done, looping again" << endl; + sock.send(s.c_str(), s.size()+1, 0); + continue; + } else if (boost::starts_with(in, "reset_learning_rates")) { + cerr << "[dtrain] resetting learning rates" << endl; + learning_rates = original_learning_rates; + learning_rate_R = original_learning_rate_R; + learning_rate_RB = original_learning_rate_RB; + learning_rate_Shape = original_learning_rate_Shape; cerr << "[dtrain] done, looping again" << endl; sock.send(done.c_str(), done.size()+1, 0); continue; - } else if (boost::starts_with(in, "set_sparse_learning_rate")) { // set sparse learning rate + } else if (boost::starts_with(in, "set_weights")) { // set learning rates stringstream ss(in); - string x; weight_t w; - ss >> x; ss >> w; - cerr << "[dtrain] setting sparse learning rate to " << w << " (was: " << eta_sparse << ")" << endl; - eta_sparse = w; + string _,name; weight_t w; + ss >> _; ss >> name; ss >> w; + weight_t before = 0; + ostringstream o; + unsigned fid = FD::Convert(name); + before = lambdas[fid]; + lambdas[fid] = w; + o << "set weight for feature '" << name << "'" + << "' to " << w << " (was: " << before << ")" << endl; + string s = o.str(); + cerr << "[dtrain] " << s; cerr << "[dtrain] done, looping again" << endl; - sock.send(done.c_str(), done.size()+1, 0); + sock.send(s.c_str(), s.size()+1, 0); continue; } else if (boost::starts_with(in, "reset_weights")) { // reset weights cerr << "[dtrain] resetting weights" << endl; @@ -143,7 +199,9 @@ main(int argc, char** argv) cerr << "[dtrain] learning ..." << endl; source = parts[0]; // debug -- - debug_output << "\"source\":\"" << source.substr(source.find_first_of(">")+2, source.find_last_of(">")-6) << "\"," << endl; + debug_output << "\"source\":\"" + << source.substr(source.find_first_of(">")+2, source.find_last_of(">")-6) + << "\"," << endl; debug_output << "\"target\":\"" << parts[1] << "\"," << endl; // -- debug parts.erase(parts.begin()); @@ -176,7 +234,8 @@ main(int argc, char** argv) debug_output << "\"kbest\":[" << endl; size_t h = 0; for (auto s: *samples) { - debug_output << "\"" << s.gold << " ||| " << s.model << " ||| " << s.rank << " ||| "; + debug_output << "\"" << s.gold << " ||| " + << s.model << " ||| " << s.rank << " ||| "; for (auto o: s.f) debug_output << FD::Convert(o.first) << "=" << o.second << " "; debug_output << " ||| "; @@ -191,37 +250,59 @@ main(int argc, char** argv) debug_output << "]," << endl; debug_output << "\"samples_size\":" << samples->size() << "," << endl; debug_output << "\"weights_before\":{" << endl; - weightsToJson(lambdas, debug_output); + sparseVectorToJson(lambdas, debug_output); debug_output << "}," << endl; // -- debug - // get pairs and update + // get pairs SparseVector updates; size_t num_up = CollectUpdates(samples, updates, margin); + + // debug -- debug_output << "\"1best_features\":\"" << (*samples)[0].f << "\"," << endl; debug_output << "\"update_raw\":\"" << updates << "\"," << endl; - updates *= eta_sparse; // apply learning rate for sparse features - for (auto feat: dense_features) { // apply learning rate for dense features - updates[FD::Convert(feat)] /= eta_sparse; - updates[FD::Convert(feat)] *= eta; + // -- debug + + // update + for (auto it: updates) { + string fname = FD::Convert(it.first); + unsigned k = it.first; + weight_t v = it.second; + if (learning_rates.find(it.first) != learning_rates.end()) { + updates[k] = learning_rates[k]*v; + } else { + if (boost::starts_with(fname, "R:")) { + updates[k] = learning_rate_R*v; + } else if (boost::starts_with(fname, "RBS:") || + boost::starts_with(fname, "RBT:")) { + updates[k] = learning_rate_RB*v; + } else if (boost::starts_with(fname, "Shape_")) { + updates[k] = learning_rate_Shape*v; + } + } } - debug_output << "\"update\":\"" << updates << "\"," << endl; + lambdas.plus_eq_v_times_s(updates, 1.0); + i++; + // debug -- + debug_output << "\"update\":\"" << updates << "\"," << endl; debug_output << "\"num_up\":" << num_up << "," << endl; debug_output << "\"updated_features\":" << updates.size() << "," << endl; - debug_output << "\"learning_rate\":" << eta << "," << endl; - debug_output << "\"learning_rate_sparse\":" << eta_sparse << "," << endl; + debug_output << "\"learning_rate_R\":" << learning_rate_R << "," << endl; + debug_output << "\"learning_rate_RB\":" << learning_rate_R << "," << endl; + debug_output << "\"learning_rate_Shape\":" << learning_rate_R << "," << endl; + debug_output << "\"learning_rates\":{" << endl; + sparseVectorToJson(learning_rates, debug_output); + debug_output << "}," << endl; debug_output << "\"best_match\":\""; PrintWordIDVec((*samples)[0].w, debug_output); debug_output << "\"," << endl; debug_output << "\"best_match_score\":" << (*samples)[0].gold << "," << endl ; // -- debug - lambdas.plus_eq_v_times_s(updates, 1.0); - i++; // debug -- debug_output << "\"weights_after\":{" << endl; - weightsToJson(lambdas, debug_output); + sparseVectorToJson(lambdas, debug_output); debug_output << "}" << endl; debug_output << "}" << endl; // -- debug diff --git a/training/dtrain/dtrain_net_interface.h b/training/dtrain/dtrain_net_interface.h index b201c7a3..720c4c9b 100644 --- a/training/dtrain/dtrain_net_interface.h +++ b/training/dtrain/dtrain_net_interface.h @@ -6,13 +6,42 @@ namespace dtrain { +/* + * source: http://stackoverflow.com/questions/7724448/\ + simple-json-string-escape-for-c/33799784#33799784 + * + */ +inline string +escapeJson(const string& s) { + ostringstream o; + for (auto c = s.cbegin(); c != s.cend(); c++) { + switch (*c) { + case '"': o << "\\\""; break; + case '\\': o << "\\\\"; break; + case '\b': o << "\\b"; break; + case '\f': o << "\\f"; break; + case '\n': o << "\\n"; break; + case '\r': o << "\\r"; break; + case '\t': o << "\\t"; break; + default: + if ('\x00' <= *c && *c <= '\x1f') { + o << "\\u" + << std::hex << std::setw(4) << std::setfill('0') << (int)*c; + } else { + o << *c; + } + } + } + return o.str(); +} + inline void -weightsToJson(SparseVector& w, ostringstream& os) +sparseVectorToJson(SparseVector& w, ostringstream& os) { vector strs; for (typename SparseVector::iterator it=w.begin(),e=w.end(); it!=e; ++it) { ostringstream a; - a << "\"" << FD::Convert(it->first) << "\":" << it->second; + a << "\"" << escapeJson(FD::Convert(it->first)) << "\":" << it->second; strs.push_back(a.str()); } for (vector::const_iterator it=strs.begin(); it!=strs.end(); it++) { @@ -62,10 +91,12 @@ dtrain_net_init(int argc, char** argv, po::variables_map* conf) ("margin,m", po::value()->default_value(0.), "margin for margin perceptron") ("output,o", po::value()->default_value(""), "final weights file") ("input_weights,w", po::value(), "input weights file") - ("learning_rate,l", po::value()->default_value(0.001), "learning rate") - ("learning_rate_sparse,l", po::value()->default_value(0.00001), "learning rate for sparse features") + ("learning_rates,l", po::value(), "pre-defined learning rates per feature") + ("learning_rate_R", po::value(), "learning rate for rule id features") + ("learning_rate_RB", po::value(), "learning rate for rule bigram features") + ("learning_rate_Shape", po::value(), "learning rate for shape features") ("output_derivation,E", po::bool_switch()->default_value(false), "output derivation, not viterbi str") - ("output_rules,R", po::bool_switch()->default_value(false), "also output rules") + ("output_rules,R", po::bool_switch()->default_value(false), "also output rules") ("dense_features,D", po::value()->default_value("EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV Shape_S01111_T11011 Shape_S11110_T11011 Shape_S11100_T11000 Shape_S01110_T01110 Shape_S01111_T01111 Shape_S01100_T11000 Shape_S10000_T10000 Shape_S11100_T11100 Shape_S11110_T11110 Shape_S11110_T11010 Shape_S01100_T11100 Shape_S01000_T01000 Shape_S01010_T01010 Shape_S01111_T01011 Shape_S01100_T01100 Shape_S01110_T11010 Shape_S11000_T11000 Shape_S11000_T01100 IsSupportedOnline NewRule KnownRule OOVFix"), "dense features") ("debug_output,d", po::value()->default_value(""), "file for debug output"); @@ -84,6 +115,10 @@ dtrain_net_init(int argc, char** argv, po::variables_map* conf) cerr << "Missing decoder configuration. Exiting." << endl; return false; } + if (!conf->count("learning_rates")) { + cerr << "Missing learning rates. Exiting." << endl; + return false; + } if (!conf->count("addr")) { cerr << "No master address given! Exiting." << endl; return false; -- cgit v1.2.3