From 5f55de43762bbab3702edfee5ad3dc9c0a38db01 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 30 Nov 2013 18:03:30 -0500 Subject: fix l1 implementation to ensure greater sparsity --- training/crf/mpi_adagrad_optimize.cc | 58 ++++++++++++++++++++++++++++-------- training/crf/mpi_compute_cllh.cc | 1 + 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/training/crf/mpi_adagrad_optimize.cc b/training/crf/mpi_adagrad_optimize.cc index af963e3a..e1ee789c 100644 --- a/training/crf/mpi_adagrad_optimize.cc +++ b/training/crf/mpi_adagrad_optimize.cc @@ -188,7 +188,7 @@ class AdaGradOptimizer { explicit AdaGradOptimizer(double e) : eta(e), G() {} - void update(const SparseVector& g, vector* x) { + void update(const SparseVector& g, vector* x, SparseVector* sx) { if (x->size() > G.size()) G.resize(x->size(), 0.0); #if HAVE_CXX11 for (auto& gi : g) { @@ -199,6 +199,7 @@ class AdaGradOptimizer { if (gi.second) { G[gi.first] += gi.second * gi.second; (*x)[gi.first] -= eta / sqrt(G[gi.first]) * gi.second; + sx->add_value(gi.first, -eta / sqrt(G[gi.first]) * gi.second); } } } @@ -213,7 +214,7 @@ class AdaGradL1Optimizer { eta(e), lambda(l), G() {} - void update(const SparseVector& g, vector* x) { + void update(const SparseVector& g, vector* x, SparseVector* sx) { t += 1.0; if (x->size() > G.size()) { G.resize(x->size(), 0.0); @@ -228,13 +229,37 @@ class AdaGradL1Optimizer { if (gi.second) { u[gi.first] += gi.second; G[gi.first] += gi.second * gi.second; - double z = fabs(u[gi.first] / t) - lambda; - double s = 1; - if (u[gi.first] > 0) s = -1; - if (z > 0 && G[gi.first]) - (*x)[gi.first] = eta * s * z * t / sqrt(G[gi.first]); - else - (*x)[gi.first] = 0.0; + sx->set_value(gi.first, 1.0); // this is a dummy value to trigger recomputation + } + } + + // compute updates (avoid invalidating iterators by putting them all + // in the vector vupdate and applying them after this) + vector> vupdate; +#if HAVE_CXX11 + for (auto& xi : *sx) { +#else + for (SparseVector::const_iterator it = sx->begin(); it != sx->end(); ++it) { + const pair& gi = *it; +#endif + double z = fabs(u[xi.first] / t) - lambda; + double s = 1; + if (u[xi.first] > 0) s = -1; + if (z > 0 && G[xi.first]) { + vupdate.push_back(make_pair(xi.first, eta * s * z * t / sqrt(G[xi.first]))); + } else { + vupdate.push_back(make_pair(xi.first, 0.0)); + } + } + + // apply updates + for (unsigned i = 0; i < vupdate.size(); ++i) { + if (vupdate[i].second) { + sx->set_value(vupdate[i].first, vupdate[i].second); + (*x)[vupdate[i].first] = vupdate[i].second; + } else { + (*x)[vupdate[i].first] = 0.0; + sx->erase(vupdate[i].first); } } } @@ -323,6 +348,8 @@ int main(int argc, char** argv) { lambdas.swap(init_weights); init_weights.clear(); } + SparseVector lambdas_sparse; + Weights::InitSparseVector(lambdas, &lambdas_sparse); //AdaGradOptimizer adagrad(conf["eta"].as()); AdaGradL1Optimizer adagrad(conf["eta"].as(), conf["regularization_strength"].as()); @@ -338,6 +365,13 @@ int main(int argc, char** argv) { mpi::timer timer; #endif ++iter; + if (iter > 1) { + lambdas_sparse.init_vector(&lambdas); + if (rank == 0) { + Weights::SanityCheck(lambdas); + Weights::ShowLargestFeatures(lambdas); + } + } observer.Reset(); if (rank == 0) { converged = (iter == max_iteration); @@ -376,12 +410,10 @@ int main(int argc, char** argv) { if (rank == 0) { g /= minibatch_size; lambdas.resize(FD::NumFeats(), 0.0); // might have seen new features - adagrad.update(g, &lambdas); - Weights::SanityCheck(lambdas); - Weights::ShowLargestFeatures(lambdas); + adagrad.update(g, &lambdas, &lambdas_sparse); } #ifdef HAVE_MPI - broadcast(world, lambdas, 0); + broadcast(world, lambdas_sparse, 0); broadcast(world, converged, 0); world.barrier(); if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; } diff --git a/training/crf/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc index 066389d0..7e38da3a 100644 --- a/training/crf/mpi_compute_cllh.cc +++ b/training/crf/mpi_compute_cllh.cc @@ -120,6 +120,7 @@ int main(int argc, char** argv) { reduce(world, observer.trg_words, total_words, std::plus(), 0); #else objective = observer.acc_obj; + total_words = observer.trg_words; #endif if (rank == 0) { -- cgit v1.2.3