diff options
author | Chris Dyer <prguest11@taipan.cs> | 2012-02-22 16:10:56 +0000 |
---|---|---|
committer | Chris Dyer <prguest11@taipan.cs> | 2012-02-22 16:10:56 +0000 |
commit | dd16e83d4a593392465ee317c43ffc2c490add2e (patch) | |
tree | 93784ca8fa33c2d84785ca43079f3ffb116d952f | |
parent | c0e9dc2889b6beb039c5365ebd0af6486b7ec574 (diff) |
add regularization
-rw-r--r-- | training/lbl_model.cc | 50 |
1 files changed, 41 insertions, 9 deletions
diff --git a/training/lbl_model.cc b/training/lbl_model.cc index eb3e194d..a114bba7 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -12,6 +12,7 @@ #include <cstring> // memset #include <ctime> +#include <boost/math/special_functions/fpclassify.hpp> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> #include <Eigen/Dense> @@ -27,7 +28,7 @@ namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 8 +#define kDIMENSIONS 110 typedef Eigen::Matrix<float, kDIMENSIONS, 1> RVector; typedef Eigen::Matrix<float, 1, kDIMENSIONS> RTVector; typedef Eigen::Matrix<float, kDIMENSIONS, kDIMENSIONS> TMatrix; @@ -38,8 +39,9 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input,i",po::value<string>(),"Input file") ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") + ("regularization_strength,C",po::value<float>()->default_value(0.1),"L2 regularization strength (0 for no regularization)") ("eta,e", po::value<float>()->default_value(0.1f), "Eta for SGD") - ("random_seed", po::value<unsigned>(), "Random seed") + ("random_seed,s", po::value<unsigned>(), "Random seed") ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); @@ -67,6 +69,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { void Normalize(RVector* v) { float norm = v->norm(); + assert(norm > 0.0f); *v /= norm; } @@ -74,21 +77,42 @@ void Flatten(const TMatrix& m, vector<double>* v) { unsigned c = 0; v->resize(kDIMENSIONS * kDIMENSIONS); for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) + for (unsigned j = 0; j < kDIMENSIONS; ++j) { + assert(boost::math::isnormal(m(i, j))); (*v)[c++] = m(i,j); + } } void Unflatten(const vector<double>& v, TMatrix* m) { unsigned c = 0; for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) + for (unsigned j = 0; j < kDIMENSIONS; ++j) { + assert(boost::math::isnormal(v[c])); (*m)(i, j) = v[c++]; + } +} + +double ApplyRegularization(const double C, + const vector<double>& weights, + vector<double>* g) { + assert(weights.size() == g->size()); + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { + const double& w_i = weights[i]; + double& g_i = (*g)[i]; + reg += C * w_i * w_i; + g_i += 2 * C * w_i; + } + return reg; } int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; const string fname = conf["input"].as<string>(); + const float reg_strength = conf["regularization_strength"].as<float>(); + const bool has_l2 = reg_strength; + assert(reg_strength >= 0.0f); const int ITERATIONS = conf["iterations"].as<unsigned>(); const float eta = conf["eta"].as<float>(); const double diagonal_tension = conf["diagonal_tension"].as<double>(); @@ -147,7 +171,7 @@ int main(int argc, char** argv) { cerr << "Random seed: " << seed << endl; srand(seed); } - TMatrix t = TMatrix::Random() / 1024.0; + TMatrix t = TMatrix::Random() / 50.0; for (unsigned i = 1; i < r_trg.size(); ++i) { r_trg[i] = RVector::Random(); r_src[i] = RVector::Random(); @@ -159,7 +183,7 @@ int main(int argc, char** argv) { vector<set<unsigned> > trg_pos(TD::NumWords() + 1); // do optimization - TMatrix g; + TMatrix g = TMatrix::Zero(); vector<TMatrix> exp_src; vector<double> z_src; vector<double> flat_g, flat_t; @@ -265,11 +289,19 @@ int main(int argc, char** argv) { const double base2_likelihood = likelihood / log(2); cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; + cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; if (!SGD) { Flatten(g, &flat_g); - lbfgs.Optimize(-likelihood, flat_g, &flat_t); + double obj = -likelihood; + if (has_l2) { + const double r = ApplyRegularization(reg_strength, + flat_t, + &flat_g); + obj += r; + cerr << " regularization: " << r << endl; + } + lbfgs.Optimize(obj, flat_g, &flat_t); Unflatten(flat_t, &t); if (lbfgs.HasConverged()) break; } |