author     Chris Dyer <prguest11@taipan.cs>    2012-02-22 16:10:56 +0000
committer  Chris Dyer <prguest11@taipan.cs>    2012-02-22 16:10:56 +0000
commit     dd16e83d4a593392465ee317c43ffc2c490add2e (patch)
tree       93784ca8fa33c2d84785ca43079f3ffb116d952f /training/lbl_model.cc
parent     c0e9dc2889b6beb039c5365ebd0af6486b7ec574 (diff)
add regularization
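
In sketch form, the hunks below add a standard L2 (ridge) penalty to the batch objective handed to L-BFGS. Writing w for the flattened entries of the translation matrix t and C for the new regularization_strength flag, the minimized objective and its gradient become

\[
\mathcal{L}(w) = -\log p(\text{data} \mid w) + C \sum_i w_i^2,
\qquad
\frac{\partial \mathcal{L}}{\partial w_i} = -\frac{\partial \log p(\text{data} \mid w)}{\partial w_i} + 2\,C\,w_i ,
\]

which is exactly what the new ApplyRegularization helper accumulates before each call to lbfgs.Optimize.
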
Diffstat (limited to 'training/lbl_model.cc')
-rw-r--r--  training/lbl_model.cc | 50
1 file changed, 41 insertions(+), 9 deletions(-)
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
index eb3e194d..a114bba7 100644
--- a/training/lbl_model.cc
+++ b/training/lbl_model.cc
@@ -12,6 +12,7 @@
#include <cstring> // memset
#include <ctime>
+#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
#include <Eigen/Dense>
@@ -27,7 +28,7 @@
namespace po = boost::program_options;
using namespace std;
-#define kDIMENSIONS 8
+#define kDIMENSIONS 110
typedef Eigen::Matrix<float, kDIMENSIONS, 1> RVector;
typedef Eigen::Matrix<float, 1, kDIMENSIONS> RTVector;
typedef Eigen::Matrix<float, kDIMENSIONS, kDIMENSIONS> TMatrix;
@@ -38,8 +39,9 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("input,i",po::value<string>(),"Input file")
("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
+ ("regularization_strength,C",po::value<float>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
("eta,e", po::value<float>()->default_value(0.1f), "Eta for SGD")
- ("random_seed", po::value<unsigned>(), "Random seed")
+ ("random_seed,s", po::value<unsigned>(), "Random seed")
("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
po::options_description clo("Command line options");
@@ -67,6 +69,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
void Normalize(RVector* v) {
float norm = v->norm();
+ assert(norm > 0.0f);
*v /= norm;
}
@@ -74,21 +77,42 @@ void Flatten(const TMatrix& m, vector<double>* v) {
unsigned c = 0;
v->resize(kDIMENSIONS * kDIMENSIONS);
for (unsigned i = 0; i < kDIMENSIONS; ++i)
- for (unsigned j = 0; j < kDIMENSIONS; ++j)
+ for (unsigned j = 0; j < kDIMENSIONS; ++j) {
+ assert(boost::math::isnormal(m(i, j)));
(*v)[c++] = m(i,j);
+ }
}
void Unflatten(const vector<double>& v, TMatrix* m) {
unsigned c = 0;
for (unsigned i = 0; i < kDIMENSIONS; ++i)
- for (unsigned j = 0; j < kDIMENSIONS; ++j)
+ for (unsigned j = 0; j < kDIMENSIONS; ++j) {
+ assert(boost::math::isnormal(v[c]));
(*m)(i, j) = v[c++];
+ }
+}
+
+double ApplyRegularization(const double C,
+ const vector<double>& weights,
+ vector<double>* g) {
+ assert(weights.size() == g->size());
+ double reg = 0;
+ for (size_t i = 0; i < weights.size(); ++i) {
+ const double& w_i = weights[i];
+ double& g_i = (*g)[i];
+ reg += C * w_i * w_i;
+ g_i += 2 * C * w_i;
+ }
+ return reg;
}
int main(int argc, char** argv) {
po::variables_map conf;
if (!InitCommandLine(argc, argv, &conf)) return 1;
const string fname = conf["input"].as<string>();
+ const float reg_strength = conf["regularization_strength"].as<float>();
+ const bool has_l2 = (reg_strength > 0.0f);
+ assert(reg_strength >= 0.0f);
const int ITERATIONS = conf["iterations"].as<unsigned>();
const float eta = conf["eta"].as<float>();
const double diagonal_tension = conf["diagonal_tension"].as<double>();
@@ -147,7 +171,7 @@ int main(int argc, char** argv) {
cerr << "Random seed: " << seed << endl;
srand(seed);
}
- TMatrix t = TMatrix::Random() / 1024.0;
+ TMatrix t = TMatrix::Random() / 50.0;
for (unsigned i = 1; i < r_trg.size(); ++i) {
r_trg[i] = RVector::Random();
r_src[i] = RVector::Random();
@@ -159,7 +183,7 @@ int main(int argc, char** argv) {
vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
// do optimization
- TMatrix g;
+ TMatrix g = TMatrix::Zero();
vector<TMatrix> exp_src;
vector<double> z_src;
vector<double> flat_g, flat_t;
@@ -265,11 +289,19 @@ int main(int argc, char** argv) {
const double base2_likelihood = likelihood / log(2);
cerr << " log_e likelihood: " << likelihood << endl;
cerr << " log_2 likelihood: " << base2_likelihood << endl;
- cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
- cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
+ cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
+ cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
if (!SGD) {
Flatten(g, &flat_g);
- lbfgs.Optimize(-likelihood, flat_g, &flat_t);
+ double obj = -likelihood;
+ if (has_l2) {
+ const double r = ApplyRegularization(reg_strength,
+ flat_t,
+ &flat_g);
+ obj += r;
+ cerr << " regularization: " << r << endl;
+ }
+ lbfgs.Optimize(obj, flat_g, &flat_t);
Unflatten(flat_t, &t);
if (lbfgs.HasConverged()) break;
}
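
For reference, a minimal standalone sketch exercising the new helper: ApplyRegularization is copied verbatim from the hunk above, while the driver (the sample weights and the finite-difference check) is illustrative scaffolding, not part of the commit. It verifies that the analytic gradient 2*C*w_i matches a central-difference estimate of the penalty C*||w||^2.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>
using namespace std;

// Copied from the diff above: adds C * ||w||^2 to the objective and
// 2 * C * w_i to each gradient entry, returning the penalty value.
double ApplyRegularization(const double C,
                           const vector<double>& weights,
                           vector<double>* g) {
  assert(weights.size() == g->size());
  double reg = 0;
  for (size_t i = 0; i < weights.size(); ++i) {
    const double& w_i = weights[i];
    double& g_i = (*g)[i];
    reg += C * w_i * w_i;
    g_i += 2 * C * w_i;
  }
  return reg;
}

int main() {
  const double C = 0.1;             // same default as --regularization_strength
  vector<double> w = {0.5, -1.25, 2.0};  // illustrative weights
  vector<double> g(w.size(), 0.0);  // pretend the likelihood gradient is zero
  const double reg = ApplyRegularization(C, w, &g);

  // Central finite differences: d/dw_i [C * sum_j w_j^2] should equal g[i].
  const double eps = 1e-6;
  for (size_t i = 0; i < w.size(); ++i) {
    vector<double> wp = w, wm = w;
    wp[i] += eps;
    wm[i] -= eps;
    double fp = 0, fm = 0;
    for (size_t j = 0; j < w.size(); ++j) {
      fp += C * wp[j] * wp[j];
      fm += C * wm[j] * wm[j];
    }
    const double numeric = (fp - fm) / (2 * eps);
    assert(fabs(numeric - g[i]) < 1e-4);  // analytic gradient must match
  }
  printf("penalty = %g, gradient check passed\n", reg);
  return 0;
}

Because the trainer minimizes -likelihood, the penalty value r is simply added to obj and its gradient folded into flat_g before lbfgs.Optimize runs, as the final hunk shows.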