author	Chris Dyer <cdyer@cs.cmu.edu>	2012-02-29 01:12:40 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-02-29 01:12:40 -0500
commit	54bcfb835232d190a5ab6f0bd825de8a50dae126 (patch)
tree	161988135be75a47524cdf2efbba7bdf06c9bd65 /training
parent	89238977fc9d8f8d9a6421b0d4f35afc200f08e7 (diff)
cleanup, mpi-ify lblmodel
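
The patch follows the usual rank-0-owns-the-optimizer layout: each rank reads its shard of the corpus via CorpusTools::ReadFromFile(fname, ..., rank, size), rank 0 holds the LBFGS state and the parameter matrix, and flat_t is broadcast at the top of every iteration. The gradient reduction is still marked "TODO - reduce gradient" in the diff below; what follows is a minimal sketch of the full synchronization step, assuming Boost.MPI (the SyncStep name and shape are illustrative, not part of this commit):

    #include <functional>
    #include <vector>
    #ifdef HAVE_MPI
    #include <boost/mpi.hpp>
    namespace mpi = boost::mpi;
    #endif

    // One synchronization step of the data-parallel loop (illustrative).
    void SyncStep(std::vector<double>* flat_t,  // parameters, owned by rank 0
                  std::vector<double>* flat_g,  // this rank's gradient shard
                  bool* converged) {
    #ifdef HAVE_MPI
      mpi::communicator world;
      // 1. push rank 0's current parameters to every worker
      mpi::broadcast(world, &(*flat_t)[0], flat_t->size(), 0);
      // ... each rank accumulates its shard's gradient into *flat_g ...
      // 2. sum the per-shard gradients onto rank 0 (the step the
      //    patch leaves as "TODO - reduce gradient")
      std::vector<double> g_sum(flat_g->size(), 0.0);
      mpi::reduce(world, &(*flat_g)[0], flat_g->size(),
                  &g_sum[0], std::plus<double>(), 0);
      if (world.rank() == 0) flat_g->swap(g_sum);
      // 3. rank 0 runs the LBFGS update, then tells everyone whether to stop
      mpi::broadcast(world, *converged, 0);
    #endif
    }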
Diffstat (limited to 'training')
-rw-r--r--	training/lbl_model.cc	179
1 file changed, 97 insertions(+), 82 deletions(-)
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
index 2af848b5..def5075a 100644
--- a/training/lbl_model.cc
+++ b/training/lbl_model.cc
@@ -12,11 +12,17 @@
#include <cstring> // memset
#include <ctime>
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
#include <Eigen/Dense>
+#include "corpus_tools.h"
#include "optimize.h"
#include "array2d.h"
#include "m.h"
@@ -29,9 +35,9 @@ namespace po = boost::program_options;
using namespace std;
#define kDIMENSIONS 100
-typedef Eigen::Matrix<float, kDIMENSIONS, 1> RVector;
-typedef Eigen::Matrix<float, 1, kDIMENSIONS> RTVector;
-typedef Eigen::Matrix<float, kDIMENSIONS, kDIMENSIONS> TMatrix;
+typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector;
+typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector;
+typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix;
vector<RVector> r_src, r_trg;
bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -39,8 +45,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
opts.add_options()
("input,i",po::value<string>(),"Input file")
("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
- ("regularization_strength,C",po::value<float>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
- ("eta", po::value<float>()->default_value(0.1f), "Eta for SGD")
+ ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
+ ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD")
("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)")
("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)")
("random_seed,s", po::value<unsigned>(), "Random seed")
@@ -70,7 +76,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
void Normalize(RVector* v) {
- float norm = v->norm();
+ double norm = v->norm();
assert(norm > 0.0f);
*v /= norm;
}
@@ -80,7 +86,7 @@ void Flatten(const TMatrix& m, vector<double>* v) {
v->resize(kDIMENSIONS * kDIMENSIONS);
for (unsigned i = 0; i < kDIMENSIONS; ++i)
for (unsigned j = 0; j < kDIMENSIONS; ++j) {
- assert(boost::math::isnormal(m(i, j)));
+ assert(boost::math::isfinite(m(i, j)));
(*v)[c++] = m(i,j);
}
}
@@ -89,7 +95,7 @@ void Unflatten(const vector<double>& v, TMatrix* m) {
unsigned c = 0;
for (unsigned i = 0; i < kDIMENSIONS; ++i)
for (unsigned j = 0; j < kDIMENSIONS; ++j) {
- assert(boost::math::isnormal(v[c]));
+ assert(boost::math::isfinite(v[c]));
(*m)(i, j) = v[c++];
}
}
@@ -162,14 +168,25 @@ void LoadEmbeddings(const string& filename, vector<RVector>* pv) {
}
int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ std::cerr << "**MPI enabled.\n";
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ std::cerr << "**MPI disabled.\n";
+ const int rank = 0;
+ const int size = 1;
+#endif
po::variables_map conf;
if (!InitCommandLine(argc, argv, &conf)) return 1;
const string fname = conf["input"].as<string>();
- const float reg_strength = conf["regularization_strength"].as<float>();
+ const double reg_strength = conf["regularization_strength"].as<double>();
const bool has_l2 = reg_strength;
assert(reg_strength >= 0.0f);
const int ITERATIONS = conf["iterations"].as<unsigned>();
- const float eta = conf["eta"].as<float>();
+ const double eta = conf["eta"].as<double>();
const double diagonal_tension = conf["diagonal_tension"].as<double>();
bool SGD = false;
if (diagonal_tension < 0.0) {
@@ -181,61 +198,44 @@ int main(int argc, char** argv) {
unsigned lc = 0;
vector<double> unnormed_a_i;
- string line;
- string ssrc, strg;
bool flag = false;
- Lattice src, trg;
+ vector<vector<WordID> > srcs, trgs;
vector<WordID> vocab_e;
- { // read through corpus, initialize int map, check lines are good
- set<WordID> svocab_e;
- cerr << "INITIAL READ OF " << fname << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- while(getline(in, line)) {
- ++lc;
- if (lc % 1000 == 0) { cerr << '.'; flag = true; }
- if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
- ParseTranslatorInput(line, &ssrc, &strg);
- LatticeTools::ConvertTextToLattice(ssrc, &src);
- LatticeTools::ConvertTextToLattice(strg, &trg);
- if (src.size() == 0 || trg.size() == 0) {
- cerr << "Error: " << lc << "\n" << line << endl;
- assert(src.size() > 0);
- assert(trg.size() > 0);
- }
- if (src.size() > unnormed_a_i.size())
- unnormed_a_i.resize(src.size());
- for (unsigned i = 0; i < trg.size(); ++i) {
- assert(trg[i].size() == 1);
- svocab_e.insert(trg[i][0].label);
- }
- }
+ {
+ set<WordID> svocab_e, svocab_f;
+ CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size);
copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
}
- if (flag) cerr << endl;
cerr << "Number of target word types: " << vocab_e.size() << endl;
- const float num_examples = lc;
+ const double num_examples = lc;
- LBFGSOptimizer lbfgs(kDIMENSIONS * kDIMENSIONS, 100);
+ boost::shared_ptr<LBFGSOptimizer> lbfgs;
+ if (rank == 0)
+ lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100));
r_trg.resize(TD::NumWords() + 1);
r_src.resize(TD::NumWords() + 1);
+ vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
+
if (conf.count("random_seed")) {
srand(conf["random_seed"].as<unsigned>());
} else {
- unsigned seed = time(NULL);
+ unsigned seed = time(NULL) + rank * 100;
cerr << "Random seed: " << seed << endl;
srand(seed);
}
- TMatrix t = TMatrix::Random() / 50.0;
- for (unsigned i = 1; i < r_trg.size(); ++i) {
- r_trg[i] = RVector::Random();
- r_src[i] = RVector::Random();
+
+ TMatrix t;
+ if (rank == 0) {
+ t = TMatrix::Random() / 50.0;
+ for (unsigned i = 1; i < r_trg.size(); ++i) {
+ r_trg[i] = RVector::Random();
+ r_src[i] = RVector::Random();
+ }
+ if (conf.count("source_embeddings"))
+ LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
+ if (conf.count("target_embeddings"))
+ LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
}
- if (conf.count("source_embeddings"))
- LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
- if (conf.count("target_embeddings"))
- LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
- vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
// do optimization
TMatrix g = TMatrix::Zero();
@@ -243,22 +243,25 @@ int main(int argc, char** argv) {
vector<double> z_src;
vector<double> flat_g, flat_t;
Flatten(t, &flat_t);
- for (int iter = 0; iter < ITERATIONS; ++iter) {
+ bool converged = false;
+ // TODO broadcast embeddings
+ for (int iter = 0; !converged && iter < ITERATIONS; ++iter) {
+#ifdef HAVE_MPI
+ mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
+#endif
+ Unflatten(flat_t, &t);
cerr << "ITERATION " << (iter + 1) << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
double likelihood = 0;
double denom = 0.0;
lc = 0;
flag = false;
g *= 0;
- while(getline(in, line)) {
+ for (unsigned i = 0; i < srcs.size(); ++i) {
+ const vector<WordID>& src = srcs[i];
+ const vector<WordID>& trg = trgs[i];
++lc;
- if (lc % 1000 == 0) { cerr << '.'; flag = true; }
- if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
- ParseTranslatorInput(line, &ssrc, &strg);
- LatticeTools::ConvertTextToLattice(ssrc, &src);
- LatticeTools::ConvertTextToLattice(strg, &trg);
+ if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; }
+ if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
denom += trg.size();
exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
@@ -266,10 +269,10 @@ int main(int argc, char** argv) {
Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
Array2D<double> z_refs(src.size(), trg.size(), 0.0);
for (unsigned j = 0; j < trg.size(); ++j)
- trg_pos[trg[j][0].label].insert(j);
+ trg_pos[trg[j]].insert(j);
for (unsigned i = 0; i < src.size(); ++i) {
- const RVector& r_s = r_src[src[i][0].label];
+ const RVector& r_s = r_src[src[i]];
const RTVector pred = r_s.transpose() * t;
TMatrix& exp_m = exp_src[i];
double& z = z_src[i];
@@ -293,7 +296,7 @@ int main(int argc, char** argv) {
}
}
for (unsigned j = 0; j < trg.size(); ++j)
- trg_pos[trg[j][0].label].clear();
+ trg_pos[trg[j]].clear();
// model expectations for a single target generation with
// uniform alignment prior
@@ -323,8 +326,8 @@ int main(int argc, char** argv) {
// TODO handle alignment prob
}
if (ref_z <= 0) {
- cerr << "TRG=" << TD::Convert(trg[j][0].label) << endl;
- cerr << " LINE=" << line << endl;
+ cerr << "TRG=" << TD::Convert(trg[j]) << endl;
+ cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl;
cerr << " REF_EXP=\n" << ref_exp << endl;
cerr << " M_EXP=\n" << m_exp << endl;
abort();
@@ -339,30 +342,42 @@ int main(int argc, char** argv) {
}
}
- if (iter == (ITERATIONS - 1) || lc == 28) { cerr << al << endl; }
+ if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; }
}
- if (flag) { cerr << endl; }
+ if (flag && rank == 0) { cerr << endl; }
- const double base2_likelihood = likelihood / log(2);
- cerr << " log_e likelihood: " << likelihood << endl;
- cerr << " log_2 likelihood: " << base2_likelihood << endl;
- cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
- cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
+ double obj = 0;
if (!SGD) {
Flatten(g, &flat_g);
- double obj = -likelihood;
- if (has_l2) {
- const double r = ApplyRegularization(reg_strength,
- flat_t,
- &flat_g);
- obj += r;
- cerr << " regularization: " << r << endl;
+ obj = -likelihood;
+ // TODO - reduce gradient
+ }
+
+ if (rank == 0) {
+ double gn = 0;
+ for (unsigned i = 0; i < flat_g.size(); ++i)
+ gn += flat_g[i]*flat_g[i];
+ const double base2_likelihood = likelihood / log(2);
+ cerr << " log_e likelihood: " << likelihood << endl;
+ cerr << " log_2 likelihood: " << base2_likelihood << endl;
+ cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
+ cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
+ cerr << " gradient norm: " << sqrt(gn) << endl;
+ if (!SGD) {
+ if (has_l2) {
+ const double r = ApplyRegularization(reg_strength,
+ flat_t,
+ &flat_g);
+ obj += r;
+ cerr << " regularization: " << r << endl;
+ }
+ lbfgs->Optimize(obj, flat_g, &flat_t);
+ converged = (lbfgs->HasConverged());
}
- lbfgs.Optimize(obj, flat_g, &flat_t);
- Unflatten(flat_t, &t);
- if (lbfgs.HasConverged()) break;
}
- cerr << t << endl;
+#ifdef HAVE_MPI
+ mpi::broadcast(world, converged, 0);
+#endif
}
cerr << "TRANSLATION MATRIX:" << endl << t << endl;
return 0;
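
Assuming the cdec build produces a training/lbl_model binary (the exact target name may vary), a typical data-parallel invocation of the patched trainer would look like the following; the corpus path is illustrative, while -i, -I, and -C are the input, iterations, and regularization_strength options declared above:

    mpirun -np 8 training/lbl_model -i corpus.fr-en -I 100 -C 0.1

Because ReadFromFile receives rank and size, each rank presumably loads only its own slice of the corpus, so no manual sharding of the input file is needed.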