From 54bcfb835232d190a5ab6f0bd825de8a50dae126 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 29 Feb 2012 01:12:40 -0500 Subject: cleanup, mpi-ify lblmodel --- utils/corpus_tools.cc | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 utils/corpus_tools.cc (limited to 'utils/corpus_tools.cc') diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc new file mode 100644 index 00000000..a0542b6e --- /dev/null +++ b/utils/corpus_tools.cc @@ -0,0 +1,62 @@ +#include "corpus_tools.h" + +#include + +#include "tdict.h" +#include "filelib.h" +#include "verbose.h" + +using namespace std; + +void CorpusTools::ReadFromFile(const string& filename, + vector >* src, + set* src_vocab, + vector >* trg, + set* trg_vocab, + int rank, + int size) { + assert(rank >= 0); + assert(size > 0); + assert(rank < size); + if (src) src->clear(); + if (src_vocab) src_vocab->clear(); + if (trg) trg->clear(); + if (trg_vocab) trg_vocab->clear(); + const int expected_fields = 1 + (trg == NULL ? 0 : 1); + if (!SILENT) cerr << "Reading from " << filename << " ...\n"; + ReadFile rf(filename); + istream& in = *rf.stream(); + string line; + int lc = 0; + static const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(getline(in, line)) { + const bool skip = (lc % size != rank); + ++lc; + if (skip) continue; + TD::ConvertSentence(line, &tmp); + src->push_back(vector()); + vector* d = &src->back(); + set* v = src_vocab; + int s = 0; + for (unsigned i = 0; i < tmp.size(); ++i) { + if (tmp[i] == kDIV) { + ++s; + if (s > 1) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); } + assert(trg); + trg->push_back(vector()); + d = &trg->back(); + v = trg_vocab; + } else { + d->push_back(tmp[i]); + if (v) v->insert(tmp[i]); + } + } + ++s; + if (expected_fields != s) { + cerr << "Wrong number of fields in line " << lc << ": " << line << endl; abort(); + } + } +} + + -- cgit v1.2.3 From a872f46ce1212703b8bed562c894ea1a932c0746 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 29 Feb 2012 07:00:49 +0000 Subject: mpi fixes --- training/lbl_model.cc | 54 +++++++++++++++++++++++++++++++++++++++++---------- utils/corpus_tools.cc | 16 +++++++++------ 2 files changed, 54 insertions(+), 16 deletions(-) (limited to 'utils/corpus_tools.cc') diff --git a/training/lbl_model.cc b/training/lbl_model.cc index def5075a..a46ce33c 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -15,6 +15,7 @@ #ifdef HAVE_MPI #include #include +#include namespace mpi = boost::mpi; #endif #include @@ -34,12 +35,26 @@ namespace mpi = boost::mpi; namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 100 +#define kDIMENSIONS 10 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; vector r_src, r_trg; +#if HAVE_MPI +namespace boost { +namespace serialization { + +template +void serialize(Archive & ar, RVector & v, const unsigned int version) { + for (unsigned i = 0; i < kDIMENSIONS; ++i) + ar & v[i]; +} + +} // namespace serialization +} // namespace boost +#endif + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -224,7 +239,7 @@ int main(int argc, char** argv) { srand(seed); } - TMatrix t; + TMatrix t = TMatrix::Zero(); if (rank == 0) { t = TMatrix::Random() / 50.0; for (unsigned i = 1; i < r_trg.size(); ++i) { @@ -241,16 +256,18 @@ int main(int argc, char** argv) { TMatrix g = TMatrix::Zero(); vector exp_src; vector z_src; - vector flat_g, flat_t; + vector flat_g, flat_t, rcv_grad; Flatten(t, &flat_t); bool converged = false; - // TODO broadcast embeddings - for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { -#ifdef HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); +#if HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); + mpi::broadcast(world, r_trg, 0); + mpi::broadcast(world, r_src, 0); #endif + cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; + for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { + if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; Unflatten(flat_t, &t); - cerr << "ITERATION " << (iter + 1) << endl; double likelihood = 0; double denom = 0.0; lc = 0; @@ -350,7 +367,22 @@ int main(int argc, char** argv) { if (!SGD) { Flatten(g, &flat_g); obj = -likelihood; - // TODO - reduce gradient +#if HAVE_MPI + rcv_grad.resize(flat_g.size(), 0.0); + mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus(), 0); + swap(flat_g, rcv_grad); + rcv_grad.clear(); + + double to = 0; + mpi::reduce(world, obj, to, plus(), 0); + obj = to; + double tlh = 0; + mpi::reduce(world, likelihood, tlh, plus(), 0); + likelihood = tlh; + double td = 0; + mpi::reduce(world, denom, td, plus(), 0); + denom = td; +#endif } if (rank == 0) { @@ -376,10 +408,12 @@ int main(int argc, char** argv) { } } #ifdef HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); mpi::broadcast(world, converged, 0); #endif } - cerr << "TRANSLATION MATRIX:" << endl << t << endl; + if (rank == 0) + cerr << "TRANSLATION MATRIX:" << endl << t << endl; return 0; } diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc index a0542b6e..d17785af 100644 --- a/utils/corpus_tools.cc +++ b/utils/corpus_tools.cc @@ -33,10 +33,12 @@ void CorpusTools::ReadFromFile(const string& filename, while(getline(in, line)) { const bool skip = (lc % size != rank); ++lc; - if (skip) continue; TD::ConvertSentence(line, &tmp); - src->push_back(vector()); - vector* d = &src->back(); + vector* d = NULL; + if (!skip) { + src->push_back(vector()); + d = &src->back(); + } set* v = src_vocab; int s = 0; for (unsigned i = 0; i < tmp.size(); ++i) { @@ -44,11 +46,13 @@ void CorpusTools::ReadFromFile(const string& filename, ++s; if (s > 1) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); } assert(trg); - trg->push_back(vector()); - d = &trg->back(); + if (!skip) { + trg->push_back(vector()); + d = &trg->back(); + } v = trg_vocab; } else { - d->push_back(tmp[i]); + if (d) d->push_back(tmp[i]); if (v) v->insert(tmp[i]); } } -- cgit v1.2.3