#include #include "config.h" #ifndef HAVE_EIGEN int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } #else #include #include #include #include #include // memset #include #ifdef HAVE_MPI #include #include #include namespace mpi = boost::mpi; #endif #include #include #include #include #include "corpus_tools.h" #include "optimize.h" #include "array2d.h" #include "m.h" #include "lattice.h" #include "stringlib.h" #include "filelib.h" #include "tdict.h" namespace po = boost::program_options; using namespace std; #define kDIMENSIONS 10 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; vector r_src, r_trg; #if HAVE_MPI namespace boost { namespace serialization { template void serialize(Archive & ar, RVector & v, const unsigned int version) { for (unsigned i = 0; i < kDIMENSIONS; ++i) ar & v[i]; } } // namespace serialization } // namespace boost #endif bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("input,i",po::value(),"Input file") ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") ("regularization_strength,C",po::value()->default_value(0.1),"L2 regularization strength (0 for no regularization)") ("eta", po::value()->default_value(0.1f), "Eta for SGD") ("source_embeddings,f", po::value(), "File containing source embeddings (if unset, random vectors will be used)") ("target_embeddings,e", po::value(), "File containing target embeddings (if unset, random vectors will be used)") ("random_seed,s", po::value(), "Random seed") ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value(), "Configuration file") ("help,h", "Print this help message and exit"); po::options_description dconfig_options, dcmdline_options; dconfig_options.add(opts); dcmdline_options.add(opts).add(clo); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); if (conf->count("config")) { ifstream config((*conf)["config"].as().c_str()); po::store(po::parse_config_file(config, dconfig_options), *conf); } po::notify(*conf); if (argc < 2 || conf->count("help")) { cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; cerr << dcmdline_options << endl; return false; } return true; } void Normalize(RVector* v) { double norm = v->norm(); assert(norm > 0.0f); *v /= norm; } void Flatten(const TMatrix& m, vector* v) { unsigned c = 0; v->resize(kDIMENSIONS * kDIMENSIONS); for (unsigned i = 0; i < kDIMENSIONS; ++i) for (unsigned j = 0; j < kDIMENSIONS; ++j) { assert(boost::math::isfinite(m(i, j))); (*v)[c++] = m(i,j); } } void Unflatten(const vector& v, TMatrix* m) { unsigned c = 0; for (unsigned i = 0; i < kDIMENSIONS; ++i) for (unsigned j = 0; j < kDIMENSIONS; ++j) { assert(boost::math::isfinite(v[c])); (*m)(i, j) = v[c++]; } } double ApplyRegularization(const double C, const vector& weights, vector* g) { assert(weights.size() == g->size()); double reg = 0; for (size_t i = 0; i < weights.size(); ++i) { const double& w_i = weights[i]; double& g_i = (*g)[i]; reg += C * w_i * w_i; g_i += 2 * C * w_i; } return reg; } void LoadEmbeddings(const string& filename, vector* pv) { vector& v = *pv; cerr << "Reading embeddings from " << filename << " ...\n"; ReadFile rf(filename); istream& in = *rf.stream(); string line; unsigned lc = 0; while(getline(in, line)) { ++lc; size_t cur = line.find(' '); if (cur == string::npos || cur == 0) { cerr << "Parse error reading line " << lc << ":\n" << line << endl; abort(); } WordID w = TD::Convert(line.substr(0, cur)); if (w >= v.size()) continue; RVector& curv = v[w]; line[cur] = 0; size_t start = cur + 1; cur = start + 1; size_t c = 0; while(cur < line.size()) { if (line[cur] == ' ') { line[cur] = 0; curv[c++] = strtod(&line[start], NULL); start = cur + 1; cur = start; if (c == kDIMENSIONS) break; } ++cur; } if (c < kDIMENSIONS && cur != start) { if (cur < line.size()) line[cur] = 0; curv[c++] = strtod(&line[start], NULL); } if (c != kDIMENSIONS) { static bool first = true; if (first) { cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; first = false; } for (; c < kDIMENSIONS; ++c) curv[c] = rand(); } if (c == kDIMENSIONS && cur != line.size()) { static bool first = true; if (first) { cerr << " embedding file contains more dimensions than configured with, truncating.\n"; first = false; } } } } int main(int argc, char** argv) { #ifdef HAVE_MPI std::cerr << "**MPI enabled.\n"; mpi::environment env(argc, argv); mpi::communicator world; const int size = world.size(); const int rank = world.rank(); #else std::cerr << "**MPI disabled.\n"; const int rank = 0; const int size = 1; #endif po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; const string fname = conf["input"].as(); const double reg_strength = conf["regularization_strength"].as(); const bool has_l2 = reg_strength; assert(reg_strength >= 0.0f); const int ITERATIONS = conf["iterations"].as(); const double eta = conf["eta"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); bool SGD = false; if (diagonal_tension < 0.0) { cerr << "Invalid value for diagonal_tension: must be >= 0\n"; return 1; } string testset; if (conf.count("testset")) testset = conf["testset"].as(); unsigned lc = 0; vector unnormed_a_i; bool flag = false; vector > srcs, trgs; vector vocab_e; { set svocab_e, svocab_f; CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); } cerr << "Number of target word types: " << vocab_e.size() << endl; const double num_examples = lc; boost::shared_ptr lbfgs; if (rank == 0) lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); r_trg.resize(TD::NumWords() + 1); r_src.resize(TD::NumWords() + 1); vector > trg_pos(TD::NumWords() + 1); if (conf.count("random_seed")) { srand(conf["random_seed"].as()); } else { unsigned seed = time(NULL) + rank * 100; cerr << "Random seed: " << seed << endl; srand(seed); } TMatrix t = TMatrix::Zero(); if (rank == 0) { t = TMatrix::Random() / 50.0; for (unsigned i = 1; i < r_trg.size(); ++i) { r_trg[i] = RVector::Random(); r_src[i] = RVector::Random(); } if (conf.count("source_embeddings")) LoadEmbeddings(conf["source_embeddings"].as(), &r_src); if (conf.count("target_embeddings")) LoadEmbeddings(conf["target_embeddings"].as(), &r_trg); } // do optimization TMatrix g = TMatrix::Zero(); vector exp_src; vector z_src; vector flat_g, flat_t, rcv_grad; Flatten(t, &flat_t); bool converged = false; #if HAVE_MPI mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); mpi::broadcast(world, r_trg, 0); mpi::broadcast(world, r_src, 0); #endif cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; Unflatten(flat_t, &t); double likelihood = 0; double denom = 0.0; lc = 0; flag = false; g *= 0; for (unsigned i = 0; i < srcs.size(); ++i) { const vector& src = srcs[i]; const vector& trg = trgs[i]; ++lc; if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } denom += trg.size(); exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); z_src.clear(); z_src.resize(src.size(), 0.0); Array2D exp_refs(src.size(), trg.size(), TMatrix::Zero()); Array2D z_refs(src.size(), trg.size(), 0.0); for (unsigned j = 0; j < trg.size(); ++j) trg_pos[trg[j]].insert(j); for (unsigned i = 0; i < src.size(); ++i) { const RVector& r_s = r_src[src[i]]; const RTVector pred = r_s.transpose() * t; TMatrix& exp_m = exp_src[i]; double& z = z_src[i]; for (unsigned k = 0; k < vocab_e.size(); ++k) { const WordID v_k = vocab_e[k]; const RVector& r_t = r_trg[v_k]; const double dot_prod = pred * r_t; const double u = exp(dot_prod); z += u; const TMatrix v = r_s * r_t.transpose() * u; exp_m += v; set& ref_locs = trg_pos[v_k]; if (!ref_locs.empty()) { for (set::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { TMatrix& exp_ref_ij = exp_refs(i, *it); double& z_ref_ij = z_refs(i, *it); z_ref_ij += u; exp_ref_ij += v; } } } } for (unsigned j = 0; j < trg.size(); ++j) trg_pos[trg[j]].clear(); // model expectations for a single target generation with // uniform alignment prior // TODO: when using a non-uniform alignment, m_exp will be // a function of j (below) double m_z = 0; TMatrix m_exp = TMatrix::Zero(); for (unsigned i = 0; i < src.size(); ++i) { m_exp += exp_src[i]; m_z += z_src[i]; } m_exp /= m_z; Array2D al(src.size(), trg.size(), false); for (unsigned j = 0; j < trg.size(); ++j) { double ref_z = 0; TMatrix ref_exp = TMatrix::Zero(); int max_i = 0; double max_s = -9999999; for (unsigned i = 0; i < src.size(); ++i) { ref_exp += exp_refs(i, j); ref_z += z_refs(i, j); if (log(z_refs(i, j)) > max_s) { max_s = log(z_refs(i, j)); max_i = i; } // TODO handle alignment prob } if (ref_z <= 0) { cerr << "TRG=" << TD::Convert(trg[j]) << endl; cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; cerr << " REF_EXP=\n" << ref_exp << endl; cerr << " M_EXP=\n" << m_exp << endl; abort(); } al(max_i, j) = true; ref_exp /= ref_z; g += m_exp - ref_exp; likelihood += log(ref_z) - log(m_z); if (SGD) { t -= g * eta / num_examples; g *= 0; } } if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } } if (flag && rank == 0) { cerr << endl; } double obj = 0; if (!SGD) { Flatten(g, &flat_g); obj = -likelihood; #if HAVE_MPI rcv_grad.resize(flat_g.size(), 0.0); mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus(), 0); swap(flat_g, rcv_grad); rcv_grad.clear(); double to = 0; mpi::reduce(world, obj, to, plus(), 0); obj = to; double tlh = 0; mpi::reduce(world, likelihood, tlh, plus(), 0); likelihood = tlh; double td = 0; mpi::reduce(world, denom, td, plus(), 0); denom = td; #endif } if (rank == 0) { double gn = 0; for (unsigned i = 0; i < flat_g.size(); ++i) gn += flat_g[i]*flat_g[i]; const double base2_likelihood = likelihood / log(2); cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; cerr << " gradient norm: " << sqrt(gn) << endl; if (!SGD) { if (has_l2) { const double r = ApplyRegularization(reg_strength, flat_t, &flat_g); obj += r; cerr << " regularization: " << r << endl; } lbfgs->Optimize(obj, flat_g, &flat_t); converged = (lbfgs->HasConverged()); } } #ifdef HAVE_MPI mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); mpi::broadcast(world, converged, 0); #endif } if (rank == 0) cerr << "TRANSLATION MATRIX:" << endl << t << endl; return 0; } #endif