Diffstat (limited to 'training')
-rw-r--r--  training/Makefile.am              |   8
-rw-r--r--  training/atools.cc                | 370
-rw-r--r--  training/em_utils.h               |  24
-rw-r--r--  training/lbl_model.cc             | 421
-rw-r--r--  training/model1.cc                | 109
-rw-r--r--  training/mpi_flex_optimize.cc     |  13
-rw-r--r--  training/mr_em_adapted_reduce.cc  |   6
-rw-r--r--  training/ttables.h                |   4
8 files changed, 539 insertions, 416 deletions
diff --git a/training/Makefile.am b/training/Makefile.am
index 2a11ae52..991ac210 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,12 +1,12 @@
 bin_PROGRAMS = \
   model1 \
+  lbl_model \
   test_ngram \
   mr_em_map_adapter \
   mr_em_adapted_reduce \
   mr_reduce_to_weights \
   mr_optimize_reduce \
   grammar_convert \
-  atools \
   plftools \
   collapse_weights \
   mpi_extract_reachable \
@@ -47,12 +47,12 @@ augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/lib
 test_ngram_SOURCES = test_ngram.cc
 test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz

-atools_SOURCES = atools.cc
-atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
 model1_SOURCES = model1.cc ttables.cc
 model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz

+lbl_model_SOURCES = lbl_model.cc optimize.cc
+lbl_model_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+
 grammar_convert_SOURCES = grammar_convert.cc
 grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/training/atools.cc b/training/atools.cc
deleted file mode 100644
index 42579627..00000000
--- a/training/atools.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include <queue>
-#include <map>
-#include <boost/program_options.hpp>
-#include <boost/shared_ptr.hpp>
-
-#include "filelib.h"
-#include "aligner.h"
-#include "alignment_pharaoh.h"
-
-namespace po = boost::program_options;
-using namespace std;
-using boost::shared_ptr;
-
-struct Command {
-  virtual ~Command() {}
-  virtual string Name() const = 0;
-
-  // returns 1 for alignment grid output [default]
-  // returns 2 if Summary() should be called [for AER, etc]
-  virtual int Result() const { return 1; }
-
-  virtual bool RequiresTwoOperands() const { return true; }
-  virtual void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) = 0;
-  void EnsureSize(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    x->resize(max(a.width(), b.width()), max(a.height(), b.height()));
-  }
-  static bool Safe(const Array2D<bool>& a, int i, int j) {
-    if (i >= 0 && j >= 0 && i < a.width() && j < a.height())
-      return a(i,j);
-    else
-      return false;
-  }
-  virtual void Summary() { assert(!"Summary should have been overridden"); }
-};
-
-// compute fmeasure, second alignment is reference, first is hyp
-struct FMeasureCommand : public Command {
-  FMeasureCommand() : matches(), num_predicted(), num_in_ref() {}
-  int Result() const { return 2; }
-  string Name() const { return "fmeasure"; }
-  bool RequiresTwoOperands() const { return true; }
-  void Apply(const Array2D<bool>& hyp, const Array2D<bool>& ref, Array2D<bool>* x) {
-    (void) x; // AER just computes statistics, not an alignment
-    int i_len = ref.width();
-    int j_len = ref.height();
-    for (int i = 0; i < i_len; ++i) {
-      for (int j = 0; j < j_len; ++j) {
-        if (ref(i,j)) {
-          ++num_in_ref;
-          if (Safe(hyp, i, j)) ++matches;
-        }
-      }
-    }
-    for (int i = 0; i < hyp.width(); ++i)
-      for (int j = 0; j < hyp.height(); ++j)
-        if (hyp(i,j)) ++num_predicted;
-  }
-  void Summary() {
-    if (num_predicted == 0 || num_in_ref == 0) {
-      cerr << "Insufficient statistics to compute f-measure!\n";
-      abort();
-    }
-    const double prec = static_cast<double>(matches) / num_predicted;
-    const double rec = static_cast<double>(matches) / num_in_ref;
-    cout << "P: " << prec << endl;
-    cout << "R: " << rec << endl;
-    const double f = (2.0 * prec * rec) / (rec + prec);
-    cout << "F: " << f << endl;
-  }
-  int matches;
-  int num_predicted;
-  int num_in_ref;
-};
-
-struct DisplayCommand : public Command {
-  string Name() const { return "display"; }
-  bool RequiresTwoOperands() const { return false; }
-  void Apply(const Array2D<bool>& in, const Array2D<bool>& not_used, Array2D<bool>* x) {
-    *x = in;
-    cout << *x << endl;
-  }
-};
-
-struct ConvertCommand : public Command {
-  string Name() const { return "convert"; }
-  bool RequiresTwoOperands() const { return false; }
-  void Apply(const Array2D<bool>& in, const Array2D<bool>& not_used, Array2D<bool>* x) {
-    *x = in;
-  }
-};
-
-struct InvertCommand : public Command {
-  string Name() const { return "invert"; }
-  bool RequiresTwoOperands() const { return false; }
-  void Apply(const Array2D<bool>& in, const Array2D<bool>& not_used, Array2D<bool>* x) {
-    Array2D<bool>& res = *x;
-    res.resize(in.height(), in.width());
-    for (int i = 0; i < in.height(); ++i)
-      for (int j = 0; j < in.width(); ++j)
-        res(i, j) = in(j, i);
-  }
-};
-
-struct IntersectCommand : public Command {
-  string Name() const { return "intersect"; }
-  bool RequiresTwoOperands() const { return true; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    EnsureSize(a, b, x);
-    Array2D<bool>& res = *x;
-    for (int i = 0; i < a.width(); ++i)
-      for (int j = 0; j < a.height(); ++j)
-        res(i, j) = Safe(a, i, j) && Safe(b, i, j);
-  }
-};
-
-struct UnionCommand : public Command {
-  string Name() const { return "union"; }
-  bool RequiresTwoOperands() const { return true; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    EnsureSize(a, b, x);
-    Array2D<bool>& res = *x;
-    for (int i = 0; i < res.width(); ++i)
-      for (int j = 0; j < res.height(); ++j)
-        res(i, j) = Safe(a, i, j) || Safe(b, i, j);
-  }
-};
-
-struct RefineCommand : public Command {
-  RefineCommand() {
-    neighbors_.push_back(make_pair(1,0));
-    neighbors_.push_back(make_pair(-1,0));
-    neighbors_.push_back(make_pair(0,1));
-    neighbors_.push_back(make_pair(0,-1));
-  }
-  bool RequiresTwoOperands() const { return true; }
-
-  void Align(int i, int j) {
-    res_(i, j) = true;
-    is_i_aligned_[i] = true;
-    is_j_aligned_[j] = true;
-  }
-
-  bool IsNeighborAligned(int i, int j) const {
-    for (int k = 0; k < neighbors_.size(); ++k) {
-      const int di = neighbors_[k].first;
-      const int dj = neighbors_[k].second;
-      if (Safe(res_, i + di, j + dj))
-        return true;
-    }
-    return false;
-  }
-
-  bool IsNeitherAligned(int i, int j) const {
-    return !(is_i_aligned_[i] || is_j_aligned_[j]);
-  }
-
-  bool IsOneOrBothUnaligned(int i, int j) const {
-    return !(is_i_aligned_[i] && is_j_aligned_[j]);
-  }
-
-  bool KoehnAligned(int i, int j) const {
-    return IsOneOrBothUnaligned(i, j) && IsNeighborAligned(i, j);
-  }
-
-  typedef bool (RefineCommand::*Predicate)(int i, int j) const;
-
- protected:
-  void InitRefine(
-      const Array2D<bool>& a,
-      const Array2D<bool>& b) {
-    res_.clear();
-    EnsureSize(a, b, &res_);
-    in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear();
-    EnsureSize(a, b, &in_);
-    EnsureSize(a, b, &un_);
-    is_i_aligned_.resize(res_.width(), false);
-    is_j_aligned_.resize(res_.height(), false);
-    for (int i = 0; i < in_.width(); ++i)
-      for (int j = 0; j < in_.height(); ++j) {
-        un_(i, j) = Safe(a, i, j) || Safe(b, i, j);
-        in_(i, j) = Safe(a, i, j) && Safe(b, i, j);
-        if (in_(i, j)) Align(i, j);
-      }
-  }
-  // "grow" the resulting alignment using the points in adds
-  // if they match the constraints determined by pred
-  void Grow(Predicate pred, bool idempotent, const Array2D<bool>& adds) {
-    if (idempotent) {
-      for (int i = 0; i < adds.width(); ++i)
-        for (int j = 0; j < adds.height(); ++j) {
-          if (adds(i, j) && !res_(i, j) &&
-              (this->*pred)(i, j)) Align(i, j);
-        }
-      return;
-    }
-    set<pair<int, int> > p;
-    for (int i = 0; i < adds.width(); ++i)
-      for (int j = 0; j < adds.height(); ++j)
-        if (adds(i, j) && !res_(i, j))
-          p.insert(make_pair(i, j));
-    bool keep_going = !p.empty();
-    while (keep_going) {
-      keep_going = false;
-      for (set<pair<int, int> >::iterator pi = p.begin();
-           pi != p.end(); ++pi) {
-        if ((this->*pred)(pi->first, pi->second)) {
-          Align(pi->first, pi->second);
-          p.erase(pi);
-          keep_going = true;
-        }
-      }
-    }
-  }
-  Array2D<bool> res_;  // refined alignment
-  Array2D<bool> in_;   // intersection alignment
-  Array2D<bool> un_;   // union alignment
-  vector<bool> is_i_aligned_;
-  vector<bool> is_j_aligned_;
-  vector<pair<int,int> > neighbors_;
-};
-
-struct DiagCommand : public RefineCommand {
-  DiagCommand() {
-    neighbors_.push_back(make_pair(1,1));
-    neighbors_.push_back(make_pair(-1,1));
-    neighbors_.push_back(make_pair(1,-1));
-    neighbors_.push_back(make_pair(-1,-1));
-  }
-};
-
-struct GDCommand : public DiagCommand {
-  string Name() const { return "grow-diag"; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    InitRefine(a, b);
-    Grow(&RefineCommand::KoehnAligned, false, un_);
-    *x = res_;
-  }
-};
-
-struct GDFCommand : public DiagCommand {
-  string Name() const { return "grow-diag-final"; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    InitRefine(a, b);
-    Grow(&RefineCommand::KoehnAligned, false, un_);
-    Grow(&RefineCommand::IsOneOrBothUnaligned, true, a);
-    Grow(&RefineCommand::IsOneOrBothUnaligned, true, b);
-    *x = res_;
-  }
-};
-
-struct GDFACommand : public DiagCommand {
-  string Name() const { return "grow-diag-final-and"; }
-  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
-    InitRefine(a, b);
-    Grow(&RefineCommand::KoehnAligned, false, un_);
-    Grow(&RefineCommand::IsNeitherAligned, true, a);
-    Grow(&RefineCommand::IsNeitherAligned, true, b);
-    *x = res_;
-  }
-};
-
-map<string, boost::shared_ptr<Command> > commands;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  ostringstream os;
-  os << "[REQ] Operation to perform:";
-  for (map<string, boost::shared_ptr<Command> >::iterator it = commands.begin();
-       it != commands.end(); ++it) {
-    os << ' ' << it->first;
-  }
-  string cstr = os.str();
-  opts.add_options()
-        ("input_1,i", po::value<string>(), "[REQ] Alignment 1 file, - for STDIN")
-        ("input_2,j", po::value<string>(), "[OPT] Alignment 2 file, - for STDIN")
-        ("command,c", po::value<string>()->default_value("convert"), cstr.c_str())
-        ("help,h", "Print this help message and exit");
-  po::options_description clo("Command line options");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  po::notify(*conf);
-
-  if (conf->count("help") || conf->count("input_1") == 0 || conf->count("command") == 0) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-  const string cmd = (*conf)["command"].as<string>();
-  if (commands.count(cmd) == 0) {
-    cerr << "Don't understand command: " << cmd << endl;
-    exit(1);
-  }
-  if (commands[cmd]->RequiresTwoOperands()) {
-    if (conf->count("input_2") == 0) {
-      cerr << "Command '" << cmd << "' requires two alignment files\n";
-      exit(1);
-    }
-    if ((*conf)["input_1"].as<string>() == "-" && (*conf)["input_2"].as<string>() == "-") {
-      cerr << "Both inputs cannot be STDIN\n";
-      exit(1);
-    }
-  } else {
-    if (conf->count("input_2") != 0) {
-      cerr << "Command '" << cmd << "' requires only one alignment file\n";
-      exit(1);
-    }
-  }
-}
-
-template<class C> static void AddCommand() {
-  C* c = new C;
-  commands[c->Name()].reset(c);
-}
-
-int main(int argc, char **argv) {
-  AddCommand<ConvertCommand>();
-  AddCommand<DisplayCommand>();
-  AddCommand<InvertCommand>();
-  AddCommand<IntersectCommand>();
-  AddCommand<UnionCommand>();
-  AddCommand<GDCommand>();
-  AddCommand<GDFCommand>();
-  AddCommand<GDFACommand>();
-  AddCommand<FMeasureCommand>();
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  Command& cmd = *commands[conf["command"].as<string>()];
-  boost::shared_ptr<ReadFile> rf1(new ReadFile(conf["input_1"].as<string>()));
-  boost::shared_ptr<ReadFile> rf2;
-  if (cmd.RequiresTwoOperands())
-    rf2.reset(new ReadFile(conf["input_2"].as<string>()));
-  istream* in1 = rf1->stream();
-  istream* in2 = NULL;
-  if (rf2) in2 = rf2->stream();
-  while(*in1) {
-    string line1;
-    string line2;
-    getline(*in1, line1);
-    if (in2) {
-      getline(*in2, line2);
-      if ((*in1 && !*in2) || (*in2 && !*in1)) {
-        cerr << "Mismatched number of lines!\n";
-        exit(1);
-      }
-    }
-    if (line1.empty() && !*in1) break;
-    shared_ptr<Array2D<bool> > out(new Array2D<bool>);
-    shared_ptr<Array2D<bool> > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1);
-    if (in2) {
-      shared_ptr<Array2D<bool> > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2);
-      cmd.Apply(*a1, *a2, out.get());
-    } else {
-      Array2D<bool> dummy;
-      cmd.Apply(*a1, dummy, out.get());
-    }
-
-    if (cmd.Result() == 1) {
-      AlignmentPharaoh::SerializePharaohFormat(*out, &cout);
-    }
-  }
-  if (cmd.Result() == 2)
-    cmd.Summary();
-  return 0;
-}
diff --git a/training/em_utils.h b/training/em_utils.h
deleted file mode 100644
index 37762978..00000000
--- a/training/em_utils.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _EM_UTILS_H_
-#define _EM_UTILS_H_
-
-#include "config.h"
-#ifdef HAVE_BOOST_DIGAMMA
-#include <boost/math/special_functions/digamma.hpp>
-using boost::math::digamma;
-#else
-#warning Using Mark Johnsons digamma()
-#include <cmath>
-inline double digamma(double x) {
-  double result = 0, xx, xx2, xx4;
-  assert(x > 0);
-  for ( ; x < 7; ++x)
-    result -= 1/x;
-  x -= 1.0/2.0;
-  xx = 1.0/x;
-  xx2 = xx*xx;
-  xx4 = xx2*xx2;
-  result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
-  return result;
-}
-#endif
-#endif
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
new file mode 100644
index 00000000..a46ce33c
--- /dev/null
+++ b/training/lbl_model.cc
@@ -0,0 +1,421 @@
+#include <iostream>
+
+#include "config.h"
+#ifndef HAVE_EIGEN
+  int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; }
+#else
+
+#include <cstdlib>
+#include <algorithm>
+#include <cmath>
+#include <set>
+#include <cstring> // memset
+#include <ctime>
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+#include <boost/archive/text_oarchive.hpp>
+namespace mpi = boost::mpi;
+#endif
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <Eigen/Dense>
+
+#include "corpus_tools.h"
+#include "optimize.h"
+#include "array2d.h"
+#include "m.h"
+#include "lattice.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "tdict.h"
+
+namespace po = boost::program_options;
+using namespace std;
+
+#define kDIMENSIONS 10
+typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector;
+typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector;
+typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix;
+vector<RVector> r_src, r_trg;
+
+#if HAVE_MPI
+namespace boost {
+namespace serialization {
+
+template<class Archive>
+void serialize(Archive & ar, RVector & v, const unsigned int version) {
+  for (unsigned i = 0; i < kDIMENSIONS; ++i)
+    ar & v[i];
+}
+
+} // namespace serialization
+} // namespace boost
+#endif
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("input,i",po::value<string>(),"Input file")
+        ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
+        ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
+        ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD")
+        ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)")
+        ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)")
+        ("random_seed,s", po::value<unsigned>(), "Random seed")
+        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
+        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (argc < 2 || conf->count("help")) {
+    cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void Normalize(RVector* v) {
+  double norm = v->norm();
+  assert(norm > 0.0f);
+  *v /= norm;
+}
+
+void Flatten(const TMatrix& m, vector<double>* v) {
+  unsigned c = 0;
+  v->resize(kDIMENSIONS * kDIMENSIONS);
+  for (unsigned i = 0; i < kDIMENSIONS; ++i)
+    for (unsigned j = 0; j < kDIMENSIONS; ++j) {
+      assert(boost::math::isfinite(m(i, j)));
+      (*v)[c++] = m(i,j);
+    }
+}
+
+void Unflatten(const vector<double>& v, TMatrix* m) {
+  unsigned c = 0;
+  for (unsigned i = 0; i < kDIMENSIONS; ++i)
+    for (unsigned j = 0; j < kDIMENSIONS; ++j) {
+      assert(boost::math::isfinite(v[c]));
+      (*m)(i, j) = v[c++];
+    }
+}
+
+double ApplyRegularization(const double C,
+                           const vector<double>& weights,
+                           vector<double>* g) {
+  assert(weights.size() == g->size());
+  double reg = 0;
+  for (size_t i = 0; i < weights.size(); ++i) {
+    const double& w_i = weights[i];
+    double& g_i = (*g)[i];
+    reg += C * w_i * w_i;
+    g_i += 2 * C * w_i;
+  }
+  return reg;
+}
+
+void LoadEmbeddings(const string& filename, vector<RVector>* pv) {
+  vector<RVector>& v = *pv;
+  cerr << "Reading embeddings from " << filename << " ...\n";
+  ReadFile rf(filename);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  while(getline(in, line)) {
+    ++lc;
+    size_t cur = line.find(' ');
+    if (cur == string::npos || cur == 0) {
+      cerr << "Parse error reading line " << lc << ":\n" << line << endl;
+      abort();
+    }
+    WordID w = TD::Convert(line.substr(0, cur));
+    if (w >= v.size()) continue;
+    RVector& curv = v[w];
+    line[cur] = 0;
+    size_t start = cur + 1;
+    cur = start + 1;
+    size_t c = 0;
+    while(cur < line.size()) {
+      if (line[cur] == ' ') {
+        line[cur] = 0;
+        curv[c++] = strtod(&line[start], NULL);
+        start = cur + 1;
+        cur = start;
+        if (c == kDIMENSIONS) break;
+      }
+      ++cur;
+    }
+    if (c < kDIMENSIONS && cur != start) {
+      if (cur < line.size()) line[cur] = 0;
+      curv[c++] = strtod(&line[start], NULL);
+    }
+    if (c != kDIMENSIONS) {
+      static bool first = true;
+      if (first) {
+        cerr << "  read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n";
+        first = false;
+      }
+      for (; c < kDIMENSIONS; ++c) curv[c] = rand();
+    }
+    if (c == kDIMENSIONS && cur != line.size()) {
+      static bool first = true;
+      if (first) {
+        cerr << "  embedding file contains more dimensions than configured with, truncating.\n";
+        first = false;
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  std::cerr << "**MPI enabled.\n";
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  std::cerr << "**MPI disabled.\n";
+  const int rank = 0;
+  const int size = 1;
+#endif
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+  const string fname = conf["input"].as<string>();
+  const double reg_strength = conf["regularization_strength"].as<double>();
+  const bool has_l2 = reg_strength;
+  assert(reg_strength >= 0.0f);
+  const int ITERATIONS = conf["iterations"].as<unsigned>();
+  const double eta = conf["eta"].as<double>();
+  const double diagonal_tension = conf["diagonal_tension"].as<double>();
+  bool SGD = false;
+  if (diagonal_tension < 0.0) {
+    cerr << "Invalid value for diagonal_tension: must be >= 0\n";
+    return 1;
+  }
+  string testset;
+  if (conf.count("testset")) testset = conf["testset"].as<string>();
+
+  unsigned lc = 0;
+  vector<double> unnormed_a_i;
+  bool flag = false;
+  vector<vector<WordID> > srcs, trgs;
+  vector<WordID> vocab_e;
+  {
+    set<WordID> svocab_e, svocab_f;
+    CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size);
+    copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
+  }
+  cerr << "Number of target word types: " << vocab_e.size() << endl;
+  const double num_examples = lc;
+
+  boost::shared_ptr<LBFGSOptimizer> lbfgs;
+  if (rank == 0)
+    lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100));
+  r_trg.resize(TD::NumWords() + 1);
+  r_src.resize(TD::NumWords() + 1);
+  vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
+
+  if (conf.count("random_seed")) {
+    srand(conf["random_seed"].as<unsigned>());
+  } else {
+    unsigned seed = time(NULL) + rank * 100;
+    cerr << "Random seed: " << seed << endl;
+    srand(seed);
+  }
+
+  TMatrix t = TMatrix::Zero();
+  if (rank == 0) {
+    t = TMatrix::Random() / 50.0;
+    for (unsigned i = 1; i < r_trg.size(); ++i) {
+      r_trg[i] = RVector::Random();
+      r_src[i] = RVector::Random();
+    }
+    if (conf.count("source_embeddings"))
+      LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
+    if (conf.count("target_embeddings"))
+      LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
+  }
+
+  // do optimization
+  TMatrix g = TMatrix::Zero();
+  vector<TMatrix> exp_src;
+  vector<double> z_src;
+  vector<double> flat_g, flat_t, rcv_grad;
+  Flatten(t, &flat_t);
+  bool converged = false;
+#if HAVE_MPI
+  mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
+  mpi::broadcast(world, r_trg, 0);
+  mpi::broadcast(world, r_src, 0);
+#endif
+  cerr << "rank=" << rank << ": " << r_trg[0][4] << endl;
+  for (int iter = 0; !converged && iter < ITERATIONS; ++iter) {
+    if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl;
+    Unflatten(flat_t, &t);
+    double likelihood = 0;
+    double denom = 0.0;
+    lc = 0;
+    flag = false;
+    g *= 0;
+    for (unsigned i = 0; i < srcs.size(); ++i) {
+      const vector<WordID>& src = srcs[i];
+      const vector<WordID>& trg = trgs[i];
+      ++lc;
+      if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; }
+      if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
+      denom += trg.size();
+
+      exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
+      z_src.clear(); z_src.resize(src.size(), 0.0);
+      Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
+      Array2D<double> z_refs(src.size(), trg.size(), 0.0);
+      for (unsigned j = 0; j < trg.size(); ++j)
+        trg_pos[trg[j]].insert(j);
+
+      for (unsigned i = 0; i < src.size(); ++i) {
+        const RVector& r_s = r_src[src[i]];
+        const RTVector pred = r_s.transpose() * t;
+        TMatrix& exp_m = exp_src[i];
+        double& z = z_src[i];
+        for (unsigned k = 0; k < vocab_e.size(); ++k) {
+          const WordID v_k = vocab_e[k];
+          const RVector& r_t = r_trg[v_k];
+          const double dot_prod = pred * r_t;
+          const double u = exp(dot_prod);
+          z += u;
+          const TMatrix v = r_s * r_t.transpose() * u;
+          exp_m += v;
+          set<unsigned>& ref_locs = trg_pos[v_k];
+          if (!ref_locs.empty()) {
+            for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) {
+              TMatrix& exp_ref_ij = exp_refs(i, *it);
+              double& z_ref_ij = z_refs(i, *it);
+              z_ref_ij += u;
+              exp_ref_ij += v;
+            }
+          }
+        }
+      }
+      for (unsigned j = 0; j < trg.size(); ++j)
+        trg_pos[trg[j]].clear();
+
+      // model expectations for a single target generation with
+      // uniform alignment prior
+      // TODO: when using a non-uniform alignment, m_exp will be
+      // a function of j (below)
+      double m_z = 0;
+      TMatrix m_exp = TMatrix::Zero();
+      for (unsigned i = 0; i < src.size(); ++i) {
+        m_exp += exp_src[i];
+        m_z += z_src[i];
+      }
+      m_exp /= m_z;
+
+      Array2D<bool> al(src.size(), trg.size(), false);
+      for (unsigned j = 0; j < trg.size(); ++j) {
+        double ref_z = 0;
+        TMatrix ref_exp = TMatrix::Zero();
+        int max_i = 0;
+        double max_s = -9999999;
+        for (unsigned i = 0; i < src.size(); ++i) {
+          ref_exp += exp_refs(i, j);
+          ref_z += z_refs(i, j);
+          if (log(z_refs(i, j)) > max_s) {
+            max_s = log(z_refs(i, j));
+            max_i = i;
+          }
+          // TODO handle alignment prob
+        }
+        if (ref_z <= 0) {
+          cerr << "TRG=" << TD::Convert(trg[j]) << endl;
+          cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl;
+          cerr << " REF_EXP=\n" << ref_exp << endl;
+          cerr << " M_EXP=\n" << m_exp << endl;
+          abort();
+        }
+        al(max_i, j) = true;
+        ref_exp /= ref_z;
+        g += m_exp - ref_exp;
+        likelihood += log(ref_z) - log(m_z);
+        if (SGD) {
+          t -= g * eta / num_examples;
+          g *= 0;
+        }
+      }
+
+      if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; }
+    }
+    if (flag && rank == 0) { cerr << endl; }
+
+    double obj = 0;
+    if (!SGD) {
+      Flatten(g, &flat_g);
+      obj = -likelihood;
+#if HAVE_MPI
+      rcv_grad.resize(flat_g.size(), 0.0);
+      mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0);
+      swap(flat_g, rcv_grad);
+      rcv_grad.clear();
+
+      double to = 0;
+      mpi::reduce(world, obj, to, plus<double>(), 0);
+      obj = to;
+      double tlh = 0;
+      mpi::reduce(world, likelihood, tlh, plus<double>(), 0);
+      likelihood = tlh;
+      double td = 0;
+      mpi::reduce(world, denom, td, plus<double>(), 0);
+      denom = td;
+#endif
+    }
+
+    if (rank == 0) {
+      double gn = 0;
+      for (unsigned i = 0; i < flat_g.size(); ++i)
+        gn += flat_g[i]*flat_g[i];
+      const double base2_likelihood = likelihood / log(2);
+      cerr << "  log_e likelihood: " << likelihood << endl;
+      cerr << "  log_2 likelihood: " << base2_likelihood << endl;
+      cerr << "     cross entropy: " << (-base2_likelihood / denom) << endl;
+      cerr << "        perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
+      cerr << "     gradient norm: " << sqrt(gn) << endl;
+      if (!SGD) {
+        if (has_l2) {
+          const double r = ApplyRegularization(reg_strength,
+                                               flat_t,
+                                               &flat_g);
+          obj += r;
+          cerr << "    regularization: " << r << endl;
+        }
+        lbfgs->Optimize(obj, flat_g, &flat_t);
+        converged = (lbfgs->HasConverged());
+      }
+    }
+#ifdef HAVE_MPI
+    mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
+    mpi::broadcast(world, converged, 0);
+#endif
+  }
+  if (rank == 0)
+    cerr << "TRANSLATION MATRIX:" << endl << t << endl;
+  return 0;
+}
+
+#endif
diff --git a/training/model1.cc b/training/model1.cc
index b9590ece..73104304 100644
--- a/training/model1.cc
+++ b/training/model1.cc
@@ -4,12 +4,12 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>

+#include "m.h"
 #include "lattice.h"
 #include "stringlib.h"
 #include "filelib.h"
 #include "ttables.h"
 #include "tdict.h"
-#include "em_utils.h"

 namespace po = boost::program_options;
 using namespace std;
@@ -20,7 +20,12 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("iterations,i",po::value<unsigned>()->default_value(5),"Number of iterations of EM training")
         ("beam_threshold,t",po::value<double>()->default_value(-4),"log_10 of beam threshold (-10000 to include everything, 0 max)")
         ("no_null_word,N","Do not generate from the null token")
+        ("write_alignments,A", "Write alignments instead of parameters")
+        ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal")
+        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)")
+        ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?")
         ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights")
+        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model")
         ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior")
         ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)");
   po::options_description clo("Command line options");
@@ -56,7 +61,14 @@ int main(int argc, char** argv) {
   const WordID kNULL = TD::Convert("<eps>");
   const bool add_viterbi = (conf.count("no_add_viterbi") == 0);
   const bool variational_bayes = (conf.count("variational_bayes") > 0);
+  const bool write_alignments = (conf.count("write_alignments") > 0);
+  const double diagonal_tension = conf["diagonal_tension"].as<double>();
+  const double prob_align_null = conf["prob_align_null"].as<double>();
+  string testset;
+  if (conf.count("testset")) testset = conf["testset"].as<string>();
+  const double prob_align_not_null = 1.0 - prob_align_null;
   const double alpha = conf["alpha"].as<double>();
+  const bool favor_diagonal = conf.count("favor_diagonal");
   if (variational_bayes && alpha <= 0.0) {
     cerr << "--alpha must be > 0\n";
     return 1;
@@ -64,6 +76,9 @@ int main(int argc, char** argv) {
   TTable tt;
   TTable::Word2Word2Double was_viterbi;
+  double tot_len_ratio = 0;
+  double mean_srclen_multiplier = 0;
+  vector<double> unnormed_a_i;
   for (int iter = 0; iter < ITERATIONS; ++iter) {
     const bool final_iteration = (iter == (ITERATIONS - 1));
     cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl;
@@ -74,13 +89,13 @@ int main(int argc, char** argv) {
     int lc = 0;
     bool flag = false;
     string line;
+    string ssrc, strg;
     while(true) {
       getline(in, line);
       if (!in) break;
       ++lc;
       if (lc % 1000 == 0) { cerr << '.'; flag = true; }
       if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
-      string ssrc, strg;
       ParseTranslatorInput(line, &ssrc, &strg);
       Lattice src, trg;
       LatticeTools::ConvertTextToLattice(ssrc, &src);
@@ -90,34 +105,60 @@ int main(int argc, char** argv) {
         assert(src.size() > 0);
         assert(trg.size() > 0);
       }
+      if (src.size() > unnormed_a_i.size())
+        unnormed_a_i.resize(src.size());
+      if (iter == 0)
+        tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size());
       denom += trg.size();
       vector<double> probs(src.size() + 1);
-      const double src_logprob = -log(src.size() + 1);
+      bool first_al = true;  // used for write_alignments
      for (int j = 0; j < trg.size(); ++j) {
        const WordID& f_j = trg[j][0].label;
        double sum = 0;
+       const double j_over_ts = double(j) / trg.size();
+       double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
        if (use_null) {
-         probs[0] = tt.prob(kNULL, f_j);
+         if (favor_diagonal) prob_a_i = prob_align_null;
+         probs[0] = tt.prob(kNULL, f_j) * prob_a_i;
          sum += probs[0];
        }
+       double az = 0;
+       if (favor_diagonal) {
+         for (int ta = 0; ta < src.size(); ++ta) {
+           unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
+           az += unnormed_a_i[ta];
+         }
+         az /= prob_align_not_null;
+       }
        for (int i = 1; i <= src.size(); ++i) {
-         probs[i] = tt.prob(src[i-1][0].label, f_j);
+         if (favor_diagonal)
+           prob_a_i = unnormed_a_i[i-1] / az;
+         probs[i] = tt.prob(src[i-1][0].label, f_j) * prob_a_i;
          sum += probs[i];
        }
        if (final_iteration) {
-         if (add_viterbi) {
+         if (add_viterbi || write_alignments) {
            WordID max_i = 0;
            double max_p = -1;
+           int max_index = -1;
            if (use_null) {
              max_i = kNULL;
+             max_index = 0;
              max_p = probs[0];
            }
            for (int i = 1; i <= src.size(); ++i) {
              if (probs[i] > max_p) {
+               max_index = i;
                max_p = probs[i];
                max_i = src[i-1][0].label;
              }
            }
+           if (write_alignments) {
+             if (max_index > 0) {
+               if (first_al) first_al = false; else cout << ' ';
+               cout << (max_index - 1) << "-" << j;
+             }
+           }
            was_viterbi[max_i][f_j] = 1.0;
          }
        } else {
@@ -126,14 +167,19 @@ int main(int argc, char** argv) {
          for (int i = 1; i <= src.size(); ++i)
            tt.Increment(src[i-1][0].label, f_j, probs[i] / sum);
        }
-       likelihood += log(sum) + src_logprob;
+       likelihood += log(sum);
      }
+     if (write_alignments && final_iteration) cout << endl;
    }
    // log(e) = 1.0
    double base2_likelihood = likelihood / log(2);
    if (flag) { cerr << endl; }
+   if (iter == 0) {
+     mean_srclen_multiplier = tot_len_ratio / lc;
+     cerr << "expected target length = source length * " << mean_srclen_multiplier << endl;
+   }
    cerr << "  log_e likelihood: " << likelihood << endl;
    cerr << "  log_2 likelihood: " << base2_likelihood << endl;
    cerr << "     cross entropy: " << (-base2_likelihood / denom) << endl;
@@ -145,6 +191,55 @@ int main(int argc, char** argv) {
       tt.Normalize();
     }
   }
+  if (testset.size()) {
+    ReadFile rf(testset);
+    istream& in = *rf.stream();
+    int lc = 0;
+    double tlp = 0;
+    string ssrc, strg, line;
+    while (getline(in, line)) {
+      ++lc;
+      ParseTranslatorInput(line, &ssrc, &strg);
+      Lattice src, trg;
+      LatticeTools::ConvertTextToLattice(ssrc, &src);
+      LatticeTools::ConvertTextToLattice(strg, &trg);
+      double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier);
+      if (src.size() > unnormed_a_i.size())
+        unnormed_a_i.resize(src.size());
+
+      // compute likelihood
+      for (int j = 0; j < trg.size(); ++j) {
+        const WordID& f_j = trg[j][0].label;
+        double sum = 0;
+        const double j_over_ts = double(j) / trg.size();
+        double prob_a_i = 1.0 / (src.size() + use_null);  // uniform (model 1)
+        if (use_null) {
+          if (favor_diagonal) prob_a_i = prob_align_null;
+          sum += tt.prob(kNULL, f_j) * prob_a_i;
+        }
+        double az = 0;
+        if (favor_diagonal) {
+          for (int ta = 0; ta < src.size(); ++ta) {
+            unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
+            az += unnormed_a_i[ta];
+          }
+          az /= prob_align_not_null;
+        }
+        for (int i = 1; i <= src.size(); ++i) {
+          if (favor_diagonal)
+            prob_a_i = unnormed_a_i[i-1] / az;
+          sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i;
+        }
+        log_prob += log(sum);
+      }
+      tlp += log_prob;
+      cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl;
+    }
+    cerr << "TOTAL LOG PROB " << tlp << endl;
+  }
+
+  if (write_alignments) return 0;
+
   for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) {
     const TTable::Word2Double& cpd = ei->second;
     const TTable::Word2Double& vit = was_viterbi[ei->first];
diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc
index 00746532..a9197208 100644
--- a/training/mpi_flex_optimize.cc
+++ b/training/mpi_flex_optimize.cc
@@ -205,7 +205,7 @@ int main(int argc, char** argv) {
   const int size = 1;
   const int rank = 0;
 #endif
-  if (size > 0) SetSilent(true);  // turn off verbose decoder output
+  if (size > 1) SetSilent(true);  // turn off verbose decoder output
   register_feature_functions();
   MT19937* rng = NULL;

@@ -272,6 +272,7 @@ int main(int argc, char** argv) {

   int iter = -1;
   bool converged = false;
+  vector<double> gg;
   while (!converged) {
 #ifdef HAVE_MPI
     mpi::timer timer;
@@ -343,7 +344,7 @@ int main(int argc, char** argv) {

       double obj = 0;
 #ifdef HAVE_MPI
-      // TODO obj
+      reduce(world, local_obj, obj, std::plus<double>(), 0);
       reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
 #else
       obj = local_obj;
@@ -354,13 +355,14 @@ int main(int argc, char** argv) {

       // g /= (size_per_proc * size);
       if (!o) o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
-      vector<double> gg(FD::NumFeats());
+      gg.clear();
+      gg.resize(FD::NumFeats());
       if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); }
       for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it)
         if (it->first) { gg[it->first] = it->second; }
       g.clear();
       double r = ApplyRegularizationTerms(regularization_strength,
-                                          time_series_strength * (iter == 0 ? 0.0 : 1.0),
+                                          time_series_strength, // * (iter == 0 ? 0.0 : 1.0),
                                           cur_weights,
                                           prev_weights,
                                           &gg);
@@ -375,10 +377,9 @@ int main(int argc, char** argv) {
       o->Optimize(obj, gg, &cur_weights);
     }
 #ifdef HAVE_MPI
-    // broadcast(world, x, 0);
+    broadcast(world, cur_weights, 0);
     broadcast(world, converged, 0);
     world.barrier();
-    if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
 #endif
   }
   prev_weights = cur_weights;
diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc
index d4c16a2f..f65b5440 100644
--- a/training/mr_em_adapted_reduce.cc
+++ b/training/mr_em_adapted_reduce.cc
@@ -10,7 +10,7 @@
 #include "fdict.h"
 #include "weights.h"
 #include "sparse_vector.h"
-#include "em_utils.h"
+#include "m.h"

 using namespace std;
 namespace po = boost::program_options;
@@ -63,11 +63,11 @@ void Maximize(const bool use_vb,
   assert(tot > 0.0);
   double ltot = log(tot);
   if (use_vb)
-    ltot = digamma(tot + total_event_types * alpha);
+    ltot = Md::digamma(tot + total_event_types * alpha);
   for (SparseVector<double>::const_iterator it = counts.begin();
        it != counts.end(); ++it) {
     if (use_vb) {
-      pc->set_value(it->first, NoZero(digamma(it->second + alpha) - ltot));
+      pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot));
     } else {
       pc->set_value(it->first, NoZero(log(it->second) - ltot));
     }
diff --git a/training/ttables.h b/training/ttables.h
index 50d85a68..bf3351d2 100644
--- a/training/ttables.h
+++ b/training/ttables.h
@@ -4,9 +4,9 @@
 #include <iostream>
 #include <tr1/unordered_map>

+#include "m.h"
 #include "wordid.h"
 #include "tdict.h"
-#include "em_utils.h"

 class TTable {
  public:
@@ -39,7 +39,7 @@ class TTable {
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
         tot += it->second + alpha;
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
-        it->second = exp(digamma(it->second + alpha) - digamma(tot));
+        it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot));
     }
     counts.clear();
   }
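
Note: the favor_diagonal / diagonal_tension logic added to model1.cc above replaces the uniform Model 1 alignment prior with a static one, p(a_j = i) proportional to exp(-|i/|f| - j/|e|| * T), where the non-null source positions are renormalized to share 1 - prob_align_null of the mass. Below is a minimal standalone sketch of that computation, extracted for clarity; the function name DiagonalPrior and the demo values in main are illustrative only, not part of this commit.

#include <cmath>
#include <cstdio>
#include <vector>

// Returns p[0..src_len], where p[0] is the null-alignment probability and
// p[i] (i >= 1) is the prior that target position j aligns to source word i-1.
std::vector<double> DiagonalPrior(int src_len, int j, int trg_len,
                                  double tension, double prob_align_null) {
  std::vector<double> p(src_len + 1);
  p[0] = prob_align_null;
  const double j_over_ts = double(j) / trg_len;
  double az = 0;  // normalizer, as in the patch
  for (int ta = 0; ta < src_len; ++ta) {
    p[ta + 1] = exp(-fabs(double(ta) / src_len - j_over_ts) * tension);
    az += p[ta + 1];
  }
  az /= (1.0 - prob_align_null);  // non-null mass sums to 1 - prob_align_null
  for (int ta = 0; ta < src_len; ++ta) p[ta + 1] /= az;
  return p;
}

int main() {
  // With a 5-word source and target position 2 of 5, mass peaks near i = 3
  // (source index 2), matching the diagonal; tension 4.0 and null prob 0.08
  // are the patch's defaults.
  std::vector<double> p = DiagonalPrior(5, 2, 5, 4.0, 0.08);
  for (size_t i = 0; i < p.size(); ++i) printf("p[%zu] = %.4f\n", i, p[i]);
  return 0;
}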