diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-02-19 04:27:55 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-02-19 04:27:55 -0500 |
commit | c4ffa6df1fdd89e3db9c6d3829b7b84edac20bcf (patch) | |
tree | 695176a31796d634e81db3274859bad98d58b4c1 /training/lbl_model.cc | |
parent | 0498227db2f45bcf7ac44809106846866a6f85e1 (diff) |
lbl preliminary clean up
Diffstat (limited to 'training/lbl_model.cc')
-rw-r--r-- | training/lbl_model.cc | 84 |
1 file changed, 54 insertions, 30 deletions
diff --git a/training/lbl_model.cc b/training/lbl_model.cc index 72d80a56..ccd29255 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -6,6 +6,7 @@ #else #include <cmath> +#include <set> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> @@ -20,10 +21,17 @@ namespace po = boost::program_options; using namespace std; +#define kDIMENSIONS 10 +typedef Eigen::Matrix<float, kDIMENSIONS, 1> RVector; +typedef Eigen::Matrix<float, 1, kDIMENSIONS> RTVector; +typedef Eigen::Matrix<float, kDIMENSIONS, kDIMENSIONS> TMatrix; +vector<RVector> r_src, r_trg; + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("iterations,i",po::value<unsigned>()->default_value(5),"Number of iterations of training") + ("input,i",po::value<string>(),"Input file") + ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); @@ -42,7 +50,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::notify(*conf); if (argc < 2 || conf->count("help")) { - cerr << "Usage " << argv[0] << " [OPTIONS] corpus.fr-en\n"; + cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; cerr << dcmdline_options << endl; return false; } @@ -52,33 +60,32 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = argv[argc - 1]; + const string fname = conf["input"].as<string>(); const int ITERATIONS = 
conf["iterations"].as<unsigned>(); const double diagonal_tension = conf["diagonal_tension"].as<double>(); + if (diagonal_tension < 0.0) { + cerr << "Invalid value for diagonal_tension: must be >= 0\n"; + return 1; + } string testset; if (conf.count("testset")) testset = conf["testset"].as<string>(); - double tot_len_ratio = 0; - double mean_srclen_multiplier = 0; + int lc = 0; vector<double> unnormed_a_i; - for (int iter = 0; iter < ITERATIONS; ++iter) { - cerr << "ITERATION " << (iter + 1) << endl; + string line; + string ssrc, strg; + bool flag = false; + Lattice src, trg; + set<WordID> vocab_e; + { // read through corpus, initialize int map, check lines are good + cerr << "INITIAL READ OF " << fname << endl; ReadFile rf(fname); istream& in = *rf.stream(); - double likelihood = 0; - double denom = 0.0; - int lc = 0; - bool flag = false; - string line; - string ssrc, strg; - while(true) { - getline(in, line); - if (!in) break; + while(getline(in, line)) { ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } ParseTranslatorInput(line, &ssrc, &strg); - Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); LatticeTools::ConvertTextToLattice(strg, &trg); if (src.size() == 0 || trg.size() == 0) { @@ -88,37 +95,54 @@ int main(int argc, char** argv) { } if (src.size() > unnormed_a_i.size()) unnormed_a_i.resize(src.size()); - if (iter == 0) - tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size()); + for (unsigned i = 0; i < trg.size(); ++i) { + assert(trg[i].size() == 1); + vocab_e.insert(trg[i][0].label); + } + } + } + if (flag) cerr << endl; + + // do optimization + for (int iter = 0; iter < ITERATIONS; ++iter) { + cerr << "ITERATION " << (iter + 1) << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + double likelihood = 0; + double denom = 0.0; + lc = 0; + flag = false; + while(true) { + getline(in, line); + if (!in) break; + ++lc; + if (lc % 
1000 == 0) { cerr << '.'; flag = true; } + if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } + ParseTranslatorInput(line, &ssrc, &strg); + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); denom += trg.size(); vector<double> probs(src.size() + 1); - bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; double sum = 0; const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / src.size(); double az = 0; for (int ta = 0; ta < src.size(); ++ta) { unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); az += unnormed_a_i[ta]; } for (int i = 1; i <= src.size(); ++i) { - prob_a_i = unnormed_a_i[i-1] / az; + const double prob_a_i = unnormed_a_i[i-1] / az; + // TODO probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i; sum += probs[i]; } } } - - // log(e) = 1.0 - double base2_likelihood = likelihood / log(2); - if (flag) { cerr << endl; } - if (iter == 0) { - mean_srclen_multiplier = tot_len_ratio / lc; - cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; - } + + const double base2_likelihood = likelihood / log(2); cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; |