| | | |
|---|---|---|
| author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-02-21 11:53:01 -0500 |
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-02-21 11:53:01 -0500 |
| commit | 9e45f895aaec5c7a2f362aa532ca5ca4325e102b (patch) | |
| tree | c33929265e5f8617467cd471ef1056285380b52e | |
| parent | 9e82ba7123310640e96aef5507eebffb434b9a51 (diff) | |
basic lbl model, nothing to see here
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | training/lbl_model.cc | 147 |

1 file changed, 126 insertions, 21 deletions
```diff
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
index ccd29255..4759eedc 100644
--- a/training/lbl_model.cc
+++ b/training/lbl_model.cc
@@ -5,13 +5,18 @@
 int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; }
 #else
 
+#include <cstdlib>
+#include <algorithm>
 #include <cmath>
 #include <set>
+#include <cstring> // memset
+#include <ctime>
 
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 #include <Eigen/Dense>
 
+#include "array2d.h"
 #include "m.h"
 #include "lattice.h"
 #include "stringlib.h"
@@ -21,7 +26,7 @@
 namespace po = boost::program_options;
 using namespace std;
 
-#define kDIMENSIONS 10
+#define kDIMENSIONS 25
 typedef Eigen::Matrix<float, kDIMENSIONS, 1> RVector;
 typedef Eigen::Matrix<float, 1, kDIMENSIONS> RTVector;
 typedef Eigen::Matrix<float, kDIMENSIONS, kDIMENSIONS> TMatrix;
@@ -32,6 +37,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   opts.add_options()
         ("input,i",po::value<string>(),"Input file")
         ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
+        ("eta,e", po::value<float>()->default_value(0.1f), "Eta for SGD")
+        ("random_seed", po::value<unsigned>(), "Random seed")
         ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
         ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
   po::options_description clo("Command line options");
@@ -57,12 +64,19 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   return true;
 }
 
+void Normalize(RVector* v) {
+  float norm = v->norm();
+  *v /= norm;
+}
+
 int main(int argc, char** argv) {
   po::variables_map conf;
   if (!InitCommandLine(argc, argv, &conf)) return 1;
   const string fname = conf["input"].as<string>();
   const int ITERATIONS = conf["iterations"].as<unsigned>();
+  const float eta = conf["eta"].as<float>();
   const double diagonal_tension = conf["diagonal_tension"].as<double>();
+  bool SGD = true;
   if (diagonal_tension < 0.0) {
     cerr << "Invalid value for diagonal_tension: must be >= 0\n";
     return 1;
@@ -70,14 +84,15 @@ int main(int argc, char** argv) {
   string testset;
   if (conf.count("testset")) testset = conf["testset"].as<string>();
 
-  int lc = 0;
+  unsigned lc = 0;
   vector<double> unnormed_a_i;
   string line;
   string ssrc, strg;
   bool flag = false;
   Lattice src, trg;
-  set<WordID> vocab_e;
+  vector<WordID> vocab_e;
   { // read through corpus, initialize int map, check lines are good
+    set<WordID> svocab_e;
     cerr << "INITIAL READ OF " << fname << endl;
     ReadFile rf(fname);
     istream& in = *rf.stream();
@@ -97,13 +112,39 @@ int main(int argc, char** argv) {
         unnormed_a_i.resize(src.size());
       for (unsigned i = 0; i < trg.size(); ++i) {
         assert(trg[i].size() == 1);
-        vocab_e.insert(trg[i][0].label);
+        svocab_e.insert(trg[i][0].label);
       }
     }
+    copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
   }
   if (flag) cerr << endl;
+  cerr << "Number of target word types: " << vocab_e.size() << endl;
+  const float num_examples = lc;
+
+  r_trg.resize(TD::NumWords() + 1);
+  r_src.resize(TD::NumWords() + 1);
+  if (conf.count("random_seed")) {
+    srand(conf["random_seed"].as<unsigned>());
+  } else {
+    unsigned seed = time(NULL);
+    cerr << "Random seed: " << seed << endl;
+    srand(seed);
+  }
+  TMatrix t = TMatrix::Random() / 100.0;
+  for (unsigned i = 1; i < r_trg.size(); ++i) {
+    r_trg[i] = RVector::Random();
+    r_src[i] = RVector::Random();
+    r_trg[i][i % kDIMENSIONS] = 0.5;
+    r_src[i][(i-1) % kDIMENSIONS] = 0.5;
+    Normalize(&r_trg[i]);
+    Normalize(&r_src[i]);
+  }
+  vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
 
   // do optimization
+  TMatrix g;
+  vector<TMatrix> exp_src;
+  vector<double> z_src;
   for (int iter = 0; iter < ITERATIONS; ++iter) {
     cerr << "ITERATION " << (iter + 1) << endl;
     ReadFile rf(fname);
@@ -112,9 +153,8 @@ int main(int argc, char** argv) {
     double denom = 0.0;
     lc = 0;
     flag = false;
-    while(true) {
-      getline(in, line);
-      if (!in) break;
+    g *= 0;
+    while(getline(in, line)) {
       ++lc;
       if (lc % 1000 == 0) { cerr << '.'; flag = true; }
       if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
@@ -122,23 +162,86 @@ int main(int argc, char** argv) {
       LatticeTools::ConvertTextToLattice(ssrc, &src);
       LatticeTools::ConvertTextToLattice(strg, &trg);
       denom += trg.size();
-      vector<double> probs(src.size() + 1);
-      for (int j = 0; j < trg.size(); ++j) {
-        const WordID& f_j = trg[j][0].label;
-        double sum = 0;
-        const double j_over_ts = double(j) / trg.size();
-        double az = 0;
-        for (int ta = 0; ta < src.size(); ++ta) {
-          unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
-          az += unnormed_a_i[ta];
+
+      exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
+      z_src.clear(); z_src.resize(src.size(), 0.0);
+      Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
+      Array2D<double> z_refs(src.size(), trg.size(), 0.0);
+      for (unsigned j = 0; j < trg.size(); ++j)
+        trg_pos[trg[j][0].label].insert(j);
+
+      for (unsigned i = 0; i < src.size(); ++i) {
+        const RVector& r_s = r_src[src[i][0].label];
+        const RTVector pred = r_s.transpose() * t;
+        TMatrix& exp_m = exp_src[i];
+        double& z = z_src[i];
+        for (unsigned k = 0; k < vocab_e.size(); ++k) {
+          const WordID v_k = vocab_e[k];
+          const RVector& r_t = r_trg[v_k];
+          const double dot_prod = pred * r_t;
+          const double u = exp(dot_prod);
+          z += u;
+          const TMatrix v = r_s * r_t.transpose() * u;
+          exp_m += v;
+          set<unsigned>& ref_locs = trg_pos[v_k];
+          if (!ref_locs.empty()) {
+            for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) {
+              TMatrix& exp_ref_ij = exp_refs(i, *it);
+              double& z_ref_ij = z_refs(i, *it);
+              z_ref_ij += u;
+              exp_ref_ij += v;
+            }
+          }
+        }
+      }
+      for (unsigned j = 0; j < trg.size(); ++j)
+        trg_pos[trg[j][0].label].clear();
+
+      // model expectations for a single target generation with
+      // uniform alignment prior
+      double m_z = 0;
+      TMatrix m_exp = TMatrix::Zero();
+      for (unsigned i = 0; i < src.size(); ++i) {
+        m_exp += exp_src[i];
+        m_z += z_src[i];
+      }
+      m_exp /= m_z;
+
+      Array2D<bool> al(src.size(), trg.size(), false);
+      for (unsigned j = 0; j < trg.size(); ++j) {
+        double ref_z = 0;
+        TMatrix ref_exp = TMatrix::Zero();
+        int max_i = 0;
+        double max_s = -9999999;
+        for (unsigned i = 0; i < src.size(); ++i) {
+          ref_exp += exp_refs(i, j);
+          ref_z += z_refs(i, j);
+          if (log(z_refs(i, j)) > max_s) {
+            max_s = log(z_refs(i, j));
+            max_i = i;
+          }
+          // TODO handle alignment prob
+        }
+        if (ref_z <= 0) {
+          cerr << "TRG=" << TD::Convert(trg[j][0].label) << endl;
+          cerr << " LINE=" << line << endl;
+          cerr << " REF_EXP=\n" << ref_exp << endl;
+          cerr << " M_EXP=\n" << m_exp << endl;
+          abort();
         }
-        for (int i = 1; i <= src.size(); ++i) {
-          const double prob_a_i = unnormed_a_i[i-1] / az;
-          // TODO
-          probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i;
-          sum += probs[i];
+        al(max_i, j) = true;
+        ref_exp /= ref_z;
+        g += m_exp - ref_exp;
+        likelihood += log(ref_z) - log(m_z);
+        if (SGD) {
+          t -= g * eta / num_examples;
+          g *= 0;
+        } else {
+          assert(!"not implemented");
         }
       }
+
+      if (iter == (ITERATIONS - 1) || lc == 28) { cerr << al << endl; }
     }
 
     if (flag) { cerr << endl; }
@@ -147,7 +250,9 @@ int main(int argc, char** argv) {
     cerr << "  log_2 likelihood: " << base2_likelihood << endl;
     cerr << "   cross entropy: " << (-base2_likelihood / denom) << endl;
     cerr << "      perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
+    cerr << t << endl;
   }
+  cerr << "TRANSLATION MATRIX:" << endl << t << endl;
   return 0;
 }
 #endif
```
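A note on what the new inner loops compute, reconstructed from the code above rather than from any accompanying write-up: each source word f and target word e are scored with a bilinear form through the translation matrix `t`, and the probability of a target word marginalizes over a uniformly weighted alignment to the source positions. In the code, `ref_z`/`ref_exp` are the partition function and expected outer product with the target word clamped to the observed e_j, while `m_z`/`m_exp` are the same quantities with the target word free:

```latex
% Marginal likelihood of target word e_j given source sentence f_1 .. f_n
% (uniform alignment prior): log(ref_z) - log(m_z) in the code.
p(e_j \mid \mathbf{f}) =
  \frac{\sum_{i=1}^{n} \exp\!\left(r_{f_i}^{\top} T \, r_{e_j}\right)}
       {\sum_{i=1}^{n} \sum_{e' \in V_E} \exp\!\left(r_{f_i}^{\top} T \, r_{e'}\right)}

% Its gradient in T is a difference of expected outer products, which the
% loop accumulates (negated) as g += m_exp - ref_exp:
\nabla_{T} \log p(e_j \mid \mathbf{f}) =
  \mathbb{E}\!\left[ r_f \, r_e^{\top} \mid e = e_j \right]
  - \mathbb{E}\!\left[ r_f \, r_e^{\top} \right]
```

Under this reading, `t -= g * eta / num_examples` is a stochastic gradient ascent step on the per-word log-likelihood, and the `assert(!"not implemented")` branch reserves room for a non-SGD optimizer.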

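To make the update concrete, here is a minimal, self-contained sketch of one such step with Eigen, outside of cdec. All sizes and the observed word index are invented for illustration (kD = 4 instead of kDIMENSIONS = 25, a two-word source sentence, a three-word target vocabulary); it mirrors the patch's accumulation pattern but is not part of the commit:

```cpp
#include <cmath>
#include <iostream>
#include <vector>
#include <Eigen/Dense>

const int kD = 4;  // toy dimension; the patch uses kDIMENSIONS = 25
typedef Eigen::Matrix<float, kD, 1> RVector;
typedef Eigen::Matrix<float, 1, kD> RTVector;
typedef Eigen::Matrix<float, kD, kD> TMatrix;

int main() {
  const unsigned src_len = 2;     // source sentence length n
  const unsigned vocab_size = 3;  // |V_E|
  const unsigned obs_e = 1;       // index of the observed target word e_j
  const float eta = 0.1f;

  // Unit-length random embeddings, as after the patch's Normalize().
  std::vector<RVector> r_src(src_len), r_trg(vocab_size);
  for (unsigned i = 0; i < src_len; ++i) r_src[i] = RVector::Random().normalized();
  for (unsigned k = 0; k < vocab_size; ++k) r_trg[k] = RVector::Random().normalized();
  TMatrix t = TMatrix::Random() / 100.0;  // small random init, as in the patch

  // Partition functions and expected outer products for the clamped (ref)
  // and free (m) distributions.
  double m_z = 0.0, ref_z = 0.0;
  TMatrix m_exp = TMatrix::Zero(), ref_exp = TMatrix::Zero();
  for (unsigned i = 0; i < src_len; ++i) {
    const RTVector pred = r_src[i].transpose() * t;
    for (unsigned k = 0; k < vocab_size; ++k) {
      const double dot_prod = pred * r_trg[k];  // bilinear score r_f^T T r_e
      const double u = std::exp(dot_prod);
      const TMatrix v = r_src[i] * r_trg[k].transpose() * u;
      m_z += u;  m_exp += v;
      if (k == obs_e) { ref_z += u; ref_exp += v; }
    }
  }
  m_exp /= m_z;
  ref_exp /= ref_z;

  // Per-word log-likelihood and one gradient step, mirroring the patch's
  // "likelihood += log(ref_z) - log(m_z)" and "t -= g * eta / num_examples".
  std::cout << "log p(e_j | f) = " << std::log(ref_z) - std::log(m_z) << std::endl;
  t -= (m_exp - ref_exp) * eta;  // num_examples = 1 in this toy
  return 0;
}
```

Run over a corpus with one update per target word, this is the loop structure the patch adds; only the bookkeeping (`Array2D` caches, `trg_pos`, the Viterbi-style `al` dump) differs.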