Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
committer: Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
commit: 244971287003d079e46193b8a209c28955f90134 (patch)
tree: 8beaae6b12b913acb213fc7f2415fd63886192f9 /word-aligner
parent: 5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (diff)
parent: b66e838ed52decc0be1eb5817b2a77c3840db2c5 (diff)
5 files changed, 253 insertions, 43 deletions
diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am
index 1f7f78ae..075ad009 100644
--- a/word-aligner/Makefile.am
+++ b/word-aligner/Makefile.am
@@ -1,8 +1,11 @@
-bin_PROGRAMS = fast_align
+bin_PROGRAMS = fast_align binderiv
 
 fast_align_SOURCES = fast_align.cc ttables.cc da.h ttables.h
 fast_align_LDADD = ../utils/libutils.a
 
+binderiv_SOURCES = binderiv.cc
+binderiv_LDADD = ../utils/libutils.a
+
 EXTRA_DIST = aligner.pl ortho-norm support makefiles stemmers
 
 AM_CPPFLAGS = -W -Wall -I$(top_srcdir) -I$(top_srcdir)/utils -I$(top_srcdir)/training
diff --git a/word-aligner/binderiv.cc b/word-aligner/binderiv.cc
new file mode 100644
index 00000000..8ebc1105
--- /dev/null
+++ b/word-aligner/binderiv.cc
@@ -0,0 +1,202 @@
+#include <iostream>
+#include <string>
+#include <queue>
+#include <sstream>
+
+#include "alignment_io.h"
+#include "tdict.h"
+
+using namespace std;
+
+enum CombinationType {
+  kNONE = 0,
+  kAXIOM,
+  kMONO, kSWAP, kCONTAINS_L, kCONTAINS_R, kINTERLEAVE
+};
+
+string nm(CombinationType x) {
+  switch (x) {
+    case kNONE: return "NONE";
+    case kAXIOM: return "AXIOM";
+    case kMONO: return "MONO";
+    case kSWAP: return "SWAP";
+    case kCONTAINS_L: return "CONTAINS_L";
+    case kCONTAINS_R: return "CONTAINS_R";
+    case kINTERLEAVE: return "INTERLEAVE";
+  }
+}
+
+string Substring(const vector<WordID>& s, unsigned i, unsigned j) {
+  ostringstream os;
+  for (unsigned k = i; k < j; ++k) {
+    if (k > i) os << ' ';
+    os << TD::Convert(s[k]);
+  }
+  return os.str();
+}
+
+inline int min4(int a, int b, int c, int d) {
+  int l = a;
+  if (b < a) l = b;
+  int l2 = c;
+  if (d < c) l2 = c;
+  return min(l, l2);
+}
+
+inline int max4(int a, int b, int c, int d) {
+  int l = a;
+  if (b > a) l = b;
+  int l2 = c;
+  if (d > c) l2 = d;
+  return max(l, l2);
+}
+
+struct State {
+  int s,t,u,v;
+  State() : s(), t(), u(), v() {}
+  State(int a, int b, int c, int d) : s(a), t(b), u(c), v(d) {
+    assert(s <= t);
+    assert(u <= v);
+  }
+  bool IsGood() const {
+    return (s != 0 || t != 0 || u != 0 || v != 0);
+  }
+  CombinationType operator&(const State& r) const {
+    if (r.s != t) return kNONE;
+    if (v <= r.u) return kMONO;
+    if (r.v <= u) return kSWAP;
+    if (v >= r.v && u <= r.u) return kCONTAINS_R;
+    if (r.v >= v && r.u <= u) return kCONTAINS_L;
+    return kINTERLEAVE;
+  }
+  State& operator*=(const State& r) {
+    assert(r.s == t);
+    t = r.t;
+    const int tu = min4(u, v, r.u, r.v);
+    v = max4(u, v, r.u, r.v);
+    u = tu;
+    return *this;
+  }
+};
+
+double score(CombinationType x) {
+  switch (x) {
+    case kNONE: return 0.0;
+    case kAXIOM: return 1.0;
+    case kMONO: return 16.0;
+    case kSWAP: return  8.0;
+    case kCONTAINS_R: return 4.0;
+    case kCONTAINS_L: return 2.0;
+    case kINTERLEAVE: return 1.0;
+  }
+}
+
+State operator*(const State& l, const State& r) {
+  State res = l;
+  res *= r;
+  return res;
+}
+
+ostream& operator<<(ostream& os, const State& s) {
+  return os << '[' << s.s << ", " << s.t << ", " << s.u << ", " << s.v << ']';
+}
+
+string NT(const State& s) {
+  bool decorate=true;
+  if (decorate) {
+    ostringstream os;
+    os << "[X_" << s.s << '_' << s.t << '_' << s.u << '_' << s.v << "]";
+    return os.str();
+  } else {
+    return "[X]";
+  }
+}
+
+void CreateEdge(const vector<WordID>& f, const vector<WordID>& e, CombinationType ct, const State& cur, const State& left, const State& right) {
+  switch(ct) {
+    case kINTERLEAVE:
+    case kAXIOM:
+      cerr << NT(cur) << " ||| " << Substring(f, cur.s, cur.t) << " ||| " << Substring(e, cur.u, cur.v) << "\n";
+      break;
+    case kMONO:
+      cerr << NT(cur) << " ||| " << NT(left) << ' ' << NT(right) << " ||| [1] [2]\n";
+      break;
+    case kSWAP:
+      cerr << NT(cur) << " ||| " << NT(left) << ' ' << NT(right) << " ||| [2] [1]\n";
+      break;
+    case kCONTAINS_L:
+      cerr << NT(cur) << " ||| " << Substring(f, right.s, left.s) << ' ' << NT(left) << ' ' << Substring(f, left.t, right.t) << " ||| " << Substring(e, right.u, left.u) << " [1] " << Substring(e, left.v, right.v) << endl;
+      break;
+    case kCONTAINS_R:
+      cerr << NT(cur) << " ||| " << Substring(f, left.s, right.s) << ' ' << NT(right) << ' ' << Substring(f, right.t, left.t) << " ||| " << Substring(e, left.u, right.u) << " [1] " << Substring(e, right.v, left.v) << endl;
+      break;
+    }
+}
+
+void BuildArity2Forest(const vector<WordID>& f, const vector<WordID>& e, const vector<State>& axioms) {
+  const unsigned n = f.size();
+  Array2D<State> chart(n, n+1);
+  Array2D<CombinationType> ctypes(n, n+1);
+  Array2D<double> cscore(n, n+1);
+  Array2D<int> cmids(n, n+1, -1);
+  for (const auto& axiom : axioms) {
+    chart(axiom.s, axiom.t) = axiom;
+    ctypes(axiom.s, axiom.t) = kAXIOM;
+    cscore(axiom.s, axiom.t) = 1.0;
+    CreateEdge(f, e, kAXIOM, axiom, axiom, axiom);
+    //cerr << "AXIOM " << axiom.s << ", " << axiom.t << " : " << chart(axiom.s, axiom.t) << " : " << 1 << endl;
+  }
+  for (unsigned l = 2; l <= n; ++l) {
+    const unsigned i_end = n + 1 - l;
+    for (unsigned i = 0; i < i_end; ++i) {
+      const unsigned j = i + l;
+      for (unsigned k = i + 1; k < j; ++k) {
+        const State& left = chart(i, k);
+        const State& right = chart(k, j);
+        if (!left.IsGood() || !right.IsGood()) continue;
+        CombinationType comb = left & right;
+        if (comb != kNONE) {
+          double ns = cscore(i,k) + cscore(k,j) + score(comb);
+          if (ns > cscore(i,j)) {
+            cscore(i,j) = ns;
+            chart(i,j) = left * right;
+            cmids(i,j) = k;
+            ctypes(i,j) = comb;
+            //cerr << "PROVED " << chart(i,j) << " : " << cscore(i,j) << "  [" << nm(comb) << " " << left << " * " << right << "]\n";
+          } else {
+            //cerr << "SUBOPTIMAL " << (left*right) << " : " << ns << "  [" << nm(comb) << " " << left << " * " << right << "]\n";
+          }
+          CreateEdge(f, e, comb, left * right, left, right);
+        } else {
+          //cerr << "CAN'T " << left << " * " << right << endl;
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  State s;
+  vector<WordID> e,f;
+  TD::ConvertSentence("B C that A", &e);
+  TD::ConvertSentence("A de B C", &f);
+  State w0(0,1,3,4), w1(1,2,2,3), w2(2,3,0,1), w3(3,4,1,2);
+  vector<State> al = {w0, w1, w2, w3};
+  // f cannot have any unaligned words, however, multiple overlapping axioms are possible
+  // so you can write code to align unaligned words in all ways to surrounding words
+  BuildArity2Forest(f, e, al);
+
+  TD::ConvertSentence("A B C D", &e);
+  TD::ConvertSentence("A B , C D", &f);
+  vector<State> al2 = {State(0,1,0,1), State(1,2,1,2), State(1,3,1,2), State(2,4,2,3), State(4,5,3,4)};
+  BuildArity2Forest(f, e, al2);
+
+  TD::ConvertSentence("A B C D", &e);
+  TD::ConvertSentence("C A D B", &f);
+  vector<State> al3 = {State(0,1,2,3), State(1,2,0,1), State(2,3,3,4), State(3,4,1,2)};
+  BuildArity2Forest(f, e, al3);
+
+  // things to do: run EM, do posterior inference with a Dirichlet prior, etc.
+  return 0;
+}
+
diff --git a/word-aligner/fast_align.cc b/word-aligner/fast_align.cc
index f54233eb..73b72399 100644
--- a/word-aligner/fast_align.cc
+++ b/word-aligner/fast_align.cc
@@ -190,6 +190,7 @@ int main(int argc, char** argv) {
                   cout << (max_index - 1) << '-' << j;
               }
             }
+            if (s2t_viterbi.size() <= static_cast<unsigned>(max_i)) s2t_viterbi.resize(max_i + 1);
             s2t_viterbi[max_i][f_j] = 1.0;
           }
         } else {
@@ -308,17 +309,17 @@ int main(int argc, char** argv) {
 
   if (output_parameters) {
     WriteFile params_out(conf["output_parameters"].as<string>());
-    for (TTable::Word2Word2Double::iterator ei = s2t.ttable.begin(); ei != s2t.ttable.end(); ++ei) {
-      const TTable::Word2Double& cpd = ei->second;
-      const TTable::Word2Double& vit = s2t_viterbi[ei->first];
-      const string& esym = TD::Convert(ei->first);
+    for (unsigned eind = 1; eind < s2t.ttable.size(); ++eind) {
+      const auto& cpd = s2t.ttable[eind];
+      const TTable::Word2Double& vit = s2t_viterbi[eind];
+      const string& esym = TD::Convert(eind);
       double max_p = -1;
-      for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi)
-        if (fi->second > max_p) max_p = fi->second;
+      for (auto& fi : cpd)
+        if (fi.second > max_p) max_p = fi.second;
       const double threshold = max_p * BEAM_THRESHOLD;
-      for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) {
-        if (fi->second > threshold || (vit.find(fi->first) != vit.end())) {
-          *params_out << esym << ' ' << TD::Convert(fi->first) << ' ' << log(fi->second) << endl;
+      for (auto& fi : cpd) {
+        if (fi.second > threshold || (vit.find(fi.first) != vit.end())) {
+          *params_out << esym << ' ' << TD::Convert(fi.first) << ' ' << log(fi.second) << endl;
         }
       } 
     }
diff --git a/word-aligner/ttables.cc b/word-aligner/ttables.cc
index a56bbcef..64d54bdf 100644
--- a/word-aligner/ttables.cc
+++ b/word-aligner/ttables.cc
@@ -8,28 +8,32 @@ using namespace std;
 
 void TTable::DeserializeProbsFromText(std::istream* in) {
   int c = 0;
+  string e;
+  string f;
+  double p;
   while(*in) {
-    string e;
-    string f;
-    double p;
     (*in) >> e >> f >> p;
     if (e.empty()) break;
     ++c;
-    ttable[TD::Convert(e)][TD::Convert(f)] = p;
+    WordID ie = TD::Convert(e);
+    if (ie >= static_cast<int>(ttable.size())) ttable.resize(ie + 1);
+    ttable[ie][TD::Convert(f)] = p;
   }
   cerr << "Loaded " << c << " translation parameters.\n";
 }
 
 void TTable::DeserializeLogProbsFromText(std::istream* in) {
   int c = 0;
+  string e;
+  string f;
+  double p;
   while(*in) {
-    string e;
-    string f;
-    double p;
     (*in) >> e >> f >> p;
     if (e.empty()) break;
     ++c;
-    ttable[TD::Convert(e)][TD::Convert(f)] = exp(p);
+    WordID ie = TD::Convert(e);
+    if (ie >= static_cast<int>(ttable.size())) ttable.resize(ie + 1);
+    ttable[ie][TD::Convert(f)] = exp(p);
   }
   cerr << "Loaded " << c << " translation parameters.\n";
 }
diff --git a/word-aligner/ttables.h b/word-aligner/ttables.h
index d82aff72..b9964225 100644
--- a/word-aligner/ttables.h
+++ b/word-aligner/ttables.h
@@ -2,6 +2,7 @@
 #define _TTABLES_H_
 
 #include <iostream>
+#include <vector>
 #ifndef HAVE_OLD_CPP
 # include <unordered_map>
 #else
@@ -18,11 +19,10 @@ class TTable {
  public:
   TTable() {}
   typedef std::unordered_map<WordID, double> Word2Double;
-  typedef std::unordered_map<WordID, Word2Double> Word2Word2Double;
+  typedef std::vector<Word2Double> Word2Word2Double;
   inline double prob(const int& e, const int& f) const {
-    const Word2Word2Double::const_iterator cit = ttable.find(e);
-    if (cit != ttable.end()) {
-      const Word2Double& cpd = cit->second;
+    if (e < static_cast<int>(ttable.size())) {
+      const Word2Double& cpd = ttable[e];
       const Word2Double::const_iterator it = cpd.find(f);
       if (it == cpd.end()) return 1e-9;
       return it->second;
@@ -31,19 +31,21 @@ class TTable {
     }
   }
   inline void Increment(const int& e, const int& f) {
+    if (e >= static_cast<int>(ttable.size())) counts.resize(e + 1);
     counts[e][f] += 1.0;
   }
   inline void Increment(const int& e, const int& f, double x) {
+    if (e >= static_cast<int>(counts.size())) counts.resize(e + 1);
     counts[e][f] += x;
   }
   void NormalizeVB(const double alpha) {
     ttable.swap(counts);
-    for (Word2Word2Double::iterator cit = ttable.begin();
-         cit != ttable.end(); ++cit) {
+    for (unsigned i = 0; i < ttable.size(); ++i) {
       double tot = 0;
-      Word2Double& cpd = cit->second;
+      Word2Double& cpd = ttable[i];
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
         tot += it->second + alpha;
+      if (!tot) tot = 1;
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
         it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot));
     }
@@ -51,12 +53,12 @@ class TTable {
   }
   void Normalize() {
     ttable.swap(counts);
-    for (Word2Word2Double::iterator cit = ttable.begin();
-         cit != ttable.end(); ++cit) {
+    for (unsigned i = 0; i < ttable.size(); ++i) {
       double tot = 0;
-      Word2Double& cpd = cit->second;
+      Word2Double& cpd = ttable[i];
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
         tot += it->second;
+      if (!tot) tot = 1;
       for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
         it->second /= tot;
     }
@@ -64,29 +66,27 @@ class TTable {
   }
   // adds counts from another TTable - probabilities remain unchanged
   TTable& operator+=(const TTable& rhs) {
-    for (Word2Word2Double::const_iterator it = rhs.counts.begin();
-         it != rhs.counts.end(); ++it) {
-      const Word2Double& cpd = it->second;
-      Word2Double& tgt = counts[it->first];
-      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
-        tgt[j->first] += j->second;
-      }
+    if (rhs.counts.size() > counts.size()) counts.resize(rhs.counts.size());
+    for (unsigned i = 0; i < rhs.counts.size(); ++i) {
+      const Word2Double& cpd = rhs.counts[i];
+      Word2Double& tgt = counts[i];
+      for (auto p : cpd) tgt[p.first] += p.second;
     }
     return *this;
   }
   void ShowTTable() const {
-    for (Word2Word2Double::const_iterator it = ttable.begin(); it != ttable.end(); ++it) {
-      const Word2Double& cpd = it->second;
-      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
-        std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
+    for (unsigned it = 0; it < ttable.size(); ++it) {
+      const Word2Double& cpd = ttable[it];
+      for (auto& p : cpd) {
+        std::cerr << "c(" << TD::Convert(p.first) << '|' << TD::Convert(it) << ") = " << p.second << std::endl;
       }
     }
   }
   void ShowCounts() const {
-    for (Word2Word2Double::const_iterator it = counts.begin(); it != counts.end(); ++it) {
-      const Word2Double& cpd = it->second;
-      for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
-        std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
+    for (unsigned it = 0; it < counts.size(); ++it) {
+      const Word2Double& cpd = counts[it];
+      for (auto& p : cpd) {
+        std::cerr << "c(" << TD::Convert(p.first) << '|' << TD::Convert(it) << ") = " << p.second << std::endl;
       }
     }
   }
author	Patrick Simianer <p@simianer.de>	2014-06-12 13:56:42 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-12 13:56:42 +0200
commit	244971287003d079e46193b8a209c28955f90134 (patch)
tree	8beaae6b12b913acb213fc7f2415fd63886192f9 /word-aligner
parent	5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (diff)
parent	b66e838ed52decc0be1eb5817b2a77c3840db2c5 (diff)