From 9a6dd9643577b11a5833a3e2fbfb4511979a5969 Mon Sep 17 00:00:00 2001
From: "trevor.cohn"
Date: Thu, 1 Jul 2010 21:11:59 +0000
Subject: New agreement objective

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@93 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/posterior-regularisation/train_pr_agree.py    | 328 +++++++++++++++++++++++
 gi/posterior-regularisation/train_pr_parallel.py |  12 +-
 2 files changed, 334 insertions(+), 6 deletions(-)
 create mode 100644 gi/posterior-regularisation/train_pr_agree.py

diff --git a/gi/posterior-regularisation/train_pr_agree.py b/gi/posterior-regularisation/train_pr_agree.py
new file mode 100644
index 00000000..bbd6e007
--- /dev/null
+++ b/gi/posterior-regularisation/train_pr_agree.py
@@ -0,0 +1,328 @@
+import sys
+import scipy.optimize
+from scipy.stats import geom
+from numpy import *
+from numpy.random import random, seed
+
+style = sys.argv[1]
+if len(sys.argv) >= 3:
+    seed(int(sys.argv[2]))
+
+#
+# Step 1: load the concordance counts
+#
+
+edges = []
+word_types = {}
+phrase_types = {}
+context_types = {}
+
+for line in sys.stdin:
+    phrase, rest = line.strip().split('\t')
+    ptoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), phrase.split()))
+    pid = phrase_types.setdefault(ptoks, len(phrase_types))
+
+    parts = rest.split('|||')
+    for i in range(0, len(parts), 2):
+        context, count = parts[i:i+2]
+
+        ctx = filter(lambda x: x != '', context.split())
+        ctoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), ctx))
+        cid = context_types.setdefault(ctoks, len(context_types))
+
+        cnt = int(count.strip()[2:])
+        edges.append((pid, cid, cnt))
+
+word_type_list = [None] * len(word_types)
+for typ, index in word_types.items():
+    word_type_list[index] = typ
+
+phrase_type_list = [None] * len(phrase_types)
+for typ, index in phrase_types.items():
+    phrase_type_list[index] = typ
+
+context_type_list = [None] * len(context_types)
+for typ, index in context_types.items():
+    context_type_list[index] = typ
+
+num_tags = 5
+num_types = len(word_types)
+num_phrases = len(phrase_types)
+num_contexts = len(context_types)
+num_edges = len(edges)
+
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', num_types, 'word types'
+
+#
+# Step 2: expectation maximisation
+#
+
+def normalise(a):
+    return a / float(sum(a))
+
+class PhraseToContextModel:
+    def __init__(self):
+        # Pr(tag | phrase)
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_phrases)]
+        # Pr(context at pos i = w | tag) indexed by i, tag, word
+        self.contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)]
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, context | phrase) as vector of length |tags|
+        context = context_type_list[cid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[pid][t]
+            for k, tokid in enumerate(context):
+                prob *= self.contextWordDist[k][t][tokid]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            context = context_type_list[cid]
+            if lamba != None:
+                q *= exp(lamba)
+                q /= sum(q)
+            for t in range(num_tags):
+                tagCounts[pid][t] += cnt * q[t]
+            for i in range(4):
+                for t in range(num_tags):
+                    contextWordCounts[i][t][context[i]] += cnt * q[t]
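+        # n.b. the counts above are expected (fractional) counts: each edge contributes cnt * q[t];
+        # when lamba is supplied, q has additionally been reweighted by exp(lamba) and renormalised,
+        # which is the hook used by the agreement (InterpolatedModel) training below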
+
+        # M-step
+        for p in range(num_phrases):
+            self.tagDist[p] = normalise(tagCounts[p])
+        for i in range(4):
+            for t in range(num_tags):
+                self.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+
+        return llh
+
+class ContextToPhraseModel:
+    def __init__(self):
+        # Pr(tag | context) = Multinomial
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_contexts)]
+        # Pr(phrase = w | tag) = Multinomial
+        self.phraseSingleDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_1 = w | tag) = Multinomial
+        self.phraseLeftDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_-1 = w | tag) = Multinomial
+        self.phraseRightDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(|phrase| = l | tag) = Geometric
+        self.phraseLengthDist = [0.5] * num_tags
+        # n.b. internal words for phrases of length >= 3 are drawn from uniform distribution
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, phrase | context) as vector of length |tags|
+        phrase = phrase_type_list[pid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[cid][t]
+            f = self.phraseLengthDist[t]
+            prob *= geom.pmf(len(phrase), f)
+            if len(phrase) == 1:
+                prob *= self.phraseSingleDist[t][phrase[0]]
+            else:
+                prob *= self.phraseLeftDist[t][phrase[0]]
+                prob *= self.phraseRightDist[t][phrase[-1]]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_tags)
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            if lamba != None:
+                q *= exp(lamba)
+                q /= sum(q)
+            #print 'p', phrase_type_list[pid], 'c', context_type_list[cid], 'q', q
+            phrase = phrase_type_list[pid]
+            for t in range(num_tags):
+                tagCounts[cid][t] += cnt * q[t]
+                phraseLength[t] += cnt * len(phrase) * q[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * q[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * q[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * q[t]
+
+        # M-step
+        for t in range(num_tags):
+            self.phraseLengthDist[t] = min(max(sum(tagCounts[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for c in range(num_contexts):
+            self.tagDist[c] = normalise(tagCounts[c])
+
+        #print 't', self.tagDist
+        #print 'l', self.phraseLengthDist
+        #print 's', self.phraseSingleDist
+        #print 'L', self.phraseLeftDist
+        #print 'R', self.phraseRightDist
+
+        return llh
+
+class ProductModel:
+    """
+    WARNING: I haven't verified the maths behind this model. It's quite likely to be incorrect.
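+    Expected counts are collected under qi, which is proportional to sqrt(p1 * p2), i.e. the
+    renormalised geometric mean of the two component models' tag posteriors.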
+    """
+
+    def __init__(self):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
+        return (p1 / sum(p1)) * (p2 / sum(p2))
+
+    def expectation_maximisation_step(self):
+        tagCountsGivenPhrase = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        tagCountsGivenContext = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_tags)
+
+        kl = llh1 = llh2 = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            llh1 += log(sum(p1)) * cnt
+            p2 = self.cpm.prob(pid, cid)
+            llh2 += log(sum(p2)) * cnt
+
+            q = (p1 / sum(p1)) * (p2 / sum(p2))
+            kl += log(sum(q)) * cnt
+            qi = sqrt(q)
+            qi /= sum(qi)
+
+            phrase = phrase_type_list[pid]
+            context = context_type_list[cid]
+            for t in range(num_tags):
+                tagCountsGivenPhrase[pid][t] += cnt * qi[t]
+                tagCountsGivenContext[cid][t] += cnt * qi[t]
+                phraseLength[t] += cnt * len(phrase) * qi[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * qi[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * qi[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * qi[t]
+                for i in range(4):
+                    contextWordCounts[i][t][context[i]] += cnt * qi[t]
+
+        kl *= -2
+
+        for t in range(num_tags):
+            for i in range(4):
+                self.pcm.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+            self.cpm.phraseLengthDist[t] = min(max(sum(tagCountsGivenContext[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.cpm.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.cpm.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.cpm.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for p in range(num_phrases):
+            self.pcm.tagDist[p] = normalise(tagCountsGivenPhrase[p])
+        for c in range(num_contexts):
+            self.cpm.tagDist[c] = normalise(tagCountsGivenContext[c])
+
+        # return the overall objective
+        return llh1 + llh2 + kl
+
+class InterpolatedModel:
+    def __init__(self, epsilon):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+        self.epsilon = epsilon
+        self.lamba = zeros(num_tags)
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
+        return (p1 + p2) / 2
+
+    def dual(self, lamba):
+        return self.logz(lamba) + self.epsilon * dot(lamba, lamba) ** 0.5
+
+    def dual_gradient(self, lamba):
+        return self.expected_features(lamba) + self.epsilon * 2 * lamba
+
+    def expectation_maximisation_step(self):
+        # PR-step: optimise lambda to minimise log(z_lambda) + eps ||lambda||_2
+        self.lamba = scipy.optimize.fmin_slsqp(self.dual, self.lamba,
+                                               bounds=[(0, 1e100)] * num_tags,
+                                               fprime=self.dual_gradient, iprint=0)
+
+        # E,M-steps: collect expected counts under q_lambda and normalise
+        #llh1 = self.pcm.expectation_maximisation_step(self.lamba)
+        #llh2 = self.cpm.expectation_maximisation_step(-self.lamba)
+
+        # return the overall objective: llh1 + llh2 - KL(q||p1.p2)
+        pass
+
+    def logz(self, lamba):
+        # FIXME: complete this!
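+        # as written, this accumulates cnt * (log dot(p1_norm, exp(lamba)) + log dot(p2_norm, exp(-lamba)))
+        # over all edges, where p1_norm and p2_norm are the normalised posteriors of the two models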
+        lz = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            z1 = dot(p1 / sum(p1), exp(lamba))
+            lz += log(z1) * cnt
+
+            p2 = self.cpm.prob(pid, cid)
+            z2 = dot(p2 / sum(p2), exp(-lamba))
+            lz += log(z2) * cnt
+        return lz
+
+    def expected_features(self, lamba):
+        fs = zeros(num_tags)
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            q1 = (p1 / sum(p1)) * exp(lamba)
+            fs += cnt * q1 / sum(q1)
+
+            p2 = self.cpm.prob(pid, cid)
+            q2 = (p2 / sum(p2)) * exp(-lamba)
+            fs -= cnt * q2 / sum(q2)
+        return fs
+
+if style == 'p2c':
+    m = PhraseToContextModel()
+elif style == 'c2p':
+    m = ContextToPhraseModel()
+elif style == 'prod':
+    m = ProductModel()
+elif style == 'sum':
+    m = InterpolatedModel()  # n.b. __init__ expects an epsilon argument; this mode is still incomplete
+
+for iteration in range(30):
+    obj = m.expectation_maximisation_step()
+    print 'iteration', iteration, 'objective', obj
+
+for pid, cid, cnt in edges:
+    p = m.prob(pid, cid)
+    phrase = phrase_type_list[pid]
+    phrase_str = ' '.join(map(word_type_list.__getitem__, phrase))
+    context = context_type_list[cid]
+    context_str = ' '.join(map(word_type_list.__getitem__, context))
+    print '%s\t%s ||| C=%d' % (phrase_str, context_str, argmax(p))
diff --git a/gi/posterior-regularisation/train_pr_parallel.py b/gi/posterior-regularisation/train_pr_parallel.py
index d5df87b5..4de7f504 100644
--- a/gi/posterior-regularisation/train_pr_parallel.py
+++ b/gi/posterior-regularisation/train_pr_parallel.py
@@ -37,8 +37,6 @@ for line in sys.stdin:
 
     num_edges += 1
 
-print 'Read in', num_edges, 'edges and', len(types), 'word types'
-
 #
 # Step 2: initialise the model parameters
 #
@@ -53,6 +51,8 @@ local = sys.argv[2] == 'local'
 if len(sys.argv) >= 2:
     seed(int(sys.argv[3]))
 
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', len(types), 'word types'
+
 def normalise(a):
     return a / float(sum(a))
 
@@ -131,7 +131,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    cons[j,t] -= ls[index,t]
+                    cons[j,t] -= ls[index,t] * count
                 index += 1
         return cons.ravel()
 
@@ -142,7 +142,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    gradient[j,t,index,t] -= 1
+                    gradient[j,t,index,t] -= count
                 index += 1
         return gradient.reshape((num_phrases*num_tags, num_edges*num_tags))
 
@@ -231,7 +231,7 @@ class LocalDualObjective:
         cons = ones(num_tags) * self.scale
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                cons[t] -= ls[i,t]
+                cons[t] -= ls[i,t] * count
         return cons
 
     def constraints_gradient(self, ls):
@@ -240,7 +240,7 @@ class LocalDualObjective:
         gradient = zeros((num_tags, len(edges), num_tags))
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                gradient[t,i,t] -= 1
+                gradient[t,i,t] -= count
         return gradient.reshape((num_tags, len(edges)*num_tags))
 
     def optimize(self):
-- 
cgit v1.2.3