author     trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-01 21:11:59 +0000
committer  trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-01 21:11:59 +0000
commit     43e7da3dba13f4868190809d9f2668d6d0664343 (patch)
tree       c53422b1035aca8ca143522ac1adc1095fa2e7f4
parent     31242eb4c9f60b2b9a36e94c28c1167f64ff6008 (diff)
New agreement objective
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@93 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--  gi/posterior-regularisation/train_pr_agree.py     | 328
-rw-r--r--  gi/posterior-regularisation/train_pr_parallel.py  |  12
2 files changed, 334 insertions, 6 deletions
diff --git a/gi/posterior-regularisation/train_pr_agree.py b/gi/posterior-regularisation/train_pr_agree.py
new file mode 100644
index 00000000..bbd6e007
--- /dev/null
+++ b/gi/posterior-regularisation/train_pr_agree.py
@@ -0,0 +1,328 @@
+import sys
+import scipy.optimize
+from scipy.stats import geom
+from numpy import *
+from numpy.random import random, seed
+
+style = sys.argv[1]
+if len(sys.argv) >= 3:
+    seed(int(sys.argv[2]))
+
+#
+# Step 1: load the concordance counts
+#
+
+edges = []
+word_types = {}
+phrase_types = {}
+context_types = {}
+
+for line in sys.stdin:
+    phrase, rest = line.strip().split('\t')
+    ptoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), phrase.split()))
+    pid = phrase_types.setdefault(ptoks, len(phrase_types))
+
+    parts = rest.split('|||')
+    for i in range(0, len(parts), 2):
+        context, count = parts[i:i+2]
+
+        ctx = filter(lambda x: x != '<PHRASE>', context.split())
+        ctoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), ctx))
+        cid = context_types.setdefault(ctoks, len(context_types))
+
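+        # the count field carries a short prefix (e.g. 'C=') before the integer,
+        # hence the [2:] slice below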
+        cnt = int(count.strip()[2:])
+        edges.append((pid, cid, cnt))
+
+word_type_list = [None] * len(word_types)
+for typ, index in word_types.items():
+    word_type_list[index] = typ
+
+phrase_type_list = [None] * len(phrase_types)
+for typ, index in phrase_types.items():
+    phrase_type_list[index] = typ
+
+context_type_list = [None] * len(context_types)
+for typ, index in context_types.items():
+    context_type_list[index] = typ
+
+num_tags = 5
+num_types = len(word_types)
+num_phrases = len(phrase_types)
+num_contexts = len(context_types)
+num_edges = len(edges)
+
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', num_types, 'word types'
+
+#
+# Step 2: expectation maximisation
+#
+
+def normalise(a):
+    return a / float(sum(a))
+
+class PhraseToContextModel:
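+    # Generative model of p(tag, context | phrase): draw a tag from Pr(tag | phrase),
+    # then draw each of the four context words independently from a position- and
+    # tag-specific distribution.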
+    def __init__(self):
+        # Pr(tag | phrase)
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_phrases)]
+        # Pr(context at pos i = w | tag) indexed by i, tag, word
+        self.contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)]
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, context | phrase) as vector of length |tags|
+        context = context_type_list[cid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[pid][t]
+            for k, tokid in enumerate(context):
+                prob *= self.contextWordDist[k][t][tokid]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            context = context_type_list[cid]
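+            # optionally tilt the posterior by exp(lambda) and renormalise; the
+            # PR/agreement training in InterpolatedModel is set up to pass +/-lambda here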
+            if lamba is not None:
+                q *= exp(lamba)
+                q /= sum(q)
+            for t in range(num_tags):
+                tagCounts[pid][t] += cnt * q[t]
+            for i in range(4):
+                for t in range(num_tags):
+                    contextWordCounts[i][t][context[i]] += cnt * q[t]
+
+        # M-step
+        for p in range(num_phrases):
+            self.tagDist[p] = normalise(tagCounts[p])
+        for i in range(4):
+            for t in range(num_tags):
+                self.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+
+        return llh
+
+class ContextToPhraseModel:
+    def __init__(self):
+        # Pr(tag | context) = Multinomial
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_contexts)]
+        # Pr(phrase = w | tag) = Multinomial
+        self.phraseSingleDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_1 = w | tag) = Multinomial
+        self.phraseLeftDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_-1 = w | tag) = Multinomial
+        self.phraseRightDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(|phrase| = l | tag) = Geometric
+        self.phraseLengthDist = [0.5] * num_tags
+        # n.b. internal words for phrases of length >= 3 are drawn from uniform distribution
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, phrase | context) as vector of length |tags|
+        phrase = phrase_type_list[pid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[cid][t]
+            f = self.phraseLengthDist[t]
+            prob *= geom.pmf(len(phrase), f)
+            if len(phrase) == 1:
+                prob *= self.phraseSingleDist[t][phrase[0]]
+            else:
+                prob *= self.phraseLeftDist[t][phrase[0]]
+                prob *= self.phraseRightDist[t][phrase[-1]]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_tags)
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            if lamba is not None:
+                q *= exp(lamba)
+                q /= sum(q)
+            #print 'p', phrase_type_list[pid], 'c', context_type_list[cid], 'q', q
+            phrase = phrase_type_list[pid]
+            for t in range(num_tags):
+                tagCounts[cid][t] += cnt * q[t]
+                phraseLength[t] += cnt * len(phrase) * q[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * q[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * q[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * q[t]
+
+        # M-step
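+        # closed-form updates: the multinomials are normalised expected counts; the
+        # geometric length parameter is (expected #edges with tag t) /
+        # (expected total phrase length for tag t), clipped away from 0 and 1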
+        for t in range(num_tags):
+            self.phraseLengthDist[t] = min(max(sum(tagCounts[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for c in range(num_contexts):
+            self.tagDist[c] = normalise(tagCounts[c])
+
+        #print 't', self.tagDist
+        #print 'l', self.phraseLengthDist
+        #print 's', self.phraseSingleDist
+        #print 'L', self.phraseLeftDist
+        #print 'R', self.phraseRightDist
+
+        return llh
+
+class ProductModel:
+    """
+    WARNING: I haven't verified the maths behind this model. It's quite likely to be incorrect.
+    """
+
+    def __init__(self):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
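+        # agreement by product: combine the two directional models by taking the
+        # elementwise product of their normalised posteriors (left unnormalised here)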
+        return (p1 / sum(p1)) * (p2 / sum(p2))
+
+    def expectation_maximisation_step(self):
+        tagCountsGivenPhrase = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        tagCountsGivenContext = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_tags)
+
+        kl = llh1 = llh2 = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            llh1 += log(sum(p1)) * cnt
+            p2 = self.cpm.prob(pid, cid)
+            llh2 += log(sum(p2)) * cnt
+
+            q = (p1 / sum(p1)) * (p2 / sum(p2))
+            kl += log(sum(q)) * cnt
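+            # agreed posterior: renormalised geometric mean of the two posteriors,
+            # used to collect expected counts for both component models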
+            qi = sqrt(q)
+            qi /= sum(qi)
+
+            phrase = phrase_type_list[pid]
+            context = context_type_list[cid]
+            for t in range(num_tags):
+                tagCountsGivenPhrase[pid][t] += cnt * qi[t]
+                tagCountsGivenContext[cid][t] += cnt * qi[t]
+                phraseLength[t] += cnt * len(phrase) * qi[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * qi[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * qi[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * qi[t]
+                for i in range(4):
+                    contextWordCounts[i][t][context[i]] += cnt * qi[t]
+
+        kl *= -2
+
+        for t in range(num_tags):
+            for i in range(4):
+                self.pcm.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+            self.cpm.phraseLengthDist[t] = min(max(sum(tagCountsGivenContext[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.cpm.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.cpm.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.cpm.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for p in range(num_phrases):
+            self.pcm.tagDist[p] = normalise(tagCountsGivenPhrase[p])
+        for c in range(num_contexts):
+            self.cpm.tagDist[c] = normalise(tagCountsGivenContext[c])
+
+        # return the overall objective
+        return llh1 + llh2 + kl
+
+class InterpolatedModel:
+    def __init__(self, epsilon):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+        self.epsilon = epsilon
+        self.lamba = zeros(num_tags)
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
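+        # 'sum' combination: uniform mixture of the two component models' (unnormalised) distributions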
+        return (p1 + p2) / 2
+
+    def dual(self, lamba):
+        return self.logz(lamba) + self.epsilon * dot(lamba, lamba) ** 0.5
+
+    def dual_gradient(self, lamba):
+        return self.expected_features(lamba) + self.epsilon * 2 * lamba
+
+    def expectation_maximisation_step(self):
+        # PR-step: optimise lambda to minimise log(z_lambda) + eps ||lambda||_2
+        self.lamba = scipy.optimize.fmin_slsqp(self.dual, self.lamba,
+                                               bounds=[(0, 1e100)] * num_tags,
+                                               fprime=self.dual_gradient, iprint=0)
+
+        # E,M-steps: collect expected counts under q_lambda and normalise
+        #llh1 = self.pcm.expectation_maximisation_step(self.lamba)
+        #llh2 = self.cpm.expectation_maximisation_step(-self.lamba)
+
+        # return the overall objective: llh1 + llh2 - KL(q||p1.p2)
+        pass
+
+    def logz(self, lamba):
+        # FIXME: complete this!
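+        # log Z(lambda) accumulated over edges: log E_{p1}[exp(+lambda_t)] plus
+        # log E_{p2}[exp(-lambda_t)] under the two models' normalised posteriors,
+        # each weighted by the edge count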
+        lz = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            z1 = dot(p1 / sum(p1), exp(lamba))
+            lz += log(z1) * cnt
+
+            p2 = self.cpm.prob(pid, cid)
+            z2 = dot(p2 / sum(p2), exp(-lamba))
+            lz += log(z2) * cnt
+        return lz
+
+    def expected_features(self, lamba):
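+        # gradient of logz: difference of expected tag indicators under the
+        # exp(+lambda)- and exp(-lambda)-tilted posteriors, accumulated over edges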
+        fs = zeros(num_tags)
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            q1 = (p1 / sum(p1)) * exp(lamba)
+            fs += cnt * q1 / sum(q1)
+
+            p2 = self.cpm.prob(pid, cid)
+            q2 = (p2 / sum(p2)) * exp(-lamba)
+            fs -= cnt * q2 / sum(q2)
+        return fs
+
+if style == 'p2c':
+    m = PhraseToContextModel()
+elif style == 'c2p':
+    m = ContextToPhraseModel()
+elif style == 'prod':
+    m = ProductModel()
+elif style == 'sum':
+    m = InterpolatedModel(1.0)  # NB: epsilon is a required argument; 1.0 is an arbitrary placeholder value
+
+for iteration in range(30):
+    obj = m.expectation_maximisation_step()
+    print 'iteration', iteration, 'objective', obj
+
+for pid, cid, cnt in edges:
+    p = m.prob(pid, cid)
+    phrase = phrase_type_list[pid]
+    phrase_str = ' '.join(map(word_type_list.__getitem__, phrase))
+    context = context_type_list[cid]
+    context_str = ' '.join(map(word_type_list.__getitem__, context))
+    print '%s\t%s ||| C=%d' % (phrase_str, context_str, argmax(p))
diff --git a/gi/posterior-regularisation/train_pr_parallel.py b/gi/posterior-regularisation/train_pr_parallel.py
index d5df87b5..4de7f504 100644
--- a/gi/posterior-regularisation/train_pr_parallel.py
+++ b/gi/posterior-regularisation/train_pr_parallel.py
@@ -37,8 +37,6 @@ for line in sys.stdin:
         num_edges += 1
-print 'Read in', num_edges, 'edges and', len(types), 'word types'
-
 #
 # Step 2: initialise the model parameters
 #
@@ -53,6 +51,8 @@ local = sys.argv[2] == 'local'
 if len(sys.argv) >= 2:
     seed(int(sys.argv[3]))
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', len(types), 'word types'
+
 def normalise(a):
     return a / float(sum(a))
@@ -131,7 +131,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    cons[j,t] -= ls[index,t]
+                    cons[j,t] -= ls[index,t] * count
                 index += 1
         return cons.ravel()
@@ -142,7 +142,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    gradient[j,t,index,t] -= 1
+                    gradient[j,t,index,t] -= count
                 index += 1
         return gradient.reshape((num_phrases*num_tags, num_edges*num_tags))
@@ -231,7 +231,7 @@ class LocalDualObjective:
         cons = ones(num_tags) * self.scale
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                cons[t] -= ls[i,t]
+                cons[t] -= ls[i,t] * count
         return cons

     def constraints_gradient(self, ls):
@@ -240,7 +240,7 @@ class LocalDualObjective:
         gradient = zeros((num_tags, len(edges), num_tags))
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                gradient[t,i,t] -= 1
+                gradient[t,i,t] -= count
         return gradient.reshape((num_tags, len(edges)*num_tags))

     def optimize(self):