summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation/train_pr_agree.py
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commit5f98fe5c4f2a2090eeb9d30c030305a70a8347d1 (patch)
tree9b6002f850e6dea1e3400c6b19bb31a9cdf3067f /gi/posterior-regularisation/train_pr_agree.py
parentcf9994131993b40be62e90e213b1e11e6b550143 (diff)
parent21825a09d97c2e0afd20512f306fb25fed55e529 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'gi/posterior-regularisation/train_pr_agree.py')
-rw-r--r--gi/posterior-regularisation/train_pr_agree.py400
1 files changed, 0 insertions, 400 deletions
diff --git a/gi/posterior-regularisation/train_pr_agree.py b/gi/posterior-regularisation/train_pr_agree.py
deleted file mode 100644
index 9d41362d..00000000
--- a/gi/posterior-regularisation/train_pr_agree.py
+++ /dev/null
@@ -1,400 +0,0 @@
-import sys
-import scipy.optimize
-from scipy.stats import geom
-from numpy import *
-from numpy.random import random, seed
-
-style = sys.argv[1]
-if len(sys.argv) >= 3:
- seed(int(sys.argv[2]))
-
-#
-# Step 1: load the concordance counts
-#
-
-edges = []
-word_types = {}
-phrase_types = {}
-context_types = {}
-
-for line in sys.stdin:
- phrase, rest = line.strip().split('\t')
- ptoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), phrase.split()))
- pid = phrase_types.setdefault(ptoks, len(phrase_types))
-
- parts = rest.split('|||')
- for i in range(0, len(parts), 2):
- context, count = parts[i:i+2]
-
- ctx = filter(lambda x: x != '<PHRASE>', context.split())
- ctoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), ctx))
- cid = context_types.setdefault(ctoks, len(context_types))
-
- cnt = int(count.strip()[2:])
- edges.append((pid, cid, cnt))
-
-word_type_list = [None] * len(word_types)
-for typ, index in word_types.items():
- word_type_list[index] = typ
-
-phrase_type_list = [None] * len(phrase_types)
-for typ, index in phrase_types.items():
- phrase_type_list[index] = typ
-
-context_type_list = [None] * len(context_types)
-for typ, index in context_types.items():
- context_type_list[index] = typ
-
-num_tags = 5
-num_types = len(word_types)
-num_phrases = len(phrase_types)
-num_contexts = len(context_types)
-num_edges = len(edges)
-
-print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', num_types, 'word types'
-
-#
-# Step 2: expectation maximisation
-#
-
-def normalise(a):
- return a / float(sum(a))
-
-class PhraseToContextModel:
- def __init__(self):
- # Pr(tag | phrase)
- self.tagDist = [normalise(random(num_tags)+1) for p in range(num_phrases)]
- # Pr(context at pos i = w | tag) indexed by i, tag, word
- self.contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)]
-
- def prob(self, pid, cid):
- # return distribution p(tag, context | phrase) as vector of length |tags|
- context = context_type_list[cid]
- dist = zeros(num_tags)
- for t in range(num_tags):
- prob = self.tagDist[pid][t]
- for k, tokid in enumerate(context):
- prob *= self.contextWordDist[k][t][tokid]
- dist[t] = prob
- return dist
-
- def expectation_maximisation_step(self, lamba=None):
- tagCounts = zeros((num_phrases, num_tags))
- contextWordCounts = zeros((4, num_tags, num_types))
-
- # E-step
- llh = 0
- for pid, cid, cnt in edges:
- q = self.prob(pid, cid)
- z = sum(q)
- q /= z
- llh += log(z)
- context = context_type_list[cid]
- if lamba != None:
- q *= exp(lamba)
- q /= sum(q)
- for t in range(num_tags):
- tagCounts[pid][t] += cnt * q[t]
- for i in range(4):
- for t in range(num_tags):
- contextWordCounts[i][t][context[i]] += cnt * q[t]
-
- # M-step
- for p in range(num_phrases):
- self.tagDist[p] = normalise(tagCounts[p])
- for i in range(4):
- for t in range(num_tags):
- self.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
-
- return llh
-
-class ContextToPhraseModel:
- def __init__(self):
- # Pr(tag | context) = Multinomial
- self.tagDist = [normalise(random(num_tags)+1) for p in range(num_contexts)]
- # Pr(phrase = w | tag) = Multinomial
- self.phraseSingleDist = [normalise(random(num_types)+1) for t in range(num_tags)]
- # Pr(phrase_1 = w | tag) = Multinomial
- self.phraseLeftDist = [normalise(random(num_types)+1) for t in range(num_tags)]
- # Pr(phrase_-1 = w | tag) = Multinomial
- self.phraseRightDist = [normalise(random(num_types)+1) for t in range(num_tags)]
- # Pr(|phrase| = l | tag) = Geometric
- self.phraseLengthDist = [0.5] * num_tags
- # n.b. internal words for phrases of length >= 3 are drawn from uniform distribution
-
- def prob(self, pid, cid):
- # return distribution p(tag, phrase | context) as vector of length |tags|
- phrase = phrase_type_list[pid]
- dist = zeros(num_tags)
- for t in range(num_tags):
- prob = self.tagDist[cid][t]
- f = self.phraseLengthDist[t]
- prob *= geom.pmf(len(phrase), f)
- if len(phrase) == 1:
- prob *= self.phraseSingleDist[t][phrase[0]]
- else:
- prob *= self.phraseLeftDist[t][phrase[0]]
- prob *= self.phraseRightDist[t][phrase[-1]]
- dist[t] = prob
- return dist
-
- def expectation_maximisation_step(self, lamba=None):
- tagCounts = zeros((num_contexts, num_tags))
- phraseSingleCounts = zeros((num_tags, num_types))
- phraseLeftCounts = zeros((num_tags, num_types))
- phraseRightCounts = zeros((num_tags, num_types))
- phraseLength = zeros(num_types)
-
- # E-step
- llh = 0
- for pid, cid, cnt in edges:
- q = self.prob(pid, cid)
- z = sum(q)
- q /= z
- llh += log(z)
- if lamba != None:
- q *= exp(lamba)
- q /= sum(q)
- #print 'p', phrase_type_list[pid], 'c', context_type_list[cid], 'q', q
- phrase = phrase_type_list[pid]
- for t in range(num_tags):
- tagCounts[cid][t] += cnt * q[t]
- phraseLength[t] += cnt * len(phrase) * q[t]
- if len(phrase) == 1:
- phraseSingleCounts[t][phrase[0]] += cnt * q[t]
- else:
- phraseLeftCounts[t][phrase[0]] += cnt * q[t]
- phraseRightCounts[t][phrase[-1]] += cnt * q[t]
-
- # M-step
- for t in range(num_tags):
- self.phraseLengthDist[t] = min(max(sum(tagCounts[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
- self.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
- self.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
- self.phraseRightDist[t] = normalise(phraseRightCounts[t])
- for c in range(num_contexts):
- self.tagDist[c] = normalise(tagCounts[c])
-
- #print 't', self.tagDist
- #print 'l', self.phraseLengthDist
- #print 's', self.phraseSingleDist
- #print 'L', self.phraseLeftDist
- #print 'R', self.phraseRightDist
-
- return llh
-
-class ProductModel:
- """
- WARNING: I haven't verified the maths behind this model. It's quite likely to be incorrect.
- """
-
- def __init__(self):
- self.pcm = PhraseToContextModel()
- self.cpm = ContextToPhraseModel()
-
- def prob(self, pid, cid):
- p1 = self.pcm.prob(pid, cid)
- p2 = self.cpm.prob(pid, cid)
- return (p1 / sum(p1)) * (p2 / sum(p2))
-
- def expectation_maximisation_step(self):
- tagCountsGivenPhrase = zeros((num_phrases, num_tags))
- contextWordCounts = zeros((4, num_tags, num_types))
-
- tagCountsGivenContext = zeros((num_contexts, num_tags))
- phraseSingleCounts = zeros((num_tags, num_types))
- phraseLeftCounts = zeros((num_tags, num_types))
- phraseRightCounts = zeros((num_tags, num_types))
- phraseLength = zeros(num_types)
-
- kl = llh1 = llh2 = 0
- for pid, cid, cnt in edges:
- p1 = self.pcm.prob(pid, cid)
- llh1 += log(sum(p1)) * cnt
- p2 = self.cpm.prob(pid, cid)
- llh2 += log(sum(p2)) * cnt
-
- q = (p1 / sum(p1)) * (p2 / sum(p2))
- kl += log(sum(q)) * cnt
- qi = sqrt(q)
- qi /= sum(qi)
-
- phrase = phrase_type_list[pid]
- context = context_type_list[cid]
- for t in range(num_tags):
- tagCountsGivenPhrase[pid][t] += cnt * qi[t]
- tagCountsGivenContext[cid][t] += cnt * qi[t]
- phraseLength[t] += cnt * len(phrase) * qi[t]
- if len(phrase) == 1:
- phraseSingleCounts[t][phrase[0]] += cnt * qi[t]
- else:
- phraseLeftCounts[t][phrase[0]] += cnt * qi[t]
- phraseRightCounts[t][phrase[-1]] += cnt * qi[t]
- for i in range(4):
- contextWordCounts[i][t][context[i]] += cnt * qi[t]
-
- kl *= -2
-
- for t in range(num_tags):
- for i in range(4):
- self.pcm.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
- self.cpm.phraseLengthDist[t] = min(max(sum(tagCountsGivenContext[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
- self.cpm.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
- self.cpm.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
- self.cpm.phraseRightDist[t] = normalise(phraseRightCounts[t])
- for p in range(num_phrases):
- self.pcm.tagDist[p] = normalise(tagCountsGivenPhrase[p])
- for c in range(num_contexts):
- self.cpm.tagDist[c] = normalise(tagCountsGivenContext[c])
-
- # return the overall objective
- return llh1 + llh2 + kl
-
-class RegularisedProductModel:
- # as above, but with a slack regularisation term which kills the
- # closed-form solution for the E-step
-
- def __init__(self, epsilon):
- self.pcm = PhraseToContextModel()
- self.cpm = ContextToPhraseModel()
- self.epsilon = epsilon
- self.lamba = zeros(num_tags)
-
- def prob(self, pid, cid):
- p1 = self.pcm.prob(pid, cid)
- p2 = self.cpm.prob(pid, cid)
- return (p1 / sum(p1)) * (p2 / sum(p2))
-
- def dual(self, lamba):
- return self.logz(lamba) + self.epsilon * dot(lamba, lamba) ** 0.5
-
- def dual_gradient(self, lamba):
- return self.expected_features(lamba) + self.epsilon * 2 * lamba
-
- def expectation_maximisation_step(self):
- # PR-step: optimise lambda to minimise log(z_lambda) + eps ||lambda||_2
- self.lamba = scipy.optimize.fmin_slsqp(self.dual, self.lamba,
- bounds=[(0, 1e100)] * num_tags,
- fprime=self.dual_gradient, iprint=1)
-
- # E,M-steps: collect expected counts under q_lambda and normalise
- llh1 = self.pcm.expectation_maximisation_step(self.lamba)
- llh2 = self.cpm.expectation_maximisation_step(-self.lamba)
-
- # return the overall objective: llh - KL(q||p1.p2)
- # llh = llh1 + llh2
- # kl = sum q log q / p1 p2 = sum q { lambda . phi } - log Z
- return llh1 + llh2 + self.logz(self.lamba) \
- - dot(self.lamba, self.expected_features(self.lamba))
-
- def logz(self, lamba):
- lz = 0
- for pid, cid, cnt in edges:
- p1 = self.pcm.prob(pid, cid)
- z1 = dot(p1 / sum(p1), exp(lamba))
- lz += log(z1) * cnt
-
- p2 = self.cpm.prob(pid, cid)
- z2 = dot(p2 / sum(p2), exp(-lamba))
- lz += log(z2) * cnt
- return lz
-
- def expected_features(self, lamba):
- fs = zeros(num_tags)
- for pid, cid, cnt in edges:
- p1 = self.pcm.prob(pid, cid)
- q1 = (p1 / sum(p1)) * exp(lamba)
- fs += cnt * q1 / sum(q1)
-
- p2 = self.cpm.prob(pid, cid)
- q2 = (p2 / sum(p2)) * exp(-lamba)
- fs -= cnt * q2 / sum(q2)
- return fs
-
-
-class InterpolatedModel:
- def __init__(self, epsilon):
- self.pcm = PhraseToContextModel()
- self.cpm = ContextToPhraseModel()
- self.epsilon = epsilon
- self.lamba = zeros(num_tags)
-
- def prob(self, pid, cid):
- p1 = self.pcm.prob(pid, cid)
- p2 = self.cpm.prob(pid, cid)
- return (p1 + p2) / 2
-
- def dual(self, lamba):
- return self.logz(lamba) + self.epsilon * dot(lamba, lamba) ** 0.5
-
- def dual_gradient(self, lamba):
- return self.expected_features(lamba) + self.epsilon * 2 * lamba
-
- def expectation_maximisation_step(self):
- # PR-step: optimise lambda to minimise log(z_lambda) + eps ||lambda||_2
- self.lamba = scipy.optimize.fmin_slsqp(self.dual, self.lamba,
- bounds=[(0, 1e100)] * num_tags,
- fprime=self.dual_gradient, iprint=2)
-
- # E,M-steps: collect expected counts under q_lambda and normalise
- llh1 = self.pcm.expectation_maximisation_step(self.lamba)
- llh2 = self.cpm.expectation_maximisation_step(self.lamba)
-
- # return the overall objective: llh1 + llh2 - KL(q||p1.p2)
- # kl = sum_y q log q / 0.5 * (p1 + p2) = sum_y q(y) { -lambda . phi(y) } - log Z
- # = -log Z + lambda . (E_q1[-phi] + E_q2[-phi]) / 2
- kl = -self.logz(self.lamba) + dot(self.lamba, self.expected_features(self.lamba))
- return llh1 + llh2 - kl, llh1, llh2, kl
- # FIXME: KL comes out negative...
-
- def logz(self, lamba):
- lz = 0
- for pid, cid, cnt in edges:
- p1 = self.pcm.prob(pid, cid)
- q1 = p1 / sum(p1) * exp(-lamba)
- q1z = sum(q1)
-
- p2 = self.cpm.prob(pid, cid)
- q2 = p2 / sum(p2) * exp(-lamba)
- q2z = sum(q2)
-
- lz += log(0.5 * (q1z + q2z)) * cnt
- return lz
-
- # z = 1/2 * (sum_y p1(y|x) exp (-lambda . phi(y)) + sum_y p2(y|x) exp (-lambda . phi(y)))
- # = 1/2 (z1 + z2)
- # d (log z) / dlambda = 1/2 (E_q1 [ -phi ] + E_q2 [ -phi ] )
- def expected_features(self, lamba):
- fs = zeros(num_tags)
- for pid, cid, cnt in edges:
- p1 = self.pcm.prob(pid, cid)
- q1 = (p1 / sum(p1)) * exp(-lamba)
- fs -= 0.5 * cnt * q1 / sum(q1)
-
- p2 = self.cpm.prob(pid, cid)
- q2 = (p2 / sum(p2)) * exp(-lamba)
- fs -= 0.5 * cnt * q2 / sum(q2)
- return fs
-
-if style == 'p2c':
- m = PhraseToContextModel()
-elif style == 'c2p':
- m = ContextToPhraseModel()
-elif style == 'prod':
- m = ProductModel()
-elif style == 'prodslack':
- m = RegularisedProductModel(0.5)
-elif style == 'sum':
- m = InterpolatedModel(0.5)
-
-for iteration in range(30):
- obj = m.expectation_maximisation_step()
- print 'iteration', iteration, 'objective', obj
-
-for pid, cid, cnt in edges:
- p = m.prob(pid, cid)
- phrase = phrase_type_list[pid]
- phrase_str = ' '.join(map(word_type_list.__getitem__, phrase))
- context = context_type_list[cid]
- context_str = ' '.join(map(word_type_list.__getitem__, context))
- print '%s\t%s ||| C=%d' % (phrase_str, context_str, argmax(p))