From 9a6dd9643577b11a5833a3e2fbfb4511979a5969 Mon Sep 17 00:00:00 2001
From: "trevor.cohn"
Date: Thu, 1 Jul 2010 21:11:59 +0000
Subject: New agreement objective

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@93 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/posterior-regularisation/train_pr_agree.py    | 328 +++++++++++++++++++++++
 gi/posterior-regularisation/train_pr_parallel.py |  12 +-
 2 files changed, 334 insertions(+), 6 deletions(-)
 create mode 100644 gi/posterior-regularisation/train_pr_agree.py

diff --git a/gi/posterior-regularisation/train_pr_agree.py b/gi/posterior-regularisation/train_pr_agree.py
new file mode 100644
index 00000000..bbd6e007
--- /dev/null
+++ b/gi/posterior-regularisation/train_pr_agree.py
@@ -0,0 +1,328 @@
+import sys
+import scipy.optimize
+from scipy.stats import geom
+from numpy import *
+from numpy.random import random, seed
+
+style = sys.argv[1]
+if len(sys.argv) >= 3:
+    seed(int(sys.argv[2]))
+
+#
+# Step 1: load the concordance counts
+#
+
+edges = []
+word_types = {}
+phrase_types = {}
+context_types = {}
+
+for line in sys.stdin:
+    phrase, rest = line.strip().split('\t')
+    ptoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), phrase.split()))
+    pid = phrase_types.setdefault(ptoks, len(phrase_types))
+
+    parts = rest.split('|||')
+    for i in range(0, len(parts), 2):
+        context, count = parts[i:i+2]
+
+        ctx = filter(lambda x: x != '', context.split())
+        ctoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), ctx))
+        cid = context_types.setdefault(ctoks, len(context_types))
+
+        cnt = int(count.strip()[2:])
+        edges.append((pid, cid, cnt))
+
+word_type_list = [None] * len(word_types)
+for typ, index in word_types.items():
+    word_type_list[index] = typ
+
+phrase_type_list = [None] * len(phrase_types)
+for typ, index in phrase_types.items():
+    phrase_type_list[index] = typ
+
+context_type_list = [None] * len(context_types)
+for typ, index in context_types.items():
+    context_type_list[index] = typ
+
+num_tags = 5
+num_types = len(word_types)
+num_phrases = len(phrase_types)
+num_contexts = len(context_types)
+num_edges = len(edges)
+
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', num_types, 'word types'
+
+#
+# Step 2: expectation maximisation
+#
+
+def normalise(a):
+    return a / float(sum(a))
+
+class PhraseToContextModel:
+    def __init__(self):
+        # Pr(tag | phrase)
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_phrases)]
+        # Pr(context at pos i = w | tag) indexed by i, tag, word
+        self.contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)]
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, context | phrase) as vector of length |tags|
+        context = context_type_list[cid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[pid][t]
+            for k, tokid in enumerate(context):
+                prob *= self.contextWordDist[k][t][tokid]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            context = context_type_list[cid]
+            if lamba != None:
+                q *= exp(lamba)
+                q /= sum(q)
+            for t in range(num_tags):
+                tagCounts[pid][t] += cnt * q[t]
+            for i in range(4):
+                for t in range(num_tags):
+                    contextWordCounts[i][t][context[i]] += cnt * q[t]
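+        # n.b. the counts above are expected (fractional) counts: each edge contributes cnt * q[t];
+        # when lamba is supplied, q has additionally been reweighted by exp(lamba) and renormalised,
+        # which is the hook used by the agreement (InterpolatedModel) training below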
+
+        # M-step
+        for p in range(num_phrases):
+            self.tagDist[p] = normalise(tagCounts[p])
+        for i in range(4):
+            for t in range(num_tags):
+                self.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+
+        return llh
+
+class ContextToPhraseModel:
+    def __init__(self):
+        # Pr(tag | context) = Multinomial
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_contexts)]
+        # Pr(phrase = w | tag) = Multinomial
+        self.phraseSingleDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_1 = w | tag) = Multinomial
+        self.phraseLeftDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_-1 = w | tag) = Multinomial
+        self.phraseRightDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(|phrase| = l | tag) = Geometric
+        self.phraseLengthDist = [0.5] * num_tags
+        # n.b. internal words for phrases of length >= 3 are drawn from uniform distribution
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, phrase | context) as vector of length |tags|
+        phrase = phrase_type_list[pid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[cid][t]
+            f = self.phraseLengthDist[t]
+            prob *= geom.pmf(len(phrase), f)
+            if len(phrase) == 1:
+                prob *= self.phraseSingleDist[t][phrase[0]]
+            else:
+                prob *= self.phraseLeftDist[t][phrase[0]]
+                prob *= self.phraseRightDist[t][phrase[-1]]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_tags)
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            if lamba != None:
+                q *= exp(lamba)
+                q /= sum(q)
+            #print 'p', phrase_type_list[pid], 'c', context_type_list[cid], 'q', q
+            phrase = phrase_type_list[pid]
+            for t in range(num_tags):
+                tagCounts[cid][t] += cnt * q[t]
+                phraseLength[t] += cnt * len(phrase) * q[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * q[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * q[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * q[t]
+
+        # M-step
+        for t in range(num_tags):
+            self.phraseLengthDist[t] = min(max(sum(tagCounts[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for c in range(num_contexts):
+            self.tagDist[c] = normalise(tagCounts[c])
+
+        #print 't', self.tagDist
+        #print 'l', self.phraseLengthDist
+        #print 's', self.phraseSingleDist
+        #print 'L', self.phraseLeftDist
+        #print 'R', self.phraseRightDist
+
+        return llh
+
+class ProductModel:
+    """
+    WARNING: I haven't verified the maths behind this model. It's quite likely to be incorrect.
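+    Expected counts are collected under qi, which is proportional to sqrt(p1 * p2), i.e. the
+    renormalised geometric mean of the two component models' tag posteriors.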
+    """
+
+    def __init__(self):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
+        return (p1 / sum(p1)) * (p2 / sum(p2))
+
+    def expectation_maximisation_step(self):
+        tagCountsGivenPhrase = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        tagCountsGivenContext = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_tags)
+
+        kl = llh1 = llh2 = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            llh1 += log(sum(p1)) * cnt
+            p2 = self.cpm.prob(pid, cid)
+            llh2 += log(sum(p2)) * cnt
+
+            q = (p1 / sum(p1)) * (p2 / sum(p2))
+            kl += log(sum(q)) * cnt
+            qi = sqrt(q)
+            qi /= sum(qi)
+
+            phrase = phrase_type_list[pid]
+            context = context_type_list[cid]
+            for t in range(num_tags):
+                tagCountsGivenPhrase[pid][t] += cnt * qi[t]
+                tagCountsGivenContext[cid][t] += cnt * qi[t]
+                phraseLength[t] += cnt * len(phrase) * qi[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * qi[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * qi[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * qi[t]
+                for i in range(4):
+                    contextWordCounts[i][t][context[i]] += cnt * qi[t]
+
+        kl *= -2
+
+        for t in range(num_tags):
+            for i in range(4):
+                self.pcm.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+            self.cpm.phraseLengthDist[t] = min(max(sum(tagCountsGivenContext[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.cpm.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.cpm.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.cpm.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for p in range(num_phrases):
+            self.pcm.tagDist[p] = normalise(tagCountsGivenPhrase[p])
+        for c in range(num_contexts):
+            self.cpm.tagDist[c] = normalise(tagCountsGivenContext[c])
+
+        # return the overall objective
+        return llh1 + llh2 + kl
+
+class InterpolatedModel:
+    def __init__(self, epsilon):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+        self.epsilon = epsilon
+        self.lamba = zeros(num_tags)
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
+        return (p1 + p2) / 2
+
+    def dual(self, lamba):
+        return self.logz(lamba) + self.epsilon * dot(lamba, lamba) ** 0.5
+
+    def dual_gradient(self, lamba):
+        return self.expected_features(lamba) + self.epsilon * 2 * lamba
+
+    def expectation_maximisation_step(self):
+        # PR-step: optimise lambda to minimise log(z_lambda) + eps ||lambda||_2
+        self.lamba = scipy.optimize.fmin_slsqp(self.dual, self.lamba,
+                                               bounds=[(0, 1e100)] * num_tags,
+                                               fprime=self.dual_gradient, iprint=0)
+
+        # E,M-steps: collect expected counts under q_lambda and normalise
+        #llh1 = self.pcm.expectation_maximisation_step(self.lamba)
+        #llh2 = self.cpm.expectation_maximisation_step(-self.lamba)
+
+        # return the overall objective: llh1 + llh2 - KL(q||p1.p2)
+        pass
+
+    def logz(self, lamba):
+        # FIXME: complete this!
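+        # as written, this accumulates cnt * (log dot(p1_norm, exp(lamba)) + log dot(p2_norm, exp(-lamba)))
+        # over all edges, where p1_norm and p2_norm are the normalised posteriors of the two models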
+        lz = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            z1 = dot(p1 / sum(p1), exp(lamba))
+            lz += log(z1) * cnt
+
+            p2 = self.cpm.prob(pid, cid)
+            z2 = dot(p2 / sum(p2), exp(-lamba))
+            lz += log(z2) * cnt
+        return lz
+
+    def expected_features(self, lamba):
+        fs = zeros(num_tags)
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            q1 = (p1 / sum(p1)) * exp(lamba)
+            fs += cnt * q1 / sum(q1)
+
+            p2 = self.cpm.prob(pid, cid)
+            q2 = (p2 / sum(p2)) * exp(-lamba)
+            fs -= cnt * q2 / sum(q2)
+        return fs
+
+if style == 'p2c':
+    m = PhraseToContextModel()
+elif style == 'c2p':
+    m = ContextToPhraseModel()
+elif style == 'prod':
+    m = ProductModel()
+elif style == 'sum':
+    m = InterpolatedModel()  # n.b. __init__ expects an epsilon argument; this mode is still incomplete
+
+for iteration in range(30):
+    obj = m.expectation_maximisation_step()
+    print 'iteration', iteration, 'objective', obj
+
+for pid, cid, cnt in edges:
+    p = m.prob(pid, cid)
+    phrase = phrase_type_list[pid]
+    phrase_str = ' '.join(map(word_type_list.__getitem__, phrase))
+    context = context_type_list[cid]
+    context_str = ' '.join(map(word_type_list.__getitem__, context))
+    print '%s\t%s ||| C=%d' % (phrase_str, context_str, argmax(p))
diff --git a/gi/posterior-regularisation/train_pr_parallel.py b/gi/posterior-regularisation/train_pr_parallel.py
index d5df87b5..4de7f504 100644
--- a/gi/posterior-regularisation/train_pr_parallel.py
+++ b/gi/posterior-regularisation/train_pr_parallel.py
@@ -37,8 +37,6 @@ for line in sys.stdin:
 
     num_edges += 1
 
-print 'Read in', num_edges, 'edges and', len(types), 'word types'
-
 #
 # Step 2: initialise the model parameters
 #
@@ -53,6 +51,8 @@ local = sys.argv[2] == 'local'
 if len(sys.argv) >= 2:
     seed(int(sys.argv[3]))
 
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', len(types), 'word types'
+
 def normalise(a):
     return a / float(sum(a))
 
@@ -131,7 +131,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    cons[j,t] -= ls[index,t]
+                    cons[j,t] -= ls[index,t] * count
                 index += 1
         return cons.ravel()
 
@@ -142,7 +142,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    gradient[j,t,index,t] -= 1
+                    gradient[j,t,index,t] -= count
                 index += 1
         return gradient.reshape((num_phrases*num_tags, num_edges*num_tags))
 
@@ -231,7 +231,7 @@ class LocalDualObjective:
         cons = ones(num_tags) * self.scale
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                cons[t] -= ls[i,t]
+                cons[t] -= ls[i,t] * count
         return cons
 
     def constraints_gradient(self, ls):
@@ -240,7 +240,7 @@ class LocalDualObjective:
         gradient = zeros((num_tags, len(edges), num_tags))
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                gradient[t,i,t] -= 1
+                gradient[t,i,t] -= count
         return gradient.reshape((num_tags, len(edges)*num_tags))
 
     def optimize(self):
-- 
cgit v1.2.3