-rw-r--r--   gi/posterior-regularisation/train_pr_agree.py    | 328
-rw-r--r--   gi/posterior-regularisation/train_pr_parallel.py |  12
2 files changed, 334 insertions, 6 deletions
diff --git a/gi/posterior-regularisation/train_pr_agree.py b/gi/posterior-regularisation/train_pr_agree.py
new file mode 100644
index 00000000..bbd6e007
--- /dev/null
+++ b/gi/posterior-regularisation/train_pr_agree.py
@@ -0,0 +1,328 @@
+import sys
+import scipy.optimize
+from scipy.stats import geom
+from numpy import *
+from numpy.random import random, seed
+
+style = sys.argv[1]
+if len(sys.argv) >= 3:
+    seed(int(sys.argv[2]))
+
+#
+# Step 1: load the concordance counts
+#
+
+edges = []
+word_types = {}
+phrase_types = {}
+context_types = {}
+
+for line in sys.stdin:
+    phrase, rest = line.strip().split('\t')
+    ptoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), phrase.split()))
+    pid = phrase_types.setdefault(ptoks, len(phrase_types))
+
+    parts = rest.split('|||')
+    for i in range(0, len(parts), 2):
+        context, count = parts[i:i+2]
+
+        ctx = filter(lambda x: x != '<PHRASE>', context.split())
+        ctoks = tuple(map(lambda t: word_types.setdefault(t, len(word_types)), ctx))
+        cid = context_types.setdefault(ctoks, len(context_types))
+
+        cnt = int(count.strip()[2:])
+        edges.append((pid, cid, cnt))
+
+word_type_list = [None] * len(word_types)
+for typ, index in word_types.items():
+    word_type_list[index] = typ
+
+phrase_type_list = [None] * len(phrase_types)
+for typ, index in phrase_types.items():
+    phrase_type_list[index] = typ
+
+context_type_list = [None] * len(context_types)
+for typ, index in context_types.items():
+    context_type_list[index] = typ
+
+num_tags = 5
+num_types = len(word_types)
+num_phrases = len(phrase_types)
+num_contexts = len(context_types)
+num_edges = len(edges)
+
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', num_types, 'word types'
+
+#
+# Step 2: expectation maximisation
+#
+
+def normalise(a):
+    return a / float(sum(a))
+
+class PhraseToContextModel:
+    def __init__(self):
+        # Pr(tag | phrase)
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_phrases)]
+        # Pr(context at pos i = w | tag) indexed by i, tag, word
+        self.contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)]
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, context | phrase) as vector of length |tags|
+        context = context_type_list[cid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[pid][t]
+            for k, tokid in enumerate(context):
+                prob *= self.contextWordDist[k][t][tokid]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            context = context_type_list[cid]
+            if lamba != None:
+                q *= exp(lamba)
+                q /= sum(q)
+            for t in range(num_tags):
+                tagCounts[pid][t] += cnt * q[t]
+            for i in range(4):
+                for t in range(num_tags):
+                    contextWordCounts[i][t][context[i]] += cnt * q[t]
+
+        # M-step
+        for p in range(num_phrases):
+            self.tagDist[p] = normalise(tagCounts[p])
+        for i in range(4):
+            for t in range(num_tags):
+                self.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+
+        return llh
+
+class ContextToPhraseModel:
+    def __init__(self):
+        # Pr(tag | context) = Multinomial
+        self.tagDist = [normalise(random(num_tags)+1) for p in range(num_contexts)]
+        # Pr(phrase = w | tag) = Multinomial
+        self.phraseSingleDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_1 = w | tag) = Multinomial
+        self.phraseLeftDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(phrase_-1 = w | tag) = Multinomial
+        self.phraseRightDist = [normalise(random(num_types)+1) for t in range(num_tags)]
+        # Pr(|phrase| = l | tag) = Geometric
+        self.phraseLengthDist = [0.5] * num_tags
+        # n.b. internal words for phrases of length >= 3 are drawn from uniform distribution
+
+    def prob(self, pid, cid):
+        # return distribution p(tag, phrase | context) as vector of length |tags|
+        phrase = phrase_type_list[pid]
+        dist = zeros(num_tags)
+        for t in range(num_tags):
+            prob = self.tagDist[cid][t]
+            f = self.phraseLengthDist[t]
+            prob *= geom.pmf(len(phrase), f)
+            if len(phrase) == 1:
+                prob *= self.phraseSingleDist[t][phrase[0]]
+            else:
+                prob *= self.phraseLeftDist[t][phrase[0]]
+                prob *= self.phraseRightDist[t][phrase[-1]]
+            dist[t] = prob
+        return dist
+
+    def expectation_maximisation_step(self, lamba=None):
+        tagCounts = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_types)
+
+        # E-step
+        llh = 0
+        for pid, cid, cnt in edges:
+            q = self.prob(pid, cid)
+            z = sum(q)
+            q /= z
+            llh += log(z)
+            if lamba != None:
+                q *= exp(lamba)
+                q /= sum(q)
+            #print 'p', phrase_type_list[pid], 'c', context_type_list[cid], 'q', q
+            phrase = phrase_type_list[pid]
+            for t in range(num_tags):
+                tagCounts[cid][t] += cnt * q[t]
+                phraseLength[t] += cnt * len(phrase) * q[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * q[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * q[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * q[t]
+
+        # M-step
+        for t in range(num_tags):
+            self.phraseLengthDist[t] = min(max(sum(tagCounts[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for c in range(num_contexts):
+            self.tagDist[c] = normalise(tagCounts[c])
+
+        #print 't', self.tagDist
+        #print 'l', self.phraseLengthDist
+        #print 's', self.phraseSingleDist
+        #print 'L', self.phraseLeftDist
+        #print 'R', self.phraseRightDist
+
+        return llh
+
+class ProductModel:
+    """
+    WARNING: I haven't verified the maths behind this model. It's quite likely to be incorrect.
+    """
+
+    def __init__(self):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
+        return (p1 / sum(p1)) * (p2 / sum(p2))
+
+    def expectation_maximisation_step(self):
+        tagCountsGivenPhrase = zeros((num_phrases, num_tags))
+        contextWordCounts = zeros((4, num_tags, num_types))
+
+        tagCountsGivenContext = zeros((num_contexts, num_tags))
+        phraseSingleCounts = zeros((num_tags, num_types))
+        phraseLeftCounts = zeros((num_tags, num_types))
+        phraseRightCounts = zeros((num_tags, num_types))
+        phraseLength = zeros(num_types)
+
+        kl = llh1 = llh2 = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            llh1 += log(sum(p1)) * cnt
+            p2 = self.cpm.prob(pid, cid)
+            llh2 += log(sum(p2)) * cnt
+
+            q = (p1 / sum(p1)) * (p2 / sum(p2))
+            kl += log(sum(q)) * cnt
+            qi = sqrt(q)
+            qi /= sum(qi)
+
+            phrase = phrase_type_list[pid]
+            context = context_type_list[cid]
+            for t in range(num_tags):
+                tagCountsGivenPhrase[pid][t] += cnt * qi[t]
+                tagCountsGivenContext[cid][t] += cnt * qi[t]
+                phraseLength[t] += cnt * len(phrase) * qi[t]
+                if len(phrase) == 1:
+                    phraseSingleCounts[t][phrase[0]] += cnt * qi[t]
+                else:
+                    phraseLeftCounts[t][phrase[0]] += cnt * qi[t]
+                    phraseRightCounts[t][phrase[-1]] += cnt * qi[t]
+                for i in range(4):
+                    contextWordCounts[i][t][context[i]] += cnt * qi[t]
+
+        kl *= -2
+
+        for t in range(num_tags):
+            for i in range(4):
+                self.pcm.contextWordDist[i][t] = normalise(contextWordCounts[i,t])
+            self.cpm.phraseLengthDist[t] = min(max(sum(tagCountsGivenContext[:,t]) / phraseLength[t], 1e-6), 1-1e-6)
+            self.cpm.phraseSingleDist[t] = normalise(phraseSingleCounts[t])
+            self.cpm.phraseLeftDist[t] = normalise(phraseLeftCounts[t])
+            self.cpm.phraseRightDist[t] = normalise(phraseRightCounts[t])
+        for p in range(num_phrases):
+            self.pcm.tagDist[p] = normalise(tagCountsGivenPhrase[p])
+        for c in range(num_contexts):
+            self.cpm.tagDist[c] = normalise(tagCountsGivenContext[c])
+
+        # return the overall objective
+        return llh1 + llh2 + kl
+
+class InterpolatedModel:
+    def __init__(self, epsilon):
+        self.pcm = PhraseToContextModel()
+        self.cpm = ContextToPhraseModel()
+        self.epsilon = epsilon
+        self.lamba = zeros(num_tags)
+
+    def prob(self, pid, cid):
+        p1 = self.pcm.prob(pid, cid)
+        p2 = self.cpm.prob(pid, cid)
+        return (p1 + p2) / 2
+
+    def dual(self, lamba):
+        return self.logz(lamba) + self.epsilon * dot(lamba, lamba) ** 0.5
+
+    def dual_gradient(self, lamba):
+        return self.expected_features(lamba) + self.epsilon * 2 * lamba
+
+    def expectation_maximisation_step(self):
+        # PR-step: optimise lambda to minimise log(z_lambda) + eps ||lambda||_2
+        self.lamba = scipy.optimize.fmin_slsqp(self.dual, self.lamba,
+                                bounds=[(0, 1e100)] * num_tags,
+                                fprime=self.dual_gradient, iprint=0)
+
+        # E,M-steps: collect expected counts under q_lambda and normalise
+        #llh1 = self.pcm.expectation_maximisation_step(self.lamba)
+        #llh2 = self.cpm.expectation_maximisation_step(-self.lamba)
+
+        # return the overall objective: llh1 + llh2 - KL(q||p1.p2)
+        pass
+
+    def logz(self, lamba):
+        # FIXME: complete this!
+        lz = 0
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            z1 = dot(p1 / sum(p1), exp(lamba))
+            lz += log(z1) * cnt
+
+            p2 = self.cpm.prob(pid, cid)
+            z2 = dot(p2 / sum(p2), exp(-lamba))
+            lz += log(z2) * cnt
+        return lz
+
+    def expected_features(self, lamba):
+        fs = zeros(num_tags)
+        for pid, cid, cnt in edges:
+            p1 = self.pcm.prob(pid, cid)
+            q1 = (p1 / sum(p1)) * exp(lamba)
+            fs += cnt * q1 / sum(q1)
+
+            p2 = self.cpm.prob(pid, cid)
+            q2 = (p2 / sum(p2)) * exp(-lamba)
+            fs -= cnt * q2 / sum(q2)
+        return fs
+
+if style == 'p2c':
+    m = PhraseToContextModel()
+elif style == 'c2p':
+    m = ContextToPhraseModel()
+elif style == 'prod':
+    m = ProductModel()
+elif style == 'sum':
+    m = InterpolatedModel()
+
+for iteration in range(30):
+    obj = m.expectation_maximisation_step()
+    print 'iteration', iteration, 'objective', obj
+
+for pid, cid, cnt in edges:
+    p = m.prob(pid, cid)
+    phrase = phrase_type_list[pid]
+    phrase_str = ' '.join(map(word_type_list.__getitem__, phrase))
+    context = context_type_list[cid]
+    context_str = ' '.join(map(word_type_list.__getitem__, context))
+    print '%s\t%s ||| C=%d' % (phrase_str, context_str, argmax(p))
diff --git a/gi/posterior-regularisation/train_pr_parallel.py b/gi/posterior-regularisation/train_pr_parallel.py
index d5df87b5..4de7f504 100644
--- a/gi/posterior-regularisation/train_pr_parallel.py
+++ b/gi/posterior-regularisation/train_pr_parallel.py
@@ -37,8 +37,6 @@ for line in sys.stdin:
         num_edges += 1
 
-print 'Read in', num_edges, 'edges and', len(types), 'word types'
-
 #
 # Step 2: initialise the model parameters
 #
 
@@ -53,6 +51,8 @@ local = sys.argv[2] == 'local'
 if len(sys.argv) >= 2:
     seed(int(sys.argv[3]))
 
+print 'Read in', num_edges, 'edges', num_phrases, 'phrases', num_contexts, 'contexts and', len(types), 'word types'
+
 def normalise(a):
     return a / float(sum(a))
 
@@ -131,7 +131,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    cons[j,t] -= ls[index,t]
+                    cons[j,t] -= ls[index,t] * count
                 index += 1
         return cons.ravel()
 
@@ -142,7 +142,7 @@ class GlobalDualObjective:
         for j, (phrase, edges) in enumerate(edges_phrase_to_context):
             for i, (context, count) in enumerate(edges):
                 for t in range(num_tags):
-                    gradient[j,t,index,t] -= 1
+                    gradient[j,t,index,t] -= count
                 index += 1
         return gradient.reshape((num_phrases*num_tags, num_edges*num_tags))
 
@@ -231,7 +231,7 @@ class LocalDualObjective:
         cons = ones(num_tags) * self.scale
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                cons[t] -= ls[i,t]
+                cons[t] -= ls[i,t] * count
         return cons
 
     def constraints_gradient(self, ls):
@@ -240,7 +240,7 @@ class LocalDualObjective:
         gradient = zeros((num_tags, len(edges), num_tags))
         for t in range(num_tags):
             for i, (context, count) in enumerate(edges):
-                gradient[t,i,t] -= 1
+                gradient[t,i,t] -= count
         return gradient.reshape((num_tags, len(edges)*num_tags))
 
    def optimize(self):
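
The new train_pr_agree.py reads the phrase-context concordance from stdin: a phrase, a tab, then alternating context / "C=count" fields separated by "|||". That is my reading of the loader in Step 1 above; the sample line below is invented for illustration and is not taken from any corpus. A minimal Python 3 sketch of the same parsing logic:

    # Invented sample line illustrating the expected stdin format.
    line = "the house\tin <PHRASE> on ||| C=3 ||| at <PHRASE> today ||| C=1\n"

    phrase, rest = line.strip().split('\t')
    parts = rest.split('|||')
    for i in range(0, len(parts), 2):
        context, count = parts[i:i + 2]
        ctx = [w for w in context.split() if w != '<PHRASE>']  # drop the phrase placeholder
        cnt = int(count.strip()[2:])                            # strip the leading "C="
        print(phrase.split(), ctx, cnt)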
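In ProductModel.expectation_maximisation_step, the responsibilities accumulated for each edge are the normalised geometric mean of the two directional posteriors, p(tag | phrase) and p(tag | context). A minimal sketch of that combination (Python 3 with NumPy; agreement_posterior is a hypothetical helper, not part of the committed script):

    import numpy as np

    def agreement_posterior(p1, p2):
        # Normalise each directional posterior, take the element-wise geometric
        # mean, and renormalise, mirroring qi = sqrt(q) in the E-step above.
        q1 = p1 / p1.sum()
        q2 = p2 / p2.sum()
        q = np.sqrt(q1 * q2)
        return q / q.sum()

    # toy posteriors over num_tags = 5 tags
    p1 = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
    p2 = np.array([0.1, 0.5, 0.2, 0.1, 0.1])
    print(agreement_posterior(p1, p2))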
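InterpolatedModel delegates its PR step to scipy.optimize.fmin_slsqp, minimising log Z(lambda) plus an L2 penalty over non-negative lambda (its logz is still marked FIXME in the commit). A toy sketch of that call on a single edge, with made-up, already normalised posteriors and an illustrative epsilon:

    import numpy as np
    from scipy.optimize import fmin_slsqp

    p1 = np.array([0.4, 0.3, 0.1, 0.1, 0.1])   # p(tag | phrase), toy values
    p2 = np.array([0.1, 0.5, 0.2, 0.1, 0.1])   # p(tag | context), toy values
    epsilon = 0.5

    def dual(lam):
        # log Z(lambda) for one edge plus the L2 penalty, as in InterpolatedModel.dual
        z1 = np.dot(p1, np.exp(lam))
        z2 = np.dot(p2, np.exp(-lam))
        return np.log(z1) + np.log(z2) + epsilon * np.sqrt(np.dot(lam, lam))

    # non-negative lambda, bounded as in the script; the gradient is left to
    # SLSQP's finite differences for brevity
    lam = fmin_slsqp(dual, np.full(5, 0.1), bounds=[(0.0, 100.0)] * 5, iprint=0)
    print(lam)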
