author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 19:34:58 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 19:34:58 +0000 |
commit | ab3534c45f463e541f3baf05006a50b64e3bbe31 (patch) | |
tree | a778207e83c2d490593b97334d112e6ec1db310d | /gi/posterior-regularisation/PhraseContextModel.java |
parent | 207e694dd81046c2d2fa740f565ec2bb31f8152e (diff) | |
First bits of code for PR training
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@44 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation/PhraseContextModel.java')
-rw-r--r-- | gi/posterior-regularisation/PhraseContextModel.java | 317 |
1 file changed, 317 insertions, 0 deletions
diff --git a/gi/posterior-regularisation/PhraseContextModel.java b/gi/posterior-regularisation/PhraseContextModel.java
new file mode 100644
index 00000000..3af72d54
--- /dev/null
+++ b/gi/posterior-regularisation/PhraseContextModel.java
@@ -0,0 +1,317 @@
+// Input of the form:
+// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1
+// phrase TAB [context]+
+// where context = phrase ||| C=... which are separated by |||
+
+// Model parameterised as follows:
+// - each phrase, p, is allocated a latent state, t
+// - this is used to generate the contexts, c
+// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR
+
+// Training with EM:
+// - e-step is estimating q(t) = P(t|p,c) for all x,c
+// - m-step is estimating model parameters P(c,t|p) = P(t) P(c|t)
+// - PR uses alternate e-step, which first optimizes lambda
+//      min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc]
+//   where
+//      q(t|p,c) propto p(t,c|p) exp( -phi_ptc )
+//   Then q is used to obtain expectations for vanilla M-step.
+
+// Sexing it up:
+// - learn p-specific conditionals P(t|p)
+// - or generate phrase internals, e.g., generate edge words from
+//   different distribution to central words
+// - agreement between phrase->context model and context->phrase model
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+import static java.lang.Math.*;
+
+class Lexicon<T>
+{
+    public int insert(T word)
+    {
+        Integer i = wordToIndex.get(word);
+        if (i == null)
+        {
+            i = indexToWord.size();
+            wordToIndex.put(word, i);
+            indexToWord.add(word);
+        }
+        return i;
+    }
+
+    public T lookup(int index)
+    {
+        return indexToWord.get(index);
+    }
+
+    public int size()
+    {
+        return indexToWord.size();
+    }
+
+    private Map<T,Integer> wordToIndex = new HashMap<T,Integer>();
+    private List<T> indexToWord = new ArrayList<T>();
+}
+
+class PhraseContextModel
+{
+    // model/optimisation configuration parameters
+    int numTags;
+    int numPRIterations = 5;
+    boolean posteriorRegularisation = false;
+    double constraintScale = 10;
+
+    // book keeping
+    Lexicon<String> tokenLexicon = new Lexicon<String>();
+    int numPositions;
+    Random rng = new Random();
+
+    // training set; 1 entry for each unique phrase
+    PhraseAndContexts training[];
+
+    // model parameters (learnt)
+    double emissions[][][];  // position in 0 .. 3 x tag x word   Pr(word | tag, position)
+    double prior[][];        // phrase x tag                      Pr(tag | phrase)
+    //double lambda[][][];   // word x context x tag              langrange multipliers
+
+    PhraseContextModel(File infile, int tags) throws IOException
+    {
+        numTags = tags;
+        readTrainingFromFile(new FileReader(infile));
+        assert(training.length > 0);
+
+        // now initialise emissions
+        assert(training[0].contexts.length > 0);
+        numPositions = training[0].contexts[0].tokens.length;
+
+        emissions = new double[numPositions][numTags][tokenLexicon.size()];
+        prior = new double[training.length][numTags];
+        //lambda = new double[tokenLexicon.size()][???][numTags]
+
+        for (double[][] emissionTW: emissions)
+            for (double[] emissionW: emissionTW)
+                randomise(emissionW);
+
+        for (double[] priorTag: prior)
+            randomise(priorTag);
+    }
+
+    void expectationMaximisation(int numIterations)
+    {
+        for (int iteration = 0; iteration < numIterations; ++iteration)
+        {
+            double emissionsCounts[][][] = new double[numPositions][numTags][tokenLexicon.size()];
+            double priorCounts[][] = new double[training.length][numTags];
+
+            // E-step
+            double llh = 0;
+            for (int i = 0; i < training.length; ++i)
+            {
+                PhraseAndContexts instance = training[i];
+                for (Context ctx: instance.contexts)
+                {
+                    double probs[] = posterior(i, ctx);
+                    double z = normalise(probs);
+                    llh += log(z) * ctx.count;
+
+                    for (int t = 0; t < numTags; ++t)
+                    {
+                        priorCounts[i][t] += ctx.count * probs[t];
+                        for (int j = 0; j < ctx.tokens.length; ++j)
+                            emissionsCounts[j][t][ctx.tokens[j]] += ctx.count * probs[t];
+                    }
+                }
+            }
+
+            // M-step: normalise
+            for (double[][] emissionTW: emissionsCounts)
+                for (double[] emissionW: emissionTW)
+                    normalise(emissionW);
+
+            for (double[] priorTag: priorCounts)
+                normalise(priorTag);
+
+            emissions = emissionsCounts;
+            prior = priorCounts;
+
+            System.out.println("Iteration " + iteration + " llh " + llh);
+        }
+    }
+
+    static double normalise(double probs[])
+    {
+        double z = 0;
+        for (double p: probs)
+            z += p;
+        for (int i = 0; i < probs.length; ++i)
+            probs[i] /= z;
+        return z;
+    }
+
+    void randomise(double probs[])
+    {
+        double z = 0;
+        for (int i = 0; i < probs.length; ++i)
+        {
+            probs[i] = 10 + rng.nextDouble();
+            z += probs[i];
+        }
+
+        for (int i = 0; i < probs.length; ++i)
+            probs[i] /= z;
+    }
+
+    static int argmax(double probs[])
+    {
+        double m = Double.NEGATIVE_INFINITY;
+        int mi = -1;
+        for (int i = 0; i < probs.length; ++i)
+        {
+            if (probs[i] > m)
+            {
+                m = probs[i];
+                mi = i;
+            }
+        }
+        return mi;
+    }
+
+    double[] posterior(int phraseId, Context c) // unnormalised
+    {
+        double probs[] = new double[numTags];
+        for (int t = 0; t < numTags; ++t)
+        {
+            probs[t] = prior[phraseId][t];
+            for (int j = 0; j < c.tokens.length; ++j)
+                probs[t] *= emissions[j][t][c.tokens[j]];
+        }
+        return probs;
+    }
+
+    private void readTrainingFromFile(Reader in) throws IOException
+    {
+        // read in line-by-line
+        BufferedReader bin = new BufferedReader(in);
+        String line;
+        List<PhraseAndContexts> instances = new ArrayList<PhraseAndContexts>();
+        Pattern separator = Pattern.compile(" \\|\\|\\| ");
+
+        int numEdges = 0;
+        while ((line = bin.readLine()) != null)
+        {
+            // split into phrase and contexts
+            StringTokenizer st = new StringTokenizer(line, "\t");
+            assert(st.hasMoreTokens());
+            String phrase = st.nextToken();
+            assert(st.hasMoreTokens());
+            String rest = st.nextToken();
+            assert(!st.hasMoreTokens());
+
+            // process phrase
+            st = new StringTokenizer(phrase, " ");
+            List<Integer> ptoks = new ArrayList<Integer>();
+            while (st.hasMoreTokens())
+                ptoks.add(tokenLexicon.insert(st.nextToken()));
+
+            // process contexts
+            ArrayList<Context> contexts = new ArrayList<Context>();
+            String[] parts = separator.split(rest);
+            assert(parts.length % 2 == 0);
+            for (int i = 0; i < parts.length; i += 2)
+            {
+                // process pairs of strings - context and count
+                ArrayList<Integer> ctx = new ArrayList<Integer>();
+                String ctxString = parts[i];
+                String countString = parts[i+1];
+                StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
+                while (ctxStrtok.hasMoreTokens())
+                {
+                    String token = ctxStrtok.nextToken();
+                    if (!token.equals("<PHRASE>"))
+                        ctx.add(tokenLexicon.insert(token));
+                }
+
+                assert(countString.startsWith("C="));
+                Context c = new Context();
+                c.count = Integer.parseInt(countString.substring(2).trim());
+                // damn unboxing doesn't work with toArray
+                c.tokens = new int[ctx.size()];
+                for (int k = 0; k < ctx.size(); ++k)
+                    c.tokens[k] = ctx.get(k);
+                contexts.add(c);
+
+                numEdges += 1;
+            }
+
+            // package up
+            PhraseAndContexts instance = new PhraseAndContexts();
+            // damn unboxing doesn't work with toArray
+            instance.phraseTokens = new int[ptoks.size()];
+            for (int k = 0; k < ptoks.size(); ++k)
+                instance.phraseTokens[k] = ptoks.get(k);
+            instance.contexts = contexts.toArray(new Context[] {});
+            instances.add(instance);
+        }
+
+        training = instances.toArray(new PhraseAndContexts[] {});
+
+        System.out.println("Read in " + training.length + " phrases and " + numEdges + " edges");
+    }
+
+    void displayPosterior()
+    {
+        for (int i = 0; i < training.length; ++i)
+        {
+            PhraseAndContexts instance = training[i];
+            for (Context ctx: instance.contexts)
+            {
+                double probs[] = posterior(i, ctx);
+                double z = normalise(probs);
+
+                // emit phrase
+                for (int t: instance.phraseTokens)
+                    System.out.print(tokenLexicon.lookup(t) + " ");
+                System.out.print("\t");
+                for (int c: ctx.tokens)
+                    System.out.print(tokenLexicon.lookup(c) + " ");
+                System.out.print("||| C=" + ctx.count + " |||");
+
+                System.out.print(" " + argmax(probs));
+                //for (int t = 0; t < numTags; ++t)
+                //    System.out.print(" " + probs[t]);
+                System.out.println();
+            }
+        }
+    }
+
+    class PhraseAndContexts
+    {
+        int phraseTokens[];
+        Context contexts[];
+    }
+
+    class Context
+    {
+        int count;
+        int[] tokens;
+    }
+
+    public static void main(String []args)
+    {
+        assert(args.length >= 2);
+        try
+        {
+            PhraseContextModel model = new PhraseContextModel(new File(args[0]), Integer.parseInt(args[1]));
+            model.expectationMaximisation(Integer.parseInt(args[2]));
+            model.displayPosterior();
+        }
+        catch (IOException e)
+        {
+            System.out.println("Failed to read input file: " + args[0]);
+            e.printStackTrace();
+        }
+    }
+}
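As the `main` method shows, the committed class is driven as a plain EM trainer: it takes an input file, a tag count and an iteration count, trains with `expectationMaximisation`, and then prints the argmax tag for each (phrase, context) pair via `displayPosterior`. The fields `posteriorRegularisation`, `numPRIterations`, `constraintScale` and the commented-out `lambda` array are declared but never used, so the alternate e-step described in the header comments (q(t|p,c) propto p(t,c|p) exp( -phi_ptc ), with the multipliers found by min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc]) is not implemented in this commit. The following is a hypothetical sketch, not part of the commit, of just the reweighting half of that e-step for a single (phrase, context) pair; the class name `PRPosteriorSketch`, the `lambda` values and the toy numbers are illustrative stand-ins, and the optimisation of the multipliers is deliberately omitted.

```java
import static java.lang.Math.exp;

// Hypothetical sketch (not in the commit): the PR e-step replaces the plain
// posterior q(t|p,c) propto p(t,c|p) with q(t|p,c) propto p(t,c|p) exp(-lambda_t)
// for one (phrase, context) pair, then renormalises. Choosing lambda is omitted.
class PRPosteriorSketch
{
    // unnormalisedPosterior[t] plays the role of posterior(phraseId, ctx)[t] in the model
    static double[] regularise(double[] unnormalisedPosterior, double[] lambda)
    {
        double[] q = new double[unnormalisedPosterior.length];
        double z = 0;
        for (int t = 0; t < q.length; ++t)
        {
            q[t] = unnormalisedPosterior[t] * exp(-lambda[t]); // reweight by the multiplier
            z += q[t];
        }
        for (int t = 0; t < q.length; ++t)
            q[t] /= z; // renormalise to a distribution over tags
        return q;
    }

    public static void main(String[] args)
    {
        double[] p = { 0.2, 0.5, 0.3 };      // toy unnormalised posterior over 3 tags
        double[] lambda = { 0.0, 1.0, 0.0 }; // toy multipliers penalising tag 1
        for (double v : regularise(p, lambda))
            System.out.print(v + " ");
        System.out.println();
    }
}
```

The resulting q would then replace `probs` in the existing E-step accumulation of `priorCounts` and `emissionsCounts`, leaving the M-step unchanged, which is what the header comment "Then q is used to obtain expectations for vanilla M-step" describes.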