| author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 19:34:58 +0000 |
| committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 19:34:58 +0000 |
| commit | ab3534c45f463e541f3baf05006a50b64e3bbe31 (patch) | |
| tree | a778207e83c2d490593b97334d112e6ec1db310d /gi/posterior-regularisation | |
| parent | 207e694dd81046c2d2fa740f565ec2bb31f8152e (diff) | |
First bits of code for PR training
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@44 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation')
| -rw-r--r-- | gi/posterior-regularisation/PhraseContextModel.java | 317 |
| -rw-r--r-- | gi/posterior-regularisation/README | 3 |
| -rw-r--r-- | gi/posterior-regularisation/alphabet.hh | 61 |
| -rw-r--r-- | gi/posterior-regularisation/canned.concordance | 4 |
| -rw-r--r-- | gi/posterior-regularisation/em.cc | 830 |
| -rw-r--r-- | gi/posterior-regularisation/invert.hh | 45 |
| -rw-r--r-- | gi/posterior-regularisation/log_add.hh | 30 |
| -rw-r--r-- | gi/posterior-regularisation/projected_gradient.cc | 87 |
| -rw-r--r-- | gi/posterior-regularisation/train_pr_global.py | 272 |
9 files changed, 1649 insertions, 0 deletions
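
The files below implement a phrase-to-context clustering model: each phrase p is assigned a latent tag t, which generates the four context positions (LL, L, R, RR) from independent multinomials. PhraseContextModel.java and em.cc train it with plain EM; em.cc and train_pr_global.py additionally support a posterior-regularised E-step that penalises the tag posteriors through per-phrase and per-context constraints. For orientation, the following simply collects the objective that is scattered through the code comments into one place (notation follows those comments; this is a summary, not a new derivation):

  \min_{q}\; \mathrm{KL}(q \,\|\, p_\theta)
    \;+\; \delta \sum_{p,t} \max_{c} \mathbb{E}_q[\phi_{ptc}]
    \;+\; \gamma \sum_{c,t} \max_{p} \mathbb{E}_q[\phi_{ptc}]

whose solution is an exponentiated tilt of the model posterior,

  q(t \mid p, c) \;\propto\; p_\theta(t \mid p, c)\, e^{-\lambda_{ptc}},

with the Lagrange multipliers obtained by minimising the dual over the concordance,

  \min_{\lambda \ge 0}\; \sum_{(p,c)} \mathrm{count}(p,c)\,
     \log \sum_{t} p_\theta(t \mid p, c)\, e^{-\lambda_{ptc}}
  \quad \text{s.t.} \quad \sum_{c} \lambda_{ptc} \le \delta, \qquad \sum_{p} \lambda_{ptc} \le \gamma .

The resulting q then replaces the vanilla posterior when the M-step counts are collected.
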
diff --git a/gi/posterior-regularisation/PhraseContextModel.java b/gi/posterior-regularisation/PhraseContextModel.java new file mode 100644 index 00000000..3af72d54 --- /dev/null +++ b/gi/posterior-regularisation/PhraseContextModel.java @@ -0,0 +1,317 @@ +// Input of the form: +// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1 +// phrase TAB [context]+ +// where context = phrase ||| C=... which are separated by ||| + +// Model parameterised as follows: +// - each phrase, p, is allocated a latent state, t +// - this is used to generate the contexts, c +// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR + +// Training with EM: +// - e-step is estimating q(t) = P(t|p,c) for all x,c +// - m-step is estimating model parameters P(c,t|p) = P(t) P(c|t) +// - PR uses alternate e-step, which first optimizes lambda +// min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc] +// where +// q(t|p,c) propto p(t,c|p) exp( -phi_ptc ) +// Then q is used to obtain expectations for vanilla M-step. + +// Sexing it up: +// - learn p-specific conditionals P(t|p) +// - or generate phrase internals, e.g., generate edge words from +// different distribution to central words +// - agreement between phrase->context model and context->phrase model + +import java.io.*; +import java.util.*; +import java.util.regex.*; +import static java.lang.Math.*; + +class Lexicon<T> +{ + public int insert(T word) + { + Integer i = wordToIndex.get(word); + if (i == null) + { + i = indexToWord.size(); + wordToIndex.put(word, i); + indexToWord.add(word); + } + return i; + } + + public T lookup(int index) + { + return indexToWord.get(index); + } + + public int size() + { + return indexToWord.size(); + } + + private Map<T,Integer> wordToIndex = new HashMap<T,Integer>(); + private List<T> indexToWord = new ArrayList<T>(); +} + +class PhraseContextModel +{ + // model/optimisation configuration parameters + int numTags; + int numPRIterations = 5; + boolean posteriorRegularisation = false; + double constraintScale = 10; + + // book keeping + Lexicon<String> tokenLexicon = new Lexicon<String>(); + int numPositions; + Random rng = new Random(); + + // training set; 1 entry for each unique phrase + PhraseAndContexts training[]; + + // model parameters (learnt) + double emissions[][][]; // position in 0 .. 
3 x tag x word Pr(word | tag, position) + double prior[][]; // phrase x tag Pr(tag | phrase) + //double lambda[][][]; // word x context x tag langrange multipliers + + PhraseContextModel(File infile, int tags) throws IOException + { + numTags = tags; + readTrainingFromFile(new FileReader(infile)); + assert(training.length > 0); + + // now initialise emissions + assert(training[0].contexts.length > 0); + numPositions = training[0].contexts[0].tokens.length; + + emissions = new double[numPositions][numTags][tokenLexicon.size()]; + prior = new double[training.length][numTags]; + //lambda = new double[tokenLexicon.size()][???][numTags] + + for (double[][] emissionTW: emissions) + for (double[] emissionW: emissionTW) + randomise(emissionW); + + for (double[] priorTag: prior) + randomise(priorTag); + } + + void expectationMaximisation(int numIterations) + { + for (int iteration = 0; iteration < numIterations; ++iteration) + { + double emissionsCounts[][][] = new double[numPositions][numTags][tokenLexicon.size()]; + double priorCounts[][] = new double[training.length][numTags]; + + // E-step + double llh = 0; + for (int i = 0; i < training.length; ++i) + { + PhraseAndContexts instance = training[i]; + for (Context ctx: instance.contexts) + { + double probs[] = posterior(i, ctx); + double z = normalise(probs); + llh += log(z) * ctx.count; + + for (int t = 0; t < numTags; ++t) + { + priorCounts[i][t] += ctx.count * probs[t]; + for (int j = 0; j < ctx.tokens.length; ++j) + emissionsCounts[j][t][ctx.tokens[j]] += ctx.count * probs[t]; + } + } + } + + // M-step: normalise + for (double[][] emissionTW: emissionsCounts) + for (double[] emissionW: emissionTW) + normalise(emissionW); + + for (double[] priorTag: priorCounts) + normalise(priorTag); + + emissions = emissionsCounts; + prior = priorCounts; + + System.out.println("Iteration " + iteration + " llh " + llh); + } + } + + static double normalise(double probs[]) + { + double z = 0; + for (double p: probs) + z += p; + for (int i = 0; i < probs.length; ++i) + probs[i] /= z; + return z; + } + + void randomise(double probs[]) + { + double z = 0; + for (int i = 0; i < probs.length; ++i) + { + probs[i] = 10 + rng.nextDouble(); + z += probs[i]; + } + + for (int i = 0; i < probs.length; ++i) + probs[i] /= z; + } + + static int argmax(double probs[]) + { + double m = Double.NEGATIVE_INFINITY; + int mi = -1; + for (int i = 0; i < probs.length; ++i) + { + if (probs[i] > m) + { + m = probs[i]; + mi = i; + } + } + return mi; + } + + double[] posterior(int phraseId, Context c) // unnormalised + { + double probs[] = new double[numTags]; + for (int t = 0; t < numTags; ++t) + { + probs[t] = prior[phraseId][t]; + for (int j = 0; j < c.tokens.length; ++j) + probs[t] *= emissions[j][t][c.tokens[j]]; + } + return probs; + } + + private void readTrainingFromFile(Reader in) throws IOException + { + // read in line-by-line + BufferedReader bin = new BufferedReader(in); + String line; + List<PhraseAndContexts> instances = new ArrayList<PhraseAndContexts>(); + Pattern separator = Pattern.compile(" \\|\\|\\| "); + + int numEdges = 0; + while ((line = bin.readLine()) != null) + { + // split into phrase and contexts + StringTokenizer st = new StringTokenizer(line, "\t"); + assert(st.hasMoreTokens()); + String phrase = st.nextToken(); + assert(st.hasMoreTokens()); + String rest = st.nextToken(); + assert(!st.hasMoreTokens()); + + // process phrase + st = new StringTokenizer(phrase, " "); + List<Integer> ptoks = new ArrayList<Integer>(); + while (st.hasMoreTokens()) + 
ptoks.add(tokenLexicon.insert(st.nextToken())); + + // process contexts + ArrayList<Context> contexts = new ArrayList<Context>(); + String[] parts = separator.split(rest); + assert(parts.length % 2 == 0); + for (int i = 0; i < parts.length; i += 2) + { + // process pairs of strings - context and count + ArrayList<Integer> ctx = new ArrayList<Integer>(); + String ctxString = parts[i]; + String countString = parts[i+1]; + StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); + while (ctxStrtok.hasMoreTokens()) + { + String token = ctxStrtok.nextToken(); + if (!token.equals("<PHRASE>")) + ctx.add(tokenLexicon.insert(token)); + } + + assert(countString.startsWith("C=")); + Context c = new Context(); + c.count = Integer.parseInt(countString.substring(2).trim()); + // damn unboxing doesn't work with toArray + c.tokens = new int[ctx.size()]; + for (int k = 0; k < ctx.size(); ++k) + c.tokens[k] = ctx.get(k); + contexts.add(c); + + numEdges += 1; + } + + // package up + PhraseAndContexts instance = new PhraseAndContexts(); + // damn unboxing doesn't work with toArray + instance.phraseTokens = new int[ptoks.size()]; + for (int k = 0; k < ptoks.size(); ++k) + instance.phraseTokens[k] = ptoks.get(k); + instance.contexts = contexts.toArray(new Context[] {}); + instances.add(instance); + } + + training = instances.toArray(new PhraseAndContexts[] {}); + + System.out.println("Read in " + training.length + " phrases and " + numEdges + " edges"); + } + + void displayPosterior() + { + for (int i = 0; i < training.length; ++i) + { + PhraseAndContexts instance = training[i]; + for (Context ctx: instance.contexts) + { + double probs[] = posterior(i, ctx); + double z = normalise(probs); + + // emit phrase + for (int t: instance.phraseTokens) + System.out.print(tokenLexicon.lookup(t) + " "); + System.out.print("\t"); + for (int c: ctx.tokens) + System.out.print(tokenLexicon.lookup(c) + " "); + System.out.print("||| C=" + ctx.count + " |||"); + + System.out.print(" " + argmax(probs)); + //for (int t = 0; t < numTags; ++t) + //System.out.print(" " + probs[t]); + System.out.println(); + } + } + } + + class PhraseAndContexts + { + int phraseTokens[]; + Context contexts[]; + } + + class Context + { + int count; + int[] tokens; + } + + public static void main(String []args) + { + assert(args.length >= 2); + try + { + PhraseContextModel model = new PhraseContextModel(new File(args[0]), Integer.parseInt(args[1])); + model.expectationMaximisation(Integer.parseInt(args[2])); + model.displayPosterior(); + } + catch (IOException e) + { + System.out.println("Failed to read input file: " + args[0]); + e.printStackTrace(); + } + } +} diff --git a/gi/posterior-regularisation/README b/gi/posterior-regularisation/README new file mode 100644 index 00000000..a3d54ffc --- /dev/null +++ b/gi/posterior-regularisation/README @@ -0,0 +1,3 @@ + 557 ./cdec_extools/extractor -i btec/split.zh-en.al -c 500000 -L 12 -C | sort -t $'\t' -k 1 | ./cdec_extools/mr_stripe_rule_reduce > btec.concordance + 559 wc -l btec.concordance + 588 cat btec.concordance | sed 's/.* //' | awk '{ for (i=1; i < NF; i++) { x=substr($i, 1, 2); if (x == "C=") printf "\n"; else if (x != "||") printf "%s ", $i; }; printf "\n"; }' | sort | uniq | wc -l diff --git a/gi/posterior-regularisation/alphabet.hh b/gi/posterior-regularisation/alphabet.hh new file mode 100644 index 00000000..1db928da --- /dev/null +++ b/gi/posterior-regularisation/alphabet.hh @@ -0,0 +1,61 @@ +#ifndef _alphabet_hh +#define _alphabet_hh + +#include <cassert> +#include <iosfwd> +#include 
<map> +#include <string> +#include <vector> + +// Alphabet: indexes a set of types +template <typename T> +class Alphabet: protected std::map<T, int> +{ +public: + Alphabet() {}; + + bool empty() const { return std::map<T,int>::empty(); } + int size() const { return std::map<T,int>::size(); } + + int operator[](const T &k) const + { + typename std::map<T,int>::const_iterator cit = find(k); + if (cit != std::map<T,int>::end()) + return cit->second; + else + return -1; + } + + int lookup(const T &k) const { return (*this)[k]; } + + int insert(const T &k) + { + int sz = size(); + assert((unsigned) sz == _items.size()); + + std::pair<typename std::map<T,int>::iterator, bool> + ins = std::map<T,int>::insert(make_pair(k, sz)); + + if (ins.second) + _items.push_back(k); + + return ins.first->second; + } + + const T &type(int i) const + { + assert(i >= 0); + assert(i < size()); + return _items[i]; + } + + std::ostream &display(std::ostream &out, int i) const + { + return out << type(i); + } + +private: + std::vector<T> _items; +}; + +#endif diff --git a/gi/posterior-regularisation/canned.concordance b/gi/posterior-regularisation/canned.concordance new file mode 100644 index 00000000..710973ff --- /dev/null +++ b/gi/posterior-regularisation/canned.concordance @@ -0,0 +1,4 @@ +a 0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 ||| 2 2 <PHRASE> 2 2 ||| C=1 +b 0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 +c 2 2 <PHRASE> 2 2 ||| C=1 ||| 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1 +d 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1 diff --git a/gi/posterior-regularisation/em.cc b/gi/posterior-regularisation/em.cc new file mode 100644 index 00000000..f6c9fd68 --- /dev/null +++ b/gi/posterior-regularisation/em.cc @@ -0,0 +1,830 @@ +// Input of the form: +// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1 +// phrase TAB [context]+ +// where context = phrase ||| C=... 
which are separated by ||| + +// Model parameterised as follows: +// - each phrase, p, is allocated a latent state, t +// - this is used to generate the contexts, c +// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR + +// Training with EM: +// - e-step is estimating P(t|p,c) for all x,c +// - m-step is estimating model parameters P(p,c,t) = P(t) P(p|t) P(c|t) + +// Sexing it up: +// - constrain the posteriors P(t|c) and P(t|p) to have few high-magnitude entries +// - improve the generation of phrase internals, e.g., generate edge words from +// different distribution to central words + +#include "alphabet.hh" +#include "log_add.hh" +#include <algorithm> +#include <fstream> +#include <iostream> +#include <iterator> +#include <map> +#include <sstream> +#include <stdexcept> +#include <vector> +#include <tr1/random> +#include <tr1/tuple> +#include <nlopt.h> + +using namespace std; +using namespace std::tr1; + +const int numTags = 5; +const int numIterations = 100; +const bool posterior_regularisation = true; +const double PHRASE_VIOLATION_WEIGHT = 10; +const double CONTEXT_VIOLATION_WEIGHT = 0; +const bool includePhraseProb = false; + +// Data structures: +Alphabet<string> lexicon; +typedef vector<int> Phrase; +typedef tuple<int, int, int, int> Context; +Alphabet<Phrase> phrases; +Alphabet<Context> contexts; + +typedef map<int, int> ContextCounts; +typedef map<int, int> PhraseCounts; +typedef map<int, ContextCounts> PhraseToContextCounts; +typedef map<int, PhraseCounts> ContextToPhraseCounts; + +PhraseToContextCounts concordancePhraseToContexts; +ContextToPhraseCounts concordanceContextToPhrases; + +typedef vector<double> Dist; +typedef vector<Dist> ConditionalDist; +Dist prior; // class -> P(class) +vector<ConditionalDist> probCtx; // word -> class -> P(word | class), for each position of context word +ConditionalDist probPhrase; // class -> P(word | class) +Dist probPhraseLength; // class -> P(length | class) expressed as geometric distribution parameter + +mt19937 randomGenerator((size_t) time(NULL)); +uniform_real<double> uniDist(0.0, 1e-1); +variate_generator< mt19937, uniform_real<double> > rng(randomGenerator, uniDist); + +void addRandomNoise(Dist &d); +void normalise(Dist &d); +void addTo(Dist &d, const Dist &e); +int argmax(const Dist &d); + +map<Phrase, map<Context, int> > lambda_indices; + +Dist conditional_probs(const Phrase &phrase, const Context &context, double *normalisation = 0); +template <typename T> +Dist +penalised_conditionals(const Phrase &phrase, const Context &context, + const T &lambda, double *normalisation); +//Dist penalised_conditionals(const Phrase &phrase, const Context &context, const double *lambda, double *normalisation = 0); +double penalised_log_likelihood(int n, const double *lambda, double *gradient, void *data); +void optimise_lambda(double delta, double gamma, vector<double> &lambda); +double expected_violation_phrases(const double *lambda); +double expected_violation_contexts(const double *lambda); +double primal_kl_divergence(const double *lambda); +double dual(const double *lambda); +void print_primal_dual(const double *lambda, double delta, double gamma); + +ostream &operator<<(ostream &, const Phrase &); +ostream &operator<<(ostream &, const Context &); +ostream &operator<<(ostream &, const Dist &); +ostream &operator<<(ostream &, const ConditionalDist &); + +int +main(int argc, char *argv[]) +{ + randomGenerator.seed(time(NULL)); + + int edges = 0; + istream &input = cin; + while (input.good()) + { 
+ // read the phrase + string phraseString; + Phrase phrase; + getline(input, phraseString, '\t'); + istringstream pinput(phraseString); + string token; + while (pinput >> token) + phrase.push_back(lexicon.insert(token)); + int phraseId = phrases.insert(phrase); + + // read the rest, storing each context + string remainder; + getline(input, remainder, '\n'); + istringstream rinput(remainder); + Context context(-1, -1, -1, -1); + int index = 0; + while (rinput >> token) + { + if (token != "|||" && token != "<PHRASE>") + { + if (index < 4) + { + // eugh! damn templates + switch (index) + { + case 0: get<0>(context) = lexicon.insert(token); break; + case 1: get<1>(context) = lexicon.insert(token); break; + case 2: get<2>(context) = lexicon.insert(token); break; + case 3: get<3>(context) = lexicon.insert(token); break; + default: assert(false); + } + index += 1; + } + else if (token.find("C=") == 0) + { + int contextId = contexts.insert(context); + int count = atoi(token.substr(strlen("C=")).c_str()); + concordancePhraseToContexts[phraseId][contextId] += count; + concordanceContextToPhrases[contextId][phraseId] += count; + index = 0; + context = Context(-1, -1, -1, -1); + edges += 1; + } + } + } + + // trigger EOF + input >> ws; + } + + cout << "Read in " << phrases.size() << " phrases" + << " and " << contexts.size() << " contexts" + << " and " << edges << " edges" + << " and " << lexicon.size() << " word types\n"; + + // FIXME: filter out low count phrases and low count contexts (based on individual words?) + // now populate model parameters with uniform + random noise + prior.resize(numTags, 1.0); + addRandomNoise(prior); + normalise(prior); + + probCtx.resize(4, ConditionalDist(numTags, Dist(lexicon.size(), 1.0))); + if (includePhraseProb) + probPhrase.resize(numTags, Dist(lexicon.size(), 1.0)); + for (int t = 0; t < numTags; ++t) + { + for (int j = 0; j < 4; ++j) + { + addRandomNoise(probCtx[j][t]); + normalise(probCtx[j][t]); + } + if (includePhraseProb) + { + addRandomNoise(probPhrase[t]); + normalise(probPhrase[t]); + } + } + if (includePhraseProb) + { + probPhraseLength.resize(numTags, 0.5); // geometric distribution p=0.5 + addRandomNoise(probPhraseLength); + } + + cout << "\tprior: " << prior << "\n"; + //cout << "\tcontext: " << probCtx << "\n"; + //cout << "\tphrase: " << probPhrase << "\n"; + //cout << "\tphraseLen: " << probPhraseLength << endl; + + vector<double> lambda; + + // now do EM training + for (int iteration = 0; iteration < numIterations; ++iteration) + { + cout << "EM iteration " << iteration << endl; + + if (posterior_regularisation) + optimise_lambda(PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT, lambda); + //cout << "\tlambda " << lambda << endl; + + Dist countsPrior(numTags, 0.0); + vector<ConditionalDist> countsCtx(4, ConditionalDist(numTags, Dist(lexicon.size(), 1e-10))); + ConditionalDist countsPhrase(numTags, Dist(lexicon.size(), 1e-10)); + Dist countsPhraseLength(numTags, 0.0); + Dist nPhrases(numTags, 0.0); + + double llh = 0; + for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); + pcit != concordancePhraseToContexts.end(); ++pcit) + { + const Phrase &phrase = phrases.type(pcit->first); + + // e-step: estimate latent class probs; compile (class,word) stats for m-step + for (ContextCounts::iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + + double z = 0; + Dist tagCounts; + if (!posterior_regularisation) + tagCounts = 
conditional_probs(phrase, context, &z); + else + tagCounts = penalised_conditionals(phrase, context, lambda, &z); + + llh += log(z) * ccit->second; + addTo(countsPrior, tagCounts); // FIXME: times ccit->secon + + for (int t = 0; t < numTags; ++t) + { + for (int j = 0; j < 4; ++j) + countsCtx[j][t][get<0>(context)] += tagCounts[t] * ccit->second; + + if (includePhraseProb) + { + for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) + countsPhrase[t][*pit] += tagCounts[t] * ccit->second; + countsPhraseLength[t] += phrase.size() * tagCounts[t] * ccit->second; + nPhrases[t] += tagCounts[t] * ccit->second; + } + } + } + } + + cout << "M-step\n"; + + // m-step: normalise prior and (class,word) stats and assign to model parameters + normalise(countsPrior); + prior = countsPrior; + for (int t = 0; t < numTags; ++t) + { + //cout << "\t\tt " << t << " prior " << countsPrior[t] << "\n"; + for (int j = 0; j < 4; ++j) + normalise(countsCtx[j][t]); + if (includePhraseProb) + { + normalise(countsPhrase[t]); + countsPhraseLength[t] = nPhrases[t] / countsPhraseLength[t]; + } + } + probCtx = countsCtx; + if (includePhraseProb) + { + probPhrase = countsPhrase; + probPhraseLength = countsPhraseLength; + } + + double *larray = new double[lambda.size()]; + copy(lambda.begin(), lambda.end(), larray); + print_primal_dual(larray, PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT); + delete [] larray; + + //cout << "\tllh " << llh << endl; + //cout << "\tprior: " << prior << "\n"; + //cout << "\tcontext: " << probCtx << "\n"; + //cout << "\tphrase: " << probPhrase << "\n"; + //cout << "\tphraseLen: " << probPhraseLength << "\n"; + } + + // output class membership + for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); + pcit != concordancePhraseToContexts.end(); ++pcit) + { + const Phrase &phrase = phrases.type(pcit->first); + for (ContextCounts::iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + Dist tagCounts = conditional_probs(phrase, context, 0); + cout << phrase << " ||| " << context << " ||| " << argmax(tagCounts) << "\n"; + } + } + + return 0; +} + +void addRandomNoise(Dist &d) +{ + for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) + *dit += rng(); +} + +void normalise(Dist &d) +{ + double z = 0; + for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) + z += *dit; + for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) + *dit /= z; +} + +void addTo(Dist &d, const Dist &e) +{ + assert(d.size() == e.size()); + for (int i = 0; i < (int) d.size(); ++i) + d[i] += e[i]; +} + +int argmax(const Dist &d) +{ + double best = d[0]; + int index = 0; + for (int i = 1; i < (int) d.size(); ++i) + { + if (d[i] > best) + { + best = d[i]; + index = i; + } + } + return index; +} + +ostream &operator<<(ostream &out, const Phrase &phrase) +{ + for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) + lexicon.display(((pit == phrase.begin()) ? out : out << " "), *pit); + return out; +} + +ostream &operator<<(ostream &out, const Context &context) +{ + lexicon.display(out, get<0>(context)); + lexicon.display(out << " ", get<1>(context)); + lexicon.display(out << " <PHRASE> ", get<2>(context)); + lexicon.display(out << " ", get<3>(context)); + return out; +} + +ostream &operator<<(ostream &out, const Dist &dist) +{ + for (Dist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) + out << ((dit == dist.begin()) ? 
"" : " ") << *dit; + return out; +} + +ostream &operator<<(ostream &out, const ConditionalDist &dist) +{ + for (ConditionalDist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) + out << ((dit == dist.begin()) ? "" : "; ") << *dit; + return out; +} + +// FIXME: slow - just use the phrase index, context index to do the mapping +// (n.b. it's a sparse setup, not just equal to 3d array index) +int +lambda_index(const Phrase &phrase, const Context &context, int tag) +{ + return lambda_indices[phrase][context] + tag; +} + +template <typename T> +Dist +penalised_conditionals(const Phrase &phrase, const Context &context, + const T &lambda, double *normalisation) +{ + Dist d = conditional_probs(phrase, context, 0); + + double z = 0; + for (int t = 0; t < numTags; ++t) + { + d[t] *= exp(-lambda[lambda_index(phrase, context, t)]); + z += d[t]; + } + + if (normalisation) + *normalisation = z; + + for (int t = 0; t < numTags; ++t) + d[t] /= z; + + return d; +} + +Dist +conditional_probs(const Phrase &phrase, const Context &context, double *normalisation) +{ + Dist tagCounts(numTags, 0.0); + double z = 0; + for (int t = 0; t < numTags; ++t) + { + double prob = prior[t]; + prob *= (probCtx[0][t][get<0>(context)] * probCtx[1][t][get<1>(context)] * + probCtx[2][t][get<2>(context)] * probCtx[3][t][get<3>(context)]); + + if (includePhraseProb) + { + prob *= pow(1 - probPhraseLength[t], phrase.size() - 1) * probPhraseLength[t]; + for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) + prob *= probPhrase[t][*pit]; + } + + tagCounts[t] = prob; + z += prob; + } + if (normalisation) + *normalisation = z; + + for (int t = 0; t < numTags; ++t) + tagCounts[t] /= z; + + return tagCounts; +} + +double +penalised_log_likelihood(int n, const double *lambda, double *grad, void *) +{ + // return log Z(lambda, theta) over the corpus + // where theta are the global parameters (prior, probCtx*, probPhrase*) + // and lambda are lagrange multipliers for the posterior sparsity constraints + // + // this is formulated as: + // f = log Z(lambda) = sum_i log ( sum_i p_theta(t_i|p_i,c_i) exp [-lambda_{t_i,p_i,c_i}] ) + // where i indexes the training examples - specifying the (p, c) pair (which may occur with count > 1) + // + // with derivative: + // f'_{tpc} = frac { - count(t,p,c) p_theta(t|p,c) exp (-lambda_{t,p,c}) } + // { sum_t' p_theta(t'|p,c) exp (-lambda_{t',p,c}) } + + //cout << "penalised_log_likelihood with lambda "; + //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); + //cout << "\n"; + + double f = 0; + if (grad) + { + for (int i = 0; i < n; ++i) + grad[i] = 0.0; + } + + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + double z = 0; + Dist scores = penalised_conditionals(phrase, context, lambda, &z); + + f += ccit->second * log(z); + //cout << "\tphrase: " << phrase << " context: " << context << " count: " << ccit->second << " z " << z << endl; + //cout << "\t\tscores: " << scores << "\n"; + + if (grad) + { + for (int t = 0; t < numTags; ++t) + { + int i = lambda_index(phrase, context, t); // FIXME: redundant lookups + assert(grad[i] == 0.0); + grad[i] = - ccit->second * scores[t]; + } + } + } + } + + //cout << "penalised_log_likelihood returning " << f; + //if (grad) + //{ + //cout << 
"\ngradient: "; + //copy(grad, grad+n, ostream_iterator<double>(cout, " ")); + //} + //cout << "\n"; + + return f; +} + +typedef struct +{ + // one of p or c should be set to -1, in which case it will be marginalised out + // i.e. sum_p' lambda_{p'ct} <= threshold + // or sum_c' lambda_{pc't} <= threshold + int p, c, t, threshold; +} constraint_data; + +double +constraint_and_gradient(int n, const double *lambda, double *grad, void *data) +{ + constraint_data *d = (constraint_data *) data; + assert(d->t >= 0); + assert(d->threshold >= 0); + + //cout << "constraint_and_gradient: t " << d->t << " p " << d->p << " c " << d->c << " tau " << d->threshold << endl; + //cout << "\tlambda "; + //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); + //cout << "\n"; + + // FIXME: it's crazy to use a dense gradient here => will only have a handful of non-zero entries + if (grad) + { + for (int i = 0; i < n; ++i) + grad[i] = 0.0; + } + + //cout << "constraint_and_gradient: " << d->p << "; " << d->c << "; " << d->t << "; " << d->threshold << endl; + + if (d->p >= 0) + { + assert(d->c < 0); + // sum_c lambda_pct <= delta [a.k.a. threshold] + // => sum_c lambda_pct - delta <= 0 + // derivative_pct = { 1, if p and t match; 0, otherwise } + + double val = -d->threshold; + + const Phrase &phrase = phrases.type(d->p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(d->p); + assert(pcit != concordancePhraseToContexts.end()); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + int i = lambda_index(phrase, context, d->t); + val += lambda[i]; + if (grad) grad[i] = 1; + } + //cout << "\treturning " << val << endl; + + return val; + } + else + { + assert(d->c >= 0); + assert(d->p < 0); + // sum_p lambda_pct <= gamma [a.k.a. threshold] + // => sum_p lambda_pct - gamma <= 0 + // derivative_pct = { 1, if c and t match; 0, otherwise } + + double val = -d->threshold; + + const Context &context = contexts.type(d->c); + ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(d->c); + assert(cpit != concordanceContextToPhrases.end()); + for (PhraseCounts::iterator pcit = cpit->second.begin(); + pcit != cpit->second.end(); ++pcit) + { + const Phrase &phrase = phrases.type(pcit->first); + int i = lambda_index(phrase, context, d->t); + val += lambda[i]; + if (grad) grad[i] = 1; + } + //cout << "\treturning " << val << endl; + + return val; + } +} + +void +optimise_lambda(double delta, double gamma, vector<double> &lambdav) +{ + int num_lambdas = lambdav.size(); + if (lambda_indices.empty() || lambdav.empty()) + { + lambda_indices.clear(); + lambdav.clear(); + + int i = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + lambda_indices[phrase][context] = i; + i += numTags; + } + } + num_lambdas = i; + lambdav.resize(num_lambdas); + } + //cout << "optimise_lambda: #langrange multipliers " << num_lambdas << endl; + + // FIXME: better to work with an implicit representation to save memory usage + int num_constraints = (((delta > 0) ? phrases.size() : 0) + ((gamma > 0) ? 
contexts.size() : 0)) * numTags; + //cout << "optimise_lambda: #constraints " << num_constraints << endl; + constraint_data *data = new constraint_data[num_constraints]; + int i = 0; + if (delta > 0) + { + for (int p = 0; p < phrases.size(); ++p) + { + for (int t = 0; t < numTags; ++t, ++i) + { + constraint_data &d = data[i]; + d.p = p; + d.c = -1; + d.t = t; + d.threshold = delta; + } + } + } + + if (gamma > 0) + { + for (int c = 0; c < contexts.size(); ++c) + { + for (int t = 0; t < numTags; ++t, ++i) + { + constraint_data &d = data[i]; + d.p = -1; + d.c = c; + d.t = t; + d.threshold = gamma; + } + } + } + assert(i == num_constraints); + + double lambda[num_lambdas]; + double lb[num_lambdas], ub[num_lambdas]; + for (i = 0; i < num_lambdas; ++i) + { + lambda[i] = lambdav[i]; // starting value + lb[i] = 0; // lower bound + if (delta <= 0) // upper bound + ub[i] = gamma; + else if (gamma <= 0) + ub[i] = delta; + else + assert(false); + } + + //print_primal_dual(lambda, delta, gamma); + + double minf; + int error_code = nlopt_minimize_constrained(NLOPT_LN_COBYLA, num_lambdas, penalised_log_likelihood, NULL, + num_constraints, constraint_and_gradient, data, sizeof(constraint_data), + lb, ub, lambda, &minf, -HUGE_VAL, 0.0, 0.0, 1e-4, NULL, 0, 0.0); + //cout << "optimise error code " << error_code << endl; + + //print_primal_dual(lambda, delta, gamma); + + delete [] data; + + if (error_code < 0) + cout << "WARNING: optimisation failed with error code: " << error_code << endl; + //else + //{ + //cout << "success; minf " << minf << endl; + //print_primal_dual(lambda, delta, gamma); + //} + + lambdav = vector<double>(&lambda[0], &lambda[0] + num_lambdas); +} + +// FIXME: inefficient - cache the scores +double +expected_violation_phrases(const double *lambda) +{ + // sum_pt max_c E_q[phi_pct] + double violation = 0; + + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + + for (int t = 0; t < numTags; ++t) + { + double best = 0; + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + Dist scores = penalised_conditionals(phrase, context, lambda, 0); + best = max(best, scores[t]); + } + violation += best; + } + } + + return violation; +} + +// FIXME: inefficient - cache the scores +double +expected_violation_contexts(const double *lambda) +{ + // sum_ct max_p E_q[phi_pct] + double violation = 0; + + for (int c = 0; c < contexts.size(); ++c) + { + const Context &context = contexts.type(c); + ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(c); + + for (int t = 0; t < numTags; ++t) + { + double best = 0; + for (PhraseCounts::iterator pit = cpit->second.begin(); + pit != cpit->second.end(); ++pit) + { + const Phrase &phrase = phrases.type(pit->first); + Dist scores = penalised_conditionals(phrase, context, lambda, 0); + best = max(best, scores[t]); + } + violation += best; + } + } + + return violation; +} + +// FIXME: possibly inefficient +double +primal_likelihood() // FIXME: primal evaluation needs to use lambda and calculate l1linf terms +{ + double llh = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context 
&context = contexts.type(ccit->first); + double z = 0; + Dist scores = conditional_probs(phrase, context, &z); + llh += ccit->second * log(z); + } + } + return llh; +} + +// FIXME: inefficient - cache the scores +double +primal_kl_divergence(const double *lambda) +{ + // return KL(q || p) = sum_y q(y) { log q(y) - log p(y | x) } + // = sum_y q(y) { log p(y | x) - lambda . phi(x, y) - log Z - log p(y | x) } + // = sum_y q(y) { - lambda . phi(x, y) } - log Z + // and q(y) factors with each edge, ditto for Z + + double feature_sum = 0, log_z = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + + double local_z = 0; + double local_f = 0; + Dist d = conditional_probs(phrase, context, 0); + for (int t = 0; t < numTags; ++t) + { + int i = lambda_index(phrase, context, t); + double s = d[t] * exp(-lambda[i]); + local_f += lambda[i] * s; + local_z += s; + } + + log_z += ccit->second * log(local_z); + feature_sum += ccit->second * (local_f / local_z); + } + } + + return -feature_sum - log_z; +} + +// FIXME: inefficient - cache the scores +double +dual(const double *lambda) +{ + // return log(Z) = - log { sum_y p(y | x) exp( - lambda . phi(x, y) } + // n.b. have flipped the sign as we're minimising + + double z = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + double lz = 0; + Dist scores = penalised_conditionals(phrase, context, lambda, &z); + z += lz * ccit->second; + } + } + return log(z); +} + +void +print_primal_dual(const double *lambda, double delta, double gamma) +{ + double likelihood = primal_likelihood(); + double kl = primal_kl_divergence(lambda); + double sum_pt = expected_violation_phrases(lambda); + double sum_ct = expected_violation_contexts(lambda); + //double d = dual(lambda); + + cout << "\tllh=" << likelihood + << " kl=" << kl + << " violations phrases=" << sum_pt + << " contexts=" << sum_ct + //<< " primal=" << (kl + delta * sum_pt + gamma * sum_ct) + //<< " dual=" << d + << " objective=" << (likelihood - kl + delta * sum_pt + gamma * sum_ct) + << endl; +} diff --git a/gi/posterior-regularisation/invert.hh b/gi/posterior-regularisation/invert.hh new file mode 100644 index 00000000..d06356e9 --- /dev/null +++ b/gi/posterior-regularisation/invert.hh @@ -0,0 +1,45 @@ +// The following code inverts the matrix input using LU-decomposition with +// backsubstitution of unit vectors. Reference: Numerical Recipies in C, 2nd +// ed., by Press, Teukolsky, Vetterling & Flannery. +// Code written by Fredrik Orderud. 
+// http://www.crystalclearsoftware.com/cgi-bin/boost_wiki/wiki.pl?LU_Matrix_Inversion + +#ifndef INVERT_MATRIX_HPP +#define INVERT_MATRIX_HPP + +// REMEMBER to update "lu.hpp" header includes from boost-CVS +#include <boost/numeric/ublas/vector.hpp> +#include <boost/numeric/ublas/vector_proxy.hpp> +#include <boost/numeric/ublas/matrix.hpp> +#include <boost/numeric/ublas/triangular.hpp> +#include <boost/numeric/ublas/lu.hpp> +#include <boost/numeric/ublas/io.hpp> + +namespace ublas = boost::numeric::ublas; + +/* Matrix inversion routine. + Uses lu_factorize and lu_substitute in uBLAS to invert a matrix */ +template<class T> +bool invert_matrix(const ublas::matrix<T>& input, ublas::matrix<T>& inverse) +{ + using namespace boost::numeric::ublas; + typedef permutation_matrix<std::size_t> pmatrix; + // create a working copy of the input + matrix<T> A(input); + // create a permutation matrix for the LU-factorization + pmatrix pm(A.size1()); + + // perform LU-factorization + int res = lu_factorize(A,pm); + if( res != 0 ) return false; + + // create identity matrix of "inverse" + inverse.assign(ublas::identity_matrix<T>(A.size1())); + + // backsubstitute to get the inverse + lu_substitute(A, pm, inverse); + + return true; +} + +#endif //INVERT_MATRIX_HPP diff --git a/gi/posterior-regularisation/log_add.hh b/gi/posterior-regularisation/log_add.hh new file mode 100644 index 00000000..e0620c5a --- /dev/null +++ b/gi/posterior-regularisation/log_add.hh @@ -0,0 +1,30 @@ +#ifndef log_add_hh +#define log_add_hh + +#include <limits> +#include <iostream> +#include <cassert> +#include <cmath> + +template <typename T> +struct Log +{ + static T zero() { return -std::numeric_limits<T>::infinity(); } + + static T add(T l1, T l2) + { + if (l1 == zero()) return l2; + if (l1 > l2) + return l1 + std::log(1 + exp(l2 - l1)); + else + return l2 + std::log(1 + exp(l1 - l2)); + } + + static T subtract(T l1, T l2) + { + //std::assert(l1 >= l2); + return l1 + log(1 - exp(l2 - l1)); + } +}; + +#endif diff --git a/gi/posterior-regularisation/projected_gradient.cc b/gi/posterior-regularisation/projected_gradient.cc new file mode 100644 index 00000000..f7c39817 --- /dev/null +++ b/gi/posterior-regularisation/projected_gradient.cc @@ -0,0 +1,87 @@ +// +// Minimises given functional using the projected gradient method. Based on +// algorithm and demonstration example in Linear and Nonlinear Programming, +// Luenberger and Ye, 3rd ed., p 370. 
+// + +#include "invert.hh" +#include <iostream> + +using namespace std; + +double +f(double x1, double x2, double x3, double x4) +{ + return x1 * x1 + x2 * x2 + x3 * x3 + x4 * x4 - 2 * x1 - 3 * x4; +} + +ublas::vector<double> +g(double x1, double x2, double x3, double x4) +{ + ublas::vector<double> v(4); + v(0) = 2 * x1 - 2; + v(1) = 2 * x2; + v(2) = 2 * x3; + v(3) = 2 * x4 - 3; + return v; +} + +ublas::matrix<double> +activeConstraints(double x1, double x2, double x3, double x4) +{ + int n = 2; + if (x1 == 0) ++n; + if (x2 == 0) ++n; + if (x3 == 0) ++n; + if (x4 == 0) ++n; + + ublas::matrix<double> a(n,4); + a(0, 0) = 2; a(0, 1) = 1; a(0, 2) = 1; a(0, 3) = 4; + a(1, 0) = 1; a(1, 1) = 1; a(1, 2) = 2; a(1, 3) = 1; + + int c = 2; + if (x1 == 0) a(c++, 0) = 1; + if (x2 == 0) a(c++, 1) = 1; + if (x3 == 0) a(c++, 2) = 1; + if (x4 == 0) a(c++, 3) = 1; + + return a; +} + +ublas::matrix<double> +projection(const ublas::matrix<double> &a) +{ + ublas::matrix<double> aT = ublas::trans(a); + ublas::matrix<double> inv(a.size1(), a.size1()); + bool ok = invert_matrix(ublas::matrix<double>(ublas::prod(a, aT)), inv); + assert(ok && "Failed to invert matrix"); + return ublas::identity_matrix<double>(4) - + ublas::prod(aT, ublas::matrix<double>(ublas::prod(inv, a))); +} + +int main(int argc, char *argv[]) +{ + double x1 = 2, x2 = 2, x3 = 1, x4 = 0; + + double fval = f(x1, x2, x3, x4); + cout << "f = " << fval << endl; + ublas::vector<double> grad = g(x1, x2, x3, x4); + cout << "g = " << grad << endl; + ublas::matrix<double> A = activeConstraints(x1, x2, x3, x4); + cout << "A = " << A << endl; + ublas::matrix<double> P = projection(A); + cout << "P = " << P << endl; + // the direction of movement + ublas::vector<double> d = prod(P, grad); + cout << "d = " << (d / d(0)) << endl; + + // special case for d = 0 + + // next solve for limits on the line search + + // then use golden rule technique between these values (if bounded) + + // or simple Armijo's rule technique + + return 0; +} diff --git a/gi/posterior-regularisation/train_pr_global.py b/gi/posterior-regularisation/train_pr_global.py new file mode 100644 index 00000000..467069ef --- /dev/null +++ b/gi/posterior-regularisation/train_pr_global.py @@ -0,0 +1,272 @@ +import sys +import scipy.optimize +from numpy import * +from numpy.random import random + +# +# Step 1: load the concordance counts +# + +edges_phrase_to_context = {} +edges_context_to_phrase = {} +types = {} +num_edges = 0 + +for line in sys.stdin: + phrase, rest = line.strip().split('\t') + parts = rest.split('|||') + for i in range(0, len(parts), 2): + context, count = parts[i:i+2] + + ctx = tuple(filter(lambda x: x != '<PHRASE>', context.split())) + cnt = int(count.strip()[2:]) + ccs = edges_phrase_to_context.setdefault(phrase, {}) + ccs[ctx] = cnt + pcs = edges_context_to_phrase.setdefault(ctx, {}) + pcs[phrase] = cnt + + for token in ctx: + types.setdefault(token, len(types)) + for token in phrase.split(): + types.setdefault(token, len(types)) + + num_edges += 1 + +print 'Read in', num_edges, 'edges and', len(types), 'word types' + +# +# Step 2: initialise the model parameters +# + +num_tags = 5 +num_types = len(types) +delta = int(sys.argv[1]) +gamma = int(sys.argv[2]) + +def normalise(a): + return a / sum(a) + +# Pr(tag) +tagDist = normalise(random(num_tags)+1) +# Pr(context at pos i = w | tag) indexed by i, tag, word +contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)] +# PR langrange multipliers +lamba = zeros(num_edges * num_tags) 
+lamba_index = {} +next = 0 +for phrase, ccs in edges_phrase_to_context.items(): + for context in ccs.keys(): + lamba_index[phrase,context] = next + next += num_tags + +# +# Step 3: expectation maximisation +# + +for iteration in range(20): + tagCounts = zeros(num_tags) + contextWordCounts = [[zeros(num_types) for t in range(num_tags)] for i in range(4)] + + #print 'tagDist', tagDist + #print 'contextWordCounts[0][0]', contextWordCounts[0][0] + + # Tune lambda + # dual: min log Z(lamba) s.t. lamba >= 0; + # sum_c lamba_pct <= delta; sum_p lamba_pct <= gamma + def dual(ls): + logz = 0 + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + local_z = 0 + for t in range(num_tags): + local_z += conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) + logz += log(local_z) * count + + #print 'ls', ls + #print 'lambda', list(ls) + #print 'dual', logz + return logz + + def primal(ls): + # FIXME: returns negative values for KL (impossible) + logz = dual(ls) + kl = -logz + + expectations = zeros(lamba.shape) + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + scores = zeros(num_tags) + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) + local_z = sum(scores) + + for t in range(num_tags): + li = lamba_index[phrase,context] + t + expectations[li] = scores[t] / local_z * count + kl -= expectations[li] * ls[li] + + pt_l1linf = 0 + for phrase, ccs in edges_phrase_to_context.items(): + for t in range(num_tags): + best = -1e500 + for context, count in ccs.items(): + li = lamba_index[phrase,context] + t + s = expectations[li] + if s > best: best = s + pt_l1linf += best + + ct_l1linf = 0 + for context, pcs in edges_context_to_phrase.items(): + for t in range(num_tags): + best = -1e500 + for phrase, count in pcs.items(): + li = lamba_index[phrase,context] + t + s = expectations[li] + if s > best: best = s + ct_l1linf += best + + return kl, pt_l1linf, ct_l1linf, kl + delta * pt_l1linf + gamma * ct_l1linf + + def dual_deriv(ls): + # d/dl log(z) = E_q[phi] + deriv = zeros(num_edges * num_tags) + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + scores = zeros(num_tags) + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) + local_z = sum(scores) + + for t in range(num_tags): + deriv[lamba_index[phrase,context] + t] -= count * scores[t] / local_z + + #print 'ddual', deriv + return deriv + + def constraints(ls): + cons = [] + if delta > 0: + for phrase, ccs in edges_phrase_to_context.items(): + for t in range(num_tags): + total = delta + for cprime in ccs.keys(): + total -= ls[lamba_index[phrase, cprime] + t] + cons.append(total) + + if gamma > 0: + for context, pcs in edges_context_to_phrase.items(): + for t in range(num_tags): + 
total = gamma + for pprime in pcs.keys(): + total -= ls[lamba_index[pprime, context] + t] + cons.append(total) + #print 'cons', cons + return cons + + def constraints_deriv(ls): + cons = [] + if delta > 0: + for phrase, ccs in edges_phrase_to_context.items(): + for t in range(num_tags): + d = zeros(num_edges * num_tags) + for cprime in ccs.keys(): + d[lamba_index[phrase, cprime] + t] = -1 + cons.append(d) + + if gamma > 0: + for context, pcs in edges_context_to_phrase.items(): + for t in range(num_tags): + d = zeros(num_edges * num_tags) + for pprime in pcs.keys(): + d[lamba_index[pprime, context] + t] = -1 + cons.append(d) + #print 'dcons', cons + return cons + + print 'Pre lambda optimisation dual', dual(lamba), 'primal', primal(lamba) + lamba = scipy.optimize.fmin_slsqp(dual, lamba, + bounds=[(0, delta)] * (num_edges * num_tags), + f_ieqcons=constraints, + fprime=dual_deriv, + fprime_ieqcons=constraints_deriv, + iprint=0) + print 'Post lambda optimisation dual', dual(lamba), 'primal', primal(lamba) + + # E-step + llh = z = 0 + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + llh += log(cz) * count + + scores = zeros(num_tags) + li = lamba_index[phrase, context] + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-lamba[li + t]) + z += count * sum(scores) + + tagCounts += count * scores + for i in range(4): + for t in range(num_tags): + contextWordCounts[i][t][types[context[i]]] += count * scores[t] + + print 'iteration', iteration, 'llh', llh, 'logz', log(z) + + # M-step + tagDist = normalise(tagCounts) + for i in range(4): + for t in range(num_tags): + contextWordDist[i][t] = normalise(contextWordCounts[i][t]) + + +for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + scores = zeros(num_tags) + li = lamba_index[phrase, context] + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-lamba[li + t]) + + print '%s\t%s ||| C=%d ||| %d |||' % (phrase, context, count, argmax(scores)), scores / sum(scores) |
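
For reference, a plausible way to exercise these programs end to end, pieced together from the README and from the argument handling in the two trainers. The concordance path, tag counts, and iteration counts below are illustrative only; the Python script is Python 2 and needs numpy/scipy.

# 1. Build a phrase/context concordance (first command of the README; paths are illustrative)
./cdec_extools/extractor -i btec/split.zh-en.al -c 500000 -L 12 -C \
  | sort -t $'\t' -k 1 \
  | ./cdec_extools/mr_stripe_rule_reduce > btec.concordance

# 2. Plain EM baseline (Java): arguments are <concordance> <numTags> <numIterations>
javac PhraseContextModel.java
java PhraseContextModel btec.concordance 5 20

# 3. Posterior-regularised EM (Python 2): arguments are <delta> <gamma>, concordance on stdin
python train_pr_global.py 10 0 < btec.concordance

# 4. Quick sanity check on the tiny canned example shipped with the commit
java PhraseContextModel canned.concordance 2 10
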