summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation
diff options
context:
space:
mode:
authortrevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-28 19:34:58 +0000
committertrevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-28 19:34:58 +0000
commitab3534c45f463e541f3baf05006a50b64e3bbe31 (patch)
treea778207e83c2d490593b97334d112e6ec1db310d /gi/posterior-regularisation
parent207e694dd81046c2d2fa740f565ec2bb31f8152e (diff)
First bits of code for PR training
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@44 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation')
-rw-r--r--gi/posterior-regularisation/PhraseContextModel.java317
-rw-r--r--gi/posterior-regularisation/README3
-rw-r--r--gi/posterior-regularisation/alphabet.hh61
-rw-r--r--gi/posterior-regularisation/canned.concordance4
-rw-r--r--gi/posterior-regularisation/em.cc830
-rw-r--r--gi/posterior-regularisation/invert.hh45
-rw-r--r--gi/posterior-regularisation/log_add.hh30
-rw-r--r--gi/posterior-regularisation/projected_gradient.cc87
-rw-r--r--gi/posterior-regularisation/train_pr_global.py272
9 files changed, 1649 insertions, 0 deletions
diff --git a/gi/posterior-regularisation/PhraseContextModel.java b/gi/posterior-regularisation/PhraseContextModel.java
new file mode 100644
index 00000000..3af72d54
--- /dev/null
+++ b/gi/posterior-regularisation/PhraseContextModel.java
@@ -0,0 +1,317 @@
+// Input of the form:
+// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1
+// phrase TAB [context]+
+// where context = phrase ||| C=... which are separated by |||
+
+// Model parameterised as follows:
+// - each phrase, p, is allocated a latent state, t
+// - this is used to generate the contexts, c
+// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR
+
+// Training with EM:
+// - e-step is estimating q(t) = P(t|p,c) for all x,c
+// - m-step is estimating model parameters P(c,t|p) = P(t) P(c|t)
+// - PR uses alternate e-step, which first optimizes lambda
+// min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc]
+// where
+// q(t|p,c) propto p(t,c|p) exp( -phi_ptc )
+// Then q is used to obtain expectations for vanilla M-step.
+
+// Sexing it up:
+// - learn p-specific conditionals P(t|p)
+// - or generate phrase internals, e.g., generate edge words from
+// different distribution to central words
+// - agreement between phrase->context model and context->phrase model
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+import static java.lang.Math.*;
+
+class Lexicon<T>
+{
+ public int insert(T word)
+ {
+ Integer i = wordToIndex.get(word);
+ if (i == null)
+ {
+ i = indexToWord.size();
+ wordToIndex.put(word, i);
+ indexToWord.add(word);
+ }
+ return i;
+ }
+
+ public T lookup(int index)
+ {
+ return indexToWord.get(index);
+ }
+
+ public int size()
+ {
+ return indexToWord.size();
+ }
+
+ private Map<T,Integer> wordToIndex = new HashMap<T,Integer>();
+ private List<T> indexToWord = new ArrayList<T>();
+}
+
+class PhraseContextModel
+{
+ // model/optimisation configuration parameters
+ int numTags;
+ int numPRIterations = 5;
+ boolean posteriorRegularisation = false;
+ double constraintScale = 10;
+
+ // book keeping
+ Lexicon<String> tokenLexicon = new Lexicon<String>();
+ int numPositions;
+ Random rng = new Random();
+
+ // training set; 1 entry for each unique phrase
+ PhraseAndContexts training[];
+
+ // model parameters (learnt)
+ double emissions[][][]; // position in 0 .. 3 x tag x word Pr(word | tag, position)
+ double prior[][]; // phrase x tag Pr(tag | phrase)
+ //double lambda[][][]; // word x context x tag langrange multipliers
+
+ PhraseContextModel(File infile, int tags) throws IOException
+ {
+ numTags = tags;
+ readTrainingFromFile(new FileReader(infile));
+ assert(training.length > 0);
+
+ // now initialise emissions
+ assert(training[0].contexts.length > 0);
+ numPositions = training[0].contexts[0].tokens.length;
+
+ emissions = new double[numPositions][numTags][tokenLexicon.size()];
+ prior = new double[training.length][numTags];
+ //lambda = new double[tokenLexicon.size()][???][numTags]
+
+ for (double[][] emissionTW: emissions)
+ for (double[] emissionW: emissionTW)
+ randomise(emissionW);
+
+ for (double[] priorTag: prior)
+ randomise(priorTag);
+ }
+
+ void expectationMaximisation(int numIterations)
+ {
+ for (int iteration = 0; iteration < numIterations; ++iteration)
+ {
+ double emissionsCounts[][][] = new double[numPositions][numTags][tokenLexicon.size()];
+ double priorCounts[][] = new double[training.length][numTags];
+
+ // E-step
+ double llh = 0;
+ for (int i = 0; i < training.length; ++i)
+ {
+ PhraseAndContexts instance = training[i];
+ for (Context ctx: instance.contexts)
+ {
+ double probs[] = posterior(i, ctx);
+ double z = normalise(probs);
+ llh += log(z) * ctx.count;
+
+ for (int t = 0; t < numTags; ++t)
+ {
+ priorCounts[i][t] += ctx.count * probs[t];
+ for (int j = 0; j < ctx.tokens.length; ++j)
+ emissionsCounts[j][t][ctx.tokens[j]] += ctx.count * probs[t];
+ }
+ }
+ }
+
+ // M-step: normalise
+ for (double[][] emissionTW: emissionsCounts)
+ for (double[] emissionW: emissionTW)
+ normalise(emissionW);
+
+ for (double[] priorTag: priorCounts)
+ normalise(priorTag);
+
+ emissions = emissionsCounts;
+ prior = priorCounts;
+
+ System.out.println("Iteration " + iteration + " llh " + llh);
+ }
+ }
+
+ static double normalise(double probs[])
+ {
+ double z = 0;
+ for (double p: probs)
+ z += p;
+ for (int i = 0; i < probs.length; ++i)
+ probs[i] /= z;
+ return z;
+ }
+
+ void randomise(double probs[])
+ {
+ double z = 0;
+ for (int i = 0; i < probs.length; ++i)
+ {
+ probs[i] = 10 + rng.nextDouble();
+ z += probs[i];
+ }
+
+ for (int i = 0; i < probs.length; ++i)
+ probs[i] /= z;
+ }
+
+ static int argmax(double probs[])
+ {
+ double m = Double.NEGATIVE_INFINITY;
+ int mi = -1;
+ for (int i = 0; i < probs.length; ++i)
+ {
+ if (probs[i] > m)
+ {
+ m = probs[i];
+ mi = i;
+ }
+ }
+ return mi;
+ }
+
+ double[] posterior(int phraseId, Context c) // unnormalised
+ {
+ double probs[] = new double[numTags];
+ for (int t = 0; t < numTags; ++t)
+ {
+ probs[t] = prior[phraseId][t];
+ for (int j = 0; j < c.tokens.length; ++j)
+ probs[t] *= emissions[j][t][c.tokens[j]];
+ }
+ return probs;
+ }
+
+ private void readTrainingFromFile(Reader in) throws IOException
+ {
+ // read in line-by-line
+ BufferedReader bin = new BufferedReader(in);
+ String line;
+ List<PhraseAndContexts> instances = new ArrayList<PhraseAndContexts>();
+ Pattern separator = Pattern.compile(" \\|\\|\\| ");
+
+ int numEdges = 0;
+ while ((line = bin.readLine()) != null)
+ {
+ // split into phrase and contexts
+ StringTokenizer st = new StringTokenizer(line, "\t");
+ assert(st.hasMoreTokens());
+ String phrase = st.nextToken();
+ assert(st.hasMoreTokens());
+ String rest = st.nextToken();
+ assert(!st.hasMoreTokens());
+
+ // process phrase
+ st = new StringTokenizer(phrase, " ");
+ List<Integer> ptoks = new ArrayList<Integer>();
+ while (st.hasMoreTokens())
+ ptoks.add(tokenLexicon.insert(st.nextToken()));
+
+ // process contexts
+ ArrayList<Context> contexts = new ArrayList<Context>();
+ String[] parts = separator.split(rest);
+ assert(parts.length % 2 == 0);
+ for (int i = 0; i < parts.length; i += 2)
+ {
+ // process pairs of strings - context and count
+ ArrayList<Integer> ctx = new ArrayList<Integer>();
+ String ctxString = parts[i];
+ String countString = parts[i+1];
+ StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
+ while (ctxStrtok.hasMoreTokens())
+ {
+ String token = ctxStrtok.nextToken();
+ if (!token.equals("<PHRASE>"))
+ ctx.add(tokenLexicon.insert(token));
+ }
+
+ assert(countString.startsWith("C="));
+ Context c = new Context();
+ c.count = Integer.parseInt(countString.substring(2).trim());
+ // damn unboxing doesn't work with toArray
+ c.tokens = new int[ctx.size()];
+ for (int k = 0; k < ctx.size(); ++k)
+ c.tokens[k] = ctx.get(k);
+ contexts.add(c);
+
+ numEdges += 1;
+ }
+
+ // package up
+ PhraseAndContexts instance = new PhraseAndContexts();
+ // damn unboxing doesn't work with toArray
+ instance.phraseTokens = new int[ptoks.size()];
+ for (int k = 0; k < ptoks.size(); ++k)
+ instance.phraseTokens[k] = ptoks.get(k);
+ instance.contexts = contexts.toArray(new Context[] {});
+ instances.add(instance);
+ }
+
+ training = instances.toArray(new PhraseAndContexts[] {});
+
+ System.out.println("Read in " + training.length + " phrases and " + numEdges + " edges");
+ }
+
+ void displayPosterior()
+ {
+ for (int i = 0; i < training.length; ++i)
+ {
+ PhraseAndContexts instance = training[i];
+ for (Context ctx: instance.contexts)
+ {
+ double probs[] = posterior(i, ctx);
+ double z = normalise(probs);
+
+ // emit phrase
+ for (int t: instance.phraseTokens)
+ System.out.print(tokenLexicon.lookup(t) + " ");
+ System.out.print("\t");
+ for (int c: ctx.tokens)
+ System.out.print(tokenLexicon.lookup(c) + " ");
+ System.out.print("||| C=" + ctx.count + " |||");
+
+ System.out.print(" " + argmax(probs));
+ //for (int t = 0; t < numTags; ++t)
+ //System.out.print(" " + probs[t]);
+ System.out.println();
+ }
+ }
+ }
+
+ class PhraseAndContexts
+ {
+ int phraseTokens[];
+ Context contexts[];
+ }
+
+ class Context
+ {
+ int count;
+ int[] tokens;
+ }
+
+ public static void main(String []args)
+ {
+ assert(args.length >= 2);
+ try
+ {
+ PhraseContextModel model = new PhraseContextModel(new File(args[0]), Integer.parseInt(args[1]));
+ model.expectationMaximisation(Integer.parseInt(args[2]));
+ model.displayPosterior();
+ }
+ catch (IOException e)
+ {
+ System.out.println("Failed to read input file: " + args[0]);
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/gi/posterior-regularisation/README b/gi/posterior-regularisation/README
new file mode 100644
index 00000000..a3d54ffc
--- /dev/null
+++ b/gi/posterior-regularisation/README
@@ -0,0 +1,3 @@
+ 557 ./cdec_extools/extractor -i btec/split.zh-en.al -c 500000 -L 12 -C | sort -t $'\t' -k 1 | ./cdec_extools/mr_stripe_rule_reduce > btec.concordance
+ 559 wc -l btec.concordance
+ 588 cat btec.concordance | sed 's/.* //' | awk '{ for (i=1; i < NF; i++) { x=substr($i, 1, 2); if (x == "C=") printf "\n"; else if (x != "||") printf "%s ", $i; }; printf "\n"; }' | sort | uniq | wc -l
diff --git a/gi/posterior-regularisation/alphabet.hh b/gi/posterior-regularisation/alphabet.hh
new file mode 100644
index 00000000..1db928da
--- /dev/null
+++ b/gi/posterior-regularisation/alphabet.hh
@@ -0,0 +1,61 @@
+#ifndef _alphabet_hh
+#define _alphabet_hh
+
+#include <cassert>
+#include <iosfwd>
+#include <map>
+#include <string>
+#include <vector>
+
+// Alphabet: indexes a set of types
+template <typename T>
+class Alphabet: protected std::map<T, int>
+{
+public:
+ Alphabet() {};
+
+ bool empty() const { return std::map<T,int>::empty(); }
+ int size() const { return std::map<T,int>::size(); }
+
+ int operator[](const T &k) const
+ {
+ typename std::map<T,int>::const_iterator cit = find(k);
+ if (cit != std::map<T,int>::end())
+ return cit->second;
+ else
+ return -1;
+ }
+
+ int lookup(const T &k) const { return (*this)[k]; }
+
+ int insert(const T &k)
+ {
+ int sz = size();
+ assert((unsigned) sz == _items.size());
+
+ std::pair<typename std::map<T,int>::iterator, bool>
+ ins = std::map<T,int>::insert(make_pair(k, sz));
+
+ if (ins.second)
+ _items.push_back(k);
+
+ return ins.first->second;
+ }
+
+ const T &type(int i) const
+ {
+ assert(i >= 0);
+ assert(i < size());
+ return _items[i];
+ }
+
+ std::ostream &display(std::ostream &out, int i) const
+ {
+ return out << type(i);
+ }
+
+private:
+ std::vector<T> _items;
+};
+
+#endif
diff --git a/gi/posterior-regularisation/canned.concordance b/gi/posterior-regularisation/canned.concordance
new file mode 100644
index 00000000..710973ff
--- /dev/null
+++ b/gi/posterior-regularisation/canned.concordance
@@ -0,0 +1,4 @@
+a 0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 ||| 2 2 <PHRASE> 2 2 ||| C=1
+b 0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1
+c 2 2 <PHRASE> 2 2 ||| C=1 ||| 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1
+d 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1
diff --git a/gi/posterior-regularisation/em.cc b/gi/posterior-regularisation/em.cc
new file mode 100644
index 00000000..f6c9fd68
--- /dev/null
+++ b/gi/posterior-regularisation/em.cc
@@ -0,0 +1,830 @@
+// Input of the form:
+// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1
+// phrase TAB [context]+
+// where context = phrase ||| C=... which are separated by |||
+
+// Model parameterised as follows:
+// - each phrase, p, is allocated a latent state, t
+// - this is used to generate the contexts, c
+// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR
+
+// Training with EM:
+// - e-step is estimating P(t|p,c) for all x,c
+// - m-step is estimating model parameters P(p,c,t) = P(t) P(p|t) P(c|t)
+
+// Sexing it up:
+// - constrain the posteriors P(t|c) and P(t|p) to have few high-magnitude entries
+// - improve the generation of phrase internals, e.g., generate edge words from
+// different distribution to central words
+
+#include "alphabet.hh"
+#include "log_add.hh"
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+#include <tr1/random>
+#include <tr1/tuple>
+#include <nlopt.h>
+
+using namespace std;
+using namespace std::tr1;
+
+const int numTags = 5;
+const int numIterations = 100;
+const bool posterior_regularisation = true;
+const double PHRASE_VIOLATION_WEIGHT = 10;
+const double CONTEXT_VIOLATION_WEIGHT = 0;
+const bool includePhraseProb = false;
+
+// Data structures:
+Alphabet<string> lexicon;
+typedef vector<int> Phrase;
+typedef tuple<int, int, int, int> Context;
+Alphabet<Phrase> phrases;
+Alphabet<Context> contexts;
+
+typedef map<int, int> ContextCounts;
+typedef map<int, int> PhraseCounts;
+typedef map<int, ContextCounts> PhraseToContextCounts;
+typedef map<int, PhraseCounts> ContextToPhraseCounts;
+
+PhraseToContextCounts concordancePhraseToContexts;
+ContextToPhraseCounts concordanceContextToPhrases;
+
+typedef vector<double> Dist;
+typedef vector<Dist> ConditionalDist;
+Dist prior; // class -> P(class)
+vector<ConditionalDist> probCtx; // word -> class -> P(word | class), for each position of context word
+ConditionalDist probPhrase; // class -> P(word | class)
+Dist probPhraseLength; // class -> P(length | class) expressed as geometric distribution parameter
+
+mt19937 randomGenerator((size_t) time(NULL));
+uniform_real<double> uniDist(0.0, 1e-1);
+variate_generator< mt19937, uniform_real<double> > rng(randomGenerator, uniDist);
+
+void addRandomNoise(Dist &d);
+void normalise(Dist &d);
+void addTo(Dist &d, const Dist &e);
+int argmax(const Dist &d);
+
+map<Phrase, map<Context, int> > lambda_indices;
+
+Dist conditional_probs(const Phrase &phrase, const Context &context, double *normalisation = 0);
+template <typename T>
+Dist
+penalised_conditionals(const Phrase &phrase, const Context &context,
+ const T &lambda, double *normalisation);
+//Dist penalised_conditionals(const Phrase &phrase, const Context &context, const double *lambda, double *normalisation = 0);
+double penalised_log_likelihood(int n, const double *lambda, double *gradient, void *data);
+void optimise_lambda(double delta, double gamma, vector<double> &lambda);
+double expected_violation_phrases(const double *lambda);
+double expected_violation_contexts(const double *lambda);
+double primal_kl_divergence(const double *lambda);
+double dual(const double *lambda);
+void print_primal_dual(const double *lambda, double delta, double gamma);
+
+ostream &operator<<(ostream &, const Phrase &);
+ostream &operator<<(ostream &, const Context &);
+ostream &operator<<(ostream &, const Dist &);
+ostream &operator<<(ostream &, const ConditionalDist &);
+
+int
+main(int argc, char *argv[])
+{
+ randomGenerator.seed(time(NULL));
+
+ int edges = 0;
+ istream &input = cin;
+ while (input.good())
+ {
+ // read the phrase
+ string phraseString;
+ Phrase phrase;
+ getline(input, phraseString, '\t');
+ istringstream pinput(phraseString);
+ string token;
+ while (pinput >> token)
+ phrase.push_back(lexicon.insert(token));
+ int phraseId = phrases.insert(phrase);
+
+ // read the rest, storing each context
+ string remainder;
+ getline(input, remainder, '\n');
+ istringstream rinput(remainder);
+ Context context(-1, -1, -1, -1);
+ int index = 0;
+ while (rinput >> token)
+ {
+ if (token != "|||" && token != "<PHRASE>")
+ {
+ if (index < 4)
+ {
+ // eugh! damn templates
+ switch (index)
+ {
+ case 0: get<0>(context) = lexicon.insert(token); break;
+ case 1: get<1>(context) = lexicon.insert(token); break;
+ case 2: get<2>(context) = lexicon.insert(token); break;
+ case 3: get<3>(context) = lexicon.insert(token); break;
+ default: assert(false);
+ }
+ index += 1;
+ }
+ else if (token.find("C=") == 0)
+ {
+ int contextId = contexts.insert(context);
+ int count = atoi(token.substr(strlen("C=")).c_str());
+ concordancePhraseToContexts[phraseId][contextId] += count;
+ concordanceContextToPhrases[contextId][phraseId] += count;
+ index = 0;
+ context = Context(-1, -1, -1, -1);
+ edges += 1;
+ }
+ }
+ }
+
+ // trigger EOF
+ input >> ws;
+ }
+
+ cout << "Read in " << phrases.size() << " phrases"
+ << " and " << contexts.size() << " contexts"
+ << " and " << edges << " edges"
+ << " and " << lexicon.size() << " word types\n";
+
+ // FIXME: filter out low count phrases and low count contexts (based on individual words?)
+ // now populate model parameters with uniform + random noise
+ prior.resize(numTags, 1.0);
+ addRandomNoise(prior);
+ normalise(prior);
+
+ probCtx.resize(4, ConditionalDist(numTags, Dist(lexicon.size(), 1.0)));
+ if (includePhraseProb)
+ probPhrase.resize(numTags, Dist(lexicon.size(), 1.0));
+ for (int t = 0; t < numTags; ++t)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ addRandomNoise(probCtx[j][t]);
+ normalise(probCtx[j][t]);
+ }
+ if (includePhraseProb)
+ {
+ addRandomNoise(probPhrase[t]);
+ normalise(probPhrase[t]);
+ }
+ }
+ if (includePhraseProb)
+ {
+ probPhraseLength.resize(numTags, 0.5); // geometric distribution p=0.5
+ addRandomNoise(probPhraseLength);
+ }
+
+ cout << "\tprior: " << prior << "\n";
+ //cout << "\tcontext: " << probCtx << "\n";
+ //cout << "\tphrase: " << probPhrase << "\n";
+ //cout << "\tphraseLen: " << probPhraseLength << endl;
+
+ vector<double> lambda;
+
+ // now do EM training
+ for (int iteration = 0; iteration < numIterations; ++iteration)
+ {
+ cout << "EM iteration " << iteration << endl;
+
+ if (posterior_regularisation)
+ optimise_lambda(PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT, lambda);
+ //cout << "\tlambda " << lambda << endl;
+
+ Dist countsPrior(numTags, 0.0);
+ vector<ConditionalDist> countsCtx(4, ConditionalDist(numTags, Dist(lexicon.size(), 1e-10)));
+ ConditionalDist countsPhrase(numTags, Dist(lexicon.size(), 1e-10));
+ Dist countsPhraseLength(numTags, 0.0);
+ Dist nPhrases(numTags, 0.0);
+
+ double llh = 0;
+ for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin();
+ pcit != concordancePhraseToContexts.end(); ++pcit)
+ {
+ const Phrase &phrase = phrases.type(pcit->first);
+
+ // e-step: estimate latent class probs; compile (class,word) stats for m-step
+ for (ContextCounts::iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+
+ double z = 0;
+ Dist tagCounts;
+ if (!posterior_regularisation)
+ tagCounts = conditional_probs(phrase, context, &z);
+ else
+ tagCounts = penalised_conditionals(phrase, context, lambda, &z);
+
+ llh += log(z) * ccit->second;
+ addTo(countsPrior, tagCounts); // FIXME: times ccit->secon
+
+ for (int t = 0; t < numTags; ++t)
+ {
+ for (int j = 0; j < 4; ++j)
+ countsCtx[j][t][get<0>(context)] += tagCounts[t] * ccit->second;
+
+ if (includePhraseProb)
+ {
+ for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit)
+ countsPhrase[t][*pit] += tagCounts[t] * ccit->second;
+ countsPhraseLength[t] += phrase.size() * tagCounts[t] * ccit->second;
+ nPhrases[t] += tagCounts[t] * ccit->second;
+ }
+ }
+ }
+ }
+
+ cout << "M-step\n";
+
+ // m-step: normalise prior and (class,word) stats and assign to model parameters
+ normalise(countsPrior);
+ prior = countsPrior;
+ for (int t = 0; t < numTags; ++t)
+ {
+ //cout << "\t\tt " << t << " prior " << countsPrior[t] << "\n";
+ for (int j = 0; j < 4; ++j)
+ normalise(countsCtx[j][t]);
+ if (includePhraseProb)
+ {
+ normalise(countsPhrase[t]);
+ countsPhraseLength[t] = nPhrases[t] / countsPhraseLength[t];
+ }
+ }
+ probCtx = countsCtx;
+ if (includePhraseProb)
+ {
+ probPhrase = countsPhrase;
+ probPhraseLength = countsPhraseLength;
+ }
+
+ double *larray = new double[lambda.size()];
+ copy(lambda.begin(), lambda.end(), larray);
+ print_primal_dual(larray, PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT);
+ delete [] larray;
+
+ //cout << "\tllh " << llh << endl;
+ //cout << "\tprior: " << prior << "\n";
+ //cout << "\tcontext: " << probCtx << "\n";
+ //cout << "\tphrase: " << probPhrase << "\n";
+ //cout << "\tphraseLen: " << probPhraseLength << "\n";
+ }
+
+ // output class membership
+ for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin();
+ pcit != concordancePhraseToContexts.end(); ++pcit)
+ {
+ const Phrase &phrase = phrases.type(pcit->first);
+ for (ContextCounts::iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ Dist tagCounts = conditional_probs(phrase, context, 0);
+ cout << phrase << " ||| " << context << " ||| " << argmax(tagCounts) << "\n";
+ }
+ }
+
+ return 0;
+}
+
+void addRandomNoise(Dist &d)
+{
+ for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit)
+ *dit += rng();
+}
+
+void normalise(Dist &d)
+{
+ double z = 0;
+ for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit)
+ z += *dit;
+ for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit)
+ *dit /= z;
+}
+
+void addTo(Dist &d, const Dist &e)
+{
+ assert(d.size() == e.size());
+ for (int i = 0; i < (int) d.size(); ++i)
+ d[i] += e[i];
+}
+
+int argmax(const Dist &d)
+{
+ double best = d[0];
+ int index = 0;
+ for (int i = 1; i < (int) d.size(); ++i)
+ {
+ if (d[i] > best)
+ {
+ best = d[i];
+ index = i;
+ }
+ }
+ return index;
+}
+
+ostream &operator<<(ostream &out, const Phrase &phrase)
+{
+ for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit)
+ lexicon.display(((pit == phrase.begin()) ? out : out << " "), *pit);
+ return out;
+}
+
+ostream &operator<<(ostream &out, const Context &context)
+{
+ lexicon.display(out, get<0>(context));
+ lexicon.display(out << " ", get<1>(context));
+ lexicon.display(out << " <PHRASE> ", get<2>(context));
+ lexicon.display(out << " ", get<3>(context));
+ return out;
+}
+
+ostream &operator<<(ostream &out, const Dist &dist)
+{
+ for (Dist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit)
+ out << ((dit == dist.begin()) ? "" : " ") << *dit;
+ return out;
+}
+
+ostream &operator<<(ostream &out, const ConditionalDist &dist)
+{
+ for (ConditionalDist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit)
+ out << ((dit == dist.begin()) ? "" : "; ") << *dit;
+ return out;
+}
+
+// FIXME: slow - just use the phrase index, context index to do the mapping
+// (n.b. it's a sparse setup, not just equal to 3d array index)
+int
+lambda_index(const Phrase &phrase, const Context &context, int tag)
+{
+ return lambda_indices[phrase][context] + tag;
+}
+
+template <typename T>
+Dist
+penalised_conditionals(const Phrase &phrase, const Context &context,
+ const T &lambda, double *normalisation)
+{
+ Dist d = conditional_probs(phrase, context, 0);
+
+ double z = 0;
+ for (int t = 0; t < numTags; ++t)
+ {
+ d[t] *= exp(-lambda[lambda_index(phrase, context, t)]);
+ z += d[t];
+ }
+
+ if (normalisation)
+ *normalisation = z;
+
+ for (int t = 0; t < numTags; ++t)
+ d[t] /= z;
+
+ return d;
+}
+
+Dist
+conditional_probs(const Phrase &phrase, const Context &context, double *normalisation)
+{
+ Dist tagCounts(numTags, 0.0);
+ double z = 0;
+ for (int t = 0; t < numTags; ++t)
+ {
+ double prob = prior[t];
+ prob *= (probCtx[0][t][get<0>(context)] * probCtx[1][t][get<1>(context)] *
+ probCtx[2][t][get<2>(context)] * probCtx[3][t][get<3>(context)]);
+
+ if (includePhraseProb)
+ {
+ prob *= pow(1 - probPhraseLength[t], phrase.size() - 1) * probPhraseLength[t];
+ for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit)
+ prob *= probPhrase[t][*pit];
+ }
+
+ tagCounts[t] = prob;
+ z += prob;
+ }
+ if (normalisation)
+ *normalisation = z;
+
+ for (int t = 0; t < numTags; ++t)
+ tagCounts[t] /= z;
+
+ return tagCounts;
+}
+
+double
+penalised_log_likelihood(int n, const double *lambda, double *grad, void *)
+{
+ // return log Z(lambda, theta) over the corpus
+ // where theta are the global parameters (prior, probCtx*, probPhrase*)
+ // and lambda are lagrange multipliers for the posterior sparsity constraints
+ //
+ // this is formulated as:
+ // f = log Z(lambda) = sum_i log ( sum_i p_theta(t_i|p_i,c_i) exp [-lambda_{t_i,p_i,c_i}] )
+ // where i indexes the training examples - specifying the (p, c) pair (which may occur with count > 1)
+ //
+ // with derivative:
+ // f'_{tpc} = frac { - count(t,p,c) p_theta(t|p,c) exp (-lambda_{t,p,c}) }
+ // { sum_t' p_theta(t'|p,c) exp (-lambda_{t',p,c}) }
+
+ //cout << "penalised_log_likelihood with lambda ";
+ //copy(lambda, lambda+n, ostream_iterator<double>(cout, " "));
+ //cout << "\n";
+
+ double f = 0;
+ if (grad)
+ {
+ for (int i = 0; i < n; ++i)
+ grad[i] = 0.0;
+ }
+
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ const Phrase &phrase = phrases.type(p);
+ PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p);
+ for (ContextCounts::const_iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ double z = 0;
+ Dist scores = penalised_conditionals(phrase, context, lambda, &z);
+
+ f += ccit->second * log(z);
+ //cout << "\tphrase: " << phrase << " context: " << context << " count: " << ccit->second << " z " << z << endl;
+ //cout << "\t\tscores: " << scores << "\n";
+
+ if (grad)
+ {
+ for (int t = 0; t < numTags; ++t)
+ {
+ int i = lambda_index(phrase, context, t); // FIXME: redundant lookups
+ assert(grad[i] == 0.0);
+ grad[i] = - ccit->second * scores[t];
+ }
+ }
+ }
+ }
+
+ //cout << "penalised_log_likelihood returning " << f;
+ //if (grad)
+ //{
+ //cout << "\ngradient: ";
+ //copy(grad, grad+n, ostream_iterator<double>(cout, " "));
+ //}
+ //cout << "\n";
+
+ return f;
+}
+
+typedef struct
+{
+ // one of p or c should be set to -1, in which case it will be marginalised out
+ // i.e. sum_p' lambda_{p'ct} <= threshold
+ // or sum_c' lambda_{pc't} <= threshold
+ int p, c, t, threshold;
+} constraint_data;
+
+double
+constraint_and_gradient(int n, const double *lambda, double *grad, void *data)
+{
+ constraint_data *d = (constraint_data *) data;
+ assert(d->t >= 0);
+ assert(d->threshold >= 0);
+
+ //cout << "constraint_and_gradient: t " << d->t << " p " << d->p << " c " << d->c << " tau " << d->threshold << endl;
+ //cout << "\tlambda ";
+ //copy(lambda, lambda+n, ostream_iterator<double>(cout, " "));
+ //cout << "\n";
+
+ // FIXME: it's crazy to use a dense gradient here => will only have a handful of non-zero entries
+ if (grad)
+ {
+ for (int i = 0; i < n; ++i)
+ grad[i] = 0.0;
+ }
+
+ //cout << "constraint_and_gradient: " << d->p << "; " << d->c << "; " << d->t << "; " << d->threshold << endl;
+
+ if (d->p >= 0)
+ {
+ assert(d->c < 0);
+ // sum_c lambda_pct <= delta [a.k.a. threshold]
+ // => sum_c lambda_pct - delta <= 0
+ // derivative_pct = { 1, if p and t match; 0, otherwise }
+
+ double val = -d->threshold;
+
+ const Phrase &phrase = phrases.type(d->p);
+ PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(d->p);
+ assert(pcit != concordancePhraseToContexts.end());
+ for (ContextCounts::const_iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ int i = lambda_index(phrase, context, d->t);
+ val += lambda[i];
+ if (grad) grad[i] = 1;
+ }
+ //cout << "\treturning " << val << endl;
+
+ return val;
+ }
+ else
+ {
+ assert(d->c >= 0);
+ assert(d->p < 0);
+ // sum_p lambda_pct <= gamma [a.k.a. threshold]
+ // => sum_p lambda_pct - gamma <= 0
+ // derivative_pct = { 1, if c and t match; 0, otherwise }
+
+ double val = -d->threshold;
+
+ const Context &context = contexts.type(d->c);
+ ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(d->c);
+ assert(cpit != concordanceContextToPhrases.end());
+ for (PhraseCounts::iterator pcit = cpit->second.begin();
+ pcit != cpit->second.end(); ++pcit)
+ {
+ const Phrase &phrase = phrases.type(pcit->first);
+ int i = lambda_index(phrase, context, d->t);
+ val += lambda[i];
+ if (grad) grad[i] = 1;
+ }
+ //cout << "\treturning " << val << endl;
+
+ return val;
+ }
+}
+
+void
+optimise_lambda(double delta, double gamma, vector<double> &lambdav)
+{
+ int num_lambdas = lambdav.size();
+ if (lambda_indices.empty() || lambdav.empty())
+ {
+ lambda_indices.clear();
+ lambdav.clear();
+
+ int i = 0;
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ const Phrase &phrase = phrases.type(p);
+ PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.find(p);
+ for (ContextCounts::iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ lambda_indices[phrase][context] = i;
+ i += numTags;
+ }
+ }
+ num_lambdas = i;
+ lambdav.resize(num_lambdas);
+ }
+ //cout << "optimise_lambda: #langrange multipliers " << num_lambdas << endl;
+
+ // FIXME: better to work with an implicit representation to save memory usage
+ int num_constraints = (((delta > 0) ? phrases.size() : 0) + ((gamma > 0) ? contexts.size() : 0)) * numTags;
+ //cout << "optimise_lambda: #constraints " << num_constraints << endl;
+ constraint_data *data = new constraint_data[num_constraints];
+ int i = 0;
+ if (delta > 0)
+ {
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ for (int t = 0; t < numTags; ++t, ++i)
+ {
+ constraint_data &d = data[i];
+ d.p = p;
+ d.c = -1;
+ d.t = t;
+ d.threshold = delta;
+ }
+ }
+ }
+
+ if (gamma > 0)
+ {
+ for (int c = 0; c < contexts.size(); ++c)
+ {
+ for (int t = 0; t < numTags; ++t, ++i)
+ {
+ constraint_data &d = data[i];
+ d.p = -1;
+ d.c = c;
+ d.t = t;
+ d.threshold = gamma;
+ }
+ }
+ }
+ assert(i == num_constraints);
+
+ double lambda[num_lambdas];
+ double lb[num_lambdas], ub[num_lambdas];
+ for (i = 0; i < num_lambdas; ++i)
+ {
+ lambda[i] = lambdav[i]; // starting value
+ lb[i] = 0; // lower bound
+ if (delta <= 0) // upper bound
+ ub[i] = gamma;
+ else if (gamma <= 0)
+ ub[i] = delta;
+ else
+ assert(false);
+ }
+
+ //print_primal_dual(lambda, delta, gamma);
+
+ double minf;
+ int error_code = nlopt_minimize_constrained(NLOPT_LN_COBYLA, num_lambdas, penalised_log_likelihood, NULL,
+ num_constraints, constraint_and_gradient, data, sizeof(constraint_data),
+ lb, ub, lambda, &minf, -HUGE_VAL, 0.0, 0.0, 1e-4, NULL, 0, 0.0);
+ //cout << "optimise error code " << error_code << endl;
+
+ //print_primal_dual(lambda, delta, gamma);
+
+ delete [] data;
+
+ if (error_code < 0)
+ cout << "WARNING: optimisation failed with error code: " << error_code << endl;
+ //else
+ //{
+ //cout << "success; minf " << minf << endl;
+ //print_primal_dual(lambda, delta, gamma);
+ //}
+
+ lambdav = vector<double>(&lambda[0], &lambda[0] + num_lambdas);
+}
+
+// FIXME: inefficient - cache the scores
+double
+expected_violation_phrases(const double *lambda)
+{
+ // sum_pt max_c E_q[phi_pct]
+ double violation = 0;
+
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ const Phrase &phrase = phrases.type(p);
+ PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p);
+
+ for (int t = 0; t < numTags; ++t)
+ {
+ double best = 0;
+ for (ContextCounts::const_iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ Dist scores = penalised_conditionals(phrase, context, lambda, 0);
+ best = max(best, scores[t]);
+ }
+ violation += best;
+ }
+ }
+
+ return violation;
+}
+
+// FIXME: inefficient - cache the scores
+double
+expected_violation_contexts(const double *lambda)
+{
+ // sum_ct max_p E_q[phi_pct]
+ double violation = 0;
+
+ for (int c = 0; c < contexts.size(); ++c)
+ {
+ const Context &context = contexts.type(c);
+ ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(c);
+
+ for (int t = 0; t < numTags; ++t)
+ {
+ double best = 0;
+ for (PhraseCounts::iterator pit = cpit->second.begin();
+ pit != cpit->second.end(); ++pit)
+ {
+ const Phrase &phrase = phrases.type(pit->first);
+ Dist scores = penalised_conditionals(phrase, context, lambda, 0);
+ best = max(best, scores[t]);
+ }
+ violation += best;
+ }
+ }
+
+ return violation;
+}
+
+// FIXME: possibly inefficient
+double
+primal_likelihood() // FIXME: primal evaluation needs to use lambda and calculate l1linf terms
+{
+ double llh = 0;
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ const Phrase &phrase = phrases.type(p);
+ PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p);
+ for (ContextCounts::const_iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ double z = 0;
+ Dist scores = conditional_probs(phrase, context, &z);
+ llh += ccit->second * log(z);
+ }
+ }
+ return llh;
+}
+
+// FIXME: inefficient - cache the scores
+double
+primal_kl_divergence(const double *lambda)
+{
+ // return KL(q || p) = sum_y q(y) { log q(y) - log p(y | x) }
+ // = sum_y q(y) { log p(y | x) - lambda . phi(x, y) - log Z - log p(y | x) }
+ // = sum_y q(y) { - lambda . phi(x, y) } - log Z
+ // and q(y) factors with each edge, ditto for Z
+
+ double feature_sum = 0, log_z = 0;
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ const Phrase &phrase = phrases.type(p);
+ PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p);
+ for (ContextCounts::const_iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+
+ double local_z = 0;
+ double local_f = 0;
+ Dist d = conditional_probs(phrase, context, 0);
+ for (int t = 0; t < numTags; ++t)
+ {
+ int i = lambda_index(phrase, context, t);
+ double s = d[t] * exp(-lambda[i]);
+ local_f += lambda[i] * s;
+ local_z += s;
+ }
+
+ log_z += ccit->second * log(local_z);
+ feature_sum += ccit->second * (local_f / local_z);
+ }
+ }
+
+ return -feature_sum - log_z;
+}
+
+// FIXME: inefficient - cache the scores
+double
+dual(const double *lambda)
+{
+ // return log(Z) = - log { sum_y p(y | x) exp( - lambda . phi(x, y) }
+ // n.b. have flipped the sign as we're minimising
+
+ double z = 0;
+ for (int p = 0; p < phrases.size(); ++p)
+ {
+ const Phrase &phrase = phrases.type(p);
+ PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p);
+ for (ContextCounts::const_iterator ccit = pcit->second.begin();
+ ccit != pcit->second.end(); ++ccit)
+ {
+ const Context &context = contexts.type(ccit->first);
+ double lz = 0;
+ Dist scores = penalised_conditionals(phrase, context, lambda, &z);
+ z += lz * ccit->second;
+ }
+ }
+ return log(z);
+}
+
+void
+print_primal_dual(const double *lambda, double delta, double gamma)
+{
+ double likelihood = primal_likelihood();
+ double kl = primal_kl_divergence(lambda);
+ double sum_pt = expected_violation_phrases(lambda);
+ double sum_ct = expected_violation_contexts(lambda);
+ //double d = dual(lambda);
+
+ cout << "\tllh=" << likelihood
+ << " kl=" << kl
+ << " violations phrases=" << sum_pt
+ << " contexts=" << sum_ct
+ //<< " primal=" << (kl + delta * sum_pt + gamma * sum_ct)
+ //<< " dual=" << d
+ << " objective=" << (likelihood - kl + delta * sum_pt + gamma * sum_ct)
+ << endl;
+}
diff --git a/gi/posterior-regularisation/invert.hh b/gi/posterior-regularisation/invert.hh
new file mode 100644
index 00000000..d06356e9
--- /dev/null
+++ b/gi/posterior-regularisation/invert.hh
@@ -0,0 +1,45 @@
+// The following code inverts the matrix input using LU-decomposition with
+// backsubstitution of unit vectors. Reference: Numerical Recipies in C, 2nd
+// ed., by Press, Teukolsky, Vetterling & Flannery.
+// Code written by Fredrik Orderud.
+// http://www.crystalclearsoftware.com/cgi-bin/boost_wiki/wiki.pl?LU_Matrix_Inversion
+
+#ifndef INVERT_MATRIX_HPP
+#define INVERT_MATRIX_HPP
+
+// REMEMBER to update "lu.hpp" header includes from boost-CVS
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+namespace ublas = boost::numeric::ublas;
+
+/* Matrix inversion routine.
+ Uses lu_factorize and lu_substitute in uBLAS to invert a matrix */
+template<class T>
+bool invert_matrix(const ublas::matrix<T>& input, ublas::matrix<T>& inverse)
+{
+ using namespace boost::numeric::ublas;
+ typedef permutation_matrix<std::size_t> pmatrix;
+ // create a working copy of the input
+ matrix<T> A(input);
+ // create a permutation matrix for the LU-factorization
+ pmatrix pm(A.size1());
+
+ // perform LU-factorization
+ int res = lu_factorize(A,pm);
+ if( res != 0 ) return false;
+
+ // create identity matrix of "inverse"
+ inverse.assign(ublas::identity_matrix<T>(A.size1()));
+
+ // backsubstitute to get the inverse
+ lu_substitute(A, pm, inverse);
+
+ return true;
+}
+
+#endif //INVERT_MATRIX_HPP
diff --git a/gi/posterior-regularisation/log_add.hh b/gi/posterior-regularisation/log_add.hh
new file mode 100644
index 00000000..e0620c5a
--- /dev/null
+++ b/gi/posterior-regularisation/log_add.hh
@@ -0,0 +1,30 @@
+#ifndef log_add_hh
+#define log_add_hh
+
+#include <limits>
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+template <typename T>
+struct Log
+{
+ static T zero() { return -std::numeric_limits<T>::infinity(); }
+
+ static T add(T l1, T l2)
+ {
+ if (l1 == zero()) return l2;
+ if (l1 > l2)
+ return l1 + std::log(1 + exp(l2 - l1));
+ else
+ return l2 + std::log(1 + exp(l1 - l2));
+ }
+
+ static T subtract(T l1, T l2)
+ {
+ //std::assert(l1 >= l2);
+ return l1 + log(1 - exp(l2 - l1));
+ }
+};
+
+#endif
diff --git a/gi/posterior-regularisation/projected_gradient.cc b/gi/posterior-regularisation/projected_gradient.cc
new file mode 100644
index 00000000..f7c39817
--- /dev/null
+++ b/gi/posterior-regularisation/projected_gradient.cc
@@ -0,0 +1,87 @@
+//
+// Minimises given functional using the projected gradient method. Based on
+// algorithm and demonstration example in Linear and Nonlinear Programming,
+// Luenberger and Ye, 3rd ed., p 370.
+//
+
+#include "invert.hh"
+#include <iostream>
+
+using namespace std;
+
+double
+f(double x1, double x2, double x3, double x4)
+{
+ return x1 * x1 + x2 * x2 + x3 * x3 + x4 * x4 - 2 * x1 - 3 * x4;
+}
+
+ublas::vector<double>
+g(double x1, double x2, double x3, double x4)
+{
+ ublas::vector<double> v(4);
+ v(0) = 2 * x1 - 2;
+ v(1) = 2 * x2;
+ v(2) = 2 * x3;
+ v(3) = 2 * x4 - 3;
+ return v;
+}
+
+ublas::matrix<double>
+activeConstraints(double x1, double x2, double x3, double x4)
+{
+ int n = 2;
+ if (x1 == 0) ++n;
+ if (x2 == 0) ++n;
+ if (x3 == 0) ++n;
+ if (x4 == 0) ++n;
+
+ ublas::matrix<double> a(n,4);
+ a(0, 0) = 2; a(0, 1) = 1; a(0, 2) = 1; a(0, 3) = 4;
+ a(1, 0) = 1; a(1, 1) = 1; a(1, 2) = 2; a(1, 3) = 1;
+
+ int c = 2;
+ if (x1 == 0) a(c++, 0) = 1;
+ if (x2 == 0) a(c++, 1) = 1;
+ if (x3 == 0) a(c++, 2) = 1;
+ if (x4 == 0) a(c++, 3) = 1;
+
+ return a;
+}
+
+ublas::matrix<double>
+projection(const ublas::matrix<double> &a)
+{
+ ublas::matrix<double> aT = ublas::trans(a);
+ ublas::matrix<double> inv(a.size1(), a.size1());
+ bool ok = invert_matrix(ublas::matrix<double>(ublas::prod(a, aT)), inv);
+ assert(ok && "Failed to invert matrix");
+ return ublas::identity_matrix<double>(4) -
+ ublas::prod(aT, ublas::matrix<double>(ublas::prod(inv, a)));
+}
+
+int main(int argc, char *argv[])
+{
+ double x1 = 2, x2 = 2, x3 = 1, x4 = 0;
+
+ double fval = f(x1, x2, x3, x4);
+ cout << "f = " << fval << endl;
+ ublas::vector<double> grad = g(x1, x2, x3, x4);
+ cout << "g = " << grad << endl;
+ ublas::matrix<double> A = activeConstraints(x1, x2, x3, x4);
+ cout << "A = " << A << endl;
+ ublas::matrix<double> P = projection(A);
+ cout << "P = " << P << endl;
+ // the direction of movement
+ ublas::vector<double> d = prod(P, grad);
+ cout << "d = " << (d / d(0)) << endl;
+
+ // special case for d = 0
+
+ // next solve for limits on the line search
+
+ // then use golden rule technique between these values (if bounded)
+
+ // or simple Armijo's rule technique
+
+ return 0;
+}
diff --git a/gi/posterior-regularisation/train_pr_global.py b/gi/posterior-regularisation/train_pr_global.py
new file mode 100644
index 00000000..467069ef
--- /dev/null
+++ b/gi/posterior-regularisation/train_pr_global.py
@@ -0,0 +1,272 @@
+import sys
+import scipy.optimize
+from numpy import *
+from numpy.random import random
+
+#
+# Step 1: load the concordance counts
+#
+
+edges_phrase_to_context = {}
+edges_context_to_phrase = {}
+types = {}
+num_edges = 0
+
+for line in sys.stdin:
+ phrase, rest = line.strip().split('\t')
+ parts = rest.split('|||')
+ for i in range(0, len(parts), 2):
+ context, count = parts[i:i+2]
+
+ ctx = tuple(filter(lambda x: x != '<PHRASE>', context.split()))
+ cnt = int(count.strip()[2:])
+ ccs = edges_phrase_to_context.setdefault(phrase, {})
+ ccs[ctx] = cnt
+ pcs = edges_context_to_phrase.setdefault(ctx, {})
+ pcs[phrase] = cnt
+
+ for token in ctx:
+ types.setdefault(token, len(types))
+ for token in phrase.split():
+ types.setdefault(token, len(types))
+
+ num_edges += 1
+
+print 'Read in', num_edges, 'edges and', len(types), 'word types'
+
+#
+# Step 2: initialise the model parameters
+#
+
+num_tags = 5
+num_types = len(types)
+delta = int(sys.argv[1])
+gamma = int(sys.argv[2])
+
+def normalise(a):
+ return a / sum(a)
+
+# Pr(tag)
+tagDist = normalise(random(num_tags)+1)
+# Pr(context at pos i = w | tag) indexed by i, tag, word
+contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)]
+# PR langrange multipliers
+lamba = zeros(num_edges * num_tags)
+lamba_index = {}
+next = 0
+for phrase, ccs in edges_phrase_to_context.items():
+ for context in ccs.keys():
+ lamba_index[phrase,context] = next
+ next += num_tags
+
+#
+# Step 3: expectation maximisation
+#
+
+for iteration in range(20):
+ tagCounts = zeros(num_tags)
+ contextWordCounts = [[zeros(num_types) for t in range(num_tags)] for i in range(4)]
+
+ #print 'tagDist', tagDist
+ #print 'contextWordCounts[0][0]', contextWordCounts[0][0]
+
+ # Tune lambda
+ # dual: min log Z(lamba) s.t. lamba >= 0;
+ # sum_c lamba_pct <= delta; sum_p lamba_pct <= gamma
+ def dual(ls):
+ logz = 0
+ for phrase, ccs in edges_phrase_to_context.items():
+ for context, count in ccs.items():
+ conditionals = zeros(num_tags)
+ for t in range(num_tags):
+ prob = tagDist[t]
+ for i in range(4):
+ prob *= contextWordDist[i][t][types[context[i]]]
+ conditionals[t] = prob
+ cz = sum(conditionals)
+ conditionals /= cz
+
+ local_z = 0
+ for t in range(num_tags):
+ local_z += conditionals[t] * exp(-ls[lamba_index[phrase,context] + t])
+ logz += log(local_z) * count
+
+ #print 'ls', ls
+ #print 'lambda', list(ls)
+ #print 'dual', logz
+ return logz
+
+ def primal(ls):
+ # FIXME: returns negative values for KL (impossible)
+ logz = dual(ls)
+ kl = -logz
+
+ expectations = zeros(lamba.shape)
+ for phrase, ccs in edges_phrase_to_context.items():
+ for context, count in ccs.items():
+ conditionals = zeros(num_tags)
+ for t in range(num_tags):
+ prob = tagDist[t]
+ for i in range(4):
+ prob *= contextWordDist[i][t][types[context[i]]]
+ conditionals[t] = prob
+ cz = sum(conditionals)
+ conditionals /= cz
+
+ scores = zeros(num_tags)
+ for t in range(num_tags):
+ scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t])
+ local_z = sum(scores)
+
+ for t in range(num_tags):
+ li = lamba_index[phrase,context] + t
+ expectations[li] = scores[t] / local_z * count
+ kl -= expectations[li] * ls[li]
+
+ pt_l1linf = 0
+ for phrase, ccs in edges_phrase_to_context.items():
+ for t in range(num_tags):
+ best = -1e500
+ for context, count in ccs.items():
+ li = lamba_index[phrase,context] + t
+ s = expectations[li]
+ if s > best: best = s
+ pt_l1linf += best
+
+ ct_l1linf = 0
+ for context, pcs in edges_context_to_phrase.items():
+ for t in range(num_tags):
+ best = -1e500
+ for phrase, count in pcs.items():
+ li = lamba_index[phrase,context] + t
+ s = expectations[li]
+ if s > best: best = s
+ ct_l1linf += best
+
+ return kl, pt_l1linf, ct_l1linf, kl + delta * pt_l1linf + gamma * ct_l1linf
+
+ def dual_deriv(ls):
+ # d/dl log(z) = E_q[phi]
+ deriv = zeros(num_edges * num_tags)
+ for phrase, ccs in edges_phrase_to_context.items():
+ for context, count in ccs.items():
+ conditionals = zeros(num_tags)
+ for t in range(num_tags):
+ prob = tagDist[t]
+ for i in range(4):
+ prob *= contextWordDist[i][t][types[context[i]]]
+ conditionals[t] = prob
+ cz = sum(conditionals)
+ conditionals /= cz
+
+ scores = zeros(num_tags)
+ for t in range(num_tags):
+ scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t])
+ local_z = sum(scores)
+
+ for t in range(num_tags):
+ deriv[lamba_index[phrase,context] + t] -= count * scores[t] / local_z
+
+ #print 'ddual', deriv
+ return deriv
+
+ def constraints(ls):
+ cons = []
+ if delta > 0:
+ for phrase, ccs in edges_phrase_to_context.items():
+ for t in range(num_tags):
+ total = delta
+ for cprime in ccs.keys():
+ total -= ls[lamba_index[phrase, cprime] + t]
+ cons.append(total)
+
+ if gamma > 0:
+ for context, pcs in edges_context_to_phrase.items():
+ for t in range(num_tags):
+ total = gamma
+ for pprime in pcs.keys():
+ total -= ls[lamba_index[pprime, context] + t]
+ cons.append(total)
+ #print 'cons', cons
+ return cons
+
+ def constraints_deriv(ls):
+ cons = []
+ if delta > 0:
+ for phrase, ccs in edges_phrase_to_context.items():
+ for t in range(num_tags):
+ d = zeros(num_edges * num_tags)
+ for cprime in ccs.keys():
+ d[lamba_index[phrase, cprime] + t] = -1
+ cons.append(d)
+
+ if gamma > 0:
+ for context, pcs in edges_context_to_phrase.items():
+ for t in range(num_tags):
+ d = zeros(num_edges * num_tags)
+ for pprime in pcs.keys():
+ d[lamba_index[pprime, context] + t] = -1
+ cons.append(d)
+ #print 'dcons', cons
+ return cons
+
+ print 'Pre lambda optimisation dual', dual(lamba), 'primal', primal(lamba)
+ lamba = scipy.optimize.fmin_slsqp(dual, lamba,
+ bounds=[(0, delta)] * (num_edges * num_tags),
+ f_ieqcons=constraints,
+ fprime=dual_deriv,
+ fprime_ieqcons=constraints_deriv,
+ iprint=0)
+ print 'Post lambda optimisation dual', dual(lamba), 'primal', primal(lamba)
+
+ # E-step
+ llh = z = 0
+ for phrase, ccs in edges_phrase_to_context.items():
+ for context, count in ccs.items():
+ conditionals = zeros(num_tags)
+ for t in range(num_tags):
+ prob = tagDist[t]
+ for i in range(4):
+ prob *= contextWordDist[i][t][types[context[i]]]
+ conditionals[t] = prob
+ cz = sum(conditionals)
+ conditionals /= cz
+ llh += log(cz) * count
+
+ scores = zeros(num_tags)
+ li = lamba_index[phrase, context]
+ for t in range(num_tags):
+ scores[t] = conditionals[t] * exp(-lamba[li + t])
+ z += count * sum(scores)
+
+ tagCounts += count * scores
+ for i in range(4):
+ for t in range(num_tags):
+ contextWordCounts[i][t][types[context[i]]] += count * scores[t]
+
+ print 'iteration', iteration, 'llh', llh, 'logz', log(z)
+
+ # M-step
+ tagDist = normalise(tagCounts)
+ for i in range(4):
+ for t in range(num_tags):
+ contextWordDist[i][t] = normalise(contextWordCounts[i][t])
+
+
+for phrase, ccs in edges_phrase_to_context.items():
+ for context, count in ccs.items():
+ conditionals = zeros(num_tags)
+ for t in range(num_tags):
+ prob = tagDist[t]
+ for i in range(4):
+ prob *= contextWordDist[i][t][types[context[i]]]
+ conditionals[t] = prob
+ cz = sum(conditionals)
+ conditionals /= cz
+
+ scores = zeros(num_tags)
+ li = lamba_index[phrase, context]
+ for t in range(num_tags):
+ scores[t] = conditionals[t] * exp(-lamba[li + t])
+
+ print '%s\t%s ||| C=%d ||| %d |||' % (phrase, context, count, argmax(scores)), scores / sum(scores)