author    | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 19:34:58 +0000
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-28 19:34:58 +0000
commit    | 89afb013e40bf9c749956420e77ac72773874844 (patch)
tree      | 2e18cf00d768507ae5cbc4cb085e22165f3b6fe6 /gi/posterior-regularisation
parent    | c1fd8f9ba0c8a6f5ab07a3390d3f1a3910a3267e (diff)
First bits of code for PR training
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@44 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation')
-rw-r--r-- | gi/posterior-regularisation/PhraseContextModel.java | 317
-rw-r--r-- | gi/posterior-regularisation/README                  |   3
-rw-r--r-- | gi/posterior-regularisation/alphabet.hh             |  61
-rw-r--r-- | gi/posterior-regularisation/canned.concordance      |   4
-rw-r--r-- | gi/posterior-regularisation/em.cc                   | 830
-rw-r--r-- | gi/posterior-regularisation/invert.hh               |  45
-rw-r--r-- | gi/posterior-regularisation/log_add.hh              |  30
-rw-r--r-- | gi/posterior-regularisation/projected_gradient.cc   |  87
-rw-r--r-- | gi/posterior-regularisation/train_pr_global.py      | 272
9 files changed, 1649 insertions, 0 deletions
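All three trainers added below (PhraseContextModel.java, em.cc, train_pr_global.py) share the same model from their header comments: a phrase p draws a latent tag t from Pr(t|p), and the four context positions are generated from independent per-position multinomials Pr(word | tag, position). A minimal sketch of the vanilla E-step posterior they all compute, using hypothetical toy tables `prior` and `context_dist` as stand-ins for the learnt parameters (illustration only, not part of the commit):

    # Sketch only: p(t | p, c) proportional to Pr(t|p) * prod_i Pr(c_i | t, i)
    def posterior(prior, context_dist, phrase_id, context_words):
        probs = []
        for t in range(len(prior[phrase_id])):
            prob = prior[phrase_id][t]
            for i, w in enumerate(context_words):
                prob *= context_dist[i][t][w]
            probs.append(prob)
        z = sum(probs)
        return [p / z for p in probs], z   # normalised posterior and its normaliser

    # toy usage: one phrase, two tags, identical distributions at all 4 positions
    prior = [[0.5, 0.5]]
    context_dist = [[{"a": 0.7, "b": 0.3}, {"a": 0.2, "b": 0.8}]] * 4
    q, z = posterior(prior, context_dist, 0, ["a", "a", "b", "b"])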
diff --git a/gi/posterior-regularisation/PhraseContextModel.java b/gi/posterior-regularisation/PhraseContextModel.java new file mode 100644 index 00000000..3af72d54 --- /dev/null +++ b/gi/posterior-regularisation/PhraseContextModel.java @@ -0,0 +1,317 @@ +// Input of the form: +// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1 +// phrase TAB [context]+ +// where context = phrase ||| C=... which are separated by ||| + +// Model parameterised as follows: +// - each phrase, p, is allocated a latent state, t +// - this is used to generate the contexts, c +// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR + +// Training with EM: +// - e-step is estimating q(t) = P(t|p,c) for all x,c +// - m-step is estimating model parameters P(c,t|p) = P(t) P(c|t) +// - PR uses alternate e-step, which first optimizes lambda +// min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc] +// where +// q(t|p,c) propto p(t,c|p) exp( -phi_ptc ) +// Then q is used to obtain expectations for vanilla M-step. + +// Sexing it up: +// - learn p-specific conditionals P(t|p) +// - or generate phrase internals, e.g., generate edge words from +// different distribution to central words +// - agreement between phrase->context model and context->phrase model + +import java.io.*; +import java.util.*; +import java.util.regex.*; +import static java.lang.Math.*; + +class Lexicon<T> +{ + public int insert(T word) + { + Integer i = wordToIndex.get(word); + if (i == null) + { + i = indexToWord.size(); + wordToIndex.put(word, i); + indexToWord.add(word); + } + return i; + } + + public T lookup(int index) + { + return indexToWord.get(index); + } + + public int size() + { + return indexToWord.size(); + } + + private Map<T,Integer> wordToIndex = new HashMap<T,Integer>(); + private List<T> indexToWord = new ArrayList<T>(); +} + +class PhraseContextModel +{ + // model/optimisation configuration parameters + int numTags; + int numPRIterations = 5; + boolean posteriorRegularisation = false; + double constraintScale = 10; + + // book keeping + Lexicon<String> tokenLexicon = new Lexicon<String>(); + int numPositions; + Random rng = new Random(); + + // training set; 1 entry for each unique phrase + PhraseAndContexts training[]; + + // model parameters (learnt) + double emissions[][][]; // position in 0 .. 
3 x tag x word Pr(word | tag, position) + double prior[][]; // phrase x tag Pr(tag | phrase) + //double lambda[][][]; // word x context x tag langrange multipliers + + PhraseContextModel(File infile, int tags) throws IOException + { + numTags = tags; + readTrainingFromFile(new FileReader(infile)); + assert(training.length > 0); + + // now initialise emissions + assert(training[0].contexts.length > 0); + numPositions = training[0].contexts[0].tokens.length; + + emissions = new double[numPositions][numTags][tokenLexicon.size()]; + prior = new double[training.length][numTags]; + //lambda = new double[tokenLexicon.size()][???][numTags] + + for (double[][] emissionTW: emissions) + for (double[] emissionW: emissionTW) + randomise(emissionW); + + for (double[] priorTag: prior) + randomise(priorTag); + } + + void expectationMaximisation(int numIterations) + { + for (int iteration = 0; iteration < numIterations; ++iteration) + { + double emissionsCounts[][][] = new double[numPositions][numTags][tokenLexicon.size()]; + double priorCounts[][] = new double[training.length][numTags]; + + // E-step + double llh = 0; + for (int i = 0; i < training.length; ++i) + { + PhraseAndContexts instance = training[i]; + for (Context ctx: instance.contexts) + { + double probs[] = posterior(i, ctx); + double z = normalise(probs); + llh += log(z) * ctx.count; + + for (int t = 0; t < numTags; ++t) + { + priorCounts[i][t] += ctx.count * probs[t]; + for (int j = 0; j < ctx.tokens.length; ++j) + emissionsCounts[j][t][ctx.tokens[j]] += ctx.count * probs[t]; + } + } + } + + // M-step: normalise + for (double[][] emissionTW: emissionsCounts) + for (double[] emissionW: emissionTW) + normalise(emissionW); + + for (double[] priorTag: priorCounts) + normalise(priorTag); + + emissions = emissionsCounts; + prior = priorCounts; + + System.out.println("Iteration " + iteration + " llh " + llh); + } + } + + static double normalise(double probs[]) + { + double z = 0; + for (double p: probs) + z += p; + for (int i = 0; i < probs.length; ++i) + probs[i] /= z; + return z; + } + + void randomise(double probs[]) + { + double z = 0; + for (int i = 0; i < probs.length; ++i) + { + probs[i] = 10 + rng.nextDouble(); + z += probs[i]; + } + + for (int i = 0; i < probs.length; ++i) + probs[i] /= z; + } + + static int argmax(double probs[]) + { + double m = Double.NEGATIVE_INFINITY; + int mi = -1; + for (int i = 0; i < probs.length; ++i) + { + if (probs[i] > m) + { + m = probs[i]; + mi = i; + } + } + return mi; + } + + double[] posterior(int phraseId, Context c) // unnormalised + { + double probs[] = new double[numTags]; + for (int t = 0; t < numTags; ++t) + { + probs[t] = prior[phraseId][t]; + for (int j = 0; j < c.tokens.length; ++j) + probs[t] *= emissions[j][t][c.tokens[j]]; + } + return probs; + } + + private void readTrainingFromFile(Reader in) throws IOException + { + // read in line-by-line + BufferedReader bin = new BufferedReader(in); + String line; + List<PhraseAndContexts> instances = new ArrayList<PhraseAndContexts>(); + Pattern separator = Pattern.compile(" \\|\\|\\| "); + + int numEdges = 0; + while ((line = bin.readLine()) != null) + { + // split into phrase and contexts + StringTokenizer st = new StringTokenizer(line, "\t"); + assert(st.hasMoreTokens()); + String phrase = st.nextToken(); + assert(st.hasMoreTokens()); + String rest = st.nextToken(); + assert(!st.hasMoreTokens()); + + // process phrase + st = new StringTokenizer(phrase, " "); + List<Integer> ptoks = new ArrayList<Integer>(); + while (st.hasMoreTokens()) + 
ptoks.add(tokenLexicon.insert(st.nextToken())); + + // process contexts + ArrayList<Context> contexts = new ArrayList<Context>(); + String[] parts = separator.split(rest); + assert(parts.length % 2 == 0); + for (int i = 0; i < parts.length; i += 2) + { + // process pairs of strings - context and count + ArrayList<Integer> ctx = new ArrayList<Integer>(); + String ctxString = parts[i]; + String countString = parts[i+1]; + StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); + while (ctxStrtok.hasMoreTokens()) + { + String token = ctxStrtok.nextToken(); + if (!token.equals("<PHRASE>")) + ctx.add(tokenLexicon.insert(token)); + } + + assert(countString.startsWith("C=")); + Context c = new Context(); + c.count = Integer.parseInt(countString.substring(2).trim()); + // damn unboxing doesn't work with toArray + c.tokens = new int[ctx.size()]; + for (int k = 0; k < ctx.size(); ++k) + c.tokens[k] = ctx.get(k); + contexts.add(c); + + numEdges += 1; + } + + // package up + PhraseAndContexts instance = new PhraseAndContexts(); + // damn unboxing doesn't work with toArray + instance.phraseTokens = new int[ptoks.size()]; + for (int k = 0; k < ptoks.size(); ++k) + instance.phraseTokens[k] = ptoks.get(k); + instance.contexts = contexts.toArray(new Context[] {}); + instances.add(instance); + } + + training = instances.toArray(new PhraseAndContexts[] {}); + + System.out.println("Read in " + training.length + " phrases and " + numEdges + " edges"); + } + + void displayPosterior() + { + for (int i = 0; i < training.length; ++i) + { + PhraseAndContexts instance = training[i]; + for (Context ctx: instance.contexts) + { + double probs[] = posterior(i, ctx); + double z = normalise(probs); + + // emit phrase + for (int t: instance.phraseTokens) + System.out.print(tokenLexicon.lookup(t) + " "); + System.out.print("\t"); + for (int c: ctx.tokens) + System.out.print(tokenLexicon.lookup(c) + " "); + System.out.print("||| C=" + ctx.count + " |||"); + + System.out.print(" " + argmax(probs)); + //for (int t = 0; t < numTags; ++t) + //System.out.print(" " + probs[t]); + System.out.println(); + } + } + } + + class PhraseAndContexts + { + int phraseTokens[]; + Context contexts[]; + } + + class Context + { + int count; + int[] tokens; + } + + public static void main(String []args) + { + assert(args.length >= 2); + try + { + PhraseContextModel model = new PhraseContextModel(new File(args[0]), Integer.parseInt(args[1])); + model.expectationMaximisation(Integer.parseInt(args[2])); + model.displayPosterior(); + } + catch (IOException e) + { + System.out.println("Failed to read input file: " + args[0]); + e.printStackTrace(); + } + } +} diff --git a/gi/posterior-regularisation/README b/gi/posterior-regularisation/README new file mode 100644 index 00000000..a3d54ffc --- /dev/null +++ b/gi/posterior-regularisation/README @@ -0,0 +1,3 @@ + 557 ./cdec_extools/extractor -i btec/split.zh-en.al -c 500000 -L 12 -C | sort -t $'\t' -k 1 | ./cdec_extools/mr_stripe_rule_reduce > btec.concordance + 559 wc -l btec.concordance + 588 cat btec.concordance | sed 's/.* //' | awk '{ for (i=1; i < NF; i++) { x=substr($i, 1, 2); if (x == "C=") printf "\n"; else if (x != "||") printf "%s ", $i; }; printf "\n"; }' | sort | uniq | wc -l diff --git a/gi/posterior-regularisation/alphabet.hh b/gi/posterior-regularisation/alphabet.hh new file mode 100644 index 00000000..1db928da --- /dev/null +++ b/gi/posterior-regularisation/alphabet.hh @@ -0,0 +1,61 @@ +#ifndef _alphabet_hh +#define _alphabet_hh + +#include <cassert> +#include <iosfwd> +#include 
<map> +#include <string> +#include <vector> + +// Alphabet: indexes a set of types +template <typename T> +class Alphabet: protected std::map<T, int> +{ +public: + Alphabet() {}; + + bool empty() const { return std::map<T,int>::empty(); } + int size() const { return std::map<T,int>::size(); } + + int operator[](const T &k) const + { + typename std::map<T,int>::const_iterator cit = find(k); + if (cit != std::map<T,int>::end()) + return cit->second; + else + return -1; + } + + int lookup(const T &k) const { return (*this)[k]; } + + int insert(const T &k) + { + int sz = size(); + assert((unsigned) sz == _items.size()); + + std::pair<typename std::map<T,int>::iterator, bool> + ins = std::map<T,int>::insert(make_pair(k, sz)); + + if (ins.second) + _items.push_back(k); + + return ins.first->second; + } + + const T &type(int i) const + { + assert(i >= 0); + assert(i < size()); + return _items[i]; + } + + std::ostream &display(std::ostream &out, int i) const + { + return out << type(i); + } + +private: + std::vector<T> _items; +}; + +#endif diff --git a/gi/posterior-regularisation/canned.concordance b/gi/posterior-regularisation/canned.concordance new file mode 100644 index 00000000..710973ff --- /dev/null +++ b/gi/posterior-regularisation/canned.concordance @@ -0,0 +1,4 @@ +a 0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 ||| 2 2 <PHRASE> 2 2 ||| C=1 +b 0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 +c 2 2 <PHRASE> 2 2 ||| C=1 ||| 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1 +d 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1 diff --git a/gi/posterior-regularisation/em.cc b/gi/posterior-regularisation/em.cc new file mode 100644 index 00000000..f6c9fd68 --- /dev/null +++ b/gi/posterior-regularisation/em.cc @@ -0,0 +1,830 @@ +// Input of the form: +// " the phantom of the opera " tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1 +// phrase TAB [context]+ +// where context = phrase ||| C=... 
which are separated by ||| + +// Model parameterised as follows: +// - each phrase, p, is allocated a latent state, t +// - this is used to generate the contexts, c +// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR + +// Training with EM: +// - e-step is estimating P(t|p,c) for all x,c +// - m-step is estimating model parameters P(p,c,t) = P(t) P(p|t) P(c|t) + +// Sexing it up: +// - constrain the posteriors P(t|c) and P(t|p) to have few high-magnitude entries +// - improve the generation of phrase internals, e.g., generate edge words from +// different distribution to central words + +#include "alphabet.hh" +#include "log_add.hh" +#include <algorithm> +#include <fstream> +#include <iostream> +#include <iterator> +#include <map> +#include <sstream> +#include <stdexcept> +#include <vector> +#include <tr1/random> +#include <tr1/tuple> +#include <nlopt.h> + +using namespace std; +using namespace std::tr1; + +const int numTags = 5; +const int numIterations = 100; +const bool posterior_regularisation = true; +const double PHRASE_VIOLATION_WEIGHT = 10; +const double CONTEXT_VIOLATION_WEIGHT = 0; +const bool includePhraseProb = false; + +// Data structures: +Alphabet<string> lexicon; +typedef vector<int> Phrase; +typedef tuple<int, int, int, int> Context; +Alphabet<Phrase> phrases; +Alphabet<Context> contexts; + +typedef map<int, int> ContextCounts; +typedef map<int, int> PhraseCounts; +typedef map<int, ContextCounts> PhraseToContextCounts; +typedef map<int, PhraseCounts> ContextToPhraseCounts; + +PhraseToContextCounts concordancePhraseToContexts; +ContextToPhraseCounts concordanceContextToPhrases; + +typedef vector<double> Dist; +typedef vector<Dist> ConditionalDist; +Dist prior; // class -> P(class) +vector<ConditionalDist> probCtx; // word -> class -> P(word | class), for each position of context word +ConditionalDist probPhrase; // class -> P(word | class) +Dist probPhraseLength; // class -> P(length | class) expressed as geometric distribution parameter + +mt19937 randomGenerator((size_t) time(NULL)); +uniform_real<double> uniDist(0.0, 1e-1); +variate_generator< mt19937, uniform_real<double> > rng(randomGenerator, uniDist); + +void addRandomNoise(Dist &d); +void normalise(Dist &d); +void addTo(Dist &d, const Dist &e); +int argmax(const Dist &d); + +map<Phrase, map<Context, int> > lambda_indices; + +Dist conditional_probs(const Phrase &phrase, const Context &context, double *normalisation = 0); +template <typename T> +Dist +penalised_conditionals(const Phrase &phrase, const Context &context, + const T &lambda, double *normalisation); +//Dist penalised_conditionals(const Phrase &phrase, const Context &context, const double *lambda, double *normalisation = 0); +double penalised_log_likelihood(int n, const double *lambda, double *gradient, void *data); +void optimise_lambda(double delta, double gamma, vector<double> &lambda); +double expected_violation_phrases(const double *lambda); +double expected_violation_contexts(const double *lambda); +double primal_kl_divergence(const double *lambda); +double dual(const double *lambda); +void print_primal_dual(const double *lambda, double delta, double gamma); + +ostream &operator<<(ostream &, const Phrase &); +ostream &operator<<(ostream &, const Context &); +ostream &operator<<(ostream &, const Dist &); +ostream &operator<<(ostream &, const ConditionalDist &); + +int +main(int argc, char *argv[]) +{ + randomGenerator.seed(time(NULL)); + + int edges = 0; + istream &input = cin; + while (input.good()) + { 
+ // read the phrase + string phraseString; + Phrase phrase; + getline(input, phraseString, '\t'); + istringstream pinput(phraseString); + string token; + while (pinput >> token) + phrase.push_back(lexicon.insert(token)); + int phraseId = phrases.insert(phrase); + + // read the rest, storing each context + string remainder; + getline(input, remainder, '\n'); + istringstream rinput(remainder); + Context context(-1, -1, -1, -1); + int index = 0; + while (rinput >> token) + { + if (token != "|||" && token != "<PHRASE>") + { + if (index < 4) + { + // eugh! damn templates + switch (index) + { + case 0: get<0>(context) = lexicon.insert(token); break; + case 1: get<1>(context) = lexicon.insert(token); break; + case 2: get<2>(context) = lexicon.insert(token); break; + case 3: get<3>(context) = lexicon.insert(token); break; + default: assert(false); + } + index += 1; + } + else if (token.find("C=") == 0) + { + int contextId = contexts.insert(context); + int count = atoi(token.substr(strlen("C=")).c_str()); + concordancePhraseToContexts[phraseId][contextId] += count; + concordanceContextToPhrases[contextId][phraseId] += count; + index = 0; + context = Context(-1, -1, -1, -1); + edges += 1; + } + } + } + + // trigger EOF + input >> ws; + } + + cout << "Read in " << phrases.size() << " phrases" + << " and " << contexts.size() << " contexts" + << " and " << edges << " edges" + << " and " << lexicon.size() << " word types\n"; + + // FIXME: filter out low count phrases and low count contexts (based on individual words?) + // now populate model parameters with uniform + random noise + prior.resize(numTags, 1.0); + addRandomNoise(prior); + normalise(prior); + + probCtx.resize(4, ConditionalDist(numTags, Dist(lexicon.size(), 1.0))); + if (includePhraseProb) + probPhrase.resize(numTags, Dist(lexicon.size(), 1.0)); + for (int t = 0; t < numTags; ++t) + { + for (int j = 0; j < 4; ++j) + { + addRandomNoise(probCtx[j][t]); + normalise(probCtx[j][t]); + } + if (includePhraseProb) + { + addRandomNoise(probPhrase[t]); + normalise(probPhrase[t]); + } + } + if (includePhraseProb) + { + probPhraseLength.resize(numTags, 0.5); // geometric distribution p=0.5 + addRandomNoise(probPhraseLength); + } + + cout << "\tprior: " << prior << "\n"; + //cout << "\tcontext: " << probCtx << "\n"; + //cout << "\tphrase: " << probPhrase << "\n"; + //cout << "\tphraseLen: " << probPhraseLength << endl; + + vector<double> lambda; + + // now do EM training + for (int iteration = 0; iteration < numIterations; ++iteration) + { + cout << "EM iteration " << iteration << endl; + + if (posterior_regularisation) + optimise_lambda(PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT, lambda); + //cout << "\tlambda " << lambda << endl; + + Dist countsPrior(numTags, 0.0); + vector<ConditionalDist> countsCtx(4, ConditionalDist(numTags, Dist(lexicon.size(), 1e-10))); + ConditionalDist countsPhrase(numTags, Dist(lexicon.size(), 1e-10)); + Dist countsPhraseLength(numTags, 0.0); + Dist nPhrases(numTags, 0.0); + + double llh = 0; + for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); + pcit != concordancePhraseToContexts.end(); ++pcit) + { + const Phrase &phrase = phrases.type(pcit->first); + + // e-step: estimate latent class probs; compile (class,word) stats for m-step + for (ContextCounts::iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + + double z = 0; + Dist tagCounts; + if (!posterior_regularisation) + tagCounts = 
conditional_probs(phrase, context, &z); + else + tagCounts = penalised_conditionals(phrase, context, lambda, &z); + + llh += log(z) * ccit->second; + addTo(countsPrior, tagCounts); // FIXME: times ccit->secon + + for (int t = 0; t < numTags; ++t) + { + for (int j = 0; j < 4; ++j) + countsCtx[j][t][get<0>(context)] += tagCounts[t] * ccit->second; + + if (includePhraseProb) + { + for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) + countsPhrase[t][*pit] += tagCounts[t] * ccit->second; + countsPhraseLength[t] += phrase.size() * tagCounts[t] * ccit->second; + nPhrases[t] += tagCounts[t] * ccit->second; + } + } + } + } + + cout << "M-step\n"; + + // m-step: normalise prior and (class,word) stats and assign to model parameters + normalise(countsPrior); + prior = countsPrior; + for (int t = 0; t < numTags; ++t) + { + //cout << "\t\tt " << t << " prior " << countsPrior[t] << "\n"; + for (int j = 0; j < 4; ++j) + normalise(countsCtx[j][t]); + if (includePhraseProb) + { + normalise(countsPhrase[t]); + countsPhraseLength[t] = nPhrases[t] / countsPhraseLength[t]; + } + } + probCtx = countsCtx; + if (includePhraseProb) + { + probPhrase = countsPhrase; + probPhraseLength = countsPhraseLength; + } + + double *larray = new double[lambda.size()]; + copy(lambda.begin(), lambda.end(), larray); + print_primal_dual(larray, PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT); + delete [] larray; + + //cout << "\tllh " << llh << endl; + //cout << "\tprior: " << prior << "\n"; + //cout << "\tcontext: " << probCtx << "\n"; + //cout << "\tphrase: " << probPhrase << "\n"; + //cout << "\tphraseLen: " << probPhraseLength << "\n"; + } + + // output class membership + for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); + pcit != concordancePhraseToContexts.end(); ++pcit) + { + const Phrase &phrase = phrases.type(pcit->first); + for (ContextCounts::iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + Dist tagCounts = conditional_probs(phrase, context, 0); + cout << phrase << " ||| " << context << " ||| " << argmax(tagCounts) << "\n"; + } + } + + return 0; +} + +void addRandomNoise(Dist &d) +{ + for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) + *dit += rng(); +} + +void normalise(Dist &d) +{ + double z = 0; + for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) + z += *dit; + for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) + *dit /= z; +} + +void addTo(Dist &d, const Dist &e) +{ + assert(d.size() == e.size()); + for (int i = 0; i < (int) d.size(); ++i) + d[i] += e[i]; +} + +int argmax(const Dist &d) +{ + double best = d[0]; + int index = 0; + for (int i = 1; i < (int) d.size(); ++i) + { + if (d[i] > best) + { + best = d[i]; + index = i; + } + } + return index; +} + +ostream &operator<<(ostream &out, const Phrase &phrase) +{ + for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) + lexicon.display(((pit == phrase.begin()) ? out : out << " "), *pit); + return out; +} + +ostream &operator<<(ostream &out, const Context &context) +{ + lexicon.display(out, get<0>(context)); + lexicon.display(out << " ", get<1>(context)); + lexicon.display(out << " <PHRASE> ", get<2>(context)); + lexicon.display(out << " ", get<3>(context)); + return out; +} + +ostream &operator<<(ostream &out, const Dist &dist) +{ + for (Dist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) + out << ((dit == dist.begin()) ? 
"" : " ") << *dit; + return out; +} + +ostream &operator<<(ostream &out, const ConditionalDist &dist) +{ + for (ConditionalDist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) + out << ((dit == dist.begin()) ? "" : "; ") << *dit; + return out; +} + +// FIXME: slow - just use the phrase index, context index to do the mapping +// (n.b. it's a sparse setup, not just equal to 3d array index) +int +lambda_index(const Phrase &phrase, const Context &context, int tag) +{ + return lambda_indices[phrase][context] + tag; +} + +template <typename T> +Dist +penalised_conditionals(const Phrase &phrase, const Context &context, + const T &lambda, double *normalisation) +{ + Dist d = conditional_probs(phrase, context, 0); + + double z = 0; + for (int t = 0; t < numTags; ++t) + { + d[t] *= exp(-lambda[lambda_index(phrase, context, t)]); + z += d[t]; + } + + if (normalisation) + *normalisation = z; + + for (int t = 0; t < numTags; ++t) + d[t] /= z; + + return d; +} + +Dist +conditional_probs(const Phrase &phrase, const Context &context, double *normalisation) +{ + Dist tagCounts(numTags, 0.0); + double z = 0; + for (int t = 0; t < numTags; ++t) + { + double prob = prior[t]; + prob *= (probCtx[0][t][get<0>(context)] * probCtx[1][t][get<1>(context)] * + probCtx[2][t][get<2>(context)] * probCtx[3][t][get<3>(context)]); + + if (includePhraseProb) + { + prob *= pow(1 - probPhraseLength[t], phrase.size() - 1) * probPhraseLength[t]; + for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) + prob *= probPhrase[t][*pit]; + } + + tagCounts[t] = prob; + z += prob; + } + if (normalisation) + *normalisation = z; + + for (int t = 0; t < numTags; ++t) + tagCounts[t] /= z; + + return tagCounts; +} + +double +penalised_log_likelihood(int n, const double *lambda, double *grad, void *) +{ + // return log Z(lambda, theta) over the corpus + // where theta are the global parameters (prior, probCtx*, probPhrase*) + // and lambda are lagrange multipliers for the posterior sparsity constraints + // + // this is formulated as: + // f = log Z(lambda) = sum_i log ( sum_i p_theta(t_i|p_i,c_i) exp [-lambda_{t_i,p_i,c_i}] ) + // where i indexes the training examples - specifying the (p, c) pair (which may occur with count > 1) + // + // with derivative: + // f'_{tpc} = frac { - count(t,p,c) p_theta(t|p,c) exp (-lambda_{t,p,c}) } + // { sum_t' p_theta(t'|p,c) exp (-lambda_{t',p,c}) } + + //cout << "penalised_log_likelihood with lambda "; + //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); + //cout << "\n"; + + double f = 0; + if (grad) + { + for (int i = 0; i < n; ++i) + grad[i] = 0.0; + } + + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + double z = 0; + Dist scores = penalised_conditionals(phrase, context, lambda, &z); + + f += ccit->second * log(z); + //cout << "\tphrase: " << phrase << " context: " << context << " count: " << ccit->second << " z " << z << endl; + //cout << "\t\tscores: " << scores << "\n"; + + if (grad) + { + for (int t = 0; t < numTags; ++t) + { + int i = lambda_index(phrase, context, t); // FIXME: redundant lookups + assert(grad[i] == 0.0); + grad[i] = - ccit->second * scores[t]; + } + } + } + } + + //cout << "penalised_log_likelihood returning " << f; + //if (grad) + //{ + //cout << 
"\ngradient: "; + //copy(grad, grad+n, ostream_iterator<double>(cout, " ")); + //} + //cout << "\n"; + + return f; +} + +typedef struct +{ + // one of p or c should be set to -1, in which case it will be marginalised out + // i.e. sum_p' lambda_{p'ct} <= threshold + // or sum_c' lambda_{pc't} <= threshold + int p, c, t, threshold; +} constraint_data; + +double +constraint_and_gradient(int n, const double *lambda, double *grad, void *data) +{ + constraint_data *d = (constraint_data *) data; + assert(d->t >= 0); + assert(d->threshold >= 0); + + //cout << "constraint_and_gradient: t " << d->t << " p " << d->p << " c " << d->c << " tau " << d->threshold << endl; + //cout << "\tlambda "; + //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); + //cout << "\n"; + + // FIXME: it's crazy to use a dense gradient here => will only have a handful of non-zero entries + if (grad) + { + for (int i = 0; i < n; ++i) + grad[i] = 0.0; + } + + //cout << "constraint_and_gradient: " << d->p << "; " << d->c << "; " << d->t << "; " << d->threshold << endl; + + if (d->p >= 0) + { + assert(d->c < 0); + // sum_c lambda_pct <= delta [a.k.a. threshold] + // => sum_c lambda_pct - delta <= 0 + // derivative_pct = { 1, if p and t match; 0, otherwise } + + double val = -d->threshold; + + const Phrase &phrase = phrases.type(d->p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(d->p); + assert(pcit != concordancePhraseToContexts.end()); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + int i = lambda_index(phrase, context, d->t); + val += lambda[i]; + if (grad) grad[i] = 1; + } + //cout << "\treturning " << val << endl; + + return val; + } + else + { + assert(d->c >= 0); + assert(d->p < 0); + // sum_p lambda_pct <= gamma [a.k.a. threshold] + // => sum_p lambda_pct - gamma <= 0 + // derivative_pct = { 1, if c and t match; 0, otherwise } + + double val = -d->threshold; + + const Context &context = contexts.type(d->c); + ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(d->c); + assert(cpit != concordanceContextToPhrases.end()); + for (PhraseCounts::iterator pcit = cpit->second.begin(); + pcit != cpit->second.end(); ++pcit) + { + const Phrase &phrase = phrases.type(pcit->first); + int i = lambda_index(phrase, context, d->t); + val += lambda[i]; + if (grad) grad[i] = 1; + } + //cout << "\treturning " << val << endl; + + return val; + } +} + +void +optimise_lambda(double delta, double gamma, vector<double> &lambdav) +{ + int num_lambdas = lambdav.size(); + if (lambda_indices.empty() || lambdav.empty()) + { + lambda_indices.clear(); + lambdav.clear(); + + int i = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + lambda_indices[phrase][context] = i; + i += numTags; + } + } + num_lambdas = i; + lambdav.resize(num_lambdas); + } + //cout << "optimise_lambda: #langrange multipliers " << num_lambdas << endl; + + // FIXME: better to work with an implicit representation to save memory usage + int num_constraints = (((delta > 0) ? phrases.size() : 0) + ((gamma > 0) ? 
contexts.size() : 0)) * numTags; + //cout << "optimise_lambda: #constraints " << num_constraints << endl; + constraint_data *data = new constraint_data[num_constraints]; + int i = 0; + if (delta > 0) + { + for (int p = 0; p < phrases.size(); ++p) + { + for (int t = 0; t < numTags; ++t, ++i) + { + constraint_data &d = data[i]; + d.p = p; + d.c = -1; + d.t = t; + d.threshold = delta; + } + } + } + + if (gamma > 0) + { + for (int c = 0; c < contexts.size(); ++c) + { + for (int t = 0; t < numTags; ++t, ++i) + { + constraint_data &d = data[i]; + d.p = -1; + d.c = c; + d.t = t; + d.threshold = gamma; + } + } + } + assert(i == num_constraints); + + double lambda[num_lambdas]; + double lb[num_lambdas], ub[num_lambdas]; + for (i = 0; i < num_lambdas; ++i) + { + lambda[i] = lambdav[i]; // starting value + lb[i] = 0; // lower bound + if (delta <= 0) // upper bound + ub[i] = gamma; + else if (gamma <= 0) + ub[i] = delta; + else + assert(false); + } + + //print_primal_dual(lambda, delta, gamma); + + double minf; + int error_code = nlopt_minimize_constrained(NLOPT_LN_COBYLA, num_lambdas, penalised_log_likelihood, NULL, + num_constraints, constraint_and_gradient, data, sizeof(constraint_data), + lb, ub, lambda, &minf, -HUGE_VAL, 0.0, 0.0, 1e-4, NULL, 0, 0.0); + //cout << "optimise error code " << error_code << endl; + + //print_primal_dual(lambda, delta, gamma); + + delete [] data; + + if (error_code < 0) + cout << "WARNING: optimisation failed with error code: " << error_code << endl; + //else + //{ + //cout << "success; minf " << minf << endl; + //print_primal_dual(lambda, delta, gamma); + //} + + lambdav = vector<double>(&lambda[0], &lambda[0] + num_lambdas); +} + +// FIXME: inefficient - cache the scores +double +expected_violation_phrases(const double *lambda) +{ + // sum_pt max_c E_q[phi_pct] + double violation = 0; + + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + + for (int t = 0; t < numTags; ++t) + { + double best = 0; + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + Dist scores = penalised_conditionals(phrase, context, lambda, 0); + best = max(best, scores[t]); + } + violation += best; + } + } + + return violation; +} + +// FIXME: inefficient - cache the scores +double +expected_violation_contexts(const double *lambda) +{ + // sum_ct max_p E_q[phi_pct] + double violation = 0; + + for (int c = 0; c < contexts.size(); ++c) + { + const Context &context = contexts.type(c); + ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(c); + + for (int t = 0; t < numTags; ++t) + { + double best = 0; + for (PhraseCounts::iterator pit = cpit->second.begin(); + pit != cpit->second.end(); ++pit) + { + const Phrase &phrase = phrases.type(pit->first); + Dist scores = penalised_conditionals(phrase, context, lambda, 0); + best = max(best, scores[t]); + } + violation += best; + } + } + + return violation; +} + +// FIXME: possibly inefficient +double +primal_likelihood() // FIXME: primal evaluation needs to use lambda and calculate l1linf terms +{ + double llh = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context 
&context = contexts.type(ccit->first); + double z = 0; + Dist scores = conditional_probs(phrase, context, &z); + llh += ccit->second * log(z); + } + } + return llh; +} + +// FIXME: inefficient - cache the scores +double +primal_kl_divergence(const double *lambda) +{ + // return KL(q || p) = sum_y q(y) { log q(y) - log p(y | x) } + // = sum_y q(y) { log p(y | x) - lambda . phi(x, y) - log Z - log p(y | x) } + // = sum_y q(y) { - lambda . phi(x, y) } - log Z + // and q(y) factors with each edge, ditto for Z + + double feature_sum = 0, log_z = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + + double local_z = 0; + double local_f = 0; + Dist d = conditional_probs(phrase, context, 0); + for (int t = 0; t < numTags; ++t) + { + int i = lambda_index(phrase, context, t); + double s = d[t] * exp(-lambda[i]); + local_f += lambda[i] * s; + local_z += s; + } + + log_z += ccit->second * log(local_z); + feature_sum += ccit->second * (local_f / local_z); + } + } + + return -feature_sum - log_z; +} + +// FIXME: inefficient - cache the scores +double +dual(const double *lambda) +{ + // return log(Z) = - log { sum_y p(y | x) exp( - lambda . phi(x, y) } + // n.b. have flipped the sign as we're minimising + + double z = 0; + for (int p = 0; p < phrases.size(); ++p) + { + const Phrase &phrase = phrases.type(p); + PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + for (ContextCounts::const_iterator ccit = pcit->second.begin(); + ccit != pcit->second.end(); ++ccit) + { + const Context &context = contexts.type(ccit->first); + double lz = 0; + Dist scores = penalised_conditionals(phrase, context, lambda, &z); + z += lz * ccit->second; + } + } + return log(z); +} + +void +print_primal_dual(const double *lambda, double delta, double gamma) +{ + double likelihood = primal_likelihood(); + double kl = primal_kl_divergence(lambda); + double sum_pt = expected_violation_phrases(lambda); + double sum_ct = expected_violation_contexts(lambda); + //double d = dual(lambda); + + cout << "\tllh=" << likelihood + << " kl=" << kl + << " violations phrases=" << sum_pt + << " contexts=" << sum_ct + //<< " primal=" << (kl + delta * sum_pt + gamma * sum_ct) + //<< " dual=" << d + << " objective=" << (likelihood - kl + delta * sum_pt + gamma * sum_ct) + << endl; +} diff --git a/gi/posterior-regularisation/invert.hh b/gi/posterior-regularisation/invert.hh new file mode 100644 index 00000000..d06356e9 --- /dev/null +++ b/gi/posterior-regularisation/invert.hh @@ -0,0 +1,45 @@ +// The following code inverts the matrix input using LU-decomposition with +// backsubstitution of unit vectors. Reference: Numerical Recipies in C, 2nd +// ed., by Press, Teukolsky, Vetterling & Flannery. +// Code written by Fredrik Orderud. 
+// http://www.crystalclearsoftware.com/cgi-bin/boost_wiki/wiki.pl?LU_Matrix_Inversion + +#ifndef INVERT_MATRIX_HPP +#define INVERT_MATRIX_HPP + +// REMEMBER to update "lu.hpp" header includes from boost-CVS +#include <boost/numeric/ublas/vector.hpp> +#include <boost/numeric/ublas/vector_proxy.hpp> +#include <boost/numeric/ublas/matrix.hpp> +#include <boost/numeric/ublas/triangular.hpp> +#include <boost/numeric/ublas/lu.hpp> +#include <boost/numeric/ublas/io.hpp> + +namespace ublas = boost::numeric::ublas; + +/* Matrix inversion routine. + Uses lu_factorize and lu_substitute in uBLAS to invert a matrix */ +template<class T> +bool invert_matrix(const ublas::matrix<T>& input, ublas::matrix<T>& inverse) +{ + using namespace boost::numeric::ublas; + typedef permutation_matrix<std::size_t> pmatrix; + // create a working copy of the input + matrix<T> A(input); + // create a permutation matrix for the LU-factorization + pmatrix pm(A.size1()); + + // perform LU-factorization + int res = lu_factorize(A,pm); + if( res != 0 ) return false; + + // create identity matrix of "inverse" + inverse.assign(ublas::identity_matrix<T>(A.size1())); + + // backsubstitute to get the inverse + lu_substitute(A, pm, inverse); + + return true; +} + +#endif //INVERT_MATRIX_HPP diff --git a/gi/posterior-regularisation/log_add.hh b/gi/posterior-regularisation/log_add.hh new file mode 100644 index 00000000..e0620c5a --- /dev/null +++ b/gi/posterior-regularisation/log_add.hh @@ -0,0 +1,30 @@ +#ifndef log_add_hh +#define log_add_hh + +#include <limits> +#include <iostream> +#include <cassert> +#include <cmath> + +template <typename T> +struct Log +{ + static T zero() { return -std::numeric_limits<T>::infinity(); } + + static T add(T l1, T l2) + { + if (l1 == zero()) return l2; + if (l1 > l2) + return l1 + std::log(1 + exp(l2 - l1)); + else + return l2 + std::log(1 + exp(l1 - l2)); + } + + static T subtract(T l1, T l2) + { + //std::assert(l1 >= l2); + return l1 + log(1 - exp(l2 - l1)); + } +}; + +#endif diff --git a/gi/posterior-regularisation/projected_gradient.cc b/gi/posterior-regularisation/projected_gradient.cc new file mode 100644 index 00000000..f7c39817 --- /dev/null +++ b/gi/posterior-regularisation/projected_gradient.cc @@ -0,0 +1,87 @@ +// +// Minimises given functional using the projected gradient method. Based on +// algorithm and demonstration example in Linear and Nonlinear Programming, +// Luenberger and Ye, 3rd ed., p 370. 
+// + +#include "invert.hh" +#include <iostream> + +using namespace std; + +double +f(double x1, double x2, double x3, double x4) +{ + return x1 * x1 + x2 * x2 + x3 * x3 + x4 * x4 - 2 * x1 - 3 * x4; +} + +ublas::vector<double> +g(double x1, double x2, double x3, double x4) +{ + ublas::vector<double> v(4); + v(0) = 2 * x1 - 2; + v(1) = 2 * x2; + v(2) = 2 * x3; + v(3) = 2 * x4 - 3; + return v; +} + +ublas::matrix<double> +activeConstraints(double x1, double x2, double x3, double x4) +{ + int n = 2; + if (x1 == 0) ++n; + if (x2 == 0) ++n; + if (x3 == 0) ++n; + if (x4 == 0) ++n; + + ublas::matrix<double> a(n,4); + a(0, 0) = 2; a(0, 1) = 1; a(0, 2) = 1; a(0, 3) = 4; + a(1, 0) = 1; a(1, 1) = 1; a(1, 2) = 2; a(1, 3) = 1; + + int c = 2; + if (x1 == 0) a(c++, 0) = 1; + if (x2 == 0) a(c++, 1) = 1; + if (x3 == 0) a(c++, 2) = 1; + if (x4 == 0) a(c++, 3) = 1; + + return a; +} + +ublas::matrix<double> +projection(const ublas::matrix<double> &a) +{ + ublas::matrix<double> aT = ublas::trans(a); + ublas::matrix<double> inv(a.size1(), a.size1()); + bool ok = invert_matrix(ublas::matrix<double>(ublas::prod(a, aT)), inv); + assert(ok && "Failed to invert matrix"); + return ublas::identity_matrix<double>(4) - + ublas::prod(aT, ublas::matrix<double>(ublas::prod(inv, a))); +} + +int main(int argc, char *argv[]) +{ + double x1 = 2, x2 = 2, x3 = 1, x4 = 0; + + double fval = f(x1, x2, x3, x4); + cout << "f = " << fval << endl; + ublas::vector<double> grad = g(x1, x2, x3, x4); + cout << "g = " << grad << endl; + ublas::matrix<double> A = activeConstraints(x1, x2, x3, x4); + cout << "A = " << A << endl; + ublas::matrix<double> P = projection(A); + cout << "P = " << P << endl; + // the direction of movement + ublas::vector<double> d = prod(P, grad); + cout << "d = " << (d / d(0)) << endl; + + // special case for d = 0 + + // next solve for limits on the line search + + // then use golden rule technique between these values (if bounded) + + // or simple Armijo's rule technique + + return 0; +} diff --git a/gi/posterior-regularisation/train_pr_global.py b/gi/posterior-regularisation/train_pr_global.py new file mode 100644 index 00000000..467069ef --- /dev/null +++ b/gi/posterior-regularisation/train_pr_global.py @@ -0,0 +1,272 @@ +import sys +import scipy.optimize +from numpy import * +from numpy.random import random + +# +# Step 1: load the concordance counts +# + +edges_phrase_to_context = {} +edges_context_to_phrase = {} +types = {} +num_edges = 0 + +for line in sys.stdin: + phrase, rest = line.strip().split('\t') + parts = rest.split('|||') + for i in range(0, len(parts), 2): + context, count = parts[i:i+2] + + ctx = tuple(filter(lambda x: x != '<PHRASE>', context.split())) + cnt = int(count.strip()[2:]) + ccs = edges_phrase_to_context.setdefault(phrase, {}) + ccs[ctx] = cnt + pcs = edges_context_to_phrase.setdefault(ctx, {}) + pcs[phrase] = cnt + + for token in ctx: + types.setdefault(token, len(types)) + for token in phrase.split(): + types.setdefault(token, len(types)) + + num_edges += 1 + +print 'Read in', num_edges, 'edges and', len(types), 'word types' + +# +# Step 2: initialise the model parameters +# + +num_tags = 5 +num_types = len(types) +delta = int(sys.argv[1]) +gamma = int(sys.argv[2]) + +def normalise(a): + return a / sum(a) + +# Pr(tag) +tagDist = normalise(random(num_tags)+1) +# Pr(context at pos i = w | tag) indexed by i, tag, word +contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)] +# PR langrange multipliers +lamba = zeros(num_edges * num_tags) 
+lamba_index = {} +next = 0 +for phrase, ccs in edges_phrase_to_context.items(): + for context in ccs.keys(): + lamba_index[phrase,context] = next + next += num_tags + +# +# Step 3: expectation maximisation +# + +for iteration in range(20): + tagCounts = zeros(num_tags) + contextWordCounts = [[zeros(num_types) for t in range(num_tags)] for i in range(4)] + + #print 'tagDist', tagDist + #print 'contextWordCounts[0][0]', contextWordCounts[0][0] + + # Tune lambda + # dual: min log Z(lamba) s.t. lamba >= 0; + # sum_c lamba_pct <= delta; sum_p lamba_pct <= gamma + def dual(ls): + logz = 0 + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + local_z = 0 + for t in range(num_tags): + local_z += conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) + logz += log(local_z) * count + + #print 'ls', ls + #print 'lambda', list(ls) + #print 'dual', logz + return logz + + def primal(ls): + # FIXME: returns negative values for KL (impossible) + logz = dual(ls) + kl = -logz + + expectations = zeros(lamba.shape) + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + scores = zeros(num_tags) + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) + local_z = sum(scores) + + for t in range(num_tags): + li = lamba_index[phrase,context] + t + expectations[li] = scores[t] / local_z * count + kl -= expectations[li] * ls[li] + + pt_l1linf = 0 + for phrase, ccs in edges_phrase_to_context.items(): + for t in range(num_tags): + best = -1e500 + for context, count in ccs.items(): + li = lamba_index[phrase,context] + t + s = expectations[li] + if s > best: best = s + pt_l1linf += best + + ct_l1linf = 0 + for context, pcs in edges_context_to_phrase.items(): + for t in range(num_tags): + best = -1e500 + for phrase, count in pcs.items(): + li = lamba_index[phrase,context] + t + s = expectations[li] + if s > best: best = s + ct_l1linf += best + + return kl, pt_l1linf, ct_l1linf, kl + delta * pt_l1linf + gamma * ct_l1linf + + def dual_deriv(ls): + # d/dl log(z) = E_q[phi] + deriv = zeros(num_edges * num_tags) + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + scores = zeros(num_tags) + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) + local_z = sum(scores) + + for t in range(num_tags): + deriv[lamba_index[phrase,context] + t] -= count * scores[t] / local_z + + #print 'ddual', deriv + return deriv + + def constraints(ls): + cons = [] + if delta > 0: + for phrase, ccs in edges_phrase_to_context.items(): + for t in range(num_tags): + total = delta + for cprime in ccs.keys(): + total -= ls[lamba_index[phrase, cprime] + t] + cons.append(total) + + if gamma > 0: + for context, pcs in edges_context_to_phrase.items(): + for t in range(num_tags): + 
total = gamma + for pprime in pcs.keys(): + total -= ls[lamba_index[pprime, context] + t] + cons.append(total) + #print 'cons', cons + return cons + + def constraints_deriv(ls): + cons = [] + if delta > 0: + for phrase, ccs in edges_phrase_to_context.items(): + for t in range(num_tags): + d = zeros(num_edges * num_tags) + for cprime in ccs.keys(): + d[lamba_index[phrase, cprime] + t] = -1 + cons.append(d) + + if gamma > 0: + for context, pcs in edges_context_to_phrase.items(): + for t in range(num_tags): + d = zeros(num_edges * num_tags) + for pprime in pcs.keys(): + d[lamba_index[pprime, context] + t] = -1 + cons.append(d) + #print 'dcons', cons + return cons + + print 'Pre lambda optimisation dual', dual(lamba), 'primal', primal(lamba) + lamba = scipy.optimize.fmin_slsqp(dual, lamba, + bounds=[(0, delta)] * (num_edges * num_tags), + f_ieqcons=constraints, + fprime=dual_deriv, + fprime_ieqcons=constraints_deriv, + iprint=0) + print 'Post lambda optimisation dual', dual(lamba), 'primal', primal(lamba) + + # E-step + llh = z = 0 + for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + llh += log(cz) * count + + scores = zeros(num_tags) + li = lamba_index[phrase, context] + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-lamba[li + t]) + z += count * sum(scores) + + tagCounts += count * scores + for i in range(4): + for t in range(num_tags): + contextWordCounts[i][t][types[context[i]]] += count * scores[t] + + print 'iteration', iteration, 'llh', llh, 'logz', log(z) + + # M-step + tagDist = normalise(tagCounts) + for i in range(4): + for t in range(num_tags): + contextWordDist[i][t] = normalise(contextWordCounts[i][t]) + + +for phrase, ccs in edges_phrase_to_context.items(): + for context, count in ccs.items(): + conditionals = zeros(num_tags) + for t in range(num_tags): + prob = tagDist[t] + for i in range(4): + prob *= contextWordDist[i][t][types[context[i]]] + conditionals[t] = prob + cz = sum(conditionals) + conditionals /= cz + + scores = zeros(num_tags) + li = lamba_index[phrase, context] + for t in range(num_tags): + scores[t] = conditionals[t] * exp(-lamba[li + t]) + + print '%s\t%s ||| C=%d ||| %d |||' % (phrase, context, count, argmax(scores)), scores / sum(scores) |
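The posterior-regularised E-step in the files above replaces p(t|p,c) with q(t|p,c) proportional to p(t|p,c) exp(-lambda_ptc), and the multipliers are fit by minimising the dual sum over edges of count * log sum_t p(t|p,c) exp(-lambda_t), subject to the per-phrase (delta) and per-context (gamma) sum constraints. A compressed per-edge sketch of that reweighting, reusing the hypothetical `posterior` helper sketched after the diffstat (illustration only, not part of the commit):

    from math import exp, log

    # Sketch only: one edge's contribution to the PR dual and its penalised posterior,
    # matching the structure of dual() / primal() in train_pr_global.py above.
    def edge_dual_and_q(conditionals, lam, count):
        # conditionals[t] = p(t | p, c); lam[t] = lambdas for this (phrase, context) edge
        scores = [p * exp(-l) for p, l in zip(conditionals, lam)]
        local_z = sum(scores)
        q = [s / local_z for s in scores]     # q(t | p, c), used in place of p(t | p, c)
        return count * log(local_z), q        # dual contribution, penalised posterior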