Diffstat (limited to 'gi/posterior-regularisation')
 -rw-r--r--  gi/posterior-regularisation/PhraseContextModel.java  | 317
 -rw-r--r--  gi/posterior-regularisation/README                   |   3
 -rw-r--r--  gi/posterior-regularisation/alphabet.hh              |  61
 -rw-r--r--  gi/posterior-regularisation/canned.concordance       |   4
 -rw-r--r--  gi/posterior-regularisation/em.cc                    | 830
 -rw-r--r--  gi/posterior-regularisation/invert.hh                |  45
 -rw-r--r--  gi/posterior-regularisation/log_add.hh               |  30
 -rw-r--r--  gi/posterior-regularisation/projected_gradient.cc    |  87
 -rw-r--r--  gi/posterior-regularisation/train_pr_global.py       | 272
9 files changed, 1649 insertions(+), 0 deletions(-)
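All of the files below consume the same concordance format, described in the header comments of PhraseContextModel.java and em.cc: one phrase per line, a tab, then alternating context strings and counts separated by |||, with the phrase's own position marked by the <PHRASE> placeholder. As a rough orientation, here is a minimal Python sketch of that parse; it mirrors what the Java, C++ and Python readers do but is not code from this commit, and the helper name is made up.

import sys

# Illustrative only: decompose one concordance line into
# (phrase, [(context_tokens, count), ...]).
def parse_concordance_line(line):
    phrase, rest = line.rstrip('\n').split('\t')
    parts = rest.split(' ||| ')
    contexts = []
    # parts alternate: a context string, then its count as "C=<n>"
    for ctx, count in zip(parts[0::2], parts[1::2]):
        tokens = [t for t in ctx.split() if t != '<PHRASE>']
        assert count.startswith('C=')
        contexts.append((tokens, int(count[2:].strip())))
    return phrase, contexts

# First line of canned.concordance:
#   ('a', [(['0','0','0','0'], 1), (['1','1','1','1'], 1), (['2','2','2','2'], 1)])
print(parse_concordance_line(
    "a\t0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 ||| 2 2 <PHRASE> 2 2 ||| C=1"))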
diff --git a/gi/posterior-regularisation/PhraseContextModel.java b/gi/posterior-regularisation/PhraseContextModel.java new file mode 100644 index 00000000..3af72d54 --- /dev/null +++ b/gi/posterior-regularisation/PhraseContextModel.java @@ -0,0 +1,317 @@ +// Input of the form: +// " the phantom of the opera "    tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1 +//                      phrase TAB [context]+ +// where    context =   phrase ||| C=...        which are separated by ||| + +// Model parameterised as follows: +// - each phrase, p, is allocated a latent state, t +// - this is used to generate the contexts, c +// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR + +// Training with EM: +// - e-step is estimating q(t) = P(t|p,c) for all x,c +// - m-step is estimating model parameters P(c,t|p) = P(t) P(c|t) +// - PR uses alternate e-step, which first optimizes lambda  +//      min_q KL(q||p) + delta sum_pt max_c E_q[phi_ptc] +//   where +//      q(t|p,c) propto p(t,c|p) exp( -phi_ptc ) +//   Then q is used to obtain expectations for vanilla M-step. + +// Sexing it up: +// - learn p-specific conditionals P(t|p) +// - or generate phrase internals, e.g., generate edge words from +//   different distribution to central words +// - agreement between phrase->context model and context->phrase model + +import java.io.*; +import java.util.*; +import java.util.regex.*; +import static java.lang.Math.*; + +class Lexicon<T> +{ +    public int insert(T word) +    { +        Integer i = wordToIndex.get(word); +        if (i == null) +        { +            i = indexToWord.size(); +            wordToIndex.put(word, i); +            indexToWord.add(word); +        } +        return i; +    } + +    public T lookup(int index) +    { +        return indexToWord.get(index); +    } + +    public int size() +    { +        return indexToWord.size(); +    } +     +    private Map<T,Integer> wordToIndex = new HashMap<T,Integer>(); +    private List<T> indexToWord = new ArrayList<T>(); +} + +class PhraseContextModel +{ +    // model/optimisation configuration parameters +    int numTags; +    int numPRIterations = 5; +    boolean posteriorRegularisation = false; +    double constraintScale = 10; + +    // book keeping +    Lexicon<String> tokenLexicon = new Lexicon<String>(); +    int numPositions; +    Random rng = new Random(); + +    // training set; 1 entry for each unique phrase +    PhraseAndContexts training[]; + +    // model parameters (learnt) +    double emissions[][][]; // position in 0 .. 
3 x tag x word      Pr(word | tag, position) +    double prior[][];       // phrase x tag                         Pr(tag | phrase) +    //double lambda[][][];    // word x context x tag               langrange multipliers + +    PhraseContextModel(File infile, int tags) throws IOException +    { +        numTags = tags; +        readTrainingFromFile(new FileReader(infile)); +        assert(training.length > 0); + +        // now initialise emissions +        assert(training[0].contexts.length > 0); +        numPositions = training[0].contexts[0].tokens.length; + +        emissions = new double[numPositions][numTags][tokenLexicon.size()]; +        prior = new double[training.length][numTags]; +        //lambda = new double[tokenLexicon.size()][???][numTags] + +        for (double[][] emissionTW: emissions) +            for (double[] emissionW: emissionTW) +                randomise(emissionW); + +        for (double[] priorTag: prior) +            randomise(priorTag); +    } + +    void expectationMaximisation(int numIterations) +    {  +        for (int iteration = 0; iteration < numIterations; ++iteration) +        { +            double emissionsCounts[][][] = new double[numPositions][numTags][tokenLexicon.size()]; +            double priorCounts[][] = new double[training.length][numTags]; + +            // E-step +            double llh = 0; +            for (int i = 0; i < training.length; ++i) +            { +                PhraseAndContexts instance = training[i]; +                for (Context ctx: instance.contexts) +                { +                    double probs[] = posterior(i, ctx); +                    double z = normalise(probs); +                    llh += log(z) * ctx.count; + +                    for (int t = 0; t < numTags; ++t) +                    { +                        priorCounts[i][t] += ctx.count * probs[t]; +                        for (int j = 0; j < ctx.tokens.length; ++j) +                            emissionsCounts[j][t][ctx.tokens[j]] += ctx.count * probs[t]; +                    } +                } +            } + +            // M-step: normalise +            for (double[][] emissionTW: emissionsCounts) +                for (double[] emissionW: emissionTW) +                    normalise(emissionW); + +            for (double[] priorTag: priorCounts) +                normalise(priorTag); + +            emissions = emissionsCounts; +            prior = priorCounts; + +            System.out.println("Iteration " + iteration + " llh " + llh); +        } +    } + +    static double normalise(double probs[]) +    { +        double z = 0; +        for (double p: probs) +            z += p; +        for (int i = 0; i < probs.length; ++i) +            probs[i] /= z; +        return z; +    } + +    void randomise(double probs[]) +    { +        double z = 0; +        for (int i = 0; i < probs.length; ++i) +        { +            probs[i] = 10 + rng.nextDouble(); +            z += probs[i]; +        } +             +        for (int i = 0; i < probs.length; ++i) +            probs[i] /= z; +    } +     +    static int argmax(double probs[]) +    { +        double m = Double.NEGATIVE_INFINITY; +        int mi = -1; +        for (int i = 0; i < probs.length; ++i) +        { +            if (probs[i] > m) +            { +                m = probs[i]; +                mi = i; +            } +        } +        return mi; +    } + +    double[] posterior(int phraseId, Context c) // unnormalised +    { +        double probs[] = new double[numTags]; +        for 
(int t = 0; t < numTags; ++t) +        { +            probs[t] = prior[phraseId][t]; +            for (int j = 0; j < c.tokens.length; ++j) +                probs[t] *= emissions[j][t][c.tokens[j]]; +        } +        return probs; +    } + +    private void readTrainingFromFile(Reader in) throws IOException +    { +        // read in line-by-line +        BufferedReader bin = new BufferedReader(in); +        String line; +        List<PhraseAndContexts> instances = new ArrayList<PhraseAndContexts>(); +        Pattern separator = Pattern.compile(" \\|\\|\\| "); + +        int numEdges = 0; +        while ((line = bin.readLine()) != null) +        { +            // split into phrase and contexts +            StringTokenizer st = new StringTokenizer(line, "\t"); +            assert(st.hasMoreTokens()); +            String phrase = st.nextToken(); +            assert(st.hasMoreTokens()); +            String rest = st.nextToken(); +            assert(!st.hasMoreTokens()); + +            // process phrase +            st = new StringTokenizer(phrase, " "); +            List<Integer> ptoks = new ArrayList<Integer>(); +            while (st.hasMoreTokens()) +                ptoks.add(tokenLexicon.insert(st.nextToken())); + +            // process contexts +            ArrayList<Context> contexts = new ArrayList<Context>(); +            String[] parts = separator.split(rest); +            assert(parts.length % 2 == 0); +            for (int i = 0; i < parts.length; i += 2) +            { +                // process pairs of strings - context and count +                ArrayList<Integer> ctx = new ArrayList<Integer>(); +                String ctxString = parts[i]; +                String countString = parts[i+1]; +                StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); +                while (ctxStrtok.hasMoreTokens()) +                { +                    String token = ctxStrtok.nextToken(); +                    if (!token.equals("<PHRASE>")) +                        ctx.add(tokenLexicon.insert(token)); +                } + +                assert(countString.startsWith("C=")); +                Context c = new Context(); +                c.count = Integer.parseInt(countString.substring(2).trim()); +                // damn unboxing doesn't work with toArray +                c.tokens = new int[ctx.size()]; +                for (int k = 0; k < ctx.size(); ++k) +                    c.tokens[k] = ctx.get(k); +                contexts.add(c); + +                numEdges += 1; +            } + +            // package up +            PhraseAndContexts instance = new PhraseAndContexts(); +            // damn unboxing doesn't work with toArray +            instance.phraseTokens = new int[ptoks.size()]; +            for (int k = 0; k < ptoks.size(); ++k) +                instance.phraseTokens[k] = ptoks.get(k); +            instance.contexts = contexts.toArray(new Context[] {}); +            instances.add(instance); +        } + +        training = instances.toArray(new PhraseAndContexts[] {}); + +        System.out.println("Read in " + training.length + " phrases and " + numEdges + " edges"); +    } + +    void displayPosterior() +    { +        for (int i = 0; i < training.length; ++i) +        { +            PhraseAndContexts instance = training[i]; +            for (Context ctx: instance.contexts) +            { +                double probs[] = posterior(i, ctx); +                double z = normalise(probs); + +                // emit phrase +                for (int t: 
instance.phraseTokens) +                    System.out.print(tokenLexicon.lookup(t) + " "); +                System.out.print("\t"); +                for (int c: ctx.tokens) +                    System.out.print(tokenLexicon.lookup(c) + " "); +                System.out.print("||| C=" + ctx.count + " |||"); + +                System.out.print(" " + argmax(probs)); +                //for (int t = 0; t < numTags; ++t) +                    //System.out.print(" " + probs[t]); +                System.out.println(); +            } +        } +    } + +    class PhraseAndContexts +    { +        int phraseTokens[]; +        Context contexts[]; +    } + +    class Context +    { +        int count; +        int[] tokens; +    } + +    public static void main(String []args) +    { +        assert(args.length >= 2); +        try +        { +            PhraseContextModel model = new PhraseContextModel(new File(args[0]), Integer.parseInt(args[1])); +            model.expectationMaximisation(Integer.parseInt(args[2])); +            model.displayPosterior(); +        } +        catch (IOException e) +        { +            System.out.println("Failed to read input file: " + args[0]); +            e.printStackTrace(); +        } +    } +} diff --git a/gi/posterior-regularisation/README b/gi/posterior-regularisation/README new file mode 100644 index 00000000..a3d54ffc --- /dev/null +++ b/gi/posterior-regularisation/README @@ -0,0 +1,3 @@ +  557  ./cdec_extools/extractor -i btec/split.zh-en.al -c 500000 -L 12 -C  | sort -t $'\t' -k 1 | ./cdec_extools/mr_stripe_rule_reduce > btec.concordance +  559  wc -l btec.concordance  +  588  cat btec.concordance  | sed  's/.*	//' | awk '{ for (i=1; i < NF; i++) { x=substr($i, 1, 2); if (x == "C=") printf "\n"; else if (x != "||") printf "%s ", $i; }; printf "\n"; }' | sort | uniq | wc -l diff --git a/gi/posterior-regularisation/alphabet.hh b/gi/posterior-regularisation/alphabet.hh new file mode 100644 index 00000000..1db928da --- /dev/null +++ b/gi/posterior-regularisation/alphabet.hh @@ -0,0 +1,61 @@ +#ifndef _alphabet_hh +#define _alphabet_hh + +#include <cassert> +#include <iosfwd> +#include <map> +#include <string> +#include <vector> + +// Alphabet: indexes a set of types  +template <typename T> +class Alphabet: protected std::map<T, int> +{ +public: +    Alphabet() {}; + +    bool empty() const { return std::map<T,int>::empty(); } +    int size() const { return std::map<T,int>::size(); } + +    int operator[](const T &k) const +    { +        typename std::map<T,int>::const_iterator cit = find(k); +        if (cit != std::map<T,int>::end()) +            return cit->second; +        else +            return -1; +    } + +    int lookup(const T &k) const { return (*this)[k]; } + +    int insert(const T &k)  +    { +        int sz = size(); +        assert((unsigned) sz == _items.size()); + +        std::pair<typename std::map<T,int>::iterator, bool> +            ins = std::map<T,int>::insert(make_pair(k, sz)); + +        if (ins.second)  +            _items.push_back(k); + +        return ins.first->second; +    } + +    const T &type(int i) const +    { +        assert(i >= 0); +        assert(i < size()); +        return _items[i]; +    } + +    std::ostream &display(std::ostream &out, int i) const +    { +        return out << type(i); +    } + +private: +    std::vector<T> _items; +}; + +#endif diff --git a/gi/posterior-regularisation/canned.concordance b/gi/posterior-regularisation/canned.concordance new file mode 100644 index 00000000..710973ff --- /dev/null 
+++ b/gi/posterior-regularisation/canned.concordance @@ -0,0 +1,4 @@ +a	0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1 ||| 2 2 <PHRASE> 2 2 ||| C=1 +b	0 0 <PHRASE> 0 0 ||| C=1 ||| 1 1 <PHRASE> 1 1 ||| C=1  +c	2 2 <PHRASE> 2 2 ||| C=1 ||| 4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1 +d	4 4 <PHRASE> 4 4 ||| C=1 ||| 5 5 <PHRASE> 5 5 ||| C=1 diff --git a/gi/posterior-regularisation/em.cc b/gi/posterior-regularisation/em.cc new file mode 100644 index 00000000..f6c9fd68 --- /dev/null +++ b/gi/posterior-regularisation/em.cc @@ -0,0 +1,830 @@ +// Input of the form: +// " the phantom of the opera "    tickets for <PHRASE> tonight ? ||| C=1 ||| seats for <PHRASE> ? </s> ||| C=1 ||| i see <PHRASE> ? </s> ||| C=1 +//                      phrase TAB [context]+ +// where    context =   phrase ||| C=...        which are separated by ||| + +// Model parameterised as follows: +// - each phrase, p, is allocated a latent state, t +// - this is used to generate the contexts, c +// - each context is generated using 4 independent multinomials, one for each position LL, L, R, RR + +// Training with EM: +// - e-step is estimating P(t|p,c) for all x,c +// - m-step is estimating model parameters P(p,c,t) = P(t) P(p|t) P(c|t) + +// Sexing it up: +// - constrain the posteriors P(t|c) and P(t|p) to have few high-magnitude entries +// - improve the generation of phrase internals, e.g., generate edge words from +//   different distribution to central words + +#include "alphabet.hh" +#include "log_add.hh" +#include <algorithm> +#include <fstream> +#include <iostream> +#include <iterator> +#include <map> +#include <sstream> +#include <stdexcept> +#include <vector> +#include <tr1/random> +#include <tr1/tuple> +#include <nlopt.h> + +using namespace std; +using namespace std::tr1; + +const int numTags = 5; +const int numIterations = 100; +const bool posterior_regularisation = true; +const double PHRASE_VIOLATION_WEIGHT = 10; +const double CONTEXT_VIOLATION_WEIGHT = 0; +const bool includePhraseProb = false; + +// Data structures: +Alphabet<string> lexicon; +typedef vector<int> Phrase; +typedef tuple<int, int, int, int> Context; +Alphabet<Phrase> phrases; +Alphabet<Context> contexts; + +typedef map<int, int> ContextCounts; +typedef map<int, int> PhraseCounts; +typedef map<int, ContextCounts> PhraseToContextCounts; +typedef map<int, PhraseCounts> ContextToPhraseCounts; + +PhraseToContextCounts concordancePhraseToContexts; +ContextToPhraseCounts concordanceContextToPhrases; + +typedef vector<double> Dist; +typedef vector<Dist> ConditionalDist; +Dist prior; // class -> P(class) +vector<ConditionalDist> probCtx; // word -> class -> P(word | class), for each position of context word +ConditionalDist probPhrase; // class -> P(word | class) +Dist probPhraseLength; // class -> P(length | class) expressed as geometric distribution parameter + +mt19937 randomGenerator((size_t) time(NULL)); +uniform_real<double> uniDist(0.0, 1e-1); +variate_generator< mt19937, uniform_real<double> > rng(randomGenerator, uniDist); + +void addRandomNoise(Dist &d); +void normalise(Dist &d); +void addTo(Dist &d, const Dist &e); +int argmax(const Dist &d); + +map<Phrase, map<Context, int> > lambda_indices; + +Dist conditional_probs(const Phrase &phrase, const Context &context, double *normalisation = 0); +template <typename T> +Dist +penalised_conditionals(const Phrase &phrase, const Context &context,  +                       const T &lambda, double *normalisation); +//Dist penalised_conditionals(const Phrase &phrase, const Context &context, 
const double *lambda, double *normalisation = 0); +double penalised_log_likelihood(int n, const double *lambda, double *gradient, void *data); +void optimise_lambda(double delta, double gamma, vector<double> &lambda); +double expected_violation_phrases(const double *lambda); +double expected_violation_contexts(const double *lambda); +double primal_kl_divergence(const double *lambda); +double dual(const double *lambda); +void print_primal_dual(const double *lambda, double delta, double gamma); + +ostream &operator<<(ostream &, const Phrase &); +ostream &operator<<(ostream &, const Context &); +ostream &operator<<(ostream &, const Dist &); +ostream &operator<<(ostream &, const ConditionalDist &); + +int +main(int argc, char *argv[]) +{ +    randomGenerator.seed(time(NULL)); + +    int edges = 0; +    istream &input = cin; +    while (input.good()) +    { +        // read the phrase +        string phraseString; +        Phrase phrase; +        getline(input, phraseString, '\t'); +        istringstream pinput(phraseString); +        string token; +        while (pinput >> token) +            phrase.push_back(lexicon.insert(token)); +        int phraseId = phrases.insert(phrase); + +        // read the rest, storing each context +        string remainder; +        getline(input, remainder, '\n'); +        istringstream rinput(remainder); +        Context context(-1, -1, -1, -1); +        int index = 0; +        while (rinput >> token) +        { +            if (token != "|||" && token != "<PHRASE>") +            { +                if (index < 4) +                { +                    // eugh! damn templates +                    switch (index) +                    { +                        case 0: get<0>(context) = lexicon.insert(token); break; +                        case 1: get<1>(context) = lexicon.insert(token); break; +                        case 2: get<2>(context) = lexicon.insert(token); break; +                        case 3: get<3>(context) = lexicon.insert(token); break; +                        default: assert(false); +                    } +                    index += 1; +                } +                else if (token.find("C=") == 0) +                { +                    int contextId = contexts.insert(context); +                    int count = atoi(token.substr(strlen("C=")).c_str()); +                    concordancePhraseToContexts[phraseId][contextId] += count; +                    concordanceContextToPhrases[contextId][phraseId] += count; +                    index = 0; +                    context = Context(-1, -1, -1, -1); +                    edges += 1; +                } +            } +        } + +        // trigger EOF +        input >> ws; +    } + +    cout << "Read in " << phrases.size() << " phrases" +         << " and " << contexts.size() << " contexts" +         << " and " << edges << " edges" +         << " and " << lexicon.size() << " word types\n"; + +    // FIXME: filter out low count phrases and low count contexts (based on individual words?) 
+    // now populate model parameters with uniform + random noise +    prior.resize(numTags, 1.0); +    addRandomNoise(prior); +    normalise(prior); + +    probCtx.resize(4, ConditionalDist(numTags, Dist(lexicon.size(), 1.0))); +    if (includePhraseProb) +        probPhrase.resize(numTags, Dist(lexicon.size(), 1.0)); +    for (int t = 0; t < numTags; ++t) +    { +        for (int j = 0; j < 4; ++j) +        { +            addRandomNoise(probCtx[j][t]); +            normalise(probCtx[j][t]); +        } +        if (includePhraseProb) +        { +            addRandomNoise(probPhrase[t]); +            normalise(probPhrase[t]); +        } +    } +    if (includePhraseProb) +    { +        probPhraseLength.resize(numTags, 0.5); // geometric distribution p=0.5 +        addRandomNoise(probPhraseLength); +    } + +    cout << "\tprior:     " << prior << "\n"; +    //cout << "\tcontext:   " << probCtx << "\n"; +    //cout << "\tphrase:    " << probPhrase << "\n"; +    //cout << "\tphraseLen: " << probPhraseLength << endl; + +    vector<double> lambda; + +    // now do EM training +    for (int iteration = 0; iteration < numIterations; ++iteration) +    { +        cout << "EM iteration " << iteration << endl; + +        if (posterior_regularisation) +            optimise_lambda(PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT, lambda); +        //cout << "\tlambda " << lambda << endl; + +        Dist countsPrior(numTags, 0.0); +        vector<ConditionalDist> countsCtx(4, ConditionalDist(numTags, Dist(lexicon.size(), 1e-10))); +        ConditionalDist countsPhrase(numTags, Dist(lexicon.size(), 1e-10)); +        Dist countsPhraseLength(numTags, 0.0); +        Dist nPhrases(numTags, 0.0); + +        double llh = 0; +        for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); +             pcit != concordancePhraseToContexts.end(); ++pcit) +        { +            const Phrase &phrase = phrases.type(pcit->first); + +            // e-step: estimate latent class probs; compile (class,word) stats for m-step +            for (ContextCounts::iterator ccit = pcit->second.begin(); +                 ccit != pcit->second.end(); ++ccit) +            { +                const Context &context = contexts.type(ccit->first); + +                double z = 0; +                Dist tagCounts; +                if (!posterior_regularisation) +                    tagCounts = conditional_probs(phrase, context, &z); +                else +                    tagCounts = penalised_conditionals(phrase, context, lambda, &z); + +                llh += log(z) * ccit->second; +                addTo(countsPrior, tagCounts); // FIXME: times ccit->secon + +                for (int t = 0; t < numTags; ++t) +                { +                    for (int j = 0; j < 4; ++j) +                        countsCtx[j][t][get<0>(context)] += tagCounts[t] * ccit->second; + +                    if (includePhraseProb) +                    { +                        for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) +                            countsPhrase[t][*pit] += tagCounts[t] * ccit->second; +                        countsPhraseLength[t] += phrase.size() * tagCounts[t] * ccit->second; +                        nPhrases[t] += tagCounts[t] * ccit->second; +                    } +                } +            } +        } + +        cout << "M-step\n"; + +        // m-step: normalise prior and (class,word) stats and assign to model parameters +        normalise(countsPrior); +        
prior = countsPrior; +        for (int t = 0; t < numTags; ++t) +        { +            //cout << "\t\tt " << t << " prior " << countsPrior[t] << "\n"; +            for (int j = 0; j < 4; ++j) +                normalise(countsCtx[j][t]); +            if (includePhraseProb) +            { +                normalise(countsPhrase[t]); +                countsPhraseLength[t] = nPhrases[t] / countsPhraseLength[t]; +            } +        } +        probCtx = countsCtx; +        if (includePhraseProb) +        { +            probPhrase = countsPhrase; +            probPhraseLength = countsPhraseLength; +        } + +        double *larray = new double[lambda.size()]; +        copy(lambda.begin(), lambda.end(), larray); +        print_primal_dual(larray, PHRASE_VIOLATION_WEIGHT, CONTEXT_VIOLATION_WEIGHT); +        delete [] larray; + +        //cout << "\tllh " << llh << endl; +        //cout << "\tprior:     " << prior << "\n"; +        //cout << "\tcontext:   " << probCtx << "\n"; +        //cout << "\tphrase:    " << probPhrase << "\n"; +        //cout << "\tphraseLen: " << probPhraseLength << "\n"; +    } + +    // output class membership +    for (PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.begin(); +         pcit != concordancePhraseToContexts.end(); ++pcit) +    { +        const Phrase &phrase = phrases.type(pcit->first); +        for (ContextCounts::iterator ccit = pcit->second.begin(); +             ccit != pcit->second.end(); ++ccit) +        { +            const Context &context = contexts.type(ccit->first); +            Dist tagCounts = conditional_probs(phrase, context, 0); +            cout << phrase << " ||| " << context << " ||| " << argmax(tagCounts) << "\n"; +        } +    } + +    return 0; +} + +void addRandomNoise(Dist &d) +{ +    for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) +        *dit += rng(); +} + +void normalise(Dist &d) +{ +    double z = 0; +    for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) +        z += *dit; +    for (Dist::iterator dit = d.begin(); dit != d.end(); ++dit) +        *dit /= z; +} + +void addTo(Dist &d, const Dist &e) +{ +    assert(d.size() == e.size()); +    for (int i = 0; i < (int) d.size(); ++i) +        d[i] += e[i]; +} + +int argmax(const Dist &d) +{ +    double best = d[0]; +    int index = 0; +    for (int i = 1; i < (int) d.size(); ++i) +    { +        if (d[i] > best) +        { +            best = d[i]; +            index = i; +        } +    } +    return index; +} + +ostream &operator<<(ostream &out, const Phrase &phrase) +{ +    for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) +        lexicon.display(((pit == phrase.begin()) ? out : out << " "), *pit); +    return out; +} + +ostream &operator<<(ostream &out, const Context &context) +{ +    lexicon.display(out, get<0>(context)); +    lexicon.display(out << " ", get<1>(context)); +    lexicon.display(out << " <PHRASE> ", get<2>(context)); +    lexicon.display(out << " ", get<3>(context)); +    return out; +} + +ostream &operator<<(ostream &out, const Dist &dist) +{ +    for (Dist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) +        out << ((dit == dist.begin()) ? "" : " ") << *dit; +    return out; +} + +ostream &operator<<(ostream &out, const ConditionalDist &dist) +{ +    for (ConditionalDist::const_iterator dit = dist.begin(); dit != dist.end(); ++dit) +        out << ((dit == dist.begin()) ? 
"" : "; ") << *dit; +    return out; +} + +// FIXME: slow - just use the phrase index, context index to do the mapping +// (n.b. it's a sparse setup, not just equal to 3d array index) +int +lambda_index(const Phrase &phrase, const Context &context, int tag) +{ +    return lambda_indices[phrase][context] + tag; +} + +template <typename T> +Dist +penalised_conditionals(const Phrase &phrase, const Context &context,  +                       const T &lambda, double *normalisation) +{ +    Dist d = conditional_probs(phrase, context, 0); + +    double z = 0; +    for (int t = 0; t < numTags; ++t) +    { +        d[t] *= exp(-lambda[lambda_index(phrase, context, t)]); +        z += d[t]; +    } + +    if (normalisation) +        *normalisation = z; + +    for (int t = 0; t < numTags; ++t) +        d[t] /= z; + +    return d; +} + +Dist  +conditional_probs(const Phrase &phrase, const Context &context, double *normalisation) +{ +    Dist tagCounts(numTags, 0.0); +    double z = 0; +    for (int t = 0; t < numTags; ++t) +    { +        double prob = prior[t]; +        prob *= (probCtx[0][t][get<0>(context)] * probCtx[1][t][get<1>(context)] * +                 probCtx[2][t][get<2>(context)] * probCtx[3][t][get<3>(context)]); + +        if (includePhraseProb) +        { +            prob *= pow(1 - probPhraseLength[t], phrase.size() - 1) * probPhraseLength[t]; +            for (Phrase::const_iterator pit = phrase.begin(); pit != phrase.end(); ++pit) +                prob *= probPhrase[t][*pit]; +        } + +        tagCounts[t] = prob; +        z += prob; +    } +    if (normalisation) +        *normalisation = z; + +    for (int t = 0; t < numTags; ++t) +        tagCounts[t] /= z; + +    return tagCounts; +} + +double  +penalised_log_likelihood(int n, const double *lambda, double *grad, void *) +{ +    // return log Z(lambda, theta) over the corpus +    // where theta are the global parameters (prior, probCtx*, probPhrase*)  +    // and lambda are lagrange multipliers for the posterior sparsity constraints +    // +    // this is formulated as:  +    // f = log Z(lambda) = sum_i log ( sum_i p_theta(t_i|p_i,c_i) exp [-lambda_{t_i,p_i,c_i}] ) +    // where i indexes the training examples - specifying the (p, c) pair (which may occur with count > 1) +    // +    // with derivative: +    // f'_{tpc} = frac { - count(t,p,c) p_theta(t|p,c) exp (-lambda_{t,p,c}) } +    //                 { sum_t' p_theta(t'|p,c) exp (-lambda_{t',p,c}) } + +    //cout << "penalised_log_likelihood with lambda "; +    //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); +    //cout << "\n"; + +    double f = 0; +    if (grad) +    { +        for (int i = 0; i < n; ++i) +            grad[i] = 0.0; +    } + +    for (int p = 0; p < phrases.size(); ++p) +    { +        const Phrase &phrase = phrases.type(p); +        PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); +        for (ContextCounts::const_iterator ccit = pcit->second.begin(); +             ccit != pcit->second.end(); ++ccit) +        { +            const Context &context = contexts.type(ccit->first); +            double z = 0; +            Dist scores = penalised_conditionals(phrase, context, lambda, &z); + +            f += ccit->second * log(z); +            //cout << "\tphrase: " << phrase << " context: " << context << " count: " << ccit->second << " z " << z << endl; +            //cout << "\t\tscores: " << scores << "\n"; + +            if (grad) +            { +                for (int t = 0; t < numTags; ++t) +          
      { +                    int i = lambda_index(phrase, context, t); // FIXME: redundant lookups +                    assert(grad[i] == 0.0); +                    grad[i] = - ccit->second * scores[t]; +                } +            } +        } +    } + +    //cout << "penalised_log_likelihood returning " << f; +    //if (grad) +    //{ +        //cout << "\ngradient: "; +        //copy(grad, grad+n, ostream_iterator<double>(cout, " ")); +    //} +    //cout << "\n"; + +    return f; +} + +typedef struct  +{ +    // one of p or c should be set to -1, in which case it will be marginalised out  +    // i.e. sum_p' lambda_{p'ct} <= threshold +    //   or sum_c' lambda_{pc't} <= threshold +    int p, c, t, threshold; +} constraint_data; + +double  +constraint_and_gradient(int n, const double *lambda, double *grad, void *data) +{ +    constraint_data *d = (constraint_data *) data; +    assert(d->t >= 0); +    assert(d->threshold >= 0); + +    //cout << "constraint_and_gradient: t " << d->t << " p " << d->p << " c " << d->c << " tau " << d->threshold << endl; +    //cout << "\tlambda "; +    //copy(lambda, lambda+n, ostream_iterator<double>(cout, " ")); +    //cout << "\n"; + +    // FIXME: it's crazy to use a dense gradient here => will only have a handful of non-zero entries +    if (grad) +    { +        for (int i = 0; i < n; ++i) +            grad[i] = 0.0; +    } + +    //cout << "constraint_and_gradient: " << d->p << "; " << d->c << "; " << d->t << "; " << d->threshold << endl; + +    if (d->p >= 0) +    { +        assert(d->c < 0); +        //    sum_c lambda_pct          <= delta [a.k.a. threshold] +        // => sum_c lambda_pct - delta  <= 0 +        // derivative_pct = { 1, if p and t match; 0, otherwise } + +        double val = -d->threshold; + +        const Phrase &phrase = phrases.type(d->p); +        PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(d->p); +        assert(pcit != concordancePhraseToContexts.end()); +        for (ContextCounts::const_iterator ccit = pcit->second.begin(); +             ccit != pcit->second.end(); ++ccit) +        { +            const Context &context = contexts.type(ccit->first); +            int i = lambda_index(phrase, context, d->t); +            val += lambda[i]; +            if (grad) grad[i] = 1; +        } +        //cout << "\treturning " << val << endl; + +        return val; +    } +    else +    { +        assert(d->c >= 0); +        assert(d->p < 0); +        //    sum_p lambda_pct          <= gamma [a.k.a. 
threshold] +        // => sum_p lambda_pct - gamma  <= 0 +        // derivative_pct = { 1, if c and t match; 0, otherwise } + +        double val = -d->threshold; + +        const Context &context = contexts.type(d->c); +        ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(d->c); +        assert(cpit != concordanceContextToPhrases.end()); +        for (PhraseCounts::iterator pcit = cpit->second.begin(); +             pcit != cpit->second.end(); ++pcit) +        { +            const Phrase &phrase = phrases.type(pcit->first); +            int i = lambda_index(phrase, context, d->t); +            val += lambda[i]; +            if (grad) grad[i] = 1; +        } +        //cout << "\treturning " << val << endl; + +        return val; +    } +} + +void +optimise_lambda(double delta, double gamma, vector<double> &lambdav) +{ +    int num_lambdas = lambdav.size(); +    if (lambda_indices.empty() || lambdav.empty()) +    { +        lambda_indices.clear(); +        lambdav.clear(); + +        int i = 0; +        for (int p = 0; p < phrases.size(); ++p) +        { +            const Phrase &phrase = phrases.type(p); +            PhraseToContextCounts::iterator pcit = concordancePhraseToContexts.find(p); +            for (ContextCounts::iterator ccit = pcit->second.begin(); +                 ccit != pcit->second.end(); ++ccit) +            { +                const Context &context = contexts.type(ccit->first); +                lambda_indices[phrase][context] = i; +                i += numTags; +            } +        } +        num_lambdas = i; +        lambdav.resize(num_lambdas); +    } +    //cout << "optimise_lambda: #langrange multipliers " << num_lambdas << endl; + +    // FIXME: better to work with an implicit representation to save memory usage +    int num_constraints = (((delta > 0) ? phrases.size() : 0) + ((gamma > 0) ? 
contexts.size() : 0)) * numTags; +    //cout << "optimise_lambda: #constraints " << num_constraints << endl; +    constraint_data *data = new constraint_data[num_constraints]; +    int i = 0; +    if (delta > 0) +    { +        for (int p = 0; p < phrases.size(); ++p) +        { +            for (int t = 0; t < numTags; ++t, ++i) +            { +                constraint_data &d = data[i]; +                d.p = p; +                d.c = -1; +                d.t = t; +                d.threshold = delta; +            } +        } +    } + +    if (gamma > 0) +    { +        for (int c = 0; c < contexts.size(); ++c) +        { +            for (int t = 0; t < numTags; ++t, ++i) +            { +                constraint_data &d = data[i]; +                d.p = -1; +                d.c = c; +                d.t = t; +                d.threshold = gamma; +            } +        } +    } +    assert(i == num_constraints); + +    double lambda[num_lambdas]; +    double lb[num_lambdas], ub[num_lambdas]; +    for (i = 0; i < num_lambdas; ++i) +    { +        lambda[i] = lambdav[i]; // starting value +        lb[i] = 0;              // lower bound +        if (delta <= 0)         // upper bound +            ub[i] = gamma;       +        else if (gamma <= 0) +            ub[i] = delta; +        else +            assert(false); +    } + +    //print_primal_dual(lambda, delta, gamma); +    +    double minf; +    int error_code = nlopt_minimize_constrained(NLOPT_LN_COBYLA, num_lambdas, penalised_log_likelihood, NULL, +                                                num_constraints, constraint_and_gradient, data, sizeof(constraint_data), +                                                lb, ub, lambda, &minf, -HUGE_VAL, 0.0, 0.0, 1e-4, NULL, 0, 0.0); +    //cout << "optimise error code " << error_code << endl; + +    //print_primal_dual(lambda, delta, gamma); + +    delete [] data; + +    if (error_code < 0) +        cout << "WARNING: optimisation failed with error code: " << error_code << endl; +    //else +    //{ +        //cout << "success; minf " << minf << endl; +        //print_primal_dual(lambda, delta, gamma); +    //} + +    lambdav = vector<double>(&lambda[0], &lambda[0] + num_lambdas); +} + +// FIXME: inefficient - cache the scores +double +expected_violation_phrases(const double *lambda) +{ +    // sum_pt max_c E_q[phi_pct] +    double violation = 0; + +    for (int p = 0; p < phrases.size(); ++p) +    { +        const Phrase &phrase = phrases.type(p); +        PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); + +        for (int t = 0; t < numTags; ++t) +        { +            double best = 0; +            for (ContextCounts::const_iterator ccit = pcit->second.begin(); +                 ccit != pcit->second.end(); ++ccit) +            { +                const Context &context = contexts.type(ccit->first); +                Dist scores = penalised_conditionals(phrase, context, lambda, 0); +                best = max(best, scores[t]); +            } +            violation += best; +        } +    } + +    return violation; +} + +// FIXME: inefficient - cache the scores +double +expected_violation_contexts(const double *lambda) +{ +    // sum_ct max_p E_q[phi_pct] +    double violation = 0; + +    for (int c = 0; c < contexts.size(); ++c) +    { +        const Context &context = contexts.type(c); +        ContextToPhraseCounts::iterator cpit = concordanceContextToPhrases.find(c); + +        for (int t = 0; t < numTags; ++t) +        { +            double best 
= 0; +            for (PhraseCounts::iterator pit = cpit->second.begin(); +                 pit != cpit->second.end(); ++pit) +            { +                const Phrase &phrase = phrases.type(pit->first); +                Dist scores = penalised_conditionals(phrase, context, lambda, 0); +                best = max(best, scores[t]); +            } +            violation += best; +        } +    } + +    return violation; +} + +// FIXME: possibly inefficient +double  +primal_likelihood() // FIXME: primal evaluation needs to use lambda and calculate l1linf terms +{ +    double llh = 0; +    for (int p = 0; p < phrases.size(); ++p) +    { +        const Phrase &phrase = phrases.type(p); +        PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); +        for (ContextCounts::const_iterator ccit = pcit->second.begin(); +             ccit != pcit->second.end(); ++ccit) +        { +            const Context &context = contexts.type(ccit->first); +            double z = 0; +            Dist scores = conditional_probs(phrase, context, &z); +            llh += ccit->second * log(z); +        } +    } +    return llh; +} + +// FIXME: inefficient - cache the scores +double  +primal_kl_divergence(const double *lambda) +{ +    // return KL(q || p) = sum_y q(y) { log q(y) - log p(y | x) } +    //                   = sum_y q(y) { log p(y | x) - lambda . phi(x, y) - log Z - log p(y | x) } +    //                   = sum_y q(y) { - lambda . phi(x, y) } - log Z +    // and q(y) factors with each edge, ditto for Z +     +    double feature_sum = 0, log_z = 0; +    for (int p = 0; p < phrases.size(); ++p) +    { +        const Phrase &phrase = phrases.type(p); +        PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); +        for (ContextCounts::const_iterator ccit = pcit->second.begin(); +             ccit != pcit->second.end(); ++ccit) +        { +            const Context &context = contexts.type(ccit->first); + +            double local_z = 0; +            double local_f = 0; +            Dist d = conditional_probs(phrase, context, 0); +            for (int t = 0; t < numTags; ++t) +            { +                int i = lambda_index(phrase, context, t); +                double s = d[t] * exp(-lambda[i]); +                local_f += lambda[i] * s; +                local_z += s; +            } + +            log_z += ccit->second * log(local_z); +            feature_sum += ccit->second * (local_f / local_z); +        } +    } + +    return -feature_sum - log_z; +} + +// FIXME: inefficient - cache the scores +double  +dual(const double *lambda) +{ +    // return log(Z) = - log { sum_y p(y | x) exp( - lambda . phi(x, y) } +    // n.b. 
have flipped the sign as we're minimising +     +    double z = 0; +    for (int p = 0; p < phrases.size(); ++p) +    { +        const Phrase &phrase = phrases.type(p); +        PhraseToContextCounts::const_iterator pcit = concordancePhraseToContexts.find(p); +        for (ContextCounts::const_iterator ccit = pcit->second.begin(); +             ccit != pcit->second.end(); ++ccit) +        { +            const Context &context = contexts.type(ccit->first); +            double lz = 0; +            Dist scores = penalised_conditionals(phrase, context, lambda, &z); +            z += lz * ccit->second; +        } +    } +    return log(z); +} + +void +print_primal_dual(const double *lambda, double delta, double gamma) +{ +    double likelihood = primal_likelihood(); +    double kl = primal_kl_divergence(lambda); +    double sum_pt = expected_violation_phrases(lambda); +    double sum_ct = expected_violation_contexts(lambda); +    //double d = dual(lambda); + +    cout << "\tllh=" << likelihood +         << " kl=" << kl +         << " violations phrases=" << sum_pt +         << " contexts=" << sum_ct +         //<< " primal=" << (kl + delta * sum_pt + gamma * sum_ct)  +         //<< " dual=" << d +         << " objective=" << (likelihood - kl + delta * sum_pt + gamma * sum_ct)  +         << endl; +} diff --git a/gi/posterior-regularisation/invert.hh b/gi/posterior-regularisation/invert.hh new file mode 100644 index 00000000..d06356e9 --- /dev/null +++ b/gi/posterior-regularisation/invert.hh @@ -0,0 +1,45 @@ +// The following code inverts the matrix input using LU-decomposition with +// backsubstitution of unit vectors. Reference: Numerical Recipies in C, 2nd +// ed., by Press, Teukolsky, Vetterling & Flannery.  +// Code written by Fredrik Orderud. +// http://www.crystalclearsoftware.com/cgi-bin/boost_wiki/wiki.pl?LU_Matrix_Inversion + +#ifndef INVERT_MATRIX_HPP +#define INVERT_MATRIX_HPP + +// REMEMBER to update "lu.hpp" header includes from boost-CVS +#include <boost/numeric/ublas/vector.hpp> +#include <boost/numeric/ublas/vector_proxy.hpp> +#include <boost/numeric/ublas/matrix.hpp> +#include <boost/numeric/ublas/triangular.hpp> +#include <boost/numeric/ublas/lu.hpp> +#include <boost/numeric/ublas/io.hpp> + +namespace ublas = boost::numeric::ublas; + +/* Matrix inversion routine. 
+   Uses lu_factorize and lu_substitute in uBLAS to invert a matrix */ +template<class T> +bool invert_matrix(const ublas::matrix<T>& input, ublas::matrix<T>& inverse)  +{ +    using namespace boost::numeric::ublas; +    typedef permutation_matrix<std::size_t> pmatrix; +    // create a working copy of the input +    matrix<T> A(input); +    // create a permutation matrix for the LU-factorization +    pmatrix pm(A.size1()); + +    // perform LU-factorization +    int res = lu_factorize(A,pm); +    if( res != 0 ) return false; + +    // create identity matrix of "inverse" +    inverse.assign(ublas::identity_matrix<T>(A.size1())); + +    // backsubstitute to get the inverse +    lu_substitute(A, pm, inverse); +     +    return true; +} + +#endif //INVERT_MATRIX_HPP diff --git a/gi/posterior-regularisation/log_add.hh b/gi/posterior-regularisation/log_add.hh new file mode 100644 index 00000000..e0620c5a --- /dev/null +++ b/gi/posterior-regularisation/log_add.hh @@ -0,0 +1,30 @@ +#ifndef log_add_hh +#define log_add_hh + +#include <limits> +#include <iostream> +#include <cassert> +#include <cmath> + +template <typename T> +struct Log +{ +    static T zero() { return -std::numeric_limits<T>::infinity(); }  + +    static T add(T l1, T l2) +    { +        if (l1 == zero()) return l2; +        if (l1 > l2)  +            return l1 + std::log(1 + exp(l2 - l1)); +        else +            return l2 + std::log(1 + exp(l1 - l2)); +    } + +    static T subtract(T l1, T l2) +    { +        //std::assert(l1 >= l2); +        return l1 + log(1 - exp(l2 - l1)); +    } +}; + +#endif diff --git a/gi/posterior-regularisation/projected_gradient.cc b/gi/posterior-regularisation/projected_gradient.cc new file mode 100644 index 00000000..f7c39817 --- /dev/null +++ b/gi/posterior-regularisation/projected_gradient.cc @@ -0,0 +1,87 @@ +// +// Minimises given functional using the projected gradient method. Based on +// algorithm and demonstration example in Linear and Nonlinear Programming, +// Luenberger and Ye, 3rd ed., p 370. 
+// + +#include "invert.hh" +#include <iostream> + +using namespace std; + +double  +f(double x1, double x2, double x3, double x4) +{ +    return x1 * x1 + x2 * x2 + x3 * x3 + x4 * x4 - 2 * x1 - 3 * x4; +} + +ublas::vector<double>  +g(double x1, double x2, double x3, double x4) +{ +    ublas::vector<double> v(4); +    v(0) = 2 * x1 - 2; +    v(1) = 2 * x2; +    v(2) = 2 * x3; +    v(3) = 2 * x4 - 3; +    return v; +} + +ublas::matrix<double>  +activeConstraints(double x1, double x2, double x3, double x4) +{ +    int n = 2; +    if (x1 == 0) ++n; +    if (x2 == 0) ++n; +    if (x3 == 0) ++n; +    if (x4 == 0) ++n; + +    ublas::matrix<double> a(n,4); +    a(0, 0) = 2; a(0, 1) = 1; a(0, 2) = 1; a(0, 3) = 4; +    a(1, 0) = 1; a(1, 1) = 1; a(1, 2) = 2; a(1, 3) = 1; + +    int c = 2; +    if (x1 == 0) a(c++, 0) = 1; +    if (x2 == 0) a(c++, 1) = 1; +    if (x3 == 0) a(c++, 2) = 1; +    if (x4 == 0) a(c++, 3) = 1; + +    return a; +} + +ublas::matrix<double> +projection(const ublas::matrix<double> &a) +{ +    ublas::matrix<double> aT = ublas::trans(a); +    ublas::matrix<double> inv(a.size1(), a.size1()); +    bool ok = invert_matrix(ublas::matrix<double>(ublas::prod(a, aT)), inv); +    assert(ok && "Failed to invert matrix"); +    return ublas::identity_matrix<double>(4) -  +        ublas::prod(aT, ublas::matrix<double>(ublas::prod(inv, a))); +} + +int main(int argc, char *argv[]) +{ +    double x1 = 2, x2 = 2, x3 = 1, x4 = 0; + +    double fval = f(x1, x2, x3, x4); +    cout << "f = " << fval << endl; +    ublas::vector<double> grad = g(x1, x2, x3, x4); +    cout << "g = " << grad << endl; +    ublas::matrix<double> A = activeConstraints(x1, x2, x3, x4); +    cout << "A = " << A << endl; +    ublas::matrix<double> P = projection(A); +    cout << "P = " << P << endl; +    // the direction of movement +    ublas::vector<double> d = prod(P, grad); +    cout << "d = " << (d / d(0)) << endl; + +    // special case for d = 0 + +    // next solve for limits on the line search + +    // then use golden rule technique between these values (if bounded) + +    // or simple Armijo's rule technique + +    return 0; +} diff --git a/gi/posterior-regularisation/train_pr_global.py b/gi/posterior-regularisation/train_pr_global.py new file mode 100644 index 00000000..467069ef --- /dev/null +++ b/gi/posterior-regularisation/train_pr_global.py @@ -0,0 +1,272 @@ +import sys +import scipy.optimize +from numpy import * +from numpy.random import random + +# +# Step 1: load the concordance counts +#  + +edges_phrase_to_context = {} +edges_context_to_phrase = {} +types = {} +num_edges = 0 + +for line in sys.stdin: +    phrase, rest = line.strip().split('\t') +    parts = rest.split('|||') +    for i in range(0, len(parts), 2): +        context, count = parts[i:i+2] + +        ctx = tuple(filter(lambda x: x != '<PHRASE>', context.split())) +        cnt = int(count.strip()[2:]) +        ccs = edges_phrase_to_context.setdefault(phrase, {}) +        ccs[ctx] = cnt +        pcs = edges_context_to_phrase.setdefault(ctx, {}) +        pcs[phrase] = cnt + +        for token in ctx: +            types.setdefault(token, len(types)) +        for token in phrase.split(): +            types.setdefault(token, len(types)) + +        num_edges += 1 + +print 'Read in', num_edges, 'edges and', len(types), 'word types' + +# +# Step 2: initialise the model parameters +# + +num_tags = 5 +num_types = len(types) +delta = int(sys.argv[1]) +gamma = int(sys.argv[2]) + +def normalise(a): +    return a / sum(a) + +# Pr(tag) +tagDist = 
normalise(random(num_tags)+1) +# Pr(context at pos i = w | tag) indexed by i, tag, word +contextWordDist = [[normalise(random(num_types)+1) for t in range(num_tags)] for i in range(4)] +# PR langrange multipliers +lamba = zeros(num_edges * num_tags) +lamba_index = {} +next = 0 +for phrase, ccs in edges_phrase_to_context.items(): +    for context in ccs.keys(): +        lamba_index[phrase,context] = next +        next += num_tags + +# +# Step 3: expectation maximisation +# + +for iteration in range(20): +    tagCounts = zeros(num_tags) +    contextWordCounts = [[zeros(num_types) for t in range(num_tags)] for i in range(4)] + +    #print 'tagDist', tagDist +    #print 'contextWordCounts[0][0]', contextWordCounts[0][0] + +    # Tune lambda +    # dual: min log Z(lamba) s.t. lamba >= 0; +    # sum_c lamba_pct <= delta; sum_p lamba_pct <= gamma +    def dual(ls): +        logz = 0 +        for phrase, ccs in edges_phrase_to_context.items(): +            for context, count in ccs.items(): +                conditionals = zeros(num_tags) +                for t in range(num_tags): +                    prob = tagDist[t] +                    for i in range(4): +                        prob *= contextWordDist[i][t][types[context[i]]] +                    conditionals[t] = prob +                cz = sum(conditionals) +                conditionals /= cz + +                local_z = 0 +                for t in range(num_tags): +                    local_z += conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) +                logz += log(local_z) * count + +        #print 'ls', ls +        #print 'lambda', list(ls) +        #print 'dual', logz +        return logz + +    def primal(ls): +        # FIXME: returns negative values for KL (impossible) +        logz = dual(ls) +        kl = -logz + +        expectations = zeros(lamba.shape) +        for phrase, ccs in edges_phrase_to_context.items(): +            for context, count in ccs.items(): +                conditionals = zeros(num_tags) +                for t in range(num_tags): +                    prob = tagDist[t] +                    for i in range(4): +                        prob *= contextWordDist[i][t][types[context[i]]] +                    conditionals[t] = prob +                cz = sum(conditionals) +                conditionals /= cz + +                scores = zeros(num_tags) +                for t in range(num_tags): +                    scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) +                local_z = sum(scores) + +                for t in range(num_tags): +                    li = lamba_index[phrase,context] + t +                    expectations[li] = scores[t] / local_z * count +                    kl -= expectations[li] * ls[li] + +        pt_l1linf = 0 +        for phrase, ccs in edges_phrase_to_context.items(): +            for t in range(num_tags): +                best = -1e500 +                for context, count in ccs.items(): +                    li = lamba_index[phrase,context] + t +                    s = expectations[li] +                    if s > best: best = s +                pt_l1linf += best + +        ct_l1linf = 0 +        for context, pcs in edges_context_to_phrase.items(): +            for t in range(num_tags): +                best = -1e500 +                for phrase, count in pcs.items(): +                    li = lamba_index[phrase,context] + t +                    s = expectations[li] +                    if s > best: best = s +                ct_l1linf += best + +   
     return kl, pt_l1linf, ct_l1linf, kl + delta * pt_l1linf + gamma * ct_l1linf + +    def dual_deriv(ls): +        # d/dl log(z) = E_q[phi] +        deriv = zeros(num_edges * num_tags) +        for phrase, ccs in edges_phrase_to_context.items(): +            for context, count in ccs.items(): +                conditionals = zeros(num_tags) +                for t in range(num_tags): +                    prob = tagDist[t] +                    for i in range(4): +                        prob *= contextWordDist[i][t][types[context[i]]] +                    conditionals[t] = prob +                cz = sum(conditionals) +                conditionals /= cz + +                scores = zeros(num_tags) +                for t in range(num_tags): +                    scores[t] = conditionals[t] * exp(-ls[lamba_index[phrase,context] + t]) +                local_z = sum(scores) + +                for t in range(num_tags): +                    deriv[lamba_index[phrase,context] + t] -= count * scores[t] / local_z + +        #print 'ddual', deriv +        return deriv + +    def constraints(ls): +        cons = [] +        if delta > 0: +            for phrase, ccs in edges_phrase_to_context.items(): +                for t in range(num_tags): +                    total = delta +                    for cprime in ccs.keys(): +                        total -= ls[lamba_index[phrase, cprime] + t] +                    cons.append(total) + +        if gamma > 0: +            for context, pcs in edges_context_to_phrase.items(): +                for t in range(num_tags): +                    total = gamma +                    for pprime in pcs.keys(): +                        total -= ls[lamba_index[pprime, context] + t] +                    cons.append(total) +        #print 'cons', cons +        return cons + +    def constraints_deriv(ls): +        cons = [] +        if delta > 0: +            for phrase, ccs in edges_phrase_to_context.items(): +                for t in range(num_tags): +                    d = zeros(num_edges * num_tags) +                    for cprime in ccs.keys(): +                        d[lamba_index[phrase, cprime] + t] = -1 +                    cons.append(d) + +        if gamma > 0: +            for context, pcs in edges_context_to_phrase.items(): +                for t in range(num_tags): +                    d = zeros(num_edges * num_tags) +                    for pprime in pcs.keys(): +                        d[lamba_index[pprime, context] + t] = -1 +                    cons.append(d) +        #print 'dcons', cons +        return cons + +    print 'Pre lambda optimisation dual', dual(lamba), 'primal', primal(lamba) +    lamba = scipy.optimize.fmin_slsqp(dual, lamba, +                            bounds=[(0, delta)] * (num_edges * num_tags), +                            f_ieqcons=constraints, +                            fprime=dual_deriv, +                            fprime_ieqcons=constraints_deriv, +                            iprint=0) +    print 'Post lambda optimisation dual', dual(lamba), 'primal', primal(lamba) + +    # E-step +    llh = z = 0 +    for phrase, ccs in edges_phrase_to_context.items(): +        for context, count in ccs.items(): +            conditionals = zeros(num_tags) +            for t in range(num_tags): +                prob = tagDist[t] +                for i in range(4): +                    prob *= contextWordDist[i][t][types[context[i]]] +                conditionals[t] = prob +            cz = sum(conditionals) +            conditionals /= cz +       
     llh += log(cz) * count + +            scores = zeros(num_tags) +            li = lamba_index[phrase, context] +            for t in range(num_tags): +                scores[t] = conditionals[t] * exp(-lamba[li + t]) +            z += count * sum(scores) + +            tagCounts += count * scores +            for i in range(4): +                for t in range(num_tags): +                    contextWordCounts[i][t][types[context[i]]] += count * scores[t] + +    print 'iteration', iteration, 'llh', llh, 'logz', log(z) + +    # M-step +    tagDist = normalise(tagCounts) +    for i in range(4): +        for t in range(num_tags): +            contextWordDist[i][t] = normalise(contextWordCounts[i][t]) + + +for phrase, ccs in edges_phrase_to_context.items(): +    for context, count in ccs.items(): +        conditionals = zeros(num_tags) +        for t in range(num_tags): +            prob = tagDist[t] +            for i in range(4): +                prob *= contextWordDist[i][t][types[context[i]]] +            conditionals[t] = prob +        cz = sum(conditionals) +        conditionals /= cz + +        scores = zeros(num_tags) +        li = lamba_index[phrase, context] +        for t in range(num_tags): +            scores[t] = conditionals[t] * exp(-lamba[li + t]) + +        print '%s\t%s ||| C=%d ||| %d |||' % (phrase, context, count, argmax(scores)), scores / sum(scores)  | 
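In all three implementations the posterior-regularised e-step has the same shape: take the model posterior P(t|p,c), damp each tag by exp(-lambda_ptc), renormalise, and feed the log of the local normaliser into the dual, while the multipliers are kept inside sum_c lambda_pct <= delta (and, with gamma, the symmetric constraint over phrases). A tiny numeric sketch of just that step, with invented values and no connection to the training data above:

import numpy as np

# Model posterior P(t | p, c) over 5 tags for a single (phrase, context) edge,
# plus some Lagrange multipliers lambda_{p,c,t}; all values invented.
p_t = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
lam = np.array([2.0, 0.0, 0.0, 0.0, 0.0])    # penalise tag 0 on this edge

scores = p_t * np.exp(-lam)   # q(t|p,c) propto P(t|p,c) * exp(-lambda_{p,c,t})
z = scores.sum()              # local partition function; log(z) enters the dual
q = scores / z
print(q, np.log(z))           # posterior mass moves away from the penalised tag

Similarly, projected_gradient.cc stops after computing the direction of movement and leaves the line search as comments. The projection it applies, P = I - A^T (A A^T)^{-1} A acting on the gradient, is easy to sanity-check in numpy; the sketch below uses the same working point and active constraints as the C++ demo but is likewise not code from the commit.

import numpy as np

A = np.array([[2., 1., 1., 4.],     # the two equality constraints ...
              [1., 1., 2., 1.],
              [0., 0., 0., 1.]])    # ... plus the active bound x4 = 0
g = np.array([2*2 - 2, 2*2, 2*1, 2*0 - 3])   # gradient of f at x = (2, 2, 1, 0)

P = np.eye(4) - A.T @ np.linalg.inv(A @ A.T) @ A
d = P @ g                # projected gradient; -d is the feasible descent direction
print(d)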
