From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- .../prjava/src/phrase/Agree.java | 204 -------- .../prjava/src/phrase/Agree2Sides.java | 197 -------- .../prjava/src/phrase/C2F.java | 216 --------- .../prjava/src/phrase/Corpus.java | 288 ----------- .../prjava/src/phrase/Lexicon.java | 34 -- .../prjava/src/phrase/PhraseCluster.java | 540 --------------------- .../prjava/src/phrase/PhraseContextObjective.java | 436 ----------------- .../prjava/src/phrase/PhraseCorpus.java | 193 -------- .../prjava/src/phrase/PhraseObjective.java | 224 --------- .../prjava/src/phrase/Trainer.java | 257 ---------- .../prjava/src/phrase/VB.java | 419 ---------------- 11 files changed, 3008 deletions(-) delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Agree.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/C2F.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Corpus.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Lexicon.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseContextObjective.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/Trainer.java delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/VB.java (limited to 'gi/posterior-regularisation/prjava/src/phrase') diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree.java b/gi/posterior-regularisation/prjava/src/phrase/Agree.java deleted file mode 100644 index 8f7b499e..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/Agree.java +++ /dev/null @@ -1,204 +0,0 @@ -package phrase; - -import gnu.trove.TIntArrayList; - -import io.FileUtil; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.List; - -import phrase.Corpus.Edge; - -public class Agree { - PhraseCluster model1; - C2F model2; - Corpus c; - private int K,n_phrases, n_words, n_contexts, n_positions1,n_positions2; - - /**@brief sum of loglikelihood of two - * individual models - */ - public double llh; - /**@brief Bhattacharyya distance - * - */ - public double bdist; - /** - * - * @param numCluster - * @param corpus - */ - public Agree(int numCluster, Corpus corpus){ - - model1=new PhraseCluster(numCluster, corpus); - model2=new C2F(numCluster,corpus); - c=corpus; - n_words=c.getNumWords(); - n_phrases=c.getNumPhrases(); - n_contexts=c.getNumContexts(); - n_positions1=c.getNumContextPositions(); - n_positions2=2; - K=numCluster; - - } - - /**@brief test - * - */ - public static void main(String args[]){ - //String in="../pdata/canned.con"; - String in="../pdata/btec.con"; - String out="../pdata/posterior.out"; - int numCluster=25; - Corpus corpus = null; - File infile = new File(in); - try { - System.out.println("Reading concordance from " + infile); - corpus = Corpus.readFromFile(FileUtil.reader(infile)); - corpus.printStats(System.out); - } catch (IOException e) { - System.err.println("Failed to open input file: " + infile); - e.printStackTrace(); - System.exit(1); - } - - Agree agree=new Agree(numCluster, corpus); - int iter=20; - for(int i=0;i contexts = c.getEdgesForContext(context); - - for (int ctx=0; ctx 0; - bdist += edge.getCount() * Math.log(z); - arr.F.l1normalize(p); - - double count = edge.getCount(); - //increment expected count - TIntArrayList phraseToks = edge.getPhrase(); - TIntArrayList contextToks = edge.getContext(); - for(int tag=0;tag test) - { - for (Edge edge : test) - { - double probs[] = posterior(edge); - arr.F.l1normalize(probs); - - // emit phrase - ps.print(edge.getPhraseString()); - ps.print("\t"); - ps.print(edge.getContextString(true)); - int t=arr.F.argmax(probs); - ps.println(" ||| C=" + t); - } - } - -} diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java deleted file mode 100644 index 031f887f..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java +++ /dev/null @@ -1,197 +0,0 @@ -package phrase; - -import gnu.trove.TIntArrayList; - -import io.FileUtil; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.List; - -import phrase.Corpus.Edge; - -public class Agree2Sides { - PhraseCluster model1,model2; - Corpus c1,c2; - private int K; - - /**@brief sum of loglikelihood of two - * individual models - */ - public double llh; - /**@brief Bhattacharyya distance - * - */ - public double bdist; - /** - * - * @param numCluster - * @param corpus - */ - public Agree2Sides(int numCluster, Corpus corpus1 , Corpus corpus2 ){ - - model1=new PhraseCluster(numCluster, corpus1); - model2=new PhraseCluster(numCluster,corpus2); - c1=corpus1; - c2=corpus2; - K=numCluster; - - } - - /**@brief test - * - */ - public static void main(String args[]){ - //String in="../pdata/canned.con"; - // String in="../pdata/btec.con"; - String in1="../pdata/source.txt"; - String in2="../pdata/target.txt"; - String out="../pdata/posterior.out"; - int numCluster=25; - Corpus corpus1 = null,corpus2=null; - File infile1 = new File(in1),infile2=new File(in2); - try { - System.out.println("Reading concordance from " + infile1); - corpus1 = Corpus.readFromFile(FileUtil.reader(infile1)); - System.out.println("Reading concordance from " + infile2); - corpus2 = Corpus.readFromFile(FileUtil.reader(infile2)); - corpus1.printStats(System.out); - } catch (IOException e) { - System.err.println("Failed to open input file: " + infile1); - e.printStackTrace(); - System.exit(1); - } - - Agree2Sides agree=new Agree2Sides(numCluster, corpus1,corpus2); - int iter=20; - for(int i=0;i 0; - bdist += edge1.getCount() * Math.log(z); - arr.F.l1normalize(p); - double count = edge1.getCount(); - //increment expected count - TIntArrayList contextToks1 = edge1.getContext(); - TIntArrayList contextToks2 = edge2.getContext(); - int phrase1=edge1.getPhraseId(); - int phrase2=edge2.getPhraseId(); - for(int tag=0;tag contexts = c.getEdgesForContext(context); - - for (int ctx=0; ctx 0; - loglikelihood += edge.getCount() * Math.log(z); - arr.F.l1normalize(p); - - double count = edge.getCount(); - //increment expected count - TIntArrayList phrase= edge.getPhrase(); - for(int tag=0;tag EPS) - ps.print("\t" + j + ": " + pi[i][j]); - } - ps.println(); - } - - ps.println("P(word|tag,position)"); - for (int i = 0; i < K; ++i) - { - for(int position=0;position EPS) - ps.print(c.getWord(word)+"="+emit[i][position][word]+"\t"); - } - ps.println(); - } - ps.println(); - } - - } - -} diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java deleted file mode 100644 index 4b1939cd..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java +++ /dev/null @@ -1,288 +0,0 @@ -package phrase; - -import gnu.trove.TIntArrayList; - -import java.io.*; -import java.util.*; -import java.util.regex.Pattern; - - -public class Corpus -{ - private Lexicon wordLexicon = new Lexicon(); - private Lexicon phraseLexicon = new Lexicon(); - private Lexicon contextLexicon = new Lexicon(); - private List edges = new ArrayList(); - private List> phraseToContext = new ArrayList>(); - private List> contextToPhrase = new ArrayList>(); - public int splitSentinel; - public int phraseSentinel; - public int rareSentinel; - - public Corpus() - { - splitSentinel = wordLexicon.insert(""); - phraseSentinel = wordLexicon.insert(""); - rareSentinel = wordLexicon.insert(""); - } - - public class Edge - { - - Edge(int phraseId, int contextId, double count,int tag) - { - this.phraseId = phraseId; - this.contextId = contextId; - this.count = count; - fixTag=tag; - } - - Edge(int phraseId, int contextId, double count) - { - this.phraseId = phraseId; - this.contextId = contextId; - this.count = count; - fixTag=-1; - } - public int getTag(){ - return fixTag; - } - - public int getPhraseId() - { - return phraseId; - } - public TIntArrayList getPhrase() - { - return Corpus.this.getPhrase(phraseId); - } - public String getPhraseString() - { - return Corpus.this.getPhraseString(phraseId); - } - public int getContextId() - { - return contextId; - } - public TIntArrayList getContext() - { - return Corpus.this.getContext(contextId); - } - public String getContextString(boolean insertPhraseSentinel) - { - return Corpus.this.getContextString(contextId, insertPhraseSentinel); - } - public double getCount() - { - return count; - } - public boolean equals(Object other) - { - if (other instanceof Edge) - { - Edge oe = (Edge) other; - return oe.phraseId == phraseId && oe.contextId == contextId; - } - else return false; - } - public int hashCode() - { // this is how boost's hash_combine does it - int seed = phraseId; - seed ^= contextId + 0x9e3779b9 + (seed << 6) + (seed >> 2); - return seed; - } - public String toString() - { - return getPhraseString() + "\t" + getContextString(true); - } - - private int phraseId; - private int contextId; - private double count; - private int fixTag; - } - - List getEdges() - { - return edges; - } - - int getNumEdges() - { - return edges.size(); - } - - int getNumPhrases() - { - return phraseLexicon.size(); - } - - int getNumContextPositions() - { - return contextLexicon.lookup(0).size(); - } - - List getEdgesForPhrase(int phraseId) - { - return phraseToContext.get(phraseId); - } - - int getNumContexts() - { - return contextLexicon.size(); - } - - List getEdgesForContext(int contextId) - { - return contextToPhrase.get(contextId); - } - - int getNumWords() - { - return wordLexicon.size(); - } - - String getWord(int wordId) - { - return wordLexicon.lookup(wordId); - } - - public TIntArrayList getPhrase(int phraseId) - { - return phraseLexicon.lookup(phraseId); - } - - public String getPhraseString(int phraseId) - { - StringBuffer b = new StringBuffer(); - for (int tid: getPhrase(phraseId).toNativeArray()) - { - if (b.length() > 0) - b.append(" "); - b.append(wordLexicon.lookup(tid)); - } - return b.toString(); - } - - public TIntArrayList getContext(int contextId) - { - return contextLexicon.lookup(contextId); - } - - public String getContextString(int contextId, boolean insertPhraseSentinel) - { - StringBuffer b = new StringBuffer(); - TIntArrayList c = getContext(contextId); - for (int i = 0; i < c.size(); ++i) - { - if (i > 0) b.append(" "); - //if (i == c.size() / 2) b.append(" "); - b.append(wordLexicon.lookup(c.get(i))); - } - return b.toString(); - } - - public boolean isSentinel(int wordId) - { - return wordId == splitSentinel || wordId == phraseSentinel; - } - - List readEdges(Reader in) throws IOException - { - // read in line-by-line - BufferedReader bin = new BufferedReader(in); - String line; - Pattern separator = Pattern.compile(" \\|\\|\\| "); - - List edges = new ArrayList(); - while ((line = bin.readLine()) != null) - { - // split into phrase and contexts - StringTokenizer st = new StringTokenizer(line, "\t"); - assert (st.hasMoreTokens()); - String phraseToks = st.nextToken(); - assert (st.hasMoreTokens()); - String rest = st.nextToken(); - assert (!st.hasMoreTokens()); - - // process phrase - st = new StringTokenizer(phraseToks, " "); - TIntArrayList ptoks = new TIntArrayList(); - while (st.hasMoreTokens()) - ptoks.add(wordLexicon.insert(st.nextToken())); - int phraseId = phraseLexicon.insert(ptoks); - - // process contexts - String[] parts = separator.split(rest); - assert (parts.length % 2 == 0); - for (int i = 0; i < parts.length; i += 2) - { - // process pairs of strings - context and count - String ctxString = parts[i]; - String countString = parts[i + 1]; - - assert (countString.startsWith("C=")); - - String []countToks=countString.split(" "); - - double count = Double.parseDouble(countToks[0].substring(2).trim()); - - TIntArrayList ctx = new TIntArrayList(); - StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); - while (ctxStrtok.hasMoreTokens()) - { - String token = ctxStrtok.nextToken(); - ctx.add(wordLexicon.insert(token)); - } - int contextId = contextLexicon.insert(ctx); - - - if(countToks.length<2){ - edges.add(new Edge(phraseId, contextId, count)); - } - else{ - int tag=Integer.parseInt(countToks[1].substring(2)); - edges.add(new Edge(phraseId, contextId, count,tag)); - } - } - } - return edges; - } - - static Corpus readFromFile(Reader in) throws IOException - { - Corpus c = new Corpus(); - c.edges = c.readEdges(in); - for (Edge edge: c.edges) - { - while (edge.getPhraseId() >= c.phraseToContext.size()) - c.phraseToContext.add(new ArrayList()); - while (edge.getContextId() >= c.contextToPhrase.size()) - c.contextToPhrase.add(new ArrayList()); - - // index the edge for fast phrase, context lookup - c.phraseToContext.get(edge.getPhraseId()).add(edge); - c.contextToPhrase.get(edge.getContextId()).add(edge); - } - return c; - } - - TIntArrayList phraseEdges(TIntArrayList phrase) - { - TIntArrayList r = new TIntArrayList(4); - for (int p = 0; p < phrase.size(); ++p) - { - if (p == 0 || phrase.get(p-1) == splitSentinel) - r.add(p); - if (p == phrase.size() - 1 || phrase.get(p+1) == splitSentinel) - r.add(p); - } - return r; - } - - public void printStats(PrintStream out) - { - out.println("Corpus has " + edges.size() + " edges " + phraseLexicon.size() + " phrases " - + contextLexicon.size() + " contexts and " + wordLexicon.size() + " word types"); - } -} \ No newline at end of file diff --git a/gi/posterior-regularisation/prjava/src/phrase/Lexicon.java b/gi/posterior-regularisation/prjava/src/phrase/Lexicon.java deleted file mode 100644 index a386e4a3..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/Lexicon.java +++ /dev/null @@ -1,34 +0,0 @@ -package phrase; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class Lexicon -{ - public int insert(T word) - { - Integer i = wordToIndex.get(word); - if (i == null) - { - i = indexToWord.size(); - wordToIndex.put(word, i); - indexToWord.add(word); - } - return i; - } - - public T lookup(int index) - { - return indexToWord.get(index); - } - - public int size() - { - return indexToWord.size(); - } - - private Map wordToIndex = new HashMap(); - private List indexToWord = new ArrayList(); -} \ No newline at end of file diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java deleted file mode 100644 index c032bb2b..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java +++ /dev/null @@ -1,540 +0,0 @@ -package phrase; - -import gnu.trove.TIntArrayList; -import org.apache.commons.math.special.Gamma; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.regex.Pattern; - -import phrase.Corpus.Edge; - - -public class PhraseCluster { - - public int K; - private int n_phrases, n_words, n_contexts, n_positions; - public Corpus c; - public ExecutorService pool; - - double[] lambdaPTCT; - double[][] lambdaPT; - boolean cacheLambda = true; - - // emit[tag][position][word] = p(word | tag, position in context) - double emit[][][]; - // pi[phrase][tag] = p(tag | phrase) - double pi[][]; - - public PhraseCluster(int numCluster, Corpus corpus) - { - K=numCluster; - c=corpus; - n_words=c.getNumWords(); - n_phrases=c.getNumPhrases(); - n_contexts=c.getNumContexts(); - n_positions=c.getNumContextPositions(); - - emit=new double [K][n_positions][n_words]; - pi=new double[n_phrases][K]; - - for(double [][]i:emit) - for(double []j:i) - arr.F.randomise(j, true); - - for(double []j:pi) - arr.F.randomise(j, true); - } - - void useThreadPool(ExecutorService pool) - { - this.pool = pool; - } - - public double EM(int phraseSizeLimit) - { - double [][][]exp_emit=new double [K][n_positions][n_words]; - double []exp_pi=new double[K]; - - for(double [][]i:exp_emit) - for(double []j:i) - Arrays.fill(j, 1e-10); - - double loglikelihood=0; - - //E - for(int phrase=0; phrase < n_phrases; phrase++) - { - if (phraseSizeLimit >= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) - continue; - - Arrays.fill(exp_pi, 1e-10); - - List contexts = c.getEdgesForPhrase(phrase); - - for (int ctx=0; ctx 0; - loglikelihood += edge.getCount() * Math.log(z); - arr.F.l1normalize(p); - - double count = edge.getCount(); - //increment expected count - TIntArrayList context = edge.getContext(); - for(int tag=0;tag= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) - { - //System.arraycopy(pi[phrase], 0, exp_pi[phrase], 0, K); - continue; - } - - Arrays.fill(exp_pi, 1e-10); - - // FIXME: add rare edge check to phrase objective & posterior processing - PhraseObjective po = new PhraseObjective(this, phrase, scalePT, (cacheLambda) ? lambdaPT[phrase] : null); - boolean ok = po.optimizeWithProjectedGradientDescent(); - if (!ok) ++failures; - if (cacheLambda) lambdaPT[phrase] = po.getParameters(); - iterations += po.getNumberUpdateCalls(); - double [][] q=po.posterior(); - loglikelihood += po.loglikelihood(); - kl += po.KL_divergence(); - l1lmax += po.l1lmax(); - primal += po.primal(scalePT); - List edges = c.getEdgesForPhrase(phrase); - - for(int edge=0;edge 0) - System.out.println("WARNING: failed to converge in " + failures + "/" + n_phrases + " cases"); - System.out.println("\tmean iters: " + iterations/(double)n_phrases + " elapsed time " + (end - start) / 1000.0); - System.out.println("\tllh: " + loglikelihood); - System.out.println("\tKL: " + kl); - System.out.println("\tphrase l1lmax: " + l1lmax); - - //M - for(double [][]i:exp_emit) - for(double []j:i) - arr.F.l1normalize(j); - emit=exp_emit; - - return primal; - } - - public double PREM_phrase_constraints_parallel(final double scalePT, int phraseSizeLimit) - { - assert(pool != null); - - final LinkedBlockingQueue expectations - = new LinkedBlockingQueue(); - - double [][][]exp_emit=new double [K][n_positions][n_words]; - double [][]exp_pi=new double[n_phrases][K]; - - for(double [][]i:exp_emit) - for(double []j:i) - Arrays.fill(j, 1e-10); - for(double []j:exp_pi) - Arrays.fill(j, 1e-10); - - double loglikelihood=0, kl=0, l1lmax=0, primal=0; - final AtomicInteger failures = new AtomicInteger(0); - final AtomicLong elapsed = new AtomicLong(0l); - int iterations=0; - long start = System.currentTimeMillis(); - List> results = new ArrayList>(); - - if (lambdaPT == null && cacheLambda) - lambdaPT = new double[n_phrases][]; - - //E - for(int phrase=0;phrase= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) { - System.arraycopy(pi[phrase], 0, exp_pi[phrase], 0, K); - continue; - } - - final int p=phrase; - results.add(pool.submit(new Callable() { - public PhraseObjective call() { - //System.out.println("" + Thread.currentThread().getId() + " optimising lambda for " + p); - long start = System.currentTimeMillis(); - PhraseObjective po = new PhraseObjective(PhraseCluster.this, p, scalePT, (cacheLambda) ? lambdaPT[p] : null); - boolean ok = po.optimizeWithProjectedGradientDescent(); - if (!ok) failures.incrementAndGet(); - long end = System.currentTimeMillis(); - elapsed.addAndGet(end - start); - //System.out.println("" + Thread.currentThread().getId() + " done optimising lambda for " + p); - return po; - } - })); - } - - // aggregate the expectations as they become available - for (Future fpo : results) - { - try { - //System.out.println("" + Thread.currentThread().getId() + " reading queue #" + count); - - // wait (blocking) until something is ready - PhraseObjective po = fpo.get(); - // process - int phrase = po.phrase; - if (cacheLambda) lambdaPT[phrase] = po.getParameters(); - //System.out.println("" + Thread.currentThread().getId() + " taken phrase " + phrase); - double [][] q=po.posterior(); - loglikelihood += po.loglikelihood(); - kl += po.KL_divergence(); - l1lmax += po.l1lmax(); - primal += po.primal(scalePT); - iterations += po.getNumberUpdateCalls(); - - List edges = c.getEdgesForPhrase(phrase); - for(int edge=0;edge 0) - System.out.println("WARNING: failed to converge in " + failures.get() + "/" + n_phrases + " cases"); - System.out.println("\tmean iters: " + iterations/(double)n_phrases + " walltime " + (end-start)/1000.0 + " threads " + elapsed.get() / 1000.0); - System.out.println("\tllh: " + loglikelihood); - System.out.println("\tKL: " + kl); - System.out.println("\tphrase l1lmax: " + l1lmax); - - //M - for(double [][]i:exp_emit) - for(double []j:i) - arr.F.l1normalize(j); - emit=exp_emit; - - for(double []j:exp_pi) - arr.F.l1normalize(j); - pi=exp_pi; - - return primal; - } - - public double PREM_phrase_context_constraints(double scalePT, double scaleCT) - { - double[][][] exp_emit = new double [K][n_positions][n_words]; - double[][] exp_pi = new double[n_phrases][K]; - - //E step - PhraseContextObjective pco = new PhraseContextObjective(this, lambdaPTCT, pool, scalePT, scaleCT); - boolean ok = pco.optimizeWithProjectedGradientDescent(); - if (cacheLambda) lambdaPTCT = pco.getParameters(); - - //now extract expectations - List edges = c.getEdges(); - for(int e = 0; e < edges.size(); ++e) - { - double [] q = pco.posterior(e); - Corpus.Edge edge = edges.get(e); - - TIntArrayList context = edge.getContext(); - double contextCnt = edge.getCount(); - //increment expected count - for(int tag=0;tag=0){ - prob=new double[K]; - prob[edge.getTag()]=1; - return prob; - } - - if (edge.getPhraseId() < n_phrases) - prob = Arrays.copyOf(pi[edge.getPhraseId()], K); - else - { - prob = new double[K]; - Arrays.fill(prob, 1.0); - } - - TIntArrayList ctx = edge.getContext(); - for(int tag=0;tag testing) - { - for (Edge edge : testing) - { - double probs[] = posterior(edge); - arr.F.l1normalize(probs); - - // emit phrase - ps.print(edge.getPhraseString()); - ps.print("\t"); - ps.print(edge.getContextString(true)); - int t=arr.F.argmax(probs); - ps.println(" ||| C=" + t + " T=" + edge.getCount() + " P=" + probs[t]); - //ps.println("# probs " + Arrays.toString(probs)); - } - } - - public void displayModelParam(PrintStream ps) - { - final double EPS = 1e-6; - ps.println("phrases " + n_phrases + " tags " + K + " positions " + n_positions); - - for (int i = 0; i < n_phrases; ++i) - for(int j=0;j EPS) - ps.println(i + " " + j + " " + pi[i][j]); - - ps.println(); - for (int i = 0; i < K; ++i) - { - for(int position=0;position EPS) - ps.println(i + " " + position + " " + word + " " + emit[i][position][word]); - } - } - } - } - - double phrase_l1lmax() - { - double sum=0; - for(int phrase=0; phrase data; - - // log likelihood under q - private double loglikelihood; - private SimplexProjection projectionPhrase; - private SimplexProjection projectionContext; - - double[] newPoint; - private int n_param; - - // likelihood under p - public double llh; - - private static Map edgeIndex; - - private long projectionTime; - private long objectiveTime; - private long actualProjectionTime; - private ExecutorService pool; - - double scalePT; - double scaleCT; - - public PhraseContextObjective(PhraseCluster cluster, double[] startingParameters, ExecutorService pool, - double scalePT, double scaleCT) - { - c=cluster; - data=c.c.getEdges(); - n_param=data.size()*c.K*2; - this.pool=pool; - this.scalePT = scalePT; - this.scaleCT = scaleCT; - - parameters = startingParameters; - if (parameters == null) - parameters = new double[n_param]; - - System.out.println("Num parameters " + n_param); - newPoint = new double[n_param]; - gradient = new double[n_param]; - initP(); - projectionPhrase = new SimplexProjection(scalePT); - projectionContext = new SimplexProjection(scaleCT); - q=new double [data.size()][c.K]; - - if (edgeIndex == null) { - edgeIndex = new HashMap(); - for (int e=0; e> tasks = new ArrayList>(); - - System.out.print(","); - System.out.flush(); - - Arrays.fill(newPoint, 0, newPoint.length, 0); - - // first project using the phrase-tag constraints, - // for all p,t: sum_c lambda_ptc < scaleP - if (pool == null) - { - for (int p = 0; p < c.c.getNumPhrases(); ++p) - { - List edges = c.c.getEdgesForPhrase(p); - double[] toProject = new double[edges.size()]; - for(int tag=0;tag edges = c.c.getEdgesForPhrase(phrase); - double toProject[] = new double[edges.size()]; - for(int tag=0;tag edges = c.c.getEdgesForContext(ctx); - double toProject[] = new double[edges.size()]; - for(int tag=0;tag edges = c.c.getEdgesForContext(context); - double toProject[] = new double[edges.size()]; - for(int tag=0;tag task: tasks) - { - try { - task.get(); - } catch (InterruptedException e) { - System.err.println("ERROR: Projection thread interrupted"); - e.printStackTrace(); - failure = e; - } catch (ExecutionException e) { - System.err.println("ERROR: Projection thread died"); - e.printStackTrace(); - failure = e; - } - } - // rethrow the exception - if (failure != null) - { - pool.shutdownNow(); - throw new RuntimeException(failure); - } - } - - double[] tmp = newPoint; - newPoint = point; - projectionTime += System.currentTimeMillis() - begin; - - //if (debug) - //System.out.println("\t\treturning " + Arrays.toString(tmp)); - return tmp; - } - - private int index(Edge edge, int tag, boolean phrase) - { - // NB if indexing changes must also change code in updateFunction and constructor - if (phrase) - return tag * edgeIndex.size() + edgeIndex.get(edge); - else - return (c.K + tag) * edgeIndex.size() + edgeIndex.get(edge); - } - - private int index(int e, int tag, boolean phrase) - { - // NB if indexing changes must also change code in updateFunction and constructor - if (phrase) - return tag * edgeIndex.size() + e; - else - return (c.K + tag) * edgeIndex.size() + e; - } - - @Override - public double[] getGradient() { - gradientCalls++; - return gradient; - } - - @Override - public double getValue() { - functionCalls++; - return loglikelihood; - } - - @Override - public String toString() { - return "No need for pointless toString"; - } - - public double []posterior(int edgeIndex){ - return q[edgeIndex]; - } - - public boolean optimizeWithProjectedGradientDescent() - { - projectionTime = 0; - actualProjectionTime = 0; - objectiveTime = 0; - long start = System.currentTimeMillis(); - - LineSearchMethod ls = - new ArmijoLineSearchMinimizationAlongProjectionArc - (new InterpolationPickFirstStep(INIT_STEP_SIZE)); - //LineSearchMethod ls = new WolfRuleLineSearch( - // (new InterpolationPickFirstStep(INIT_STEP_SIZE)), c1, c2); - OptimizerStats stats = new OptimizerStats(); - - - ProjectedGradientDescent optimizer = new ProjectedGradientDescent(ls); - StopingCriteria stopGrad = new ProjectedGradientL2Norm(GRAD_DIFF); - StopingCriteria stopValue = new ValueDifference(VAL_DIFF*(-llh)); - CompositeStopingCriteria compositeStop = new CompositeStopingCriteria(); - compositeStop.add(stopGrad); - compositeStop.add(stopValue); - optimizer.setMaxIterations(ITERATIONS); - updateFunction(); - boolean success = optimizer.optimize(this,stats,compositeStop); - - System.out.println(); - System.out.println(stats.prettyPrint(1)); - - if (success) - System.out.print("\toptimization took " + optimizer.getCurrentIteration() + " iterations"); - else - System.out.print("\toptimization failed to converge"); - long total = System.currentTimeMillis() - start; - System.out.println(" and " + total + " ms: projection " + projectionTime + - " actual " + actualProjectionTime + " objective " + objectiveTime); - - return success; - } - - double loglikelihood() - { - return llh; - } - - double KL_divergence() - { - return -loglikelihood + MathUtils.dotProduct(parameters, gradient); - } - - double phrase_l1lmax() - { - // \sum_{tag,phrase} max_{context} P(tag|context,phrase) - double sum=0; - for (int p = 0; p < c.c.getNumPhrases(); ++p) - { - List edges = c.c.getEdgesForPhrase(p); - for(int tag=0;tag edges = c.c.getEdgesForContext(ctx); - for(int tag=0; tagwordLex; - public HashMapphraseLex; - - public String wordList[]; - public String phraseList[]; - - //data[phrase][num context][position] - public int data[][][]; - public int numContexts; - - public PhraseCorpus(String filename) throws FileNotFoundException, IOException - { - BufferedReader r = FileUtil.reader(new File(filename)); - - phraseLex=new HashMap(); - wordLex=new HashMap(); - - ArrayListdataList=new ArrayList(); - String line=null; - numContexts = 0; - - while((line=readLine(r))!=null){ - - String toks[]=line.split("\t"); - String phrase=toks[0]; - addLex(phrase,phraseLex); - - toks=toks[1].split(" \\|\\|\\| "); - - ArrayList ctxList=new ArrayList(); - - for(int i=0;i")){ - continue; - } - addLex(word,wordLex); - context[idx]=wordLex.get(word); - idx++; - } - - String count=toks[i+1]; - context[idx]=Integer.parseInt(count.trim().substring(2)); - - ctxList.add(context); - } - - dataList.add(ctxList.toArray(new int [0][])); - - } - try{ - r.close(); - }catch(IOException ioe){ - ioe.printStackTrace(); - } - data=dataList.toArray(new int[0][][]); - } - - private void addLex(String key, HashMaplex){ - Integer i=lex.get(key); - if(i==null){ - lex.put(key, lex.size()); - } - } - - //for debugging - public void saveLex(String lexFilename) throws FileNotFoundException, IOException - { - PrintStream ps = FileUtil.printstream(new File(lexFilename)); - ps.println("Phrase Lexicon"); - ps.println(phraseLex.size()); - printDict(phraseLex,ps); - - ps.println("Word Lexicon"); - ps.println(wordLex.size()); - printDict(wordLex,ps); - ps.close(); - } - - private static void printDict(HashMaplex,PrintStream ps){ - String []dict=buildList(lex); - for(int i=0;i buildMap(String[]dict){ - HashMap map=new HashMap(); - for(int i=0;ilex){ - String dict[]=new String [lex.size()]; - for(String key:lex.keySet()){ - dict[lex.get(key)]=key; - } - return dict; - } - - public String getContextString(int context[], boolean addPhraseMarker) - { - StringBuffer b = new StringBuffer(); - for (int i=0;i 0) - b.append(" "); - - if (i == context.length/2) - b.append(" "); - - b.append(wordList[context[i]]); - } - return b.toString(); - } - - public static String readLine(BufferedReader r){ - try{ - return r.readLine(); - } - catch(IOException ioe){ - ioe.printStackTrace(); - } - return null; - } - - public static void main(String[] args) throws Exception - { - String LEX_FILENAME="../pdata/lex.out"; - String DATA_FILENAME="../pdata/btec.con"; - PhraseCorpus c=new PhraseCorpus(DATA_FILENAME); - c.saveLex(LEX_FILENAME); - c.loadLex(LEX_FILENAME); - c.saveLex(LEX_FILENAME); - } -} diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java deleted file mode 100644 index ac73a075..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java +++ /dev/null @@ -1,224 +0,0 @@ -package phrase; - -import java.util.Arrays; -import java.util.List; - -import optimization.gradientBasedMethods.ProjectedGradientDescent; -import optimization.gradientBasedMethods.ProjectedObjective; -import optimization.gradientBasedMethods.stats.OptimizerStats; -import optimization.linesearch.ArmijoLineSearchMinimizationAlongProjectionArc; -import optimization.linesearch.InterpolationPickFirstStep; -import optimization.linesearch.LineSearchMethod; -import optimization.linesearch.WolfRuleLineSearch; -import optimization.projections.SimplexProjection; -import optimization.stopCriteria.CompositeStopingCriteria; -import optimization.stopCriteria.ProjectedGradientL2Norm; -import optimization.stopCriteria.StopingCriteria; -import optimization.stopCriteria.ValueDifference; -import optimization.util.MathUtils; - -public class PhraseObjective extends ProjectedObjective -{ - static final double GRAD_DIFF = 0.00002; - static double INIT_STEP_SIZE = 300; - static double VAL_DIFF = 1e-8; // tuned to BTEC subsample - static int ITERATIONS = 100; - private PhraseCluster c; - - /**@brief - * for debugging purposes - */ - //public static PrintStream ps; - - /**@brief current phrase being optimzed*/ - public int phrase; - - /**@brief un-regularized posterior - * unnormalized - * p[edge][tag] - * P(tag|edge) \propto P(tag|phrase)P(context|tag) - */ - private double[][]p; - - /**@brief regularized posterior - * q[edge][tag] propto p[edge][tag]*exp(-lambda) - */ - private double q[][]; - private List data; - - /**@brief log likelihood of the associated phrase - * - */ - private double loglikelihood; - private SimplexProjection projection; - - double[] newPoint ; - - private int n_param; - - /**@brief likelihood under p - * - */ - public double llh; - - public PhraseObjective(PhraseCluster cluster, int phraseIdx, double scale, double[] lambda){ - phrase=phraseIdx; - c=cluster; - data=c.c.getEdgesForPhrase(phrase); - n_param=data.size()*c.K; - //System.out.println("Num parameters " + n_param + " for phrase #" + phraseIdx); - - if (lambda==null) - lambda=new double[n_param]; - - parameters = lambda; - newPoint = new double[n_param]; - gradient = new double[n_param]; - initP(); - projection=new SimplexProjection(scale); - q=new double [data.size()][c.K]; - - setParameters(parameters); - } - - private void initP(){ - p=new double[data.size()][]; - for(int edge=0;edgemax) - max=q[edge][tag]; - } - sum+=max; - } - return sum; - } - - public double primal(double scale) - { - return loglikelihood() - KL_divergence() - scale * l1lmax(); - } -} diff --git a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java deleted file mode 100644 index 6f302b20..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java +++ /dev/null @@ -1,257 +0,0 @@ -package phrase; - -import io.FileUtil; -import joptsimple.OptionParser; -import joptsimple.OptionSet; -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.List; -import java.util.Random; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -import phrase.Corpus.Edge; - -import arr.F; - -public class Trainer -{ - public static void main(String[] args) - { - OptionParser parser = new OptionParser(); - parser.accepts("help"); - parser.accepts("in").withRequiredArg().ofType(File.class); - parser.accepts("in1").withRequiredArg().ofType(File.class); - parser.accepts("test").withRequiredArg().ofType(File.class); - parser.accepts("out").withRequiredArg().ofType(File.class); - parser.accepts("start").withRequiredArg().ofType(File.class); - parser.accepts("parameters").withRequiredArg().ofType(File.class); - parser.accepts("topics").withRequiredArg().ofType(Integer.class).defaultsTo(5); - parser.accepts("iterations").withRequiredArg().ofType(Integer.class).defaultsTo(10); - parser.accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(0); - parser.accepts("scale-phrase").withRequiredArg().ofType(Double.class).defaultsTo(0.0); - parser.accepts("scale-context").withRequiredArg().ofType(Double.class).defaultsTo(0.0); - parser.accepts("seed").withRequiredArg().ofType(Long.class).defaultsTo(0l); - parser.accepts("convergence-threshold").withRequiredArg().ofType(Double.class).defaultsTo(1e-6); - parser.accepts("variational-bayes"); - parser.accepts("alpha-emit").withRequiredArg().ofType(Double.class).defaultsTo(0.1); - parser.accepts("alpha-pi").withRequiredArg().ofType(Double.class).defaultsTo(0.0001); - parser.accepts("agree-direction"); - parser.accepts("agree-language"); - parser.accepts("no-parameter-cache"); - parser.accepts("skip-large-phrases").withRequiredArg().ofType(Integer.class).defaultsTo(5); - OptionSet options = parser.parse(args); - - if (options.has("help") || !options.has("in")) - { - try { - parser.printHelpOn(System.err); - } catch (IOException e) { - System.err.println("This should never happen."); - e.printStackTrace(); - } - System.exit(1); - } - - int tags = (Integer) options.valueOf("topics"); - int iterations = (Integer) options.valueOf("iterations"); - double scale_phrase = (Double) options.valueOf("scale-phrase"); - double scale_context = (Double) options.valueOf("scale-context"); - int threads = (Integer) options.valueOf("threads"); - double threshold = (Double) options.valueOf("convergence-threshold"); - boolean vb = options.has("variational-bayes"); - double alphaEmit = (vb) ? (Double) options.valueOf("alpha-emit") : 0; - double alphaPi = (vb) ? (Double) options.valueOf("alpha-pi") : 0; - int skip = (Integer) options.valueOf("skip-large-phrases"); - - if (options.has("seed")) - F.rng = new Random((Long) options.valueOf("seed")); - - ExecutorService threadPool = null; - if (threads > 0) - threadPool = Executors.newFixedThreadPool(threads); - - if (tags <= 1 || scale_phrase < 0 || scale_context < 0 || threshold < 0) - { - System.err.println("Invalid arguments. Try again!"); - System.exit(1); - } - - Corpus corpus = null; - File infile = (File) options.valueOf("in"); - Corpus corpus1 = null; - File infile1 = (File) options.valueOf("in1"); - try { - System.out.println("Reading concordance from " + infile); - corpus = Corpus.readFromFile(FileUtil.reader(infile)); - corpus.printStats(System.out); - if(options.has("in1")){ - corpus1 = Corpus.readFromFile(FileUtil.reader(infile1)); - corpus1.printStats(System.out); - } - } catch (IOException e) { - System.err.println("Failed to open input file: " + infile); - e.printStackTrace(); - System.exit(1); - } - - if (!(options.has("agree-direction")||options.has("agree-language"))) - System.out.println("Running with " + tags + " tags " + - "for " + iterations + " iterations " + - ((skip > 0) ? "skipping large phrases for first " + skip + " iterations " : "") + - "with scale " + scale_phrase + " phrase and " + scale_context + " context " + - "and " + threads + " threads"); - else - System.out.println("Running agreement model with " + tags + " tags " + - "for " + iterations); - - System.out.println(); - - PhraseCluster cluster = null; - Agree2Sides agree2sides = null; - Agree agree= null; - VB vbModel=null; - if (options.has("agree-language")) - agree2sides = new Agree2Sides(tags, corpus,corpus1); - else if (options.has("agree-direction")) - agree = new Agree(tags, corpus); - else - { - if (vb) - { - vbModel=new VB(tags,corpus); - vbModel.alpha=alphaPi; - vbModel.lambda=alphaEmit; - if (threadPool != null) vbModel.useThreadPool(threadPool); - } - else - { - cluster = new PhraseCluster(tags, corpus); - if (threadPool != null) cluster.useThreadPool(threadPool); - - if (options.has("no-parameter-cache")) - cluster.cacheLambda = false; - if (options.has("start")) - { - try { - System.err.println("Reading starting parameters from " + options.valueOf("start")); - cluster.loadParameters(FileUtil.reader((File)options.valueOf("start"))); - } catch (IOException e) { - System.err.println("Failed to open input file: " + options.valueOf("start")); - e.printStackTrace(); - } - } - } - } - - double last = 0; - for (int i=0; i < iterations; i++) - { - double o; - if (agree != null) - o = agree.EM(); - else if(agree2sides!=null) - o = agree2sides.EM(); - else - { - if (i < skip) - System.out.println("Skipping phrases of length > " + (i+1)); - - if (scale_phrase <= 0 && scale_context <= 0) - { - if (!vb) - o = cluster.EM((i < skip) ? i+1 : 0); - else - o = vbModel.EM(); - } - else - o = cluster.PREM(scale_phrase, scale_context, (i < skip) ? i+1 : 0); - } - - System.out.println("ITER: "+i+" objective: " + o); - - // sometimes takes a few iterations to break the ties - if (i > 5 && Math.abs((o - last) / o) < threshold) - { - last = o; - break; - } - last = o; - } - - double pl1lmax = 0, cl1lmax = 0; - if (cluster != null) - { - pl1lmax = cluster.phrase_l1lmax(); - cl1lmax = cluster.context_l1lmax(); - } - else if (agree != null) - { - // fairly arbitrary choice of model1 cf model2 - pl1lmax = agree.model1.phrase_l1lmax(); - cl1lmax = agree.model1.context_l1lmax(); - } - else if (agree2sides != null) - { - // fairly arbitrary choice of model1 cf model2 - pl1lmax = agree2sides.model1.phrase_l1lmax(); - cl1lmax = agree2sides.model1.context_l1lmax(); - } - - System.out.println("\nFinal posterior phrase l1lmax " + pl1lmax + " context l1lmax " + cl1lmax); - - if (options.has("out")) - { - File outfile = (File) options.valueOf("out"); - try { - PrintStream ps = FileUtil.printstream(outfile); - List test; - if (!options.has("test")) // just use the training - test = corpus.getEdges(); - else - { // if --test supplied, load up the file - infile = (File) options.valueOf("test"); - System.out.println("Reading testing concordance from " + infile); - test = corpus.readEdges(FileUtil.reader(infile)); - } - if(vb) { - assert !options.has("test"); - vbModel.displayPosterior(ps); - } else if (cluster != null) - cluster.displayPosterior(ps, test); - else if (agree != null) - agree.displayPosterior(ps, test); - else if (agree2sides != null) { - assert !options.has("test"); - agree2sides.displayPosterior(ps); - } - - ps.close(); - } catch (IOException e) { - System.err.println("Failed to open either testing file or output file"); - e.printStackTrace(); - System.exit(1); - } - } - - if (options.has("parameters")) - { - assert !vb; - File outfile = (File) options.valueOf("parameters"); - PrintStream ps; - try { - ps = FileUtil.printstream(outfile); - cluster.displayModelParam(ps); - ps.close(); - } catch (IOException e) { - System.err.println("Failed to open output parameters file: " + outfile); - e.printStackTrace(); - System.exit(1); - } - } - - if (cluster != null && cluster.pool != null) - cluster.pool.shutdown(); - } -} diff --git a/gi/posterior-regularisation/prjava/src/phrase/VB.java b/gi/posterior-regularisation/prjava/src/phrase/VB.java deleted file mode 100644 index cd3f4966..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/VB.java +++ /dev/null @@ -1,419 +0,0 @@ -package phrase; - -import gnu.trove.TIntArrayList; - -import io.FileUtil; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; - -import org.apache.commons.math.special.Gamma; - -import phrase.Corpus.Edge; - -public class VB { - - public static int MAX_ITER=400; - - /**@brief - * hyper param for beta - * where beta is multinomial - * for generating words from a topic - */ - public double lambda=0.1; - /**@brief - * hyper param for theta - * where theta is dirichlet for z - */ - public double alpha=0.0001; - /**@brief - * variational param for beta - */ - private double rho[][][]; - private double digamma_rho[][][]; - private double rho_sum[][]; - /**@brief - * variational param for z - */ - //private double phi[][]; - /**@brief - * variational param for theta - */ - private double gamma[]; - private static double VAL_DIFF_RATIO=0.005; - - private int n_positions; - private int n_words; - private int K; - private ExecutorService pool; - - private Corpus c; - public static void main(String[] args) { - // String in="../pdata/canned.con"; - String in="../pdata/btec.con"; - String out="../pdata/vb.out"; - int numCluster=25; - Corpus corpus = null; - File infile = new File(in); - try { - System.out.println("Reading concordance from " + infile); - corpus = Corpus.readFromFile(FileUtil.reader(infile)); - corpus.printStats(System.out); - } catch (IOException e) { - System.err.println("Failed to open input file: " + infile); - e.printStackTrace(); - System.exit(1); - } - - VB vb=new VB(numCluster, corpus); - int iter=20; - for(int i=0;idoc=c.getEdgesForPhrase(d); - for(int n=0;n doc=c.getEdgesForPhrase(phraseID); - for(int i=0;i 0){ - phisum = log_sum(phisum, phi[n][i]); - } - else{ - phisum = phi[n][i]; - } - - }//end of a word - - for(int i=0;i1e-10){ - obj+=phi[n][i]*Math.log(phi[n][i]); - } - - double beta_sum=0; - for(int pos=0;pos0 && (obj-prev_val)/Math.abs(obj) doc=c.getEdgesForPhrase(d); - double[][] phi = new double[doc.size()][K]; - double[] gamma = new double[K]; - - emObj += inference(d, phi, gamma); - - for(int n=0;n - { - double[][] phi; - double[] gamma; - double obj; - int d; - PartialEStep(int d) { this.d = d; } - - public PartialEStep call() - { - phi = new double[c.getEdgesForPhrase(d).size()][K]; - gamma = new double[K]; - obj = inference(d, phi, gamma); - return this; - } - } - - List> jobs = new ArrayList>(); - for (int d=0;d job: jobs) - { - try { - PartialEStep e = job.get(); - - emObj += e.obj; - List doc = c.getEdgesForPhrase(e.d); - for(int n=0;n doc=c.getEdgesForPhrase(d); - double[][] phi = new double[doc.size()][K]; - for(int i=0;i