From 925087356b853e2099c1b60d8b757d7aa02121a9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- .../prjava/src/phrase/PhraseCluster.java | 540 --------------------- 1 file changed, 540 deletions(-) delete mode 100644 gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java (limited to 'gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java') diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java deleted file mode 100644 index c032bb2b..00000000 --- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java +++ /dev/null @@ -1,540 +0,0 @@ -package phrase; - -import gnu.trove.TIntArrayList; -import org.apache.commons.math.special.Gamma; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.regex.Pattern; - -import phrase.Corpus.Edge; - - -public class PhraseCluster { - - public int K; - private int n_phrases, n_words, n_contexts, n_positions; - public Corpus c; - public ExecutorService pool; - - double[] lambdaPTCT; - double[][] lambdaPT; - boolean cacheLambda = true; - - // emit[tag][position][word] = p(word | tag, position in context) - double emit[][][]; - // pi[phrase][tag] = p(tag | phrase) - double pi[][]; - - public PhraseCluster(int numCluster, Corpus corpus) - { - K=numCluster; - c=corpus; - n_words=c.getNumWords(); - n_phrases=c.getNumPhrases(); - n_contexts=c.getNumContexts(); - n_positions=c.getNumContextPositions(); - - emit=new double [K][n_positions][n_words]; - pi=new double[n_phrases][K]; - - for(double [][]i:emit) - for(double []j:i) - arr.F.randomise(j, true); - - for(double []j:pi) - arr.F.randomise(j, true); - } - - void useThreadPool(ExecutorService pool) - { - this.pool = pool; - } - - public double EM(int phraseSizeLimit) - { - double [][][]exp_emit=new double [K][n_positions][n_words]; - double []exp_pi=new double[K]; - - for(double [][]i:exp_emit) - for(double []j:i) - Arrays.fill(j, 1e-10); - - double loglikelihood=0; - - //E - for(int phrase=0; phrase < n_phrases; phrase++) - { - if (phraseSizeLimit >= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) - continue; - - Arrays.fill(exp_pi, 1e-10); - - List contexts = c.getEdgesForPhrase(phrase); - - for (int ctx=0; ctx 0; - loglikelihood += edge.getCount() * Math.log(z); - arr.F.l1normalize(p); - - double count = edge.getCount(); - //increment expected count - TIntArrayList context = edge.getContext(); - for(int tag=0;tag= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) - { - //System.arraycopy(pi[phrase], 0, exp_pi[phrase], 0, K); - continue; - } - - Arrays.fill(exp_pi, 1e-10); - - // FIXME: add rare edge check to phrase objective & posterior processing - PhraseObjective po = new PhraseObjective(this, phrase, scalePT, (cacheLambda) ? lambdaPT[phrase] : null); - boolean ok = po.optimizeWithProjectedGradientDescent(); - if (!ok) ++failures; - if (cacheLambda) lambdaPT[phrase] = po.getParameters(); - iterations += po.getNumberUpdateCalls(); - double [][] q=po.posterior(); - loglikelihood += po.loglikelihood(); - kl += po.KL_divergence(); - l1lmax += po.l1lmax(); - primal += po.primal(scalePT); - List edges = c.getEdgesForPhrase(phrase); - - for(int edge=0;edge 0) - System.out.println("WARNING: failed to converge in " + failures + "/" + n_phrases + " cases"); - System.out.println("\tmean iters: " + iterations/(double)n_phrases + " elapsed time " + (end - start) / 1000.0); - System.out.println("\tllh: " + loglikelihood); - System.out.println("\tKL: " + kl); - System.out.println("\tphrase l1lmax: " + l1lmax); - - //M - for(double [][]i:exp_emit) - for(double []j:i) - arr.F.l1normalize(j); - emit=exp_emit; - - return primal; - } - - public double PREM_phrase_constraints_parallel(final double scalePT, int phraseSizeLimit) - { - assert(pool != null); - - final LinkedBlockingQueue expectations - = new LinkedBlockingQueue(); - - double [][][]exp_emit=new double [K][n_positions][n_words]; - double [][]exp_pi=new double[n_phrases][K]; - - for(double [][]i:exp_emit) - for(double []j:i) - Arrays.fill(j, 1e-10); - for(double []j:exp_pi) - Arrays.fill(j, 1e-10); - - double loglikelihood=0, kl=0, l1lmax=0, primal=0; - final AtomicInteger failures = new AtomicInteger(0); - final AtomicLong elapsed = new AtomicLong(0l); - int iterations=0; - long start = System.currentTimeMillis(); - List> results = new ArrayList>(); - - if (lambdaPT == null && cacheLambda) - lambdaPT = new double[n_phrases][]; - - //E - for(int phrase=0;phrase= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) { - System.arraycopy(pi[phrase], 0, exp_pi[phrase], 0, K); - continue; - } - - final int p=phrase; - results.add(pool.submit(new Callable() { - public PhraseObjective call() { - //System.out.println("" + Thread.currentThread().getId() + " optimising lambda for " + p); - long start = System.currentTimeMillis(); - PhraseObjective po = new PhraseObjective(PhraseCluster.this, p, scalePT, (cacheLambda) ? lambdaPT[p] : null); - boolean ok = po.optimizeWithProjectedGradientDescent(); - if (!ok) failures.incrementAndGet(); - long end = System.currentTimeMillis(); - elapsed.addAndGet(end - start); - //System.out.println("" + Thread.currentThread().getId() + " done optimising lambda for " + p); - return po; - } - })); - } - - // aggregate the expectations as they become available - for (Future fpo : results) - { - try { - //System.out.println("" + Thread.currentThread().getId() + " reading queue #" + count); - - // wait (blocking) until something is ready - PhraseObjective po = fpo.get(); - // process - int phrase = po.phrase; - if (cacheLambda) lambdaPT[phrase] = po.getParameters(); - //System.out.println("" + Thread.currentThread().getId() + " taken phrase " + phrase); - double [][] q=po.posterior(); - loglikelihood += po.loglikelihood(); - kl += po.KL_divergence(); - l1lmax += po.l1lmax(); - primal += po.primal(scalePT); - iterations += po.getNumberUpdateCalls(); - - List edges = c.getEdgesForPhrase(phrase); - for(int edge=0;edge 0) - System.out.println("WARNING: failed to converge in " + failures.get() + "/" + n_phrases + " cases"); - System.out.println("\tmean iters: " + iterations/(double)n_phrases + " walltime " + (end-start)/1000.0 + " threads " + elapsed.get() / 1000.0); - System.out.println("\tllh: " + loglikelihood); - System.out.println("\tKL: " + kl); - System.out.println("\tphrase l1lmax: " + l1lmax); - - //M - for(double [][]i:exp_emit) - for(double []j:i) - arr.F.l1normalize(j); - emit=exp_emit; - - for(double []j:exp_pi) - arr.F.l1normalize(j); - pi=exp_pi; - - return primal; - } - - public double PREM_phrase_context_constraints(double scalePT, double scaleCT) - { - double[][][] exp_emit = new double [K][n_positions][n_words]; - double[][] exp_pi = new double[n_phrases][K]; - - //E step - PhraseContextObjective pco = new PhraseContextObjective(this, lambdaPTCT, pool, scalePT, scaleCT); - boolean ok = pco.optimizeWithProjectedGradientDescent(); - if (cacheLambda) lambdaPTCT = pco.getParameters(); - - //now extract expectations - List edges = c.getEdges(); - for(int e = 0; e < edges.size(); ++e) - { - double [] q = pco.posterior(e); - Corpus.Edge edge = edges.get(e); - - TIntArrayList context = edge.getContext(); - double contextCnt = edge.getCount(); - //increment expected count - for(int tag=0;tag=0){ - prob=new double[K]; - prob[edge.getTag()]=1; - return prob; - } - - if (edge.getPhraseId() < n_phrases) - prob = Arrays.copyOf(pi[edge.getPhraseId()], K); - else - { - prob = new double[K]; - Arrays.fill(prob, 1.0); - } - - TIntArrayList ctx = edge.getContext(); - for(int tag=0;tag testing) - { - for (Edge edge : testing) - { - double probs[] = posterior(edge); - arr.F.l1normalize(probs); - - // emit phrase - ps.print(edge.getPhraseString()); - ps.print("\t"); - ps.print(edge.getContextString(true)); - int t=arr.F.argmax(probs); - ps.println(" ||| C=" + t + " T=" + edge.getCount() + " P=" + probs[t]); - //ps.println("# probs " + Arrays.toString(probs)); - } - } - - public void displayModelParam(PrintStream ps) - { - final double EPS = 1e-6; - ps.println("phrases " + n_phrases + " tags " + K + " positions " + n_positions); - - for (int i = 0; i < n_phrases; ++i) - for(int j=0;j EPS) - ps.println(i + " " + j + " " + pi[i][j]); - - ps.println(); - for (int i = 0; i < K; ++i) - { - for(int position=0;position EPS) - ps.println(i + " " + position + " " + word + " " + emit[i][position][word]); - } - } - } - } - - double phrase_l1lmax() - { - double sum=0; - for(int phrase=0; phrase