author | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400
---|---|---
committer | Chris Dyer <cdyer@cab.ark.cs.cmu.edu> | 2012-10-02 00:19:43 -0400
commit | e26434979adc33bd949566ba7bf02dff64e80a3e (patch) |
tree | d1c72495e3af6301bd28e7e66c42de0c7a944d1f | /gi/posterior-regularisation/prjava/src/phrase
parent | 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff) |
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/posterior-regularisation/prjava/src/phrase')
11 files changed, 0 insertions, 3008 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree.java b/gi/posterior-regularisation/prjava/src/phrase/Agree.java
deleted file mode 100644
index 8f7b499e..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/Agree.java
+++ /dev/null
@@ -1,204 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-
-import io.FileUtil;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.List;
-
-import phrase.Corpus.Edge;
-
-public class Agree {
- PhraseCluster model1;
- C2F model2;
- Corpus c;
- private int K,n_phrases, n_words, n_contexts, n_positions1,n_positions2;
-
- /**@brief sum of the log-likelihoods of the two
- * individual models
- */
- public double llh;
- /**@brief Bhattacharyya distance
- *
- */
- public double bdist;
- /**
- *
- * @param numCluster
- * @param corpus
- */
- public Agree(int numCluster, Corpus corpus){
-
- model1=new PhraseCluster(numCluster, corpus);
- model2=new C2F(numCluster,corpus);
- c=corpus;
- n_words=c.getNumWords();
- n_phrases=c.getNumPhrases();
- n_contexts=c.getNumContexts();
- n_positions1=c.getNumContextPositions();
- n_positions2=2;
- K=numCluster;
-
- }
-
- /**@brief test
- *
- */
- public static void main(String args[]){
- //String in="../pdata/canned.con";
- String in="../pdata/btec.con";
- String out="../pdata/posterior.out";
- int numCluster=25;
- Corpus corpus = null;
- File infile = new File(in);
- try {
- System.out.println("Reading concordance from " + infile);
- corpus = Corpus.readFromFile(FileUtil.reader(infile));
- corpus.printStats(System.out);
- } catch (IOException e) {
- System.err.println("Failed to open input file: " + infile);
- e.printStackTrace();
- System.exit(1);
- }
-
- Agree agree=new Agree(numCluster, corpus);
- int iter=20;
- for(int i=0;i<iter;i++){
- agree.EM();
- System.out.println("Iter"+i+", llh: "+agree.llh+
- ", divergence:"+agree.bdist+
- " sum: "+(agree.llh+agree.bdist));
- }
-
- File outfile = new File (out);
- try {
- PrintStream ps = FileUtil.printstream(outfile);
- agree.displayPosterior(ps);
- // ps.println();
- // c2f.displayModelParam(ps);
- ps.close();
- } catch (IOException e) {
- System.err.println("Failed to open output file: " + outfile);
- e.printStackTrace();
- System.exit(1);
- }
-
- }
-
- public double EM(){
-
- double [][][]exp_emit1=new double [K][n_positions1][n_words];
- double [][]exp_pi1=new double[n_phrases][K];
-
- double [][][]exp_emit2=new double [K][n_positions2][n_words];
- double [][]exp_pi2=new double[n_contexts][K];
-
- llh=0;
- bdist=0;
- //E
- for(int context=0; context< n_contexts; context++){
-
- List<Edge> contexts = c.getEdgesForContext(context);
-
- for (int ctx=0; ctx<contexts.size(); ctx++){
- Edge edge = contexts.get(ctx);
- int phrase=edge.getPhraseId();
- double p[]=posterior(edge);
- double z = arr.F.l1norm(p);
- assert z > 0;
- bdist += edge.getCount() * Math.log(z);
- arr.F.l1normalize(p);
-
- double count = edge.getCount();
- //increment expected count
- TIntArrayList phraseToks = edge.getPhrase();
- TIntArrayList contextToks = edge.getContext();
- for(int tag=0;tag<K;tag++){
-
- for(int position=0;position<n_positions1;position++){
- exp_emit1[tag][position][contextToks.get(position)]+=p[tag]*count;
- }
-
- exp_emit2[tag][0][phraseToks.get(0)]+=p[tag]*count;
- exp_emit2[tag][1][phraseToks.get(phraseToks.size()-1)]+=p[tag]*count;
-
- exp_pi1[phrase][tag]+=p[tag]*count;
- exp_pi2[context][tag]+=p[tag]*count;
- }
- }
- }
-
- //System.out.println("Log likelihood: "+loglikelihood);
-
- //M
- for(double [][]i:exp_emit1){
- for(double []j:i){
- arr.F.l1normalize(j);
- }
- }
-
- for(double []j:exp_pi1){
- arr.F.l1normalize(j);
- }
-
- for(double [][]i:exp_emit2){
- for(double []j:i){
- arr.F.l1normalize(j);
- }
- }
-
- for(double []j:exp_pi2){
- arr.F.l1normalize(j);
- }
-
- model1.emit=exp_emit1;
- model1.pi=exp_pi1;
- model2.emit=exp_emit2;
- model2.pi=exp_pi2;
-
- return llh;
- }
-
- public double[] posterior(Corpus.Edge edge)
- {
- double[] prob1=model1.posterior(edge);
- double[] prob2=model2.posterior(edge);
-
- llh+=edge.getCount()*Math.log(arr.F.l1norm(prob1));
- llh+=edge.getCount()*Math.log(arr.F.l1norm(prob2));
- arr.F.l1normalize(prob1);
- arr.F.l1normalize(prob2);
-
- for(int i=0;i<prob1.length;i++){
- prob1[i]*=prob2[i];
- prob1[i]=Math.sqrt(prob1[i]);
- }
-
- return prob1;
- }
-
- public void displayPosterior(PrintStream ps)
- {
- displayPosterior(ps, c.getEdges());
- }
-
- public void displayPosterior(PrintStream ps, List<Edge> test)
- {
- for (Edge edge : test)
- {
- double probs[] = posterior(edge);
- arr.F.l1normalize(probs);
-
- // emit phrase
- ps.print(edge.getPhraseString());
- ps.print("\t");
- ps.print(edge.getContextString(true));
- int t=arr.F.argmax(probs);
- ps.println(" ||| C=" + t);
- }
- }
-
-}
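The agreement model deleted above couples the two clusterings through posterior(): an elementwise product of the two models' posteriors followed by a square root (a Bhattacharyya-style geometric mean), which the caller then renormalizes. A minimal self-contained sketch of just that combination step, with hypothetical arrays standing in for the two models' unnormalized posteriors:

    import java.util.Arrays;

    public class AgreeSketch {
        // Mirrors the tail of Agree.posterior(): multiply the two posteriors
        // elementwise, take the square root, then L1-normalize (the original
        // leaves normalization to the caller).
        static double[] agree(double[] prob1, double[] prob2) {
            double[] q = new double[prob1.length];
            double z = 0;
            for (int i = 0; i < q.length; i++) {
                q[i] = Math.sqrt(prob1[i] * prob2[i]);
                z += q[i];
            }
            for (int i = 0; i < q.length; i++)
                q[i] /= z;
            return q;
        }

        public static void main(String[] args) {
            double[] p1 = {0.7, 0.2, 0.1}; // hypothetical posteriors over K=3 tags
            double[] p2 = {0.5, 0.4, 0.1};
            System.out.println(Arrays.toString(agree(p1, p2)));
        }
    }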
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java
deleted file mode 100644
index 031f887f..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java
+++ /dev/null
@@ -1,197 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-
-import io.FileUtil;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.List;
-
-import phrase.Corpus.Edge;
-
-public class Agree2Sides {
- PhraseCluster model1,model2;
- Corpus c1,c2;
- private int K;
-
- /**@brief sum of the log-likelihoods of the two
- * individual models
- */
- public double llh;
- /**@brief Bhattacharyya distance
- *
- */
- public double bdist;
- /**
- *
- * @param numCluster
- * @param corpus
- */
- public Agree2Sides(int numCluster, Corpus corpus1 , Corpus corpus2 ){
-
- model1=new PhraseCluster(numCluster, corpus1);
- model2=new PhraseCluster(numCluster,corpus2);
- c1=corpus1;
- c2=corpus2;
- K=numCluster;
-
- }
-
- /**@brief test
- *
- */
- public static void main(String args[]){
- //String in="../pdata/canned.con";
- // String in="../pdata/btec.con";
- String in1="../pdata/source.txt";
- String in2="../pdata/target.txt";
- String out="../pdata/posterior.out";
- int numCluster=25;
- Corpus corpus1 = null,corpus2=null;
- File infile1 = new File(in1),infile2=new File(in2);
- try {
- System.out.println("Reading concordance from " + infile1);
- corpus1 = Corpus.readFromFile(FileUtil.reader(infile1));
- System.out.println("Reading concordance from " + infile2);
- corpus2 = Corpus.readFromFile(FileUtil.reader(infile2));
- corpus1.printStats(System.out);
- } catch (IOException e) {
- System.err.println("Failed to open input file: " + infile1);
- e.printStackTrace();
- System.exit(1);
- }
-
- Agree2Sides agree=new Agree2Sides(numCluster, corpus1,corpus2);
- int iter=20;
- for(int i=0;i<iter;i++){
- agree.EM();
- System.out.println("Iter"+i+", llh: "+agree.llh+
- ", divergence:"+agree.bdist+
- " sum: "+(agree.llh+agree.bdist));
- }
-
- File outfile = new File (out);
- try {
- PrintStream ps = FileUtil.printstream(outfile);
- agree.displayPosterior(ps);
- // ps.println();
- // c2f.displayModelParam(ps);
- ps.close();
- } catch (IOException e) {
- System.err.println("Failed to open output file: " + outfile);
- e.printStackTrace();
- System.exit(1);
- }
-
- }
-
- public double EM(){
-
- double [][][]exp_emit1=new double [K][c1.getNumContextPositions()][c1.getNumWords()];
- double [][]exp_pi1=new double[c1.getNumPhrases()][K];
-
- double [][][]exp_emit2=new double [K][c2.getNumContextPositions()][c2.getNumWords()];
- double [][]exp_pi2=new double[c2.getNumPhrases()][K];
-
- llh=0;
- bdist=0;
- //E
- for(int i=0;i<c1.getEdges().size();i++){
- Edge edge1=c1.getEdges().get(i);
- Edge edge2=c2.getEdges().get(i);
- double p[]=posterior(i);
- double z = arr.F.l1norm(p);
- assert z > 0;
- bdist += edge1.getCount() * Math.log(z);
- arr.F.l1normalize(p);
- double count = edge1.getCount();
- //increment expected count
- TIntArrayList contextToks1 = edge1.getContext();
- TIntArrayList contextToks2 = edge2.getContext();
- int phrase1=edge1.getPhraseId();
- int phrase2=edge2.getPhraseId();
- for(int tag=0;tag<K;tag++){
- for(int position=0;position<c1.getNumContextPositions();position++){
- exp_emit1[tag][position][contextToks1.get(position)]+=p[tag]*count;
- }
- for(int position=0;position<c2.getNumContextPositions();position++){
- exp_emit2[tag][position][contextToks2.get(position)]+=p[tag]*count;
- }
- exp_pi1[phrase1][tag]+=p[tag]*count;
- exp_pi2[phrase2][tag]+=p[tag]*count;
- }
- }
-
- //System.out.println("Log likelihood: "+loglikelihood);
-
- //M
- for(double [][]i:exp_emit1){
- for(double []j:i){
- arr.F.l1normalize(j);
- }
- }
-
- for(double []j:exp_pi1){
- arr.F.l1normalize(j);
- }
-
- for(double [][]i:exp_emit2){
- for(double []j:i){
- arr.F.l1normalize(j);
- }
- }
-
- for(double []j:exp_pi2){
- arr.F.l1normalize(j);
- }
-
- model1.emit=exp_emit1;
- model1.pi=exp_pi1;
- model2.emit=exp_emit2;
- model2.pi=exp_pi2;
-
- return llh;
- }
-
- public double[] posterior(int edgeIdx)
- {
- return posterior(c1.getEdges().get(edgeIdx), c2.getEdges().get(edgeIdx));
- }
-
- public double[] posterior(Edge e1, Edge e2)
- {
- double[] prob1=model1.posterior(e1);
- double[] prob2=model2.posterior(e2);
-
- llh+=e1.getCount()*Math.log(arr.F.l1norm(prob1));
- llh+=e2.getCount()*Math.log(arr.F.l1norm(prob2));
- arr.F.l1normalize(prob1);
- arr.F.l1normalize(prob2);
-
- for(int i=0;i<prob1.length;i++){
- prob1[i]*=prob2[i];
- prob1[i]=Math.sqrt(prob1[i]);
- }
-
- return prob1;
- }
-
- public void displayPosterior(PrintStream ps)
- {
- for (int i=0;i<c1.getEdges().size();i++)
- {
- Edge edge=c1.getEdges().get(i);
- double probs[] = posterior(i);
- arr.F.l1normalize(probs);
-
- // emit phrase
- ps.print(edge.getPhraseString());
- ps.print("\t");
- ps.print(edge.getContextString(true));
- int t=arr.F.argmax(probs);
- ps.println(" ||| C=" + t);
- }
- }
-}
diff --git a/gi/posterior-regularisation/prjava/src/phrase/C2F.java b/gi/posterior-regularisation/prjava/src/phrase/C2F.java
deleted file mode 100644
index e8783950..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/C2F.java
+++ /dev/null
@@ -1,216 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-
-import io.FileUtil;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.Arrays;
-import java.util.List;
-
-import phrase.Corpus.Edge;
-
-/**
- * @brief context generates phrase
- * @author desaic
- *
- */
-public class C2F {
- public int K;
- private int n_words, n_contexts, n_positions;
- public Corpus c;
-
- /**@brief
- * emit[tag][position][word] = p(word | tag, position in phrase)
- */
- public double emit[][][];
- /**@brief
- * pi[context][tag] = p(tag | context)
- */
- public double pi[][];
-
- public C2F(int numCluster, Corpus corpus){
- K=numCluster;
- c=corpus;
- n_words=c.getNumWords();
- n_contexts=c.getNumContexts();
-
- //number of words in a phrase to be considered:
- //currently the first and last word in source and target.
- //if the phrase has length 1 in either dimension then
- //we use the same word for both positions
- n_positions=c.phraseEdges(c.getEdges().get(0).getPhrase()).size();
-
- emit=new double [K][n_positions][n_words];
- pi=new double[n_contexts][K];
-
- for(double [][]i:emit){
- for(double []j:i){
- arr.F.randomise(j);
- }
- }
-
- for(double []j:pi){
- arr.F.randomise(j);
- }
- }
-
- /**@brief test
- *
- */
- public static void main(String args[]){
- String in="../pdata/canned.con";
- String out="../pdata/posterior.out";
- int numCluster=25;
- Corpus corpus = null;
- File infile = new File(in);
- try {
- System.out.println("Reading concordance from " + infile);
- corpus = Corpus.readFromFile(FileUtil.reader(infile));
- corpus.printStats(System.out);
- } catch (IOException e) {
- System.err.println("Failed to open input file: " + infile);
- e.printStackTrace();
- System.exit(1);
- }
-
- C2F c2f=new C2F(numCluster,corpus);
- int iter=20;
- double llh=0;
- for(int i=0;i<iter;i++){
- llh=c2f.EM();
- System.out.println("Iter"+i+", llh: "+llh);
- }
-
- File outfile = new File (out);
- try {
- PrintStream ps = FileUtil.printstream(outfile);
- c2f.displayPosterior(ps);
- // ps.println();
- // c2f.displayModelParam(ps);
- ps.close();
- } catch (IOException e) {
- System.err.println("Failed to open output file: " + outfile);
- e.printStackTrace();
- System.exit(1);
- }
-
- }
-
- public double EM(){
- double [][][]exp_emit=new double [K][n_positions][n_words];
- double [][]exp_pi=new double[n_contexts][K];
-
- double loglikelihood=0;
-
- //E
- for(int context=0; context< n_contexts; context++){
-
- List<Edge> contexts = c.getEdgesForContext(context);
-
- for (int ctx=0; ctx<contexts.size(); ctx++){
- Edge edge = contexts.get(ctx);
- double p[]=posterior(edge);
- double z = arr.F.l1norm(p);
- assert z > 0;
- loglikelihood += edge.getCount() * Math.log(z);
- arr.F.l1normalize(p);
-
- double count = edge.getCount();
- //increment expected count
- TIntArrayList phrase= edge.getPhrase();
- for(int tag=0;tag<K;tag++){
-
- exp_emit[tag][0][phrase.get(0)]+=p[tag]*count;
- exp_emit[tag][1][phrase.get(phrase.size()-1)]+=p[tag]*count;
-
- exp_pi[context][tag]+=p[tag]*count;
- }
- }
- }
-
- //System.out.println("Log likelihood: "+loglikelihood);
-
- //M
- for(double [][]i:exp_emit){
- for(double []j:i){
- arr.F.l1normalize(j);
- }
- }
-
- emit=exp_emit;
-
- for(double []j:exp_pi){
- arr.F.l1normalize(j);
- }
-
- pi=exp_pi;
-
- return loglikelihood;
- }
-
- public double[] posterior(Corpus.Edge edge)
- {
- double[] prob=Arrays.copyOf(pi[edge.getContextId()], K);
-
- TIntArrayList phrase = edge.getPhrase();
- TIntArrayList offsets = c.phraseEdges(phrase);
- for(int tag=0;tag<K;tag++)
- {
- for (int i=0; i < offsets.size(); ++i)
- prob[tag]*=emit[tag][i][phrase.get(offsets.get(i))];
- }
-
- return prob;
- }
-
- public void displayPosterior(PrintStream ps)
- {
- for (Edge edge : c.getEdges())
- {
- double probs[] = posterior(edge);
- arr.F.l1normalize(probs);
-
- // emit phrase
- ps.print(edge.getPhraseString());
- ps.print("\t");
- ps.print(edge.getContextString(true));
- int t=arr.F.argmax(probs);
- ps.println(" ||| C=" + t);
- }
- }
-
- public void displayModelParam(PrintStream ps)
- {
- final double EPS = 1e-6;
-
- ps.println("P(tag|context)");
- for (int i = 0; i < n_contexts; ++i)
- {
- ps.print(c.getContext(i));
- for(int j=0;j<pi[i].length;j++){
- if (pi[i][j] > EPS)
- ps.print("\t" + j + ": " + pi[i][j]);
- }
- ps.println();
- }
-
- ps.println("P(word|tag,position)");
- for (int i = 0; i < K; ++i)
- {
- for(int position=0;position<n_positions;position++){
- ps.println("tag " + i + " position " + position);
- for(int word=0;word<emit[i][position].length;word++){
- if (emit[i][position][word] > EPS)
- ps.print(c.getWord(word)+"="+emit[i][position][word]+"\t");
- }
- ps.println();
- }
- ps.println();
- }
-
- }
-
-}
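C2F reverses the usual direction of the clustering: the context chooses a tag, and the tag emits only the phrase's boundary words (emit positions 0 and 1). A small sketch of the scoring inside posterior(), with hypothetical parameter values; the real code derives the scored positions via Corpus.phraseEdges:

    public class C2FScoreSketch {
        // Unnormalized p(tag | context, phrase) as in C2F.posterior():
        // start from pi[context] and multiply in the boundary-word emissions.
        static double[] score(double[] piContext, double[][][] emit,
                              int firstWord, int lastWord) {
            double[] prob = piContext.clone();
            for (int tag = 0; tag < prob.length; tag++)
                prob[tag] *= emit[tag][0][firstWord] * emit[tag][1][lastWord];
            return prob; // caller normalizes, as in the original
        }

        public static void main(String[] args) {
            double[] pi = {0.6, 0.4};             // K=2 tags, hypothetical values
            double[][][] emit = {                 // emit[tag][position][word], 3 word types
                {{0.5, 0.3, 0.2}, {0.1, 0.8, 0.1}},
                {{0.2, 0.2, 0.6}, {0.3, 0.3, 0.4}},
            };
            System.out.println(java.util.Arrays.toString(score(pi, emit, 0, 1)));
        }
    }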
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
deleted file mode 100644
index 4b1939cd..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ /dev/null
@@ -1,288 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-
-import java.io.*;
-import java.util.*;
-import java.util.regex.Pattern;
-
-
-public class Corpus
-{
- private Lexicon<String> wordLexicon = new Lexicon<String>();
- private Lexicon<TIntArrayList> phraseLexicon = new Lexicon<TIntArrayList>();
- private Lexicon<TIntArrayList> contextLexicon = new Lexicon<TIntArrayList>();
- private List<Edge> edges = new ArrayList<Edge>();
- private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>();
- private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>();
- public int splitSentinel;
- public int phraseSentinel;
- public int rareSentinel;
-
- public Corpus()
- {
- splitSentinel = wordLexicon.insert("<SPLIT>");
- phraseSentinel = wordLexicon.insert("<PHRASE>");
- rareSentinel = wordLexicon.insert("<RARE>");
- }
-
- public class Edge
- {
-
- Edge(int phraseId, int contextId, double count,int tag)
- {
- this.phraseId = phraseId;
- this.contextId = contextId;
- this.count = count;
- fixTag=tag;
- }
-
- Edge(int phraseId, int contextId, double count)
- {
- this.phraseId = phraseId;
- this.contextId = contextId;
- this.count = count;
- fixTag=-1;
- }
- public int getTag(){
- return fixTag;
- }
-
- public int getPhraseId()
- {
- return phraseId;
- }
- public TIntArrayList getPhrase()
- {
- return Corpus.this.getPhrase(phraseId);
- }
- public String getPhraseString()
- {
- return Corpus.this.getPhraseString(phraseId);
- }
- public int getContextId()
- {
- return contextId;
- }
- public TIntArrayList getContext()
- {
- return Corpus.this.getContext(contextId);
- }
- public String getContextString(boolean insertPhraseSentinel)
- {
- return Corpus.this.getContextString(contextId, insertPhraseSentinel);
- }
- public double getCount()
- {
- return count;
- }
- public boolean equals(Object other)
- {
- if (other instanceof Edge)
- {
- Edge oe = (Edge) other;
- return oe.phraseId == phraseId && oe.contextId == contextId;
- }
- else return false;
- }
- public int hashCode()
- { // this is how boost's hash_combine does it
- int seed = phraseId;
- seed ^= contextId + 0x9e3779b9 + (seed << 6) + (seed >> 2);
- return seed;
- }
- public String toString()
- {
- return getPhraseString() + "\t" + getContextString(true);
- }
-
- private int phraseId;
- private int contextId;
- private double count;
- private int fixTag;
- }
-
- List<Edge> getEdges()
- {
- return edges;
- }
-
- int getNumEdges()
- {
- return edges.size();
- }
-
- int getNumPhrases()
- {
- return phraseLexicon.size();
- }
-
- int getNumContextPositions()
- {
- return contextLexicon.lookup(0).size();
- }
-
- List<Edge> getEdgesForPhrase(int phraseId)
- {
- return phraseToContext.get(phraseId);
- }
-
- int getNumContexts()
- {
- return contextLexicon.size();
- }
-
- List<Edge> getEdgesForContext(int contextId)
- {
- return contextToPhrase.get(contextId);
- }
-
- int getNumWords()
- {
- return wordLexicon.size();
- }
-
- String getWord(int wordId)
- {
- return wordLexicon.lookup(wordId);
- }
-
- public TIntArrayList getPhrase(int phraseId)
- {
- return phraseLexicon.lookup(phraseId);
- }
-
- public String getPhraseString(int phraseId)
- {
- StringBuffer b = new StringBuffer();
- for (int tid: getPhrase(phraseId).toNativeArray())
- {
- if (b.length() > 0)
- b.append(" ");
- b.append(wordLexicon.lookup(tid));
- }
- return b.toString();
- }
-
- public TIntArrayList getContext(int contextId)
- {
- return contextLexicon.lookup(contextId);
- }
-
- public String getContextString(int contextId, boolean insertPhraseSentinel)
- {
- StringBuffer b = new StringBuffer();
- TIntArrayList c = getContext(contextId);
- for (int i = 0; i < c.size(); ++i)
- {
- if (i > 0) b.append(" ");
- //if (i == c.size() / 2) b.append("<PHRASE> ");
- b.append(wordLexicon.lookup(c.get(i)));
- }
- return b.toString();
- }
-
- public boolean isSentinel(int wordId)
- {
- return wordId == splitSentinel || wordId == phraseSentinel;
- }
-
- List<Edge> readEdges(Reader in) throws IOException
- {
- // read in line-by-line
- BufferedReader bin = new BufferedReader(in);
- String line;
- Pattern separator = Pattern.compile(" \\|\\|\\| ");
-
- List<Edge> edges = new ArrayList<Edge>();
- while ((line = bin.readLine()) != null)
- {
- // split into phrase and contexts
- StringTokenizer st = new StringTokenizer(line, "\t");
- assert (st.hasMoreTokens());
- String phraseToks = st.nextToken();
- assert (st.hasMoreTokens());
- String rest = st.nextToken();
- assert (!st.hasMoreTokens());
-
- // process phrase
- st = new StringTokenizer(phraseToks, " ");
- TIntArrayList ptoks = new TIntArrayList();
- while (st.hasMoreTokens())
- ptoks.add(wordLexicon.insert(st.nextToken()));
- int phraseId = phraseLexicon.insert(ptoks);
-
- // process contexts
- String[] parts = separator.split(rest);
- assert (parts.length % 2 == 0);
- for (int i = 0; i < parts.length; i += 2)
- {
- // process pairs of strings - context and count
- String ctxString = parts[i];
- String countString = parts[i + 1];
-
- assert (countString.startsWith("C="));
-
- String []countToks=countString.split(" ");
-
- double count = Double.parseDouble(countToks[0].substring(2).trim());
-
- TIntArrayList ctx = new TIntArrayList();
- StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
- while (ctxStrtok.hasMoreTokens())
- {
- String token = ctxStrtok.nextToken();
- ctx.add(wordLexicon.insert(token));
- }
- int contextId = contextLexicon.insert(ctx);
-
-
- if(countToks.length<2){
- edges.add(new Edge(phraseId, contextId, count));
- }
- else{
- int tag=Integer.parseInt(countToks[1].substring(2));
- edges.add(new Edge(phraseId, contextId, count,tag));
- }
- }
- }
- return edges;
- }
-
- static Corpus readFromFile(Reader in) throws IOException
- {
- Corpus c = new Corpus();
- c.edges = c.readEdges(in);
- for (Edge edge: c.edges)
- {
- while (edge.getPhraseId() >= c.phraseToContext.size())
- c.phraseToContext.add(new ArrayList<Edge>());
- while (edge.getContextId() >= c.contextToPhrase.size())
- c.contextToPhrase.add(new ArrayList<Edge>());
-
- // index the edge for fast phrase, context lookup
- c.phraseToContext.get(edge.getPhraseId()).add(edge);
- c.contextToPhrase.get(edge.getContextId()).add(edge);
- }
- return c;
- }
-
- TIntArrayList phraseEdges(TIntArrayList phrase)
- {
- TIntArrayList r = new TIntArrayList(4);
- for (int p = 0; p < phrase.size(); ++p)
- {
- if (p == 0 || phrase.get(p-1) == splitSentinel)
- r.add(p);
- if (p == phrase.size() - 1 || phrase.get(p+1) == splitSentinel)
- r.add(p);
- }
- return r;
- }
-
- public void printStats(PrintStream out)
- {
- out.println("Corpus has " + edges.size() + " edges " + phraseLexicon.size() + " phrases "
- + contextLexicon.size() + " contexts and " + wordLexicon.size() + " word types");
- }
-}
\ No newline at end of file
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Lexicon.java b/gi/posterior-regularisation/prjava/src/phrase/Lexicon.java
deleted file mode 100644
index a386e4a3..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/Lexicon.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package phrase;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-public class Lexicon<T>
-{
- public int insert(T word)
- {
- Integer i = wordToIndex.get(word);
- if (i == null)
- {
- i = indexToWord.size();
- wordToIndex.put(word, i);
- indexToWord.add(word);
- }
- return i;
- }
-
- public T lookup(int index)
- {
- return indexToWord.get(index);
- }
-
- public int size()
- {
- return indexToWord.size();
- }
-
- private Map<T, Integer> wordToIndex = new HashMap<T, Integer>();
- private List<T> indexToWord = new ArrayList<T>();
-}
\ No newline at end of file
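Lexicon is a minimal interning map between items and dense integer ids. A usage sketch against the class above:

    Lexicon<String> lex = new Lexicon<String>();
    int cat = lex.insert("cat");          // 0: a first insert allocates the next id
    int dog = lex.insert("dog");          // 1
    int again = lex.insert("cat");        // 0: an existing item's id is reused
    assert again == cat && lex.size() == 2;
    assert lex.lookup(dog).equals("dog"); // lookup inverts insert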
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java
deleted file mode 100644
index c032bb2b..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCluster.java
+++ /dev/null
@@ -1,540 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-import org.apache.commons.math.special.Gamma;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.regex.Pattern;
-
-import phrase.Corpus.Edge;
-
-
-public class PhraseCluster {
-
- public int K;
- private int n_phrases, n_words, n_contexts, n_positions;
- public Corpus c;
- public ExecutorService pool;
-
- double[] lambdaPTCT;
- double[][] lambdaPT;
- boolean cacheLambda = true;
-
- // emit[tag][position][word] = p(word | tag, position in context)
- double emit[][][];
- // pi[phrase][tag] = p(tag | phrase)
- double pi[][];
-
- public PhraseCluster(int numCluster, Corpus corpus)
- {
- K=numCluster;
- c=corpus;
- n_words=c.getNumWords();
- n_phrases=c.getNumPhrases();
- n_contexts=c.getNumContexts();
- n_positions=c.getNumContextPositions();
-
- emit=new double [K][n_positions][n_words];
- pi=new double[n_phrases][K];
-
- for(double [][]i:emit)
- for(double []j:i)
- arr.F.randomise(j, true);
-
- for(double []j:pi)
- arr.F.randomise(j, true);
- }
-
- void useThreadPool(ExecutorService pool)
- {
- this.pool = pool;
- }
-
- public double EM(int phraseSizeLimit)
- {
- double [][][]exp_emit=new double [K][n_positions][n_words];
- double []exp_pi=new double[K];
-
- for(double [][]i:exp_emit)
- for(double []j:i)
- Arrays.fill(j, 1e-10);
-
- double loglikelihood=0;
-
- //E
- for(int phrase=0; phrase < n_phrases; phrase++)
- {
- if (phraseSizeLimit >= 1 && c.getPhrase(phrase).size() > phraseSizeLimit)
- continue;
-
- Arrays.fill(exp_pi, 1e-10);
-
- List<Edge> contexts = c.getEdgesForPhrase(phrase);
-
- for (int ctx=0; ctx<contexts.size(); ctx++)
- {
- Edge edge = contexts.get(ctx);
-
- double p[]=posterior(edge);
- double z = arr.F.l1norm(p);
- assert z > 0;
- loglikelihood += edge.getCount() * Math.log(z);
- arr.F.l1normalize(p);
-
- double count = edge.getCount();
- //increment expected count
- TIntArrayList context = edge.getContext();
- for(int tag=0;tag<K;tag++)
- {
- for(int pos=0;pos<n_positions;pos++){
- exp_emit[tag][pos][context.get(pos)]+=p[tag]*count;
- }
- exp_pi[tag]+=p[tag]*count;
- }
- }
- arr.F.l1normalize(exp_pi);
- System.arraycopy(exp_pi, 0, pi[phrase], 0, K);
- }
-
- //M
- for(double [][]i:exp_emit)
- for(double []j:i)
- arr.F.l1normalize(j);
-
- emit=exp_emit;
-
- return loglikelihood;
- }
-
- public double PREM(double scalePT, double scaleCT, int phraseSizeLimit)
- {
- if (scaleCT == 0)
- {
- if (pool != null)
- return PREM_phrase_constraints_parallel(scalePT, phraseSizeLimit);
- else
- return PREM_phrase_constraints(scalePT, phraseSizeLimit);
- }
- else // FIXME: ignores phraseSizeLimit
- return this.PREM_phrase_context_constraints(scalePT, scaleCT);
- }
-
-
- public double PREM_phrase_constraints(double scalePT, int phraseSizeLimit)
- {
- double [][][]exp_emit=new double[K][n_positions][n_words];
- double []exp_pi=new double[K];
-
- for(double [][]i:exp_emit)
- for(double []j:i)
- Arrays.fill(j, 1e-10);
-
- if (lambdaPT == null && cacheLambda)
- lambdaPT = new double[n_phrases][];
-
- double loglikelihood=0, kl=0, l1lmax=0, primal=0;
- int failures=0, iterations=0;
- long start = System.currentTimeMillis();
- //E
- for(int phrase=0; phrase<n_phrases; phrase++)
- {
- if (phraseSizeLimit >= 1 && c.getPhrase(phrase).size() > phraseSizeLimit)
- {
- //System.arraycopy(pi[phrase], 0, exp_pi[phrase], 0, K);
- continue;
- }
-
- Arrays.fill(exp_pi, 1e-10);
-
- // FIXME: add rare edge check to phrase objective & posterior processing
- PhraseObjective po = new PhraseObjective(this, phrase, scalePT, (cacheLambda) ? lambdaPT[phrase] : null);
- boolean ok = po.optimizeWithProjectedGradientDescent();
- if (!ok) ++failures;
- if (cacheLambda) lambdaPT[phrase] = po.getParameters();
- iterations += po.getNumberUpdateCalls();
- double [][] q=po.posterior();
- loglikelihood += po.loglikelihood();
- kl += po.KL_divergence();
- l1lmax += po.l1lmax();
- primal += po.primal(scalePT);
- List<Edge> edges = c.getEdgesForPhrase(phrase);
-
- for(int edge=0;edge<q.length;edge++){
- Edge e = edges.get(edge);
- TIntArrayList context = e.getContext();
- double contextCnt = e.getCount();
- //increment expected count
- for(int tag=0;tag<K;tag++){
- for(int pos=0;pos<n_positions;pos++){
- exp_emit[tag][pos][context.get(pos)]+=q[edge][tag]*contextCnt;
- }
-
- exp_pi[tag]+=q[edge][tag]*contextCnt;
-
- }
- }
- arr.F.l1normalize(exp_pi);
- System.arraycopy(exp_pi, 0, pi[phrase], 0, K);
- }
-
- long end = System.currentTimeMillis();
- if (failures > 0)
- System.out.println("WARNING: failed to converge in " + failures + "/" + n_phrases + " cases");
- System.out.println("\tmean iters: " + iterations/(double)n_phrases + " elapsed time " + (end - start) / 1000.0);
- System.out.println("\tllh: " + loglikelihood);
- System.out.println("\tKL: " + kl);
- System.out.println("\tphrase l1lmax: " + l1lmax);
-
- //M
- for(double [][]i:exp_emit)
- for(double []j:i)
- arr.F.l1normalize(j);
- emit=exp_emit;
-
- return primal;
- }
-
- public double PREM_phrase_constraints_parallel(final double scalePT, int phraseSizeLimit)
- {
- assert(pool != null);
-
- final LinkedBlockingQueue<PhraseObjective> expectations
- = new LinkedBlockingQueue<PhraseObjective>();
-
- double [][][]exp_emit=new double [K][n_positions][n_words];
- double [][]exp_pi=new double[n_phrases][K];
-
- for(double [][]i:exp_emit)
- for(double []j:i)
- Arrays.fill(j, 1e-10);
- for(double []j:exp_pi)
- Arrays.fill(j, 1e-10);
-
- double loglikelihood=0, kl=0, l1lmax=0, primal=0;
- final AtomicInteger failures = new AtomicInteger(0);
- final AtomicLong elapsed = new AtomicLong(0l);
- int iterations=0;
- long start = System.currentTimeMillis();
- List<Future<PhraseObjective>> results = new ArrayList<Future<PhraseObjective>>();
-
- if (lambdaPT == null && cacheLambda)
- lambdaPT = new double[n_phrases][];
-
- //E
- for(int phrase=0;phrase<n_phrases;phrase++) {
- if (phraseSizeLimit >= 1 && c.getPhrase(phrase).size() > phraseSizeLimit) {
- System.arraycopy(pi[phrase], 0, exp_pi[phrase], 0, K);
- continue;
- }
-
- final int p=phrase;
- results.add(pool.submit(new Callable<PhraseObjective>() {
- public PhraseObjective call() {
- //System.out.println("" + Thread.currentThread().getId() + " optimising lambda for " + p);
- long start = System.currentTimeMillis();
- PhraseObjective po = new PhraseObjective(PhraseCluster.this, p, scalePT, (cacheLambda) ? lambdaPT[p] : null);
- boolean ok = po.optimizeWithProjectedGradientDescent();
- if (!ok) failures.incrementAndGet();
- long end = System.currentTimeMillis();
- elapsed.addAndGet(end - start);
- //System.out.println("" + Thread.currentThread().getId() + " done optimising lambda for " + p);
- return po;
- }
- }));
- }
-
- // aggregate the expectations as they become available
- for (Future<PhraseObjective> fpo : results)
- {
- try {
- //System.out.println("" + Thread.currentThread().getId() + " reading queue #" + count);
-
- // wait (blocking) until something is ready
- PhraseObjective po = fpo.get();
- // process
- int phrase = po.phrase;
- if (cacheLambda) lambdaPT[phrase] = po.getParameters();
- //System.out.println("" + Thread.currentThread().getId() + " taken phrase " + phrase);
- double [][] q=po.posterior();
- loglikelihood += po.loglikelihood();
- kl += po.KL_divergence();
- l1lmax += po.l1lmax();
- primal += po.primal(scalePT);
- iterations += po.getNumberUpdateCalls();
-
- List<Edge> edges = c.getEdgesForPhrase(phrase);
- for(int edge=0;edge<q.length;edge++){
- Edge e = edges.get(edge);
- TIntArrayList context = e.getContext();
- double contextCnt = e.getCount();
- //increment expected count
- for(int tag=0;tag<K;tag++){
- for(int pos=0;pos<n_positions;pos++){
- exp_emit[tag][pos][context.get(pos)]+=q[edge][tag]*contextCnt;
- }
- exp_pi[phrase][tag]+=q[edge][tag]*contextCnt;
- }
- }
- } catch (InterruptedException e) {
- System.err.println("M-step thread interrupted. Probably fatal!");
- throw new RuntimeException(e);
- } catch (ExecutionException e) {
- System.err.println("M-step thread execution died. Probably fatal!");
- throw new RuntimeException(e);
- }
- }
-
- long end = System.currentTimeMillis();
-
- if (failures.get() > 0)
- System.out.println("WARNING: failed to converge in " + failures.get() + "/" + n_phrases + " cases");
- System.out.println("\tmean iters: " + iterations/(double)n_phrases + " walltime " + (end-start)/1000.0 + " threads " + elapsed.get() / 1000.0);
- System.out.println("\tllh: " + loglikelihood);
- System.out.println("\tKL: " + kl);
- System.out.println("\tphrase l1lmax: " + l1lmax);
-
- //M
- for(double [][]i:exp_emit)
- for(double []j:i)
- arr.F.l1normalize(j);
- emit=exp_emit;
-
- for(double []j:exp_pi)
- arr.F.l1normalize(j);
- pi=exp_pi;
-
- return primal;
- }
-
- public double PREM_phrase_context_constraints(double scalePT, double scaleCT)
- {
- double[][][] exp_emit = new double [K][n_positions][n_words];
- double[][] exp_pi = new double[n_phrases][K];
-
- //E step
- PhraseContextObjective pco = new PhraseContextObjective(this, lambdaPTCT, pool, scalePT, scaleCT);
- boolean ok = pco.optimizeWithProjectedGradientDescent();
- if (cacheLambda) lambdaPTCT = pco.getParameters();
-
- //now extract expectations
- List<Corpus.Edge> edges = c.getEdges();
- for(int e = 0; e < edges.size(); ++e)
- {
- double [] q = pco.posterior(e);
- Corpus.Edge edge = edges.get(e);
-
- TIntArrayList context = edge.getContext();
- double contextCnt = edge.getCount();
- //increment expected count
- for(int tag=0;tag<K;tag++)
- {
- for(int pos=0;pos<n_positions;pos++)
- exp_emit[tag][pos][context.get(pos)]+=q[tag]*contextCnt;
- exp_pi[edge.getPhraseId()][tag]+=q[tag]*contextCnt;
- }
- }
-
- System.out.println("\tllh: " + pco.loglikelihood());
- System.out.println("\tKL: " + pco.KL_divergence());
- System.out.println("\tphrase l1lmax: " + pco.phrase_l1lmax());
- System.out.println("\tcontext l1lmax: " + pco.context_l1lmax());
-
- //M step
- for(double [][]i:exp_emit)
- for(double []j:i)
- arr.F.l1normalize(j);
- emit=exp_emit;
-
- for(double []j:exp_pi)
- arr.F.l1normalize(j);
- pi=exp_pi;
-
- return pco.primal();
- }
-
- /**
- * @param phrase index of phrase
- * @param ctx array of context
- * @return unnormalized posterior
- */
- public double[] posterior(Corpus.Edge edge)
- {
- double[] prob;
-
- if(edge.getTag()>=0){
- prob=new double[K];
- prob[edge.getTag()]=1;
- return prob;
- }
-
- if (edge.getPhraseId() < n_phrases)
- prob = Arrays.copyOf(pi[edge.getPhraseId()], K);
- else
- {
- prob = new double[K];
- Arrays.fill(prob, 1.0);
- }
-
- TIntArrayList ctx = edge.getContext();
- for(int tag=0;tag<K;tag++)
- {
- for(int c=0;c<n_positions;c++)
- {
- int word = ctx.get(c);
- if (!this.c.isSentinel(word) && word < n_words)
- prob[tag]*=emit[tag][c][word];
- }
- }
-
- return prob;
- }
-
- public void displayPosterior(PrintStream ps, List<Edge> testing)
- {
- for (Edge edge : testing)
- {
- double probs[] = posterior(edge);
- arr.F.l1normalize(probs);
-
- // emit phrase
- ps.print(edge.getPhraseString());
- ps.print("\t");
- ps.print(edge.getContextString(true));
- int t=arr.F.argmax(probs);
- ps.println(" ||| C=" + t + " T=" + edge.getCount() + " P=" + probs[t]);
- //ps.println("# probs " + Arrays.toString(probs));
- }
- }
-
- public void displayModelParam(PrintStream ps)
- {
- final double EPS = 1e-6;
- ps.println("phrases " + n_phrases + " tags " + K + " positions " + n_positions);
-
- for (int i = 0; i < n_phrases; ++i)
- for(int j=0;j<pi[i].length;j++)
- if (pi[i][j] > EPS)
- ps.println(i + " " + j + " " + pi[i][j]);
-
- ps.println();
- for (int i = 0; i < K; ++i)
- {
- for(int position=0;position<n_positions;position++)
- {
- for(int word=0;word<emit[i][position].length;word++)
- {
- if (emit[i][position][word] > EPS)
- ps.println(i + " " + position + " " + word + " " + emit[i][position][word]);
- }
- }
- }
- }
-
- double phrase_l1lmax()
- {
- double sum=0;
- for(int phrase=0; phrase<n_phrases; phrase++)
- {
- double [] maxes = new double[K];
- for (Edge edge : c.getEdgesForPhrase(phrase))
- {
- double p[] = posterior(edge);
- arr.F.l1normalize(p);
- for(int tag=0;tag<K;tag++)
- maxes[tag] = Math.max(maxes[tag], p[tag]);
- }
- for(int tag=0;tag<K;tag++)
- sum += maxes[tag];
- }
- return sum;
- }
-
- double context_l1lmax()
- {
- double sum=0;
- for(int context=0; context<n_contexts; context++)
- {
- double [] maxes = new double[K];
- for (Edge edge : c.getEdgesForContext(context))
- {
- double p[] = posterior(edge);
- arr.F.l1normalize(p);
- for(int tag=0;tag<K;tag++)
- maxes[tag] = Math.max(maxes[tag], p[tag]);
- }
- for(int tag=0;tag<K;tag++)
- sum += maxes[tag];
- }
- return sum;
- }
-
- public void loadParameters(BufferedReader input) throws IOException
- {
- final double EPS = 1e-50;
-
- // overwrite pi, emit with ~zeros
- for(double [][]i:emit)
- for(double []j:i)
- Arrays.fill(j, EPS);
-
- for(double []j:pi)
- Arrays.fill(j, EPS);
-
- String line = input.readLine();
- assert line != null;
-
- Pattern space = Pattern.compile(" +");
- String[] parts = space.split(line);
- assert parts.length == 6;
-
- assert parts[0].equals("phrases");
- int phrases = Integer.parseInt(parts[1]);
- int tags = Integer.parseInt(parts[3]);
- int positions = Integer.parseInt(parts[5]);
-
- assert phrases == n_phrases;
- assert tags == K;
- assert positions == n_positions;
-
- // read in pi
- while ((line = input.readLine()) != null)
- {
- line = line.trim();
- if (line.isEmpty()) break;
-
- String[] tokens = space.split(line);
- assert tokens.length == 3;
- int p = Integer.parseInt(tokens[0]);
- int t = Integer.parseInt(tokens[1]);
- double v = Double.parseDouble(tokens[2]);
-
- pi[p][t] = v;
- }
-
- // read in emissions
- while ((line = input.readLine()) != null)
- {
- String[] tokens = space.split(line);
- assert tokens.length == 4;
- int t = Integer.parseInt(tokens[0]);
- int p = Integer.parseInt(tokens[1]);
- int w = Integer.parseInt(tokens[2]);
- double v = Double.parseDouble(tokens[3]);
-
- emit[t][p][w] = v;
- }
- }
-}
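A minimal training-driver sketch for the class above, mirroring the main() methods of Agree and C2F; the input path is the same hypothetical ../pdata/btec.con used there, and a phraseSizeLimit of 0 disables the phrase-length filter. Corpus.readFromFile throws IOException, so a real caller would wrap this in try/catch:

    Corpus corpus = Corpus.readFromFile(
        io.FileUtil.reader(new java.io.File("../pdata/btec.con")));
    PhraseCluster cluster = new PhraseCluster(25, corpus);            // K = 25 tags
    for (int i = 0; i < 20; i++)
        System.out.println("Iter " + i + ", llh: " + cluster.EM(0)); // plain EM
    // Posterior-regularised alternative: scalePT > 0 with scaleCT = 0
    // applies only the phrase-tag constraints (see PREM above).
    // cluster.PREM(10, 0, 0);
    cluster.displayPosterior(System.out, corpus.getEdges());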
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseContextObjective.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseContextObjective.java
deleted file mode 100644
index 646ff392..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseContextObjective.java
+++ /dev/null
@@ -1,436 +0,0 @@
-package phrase;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-
-import optimization.gradientBasedMethods.ProjectedGradientDescent;
-import optimization.gradientBasedMethods.ProjectedObjective;
-import optimization.gradientBasedMethods.stats.OptimizerStats;
-import optimization.linesearch.ArmijoLineSearchMinimizationAlongProjectionArc;
-import optimization.linesearch.InterpolationPickFirstStep;
-import optimization.linesearch.LineSearchMethod;
-import optimization.projections.SimplexProjection;
-import optimization.stopCriteria.CompositeStopingCriteria;
-import optimization.stopCriteria.ProjectedGradientL2Norm;
-import optimization.stopCriteria.StopingCriteria;
-import optimization.stopCriteria.ValueDifference;
-import optimization.util.MathUtils;
-import phrase.Corpus.Edge;
-
-public class PhraseContextObjective extends ProjectedObjective
-{
- private static final double GRAD_DIFF = 0.00002;
- private static double INIT_STEP_SIZE = 300;
- private static double VAL_DIFF = 1e-8;
- private static int ITERATIONS = 20;
- boolean debug = false;
-
- private PhraseCluster c;
-
- // un-regularized unnormalized posterior, p[edge][tag]
- // P(tag|edge) \propto P(tag|phrase)P(context|tag)
- private double p[][];
-
- // regularized unnormalized posterior
- // q[edge][tag] propto p[edge][tag]*exp(-lambda)
- private double q[][];
- private List<Corpus.Edge> data;
-
- // log likelihood under q
- private double loglikelihood;
- private SimplexProjection projectionPhrase;
- private SimplexProjection projectionContext;
-
- double[] newPoint;
- private int n_param;
-
- // likelihood under p
- public double llh;
-
- private static Map<Corpus.Edge, Integer> edgeIndex;
-
- private long projectionTime;
- private long objectiveTime;
- private long actualProjectionTime;
- private ExecutorService pool;
-
- double scalePT;
- double scaleCT;
-
- public PhraseContextObjective(PhraseCluster cluster, double[] startingParameters, ExecutorService pool,
- double scalePT, double scaleCT)
- {
- c=cluster;
- data=c.c.getEdges();
- n_param=data.size()*c.K*2;
- this.pool=pool;
- this.scalePT = scalePT;
- this.scaleCT = scaleCT;
-
- parameters = startingParameters;
- if (parameters == null)
- parameters = new double[n_param];
-
- System.out.println("Num parameters " + n_param);
- newPoint = new double[n_param];
- gradient = new double[n_param];
- initP();
- projectionPhrase = new SimplexProjection(scalePT);
- projectionContext = new SimplexProjection(scaleCT);
- q=new double [data.size()][c.K];
-
- if (edgeIndex == null) {
- edgeIndex = new HashMap<Edge, Integer>();
- for (int e=0; e<data.size(); e++)
- {
- edgeIndex.put(data.get(e), e);
- //if (debug) System.out.println("Edge " + data.get(e) + " index " + e);
- }
- }
-
- setParameters(parameters);
- }
-
- private void initP(){
- p=new double[data.size()][];
- for(int edge=0;edge<data.size();edge++)
- {
- p[edge]=c.posterior(data.get(edge));
- llh += data.get(edge).getCount() * Math.log(arr.F.l1norm(p[edge]));
- arr.F.l1normalize(p[edge]);
- }
- }
-
- @Override
- public void setParameters(double[] params) {
- //System.out.println("setParameters " + Arrays.toString(parameters));
- // TODO: test if params have changed and skip update otherwise
- super.setParameters(params);
- updateFunction();
- }
-
- private void updateFunction()
- {
- updateCalls++;
- loglikelihood=0;
-
- System.out.print(".");
- System.out.flush();
-
- long begin = System.currentTimeMillis();
- for (int e=0; e<data.size(); e++)
- {
- Edge edge = data.get(e);
- for(int tag=0; tag<c.K; tag++)
- {
- int ip = index(e, tag, true);
- int ic = index(e, tag, false);
- q[e][tag] = p[e][tag]*
- Math.exp((-parameters[ip]-parameters[ic]) / edge.getCount());
- //if (debug)
- //System.out.println("\tposterior " + edge + " with tag " + tag + " p " + p[e][tag] + " params " + parameters[ip] + " and " + parameters[ic] + " q " + q[e][tag]);
- }
- }
-
- for(int edge=0;edge<data.size();edge++) {
- loglikelihood+=data.get(edge).getCount() * Math.log(arr.F.l1norm(q[edge]));
- arr.F.l1normalize(q[edge]);
- }
-
- for (int e=0; e<data.size(); e++)
- {
- for(int tag=0; tag<c.K; tag++)
- {
- int ip = index(e, tag, true);
- int ic = index(e, tag, false);
- gradient[ip]=-q[e][tag];
- gradient[ic]=-q[e][tag];
- }
- }
- //if (debug) {
- //System.out.println("objective " + loglikelihood + " ||gradient||_2: " + arr.F.l2norm(gradient));
- //System.out.println("gradient " + Arrays.toString(gradient));
- //}
- objectiveTime += System.currentTimeMillis() - begin;
- }
-
- @Override
- public double[] projectPoint(double[] point)
- {
- long begin = System.currentTimeMillis();
- List<Future<?>> tasks = new ArrayList<Future<?>>();
-
- System.out.print(",");
- System.out.flush();
-
- Arrays.fill(newPoint, 0, newPoint.length, 0);
-
- // first project using the phrase-tag constraints,
- // for all p,t: sum_c lambda_ptc < scaleP
- if (pool == null)
- {
- for (int p = 0; p < c.c.getNumPhrases(); ++p)
- {
- List<Edge> edges = c.c.getEdgesForPhrase(p);
- double[] toProject = new double[edges.size()];
- for(int tag=0;tag<c.K;tag++)
- {
- // FIXME: slow hash lookup for e (twice)
- for(int e=0; e<edges.size(); e++)
- toProject[e] = point[index(edges.get(e), tag, true)];
- long lbegin = System.currentTimeMillis();
- projectionPhrase.project(toProject);
- actualProjectionTime += System.currentTimeMillis() - lbegin;
- for(int e=0; e<edges.size(); e++)
- newPoint[index(edges.get(e), tag, true)] = toProject[e];
- }
- }
- }
- else // do above in parallel using thread pool
- {
- for (int p = 0; p < c.c.getNumPhrases(); ++p)
- {
- final int phrase = p;
- final double[] inPoint = point;
- Runnable task = new Runnable()
- {
- public void run()
- {
- List<Edge> edges = c.c.getEdgesForPhrase(phrase);
- double toProject[] = new double[edges.size()];
- for(int tag=0;tag<c.K;tag++)
- {
- // FIXME: slow hash lookup for e
- for(int e=0; e<edges.size(); e++)
- toProject[e] = inPoint[index(edges.get(e), tag, true)];
- projectionPhrase.project(toProject);
- for(int e=0; e<edges.size(); e++)
- newPoint[index(edges.get(e), tag, true)] = toProject[e];
- }
- }
- };
- tasks.add(pool.submit(task));
- }
- }
- //System.out.println("after PT " + Arrays.toString(newPoint));
-
- // now project using the context-tag constraints,
- // for all c,t: sum_p omega_pct < scaleC
- if (pool == null)
- {
- for (int ctx = 0; ctx < c.c.getNumContexts(); ++ctx)
- {
- List<Edge> edges = c.c.getEdgesForContext(ctx);
- double toProject[] = new double[edges.size()];
- for(int tag=0;tag<c.K;tag++)
- {
- // FIXME: slow hash lookup for e
- for(int e=0; e<edges.size(); e++)
- toProject[e] = point[index(edges.get(e), tag, false)];
- long lbegin = System.currentTimeMillis();
- projectionContext.project(toProject);
- actualProjectionTime += System.currentTimeMillis() - lbegin;
- for(int e=0; e<edges.size(); e++)
- newPoint[index(edges.get(e), tag, false)] = toProject[e];
- }
- }
- }
- else
- {
- // do above in parallel using thread pool
- for (int ctx = 0; ctx < c.c.getNumContexts(); ++ctx)
- {
- final int context = ctx;
- final double[] inPoint = point;
- Runnable task = new Runnable()
- {
- public void run()
- {
- List<Edge> edges = c.c.getEdgesForContext(context);
- double toProject[] = new double[edges.size()];
- for(int tag=0;tag<c.K;tag++)
- {
- // FIXME: slow hash lookup for e
- for(int e=0; e<edges.size(); e++)
- toProject[e] = inPoint[index(edges.get(e), tag, false)];
- projectionContext.project(toProject);
- for(int e=0; e<edges.size(); e++)
- newPoint[index(edges.get(e), tag, false)] = toProject[e];
- }
- }
- };
- tasks.add(pool.submit(task));
- }
- }
-
- if (pool != null)
- {
- // wait for all the jobs to complete
- Exception failure = null;
- for (Future<?> task: tasks)
- {
- try {
- task.get();
- } catch (InterruptedException e) {
- System.err.println("ERROR: Projection thread interrupted");
- e.printStackTrace();
- failure = e;
- } catch (ExecutionException e) {
- System.err.println("ERROR: Projection thread died");
- e.printStackTrace();
- failure = e;
- }
- }
- // rethrow the exception
- if (failure != null)
- {
- pool.shutdownNow();
- throw new RuntimeException(failure);
- }
- }
-
- double[] tmp = newPoint;
- newPoint = point;
- projectionTime += System.currentTimeMillis() - begin;
-
- //if (debug)
- //System.out.println("\t\treturning " + Arrays.toString(tmp));
- return tmp;
- }
-
- private int index(Edge edge, int tag, boolean phrase)
- {
- // NB if indexing changes must also change code in updateFunction and constructor
- if (phrase)
- return tag * edgeIndex.size() + edgeIndex.get(edge);
- else
- return (c.K + tag) * edgeIndex.size() + edgeIndex.get(edge);
- }
-
- private int index(int e, int tag, boolean phrase)
- {
- // NB if indexing changes must also change code in updateFunction and constructor
- if (phrase)
- return tag * edgeIndex.size() + e;
- else
- return (c.K + tag) * edgeIndex.size() + e;
- }
-
- @Override
- public double[] getGradient() {
- gradientCalls++;
- return gradient;
- }
-
- @Override
- public double getValue() {
- functionCalls++;
- return loglikelihood;
- }
-
- @Override
- public String toString() {
- return "No need for pointless toString";
- }
-
- public double []posterior(int edgeIndex){
- return q[edgeIndex];
- }
-
- public boolean optimizeWithProjectedGradientDescent()
- {
- projectionTime = 0;
- actualProjectionTime = 0;
- objectiveTime = 0;
- long start = System.currentTimeMillis();
-
- LineSearchMethod ls =
- new ArmijoLineSearchMinimizationAlongProjectionArc
- (new InterpolationPickFirstStep(INIT_STEP_SIZE));
- //LineSearchMethod ls = new WolfRuleLineSearch(
- // (new InterpolationPickFirstStep(INIT_STEP_SIZE)), c1, c2);
- OptimizerStats stats = new OptimizerStats();
-
-
- ProjectedGradientDescent optimizer = new ProjectedGradientDescent(ls);
- StopingCriteria stopGrad = new ProjectedGradientL2Norm(GRAD_DIFF);
- StopingCriteria stopValue = new ValueDifference(VAL_DIFF*(-llh));
- CompositeStopingCriteria compositeStop = new CompositeStopingCriteria();
- compositeStop.add(stopGrad);
- compositeStop.add(stopValue);
- optimizer.setMaxIterations(ITERATIONS);
- updateFunction();
- boolean success = optimizer.optimize(this,stats,compositeStop);
-
- System.out.println();
- System.out.println(stats.prettyPrint(1));
-
- if (success)
- System.out.print("\toptimization took " + optimizer.getCurrentIteration() + " iterations");
- else
- System.out.print("\toptimization failed to converge");
- long total = System.currentTimeMillis() - start;
- System.out.println(" and " + total + " ms: projection " + projectionTime +
- " actual " + actualProjectionTime + " objective " + objectiveTime);
-
- return success;
- }
-
- double loglikelihood()
- {
- return llh;
- }
-
- double KL_divergence()
- {
- return -loglikelihood + MathUtils.dotProduct(parameters, gradient);
- }
-
- double phrase_l1lmax()
- {
- // \sum_{tag,phrase} max_{context} P(tag|context,phrase)
- double sum=0;
- for (int p = 0; p < c.c.getNumPhrases(); ++p)
- {
- List<Edge> edges = c.c.getEdgesForPhrase(p);
- for(int tag=0;tag<c.K;tag++)
- {
- double max=0;
- for (Edge edge: edges)
- max = Math.max(max, q[edgeIndex.get(edge)][tag]);
- sum+=max;
- }
- }
- return sum;
- }
-
- double context_l1lmax()
- {
- // \sum_{tag,context} max_{phrase} P(tag|context,phrase)
- double sum=0;
- for (int ctx = 0; ctx < c.c.getNumContexts(); ++ctx)
- {
- List<Edge> edges = c.c.getEdgesForContext(ctx);
- for(int tag=0; tag<c.K; tag++)
- {
- double max=0;
- for (Edge edge: edges)
- max = Math.max(max, q[edgeIndex.get(edge)][tag]);
- sum+=max;
- }
- }
- return sum;
- }
-
- // L - KL(q||p) - scalePT * l1lmax_phrase - scaleCT * l1lmax_context
- public double primal()
- {
- return loglikelihood() - KL_divergence() - scalePT * phrase_l1lmax() - scaleCT * context_l1lmax();
- }
-}
\ No newline at end of file
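The dual variables above live in a single flat vector of length 2 * K * |edges|: K tag-major blocks of phrase-constraint lambdas first, then K tag-major blocks of context-constraint lambdas, each indexed by edge within its block. A standalone restatement of the index() layout used throughout:

    // Same layout as PhraseContextObjective.index(int, int, boolean):
    // phrase lambdas fill [0, K*numEdges), context lambdas the rest.
    static int index(int e, int tag, boolean phrase, int K, int numEdges) {
        return (phrase ? tag : K + tag) * numEdges + e;
    }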
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
deleted file mode 100644
index 0cf31c1c..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseCorpus.java
+++ /dev/null
@@ -1,193 +0,0 @@
-package phrase;
-
-import io.FileUtil;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Scanner;
-
-public class PhraseCorpus
-{
- public HashMap<String,Integer>wordLex;
- public HashMap<String,Integer>phraseLex;
-
- public String wordList[];
- public String phraseList[];
-
- //data[phrase][num context][position]
- public int data[][][];
- public int numContexts;
-
- public PhraseCorpus(String filename) throws FileNotFoundException, IOException
- {
- BufferedReader r = FileUtil.reader(new File(filename));
-
- phraseLex=new HashMap<String,Integer>();
- wordLex=new HashMap<String,Integer>();
-
- ArrayList<int[][]>dataList=new ArrayList<int[][]>();
- String line=null;
- numContexts = 0;
-
- while((line=readLine(r))!=null){
-
- String toks[]=line.split("\t");
- String phrase=toks[0];
- addLex(phrase,phraseLex);
-
- toks=toks[1].split(" \\|\\|\\| ");
-
- ArrayList <int[]>ctxList=new ArrayList<int[]>();
-
- for(int i=0;i<toks.length;i+=2){
- String ctx=toks[i];
- String words[]=ctx.split(" ");
- if (numContexts == 0)
- numContexts = words.length - 1;
- else
- assert numContexts == words.length - 1;
-
- int []context=new int [numContexts+1];
- int idx=0;
- for(String word:words){
- if(word.equals("<PHRASE>")){
- continue;
- }
- addLex(word,wordLex);
- context[idx]=wordLex.get(word);
- idx++;
- }
-
- String count=toks[i+1];
- context[idx]=Integer.parseInt(count.trim().substring(2));
-
- ctxList.add(context);
- }
-
- dataList.add(ctxList.toArray(new int [0][]));
-
- }
- try{
- r.close();
- }catch(IOException ioe){
- ioe.printStackTrace();
- }
- data=dataList.toArray(new int[0][][]);
- }
-
- private void addLex(String key, HashMap<String,Integer>lex){
- Integer i=lex.get(key);
- if(i==null){
- lex.put(key, lex.size());
- }
- }
-
- //for debugging
- public void saveLex(String lexFilename) throws FileNotFoundException, IOException
- {
- PrintStream ps = FileUtil.printstream(new File(lexFilename));
- ps.println("Phrase Lexicon");
- ps.println(phraseLex.size());
- printDict(phraseLex,ps);
-
- ps.println("Word Lexicon");
- ps.println(wordLex.size());
- printDict(wordLex,ps);
- ps.close();
- }
-
- private static void printDict(HashMap<String,Integer>lex,PrintStream ps){
- String []dict=buildList(lex);
- for(int i=0;i<dict.length;i++){
- ps.println(dict[i]);
- }
- }
-
- public void loadLex(String lexFilename){
- Scanner sc=io.FileUtil.openInFile(lexFilename);
-
- sc.nextLine();
- int size=sc.nextInt();
- sc.nextLine();
- String[]dict=new String[size];
- for(int i=0;i<size;i++){
- dict[i]=sc.nextLine();
- }
- phraseLex=buildMap(dict);
-
- sc.nextLine();
- size=sc.nextInt();
- sc.nextLine();
- dict=new String[size];
- for(int i=0;i<size;i++){
- dict[i]=sc.nextLine();
- }
- wordLex=buildMap(dict);
- sc.close();
- }
-
- private HashMap<String, Integer> buildMap(String[]dict){
- HashMap<String,Integer> map=new HashMap<String,Integer>();
- for(int i=0;i<dict.length;i++){
- map.put(dict[i], i);
- }
- return map;
- }
-
- public void buildList(){
- if(wordList==null){
- wordList=buildList(wordLex);
- phraseList=buildList(phraseLex);
- }
- }
-
- private static String[]buildList(HashMap<String,Integer>lex){
- String dict[]=new String [lex.size()];
- for(String key:lex.keySet()){
- dict[lex.get(key)]=key;
- }
- return dict;
- }
-
- public String getContextString(int context[], boolean addPhraseMarker)
- {
- StringBuffer b = new StringBuffer();
- for (int i=0;i<context.length-1;i++)
- {
- if (b.length() > 0)
- b.append(" ");
-
- if (i == context.length/2)
- b.append("<PHRASE> ");
-
- b.append(wordList[context[i]]);
- }
- return b.toString();
- }
-
- public static String readLine(BufferedReader r){
- try{
- return r.readLine();
- }
- catch(IOException ioe){
- ioe.printStackTrace();
- }
- return null;
- }
-
- public static void main(String[] args) throws Exception
- {
- String LEX_FILENAME="../pdata/lex.out";
- String DATA_FILENAME="../pdata/btec.con";
- PhraseCorpus c=new PhraseCorpus(DATA_FILENAME);
- c.saveLex(LEX_FILENAME);
- c.loadLex(LEX_FILENAME);
- c.saveLex(LEX_FILENAME);
- }
-}
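saveLex and loadLex above round-trip a plain-text file: a header line, an entry count, then one entry per line, first for phrases and then for words. A hypothetical lex.out with two phrases and three words:

    Phrase Lexicon
    2
    the cat
    a dog
    Word Lexicon
    3
    saw
    on
    fed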
diff --git a/gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java b/gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java
deleted file mode 100644
index ac73a075..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/PhraseObjective.java
+++ /dev/null
@@ -1,224 +0,0 @@
-package phrase;
-
-import java.util.Arrays;
-import java.util.List;
-
-import optimization.gradientBasedMethods.ProjectedGradientDescent;
-import optimization.gradientBasedMethods.ProjectedObjective;
-import optimization.gradientBasedMethods.stats.OptimizerStats;
-import optimization.linesearch.ArmijoLineSearchMinimizationAlongProjectionArc;
-import optimization.linesearch.InterpolationPickFirstStep;
-import optimization.linesearch.LineSearchMethod;
-import optimization.linesearch.WolfRuleLineSearch;
-import optimization.projections.SimplexProjection;
-import optimization.stopCriteria.CompositeStopingCriteria;
-import optimization.stopCriteria.ProjectedGradientL2Norm;
-import optimization.stopCriteria.StopingCriteria;
-import optimization.stopCriteria.ValueDifference;
-import optimization.util.MathUtils;
-
-public class PhraseObjective extends ProjectedObjective
-{
- static final double GRAD_DIFF = 0.00002;
- static double INIT_STEP_SIZE = 300;
- static double VAL_DIFF = 1e-8; // tuned to BTEC subsample
- static int ITERATIONS = 100;
- private PhraseCluster c;
-
- /**@brief
- * for debugging purposes
- */
- //public static PrintStream ps;
-
- /**@brief current phrase being optimized*/
- public int phrase;
-
- /**@brief un-regularized posterior
- * unnormalized
- * p[edge][tag]
- * P(tag|edge) \propto P(tag|phrase)P(context|tag)
- */
- private double[][]p;
-
- /**@brief regularized posterior
- * q[edge][tag] propto p[edge][tag]*exp(-lambda)
- */
- private double q[][];
- private List<Corpus.Edge> data;
-
- /**@brief log likelihood of the associated phrase
- *
- */
- private double loglikelihood;
- private SimplexProjection projection;
-
- double[] newPoint ;
-
- private int n_param;
-
- /**@brief likelihood under p
- *
- */
- public double llh;
-
- public PhraseObjective(PhraseCluster cluster, int phraseIdx, double scale, double[] lambda){
- phrase=phraseIdx;
- c=cluster;
- data=c.c.getEdgesForPhrase(phrase);
- n_param=data.size()*c.K;
- //System.out.println("Num parameters " + n_param + " for phrase #" + phraseIdx);
-
- if (lambda==null)
- lambda=new double[n_param];
-
- parameters = lambda;
- newPoint = new double[n_param];
- gradient = new double[n_param];
- initP();
- projection=new SimplexProjection(scale);
- q=new double [data.size()][c.K];
-
- setParameters(parameters);
- }
-
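- /**@brief caches the unnormalized posteriors p[edge] for this phrase and
- * accumulates llh += count_e * log Z_e before normalizing each p[edge]
- * in place
- */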
- private void initP(){
- p=new double[data.size()][];
- for(int edge=0;edge<data.size();edge++){
- p[edge]=c.posterior(data.get(edge));
- llh += data.get(edge).getCount() * Math.log(arr.F.l1norm(p[edge])); // Was bug here - count inside log!
- arr.F.l1normalize(p[edge]);
- }
- }
-
- @Override
- public void setParameters(double[] params) {
- super.setParameters(params);
- updateFunction();
- }
-
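- /**@brief recomputes q, the function value and the gradient for the
- * current lambda: q[e][t] \propto p[e][t]*exp(-lambda[t][e]/count_e),
- * value = sum_e count_e * log Z_e (the dual), and
- * d(value)/d(lambda[t][e]) = -q[e][t]
- */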
- private void updateFunction(){
- updateCalls++;
- loglikelihood=0;
-
- for(int tag=0;tag<c.K;tag++){
- for(int edge=0;edge<data.size();edge++){
- q[edge][tag]=p[edge][tag]*
- Math.exp(-parameters[tag*data.size()+edge]/data.get(edge).getCount());
- }
- }
-
- for(int edge=0;edge<data.size();edge++){
- loglikelihood+=data.get(edge).getCount() * Math.log(arr.F.l1norm(q[edge]));
- arr.F.l1normalize(q[edge]);
- }
-
- for(int tag=0;tag<c.K;tag++){
- for(int edge=0;edge<data.size();edge++){
- gradient[tag*data.size()+edge]=-q[edge][tag];
- }
- }
- }
-
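- /**@brief projects the point block-by-block: for each tag, the
- * data.size() lambdas for that tag are projected onto the simplex of
- * size `scale` independently of the other tags
- */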
- @Override
- public double[] projectPoint(double[] point)
- {
- double toProject[]=new double[data.size()];
- for(int tag=0;tag<c.K;tag++){
- for(int edge=0;edge<data.size();edge++){
- toProject[edge]=point[tag*data.size()+edge];
- }
- projection.project(toProject);
- for(int edge=0;edge<data.size();edge++){
- newPoint[tag*data.size()+edge]=toProject[edge];
- }
- }
- return newPoint;
- }
-
- @Override
- public double[] getGradient() {
- gradientCalls++;
- return gradient;
- }
-
- @Override
- public double getValue() {
- functionCalls++;
- return loglikelihood;
- }
-
- @Override
- public String toString() {
- return Arrays.toString(parameters);
- }
-
- public double [][]posterior(){
- return q;
- }
-
- long optimizationTime;
-
- public boolean optimizeWithProjectedGradientDescent(){
- long start = System.currentTimeMillis();
-
- LineSearchMethod ls =
- new ArmijoLineSearchMinimizationAlongProjectionArc
- (new InterpolationPickFirstStep(INIT_STEP_SIZE));
- //LineSearchMethod ls = new WolfRuleLineSearch(
- // (new InterpolationPickFirstStep(INIT_STEP_SIZE)), c1, c2);
- OptimizerStats stats = new OptimizerStats();
-
-
- ProjectedGradientDescent optimizer = new ProjectedGradientDescent(ls);
- StopingCriteria stopGrad = new ProjectedGradientL2Norm(GRAD_DIFF);
- StopingCriteria stopValue = new ValueDifference(VAL_DIFF*(-llh));
- CompositeStopingCriteria compositeStop = new CompositeStopingCriteria();
- compositeStop.add(stopGrad);
- compositeStop.add(stopValue);
- optimizer.setMaxIterations(ITERATIONS);
- updateFunction();
- boolean success = optimizer.optimize(this,stats,compositeStop);
- //System.out.println("Ended optimization Projected Gradient Descent\n" + stats.prettyPrint(1));
- //if(success){
- //System.out.println("Ended optimization in " + optimizer.getCurrentIteration());
- //}else{
- // System.out.println("Failed to optimize");
- //}
- //System.out.println(Arrays.toString(parameters));
-
- // for(int edge=0;edge<data.getSize();edge++){
- // ps.println(Arrays.toString(q[edge]));
- // }
-
- return success;
- }
-
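- /**@brief KL(q||p), recovered from the dual value and gradient as
- * -loglikelihood + lambda . gradient (the gradient being -q)
- */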
- public double KL_divergence()
- {
- return -loglikelihood + MathUtils.dotProduct(parameters, gradient);
- }
-
- public double loglikelihood()
- {
- return llh;
- }
-
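- /**@brief the l1/linf penalty on q: for each tag, the max posterior
- * over this phrase's edges, summed over tags
- */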
- public double l1lmax()
- {
- double sum=0;
- for(int tag=0;tag<c.K;tag++){
- double max=0;
- for(int edge=0;edge<data.size();edge++){
- if(q[edge][tag]>max)
- max=q[edge][tag];
- }
- sum+=max;
- }
- return sum;
- }
-
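- /**@brief primal PR objective: log-likelihood under p, minus KL(q||p),
- * minus the scaled l1/linf penalty
- */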
- public double primal(double scale)
- {
- return loglikelihood() - KL_divergence() - scale * l1lmax();
- }
-}
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
deleted file mode 100644
index 6f302b20..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java
+++ /dev/null
@@ -1,257 +0,0 @@
-package phrase;
-
-import io.FileUtil;
-import joptsimple.OptionParser;
-import joptsimple.OptionSet;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.List;
-import java.util.Random;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-import phrase.Corpus.Edge;
-
-import arr.F;
-
-public class Trainer
-{
-  public static void main(String[] args)
-  {
-    OptionParser parser = new OptionParser();
-    parser.accepts("help");
-    parser.accepts("in").withRequiredArg().ofType(File.class);
-    parser.accepts("in1").withRequiredArg().ofType(File.class);
-    parser.accepts("test").withRequiredArg().ofType(File.class);
-    parser.accepts("out").withRequiredArg().ofType(File.class);
-    parser.accepts("start").withRequiredArg().ofType(File.class);
-    parser.accepts("parameters").withRequiredArg().ofType(File.class);
-    parser.accepts("topics").withRequiredArg().ofType(Integer.class).defaultsTo(5);
-    parser.accepts("iterations").withRequiredArg().ofType(Integer.class).defaultsTo(10);
-    parser.accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(0);
-    parser.accepts("scale-phrase").withRequiredArg().ofType(Double.class).defaultsTo(0.0);
-    parser.accepts("scale-context").withRequiredArg().ofType(Double.class).defaultsTo(0.0);
-    parser.accepts("seed").withRequiredArg().ofType(Long.class).defaultsTo(0l);
-    parser.accepts("convergence-threshold").withRequiredArg().ofType(Double.class).defaultsTo(1e-6);
-    parser.accepts("variational-bayes");
-    parser.accepts("alpha-emit").withRequiredArg().ofType(Double.class).defaultsTo(0.1);
-    parser.accepts("alpha-pi").withRequiredArg().ofType(Double.class).defaultsTo(0.0001);
-    parser.accepts("agree-direction");
-    parser.accepts("agree-language");
-    parser.accepts("no-parameter-cache");
-    parser.accepts("skip-large-phrases").withRequiredArg().ofType(Integer.class).defaultsTo(5);
-    OptionSet options = parser.parse(args);
-
-    if (options.has("help") || !options.has("in"))
-    {
-      try {
-        parser.printHelpOn(System.err);
-      } catch (IOException e) {
-        System.err.println("This should never happen.");
-        e.printStackTrace();
-      }
-      System.exit(1);
-    }
-
-    int tags = (Integer) options.valueOf("topics");
-    int iterations = (Integer) options.valueOf("iterations");
-    double scale_phrase = (Double) options.valueOf("scale-phrase");
-    double scale_context = (Double) options.valueOf("scale-context");
-    int threads = (Integer) options.valueOf("threads");
-    double threshold = (Double) options.valueOf("convergence-threshold");
-    boolean vb = options.has("variational-bayes");
-    double alphaEmit = (vb) ? (Double) options.valueOf("alpha-emit") : 0;
-    double alphaPi = (vb) ? (Double) options.valueOf("alpha-pi") : 0;
-    int skip = (Integer) options.valueOf("skip-large-phrases");
-
-    if (options.has("seed"))
-      F.rng = new Random((Long) options.valueOf("seed"));
-
-    ExecutorService threadPool = null;
-    if (threads > 0)
-      threadPool = Executors.newFixedThreadPool(threads);
-
-    if (tags <= 1 || scale_phrase < 0 || scale_context < 0 || threshold < 0)
-    {
-      System.err.println("Invalid arguments. Try again!");
-      System.exit(1);
-    }
-
-    Corpus corpus = null;
-    File infile = (File) options.valueOf("in");
-    Corpus corpus1 = null;
-    File infile1 = (File) options.valueOf("in1");
-    try {
-      System.out.println("Reading concordance from " + infile);
-      corpus = Corpus.readFromFile(FileUtil.reader(infile));
-      corpus.printStats(System.out);
-      if(options.has("in1")){
-        corpus1 = Corpus.readFromFile(FileUtil.reader(infile1));
-        corpus1.printStats(System.out);
-      }
-    } catch (IOException e) {
-      System.err.println("Failed to open input file: " + infile);
-      e.printStackTrace();
-      System.exit(1);
-    }
-
-    if (!(options.has("agree-direction")||options.has("agree-language")))
-      System.out.println("Running with " + tags + " tags " +
-          "for " + iterations + " iterations " +
-          ((skip > 0) ? "skipping large phrases for first " + skip + " iterations " : "") +
-          "with scale " + scale_phrase + " phrase and " + scale_context + " context " +
-          "and " + threads + " threads");
-    else
-      System.out.println("Running agreement model with " + tags + " tags " +
-          "for " + iterations);
-
-    System.out.println();
-
-    PhraseCluster cluster = null;
-    Agree2Sides agree2sides = null;
-    Agree agree= null;
-    VB vbModel=null;
-    if (options.has("agree-language"))
-      agree2sides = new Agree2Sides(tags, corpus,corpus1);
-    else if (options.has("agree-direction"))
-      agree = new Agree(tags, corpus);
-    else
-    {
-      if (vb)
-      {
-        vbModel=new VB(tags,corpus);
-        vbModel.alpha=alphaPi;
-        vbModel.lambda=alphaEmit;
-        if (threadPool != null) vbModel.useThreadPool(threadPool);
-      }
-      else
-      {
-        cluster = new PhraseCluster(tags, corpus);
-        if (threadPool != null) cluster.useThreadPool(threadPool);
-
-        if (options.has("no-parameter-cache"))
-          cluster.cacheLambda = false;
-        if (options.has("start"))
-        {
-          try {
-            System.err.println("Reading starting parameters from " + options.valueOf("start"));
-            cluster.loadParameters(FileUtil.reader((File)options.valueOf("start")));
-          } catch (IOException e) {
-            System.err.println("Failed to open input file: " + options.valueOf("start"));
-            e.printStackTrace();
-          }
-        }
-      }
-    }
-
-    double last = 0;
-    for (int i=0; i < iterations; i++)
-    {
-      double o;
-      if (agree != null)
-        o = agree.EM();
-      else if(agree2sides!=null)
-        o = agree2sides.EM();
-      else
-      {
-        if (i < skip)
-          System.out.println("Skipping phrases of length > " + (i+1));
-
-        if (scale_phrase <= 0 && scale_context <= 0)
-        {
-          if (!vb)
-            o = cluster.EM((i < skip) ? i+1 : 0);
-          else
-            o = vbModel.EM();
-        }
-        else
-          o = cluster.PREM(scale_phrase, scale_context, (i < skip) ? i+1 : 0);
-      }
-
-      System.out.println("ITER: "+i+" objective: " + o);
-
-      // sometimes takes a few iterations to break the ties
-      if (i > 5 && Math.abs((o - last) / o) < threshold)
-      {
-        last = o;
-        break;
-      }
-      last = o;
-    }
-
-    double pl1lmax = 0, cl1lmax = 0;
-    if (cluster != null)
-    {
-      pl1lmax = cluster.phrase_l1lmax();
-      cl1lmax = cluster.context_l1lmax();
-    }
-    else if (agree != null)
-    {
-      // fairly arbitrary choice of model1 cf model2
-      pl1lmax = agree.model1.phrase_l1lmax();
-      cl1lmax = agree.model1.context_l1lmax();
-    }
-    else if (agree2sides != null)
-    {
-      // fairly arbitrary choice of model1 cf model2
-      pl1lmax = agree2sides.model1.phrase_l1lmax();
-      cl1lmax = agree2sides.model1.context_l1lmax();
-    }
-
-    System.out.println("\nFinal posterior phrase l1lmax " + pl1lmax + " context l1lmax " + cl1lmax);
-
-    if (options.has("out"))
-    {
-      File outfile = (File) options.valueOf("out");
-      try {
-        PrintStream ps = FileUtil.printstream(outfile);
-        List<Edge> test;
-        if (!options.has("test")) // just use the training
-          test = corpus.getEdges();
-        else
-        { // if --test supplied, load up the file
-          infile = (File) options.valueOf("test");
-          System.out.println("Reading testing concordance from " + infile);
-          test = corpus.readEdges(FileUtil.reader(infile));
-        }
-        if(vb) {
-          assert !options.has("test");
-          vbModel.displayPosterior(ps);
-        } else if (cluster != null)
-          cluster.displayPosterior(ps, test);
-        else if (agree != null)
-          agree.displayPosterior(ps, test);
-        else if (agree2sides != null) {
-          assert !options.has("test");
-          agree2sides.displayPosterior(ps);
-        }
-
-        ps.close();
-      } catch (IOException e) {
-        System.err.println("Failed to open either testing file or output file");
-        e.printStackTrace();
-        System.exit(1);
-      }
-    }
-
-    if (options.has("parameters"))
-    {
-      assert !vb;
-      File outfile = (File) options.valueOf("parameters");
-      PrintStream ps;
-      try {
-        ps = FileUtil.printstream(outfile);
-        cluster.displayModelParam(ps);
-        ps.close();
-      } catch (IOException e) {
-        System.err.println("Failed to open output parameters file: " + outfile);
-        e.printStackTrace();
-        System.exit(1);
-      }
-    }
-
-    if (cluster != null && cluster.pool != null)
-      cluster.pool.shutdown();
-  }
-}
diff --git a/gi/posterior-regularisation/prjava/src/phrase/VB.java b/gi/posterior-regularisation/prjava/src/phrase/VB.java
deleted file mode 100644
index cd3f4966..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/VB.java
+++ /dev/null
@@ -1,419 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-
-import io.FileUtil;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-
-import org.apache.commons.math.special.Gamma;
-
-import phrase.Corpus.Edge;
-
-public class VB {
-
- public static int MAX_ITER=400;
-
- /**@brief
- * hyper param for beta
- * where beta is multinomial
- * for generating words from a topic
- */
- public double lambda=0.1;
- /**@brief
- * hyper param for theta
- * where theta is dirichlet for z
- */
- public double alpha=0.0001;
- /**@brief
- * variational param for beta
- */
- private double rho[][][];
- private double digamma_rho[][][];
- private double rho_sum[][];
- /**@brief
- * variational param for z
- */
- //private double phi[][];
- /**@brief
- * variational param for theta
- */
- private double gamma[];
- private static double VAL_DIFF_RATIO=0.005;
-
- private int n_positions;
- private int n_words;
- private int K;
- private ExecutorService pool;
-
- private Corpus c;
- public static void main(String[] args) {
- // String in="../pdata/canned.con";
- String in="../pdata/btec.con";
- String out="../pdata/vb.out";
- int numCluster=25;
- Corpus corpus = null;
- File infile = new File(in);
- try {
- System.out.println("Reading concordance from " + infile);
- corpus = Corpus.readFromFile(FileUtil.reader(infile));
- corpus.printStats(System.out);
- } catch (IOException e) {
- System.err.println("Failed to open input file: " + infile);
- e.printStackTrace();
- System.exit(1);
- }
-
- VB vb=new VB(numCluster, corpus);
- int iter=20;
- for(int i=0;i<iter;i++){
- double obj=vb.EM();
- System.out.println("Iter "+i+": "+obj);
- }
-
- File outfile = new File (out);
- try {
- PrintStream ps = FileUtil.printstream(outfile);
- vb.displayPosterior(ps);
- // ps.println();
- // c2f.displayModelParam(ps);
- ps.close();
- } catch (IOException e) {
- System.err.println("Failed to open output file: " + outfile);
- e.printStackTrace();
- System.exit(1);
- }
- }
-
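- /**@brief initializes rho to the symmetric prior lambda plus one random
- * soft topic assignment per training context, to break the symmetry
- * between topics
- */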
- public VB(int numCluster, Corpus corpus){
- c=corpus;
- K=numCluster;
- n_positions=c.getNumContextPositions();
- n_words=c.getNumWords();
- rho=new double[K][n_positions][n_words];
- //to init rho
- //loop through data and count up words
- double[] phi_tmp=new double[K];
- for(int i=0;i<K;i++){
- for(int pos=0;pos<n_positions;pos++){
- Arrays.fill(rho[i][pos], lambda);
- }
- }
- for(int d=0;d<c.getNumPhrases();d++){
- List<Edge>doc=c.getEdgesForPhrase(d);
- for(int n=0;n<doc.size();n++){
- TIntArrayList context=doc.get(n).getContext();
- arr.F.randomise(phi_tmp);
- for(int i=0;i<K;i++){
- for(int pos=0;pos<n_positions;pos++){
- rho[i][pos][context.get(pos)]+=phi_tmp[i];
- }
- }
- }
- }
-
- }
-
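- /**@brief mean-field updates for one phrase (treated as a document):
- * alternates phi[n][i] \propto exp(E[log beta_i(context_n)] +
- * digamma(gamma_i) - digamma(sum_j gamma_j)) with
- * gamma_i = alpha + sum_n phi[n][i], and returns the per-phrase ELBO
- */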
- private double inference(int phraseID, double[][] phi, double[] gamma)
- {
- List<Edge > doc=c.getEdgesForPhrase(phraseID);
- for(int i=0;i<phi.length;i++){
- for(int j=0;j<phi[i].length;j++){
- phi[i][j]=1.0/K;
- }
- }
- Arrays.fill(gamma,alpha+1.0/K);
-
- double digamma_gamma[]=new double[K];
-
- double gamma_sum=digamma(arr.F.l1norm(gamma));
- for(int i=0;i<K;i++){
- digamma_gamma[i]=digamma(gamma[i]);
- }
- double gammaSum[]=new double [K];
- double prev_val=0;
- double obj=0;
-
- for(int iter=0;iter<MAX_ITER;iter++){
- prev_val=obj;
- obj=0;
- Arrays.fill(gammaSum,0.0);
- for(int n=0;n<doc.size();n++){
- TIntArrayList context=doc.get(n).getContext();
- double phisum=0;
- for(int i=0;i<K;i++){
- double sum=0;
- for(int pos=0;pos<n_positions;pos++){
- int word=context.get(pos);
- sum+=digamma_rho[i][pos][word]-rho_sum[i][pos];
- }
- sum+= digamma_gamma[i]-gamma_sum;
- phi[n][i]=sum;
-
- if (i > 0){
- phisum = log_sum(phisum, phi[n][i]);
- }
- else{
- phisum = phi[n][i];
- }
-
- }//end of a word
-
- for(int i=0;i<K;i++){
- phi[n][i]=Math.exp(phi[n][i]-phisum);
- gammaSum[i]+=phi[n][i];
- }
-
- }//end of doc
-
- for(int i=0;i<K;i++){
- gamma[i]=alpha+gammaSum[i];
- }
- gamma_sum=digamma(arr.F.l1norm(gamma));
- for(int i=0;i<K;i++){
- digamma_gamma[i]=digamma(gamma[i]);
- }
- //compute objective for reporting
-
- obj=0;
-
- for(int i=0;i<K;i++){
- obj+=(alpha-1)*(digamma_gamma[i]-gamma_sum);
- }
-
-
- for(int n=0;n<doc.size();n++){
- TIntArrayList context=doc.get(n).getContext();
-
- for(int i=0;i<K;i++){
- //entropy of phi + expected log likelihood of z
- obj+=phi[n][i]*(digamma_gamma[i]-gamma_sum);
-
- if(phi[n][i]>1e-10){
- obj+=phi[n][i]*Math.log(phi[n][i]);
- }
-
- double beta_sum=0;
- for(int pos=0;pos<n_positions;pos++){
- int word=context.get(pos);
- beta_sum+=(digamma(rho[i][pos][word])-rho_sum[i][pos]);
- }
- obj+=phi[n][i]*beta_sum;
- }
- }
-
- obj-=log_gamma(arr.F.l1norm(gamma));
- for(int i=0;i<K;i++){
- obj+=Gamma.logGamma(gamma[i]);
- obj-=(gamma[i]-1)*(digamma_gamma[i]-gamma_sum);
- }
-
-// System.out.println(phraseID+": "+obj);
- if(iter>0 && (obj-prev_val)/Math.abs(obj)<VAL_DIFF_RATIO){
- break;
- }
- }//end of inference loop
-
- return obj;
- }//end of inference
-
- /**
- * @return objective of this iteration
- */
- public double EM(){
- double emObj=0;
- if(digamma_rho==null){
- digamma_rho=new double[K][n_positions][n_words];
- }
- for(int i=0;i<K;i++){
- for (int pos=0;pos<n_positions;pos++){
- for(int j=0;j<n_words;j++){
- digamma_rho[i][pos][j]= digamma(rho[i][pos][j]);
- }
- }
- }
-
- if(rho_sum==null){
- rho_sum=new double [K][n_positions];
- }
- for(int i=0;i<K;i++){
- for(int pos=0;pos<n_positions;pos++){
- rho_sum[i][pos]=digamma(arr.F.l1norm(rho[i][pos]));
- }
- }
-
- //E
- double exp_rho[][][]=new double[K][n_positions][n_words];
- if (pool == null)
- {
- for (int d=0;d<c.getNumPhrases();d++)
- {
- List<Edge > doc=c.getEdgesForPhrase(d);
- double[][] phi = new double[doc.size()][K];
- double[] gamma = new double[K];
-
- emObj += inference(d, phi, gamma);
-
- for(int n=0;n<doc.size();n++){
- TIntArrayList context=doc.get(n).getContext();
- for(int pos=0;pos<n_positions;pos++){
- int word=context.get(pos);
- for(int i=0;i<K;i++){
- exp_rho[i][pos][word]+=phi[n][i];
- }
- }
- }
- //if(d!=0 && d%100==0) System.out.print(".");
- //if(d!=0 && d%1000==0) System.out.println(d);
- }
- }
- else // multi-threaded version of above loop
- {
- class PartialEStep implements Callable<PartialEStep>
- {
- double[][] phi;
- double[] gamma;
- double obj;
- int d;
- PartialEStep(int d) { this.d = d; }
-
- public PartialEStep call()
- {
- phi = new double[c.getEdgesForPhrase(d).size()][K];
- gamma = new double[K];
- obj = inference(d, phi, gamma);
- return this;
- }
- }
-
- List<Future<PartialEStep>> jobs = new ArrayList<Future<PartialEStep>>();
- for (int d=0;d<c.getNumPhrases();d++)
- jobs.add(pool.submit(new PartialEStep(d)));
-
- for (Future<PartialEStep> job: jobs)
- {
- try {
- PartialEStep e = job.get();
-
- emObj += e.obj;
- List<Edge> doc = c.getEdgesForPhrase(e.d);
- for(int n=0;n<doc.size();n++){
- TIntArrayList context=doc.get(n).getContext();
- for(int pos=0;pos<n_positions;pos++){
- int word=context.get(pos);
- for(int i=0;i<K;i++){
- exp_rho[i][pos][word]+=e.phi[n][i];
- }
- }
- }
- } catch (ExecutionException e) {
- System.err.println("ERROR: E-step thread execution failed.");
- throw new RuntimeException(e);
- } catch (InterruptedException e) {
- System.err.println("ERROR: Failed to join E-step thread.");
- throw new RuntimeException(e);
- }
- }
- }
- // System.out.println("EM Objective:"+emObj);
-
- //M
- for(int i=0;i<K;i++){
- for(int pos=0;pos<n_positions;pos++){
- for(int j=0;j<n_words;j++){
- rho[i][pos][j]=lambda+exp_rho[i][pos][j];
- }
- }
- }
-
- //E[\log p(\beta|\lambda)] - E[\log q(\beta)]
- for(int i=0;i<K;i++){
- double rhoSum=0;
- for(int pos=0;pos<n_positions;pos++){
- for(int j=0;j<n_words;j++){
- rhoSum+=rho[i][pos][j];
- }
- double digamma_rhoSum=Gamma.digamma(rhoSum);
- emObj-=Gamma.logGamma(rhoSum);
- for(int j=0;j<n_words;j++){
- emObj+=(lambda-rho[i][pos][j])*(Gamma.digamma(rho[i][pos][j])-digamma_rhoSum);
- emObj+=Gamma.logGamma(rho[i][pos][j]);
- }
- }
- }
-
- return emObj;
- }//end of EM
-
- public void displayPosterior(PrintStream ps)
- {
- for(int d=0;d<c.getNumPhrases();d++){
- List<Edge > doc=c.getEdgesForPhrase(d);
- double[][] phi = new double[doc.size()][K];
- for(int i=0;i<phi.length;i++)
- for(int j=0;j<phi[i].length;j++)
- phi[i][j]=1.0/K;
- double[] gamma = new double[K];
-
- inference(d, phi, gamma);
-
- for(int n=0;n<doc.size();n++){
- Edge edge=doc.get(n);
- int tag=arr.F.argmax(phi[n]);
- ps.print(edge.getPhraseString());
- ps.print("\t");
- ps.print(edge.getContextString(true));
-
- ps.println(" ||| C=" + tag);
- }
- }
- }
-
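- /**@brief numerically stable log(exp(log_a)+exp(log_b)): factors out the
- * larger argument so the exp never overflows
- */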
- double log_sum(double log_a, double log_b)
- {
- double v;
-
- if (log_a < log_b)
- v = log_b+Math.log(1 + Math.exp(log_a-log_b));
- else
- v = log_a+Math.log(1 + Math.exp(log_b-log_a));
- return(v);
- }
-
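- /**@brief digamma via the asymptotic series evaluated at x+6, shifted
- * back down with the recurrence digamma(x) = digamma(x+1) - 1/x
- */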
- double digamma(double x)
- {
- double p;
- x=x+6;
- p=1/(x*x);
- p=(((0.004166666666667*p-0.003968253986254)*p+
- 0.008333333333333)*p-0.083333333333333)*p;
- p=p+Math.log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
- return p;
- }
-
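- /**@brief log Gamma via Stirling's series at x+6 (0.918938... is
- * 0.5*log(2*pi)), shifted back with log Gamma(x) = log Gamma(x+1) - log x
- */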
- double log_gamma(double x)
- {
- double z=1/(x*x);
-
- x=x+6;
- z=(((-0.000595238095238*z+0.000793650793651)
- *z-0.002777777777778)*z+0.083333333333333)/x;
- z=(x-0.5)*Math.log(x)-x+0.918938533204673+z-Math.log(x-1)-
- Math.log(x-2)-Math.log(x-3)-Math.log(x-4)-Math.log(x-5)-Math.log(x-6);
- return z;
- }
-
- public void useThreadPool(ExecutorService threadPool)
- {
- pool = threadPool;
- }
-}//End of class