From ffd9800a8ed78f818debb0e02a9f379b166f3de9 Mon Sep 17 00:00:00 2001 From: desaicwtf Date: Fri, 9 Jul 2010 20:55:36 +0000 Subject: context->phrase git-svn-id: https://ws10smt.googlecode.com/svn/trunk@211 ec762483-ff6d-05da-a07a-a48fb63a330f --- .../prjava/src/phrase/C2F.java | 205 ++++++++++++++++++++- 1 file changed, 200 insertions(+), 5 deletions(-) (limited to 'gi/posterior-regularisation/prjava/src/phrase') diff --git a/gi/posterior-regularisation/prjava/src/phrase/C2F.java b/gi/posterior-regularisation/prjava/src/phrase/C2F.java index 2646d961..63dad2ab 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/C2F.java +++ b/gi/posterior-regularisation/prjava/src/phrase/C2F.java @@ -1,17 +1,212 @@ package phrase; + +import gnu.trove.TIntArrayList; + +import io.FileUtil; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.List; + +import phrase.Corpus.Edge; + /** * @brief context generates phrase * @author desaic * */ public class C2F { - - /** - * @param args + public int K; + private int n_words, n_contexts, n_positions; + public Corpus c; + + /**@brief + * emit[tag][position][word] = p(word | tag, position in phrase) + */ + private double emit[][][]; + /**@brief + * pi[context][tag] = p(tag | context) + */ + private double pi[][]; + + public C2F(int numCluster, Corpus corpus){ + K=numCluster; + c=corpus; + n_words=c.getNumWords(); + n_contexts=c.getNumContexts(); + + //number of words in a phrase to be considered + //currently the first and last word + //if the phrase has length 1 + //use the same word for two positions + n_positions=2; + + emit=new double [K][n_positions][n_words]; + pi=new double[n_contexts][K]; + + for(double [][]i:emit){ + for(double []j:i){ + arr.F.randomise(j); + } + } + + for(double []j:pi){ + arr.F.randomise(j); + } + } + + /**@brief test + * */ - public static void main(String[] args) { - // TODO Auto-generated method stub + public static void main(String args[]){ + String in="../pdata/canned.con"; + String out="../pdata/posterior.out"; + int numCluster=5; + Corpus corpus = null; + File infile = new File(in); + try { + System.out.println("Reading concordance from " + infile); + corpus = Corpus.readFromFile(FileUtil.reader(infile)); + corpus.printStats(System.out); + } catch (IOException e) { + System.err.println("Failed to open input file: " + infile); + e.printStackTrace(); + System.exit(1); + } + + C2F c2f=new C2F(numCluster,corpus); + int iter=20; + double llh=0; + for(int i=0;i contexts = c.getEdgesForContext(context); + + for (int ctx=0; ctx 0; + loglikelihood += edge.getCount() * Math.log(z); + arr.F.l1normalize(p); + + int count = edge.getCount(); + //increment expected count + TIntArrayList phrase= edge.getPhrase(); + for(int tag=0;tag EPS) + ps.print("\t" + j + ": " + pi[i][j]); + } + ps.println(); + } + + ps.println("P(word|tag,position)"); + for (int i = 0; i < K; ++i) + { + for(int position=0;position EPS) + ps.print(c.getWord(word)+"="+emit[i][position][word]+"\t"); + } + ps.println(); + } + ps.println(); + } + + } + } -- cgit v1.2.3