From ad418214fe3b3fcd33d81225eb3d3fb08b67f88a Mon Sep 17 00:00:00 2001 From: desaicwtf Date: Mon, 28 Jun 2010 23:14:21 +0000 Subject: add draft version of POS induction with HMM and L1 Linf constraints git-svn-id: https://ws10smt.googlecode.com/svn/trunk@47 ec762483-ff6d-05da-a07a-a48fb63a330f --- .../prjava/src/test/CorpusTest.java | 60 ++++++++++ .../prjava/src/test/HMMModelStats.java | 96 +++++++++++++++ .../prjava/src/test/IntDoublePair.java | 23 ++++ .../prjava/src/test/X2y2WithConstraints.java | 131 +++++++++++++++++++++ 4 files changed, 310 insertions(+) create mode 100644 gi/posterior-regularisation/prjava/src/test/CorpusTest.java create mode 100644 gi/posterior-regularisation/prjava/src/test/HMMModelStats.java create mode 100644 gi/posterior-regularisation/prjava/src/test/IntDoublePair.java create mode 100644 gi/posterior-regularisation/prjava/src/test/X2y2WithConstraints.java (limited to 'gi/posterior-regularisation/prjava/src/test') diff --git a/gi/posterior-regularisation/prjava/src/test/CorpusTest.java b/gi/posterior-regularisation/prjava/src/test/CorpusTest.java new file mode 100644 index 00000000..b4c3041f --- /dev/null +++ b/gi/posterior-regularisation/prjava/src/test/CorpusTest.java @@ -0,0 +1,60 @@ +package test; + +import java.util.Arrays; +import java.util.HashMap; + +import data.Corpus; +import hmm.POS; + +public class CorpusTest { + + public static void main(String[] args) { + Corpus c=new Corpus(POS.trainFilename); + + + int idx=30; + + + HashMapvocab= + (HashMap) io.SerializedObjects.readSerializedObject(Corpus.alphaFilename); + + HashMaptagVocab= + (HashMap) io.SerializedObjects.readSerializedObject(Corpus.tagalphaFilename); + + + String [] dict=new String [vocab.size()+1]; + for(String key:vocab.keySet()){ + dict[vocab.get(key)]=key; + } + dict[dict.length-1]=Corpus.UNK_TOK; + + String [] tagdict=new String [tagVocab.size()+1]; + for(String key:tagVocab.keySet()){ + tagdict[tagVocab.get(key)]=key; + } + tagdict[tagdict.length-1]=Corpus.UNK_TOK; + + String[] sent=c.get(idx); + int []data=c.getInt(idx); + + + String []roundtrip=new String [sent.length]; + for(int i=0;ivocab= + (HashMap) io.SerializedObjects.readSerializedObject(alphaFilename); + + Corpus test=new Corpus(testFilename,vocab); + + String [] dict=new String [vocab.size()+1]; + for(String key:vocab.keySet()){ + dict[vocab.get(key)]=key; + } + dict[dict.length-1]=Corpus.UNK_TOK; + + HMM hmm=new HMM(); + hmm.readModel(modelFilename); + + + + PrintStream ps=io.FileUtil.openOutFile(statsFilename); + + double [][] emit=hmm.getEmitProb(); + for(int i=0;il=new ArrayList(); + for(int j=0;j=dict.length){ + break; + } + ps.print(dict[l.get(j).idx]+"\t"); + if((1+j)%10==0){ + ps.println(); + } + } + ps.println("\n"); + } + + checkMaxwt(hmm,ps,test.getAllData()); + + int terminalSym=vocab.get(Corpus .END_SYM); + //sample 10 sentences + for(int i=0;i<10;i++){ + int []sent=hmm.sample(terminalSym); + for(int j=0;jval){ + return 1; + } + if(pair.val