summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation/prjava/src/test/HMMModelStats.java
blob: 26d7abec0481426aca399e736f566cc4ceb0e4c1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package test;

import hmm.HMM;
import hmm.POS;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;

import data.Corpus;

/**
 * Command-line utility that dumps diagnostic statistics about a trained HMM
 * to {@link #statsFilename}: for every emission row the first
 * {@link #NUM_WORD} words after sorting, the total of the "max weight"
 * matrix computed on the test corpus, and a handful of sampled sentences.
 */
public class HMMModelStats {

	/** Path of the serialized model handed to {@code HMM.readModel}. */
	public static String modelFilename="../posdata/posModel.out";
	/** Path of the serialized word-to-index alphabet. */
	public static String alphaFilename="../posdata/corpus.alphabet";
	/** Path of the report file written by {@link #main}. */
	public static String statsFilename="../posdata/model.stats";

	/** Maximum number of words listed per emission row. */
	public static final int NUM_WORD=50;

	/** Number of words printed before a line break in the per-row listing. */
	private static final int WORDS_PER_LINE=10;

	/** Number of sentences sampled from the model for the report. */
	private static final int NUM_SAMPLES=10;
	
	/** Path of the test corpus whose data is passed to {@link #checkMaxwt}. */
	public static String testFilename="../posdata/en_test.conll";
	
	/** Matrix allocated and summed by {@link #checkMaxwt}; same dimensions as the emission matrix. */
	public static double [][]maxwt;
	
	/**
	 * Loads the alphabet, the test corpus and the model, then writes the
	 * report to {@link #statsFilename}.
	 *
	 * @param args ignored
	 * @throws IllegalStateException if the alphabet has no entry for
	 *         {@code Corpus.END_SYM}
	 */
	public static void main(String[] args) {
		// The cast targets a parameterized type, which cannot be verified at
		// runtime; the suppression is limited to this single declaration.
		@SuppressWarnings("unchecked")
		HashMap<String, Integer>vocab=
			(HashMap<String, Integer>) io.SerializedObjects.readSerializedObject(alphaFilename);
		
		Corpus test=new Corpus(testFilename,vocab);
		
		// Invert the alphabet (index -> word). One extra slot at the end is
		// labelled with the unknown-word token.
		String [] dict=new String [vocab.size()+1];
		for(String key:vocab.keySet()){
			dict[vocab.get(key)]=key;
		}
		dict[dict.length-1]=Corpus.UNK_TOK;
		
		HMM hmm=new HMM();
		hmm.readModel(modelFilename);
		
		PrintStream ps=io.FileUtil.openOutFile(statsFilename);
		try{
			printTopWords(ps,hmm.getEmitProb(),dict);
			
			checkMaxwt(hmm,ps,test.getAllData());
			
			// Look the end symbol up explicitly so that a missing entry is
			// reported clearly instead of failing during auto-unboxing.
			Integer terminalSym=vocab.get(Corpus.END_SYM);
			if(terminalSym==null){
				throw new IllegalStateException(
					"alphabet "+alphaFilename+" has no entry for end symbol "+Corpus.END_SYM);
			}
			printSamples(ps,hmm,dict,terminalSym.intValue());
		}finally{
			// Release the report file even when one of the steps above throws.
			if(ps!=null){
				ps.close();
			}
		}
	}
	
	/**
	 * For each row of {@code emit}, prints the row index followed by up to
	 * {@link #NUM_WORD} words, tab-separated, {@link #WORDS_PER_LINE} per line.
	 * Words are listed in the order produced by {@code Collections.sort} on
	 * {@code IntDoublePair}; that ordering is defined by the pair class itself
	 * (presumably highest probability first -- confirm against IntDoublePair).
	 */
	private static void printTopWords(PrintStream ps,double [][]emit,String []dict){
		for(int i=0;i<emit.length;i++){
			ArrayList<IntDoublePair>ranked=new ArrayList<IntDoublePair>(emit[i].length);
			for(int j=0;j<emit[i].length;j++){
				ranked.add(new IntDoublePair(j,emit[i][j]));
			}
			Collections.sort(ranked);
			ps.println(i);
			// Bound by the list actually being indexed as well as by the
			// dictionary size, so short rows no longer run off the end.
			int limit=Math.min(NUM_WORD,Math.min(ranked.size(),dict.length));
			for(int j=0;j<limit;j++){
				ps.print(dict[ranked.get(j).idx]+"\t");
				if((1+j)%WORDS_PER_LINE==0){
					ps.println();
				}
			}
			ps.println("\n");
		}
	}
	
	/**
	 * Draws {@link #NUM_SAMPLES} sentences with {@code hmm.sample(terminalSym)}
	 * and prints each one on its own line as tab-separated words.
	 */
	private static void printSamples(PrintStream ps,HMM hmm,String []dict,int terminalSym){
		for(int i=0;i<NUM_SAMPLES;i++){
			int []sent=hmm.sample(terminalSym);
			for(int j=0;j<sent.length;j++){
				ps.print(dict[sent[j]]+"\t");
			}
			ps.println();
		}
	}
	
	/**
	 * Allocates {@link #maxwt} with the dimensions of the model's emission
	 * matrix, passes it to {@code hmm.computeMaxwt} together with
	 * {@code data}, and prints the sum of all its entries to {@code ps}.
	 *
	 * @param hmm  model supplying the emission matrix and {@code computeMaxwt}
	 * @param ps   stream the summary line is written to
	 * @param data corpus data forwarded unchanged to {@code computeMaxwt}
	 */
	public static void checkMaxwt(HMM hmm,PrintStream ps,int [][]data){
		double [][]emit=hmm.getEmitProb();
		// An empty emission matrix has no first row to take the width from.
		int numCols=emit.length==0?0:emit[0].length;
		maxwt=new double[emit.length][numCols];
		
		hmm.computeMaxwt(maxwt,data);
		double sum=0;
		for(int i=0;i<maxwt.length;i++){
			// Iterate over the row's own length: the matrix is not square, so
			// using the row count here would skip columns or overrun the row.
			for(int j=0;j<maxwt[i].length;j++){
				sum+=maxwt[i][j];
			}
		}
		
		ps.println("max w t P(w_i|t)"+sum);
		
	}
	
}