summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation/prjava/src/test/HMMModelStats.java
blob: d54525c8b9aa9a29b3f6b4885d544ff5a6cd8c32 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package test;

import hmm.HMM;
import hmm.POS;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;

import data.Corpus;

public class HMMModelStats {

	public static String modelFilename="../posdata/posModel.out";
	public static String alphaFilename="../posdata/corpus.alphabet";
	public static String statsFilename="../posdata/model.stats";

	public static final int NUM_WORD=50;
	
	public static String testFilename="../posdata/en_test.conll";
	
	public static double [][]maxwt;
	
	public static void main(String[] args) {
		HashMap<String, Integer>vocab=
			(HashMap<String, Integer>) io.SerializedObjects.readSerializedObject(alphaFilename);
		
		Corpus test=new Corpus(testFilename,vocab);
		
		String [] dict=new String [vocab.size()+1];
		for(String key:vocab.keySet()){
			dict[vocab.get(key)]=key;
		}
		dict[dict.length-1]=Corpus.UNK_TOK;
		
		HMM hmm=new HMM();
		hmm.readModel(modelFilename);

		
		
		PrintStream ps = null;
		try {
			ps = io.FileUtil.printstream(new File(statsFilename));
		} catch (IOException e) {
			e.printStackTrace();
			System.exit(1);
		}
		
		double [][] emit=hmm.getEmitProb();
		for(int i=0;i<emit.length;i++){
			ArrayList<IntDoublePair>l=new ArrayList<IntDoublePair>();
			for(int j=0;j<emit[i].length;j++){
				l.add(new IntDoublePair(j,emit[i][j]));
			}
			Collections.sort(l);
			ps.println(i);
			for(int j=0;j<NUM_WORD;j++){
				if(j>=dict.length){
					break;
				}
				ps.print(dict[l.get(j).idx]+"\t");
				if((1+j)%10==0){
					ps.println();
				}
			}
			ps.println("\n");
		}
		
		checkMaxwt(hmm,ps,test.getAllData());
		
		int terminalSym=vocab.get(Corpus .END_SYM);
		//sample 10 sentences
		for(int i=0;i<10;i++){
			int []sent=hmm.sample(terminalSym);
			for(int j=0;j<sent.length;j++){
				ps.print(dict[sent[j]]+"\t");
			}
			ps.println();
		}
		
		ps.close();
		
	}
	
	public static void checkMaxwt(HMM hmm,PrintStream ps,int [][]data){
		double [][]emit=hmm.getEmitProb();
		maxwt=new double[emit.length][emit[0].length];
		
		hmm.computeMaxwt(maxwt,data);
		double sum=0;
		for(int i=0;i<maxwt.length;i++){
			for(int j=0;j<maxwt.length;j++){
				sum+=maxwt[i][j];
			}
		}
		
		ps.println("max w t P(w_i|t): "+sum);
		
	}
	
}