summaryrefslogtreecommitdiff
path: root/gi/evaluation/conditional_entropy.py
blob: 356d3b1d055090d17a378cfcb3b84938d9b96e7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python

import sys, math, itertools, getopt

def usage():
    """Write the command-line synopsis to stderr and terminate the script.

    The message names the script via ``sys.argv[0]``.  The process is then
    ended with ``sys.exit(0)``, so this function never returns normally.
    """
    # sys.stderr.write() is used instead of the Python-2-only
    # ``print >>sys.stderr, ...`` statement, which raises TypeError when run
    # under Python 3.  The emitted text (space-separated pieces followed by
    # a newline) is the same on both interpreters.
    pieces = ['Usage:', sys.argv[0], '[-s slash_threshold] input-1 input-2']
    sys.stderr.write(' '.join(pieces) + '\n')
    sys.exit(0)

# Parse the command line.  '-s N' sets an integer slash threshold; any other
# accepted flag (i.e. '-h') prints the usage line and exits.  Exactly two
# positional arguments (the two input files) are required.
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'hs:')
except getopt.GetoptError as err:
    # An unknown option or a '-s' without its argument would otherwise
    # surface as a raw traceback; report it the conventional way instead.
    sys.stderr.write('%s: %s\n' % (sys.argv[0], err))
    sys.exit(2)

slash_threshold = None  # stays None unless -s is given
for opt, arg in optlist:
    if opt == '-s':
        try:
            slash_threshold = int(arg)
        except ValueError:
            # int() rejects non-numeric input; fail with a readable message.
            sys.stderr.write('%s: -s expects an integer, got %r\n'
                             % (sys.argv[0], arg))
            sys.exit(2)
    else:
        usage()
if len(args) != 2:
    usage()

# evaluating: H(G | P) = sum_{g,p} p(g,p) log { p(p) / p(g,p) }
#                      = sum_{g,p} c(g,p)/N { log c(p) - log N - log c(g,p) + log N }
#                      = 1/N sum_{g,p} c(g,p) { log c(p) - log c(g,p) }
# where G = gold, P = predicted, N = number of events

# Event counts gathered from the two inputs:
#   N                   -- number of (gold, predicted) tag pairs counted
#   gold_frequencies    -- gold tag             -> count
#   predict_frequencies -- predicted tag        -> count
#   joint_frequencies   -- (gold tag, predicted tag) -> count
N = 0
gold_frequencies = {}
predict_frequencies = {}
joint_frequencies = {}

# Both files are read in lockstep, one line from each per iteration.  The
# 'with' blocks guarantee the files are closed even if a line is malformed.
with open(args[0]) as ginfile:
    with open(args[1]) as pinfile:
        for lineno, gline in enumerate(ginfile, 1):
            pline = pinfile.readline()
            if not pline:
                # Previously izip() stopped silently at the shorter file.
                raise ValueError('%s ends before %s (at line %d)'
                                 % (args[1], args[0], lineno))

            # The items of interest are the whitespace-separated tokens in
            # the second '||| '-delimited field of each line.
            gparts = gline.split('||| ')[1].split()
            pparts = pline.split('||| ')[1].split()
            if len(gparts) != len(pparts):
                # A real exception rather than an assert, so the check
                # still runs under 'python -O'.
                raise ValueError('line %d: %d gold items but %d predicted items'
                                 % (lineno, len(gparts), len(pparts)))

            for gpart, ppart in zip(gparts, pparts):
                # Each token is '<something>:<tag>'; keep what follows the
                # first colon.
                gtag = gpart.split(':', 1)[1]
                ptag = ppart.split(':', 1)[1]

                # With a threshold set, skip events whose gold tag contains
                # more than that many '/' and '\' characters in total.
                if slash_threshold is not None:
                    slashes = gtag.count('/') + gtag.count('\\')
                    if slashes > slash_threshold:
                        continue

                joint_key = (gtag, ptag)
                joint_frequencies[joint_key] = joint_frequencies.get(joint_key, 0) + 1
                predict_frequencies[ptag] = predict_frequencies.get(ptag, 0) + 1
                gold_frequencies[gtag] = gold_frequencies.get(gtag, 0) + 1
                N += 1

        if pinfile.readline():
            # The predicted file still has data after the gold file ended.
            raise ValueError('%s has more lines than %s' % (args[1], args[0]))

# Turn the counts into the two conditional entropies (in bits, log base 2)
# and their sum, then print them on one line.
if N == 0:
    # No events were counted (empty input, or everything was filtered out),
    # so the averages below would divide by zero.
    sys.stderr.write('%s: no events to evaluate\n' % sys.argv[0])
    sys.exit(1)

# Naming note: hg2p accumulates H(P|G) (it conditions on the gold counts)
# and hp2g accumulates H(G|P) (it conditions on the predicted counts),
# matching the labels printed below.
hg2p = 0.0
hp2g = 0.0
for (gtag, ptag), cgp in joint_frequencies.items():
    log_cgp = math.log(cgp, 2)
    hp2g += cgp * (math.log(predict_frequencies[ptag], 2) - log_cgp)
    hg2p += cgp * (math.log(gold_frequencies[gtag], 2) - log_cgp)
hg2p /= N
hp2g /= N

# Written with sys.stdout.write() instead of the Python-2-only print
# statement so the script also runs on Python 3.  Items are converted with
# str() and joined by single spaces, which is what the print statement did.
report = ['H(P|G)', hg2p, 'H(G|P)', hp2g, 'VI', hg2p + hp2g]
sys.stdout.write(' '.join(str(item) for item in report) + '\n')
e2 61 fd 6a ce d6 51 df dd b0 75 2a ae 5f fe e7 q..w.}.9rZy..Q...a.j..Q...u*._.. 0260 51 86 47 8b f5 ab 3f 7f eb 31 d1 56 2a 87 b4 5f 44 11 1d bc 5b 12 4b f9 fd f1 f5 e9 a7 6f bb a7 Q.G...?..1.V*.._D...[.K......o.. 0280 97 87 fd 53 d7 30 a8 95 95 12 94 c2 24 a8 42 e1 26 b5 ff 53 35 7e 12 b7 06 7c af b4 bd aa a7 51 ...S.0......$.B.&..S5~..