summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/score-mkcls.py
blob: 6bd33fc5d70562643451284f3ad974d281ceb913 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python

import sys
from collections import defaultdict

def dict_max(d):
  """Return the key of `d` whose value is largest.

  When several keys share the largest value, the one that comes first in
  `d`'s iteration order is returned (max() keeps the first maximal item).

  Raises ValueError if `d` is empty, since there is no key to return.
  """
  if not d:
    raise ValueError("dict_max() requires a non-empty dict")
  # Let max() compare the actual values instead of seeding a running
  # maximum with -1 and asserting on the key's truthiness: that approach
  # rejected falsy keys such as 0 or '' and never selected a key when every
  # value was -1 or lower.
  return max(d, key=d.get)

# Script body: score a term->class assignment against gold-tagged text.
#
#   argv[1] (gold):    whitespace-separated tokens of the form "term|tag";
#                      the tag is whatever follows the last '|'.
#   argv[2] (classes): one whitespace-separated "term class" pair per line.
#
# Every class is mapped to the gold tag it co-occurs with most often.  Each
# gold token is echoed to stdout as "term|tag|class|mapped_tag", and the
# fraction of tokens whose gold tag equals the mapped tag of their class is
# reported on stderr as the many-to-one accuracy.
#
# print() is only ever called with a single string argument and stderr is
# written directly, so the output is the same under Python 2 and Python 3.
if len(sys.argv) != 3:
  print("Usage: score-mkcls.py gold classes")
  sys.exit(1)

# term -> class, as listed in the classes file.
term_to_topics = {}
with open(sys.argv[2], 'r') as classes_file:
  for line in classes_file:
    fields = line.split()
    if not fields:
      continue  # tolerate blank lines instead of failing the unpack below
    term, cls = fields
    term_to_topics[term] = cls

# Co-occurrence counts in both directions:
#   gold_to_topics[tag][class] and topics_to_gold[class][tag].
gold_to_topics = defaultdict(dict)
topics_to_gold = defaultdict(dict)

# First pass over the gold file: collect the counts.
with open(sys.argv[1], 'r') as gold_file:
  for gold_line in gold_file:
    for gold_token in gold_line.split():
      gold_term, gold_tag = gold_token.rsplit('|', 1)
      # A term missing from the classes file raises KeyError here.
      pred_token = term_to_topics[gold_term]
      tag_counts = gold_to_topics[gold_tag]
      tag_counts[pred_token] = tag_counts.get(pred_token, 0) + 1
      topic_counts = topics_to_gold[pred_token]
      topic_counts[gold_tag] = topic_counts.get(gold_tag, 0) + 1

# The counts are final now, so resolve each class to its most frequent gold
# tag once rather than rescanning the counts for every token.
topic_to_best_tag = {}
for topic in topics_to_gold:
  topic_to_best_tag[topic] = dict_max(topics_to_gold[topic])

# Second pass: echo each token with its class and mapped tag, and score it.
pred = 0
correct = 0
with open(sys.argv[1], 'r') as gold_file:
  for gold_line in gold_file:
    scored_tokens = []
    for gold_token in gold_line.split():
      gold_term, gold_tag = gold_token.rsplit('|', 1)
      pred_token = term_to_topics[gold_term]
      best_tag = topic_to_best_tag[pred_token]
      scored_tokens.append("%s|%s|%s" % (gold_token, pred_token, best_tag))
      pred += 1
      if gold_tag == best_tag:
        correct += 1
    # One output line per gold line, tokens separated by single spaces.
    print(" ".join(scored_tokens))

if pred == 0:
  # No tokens were scored; report that instead of dividing by zero.
  sys.stderr.write("Many-to-One Accuracy undefined: no tokens in gold file\n")
  sys.exit(1)
sys.stderr.write("Many-to-One Accuracy = %f\n" % (float(correct) / pred))
#for x in gold_to_topics: 
#  print x,dict_max(gold_to_topics[x])
#print "###################################################"
#for x in range(len(topics_to_gold)): 
#  print x,dict_max(topics_to_gold[str(x)])
#  print x,topics_to_gold[str(x)]
#print term_to_topics