diff options
Diffstat (limited to 'gi')
| -rw-r--r-- | gi/evaluation/entropy.py | 4 | ||||
| -rw-r--r-- | gi/evaluation/evaluate_entropy.py | 117 | ||||
| -rw-r--r-- | gi/evaluation/extract_ccg_labels.py | 10 | 
3 files changed, 126 insertions, 5 deletions
| diff --git a/gi/evaluation/entropy.py b/gi/evaluation/entropy.py index cef0dbb4..ec1ef502 100644 --- a/gi/evaluation/entropy.py +++ b/gi/evaluation/entropy.py @@ -26,8 +26,8 @@ for line in infile:          tag = part.split(':',1)[1]          if slash_threshold == None or tag.count('/') + tag.count('\\') <= slash_threshold: -            frequencies.setdefault(gtag, 0) -            frequencies[gtag] += 1 +            frequencies.setdefault(tag, 0) +            frequencies[tag] += 1              N += 1  h = 0 diff --git a/gi/evaluation/evaluate_entropy.py b/gi/evaluation/evaluate_entropy.py new file mode 100644 index 00000000..43edc376 --- /dev/null +++ b/gi/evaluation/evaluate_entropy.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +import sys, math, itertools + +ginfile = open(sys.argv[1]) +pinfile = open(sys.argv[2]) +if len(sys.argv) > 3: +    slash_threshold = int(sys.argv[3]) +    #print >>sys.stderr, 'slash threshold', slash_threshold +else: +    slash_threshold = 99999 + +# evaluating: H(G | P) = sum_{g,p} p(g,p) log { p(p) / p(g,p) } +#                      = sum_{g,p} c(g,p)/N { log c(p) - log N - log c(g,p) + log N } +#                      = 1/N sum_{g,p} c(g,p) { log c(p) - log c(g,p) } +# where G = gold, P = predicted, N = number of events + +N = 0 +gold_frequencies = {} +predict_frequencies = {} +joint_frequencies = {} + +for gline, pline in itertools.izip(ginfile, pinfile): +    gparts = gline.split('||| ')[1].split() +    pparts = pline.split('||| ')[1].split() +    assert len(gparts) == len(pparts) + +    for gpart, ppart in zip(gparts, pparts): +        gtag = gpart.split(':',1)[1] +        ptag = ppart.split(':',1)[1] + +        if gtag.count('/') + gtag.count('\\') <= slash_threshold: +            joint_frequencies.setdefault((gtag, ptag), 0) +            joint_frequencies[gtag,ptag] += 1 + +            predict_frequencies.setdefault(ptag, 0) +            predict_frequencies[ptag] += 1 + +            gold_frequencies.setdefault(gtag, 0) +            gold_frequencies[gtag] += 1 + +            N += 1 + +hg2p = 0 +hp2g = 0 +for (gtag, ptag), cgp in joint_frequencies.items(): +    hp2g += cgp * (math.log(predict_frequencies[ptag], 2) - math.log(cgp, 2)) +    hg2p += cgp * (math.log(gold_frequencies[gtag], 2) - math.log(cgp, 2)) +hg2p /= N +hp2g /= N + +hg = 0 +for gtag, c in gold_frequencies.items(): +    hg -= c * (math.log(c, 2) - math.log(N, 2)) +hg /= N + +print 'H(P|G)', hg2p, 'H(G|P)', hp2g, 'VI', hg2p + hp2g, 'H(G)', hg +#sys.exit(0) + +# find top tags +gtags = gold_frequencies.items() +gtags.sort(lambda x,y: x[1]-y[1]) +gtags.reverse() +#gtags = gtags[:50] + +print '%7s %7s' % ('pred', 'cnt'), +for gtag, gcount in gtags: print '%7s' % gtag, +print +print '=' * 80 + +preds = predict_frequencies.items() +preds.sort(lambda x,y: x[1]-y[1]) +preds.reverse() +for ptag, pcount in preds: +    print '%7s %7d' % (ptag, pcount), +    for gtag, gcount in gtags: +	print '%7d' % joint_frequencies.get((gtag, ptag), 0), +    print + +print '%7s %7d' % ('total', N), +for gtag, gcount in gtags: print '%7d' % gcount, +print + +if len(sys.argv) > 4: +	# needs Python Image Library (PIL) +	import Image, ImageDraw + +	offset=10 + +	image = Image.new("RGB", (len(preds), len(gtags)), (255, 255, 255)) +	#hsl(hue, saturation%, lightness%) + +	# resort preds to get a better diagonal +	ptags = [] +	remaining = set(predict_frequencies.keys()) +	for y, (gtag, gcount) in enumerate(gtags): +	    best = (None, 0) +	    for ptag in remaining: +		#pcount = predict_frequencies[ptag] +		p = joint_frequencies.get((gtag, ptag), 0)# / float(pcount) +		if p > best[1]: best = (ptag, p) +	    ptags.append(ptag) +	    remaining.remove(ptag) +	    if not remaining: break +	 +	draw = ImageDraw.Draw(image) +	for x, ptag in enumerate(ptags): +	    pcount = predict_frequencies[ptag] +	    minval = math.log(offset) +	    maxval = math.log(pcount + offset) +	    for y, (gtag, gcount) in enumerate(gtags): +		f = math.log(offset + joint_frequencies.get((gtag, ptag), 0)) +		z = int(240. * (maxval - f) / float(maxval - minval)) +		#print x, y, z, f, maxval +		draw.point([(x,y)], fill='hsl(%d, 100%%, 50%%)' % z) +	del draw +	image.save(sys.argv[4]) diff --git a/gi/evaluation/extract_ccg_labels.py b/gi/evaluation/extract_ccg_labels.py index 77f21004..e0034648 100644 --- a/gi/evaluation/extract_ccg_labels.py +++ b/gi/evaluation/extract_ccg_labels.py @@ -90,10 +90,13 @@ for tline, eline in itertools.izip(tinfile, einfile):      else:          tr = None -    zh, en, spans = eline.strip().split(" ||| ") +    parts = eline.strip().split(" ||| ") +    zh, en = parts[:2] +    spans = parts[-1]      print '|||',      for span in spans.split(): -        i, j, x, y = map(int, span.split("-")) +        sps = span.split(":") +        i, j, x, y = map(int, sps[0].split("-"))          if tr:              a = ancestor(tr, range(x,y)) @@ -113,7 +116,8 @@ for tline, eline in itertools.izip(tinfile, einfile):                      cat += '\\' + f.data.tag                  else:                      break -            for f in reversed(fs): +            fs.reverse() +            for f in fs:                  if f.left >= y:                      cat += '/' + f.data.tag                  else: | 
