diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-20 18:37:04 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-20 18:37:04 +0000 |
commit | f281f2deac864d57a0eb566ae1f1c203ee5a8623 (patch) | |
tree | 9de5753f91edab5b89fd40152360f0e7135818cb /gi/evaluation/entropy.py | |
parent | 9380fb4819f3ed56cb7ad77a43728718039389cc (diff) |
Cleaned up scripts
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@336 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/evaluation/entropy.py')
-rw-r--r-- | gi/evaluation/entropy.py | 38 |
1 files changed, 38 insertions, 0 deletions
#!/usr/bin/env python
"""Report the entropy (in bits) of the tag distribution found in a file.

Script: gi/evaluation/entropy.py

Each input line is split on the separator '||| '; the second field is
treated as a whitespace-separated list of items of the form
``<prefix>:<tag>``.  The tags are counted over the whole file and the
entropy of their empirical distribution is printed as ``entropy <value>``.

Usage: entropy.py [-s slash_threshold] input_file
"""

import sys
import math
import itertools
import getopt


def usage():
    """Write the usage message to stderr and terminate the program."""
    sys.stderr.write('Usage: %s [-s slash_threshold] input file\n'
                     % sys.argv[0])
    sys.exit(0)


def count_tags(lines, slash_threshold=None):
    """Count tag occurrences over an iterable of input lines.

    lines           -- iterable of strings, each containing '||| ' followed
                       by whitespace-separated ``<prefix>:<tag>`` items.
    slash_threshold -- when not None, only tags whose combined number of
                       '/' and '\\' characters is at most this value are
                       counted; other tags are ignored entirely.

    Returns a ``(frequencies, total)`` pair: a dict mapping each counted
    tag to its number of occurrences, and the sum of those counts.

    Raises IndexError for a line without the '||| ' separator or an item
    without a ':'.
    """
    frequencies = {}
    total = 0
    for line in lines:
        for part in line.split('||| ')[1].split():
            # Only the first ':' separates prefix and tag; the tag itself
            # may contain further colons.
            tag = part.split(':', 1)[1]
            if slash_threshold is not None:
                slashes = tag.count('/') + tag.count('\\')
                if slashes > slash_threshold:
                    continue
            frequencies[tag] = frequencies.get(tag, 0) + 1
            total += 1
    return frequencies, total


def entropy(frequencies, total):
    """Return the entropy in bits of a distribution given as counts.

    frequencies -- dict mapping an outcome to its positive count.
    total       -- the sum of all counts in ``frequencies``.

    Computes -sum(c/total * log2(c/total)).  When ``total`` is zero (no
    tags were counted) there is no distribution to measure and 0.0 is
    returned instead of dividing by zero.
    """
    if not total:
        return 0.0
    h = 0
    for c in frequencies.values():
        # log2(c / total) written as a difference of logs, weighted by c;
        # the common 1/total factor is applied once after the loop.
        h -= c * (math.log(c, 2) - math.log(total, 2))
    h /= total
    return h


def main(argv=None):
    """Parse the command line, read the input file and print the entropy.

    argv -- argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    optlist, args = getopt.getopt(argv, 'hs:')
    slash_threshold = None
    for opt, arg in optlist:
        if opt == '-s':
            slash_threshold = int(arg)
        else:
            # The only other accepted option is -h.
            usage()
    if len(args) != 1:
        usage()

    # The context manager guarantees the file is closed even if a
    # malformed line raises while counting.
    with open(args[0]) as infile:
        frequencies, total = count_tags(infile, slash_threshold)

    sys.stdout.write('entropy %s\n' % entropy(frequencies, total))


if __name__ == '__main__':
    main()