diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
commit | 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch) | |
tree | 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/contexts2documents.py | |
parent | 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff) |
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/contexts2documents.py')
-rwxr-xr-x | gi/pyp-topics/scripts/contexts2documents.py | 29 |
1 files changed, 29 insertions, 0 deletions
#!/usr/bin/python
"""contexts2documents.py: turn phrase/context count lines into documents.

Reads lines from stdin of the form

    phrase<TAB>context ||| label=count ||| context ||| label=count ...

and writes one "document" line per input line to stdout:

    N id:count id:count ...

where N is the number of ``id:count`` pairs on the line and ``id`` is a
0-based integer assigned to each distinct context string in order of first
appearance.  If a file name is given as the single optional argument, the
context strings are written to it, one per line, ordered by their id.
"""

import sys
from operator import itemgetter

USAGE = "Usage: contexts2documents.py [contexts_index_out]"


def parse_line(line):
    """Return the list of (context, count) pairs found on one input line.

    The line must contain exactly one tab; the text after it is split on
    '|||', with even-numbered fields taken as contexts and odd-numbered
    fields as 'label=count' entries.

    Raises:
        ValueError: if the line does not contain exactly one tab, or a
            count is not an integer.
        IndexError: if a count field contains no '='.
    """
    _, line_tail = line.split('\t')
    fields = line_tail.split('|||')
    contexts = [field.strip() for field in fields[0::2]]
    counts = [int(field.split('=')[1].strip()) for field in fields[1::2]]
    # zip() stops at the shorter list, so a context with no matching count
    # field is dropped here rather than emitted without a count.
    return list(zip(contexts, counts))


def format_document(pairs, context_index):
    """Format (context, count) pairs as 'N id:count ...' (no newline).

    Contexts not yet present in ``context_index`` are added to it with the
    next free id (the current size of the mapping).
    """
    # Report the number of pairs actually written so the header always
    # agrees with what follows it, even for malformed input lines.
    tokens = [str(len(pairs))]
    for context, count in pairs:
        context_id = context_index.setdefault(context, len(context_index))
        tokens.append("%d:%d" % (context_id, count))
    return " ".join(tokens)


def iter_documents(lines, context_index):
    """Yield one formatted document string per input line.

    ``context_index`` is updated in place as new contexts are encountered.
    """
    for line in lines:
        yield format_document(parse_line(line), context_index)


def ordered_contexts(context_index):
    """Return the context strings sorted by their assigned id."""
    return [context
            for context, _ in sorted(context_index.items(), key=itemgetter(1))]


def write_context_index(context_index, path):
    """Write the context strings to ``path``, one per line, in id order."""
    # 'with' guarantees the file is closed even if a write fails.
    with open(path, 'w') as contexts_out:
        for context in ordered_contexts(context_index):
            contexts_out.write(context + "\n")


def main(argv=None):
    """Run the conversion from stdin to stdout; return the exit status.

    ``argv`` defaults to ``sys.argv``.  More than one argument after the
    program name is a usage error (status 1).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) > 2:
        # Diagnostics go to stderr so they never mix with document output.
        sys.stderr.write(USAGE + "\n")
        return 1

    context_index = {}
    for document in iter_documents(sys.stdin, context_index):
        sys.stdout.write(document + "\n")

    if len(argv) == 2:
        write_context_index(context_index, argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())