diff options
Diffstat (limited to 'gi/pyp-topics/scripts/contexts2documents.py')
-rwxr-xr-x | gi/pyp-topics/scripts/contexts2documents.py | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py new file mode 100755 index 00000000..c625d17d --- /dev/null +++ b/gi/pyp-topics/scripts/contexts2documents.py @@ -0,0 +1,29 @@ +#!/usr/bin/python + +import sys +from operator import itemgetter + +if len(sys.argv) > 2: + print "Usage: contexts2documents.py [contexts_index_out]" + exit(1) + +context_index = {} +for line in sys.stdin: + phrase, line_tail = line.split('\t') + + raw_contexts = line_tail.split('|||') + contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0] + counts = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0] + + print len(contexts), + for context,count in zip(contexts,counts): + c = context_index.setdefault(context, len(context_index)) + print "%d:%d" % (c,count), + print +if len(sys.argv) == 2: + contexts_out = open(sys.argv[1],'w') + contexts = context_index.items() + contexts.sort(key = itemgetter(1)) + for context in contexts: + print >>contexts_out, context[0] + contexts_out.close() |