summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/contexts2documents.py
blob: 9be4ebbbb8c09a22e143da654572720410b17712 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/python

import sys
from operator import itemgetter

USAGE = "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]"


def _parse_line(line):
    """Split one input line into its phrase, context strings and counts.

    The line must contain exactly one tab separating the phrase from the
    tail. The tail is split on ``|||``: even-numbered fields are contexts
    (whitespace-stripped) and odd-numbered fields hold an integer count
    after an ``=`` sign.

    Returns:
        A ``(phrase, contexts, counts)`` tuple.

    Raises:
        ValueError: if the line does not contain exactly one tab, or a
            count is not an integer.
        IndexError: if a count field has no ``=``.
    """
    phrase, line_tail = line.split('\t')
    fields = line_tail.split('|||')
    contexts = [field.strip() for field in fields[0::2]]
    counts = [int(field.split('=')[1].strip()) for field in fields[1::2]]
    return phrase, contexts, counts


def _write_index(path, index):
    """Write the keys of *index* to *path*, one per line, ordered by id."""
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as out:
        for key, _ in sorted(index.items(), key=itemgetter(1)):
            out.write(key + "\n")


def main(argv=None, stdin=None, stdout=None):
    """Convert phrase/context lines into one "document" line per input line.

    For every input line, writes the number of contexts followed by
    ``context_id:count`` pairs, space separated. Context and phrase ids are
    assigned in order of first appearance, starting at 0.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]``, if
            present, receives the contexts index; ``argv[2]``, if present,
            receives the phrases index.
        stdin: iterable of input lines; defaults to ``sys.stdin``.
        stdout: writable text stream; defaults to ``sys.stdout``.

    Returns:
        The process exit status: 1 on a usage error, 0 otherwise.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    if len(argv) > 3:
        # stdout carries the generated documents, so report usage on stderr.
        sys.stderr.write(USAGE + "\n")
        return 1

    context_index = {}
    phrase_index = {}
    for line in stdin:
        # An empty line (e.g. a trailing blank line) carries no phrase.
        if not line.rstrip('\r\n'):
            continue
        phrase, contexts, counts = _parse_line(line)
        phrase_index.setdefault(phrase, len(phrase_index))
        tokens = [str(len(contexts))]
        for context, count in zip(contexts, counts):
            context_id = context_index.setdefault(context, len(context_index))
            tokens.append("%d:%d" % (context_id, count))
        stdout.write(" ".join(tokens) + "\n")

    # Index files are written only after all input has been consumed.
    if len(argv) > 1:
        _write_index(argv[1], context_index)
    if len(argv) == 3:
        _write_index(argv[2], phrase_index)
    return 0


if __name__ == "__main__":
    sys.exit(main())