summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/scripts/extract_contexts_test.py
blob: 693b6e0b89bd26c6c68afa6fb1e867d1e1baee17 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python

import sys,collections

def tuple_to_str(t):
  """Return the items of *t* converted with str() and joined by '|'.

  *t* may be any iterable; an empty iterable yields the empty string.
  """
  # str.join builds the result in one pass instead of repeated `+=`.
  return "|".join(str(x) for x in t)

# Four positional arguments are required; the fifth (lowercase) is optional
# and is only read further down when it is present.
if len(sys.argv) < 5:
  # Usage errors go to stderr, and sys.exit is used because the bare exit()
  # helper is injected by the site module and is not guaranteed to exist.
  sys.stderr.write("Usage: extract-contexts_test.py output_filename vocab contexts order lowercase\n")
  sys.exit(1)

# ---- Command-line arguments and input tables --------------------------------
output_filename = sys.argv[1]
# Destination for the per-line "term:context" pairs; it is written and closed
# by the extraction loop that follows this section.
output = open(output_filename+".test_data",'w')

unk_term="-UNK-"

# Vocabulary file: one entry per line, the 0-based line number is the term id.
# If an entry is repeated, the last occurrence wins.
vocab_dict={}
with open(sys.argv[2], 'r') as vocab_file:
  for i,x in enumerate(vocab_file):
    vocab_dict[x.strip()]=i

# Contexts file: one context per line. contexts_dict maps context -> line
# number, contexts_list keeps the contexts in file order.
contexts_dict={}
contexts_list=[]
with open(sys.argv[3], 'r') as known_contexts_file:
  for i,x in enumerate(known_contexts_file):
    contexts_dict[x.strip()]=i
    contexts_list.append(x.strip())

# Number of tokens taken on each side of the focus token.
order = int(sys.argv[4])

# Optional fifth argument. bool() on a string is True for *any* non-empty
# value (including "0" and "false"), so the text is interpreted explicitly:
# the usual "off" spellings disable lowercasing, everything else enables it.
lowercase = False
if len(sys.argv) > 5:
  lowercase = sys.argv[5].strip().lower() not in ("", "0", "false", "no", "off")
if lowercase:
  unk_term = unk_term.lower()

# Sentence-boundary padding in the same "word|tag" form as the input tokens,
# so the first and last real tokens still get `order` neighbours on each side.
prefix = ["<s%d>|<s>"%i for i in range(order)]
suffix = ["</s%d>|</s>"%i for i in range(order)]

# ---- Main loop: one output line of "term:context" pairs per stdin line ------
# Unknown words are mapped to unk_term's id, so it has to be in the vocabulary.
# This is an explicit check rather than an assert, which `python -O` strips.
if unk_term not in vocab_dict:
  sys.stderr.write("Error: unknown-word term '%s' is not in the vocabulary\n" % unk_term)
  sys.exit(1)
unk_index = vocab_dict[unk_term]

try:
  for line in sys.stdin:
    # Pad the sentence with the boundary markers on both sides.
    tokens = prefix + line.split() + suffix
    if lowercase:
      tokens = [t.lower() for t in tokens]

    # Split every "word|tag" token once per line instead of once per window.
    # maxsplit=1 cuts only at the LAST '|', so a word that itself contains
    # '|' stays intact; a token without any '|' still raises ValueError.
    words = []
    for t in tokens:
      word, _tag = t.rsplit('|', 1)
      words.append(word)

    pairs = []
    for i in range(order, len(tokens) - order):
      # Context = the `order` words before and after position i, focus excluded.
      context = tuple_to_str(words[i-order:i] + words[i+1:i+order+1])
      if context not in contexts_dict:
        # The new id is the context's position in contexts_list, which is the
        # line it will occupy in the written contexts file. len(contexts_dict)
        # would drift from that if the loaded contexts contained duplicates.
        contexts_dict[context] = len(contexts_list)
        contexts_list.append(context)
      term = vocab_dict.get(words[i], unk_index)
      pairs.append("%d:%d" % (term, contexts_dict[context]))

    # Pairs are space-separated; an input line with no tokens gives an empty line.
    output.write(" ".join(pairs) + "\n")
finally:
  # Close the data file even when a malformed line aborts the run.
  output.close()

# Dump every known context, one per line, in index order.
contexts_file = open(output_filename + ".test_contexts", 'w')
contexts_file.writelines(c + "\n" for c in contexts_list)
contexts_file.close()