1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
#!/usr/bin/python
import sys
from operator import itemgetter
# Label phrase spans with categories looked up in a phrase/context index.
#
# Command line:
#   spans2labels.py phrase_context_index [order] [threshold] [languages]
#
# * phrase_context_index: file with one phrase per line, in the form
#   "phrase<TAB>context ||| features ||| context ||| features ...".
# * order: number of context words taken on each side of a phrase (default 1).
# * threshold: categories whose P feature is below this value are replaced by
#   the cut-off category (default 0).
# * languages: two letters, each one of s (source), t (target) or b (both);
#   the first selects the phrase side, the second the context side
#   (default "tt").
#
# Lines of the form "source ||| target ||| s1-s2-t1-t2 ..." are read from
# standard input; one "||| s1-s2-t1-t2:Xcategory ..." line is written to
# standard output per input line.

USAGE = "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]"

# Category assigned to entries below the threshold and to unknown spans;
# spans carrying this category are not printed.
cutoff_cat = "<UNK>"


def load_phrase_context_index(lines, threshold=0):
    """Build the (phrase, context) -> category mapping from index lines.

    lines is any iterable of text lines (for example an open file). Each line
    must contain exactly one tab separating the phrase from a " ||| "
    separated list alternating contexts and feature strings such as
    "C=1 P=0.8". A leading field made of exactly two integers is dropped.
    Lines left with a single field are skipped.

    The category is the value of the C feature; when a P feature is present
    and smaller than threshold the category becomes cutoff_cat.

    Raises ValueError if a line does not contain exactly one tab or if its
    contexts and feature strings do not pair up, and KeyError if a feature
    string has no C entry.
    """
    index = {}
    for line in lines:
        phrase, tail = line.split('\t')
        fields = tail.split(" ||| ")

        # Some index files start the list with a pair of integers that is not
        # a context; drop it when (and only when) both tokens parse as ints.
        try:
            first, second = fields[0].split()
            int(first)
            int(second)
        except ValueError:
            pass
        else:
            fields = fields[1:]

        if len(fields) == 1:
            continue
        if len(fields) % 2 != 0:
            raise ValueError(
                "contexts and features do not pair up in index line: %r" % line)

        for i in range(0, len(fields), 2):
            # fields[i + 1] looks like " C=1 P=0.8 ... "
            features = dict(keyval.split('=') for keyval in fields[i + 1].split())
            category = features['C']
            if 'P' in features and float(features['P']) < threshold:
                category = cutoff_cat
            index[(phrase, fields[i])] = category
    return index


def _phrase_and_context(tokens, start, end, order):
    """Return the phrase tokens[start:end] and its "<PHRASE>" context string.

    The context is the order tokens before start, each followed by a space,
    then "<PHRASE> ", then the order tokens after end joined by spaces.
    """
    phrase = " ".join(tokens[start:end])
    left = "".join(token + " " for token in tokens[start - order:start])
    right = " ".join(tokens[end:end + order])
    return phrase, "%s<PHRASE> %s" % (left, right)


def label_line(line, phrase_context_index, order=1, phr='t', ctx='t'):
    """Return the labelled-span output (without newline) for one input line.

    line has the form "source ||| target ||| s1-s2-t1-t2 ...". Each span is
    looked up in phrase_context_index under a (phrase, context) key built from
    the source side ('s'), the target side ('t') or both ('b', target first,
    joined by " <SPLIT> "), as selected by phr and ctx. Spans that are missing
    from the index or mapped to cutoff_cat are left out of the result.
    """
    segments = line.split('|||')
    # Pad both sentences so that context windows at the edges are well defined.
    left_pad = ['<s>'] * order
    right_pad = ['</s>'] * order
    source = left_pad + segments[0].split() + right_pad
    target = left_pad + segments[1].split() + right_pad
    spans = [[int(i) for i in span.split('-')] for span in segments[2].split()]

    output = ["|||"]
    for s1, s2, t1, t2 in spans:
        # Span indices refer to the unpadded sentences, hence the shift.
        source_phrase, source_context = _phrase_and_context(
            source, s1 + order, s2 + order, order)
        target_phrase, target_context = _phrase_and_context(
            target, t1 + order, t2 + order, order)

        if phr == 'b':
            phrase = target_phrase + ' <SPLIT> ' + source_phrase
        elif phr == 's':
            phrase = source_phrase
        else:
            phrase = target_phrase

        if ctx == 'b':
            context = target_context + ' <SPLIT> ' + source_context
        elif ctx == 's':
            context = source_context
        else:
            context = target_context

        label = phrase_context_index.get((phrase, context), cutoff_cat)
        if label != cutoff_cat:  # cut-off spans are left unlabelled
            output.append("%d-%d-%d-%d:X%s" % (s1, s2, t1, t2, label))
    return " ".join(output)


def main(argv=None):
    """Run the script: load the index, then label every line of stdin.

    argv defaults to sys.argv. Exits with status 1 after printing the usage
    message when the index file argument is missing or the languages argument
    is malformed.
    """
    if argv is None:
        argv = sys.argv
    # Only the index file is mandatory; the remaining arguments have defaults.
    if len(argv) < 2:
        sys.stdout.write(USAGE + "\n")
        sys.exit(1)

    order = 1
    threshold = 0
    phr = ctx = 't'
    if len(argv) > 2:
        order = int(argv[2])
    if len(argv) > 3:
        threshold = float(argv[3])
    if len(argv) > 4:
        languages = argv[4]
        if len(languages) != 2 or any(c not in ('s', 't', 'b') for c in languages):
            sys.stderr.write("Invalid languages argument: %r\n" % languages)
            sys.stdout.write(USAGE + "\n")
            sys.exit(1)
        phr, ctx = languages

    sys.stderr.write("Loading phrase index\n")
    with open(argv[1], 'r') as index_file:
        phrase_context_index = load_phrase_context_index(index_file, threshold)

    sys.stderr.write("Labelling spans\n")
    for line in sys.stdin:
        sys.stdout.write(
            label_line(line, phrase_context_index, order, phr, ctx) + "\n")


if __name__ == "__main__":
    main()
|