summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/scripts/spans2labels.py
diff options
context:
space:
mode:
author: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-29 17:06:03 +0000
committer: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-29 17:06:03 +0000
commit: fa91c76321460773e0de7fc077db5a3c919eb89b (patch)
tree: 097ba6452ddc389283d1fb2956357dcb9576ea46 /gi/pyp-topics/scripts/spans2labels.py
parent: 90f4ede0eea7ec5b73e7167dde4fe590485dcd67 (diff)
Added different executable for working with context models, changed input/output processing.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@54 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/spans2labels.py')
-rwxr-xr-x gi/pyp-topics/scripts/spans2labels.py 47
1 files changed, 23 insertions, 24 deletions
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 409fda92..3221dbf0 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -3,44 +3,43 @@
import sys
from operator import itemgetter
-if len(sys.argv) != 4:
- print "Usage: spans2labels.py phrase_index context_index phrase_context_index"
+if len(sys.argv) <= 2:
+ print "Usage: spans2labels.py phrase_context_index [order]"
exit(1)
-phrase_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[1], 'r').readlines())))
-context_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[2], 'r').readlines())))
+order=1
+if len(sys.argv) > 2:
+ order = int(sys.argv[2])
phrase_context_index = {}
-for i,line in enumerate(file(sys.argv[3], 'r').readlines()):
- for c,l in map(lambda x: x.split(':'), line.split()[1:]):
- phrase_context_index[(int(i),int(c))] = l
+for line in file(sys.argv[1], 'r'):
+ phrase,tail= line.split('\t')
+ contexts = tail.split(" ||| ")
+ assert len(contexts) % 2 == 0
+ for i in range(0, len(contexts), 2):
+ category = contexts[i+1].split("=")[1].strip()
+ phrase_context_index[(phrase,contexts[i])] = category
+# print (phrase,contexts[i]), category
for line in sys.stdin:
line_segments = line.split('|||')
- source = ['<s>'] + line_segments[0].split() + ['</s>']
- target = ['<s>'] + line_segments[1].split() + ['</s>']
+ source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]
+ target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]
phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()]
-# for x in source[1:-1]:
-# print x,
-# print "|||",
-# for x in target[1:-1]:
-# print x,
print "|||",
for s1,s2,t1,t2 in phrases:
- s1 += 1
- s2 += 1
- t1 += 1
- t2 += 1
+ s1 += order
+ s2 += order
+ t1 += order
+ t2 += order
phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
- context = "%s <PHRASE> %s" % (target[t1-1], target[t2])
+ left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
+ right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
+ context = "%s<PHRASE> %s" % (left_context, right_context)
- pi = phrase_index[phrase]
- ci = context_index[context]
- label = phrase_context_index[(pi,ci)]
+ label = phrase_context_index[(phrase,context)]
print "%s-%s:X%s" % (t1-1,t2-1,label),
-# print phrase, pi, context, ci
-# print phrase_context_index[(pi,ci)]
print