From 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Thu, 15 Jul 2010 00:34:58 +0000 Subject: Massacred the pipeline to support source language phrases and contexts. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pyp-topics/scripts/spans2labels.py | 41 ++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'gi/pyp-topics') diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index f990582e..3dc60835 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -4,7 +4,7 @@ import sys from operator import itemgetter if len(sys.argv) <= 2: - print "Usage: spans2labels.py phrase_context_index [order] [threshold]" + print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]" exit(1) order=1 @@ -14,6 +14,11 @@ if len(sys.argv) > 2: order = int(sys.argv[2]) if len(sys.argv) > 3: threshold = float(sys.argv[3]) +phr=ctx='t' +if len(sys.argv) > 4: + phr, ctx = sys.argv[4] + assert phr in 'stb' + assert ctx in 'stb' phrase_context_index = {} for line in file(sys.argv[1], 'r'): @@ -52,11 +57,35 @@ for line in sys.stdin: t1 += order t2 += order - phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() - left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") - right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() - context = "%s %s" % (left_context, right_context) + phraset = phrases = contextt = contexts = '' + if phr in 'tb': + phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() + if phr in 'sb': + phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip() + + if ctx in 'tb': + left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") + right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() + contextt = "%s %s" % (left_context, right_context) + if ctx in 'sb': + left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "") + right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip() + contexts = "%s %s" % (left_context, right_context) + + if phr == 'b': + phrase = phraset + ' ' + phrases + elif phr == 's': + phrase = phrases + else: + phrase = phraset + + if ctx == 'b': + context = contextt + ' ' + contexts + elif ctx == 's': + context = contexts + else: + context = contextt label = phrase_context_index.get((phrase,context), "") - print "%s-%s:X%s" % (t1-order,t2-order,label), + print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label), print -- cgit v1.2.3