summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/scripts
diff options
context:
space:
mode:
author: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 00:34:58 +0000
committer: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 00:34:58 +0000
commit: 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 (patch)
tree: deb10ba93df13ec5cce90aa59d5fb8fe5a678a55 /gi/pyp-topics/scripts
parent: 2775fc13d1e8d3ad45c8ddf94226397403e0e373 (diff)
Massacred the pipeline to support source language phrases and contexts.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts')
-rwxr-xr-x gi/pyp-topics/scripts/spans2labels.py 41
1 files changed, 35 insertions, 6 deletions
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index f990582e..3dc60835 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,7 +4,7 @@ import sys
from operator import itemgetter
if len(sys.argv) <= 2:
- print "Usage: spans2labels.py phrase_context_index [order] [threshold]"
+ print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]"
exit(1)
order=1
@@ -14,6 +14,11 @@ if len(sys.argv) > 2:
order = int(sys.argv[2])
if len(sys.argv) > 3:
threshold = float(sys.argv[3])
+phr=ctx='t'
+if len(sys.argv) > 4:
+ phr, ctx = sys.argv[4]
+ assert phr in 'stb'
+ assert ctx in 'stb'
phrase_context_index = {}
for line in file(sys.argv[1], 'r'):
@@ -52,11 +57,35 @@ for line in sys.stdin:
t1 += order
t2 += order
- phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
- left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
- right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
- context = "%s<PHRASE> %s" % (left_context, right_context)
+ phraset = phrases = contextt = contexts = ''
+ if phr in 'tb':
+ phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
+ if phr in 'sb':
+ phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip()
+
+ if ctx in 'tb':
+ left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
+ right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
+ contextt = "%s<PHRASE> %s" % (left_context, right_context)
+ if ctx in 'sb':
+ left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "")
+ right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip()
+ contexts = "%s<PHRASE> %s" % (left_context, right_context)
+
+ if phr == 'b':
+ phrase = phraset + ' <SPLIT> ' + phrases
+ elif phr == 's':
+ phrase = phrases
+ else:
+ phrase = phraset
+
+ if ctx == 'b':
+ context = contextt + ' <SPLIT> ' + contexts
+ elif ctx == 's':
+ context = contexts
+ else:
+ context = contextt
label = phrase_context_index.get((phrase,context), "<UNK>")
- print "%s-%s:X%s" % (t1-order,t2-order,label),
+ print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
print