summary refs log tree commit diff
path: root/gi/posterior-regularisation/split-languages.py
diff options
context:
space:
mode:
author trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-19 22:40:21 +0000
committer trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-19 22:40:21 +0000
commit 1b606343b7368aa4c61d5088b22b8916486f0073 (patch)
tree b71edeeff408e24a4a53d2725aa895868a5424b4 /gi/posterior-regularisation/split-languages.py
parent 40a21147727a2f1c77ee4796fd8c648160e3b555 (diff)
Tool to pull out separate language data from context.txt.gz
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@326 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation/split-languages.py')
-rwxr-xr-x gi/posterior-regularisation/split-languages.py 23
1 files changed, 23 insertions, 0 deletions
diff --git a/gi/posterior-regularisation/split-languages.py b/gi/posterior-regularisation/split-languages.py
new file mode 100755
index 00000000..206da661
--- /dev/null
+++ b/gi/posterior-regularisation/split-languages.py
@@ -0,0 +1,23 @@
#!/usr/bin/python
"""Split a bilingual phrase/context stream into two per-language files.

Usage: split-languages.py SOURCE_OUT TARGET_OUT < contexts

Each input line has the form

    PHRASE<TAB>CONTEXT ||| VALUE ||| CONTEXT ||| VALUE ...

where PHRASE and every CONTEXT are written as "source <SPLIT> target".
The source halves go to SOURCE_OUT and the target halves to TARGET_OUT;
each VALUE (the field that follows a context) is copied unchanged to
both files.
"""

import sys

# Separates the source-language half from the target-language half.
LANGUAGE_SEP = ' <SPLIT> '
# Separates the alternating context / value fields after the tab.
FIELD_SEP = ' ||| '


def _split_pair(text):
    """Return the [source, target] halves of a "src <SPLIT> tgt" string.

    Raises ValueError unless the separator occurs exactly once.
    """
    halves = text.split(LANGUAGE_SEP)
    if len(halves) != 2:
        raise ValueError('expected exactly one %r separator in %r'
                         % (LANGUAGE_SEP, text))
    return halves


def split_line(line):
    """Convert one input line into its (source_line, target_line) pair.

    Trailing whitespace (including the newline) is stripped first, and
    the returned strings carry no trailing newline.

    Raises ValueError when the line is not "phrase<TAB>contexts", when a
    phrase or context lacks a single " <SPLIT> ", or when the contexts do
    not form complete (context, value) pairs.
    """
    fields = line.rstrip().split('\t')
    if len(fields) != 2:
        raise ValueError('expected 2 tab-separated fields, found %d'
                         % len(fields))
    phrase, contexts = fields
    src_phrase, tgt_phrase = _split_pair(phrase)

    parts = contexts.split(FIELD_SEP)
    # Every context must be followed by its value; check before building
    # any output so a truncated line never yields a half-written record.
    if len(parts) % 2:
        raise ValueError('context without a following value (%d %r-separated'
                         ' fields)' % (len(parts), FIELD_SEP))

    src_fields = []
    tgt_fields = []
    for context, value in zip(parts[0::2], parts[1::2]):
        src_context, tgt_context = _split_pair(context)
        src_fields.extend((src_context, value))
        tgt_fields.extend((tgt_context, value))

    return ('%s\t%s' % (src_phrase, FIELD_SEP.join(src_fields)),
            '%s\t%s' % (tgt_phrase, FIELD_SEP.join(tgt_fields)))


def main(argv=None, stdin=None):
    """Read lines from stdin and write the two per-language files.

    argv  -- argument list (defaults to sys.argv); argv[1] and argv[2]
             name the source and target output files.
    stdin -- iterable of input lines (defaults to sys.stdin).

    Returns 0 on success and 2 when the output paths are missing.
    Blank input lines are skipped.  A malformed line raises ValueError
    carrying its 1-based line number.
    """
    if argv is None:
        argv = sys.argv
    if stdin is None:
        stdin = sys.stdin

    if len(argv) < 3:
        prog = argv[0] if argv else 'split-languages.py'
        sys.stderr.write('usage: %s SOURCE_OUT TARGET_OUT < contexts\n'
                         % prog)
        return 2

    # The with-block closes both files even if a line fails to parse.
    with open(argv[1], 'w') as src_out, open(argv[2], 'w') as tgt_out:
        for lineno, line in enumerate(stdin, 1):
            if not line.strip():
                continue
            try:
                src_line, tgt_line = split_line(line)
            except ValueError as err:
                raise ValueError('input line %d: %s' % (lineno, err))
            src_out.write(src_line + '\n')
            tgt_out.write(tgt_line + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())