diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-19 22:40:21 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-19 22:40:21 +0000 |
commit | 1b606343b7368aa4c61d5088b22b8916486f0073 (patch) | |
tree | b71edeeff408e24a4a53d2725aa895868a5424b4 /gi/posterior-regularisation | |
parent | 40a21147727a2f1c77ee4796fd8c648160e3b555 (diff) |
Tool to pull out separate language data from context.txt.gz
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@326 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation')
-rwxr-xr-x | gi/posterior-regularisation/split-languages.py | 23 |
1 files changed, 23 insertions, 0 deletions
#!/usr/bin/python
"""split-languages.py: pull out separate language data from a joint context file.

Usage: split-languages.py SOURCE_OUT TARGET_OUT < INPUT

Each input line has the form

    SRC_PHRASE <SPLIT> TGT_PHRASE<TAB>SRC_CTX <SPLIT> TGT_CTX ||| PAYLOAD ||| ...

i.e. a tab-separated phrase pair followed by alternating (context pair,
payload) fields joined by ' ||| '.  The source halves are written to
SOURCE_OUT and the target halves to TARGET_OUT; the payload that follows each
context pair is copied unchanged to both files (presumably a count -- the
script never inspects it).
"""

import sys

PAIR_SEP = ' <SPLIT> '
FIELD_SEP = ' ||| '


def split_line(line):
    """Split one joint line into its (source_line, target_line) halves.

    Both returned strings are newline-terminated.  Raises ValueError when the
    line does not have exactly one tab, when a phrase or context does not
    contain exactly one ' <SPLIT> ', or when a context has no payload field.
    """
    phrase, contexts = line.rstrip().split('\t')
    src_phrase, tgt_phrase = phrase.split(PAIR_SEP)
    parts = contexts.split(FIELD_SEP)
    # Fields must come in (context, payload) pairs; check up front so a
    # truncated line gives a clear error instead of an IndexError.
    if len(parts) % 2:
        raise ValueError('context without a following field in line: %r' % line)
    src_fields = []
    tgt_fields = []
    for i in range(0, len(parts), 2):
        src_ctx, tgt_ctx = parts[i].split(PAIR_SEP)
        payload = parts[i + 1]
        src_fields.extend((src_ctx, payload))
        tgt_fields.extend((tgt_ctx, payload))
    return ('%s\t%s\n' % (src_phrase, FIELD_SEP.join(src_fields)),
            '%s\t%s\n' % (tgt_phrase, FIELD_SEP.join(tgt_fields)))


def split_languages(lines, sout, tout):
    """Write the source half of every line to sout and the target half to tout."""
    for line in lines:
        src_line, tgt_line = split_line(line)
        sout.write(src_line)
        tout.write(tgt_line)


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write('usage: %s SOURCE_OUT TARGET_OUT < INPUT\n' % argv[0])
        return 2
    # Nested 'with' blocks close both files even if a malformed line raises.
    with open(argv[1], 'w') as sout:
        with open(argv[2], 'w') as tout:
            split_languages(sys.stdin, sout, tout)
    return 0


if __name__ == '__main__':
    sys.exit(main())