#!/usr/bin/python
"""Map the tokens of a " ||| "-separated corpus to their class labels.

Usage: tokens2classes.py source_classes target_classes < corpus > mapped

Each classes file holds one whitespace-separated "term class" pair per
line.  Every line read from stdin must have the form

    source ||| target ||| tail

The source and target tokens are replaced by their classes (looked up in
the first and second classes file respectively) and the tail is copied
through unchanged.
"""

# Provenance: gi/pyp-topics/scripts/tokens2classes.py, added by philblunsom
# on 2010-07-02 ("New script for mapping corpus files to classes"),
# git-svn-id: https://ws10smt.googlecode.com/svn/trunk@117

import sys

FIELD_SEP = " ||| "
USAGE = "Usage: tokens2classes.py source_classes target_classes"


def load_classes(path):
    """Read a "term class" file into a dict mapping term -> class.

    Args:
        path: name of a text file with one whitespace-separated
            "term class" pair per line.

    Returns:
        A dict from term to class.  If a term occurs more than once the
        last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
        IOError/OSError: if the file cannot be opened.
    """
    table = {}
    # The context manager closes the file even if a malformed line raises.
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line carries no mapping.
                continue
            term, cls = fields
            table[term] = cls
    return table


def convert_line(line, source_to_topics, target_to_topics):
    """Return *line* with its source and target tokens replaced by classes.

    Args:
        line: one corpus line, "source ||| target ||| tail".
        source_to_topics: dict mapping source tokens to classes.
        target_to_topics: dict mapping target tokens to classes.

    Returns:
        The mapped line, always terminated by a newline.

    Raises:
        ValueError: if the line has fewer than three " ||| " fields.
        KeyError: if a token is missing from the relevant table.
    """
    # maxsplit=2 keeps any additional " ||| " fields inside the tail so
    # they are passed through verbatim instead of breaking the unpacking.
    source, target, tail = line.split(FIELD_SEP, 2)

    pieces = [source_to_topics[token] for token in source.split()]
    pieces.append("|||")
    pieces.extend(target_to_topics[token] for token in target.split())
    pieces.append("|||")
    pieces.append(tail)

    mapped = " ".join(pieces)
    # The tail normally carries the input's newline; add one only when the
    # final input line was not newline-terminated.
    if not mapped.endswith("\n"):
        mapped += "\n"
    return mapped


def main(argv=None):
    """Run the filter: load both class tables, then map stdin to stdout.

    Args:
        argv: argument vector including the program name; defaults to
            sys.argv.

    Returns:
        The process exit status: 0 on success, 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        # Usage errors go to stderr so they never mix with mapped output.
        sys.stderr.write(USAGE + "\n")
        return 1

    source_to_topics = load_classes(argv[1])
    target_to_topics = load_classes(argv[2])

    for line in sys.stdin:
        sys.stdout.write(
            convert_line(line, source_to_topics, target_to_topics))
    return 0


if __name__ == "__main__":
    sys.exit(main())