diff options
| author | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-02 18:25:25 +0000 | 
|---|---|---|
| committer | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-02 18:25:25 +0000 | 
| commit | 11c4cc3836e9e0e38bc4250500381ce7723799ee (patch) | |
| tree | 0164bb4783b113d0014d797f70946b50428b90da /gi/pyp-topics | |
| parent | ab708faeac7ff9eb5c252cf3236ebd79ff3d43d8 (diff) | |
New script for mapping corpus files to classes
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@117 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics')
| -rwxr-xr-x | gi/pyp-topics/scripts/tokens2classes.py | 27 | 
1 files changed, 27 insertions, 0 deletions
#!/usr/bin/python
# Source path: gi/pyp-topics/scripts/tokens2classes.py (new file, mode 100755)
"""Replace the tokens of a ' ||| '-delimited corpus with their classes.

Usage: tokens2classes.py source_classes target_classes

Each of the two class files holds one whitespace-separated ``term class``
pair per line.  Records are read from standard input in the form
``source ||| target ||| tail``; every token of ``source`` is replaced by its
class from the first file, every token of ``target`` by its class from the
second file, and ``tail`` is copied through unchanged.  The result is written
to standard output.
"""

import sys

# Delimiter between the source, target and tail fields of an input record.
FIELD_SEPARATOR = " ||| "


def load_classes(path):
    """Read a ``term class`` file into a dict mapping term -> class.

    Blank lines are skipped.  Any other line that does not consist of exactly
    two whitespace-separated fields raises ValueError.  If a term occurs more
    than once, the last occurrence wins.
    """
    mapping = {}
    # 'with' guarantees the file is closed even if a malformed line raises.
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            term, cls = fields
            mapping[term] = cls
    return mapping


def map_line(line, source_to_topics, target_to_topics):
    """Return one input record with source and target tokens mapped to classes.

    ``line`` must contain at least two ' ||| ' separators; everything after
    the second one is the tail and is kept verbatim, including any further
    ' ||| ' separators it contains.  The returned string always ends with a
    newline.

    Raises:
        ValueError: if ``line`` has fewer than three ' ||| '-separated fields.
        KeyError: if a token is missing from the corresponding mapping.
    """
    # maxsplit=2 keeps additional ' ||| ' separators inside the tail instead
    # of failing the three-way unpack.
    source, target, tail = line.split(FIELD_SEPARATOR, 2)

    pieces = [source_to_topics[token] for token in source.split()]
    pieces.append("|||")
    pieces.extend(target_to_topics[token] for token in target.split())
    pieces.append("|||")
    pieces.append(tail)

    # Items are separated by single spaces, matching the output of the
    # comma-terminated print statements this replaces.
    mapped = " ".join(pieces)
    if not mapped.endswith("\n"):
        # A final input line without a newline still yields a complete line.
        mapped += "\n"
    return mapped


def main(argv=None):
    """Run the script; return the process exit status.

    ``argv`` defaults to ``sys.argv`` and must hold the program name followed
    by the source and target class file paths.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 3:
        # Report usage on stderr so it never mixes with corpus output.
        sys.stderr.write(
            "Usage: tokens2classes.py source_classes target_classes\n")
        return 1

    source_to_topics = load_classes(argv[1])
    target_to_topics = load_classes(argv[2])

    write = sys.stdout.write
    for line in sys.stdin:
        write(map_line(line, source_to_topics, target_to_topics))
    return 0


if __name__ == "__main__":
    sys.exit(main())
