summary | refs | log | tree | commit | diff
path: root/gi/pyp-topics/scripts
diff options
context:
space:
mode:
author: philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-02 18:25:25 +0000
committer: philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-02 18:25:25 +0000
commit: 11c4cc3836e9e0e38bc4250500381ce7723799ee (patch)
tree: 0164bb4783b113d0014d797f70946b50428b90da /gi/pyp-topics/scripts
parent: ab708faeac7ff9eb5c252cf3236ebd79ff3d43d8 (diff)
New script for mapping corpus files to classes
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@117 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts')
-rwxr-xr-x gi/pyp-topics/scripts/tokens2classes.py | 27
1 files changed, 27 insertions, 0 deletions
diff --git a/gi/pyp-topics/scripts/tokens2classes.py b/gi/pyp-topics/scripts/tokens2classes.py
new file mode 100755
index 00000000..33df255f
--- /dev/null
+++ b/gi/pyp-topics/scripts/tokens2classes.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+# Filter: rewrite "source ||| target ||| tail" lines from stdin, replacing each token by its class.
+import sys
+# Each argument names a file of whitespace-separated "term class" pairs, one pair per line.
+if len(sys.argv) != 3:
+    print "Usage: tokens2classes.py source_classes target_classes"
+    sys.exit(1)
+
+source_to_topics = {}
+for line in open(sys.argv[1],'r'):
+    term,cls = line.split()
+    source_to_topics[term] = cls
+
+target_to_topics = {}
+for line in open(sys.argv[2],'r'):
+    term,cls = line.split()
+    target_to_topics[term] = cls
+
+for line in sys.stdin:
+    # maxsplit=2 keeps any further " ||| " fields inside tail instead of failing to unpack.
+    source, target, tail = line.split(" ||| ", 2)
+    for token in source.split():
+        print source_to_topics[token],
+    print "|||",
+    for token in target.split():
+        print target_to_topics[token],
+    print "|||", tail,