summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jonathan Clark <jon.h.clark@gmail.com> 2012-05-23 13:33:48 -0400
committer Jonathan Clark <jon.h.clark@gmail.com> 2012-05-23 13:33:48 -0400
commit 44a66ff4a31763c71c5fa5a31967fc019a1bad2a (patch)
tree b5fc8bb3d70ca08c9e5597300fbc8a827e868842
parent 4738874acde9391beef87c83d0d5c25f3289a625 (diff)
Add script to convert from sa-extract's unnamed format to cdec's more readable named format
-rwxr-xr-x sa-extract/sa2cdec.py 19
1 files changed, 19 insertions, 0 deletions
diff --git a/sa-extract/sa2cdec.py b/sa-extract/sa2cdec.py
new file mode 100755
index 00000000..55fb19f3
--- /dev/null
+++ b/sa-extract/sa2cdec.py
@@ -0,0 +1,19 @@
#!/usr/bin/env python
"""Convert sa-extract's unnamed-feature grammar rules to cdec's named format.

Usage: sa2cdec.py FEATURE_NAMES_FILE < sa_grammar > cdec_grammar

Each rule read from stdin has the form

    lhs ||| src ||| tgt ||| v1 v2 ... ||| alignment

(the alignment field may be absent).  The i-th feature value is paired with
the i-th name from FEATURE_NAMES_FILE and written to stdout as ``name=value``.
"""
import sys


def read_feature_names(lines):
    """Return the feature names found in *lines*, one per line.

    Lines that start with '#' are treated as comments and skipped; every
    other line contributes its whitespace-stripped text as a name.
    """
    return [line.strip() for line in lines if not line.startswith('#')]


def convert_line(line, featNames):
    """Return one grammar rule rewritten with named features.

    line      -- a raw rule as read from the input, trailing newline included.
    featNames -- feature names, positionally matched to the rule's values.

    A rule with only four fields (no alignment) is accepted with a warning on
    stderr and gets an empty alignment field.  Any other field count is
    reported on stderr and the ValueError from the failed unpack is re-raised.
    """
    try:
        (lhs, src, tgt, feats, align) = line.strip("\n").split(' ||| ')
    except ValueError:
        # Only a wrong field count is expected here; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        sys.stderr.write('WARNING: No alignments: ' + line + '\n')
        try:
            (lhs, src, tgt, feats) = line.strip().split(' ||| ')
        except ValueError:
            sys.stderr.write('ERROR: Malformed line: ' + line + '\n')
            raise
        align = ''
    featValues = feats.split(' ')
    # NOTE: zip() stops at the shorter sequence, so surplus names or surplus
    # values are dropped silently (unchanged from the original behaviour).
    namedFeats = ' '.join(name + "=" + value
                          for (name, value) in zip(featNames, featValues))
    return " ||| ".join((lhs, src, tgt, namedFeats, align))


def main(argv=None):
    """Read rules from stdin and write the converted rules to stdout.

    argv -- argument vector; defaults to sys.argv.  argv[1] is the path of
            the feature-names file.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('Usage: %s FEATURE_NAMES_FILE < sa_grammar > cdec_grammar\n'
                         % argv[0])
        return 2
    # Close the names file deterministically instead of leaking the handle.
    with open(argv[1]) as namesFile:
        featNames = read_feature_names(namesFile)
    for line in sys.stdin:
        sys.stdout.write(convert_line(line, featNames) + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())