summary | refs | log | tree | commit | diff
path: root/sa-extract/wrap_input.py
diff options
context:
space:
mode:
author Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-05-31 13:57:24 +0200
committer Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-05-31 13:57:24 +0200
commit f1ba05780db1705493d9afb562332498b93d26f1 (patch)
tree fb429a657ba97f33e8140742de9bc74d9fc88e75 /sa-extract/wrap_input.py
parent aadabfdf37dfd451485277cb77fad02f77b361c6 (diff)
parent 317d650f6cb1e24ac6f3be6f7bf9d4246a59e0e5 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'sa-extract/wrap_input.py')
-rwxr-xr-x sa-extract/wrap_input.py 37
1 files changed, 37 insertions, 0 deletions
diff --git a/sa-extract/wrap_input.py b/sa-extract/wrap_input.py
new file mode 100755
index 00000000..e859a4fd
--- /dev/null
+++ b/sa-extract/wrap_input.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+import sys
+import codecs
+import os
+import os.path
+from xml.sax.saxutils import escape
+
+graPrefix = sys.argv[1]
+
+# Second argument can be a file with observable sentence-level features,
+# one set of features per line (parallel with source sentences). Features are space-delimited indicator features.
+obsFeatsFile = None
+if len(sys.argv) == 3:
+ obsFeatsFilename = sys.argv[2]
+ obsFeatsFile = open(obsFeatsFilename)
+
+sys.stdin = codecs.getreader("utf-8")(sys.stdin)
+sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+
+i = 0
+for line in sys.stdin:
+ filename = "%s%d"%(graPrefix,i)
+ if not os.path.exists(filename):
+ filenameGz = filename + ".gz"
+ if not os.path.exists(filenameGz):
+ print >>sys.stderr, "Grammar file not found: ", filename, filenameGz
+ sys.exit(1)
+ else:
+ filename = filenameGz
+
+ if obsFeatsFile:
+ obsFeats = obsFeatsFile.next().strip()
+ print '<seg id="%d" features="%s" grammar="%s"> '%(i,obsFeats,filename) + escape(line.strip()) + " </seg>"
+ else:
+ print '<seg id="%d" grammar="%s"> '%(i,filename) + escape(line.strip()) + " </seg>"
+ i+=1
+