diff options
author | Jonathan Clark <jon.h.clark@gmail.com> | 2012-05-23 13:43:29 -0400 |
---|---|---|
committer | Jonathan Clark <jon.h.clark@gmail.com> | 2012-05-23 13:43:29 -0400 |
commit | 7a1e274fc4147631d4a70af47406301dcaaff497 (patch) | |
tree | f2efd26f9c7711b7480c389d3a4fa10d72be3205 /sa-extract/wrap_input.py | |
parent | e70a73b8c365329f7a8cf86ad527b12358752266 (diff) |
Add a script that can wrap an input set with SGML tags that tell cdec where to find sentence-level grammars
Diffstat (limited to 'sa-extract/wrap_input.py')
-rwxr-xr-x | sa-extract/wrap_input.py | 37 |
1 files changed, 37 insertions, 0 deletions
#!/usr/bin/env python
"""Wrap an input set with SGML <seg> tags that tell cdec where to find
sentence-level grammars.

Usage: wrap_input.py GRAMMAR_PREFIX [OBS_FEATS_FILE] < source > wrapped

Source sentences are read from stdin (UTF-8), one per line.  Sentence number
``i`` (0-based) is paired with the grammar file ``GRAMMAR_PREFIX + str(i)``,
falling back to the same name with a ``.gz`` suffix.  One ``<seg>`` line per
sentence is written to stdout (UTF-8).

The optional second argument is a file with observable sentence-level
features, one set of features per line (parallel with the source sentences).
Features are space-delimited indicator features.
"""
import sys
import codecs
import io
import os
import os.path
from xml.sax.saxutils import escape


class WrapInputError(Exception):
    """Raised when a sentence cannot be wrapped (missing grammar/features)."""


def find_grammar(graPrefix, i):
    """Return the grammar path for sentence *i*, or None if none exists.

    The plain file ``graPrefix + str(i)`` is preferred; the ``.gz`` variant
    is only used when the plain file is absent.
    """
    filename = "%s%d" % (graPrefix, i)
    if os.path.exists(filename):
        return filename
    filenameGz = filename + ".gz"
    if os.path.exists(filenameGz):
        return filenameGz
    return None


def format_seg(i, line, filename, obsFeats=None):
    """Return the <seg> element (without trailing newline) for one sentence.

    *line* is stripped of surrounding whitespace and XML-escaped
    (``&``, ``<``, ``>``).  A ``features`` attribute is emitted whenever
    *obsFeats* is not None, even if it is the empty string.

    NOTE(review): *obsFeats* and *filename* are interpolated into
    double-quoted attributes without escaping, exactly as before; a ``"`` in
    either would produce a malformed tag.  Left unchanged because it is not
    visible here whether the consumer unescapes attribute values -- confirm
    before adding escaping.
    """
    text = escape(line.strip())
    if obsFeats is not None:
        opening = '<seg id="%d" features="%s" grammar="%s"> ' % (i, obsFeats, filename)
    else:
        opening = '<seg id="%d" grammar="%s"> ' % (i, filename)
    return opening + text + " </seg>"


def wrap_sentences(lines, graPrefix, obsFeatsLines=None):
    """Yield one <seg> line per sentence in *lines*.

    *lines* is an iterable of source sentences.  *obsFeatsLines*, when given,
    is an iterable of feature lines parallel with *lines*; extra feature
    lines beyond the last sentence are ignored.

    Raises WrapInputError if a sentence has no grammar file, or if
    *obsFeatsLines* runs out before *lines* does.
    """
    featsIter = iter(obsFeatsLines) if obsFeatsLines is not None else None
    for i, line in enumerate(lines):
        filename = find_grammar(graPrefix, i)
        if filename is None:
            missing = "%s%d" % (graPrefix, i)
            # Two spaces after the colon match the original message exactly.
            raise WrapInputError(
                "Grammar file not found:  %s %s" % (missing, missing + ".gz"))

        obsFeats = None
        if featsIter is not None:
            # A default avoids a bare StopIteration escaping when the
            # features file is shorter than the input.
            featsLine = next(featsIter, None)
            if featsLine is None:
                raise WrapInputError(
                    "Features file has no line for sentence %d" % i)
            obsFeats = featsLine.strip()

        yield format_seg(i, line, filename, obsFeats)


def main(argv=None):
    """Command-line entry point; exits with status 1 on any error."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write(
            "Usage: %s GRAMMAR_PREFIX [OBS_FEATS_FILE] < source > wrapped\n"
            % os.path.basename(argv[0] if argv else "wrap_input.py"))
        sys.exit(1)

    graPrefix = argv[1]

    # Features are only read when exactly two arguments are given.
    obsFeatsFile = None
    if len(argv) == 3:
        # Decode as UTF-8 so feature text can be combined with the decoded
        # source sentences; newline="\n" keeps plain line-by-line splitting.
        obsFeatsFile = io.open(argv[2], encoding="utf-8", newline="\n")

    # On Python 3 the std streams are already text; wrap the underlying
    # byte streams so both interpreter lines read and write UTF-8.
    source = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin))
    sink = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))

    try:
        for seg in wrap_sentences(source, graPrefix, obsFeatsFile):
            sink.write(seg + "\n")
    except WrapInputError as err:
        sys.stderr.write("%s\n" % err)
        sys.exit(1)
    finally:
        if obsFeatsFile is not None:
            obsFeatsFile.close()


if __name__ == "__main__":
    main()