diff options
| -rwxr-xr-x | dpmert/divide_refs.py | 15 |
| -rwxr-xr-x | sa-extract/sa2cdec.py | 19 |
| -rw-r--r-- | sa-extract/sa_feat_names.txt | 7 |
| -rwxr-xr-x | sa-extract/wrap_input.py | 37 |
4 files changed, 78 insertions, 0 deletions
Patch summary (free text before the first file header is ignored by patch tools).

Adds four new files, 78 inserted lines in total. The scripts use Python 2
syntax (print >>stream, iterator .next()).

  dpmert/divide_refs.py
      Reads lines from stdin and deals them round-robin into numRefs output
      files named <outPrefix>0 .. <outPrefix>(numRefs-1).
      Usage: divide_refs.py numRefs outPrefix

  sa-extract/sa2cdec.py
      Reads ' ||| '-delimited rule lines from stdin (lhs, src, tgt, feats and
      optionally align) and prints them with each space-separated feature
      value rewritten as name=value. Names come from the file given as the
      first argument, one per line, skipping lines that start with '#'.
      A line without the align field triggers a warning on stderr and is
      printed with an empty align field.

  sa-extract/sa_feat_names.txt
      Seven feature names, one per line.

  sa-extract/wrap_input.py
      Wraps every stdin line (XML-escaped) in a <seg> element that carries a
      running id and a grammar attribute pointing at <graPrefix><id>, or at
      the same name with ".gz" appended when only that file exists; exits
      with status 1 when neither exists. An optional second argument names a
      file read in parallel, one line per sentence, whose stripped content is
      emitted as the features attribute.

Review notes (the added lines below are reproduced unchanged):
  * sa2cdec.py uses bare "except:" clauses, so any exception raised while
    unpacking a line takes the "No alignments" path, not only a field-count
    mismatch.
  * sa2cdec.py never closes the feature-name file it opens.
  * wrap_input.py escapes the sentence text but inserts the features and
    grammar attribute values unescaped.
  * wrap_input.py calls obsFeatsFile.next() once per input line; a features
    file shorter than the input raises StopIteration.

diff --git a/dpmert/divide_refs.py b/dpmert/divide_refs.py
new file mode 100755
index 00000000..b478f918
--- /dev/null
+++ b/dpmert/divide_refs.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+import sys
+
+(numRefs, outPrefix) = sys.argv[1:]
+numRefs = int(numRefs)
+
+outs = [open(outPrefix+str(i), "w") for i in range(numRefs)]
+
+i = 0
+for line in sys.stdin:
+  outs[i].write(line)
+  i = (i + 1) % numRefs
+
+for out in outs:
+  out.close()
diff --git a/sa-extract/sa2cdec.py b/sa-extract/sa2cdec.py
new file mode 100755
index 00000000..55fb19f3
--- /dev/null
+++ b/sa-extract/sa2cdec.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+import sys
+
+featNames = [ line.strip() for line in open(sys.argv[1]) if not line.startswith('#') ]
+
+for line in sys.stdin:
+  try:
+    (lhs, src, tgt, feats, align) = line.strip("\n").split(' ||| ')
+  except:
+    print >>sys.stderr, 'WARNING: No alignments:', line
+    try:
+      (lhs, src, tgt, feats) = line.strip().split(' ||| ')
+      align = ''
+    except:
+      print >>sys.stderr, "ERROR: Malformed line:", line
+      raise
+  featValues = feats.split(' ')
+  namedFeats = ' '.join( name+"="+value for (name, value) in zip(featNames, featValues) )
+  print " ||| ".join( (lhs, src, tgt, namedFeats, align) )
diff --git a/sa-extract/sa_feat_names.txt b/sa-extract/sa_feat_names.txt
new file mode 100644
index 00000000..02c137d7
--- /dev/null
+++ b/sa-extract/sa_feat_names.txt
@@ -0,0 +1,7 @@
+EGivenFCoherent
+SampleCountF
+CountEF
+MaxLexFGivenE
+MaxLexEGivenF
+IsSingletonF
+IsSingletonFE
diff --git a/sa-extract/wrap_input.py b/sa-extract/wrap_input.py
new file mode 100755
index 00000000..e859a4fd
--- /dev/null
+++ b/sa-extract/wrap_input.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+import sys
+import codecs
+import os
+import os.path
+from xml.sax.saxutils import escape
+
+graPrefix = sys.argv[1]
+
+# Second argument can be a file with observable sentence-level features,
+# one set of features per line (parallel with source sentences). Features are space-delimited indicator features.
+obsFeatsFile = None
+if len(sys.argv) == 3:
+  obsFeatsFilename = sys.argv[2]
+  obsFeatsFile = open(obsFeatsFilename)
+
+sys.stdin = codecs.getreader("utf-8")(sys.stdin)
+sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+
+i = 0
+for line in sys.stdin:
+  filename = "%s%d"%(graPrefix,i)
+  if not os.path.exists(filename):
+    filenameGz = filename + ".gz"
+    if not os.path.exists(filenameGz):
+      print >>sys.stderr, "Grammar file not found: ", filename, filenameGz
+      sys.exit(1)
+    else:
+      filename = filenameGz
+    
+  if obsFeatsFile:
+    obsFeats = obsFeatsFile.next().strip()
+    print '<seg id="%d" features="%s" grammar="%s"> '%(i,obsFeats,filename) + escape(line.strip()) + " </seg>"
+  else:
+    print '<seg id="%d" grammar="%s"> '%(i,filename) + escape(line.strip()) + " </seg>"
+  i+=1
+
