From 44a66ff4a31763c71c5fa5a31967fc019a1bad2a Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Wed, 23 May 2012 13:33:48 -0400 Subject: Add script to convert from sa-extract's unnamed format to cdec's more readable named format --- sa-extract/sa2cdec.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 sa-extract/sa2cdec.py (limited to 'sa-extract') diff --git a/sa-extract/sa2cdec.py b/sa-extract/sa2cdec.py new file mode 100755 index 00000000..55fb19f3 --- /dev/null +++ b/sa-extract/sa2cdec.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +import sys + +featNames = [ line.strip() for line in open(sys.argv[1]) if not line.startswith('#') ] + +for line in sys.stdin: + try: + (lhs, src, tgt, feats, align) = line.strip("\n").split(' ||| ') + except: + print >>sys.stderr, 'WARNING: No alignments:', line + try: + (lhs, src, tgt, feats) = line.strip().split(' ||| ') + align = '' + except: + print >>sys.stderr, "ERROR: Malformed line:", line + raise + featValues = feats.split(' ') + namedFeats = ' '.join( name+"="+value for (name, value) in zip(featNames, featValues) ) + print " ||| ".join( (lhs, src, tgt, namedFeats, align) ) -- cgit v1.2.3 From 8dbba4eb6bf894a1f73354a63ea365448c25c574 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Wed, 23 May 2012 13:43:29 -0400 Subject: Add a script that can wrap an input set with SGML tags that tell cdec where to find sentence-level grammars --- sa-extract/wrap_input.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100755 sa-extract/wrap_input.py (limited to 'sa-extract') diff --git a/sa-extract/wrap_input.py b/sa-extract/wrap_input.py new file mode 100755 index 00000000..e859a4fd --- /dev/null +++ b/sa-extract/wrap_input.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +import sys +import codecs +import os +import os.path +from xml.sax.saxutils import escape + +graPrefix = sys.argv[1] + +# Second argument can be a file with observable sentence-level features, +# one set of features per line (parallel with source sentences). Features are space-delimited indicator features. +obsFeatsFile = None +if len(sys.argv) == 3: + obsFeatsFilename = sys.argv[2] + obsFeatsFile = open(obsFeatsFilename) + +sys.stdin = codecs.getreader("utf-8")(sys.stdin) +sys.stdout = codecs.getwriter("utf-8")(sys.stdout) + +i = 0 +for line in sys.stdin: + filename = "%s%d"%(graPrefix,i) + if not os.path.exists(filename): + filenameGz = filename + ".gz" + if not os.path.exists(filenameGz): + print >>sys.stderr, "Grammar file not found: ", filename, filenameGz + sys.exit(1) + else: + filename = filenameGz + + if obsFeatsFile: + obsFeats = obsFeatsFile.next().strip() + print ' '%(i,obsFeats,filename) + escape(line.strip()) + " " + else: + print ' '%(i,filename) + escape(line.strip()) + " " + i+=1 + -- cgit v1.2.3 From 896b1b75fdd78df5666269f624588715b30ea9b1 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Wed, 23 May 2012 15:35:37 -0400 Subject: Add names of default features so that we can convert into the typical 'named' sparse feature format of cdec --- sa-extract/sa_feat_names.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 sa-extract/sa_feat_names.txt (limited to 'sa-extract') diff --git a/sa-extract/sa_feat_names.txt b/sa-extract/sa_feat_names.txt new file mode 100644 index 00000000..02c137d7 --- /dev/null +++ b/sa-extract/sa_feat_names.txt @@ -0,0 +1,7 @@ +EGivenFCoherent +SampleCountF +CountEF +MaxLexFGivenE +MaxLexEGivenF +IsSingletonF +IsSingletonFE -- cgit v1.2.3