From e70a73b8c365329f7a8cf86ad527b12358752266 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Wed, 23 May 2012 13:33:48 -0400
Subject: Add script to convert from sa-extract's unnamed format to cdec's more
 readable named format

---
 sa-extract/sa2cdec.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 sa-extract/sa2cdec.py
diff --git a/sa-extract/sa2cdec.py b/sa-extract/sa2cdec.py
new file mode 100755
index 00000000..55fb19f3
--- /dev/null
+++ b/sa-extract/sa2cdec.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+"""Convert sa-extract's unnamed feature format to cdec's named format.
+
+Usage: sa2cdec.py FEAT_NAMES_FILE < sa_grammar > cdec_grammar
+
+FEAT_NAMES_FILE holds one feature name per line; lines starting with '#'
+and blank lines are skipped. Each stdin line has the form
+  lhs ||| src ||| tgt ||| feats [||| align]
+and the i-th space-separated value in feats is emitted as name_i=value.
+"""
+import sys
+
+# 'with' closes the names file as soon as it is read. Blank lines are dropped
+# so a stray empty line cannot become an empty feature name and shift the rest.
+with open(sys.argv[1]) as featNamesFile:
+  featNames = [ line.strip() for line in featNamesFile
+                if line.strip() and not line.startswith('#') ]
+
+for line in sys.stdin:
+  try:
+    (lhs, src, tgt, feats, align) = line.strip("\n").split(' ||| ')
+  except ValueError:
+    # Tuple unpacking raises ValueError on a field-count mismatch; catching only
+    # that keeps KeyboardInterrupt and genuine bugs from being misreported.
+    print >>sys.stderr, 'WARNING: No alignments:', line
+    try:
+      (lhs, src, tgt, feats) = line.strip().split(' ||| ')
+      align = ''
+    except ValueError:
+      print >>sys.stderr, "ERROR: Malformed line:", line
+      raise
+  featValues = feats.split(' ')
+  # zip() stops at the shorter list, so surplus names or values are dropped.
+  namedFeats = ' '.join( name+"="+value for (name, value) in zip(featNames, featValues) )
+  print " ||| ".join( (lhs, src, tgt, namedFeats, align) )
-- 
cgit v1.2.3


From 7a1e274fc4147631d4a70af47406301dcaaff497 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Wed, 23 May 2012 13:43:29 -0400
Subject: Add a script that can wrap an input set with SGML tags that tell cdec
 where to find sentence-level grammars

---
 sa-extract/wrap_input.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100755 sa-extract/wrap_input.py

diff --git a/sa-extract/wrap_input.py b/sa-extract/wrap_input.py
new file mode 100755
index 00000000..e859a4fd
--- /dev/null
+++ b/sa-extract/wrap_input.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""Wrap each input sentence in a <seg> tag naming its sentence-level grammar.
+
+Usage: wrap_input.py GRAMMAR_PREFIX [OBS_FEATS_FILE] < sentences > wrapped
+
+Sentence i (0-based) is paired with the file GRAMMAR_PREFIX + str(i), or with
+that name plus ".gz" when the plain file does not exist. The script exits
+with status 1 if neither file is present.
+"""
+import sys
+import codecs
+import os
+import os.path
+from xml.sax.saxutils import escape
+
+graPrefix = sys.argv[1]
+
+# Second argument can be a file with observable sentence-level features,
+# one set of features per line (parallel with source sentences). Features are space-delimited indicator features.
+obsFeatsFile = None
+if len(sys.argv) == 3:
+  obsFeatsFilename = sys.argv[2]
+  # Decode the features as UTF-8, like stdin: mixing raw byte strings with the
+  # decoded sentence text fails with UnicodeDecodeError on non-ASCII features.
+  obsFeatsFile = codecs.open(obsFeatsFilename, encoding="utf-8")
+
+sys.stdin = codecs.getreader("utf-8")(sys.stdin)
+sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
+
+for (i, line) in enumerate(sys.stdin):
+  filename = "%s%d"%(graPrefix,i)
+  if not os.path.exists(filename):
+    filenameGz = filename + ".gz"
+    if not os.path.exists(filenameGz):
+      print >>sys.stderr, "Grammar file not found: ", filename, filenameGz
+      sys.exit(1)
+    filename = filenameGz
+
+  # Only the sentence text is XML-escaped; attribute values are written as given.
+  if obsFeatsFile is not None:
+    try:
+      obsFeats = obsFeatsFile.next().strip()
+    except StopIteration:
+      # The features file must be parallel to the input; report a short file
+      # the same way as a missing grammar instead of dying with a traceback.
+      print >>sys.stderr, "Features file has fewer lines than the input: ", obsFeatsFilename
+      sys.exit(1)
+    print '<seg id="%d" features="%s" grammar="%s"> '%(i,obsFeats,filename) + escape(line.strip()) + " </seg>"
+  else:
+    print '<seg id="%d" grammar="%s"> '%(i,filename) + escape(line.strip()) + " </seg>"
+
+if obsFeatsFile is not None:
+  obsFeatsFile.close()
-- 
cgit v1.2.3


From b0173db2bae5dc1afef5f6804043d422adaa9118 Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Wed, 23 May 2012 13:48:04 -0400
Subject: Script for dividing a single file full of references into the
 1-reference-per-file format expected by cdec

---
 dpmert/divide_refs.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100755 dpmert/divide_refs.py

diff --git a/dpmert/divide_refs.py b/dpmert/divide_refs.py
new file mode 100755
index 00000000..b478f918
--- /dev/null
+++ b/dpmert/divide_refs.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+"""Split a single file of references into one file per reference.
+
+Usage: divide_refs.py NUM_REFS OUT_PREFIX < all_refs
+
+Input lines are dealt out round-robin: line k (0-based) is written to the
+file OUT_PREFIX + str(k % NUM_REFS).
+"""
+import sys
+
+if len(sys.argv) != 3:
+  print >>sys.stderr, "Usage: %s numRefs outPrefix < refs" % sys.argv[0]
+  sys.exit(1)
+(numRefs, outPrefix) = sys.argv[1:]
+numRefs = int(numRefs)
+if numRefs < 1:
+  # With no output files the loop below would fail on its first write.
+  print >>sys.stderr, "ERROR: numRefs must be at least 1:", numRefs
+  sys.exit(1)
+
+outs = [open(outPrefix+str(i), "w") for i in range(numRefs)]
+
+# try/finally so every output file is flushed and closed even if reading
+# stdin or a write fails part-way through.
+try:
+  i = 0
+  for line in sys.stdin:
+    outs[i].write(line)
+    i = (i + 1) % numRefs
+finally:
+  for out in outs:
+    out.close()
-- 
cgit v1.2.3


From 70e50adc591ca6e3e56a1ced66bdb133eb89807b Mon Sep 17 00:00:00 2001
From: Jonathan Clark <jon.h.clark@gmail.com>
Date: Wed, 23 May 2012 15:35:37 -0400
Subject: Add names of default features so that we can convert into the typical
 'named' sparse feature format of cdec

---
 sa-extract/sa_feat_names.txt | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 sa-extract/sa_feat_names.txt

diff --git a/sa-extract/sa_feat_names.txt b/sa-extract/sa_feat_names.txt
new file mode 100644
index 00000000..02c137d7
--- /dev/null
+++ b/sa-extract/sa_feat_names.txt
@@ -0,0 +1,7 @@
+EGivenFCoherent
+SampleCountF
+CountEF
+MaxLexFGivenE
+MaxLexEGivenF
+IsSingletonF
+IsSingletonFE
-- 
cgit v1.2.3