escaping tool for grammar extractor

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-02-03 18:03:49 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-02-03 18:03:49 -0500
commit: f08ff03664ee7c9601c9daaa217cb032160f386f (patch)
tree: 5e93393df8bddb128a778f29ea86a0ea81ce7ebf /sa-extract
parent: 16d08eefddbecfefced16a0dd5a13d4c64c139b0 (diff)
4 files changed, 51 insertions, 4 deletions
diff --git a/sa-extract/Makefile b/sa-extract/Makefile
index e2b6158d..7b39ae4d 100644
--- a/sa-extract/Makefile
+++ b/sa-extract/Makefile
@@ -1,7 +1,7 @@
 PYVER=python2.7
-PYDIR=/usr
+PYDIR=/usr/local/Cellar/python/2.7.2
 PYINCLUDE=$(PYDIR)/include/$(PYVER)
-CYTHON=/usr/bin/cython
+CYTHON=/usr/local/share/python/cython
 PYTHON=$(PYDIR)/bin/python
 
 %.c: %.pyx
diff --git a/sa-extract/README b/sa-extract/README
index f43e58cc..e4022c7e 100644
--- a/sa-extract/README
+++ b/sa-extract/README
@@ -28,10 +28,22 @@ COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT
                 -a alignment_name=alignment.txt > extract.ini
 
 
+  The training data should be in two parallel text files (source.fr, source.en)
+  and the alignments are expected in the "0-0 1-2 2-1 ..." format produced by
+  most alignment toolkits. The text files should NOT be XML-escaped: leave
+  characters such as &, < and > exactly as they appear in the raw text.
+
+
 EXTRACTION OF PER-SENTENCE GRAMMARS
 ==============================================================================
+The most common use-case we support is extraction of "per-sentence" grammars
+for each segment in a test set. You may run the extractor on a test set, but it
+will try to interpret tags as SGML markup, so we provide a script that does
+the escaping: ./escape-testset.pl.
+
 - Example:
-  cat test.fr | extractor.py -c extract.ini
+
+  cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini
 
 
 EXTRACTION OF COMPLETE TEST-SET GRAMMARS
diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl
new file mode 100755
index 00000000..02fd7445
--- /dev/null
+++ b/sa-extract/escape-testset.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl -w
+# Wrap each test-set line in <seg id="N"> ... </seg>, escaping &, < and >, so
+# the extractor does not interpret them as SGML markup. Reads the files named
+# on the command line, or STDIN if none; ids count up from 0 across all input.
+use utf8;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+
+my @fh = ();
+if (scalar @ARGV == 0) {
+  push @fh, \*STDIN;
+} else {
+  for my $file (@ARGV) {
+    # Three-argument open: $file is always a path, never parsed as a mode/pipe.
+    open my $f, '<:utf8', $file or die "Can't read $file: $!\n";
+    push @fh, $f;
+  }
+}
+
+my $id = -1;
+for my $f (@fh) {
+  while(<$f>) {
+    chomp;
+    die "Empty line in test set" if /^\s*$/;
+    die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i;
+    $id++;
+    s/&/\&amp;/g;  # must run first, or the entities added below get re-escaped
+    s/</\&lt;/g;
+    s/>/\&gt;/g;
+    print "<seg id=\"$id\"> $_ </seg>\n";
+  }
+}
+
diff --git a/sa-extract/example/README b/sa-extract/example/README
index 9819ba5f..f6eac52b 100644
--- a/sa-extract/example/README
+++ b/sa-extract/example/README
@@ -4,5 +4,5 @@ Commands to compile a corpus and extract some grammars
 # compile
 ../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini
 # extract
-cat test.de | ../extractor.py -c extract.ini
+cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-02-03 18:03:49 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-02-03 18:03:49 -0500
commit	f08ff03664ee7c9601c9daaa217cb032160f386f (patch)
tree	5e93393df8bddb128a778f29ea86a0ea81ce7ebf /sa-extract
parent	16d08eefddbecfefced16a0dd5a13d4c64c139b0 (diff)