summaryrefslogtreecommitdiff
path: root/sa-extract
diff options
context:
space:
mode:
Diffstat (limited to 'sa-extract')
-rw-r--r--sa-extract/Makefile4
-rw-r--r--sa-extract/README14
-rwxr-xr-xsa-extract/escape-testset.pl35
-rw-r--r--sa-extract/example/README2
4 files changed, 51 insertions, 4 deletions
diff --git a/sa-extract/Makefile b/sa-extract/Makefile
index e2b6158d..7b39ae4d 100644
--- a/sa-extract/Makefile
+++ b/sa-extract/Makefile
@@ -1,7 +1,7 @@
PYVER=python2.7
-PYDIR=/usr
+PYDIR=/usr/local/Cellar/python/2.7.2
PYINCLUDE=$(PYDIR)/include/$(PYVER)
-CYTHON=/usr/bin/cython
+CYTHON=/usr/local/share/python/cython
PYTHON=$(PYDIR)/bin/python
%.c: %.pyx
diff --git a/sa-extract/README b/sa-extract/README
index f43e58cc..e4022c7e 100644
--- a/sa-extract/README
+++ b/sa-extract/README
@@ -28,10 +28,22 @@ COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT
-a alignment_name=alignment.txt > extract.ini
+ The training data should be in two parallel text files (source.fr,source.en)
+ and the alignments are expected in "0-0 1-2 2-1 ..." format produced by
+ most alignment toolkits. The text files should NOT be escaped for non-XML
+ characters.
+
+
EXTRACTION OF PER-SENTENCE GRAMMARS
==============================================================================
+The most common use-case we support is extraction of "per-sentence" grammars
+for each segment in a test set. You may run the extractor on a test set, but it
+will try to interpret tags as SGML markup, so we provide a script that does
+the escaping: ./escape-testset.pl.
+
- Example:
- cat test.fr | extractor.py -c extract.ini
+
+ cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini
EXTRACTION OF COMPLETE TEST-SET GRAMMARS
diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl
new file mode 100755
index 00000000..02fd7445
--- /dev/null
+++ b/sa-extract/escape-testset.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl -w
+# Wrap each test-set line in <seg id="N"> ... </seg>, escaping &, < and >.
+# Usage: escape-testset.pl [FILE ...]   (reads STDIN when no FILE is given)
+use utf8;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+
+# Open every input up front so an unreadable file aborts before any output.
+my @fh = ();
+if (scalar @ARGV == 0) {
+  push @fh, \*STDIN;
+} else {
+  for my $file (@ARGV) {
+    # Three-argument open: the file name is never parsed for mode characters.
+    open my $f, "<:utf8", $file or die "Can't read $file: $!\n";
+    push @fh, $f;
+  }
+}
+
+# Segment ids start at 0 and keep counting across all input files.
+my $id = -1;
+for my $f (@fh) {
+  while(<$f>) {
+    chomp;
+    die "Empty line in test set" if /^\s*$/;
+    die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i;
+    $id++;
+    s/&/\&amp;/g;   # must run first so the entities added below are not re-escaped
+    s/</\&lt;/g;
+    s/>/\&gt;/g;
+    print "<seg id=\"$id\"> $_ </seg>\n";
+  }
+}
diff --git a/sa-extract/example/README b/sa-extract/example/README
index 9819ba5f..f6eac52b 100644
--- a/sa-extract/example/README
+++ b/sa-extract/example/README
@@ -4,5 +4,5 @@ Commands to compile a corpus and extract some grammars
# compile
../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini
# extract
-cat test.de | ../extractor.py -c extract.ini
+cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini