From 3a2fc36378337147a956e439db31baf91bfb95c8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 18:03:49 -0500 Subject: escaping tool for grammar extractor --- sa-extract/Makefile | 4 ++-- sa-extract/README | 14 +++++++++++++- sa-extract/escape-testset.pl | 35 +++++++++++++++++++++++++++++++++++ sa-extract/example/README | 2 +- 4 files changed, 51 insertions(+), 4 deletions(-) create mode 100755 sa-extract/escape-testset.pl (limited to 'sa-extract') diff --git a/sa-extract/Makefile b/sa-extract/Makefile index e2b6158d..7b39ae4d 100644 --- a/sa-extract/Makefile +++ b/sa-extract/Makefile @@ -1,7 +1,7 @@ PYVER=python2.7 -PYDIR=/usr +PYDIR=/usr/local/Cellar/python/2.7.2 PYINCLUDE=$(PYDIR)/include/$(PYVER) -CYTHON=/usr/bin/cython +CYTHON=/usr/local/share/python/cython PYTHON=$(PYDIR)/bin/python %.c: %.pyx diff --git a/sa-extract/README b/sa-extract/README index f43e58cc..e4022c7e 100644 --- a/sa-extract/README +++ b/sa-extract/README @@ -28,10 +28,22 @@ COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT -a alignment_name=alignment.txt > extract.ini + The training data should be in two parallel text files (source.fr,source.en) + and the alignments are expected in "0-0 1-2 2-1 ..." format produced by + most alignment toolkits. The text files should NOT be escaped for non-XML + characters. + + EXTRACTION OF PER-SENTENCE GRAMMARS ============================================================================== +The most common use-case we support is extraction of "per-sentence" grammars +for each segment in a testset. You may run the extractor on test set, but it +will try to interpret tags as SGML markup, so we provide a script that does +escaping: ./escape-testset.pl. 
+ - Example: - cat test.fr | extractor.py -c extract.ini + + cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini EXTRACTION OF COMPLETE TEST-SET GRAMMARS diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl new file mode 100755 index 00000000..02fd7445 --- /dev/null +++ b/sa-extract/escape-testset.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl -w + +use utf8; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +my @fh = (); +if (scalar @ARGV == 0) { + push @fh, \*STDIN; +} else { + for my $file (@ARGV) { + my $f; + open $f, "<$file" or die "Can't read $file: $!\n"; + binmode $f, ":utf8"; + push @fh, $f; + } +} + +my $id = -1; +for my $f (@fh) { + while(<$f>) { + chomp; + die "Empty line in test set" if /^\s*$/; + die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i; + $id++; + s/&/\&amp;/g; + s/</\&lt;/g; + s/>/\&gt;/g; + print "<seg id=\"$id\"> $_ </seg>\n"; + } +} + + diff --git a/sa-extract/example/README b/sa-extract/example/README index 9819ba5f..f6eac52b 100644 --- a/sa-extract/example/README +++ b/sa-extract/example/README @@ -4,5 +4,5 @@ Commands to compile a corpus and extract some grammars # compile ../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini # extract -cat test.de | ../extractor.py -c extract.ini +cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini -- cgit v1.2.3