From 8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 06:29:50 +0000 Subject: lopez suffix array extractor with copyrighted david chiang code excised --- sa-extract/README | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 sa-extract/README (limited to 'sa-extract/README') diff --git a/sa-extract/README b/sa-extract/README new file mode 100644 index 00000000..f43e58cc --- /dev/null +++ b/sa-extract/README @@ -0,0 +1,50 @@ +SUFFIX-ARRAY-EXTRACT README + Feb 1, 2012 + +Written by Adam Lopez, repackaged by Chris Dyer. + +Originally based on parts of Hiero, by David Chiang, but these dependencies +have been removed or rewritten. + + +BUILD INSTRUCTIONS +============================================================================== + +Requirements: + Python 2.7 or later (http://www.python.org) + Cython 0.14.1 or later (http://cython.org/) + +- Edit Makefile to set the location of Python/Cython then do: + + make + + +COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT +============================================================================== +- Run sa-compile.pl to compile the training data and generate an extract.ini + file (which is written to STDOUT): + + sa-compile.pl -b bitext_name=source.fr,target.en \ + -a alignment_name=alignment.txt > extract.ini + + +EXTRACTION OF PER-SENTENCE GRAMMARS +============================================================================== +- Example: + cat test.fr | extractor.py -c extract.ini + + +EXTRACTION OF COMPLETE TEST-SET GRAMMARS +============================================================================== +Edit the generated extract.ini file and change per_sentence_grammar +to False. Then, run extraction as normal. + +Note: extracting a single grammar for an entire test set will consume more +memory during extraction and (probably) during decoding. 
+ + +EXAMPLE +============================================================================== +- See example/ and the README therein. + + -- cgit v1.2.3 From 3a2fc36378337147a956e439db31baf91bfb95c8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 18:03:49 -0500 Subject: escaping tool for grammar extractor --- mteval/ns_ter.cc | 4 ++++ sa-extract/Makefile | 4 ++-- sa-extract/README | 14 +++++++++++++- sa-extract/escape-testset.pl | 35 +++++++++++++++++++++++++++++++++++ sa-extract/example/README | 2 +- 5 files changed, 55 insertions(+), 4 deletions(-) create mode 100755 sa-extract/escape-testset.pl (limited to 'sa-extract/README') diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc index 91a17f0d..0e1008db 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -22,6 +22,10 @@ static const unsigned kDUMMY_LAST_ENTRY = 5; using namespace std; using namespace std::tr1; +bool TERMetric::IsErrorMetric() const { + return true; +} + namespace NewScorer { struct COSTS { diff --git a/sa-extract/Makefile b/sa-extract/Makefile index e2b6158d..7b39ae4d 100644 --- a/sa-extract/Makefile +++ b/sa-extract/Makefile @@ -1,7 +1,7 @@ PYVER=python2.7 -PYDIR=/usr +PYDIR=/usr/local/Cellar/python/2.7.2 PYINCLUDE=$(PYDIR)/include/$(PYVER) -CYTHON=/usr/bin/cython +CYTHON=/usr/local/share/python/cython PYTHON=$(PYDIR)/bin/python %.c: %.pyx diff --git a/sa-extract/README b/sa-extract/README index f43e58cc..e4022c7e 100644 --- a/sa-extract/README +++ b/sa-extract/README @@ -28,10 +28,22 @@ COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT -a alignment_name=alignment.txt > extract.ini + The training data should be in two parallel text files (source.fr,target.en) + and the alignments are expected in "0-0 1-2 2-1 ..." format produced by + most alignment toolkits. The text files should NOT be escaped for non-XML + characters. 
+ + EXTRACTION OF PER-SENTENCE GRAMMARS ============================================================================== +The most common use-case we support is extraction of "per-sentence" grammars +for each segment in a testset. You may run the extractor on test set, but it +will try to interpret tags as SGML markup, so we provide a script that does +escaping: ./escape-testset.pl. + - Example: - cat test.fr | extractor.py -c extract.ini + + cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini EXTRACTION OF COMPLETE TEST-SET GRAMMARS diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl new file mode 100755 index 00000000..02fd7445 --- /dev/null +++ b/sa-extract/escape-testset.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl -w + +use utf8; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +my @fh = (); +if (scalar @ARGV == 0) { + push @fh, \*STDIN; +} else { + for my $file (@ARGV) { + my $f; + open $f, "<$file" or die "Can't read $file: $!\n"; + binmode $f, ":utf8"; + push @fh, $f; + } +} + +my $id = -1; +for my $f (@fh) { + while(<$f>) { + chomp; + die "Empty line in test set" if /^\s*$/; + die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i; + $id++; + s/&/\&amp;/g; + s/</\&lt;/g; + s/>/\&gt;/g; + print "<seg id=\"$id\"> $_ </seg>\n"; + } +} + + diff --git a/sa-extract/example/README b/sa-extract/example/README index 9819ba5f..f6eac52b 100644 --- a/sa-extract/example/README +++ b/sa-extract/example/README @@ -4,5 +4,5 @@ Commands to compile a corpus and extract some grammars # compile ../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini # extract -cat test.de | ../extractor.py -c extract.ini +cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini -- cgit v1.2.3