From 8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 06:29:50 +0000 Subject: lopez suffix array extractor with copyrighted david chiang code excised --- sa-extract/example/README | 8 ++++++++ sa-extract/example/corpus.align.gz | Bin 0 -> 829334 bytes sa-extract/example/corpus.de.gz | Bin 0 -> 1724393 bytes sa-extract/example/corpus.en.gz | Bin 0 -> 1457711 bytes sa-extract/example/test.de | 10 ++++++++++ sa-extract/example/test.ref.en | 10 ++++++++++ 6 files changed, 28 insertions(+) create mode 100644 sa-extract/example/README create mode 100644 sa-extract/example/corpus.align.gz create mode 100644 sa-extract/example/corpus.de.gz create mode 100644 sa-extract/example/corpus.en.gz create mode 100644 sa-extract/example/test.de create mode 100644 sa-extract/example/test.ref.en (limited to 'sa-extract/example') diff --git a/sa-extract/example/README b/sa-extract/example/README new file mode 100644 index 00000000..9819ba5f --- /dev/null +++ b/sa-extract/example/README @@ -0,0 +1,8 @@ +Commands to compile a corpus and extract some grammars +====================================================== + +# compile +../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini +# extract +cat test.de | ../extractor.py -c extract.ini + diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz new file mode 100644 index 00000000..741de7e4 Binary files /dev/null and b/sa-extract/example/corpus.align.gz differ diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz new file mode 100644 index 00000000..0d66470a Binary files /dev/null and b/sa-extract/example/corpus.de.gz differ diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz new file mode 100644 index 00000000..28cb5c58 Binary files /dev/null and b/sa-extract/example/corpus.en.gz differ diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de new file mode 100644 index 00000000..8923329f --- /dev/null +++ b/sa-extract/example/test.de @@ -0,0 +1,10 @@ +dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform ziehen wollen . +taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt . +die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln . +malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen . +indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden . +die who steht vor einer wichtigen entscheidung . +sie muss dringend den handel progressiv liberalisieren . +eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) . +dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte . +robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete . diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en new file mode 100644 index 00000000..e50edcac --- /dev/null +++ b/sa-extract/example/test.ref.en @@ -0,0 +1,10 @@ +this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform . +taiwan is also set to play a constructive role -- if mainland china plays along . +the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules . +malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions . +indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda . +the wto is at a crossroads . +it sorely needs to liberalize trade progressively . +strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) . +this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative . +however , robert zoellick , president bush 's trade representative , needs allies . -- cgit v1.2.3 From 3a2fc36378337147a956e439db31baf91bfb95c8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 18:03:49 -0500 Subject: escaping tool for grammar extractor --- mteval/ns_ter.cc | 4 ++++ sa-extract/Makefile | 4 ++-- sa-extract/README | 14 +++++++++++++- sa-extract/escape-testset.pl | 35 +++++++++++++++++++++++++++++++++++ sa-extract/example/README | 2 +- 5 files changed, 55 insertions(+), 4 deletions(-) create mode 100755 sa-extract/escape-testset.pl (limited to 'sa-extract/example') diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc index 91a17f0d..0e1008db 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -22,6 +22,10 @@ static const unsigned kDUMMY_LAST_ENTRY = 5; using namespace std; using namespace std::tr1; +bool TERMetric::IsErrorMetric() const { + return true; +} + namespace NewScorer { struct COSTS { diff --git a/sa-extract/Makefile b/sa-extract/Makefile index e2b6158d..7b39ae4d 100644 --- a/sa-extract/Makefile +++ b/sa-extract/Makefile @@ -1,7 +1,7 @@ PYVER=python2.7 -PYDIR=/usr +PYDIR=/usr/local/Cellar/python/2.7.2 PYINCLUDE=$(PYDIR)/include/$(PYVER) -CYTHON=/usr/bin/cython +CYTHON=/usr/local/share/python/cython PYTHON=$(PYDIR)/bin/python %.c: %.pyx diff --git a/sa-extract/README b/sa-extract/README index f43e58cc..e4022c7e 100644 --- a/sa-extract/README +++ b/sa-extract/README @@ -28,10 +28,22 @@ COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT -a alignment_name=alignment.txt > extract.ini + The training data should be in two parallel text files (source.fr,source.en) + and the alignments are expected in "0-0 1-2 2-1 ..." format produced by + most alignment toolkits. The text files should NOT be escaped for non-XML + characters. + + EXTRACTION OF PER-SENTENCE GRAMMARS ============================================================================== +The most common use-case we support is extraction of "per-sentence" grammars +for each segment in a testset. You may run the extractor on test set, but it +will try to interpret tags as SGML markup, so we provide a script that does +escaping: ./escape-testset.pl. + - Example: - cat test.fr | extractor.py -c extract.ini + + cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini EXTRACTION OF COMPLETE TEST-SET GRAMMARS diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl new file mode 100755 index 00000000..02fd7445 --- /dev/null +++ b/sa-extract/escape-testset.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl -w + +use utf8; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +my @fh = (); +if (scalar @ARGV == 0) { + push @fh, \*STDIN; +} else { + for my $file (@ARGV) { + my $f; + open $f, "<$file" or die "Can't read $file: $!\n"; + binmode $f, ":utf8"; + push @fh, $f; + } +} + +my $id = -1; +for my $f (@fh) { + while(<$f>) { + chomp; + die "Empty line in test set" if /^\s*$/; + die "Please remove tags from input:\n$_" if /^\s*/\>/g; + print " $_ \n"; + } +} + + diff --git a/sa-extract/example/README b/sa-extract/example/README index 9819ba5f..f6eac52b 100644 --- a/sa-extract/example/README +++ b/sa-extract/example/README @@ -4,5 +4,5 @@ Commands to compile a corpus and extract some grammars # compile ../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini # extract -cat test.de | ../extractor.py -c extract.ini +cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini -- cgit v1.2.3