diff options
-rwxr-xr-x | corpus/add-sos-eos.pl | 71 |
1 files changed, 55 insertions, 16 deletions
NOTE(review): the hunk below was reconstructed from a whitespace-collapsed copy
of this patch. Every added/removed/context line carries its original content,
but the 2-space indentation and the placement of the two blank context lines
(after "use strict;" and at end of file) are inferred so that the line counts
match "-1,24 +1,63" (16 deletions, 55 insertions, 8 context lines).
TODO confirm against the repository before applying.

diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl
index 5e2d44cb..d7608c5e 100755
--- a/corpus/add-sos-eos.pl
+++ b/corpus/add-sos-eos.pl
@@ -1,24 +1,63 @@
 #!/usr/bin/perl -w
 use strict;
 
-while(<>) {
+die "Usage: $0 corpus.fr[-en1-en2-...] [corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3);
+my $filec = shift @ARGV;
+my $filea = shift @ARGV;
+my $ofilea = shift @ARGV;
+open C, "<$filec" or die "Can't read $filec: $!";
+if ($filea) {
+  open A, "<$filea" or die "Can't read $filea: $!";
+  open OA, ">$ofilea" or die "Can't write $ofilea: $!";
+}
+binmode(C, ":utf8");
+binmode(STDOUT, ":utf8");
+print STDERR "Adding <s> and </s> markers to input...\n";
+print STDERR " Reading corpus: $filec\n";
+print STDERR " Writing corpus: STDOUT\n";
+print STDERR "Reading alignments: $filea\n" if $filea;
+print STDERR "Writing alignments: $ofilea\n" if $filea;
+
+my $lines = 0;
+while(<C>) {
+  $lines++;
+  die "ERROR. Input line $filec:$lines should not contain SGML markup" if /<seg /;
+  if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+  elsif ($lines % 2500 == 0) { print STDERR "."; }
   chomp;
   my @fields = split / \|\|\| /;
-  my ($ff, $ee, $aa) = @fields;
-  die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3;
-  my @fs = split /\s+/, $ff;
-  my @es = split /\s+/, $ee;
-  my @as = split /\s+/, $aa;
-  my @oas = ();
-  push @oas, '0-0';
-  my $flen = scalar @fs;
-  my $elen = scalar @es;
-  for my $ap (@as) {
-    my ($a, $b) = split /-/, $ap;
-    die "Bad format in: @as" unless defined $a && defined $b;
-    push @oas, ($a + 1) . '-' . ($b + 1);
+  my $o = '';
+  for my $field (@fields) {
+    $o .= " ||| <s> $field </s>";
   }
-  push @oas, ($flen + 1) . '-' . ($elen + 1);
-  print "<s> $ff </s> ||| <s> $ee </s> ||| @oas\n";
+  $o =~ s/^ \|\|\| //;
+  if ($filea) {
+    my $aa = <A>;
+    die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa;
+    chomp $aa;
+    my ($ff, $ee) = @fields;
+    die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee;
+    my @fs = split /\s+/, $ff;
+    my @es = split /\s+/, $ee;
+    my @as = split /\s+/, $aa;
+    my @oas = ();
+    push @oas, '0-0';
+    my $flen = scalar @fs;
+    my $elen = scalar @es;
+    for my $ap (@as) {
+      my ($a, $b) = split /-/, $ap;
+      die "ERROR. Bad format in: @as" unless defined $a && defined $b;
+      push @oas, ($a + 1) . '-' . ($b + 1);
+    }
+    push @oas, ($flen + 1) . '-' . ($elen + 1);
+    print OA "@oas\n";
+  }
+  print "$o\n";
+}
+if ($filea) {
+  close OA;
+  my $aa = <A>;
+  die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa;
 }
+print STDERR "\nSUCCESS. Processed $lines lines.\n";
 