summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
author Chris Dyer <redpony@gmail.com> 2014-01-28 19:50:03 -0500
committer Chris Dyer <redpony@gmail.com> 2014-01-28 19:50:03 -0500
commit 06f5b0359dadcbfd663c8f585358805fbff181ab (patch)
tree dd2b6c1974971ce5923801e09fd9251f264cbf1a /corpus
parent 5382875d5fa5192a662a7c763faf0487c69bc4ed (diff)
smarter script for adding <s> and </s> markers
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/add-sos-eos.pl 71
1 files changed, 55 insertions, 16 deletions
diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl
index 5e2d44cb..d7608c5e 100755
--- a/corpus/add-sos-eos.pl
+++ b/corpus/add-sos-eos.pl
@@ -1,24 +1,63 @@
#!/usr/bin/perl -w
use strict;
-while(<>) {
+die "Usage: $0 corpus.fr[-en1-en2-...] [corpus.al out-corpus.al]\n" unless (scalar @ARGV == 1 || scalar @ARGV == 3);
+my $filec = shift @ARGV;
+my $filea = shift @ARGV;
+my $ofilea = shift @ARGV;
+open C, "<$filec" or die "Can't read $filec: $!";
+if ($filea) {
+ open A, "<$filea" or die "Can't read $filea: $!";
+ open OA, ">$ofilea" or die "Can't write $ofilea: $!";
+}
+binmode(C, ":utf8");
+binmode(STDOUT, ":utf8");
+print STDERR "Adding <s> and </s> markers to input...\n";
+print STDERR " Reading corpus: $filec\n";
+print STDERR " Writing corpus: STDOUT\n";
+print STDERR "Reading alignments: $filea\n" if $filea;
+print STDERR "Writing alignments: $ofilea\n" if $filea;
+
+my $lines = 0;
+while(<C>) {
+ $lines++;
+ die "ERROR. Input line $filec:$lines should not contain SGML markup" if /<seg /;
+ if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
+ elsif ($lines % 2500 == 0) { print STDERR "."; }
chomp;
my @fields = split / \|\|\| /;
- my ($ff, $ee, $aa) = @fields;
- die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3;
- my @fs = split /\s+/, $ff;
- my @es = split /\s+/, $ee;
- my @as = split /\s+/, $aa;
- my @oas = ();
- push @oas, '0-0';
- my $flen = scalar @fs;
- my $elen = scalar @es;
- for my $ap (@as) {
- my ($a, $b) = split /-/, $ap;
- die "Bad format in: @as" unless defined $a && defined $b;
- push @oas, ($a + 1) . '-' . ($b + 1);
+ my $o = '';
+ for my $field (@fields) {
+ $o .= " ||| <s> $field </s>";
}
- push @oas, ($flen + 1) . '-' . ($elen + 1);
- print "<s> $ff </s> ||| <s> $ee </s> ||| @oas\n";
+ $o =~ s/^ \|\|\| //;
+ if ($filea) {
+ my $aa = <A>;
+ die "ERROR. Mismatched number of lines between $filec and $filea\n" unless $aa;
+ chomp $aa;
+ my ($ff, $ee) = @fields;
+ die "ERROR in $filec:$lines: expected 'source ||| target'" unless defined $ee;
+ my @fs = split /\s+/, $ff;
+ my @es = split /\s+/, $ee;
+ my @as = split /\s+/, $aa;
+ my @oas = ();
+ push @oas, '0-0';
+ my $flen = scalar @fs;
+ my $elen = scalar @es;
+ for my $ap (@as) {
+ my ($a, $b) = split /-/, $ap;
+ die "ERROR. Bad format in: @as" unless defined $a && defined $b;
+ push @oas, ($a + 1) . '-' . ($b + 1);
+ }
+ push @oas, ($flen + 1) . '-' . ($elen + 1);
+ print OA "@oas\n";
+ }
+ print "$o\n";
+}
+if ($filea) {
+ close OA;
+ my $aa = <A>;
+ die "ERROR. Alignment input file $filea contains more lines than corpus file!\n" if $aa;
}
+print STDERR "\nSUCCESS. Processed $lines lines.\n";