diff options
author | Patrick Simianer <p@simianer.de> | 2013-01-21 12:29:43 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-01-21 12:29:43 +0100 |
commit | 0d23f8aecbfaf982cd165ebfc2a1611cefcc7275 (patch) | |
tree | 8eafa6ea43224ff70635cadd4d6f027d28f4986f /corpus | |
parent | dbc66cd3944321961c5e11d5254fd914f05a98ad (diff) | |
parent | 7cac43b858f3b681555bf0578f54b1f822c43207 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus')

| Mode | File | Lines changed |
|---|---|---|
| -rwxr-xr-x | corpus/add-self-translations.pl | 2 |
| -rwxr-xr-x | corpus/cut-corpus.pl | 2 |
| -rwxr-xr-x | corpus/filter-length.pl | 18 |
| -rwxr-xr-x | corpus/paste-files.pl | 12 |

4 files changed, 28 insertions, 6 deletions
diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl index 153bc454..d707ce29 100755 --- a/corpus/add-self-translations.pl +++ b/corpus/add-self-translations.pl @@ -6,7 +6,7 @@ use strict; my %df; my %def; while(<>) { - print; +# print; chomp; my ($sf, $se) = split / \|\|\| /; die "Format error: $_\n" unless defined $sf && defined $se; diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index 7daac0e2..0af3b23c 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -22,7 +22,7 @@ for my $ff (@ind) { while(<>) { chomp; - my @fields = split / \|\|\| /; + my @fields = split /\s*\|\|\|\s*/; my @sf; for my $i (@o) { my $y = $fields[$i]; diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 70032ca7..38837f14 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,20 +3,30 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 99; # discard a sentence if it is longer than this -my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? 
############################################################################## -die "Usage: $0 corpus.fr-en\n\n Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1; +die "Usage: $0 [-NNN] corpus.fr-en\n\n Filter sentence pairs containing sentences longer than NNN words (where NNN\n is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2; binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +if (scalar @ARGV == 2) { + my $fp = shift @ARGV; + die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/; + $MAX_LENGTH=$1; +} + my $corpus = shift @ARGV; + die "Cannot read from STDIN\n" if $corpus eq '-'; my $ff = "<$corpus"; $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/; +print STDERR "Max line length (monolingual): $MAX_LENGTH\n"; +print STDERR " Parallel corpus: $corpus\n"; + open F,$ff or die "Can't read $corpus: $!"; binmode(F,":utf8"); @@ -128,6 +138,8 @@ while(<F>) { next; } print; + } else { + print; } $to++; } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 24c70599..0b788386 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -17,6 +17,7 @@ for my $file (@ARGV) { binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $bad = 0; my $lc = 0; my $done = 0; my $fl = 0; @@ -34,7 +35,15 @@ while(1) { last; } chomp $r; - die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + if ($r =~ /\|\|\|/) { + $r = ''; + $bad++; + } + warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + $r =~ s/\|\|\|/ /g; + $r =~ s/ +//g; + $r =~ s/^ //; + $r =~ s/ $//; $anum++; push @line, $r; } @@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) { my $r = <$fh>; die "Mismatched number 
of lines.\n" if defined $r; } +print STDERR "Bad lines containing ||| were $bad\n"; |