diff options
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 65 |
1 files changed, 0 insertions, 65 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl deleted file mode 100755 index c0eec43e..00000000 --- a/gi/pipeline/scripts/patch-corpus.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $PATCH = shift @ARGV; -my $TGT = 1; -my $APPEND; -while ($PATCH eq "-s" || $PATCH eq "-a") { - if ($PATCH eq "-s") { - undef $TGT; - } else { - $APPEND = 1; - } - $PATCH = shift @ARGV; -} - -die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; - -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -my $first=<P>; close P; -my @fields = split / \|\|\| /, $first; -die "Bad format!" if (scalar @fields > 2); - -if (scalar @fields != 1) { - # TODO support this - die "Patching source and target not supported yet!"; -} - -my $line = 0; -open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; -while(my $pline = <P>) { - chomp $pline; - $line++; - my $line = <>; - die "Too few lines in lexical corpus!" unless $line; - chomp $line; - @fields = split / \|\|\| /, $line; - my @pwords = split /\s+/, $pline; - if ($TGT) { - my @lwords = split /\s+/, $fields[1]; - die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - if ($APPEND) { - foreach my $i (0..(scalar @pwords-1)) { - $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; - } - $fields[1] = join ' ', @lwords; - } else { - $fields[1] = $pline; - } - } else { # source side - my @lwords = split /\s+/, $fields[0]; - die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - if ($APPEND) { - foreach my $i (0..(scalar @pwords-1)) { - $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; - } - $fields[0] = join ' ', @lwords; - } else { - $fields[0] = $pline; - } - } - print join ' ||| ', @fields; - print "\n"; -} - - |