diff options
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 31 |
1 files changed, 25 insertions, 6 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
index 200022bc..c0eec43e 100755
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -3,12 +3,17 @@ use strict;
 
 my $PATCH = shift @ARGV;
 my $TGT = 1;
-if ($PATCH eq "-s") {
-  undef $TGT;
+my $APPEND;
+while ($PATCH eq "-s" || $PATCH eq "-a") {
+  if ($PATCH eq "-s") {
+    undef $TGT;
+  } else {
+    $APPEND = 1;
+  }
   $PATCH = shift @ARGV;
 }
 
-die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
 open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
 my $first=<P>;
 close P;
@@ -33,11 +38,25 @@ while(my $pline = <P>) {
   if ($TGT) {
     my @lwords = split /\s+/, $fields[1];
     die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
-    $fields[1] = $pline;
-  } else {
+    if ($APPEND) {
+      foreach my $i (0..(scalar @pwords-1)) {
+        $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+      }
+      $fields[1] = join ' ', @lwords;
+    } else {
+      $fields[1] = $pline;
+    }
+  } else { # source side
     my @lwords = split /\s+/, $fields[0];
     die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
-    $fields[0] = $pline;
+    if ($APPEND) {
+      foreach my $i (0..(scalar @pwords-1)) {
+        $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+      }
+      $fields[0] = join ' ', @lwords;
+    } else {
+      $fields[0] = $pline;
+    }
   }
   print join ' ||| ', @fields;
   print "\n";