summary refs log tree commit diff
path: root/gi/pipeline/scripts/patch-corpus.pl
diff options
context:
space:
mode:
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x gi/pipeline/scripts/patch-corpus.pl 31
1 files changed, 25 insertions, 6 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
index 200022bc..c0eec43e 100755
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -3,12 +3,17 @@ use strict;
my $PATCH = shift @ARGV;
my $TGT = 1;
-if ($PATCH eq "-s") {
- undef $TGT;
+my $APPEND;
+while ($PATCH eq "-s" || $PATCH eq "-a") {
+ if ($PATCH eq "-s") {
+ undef $TGT;
+ } else {
+ $APPEND = 1;
+ }
$PATCH = shift @ARGV;
}
-die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
my $first=<P>; close P;
@@ -33,11 +38,25 @@ while(my $pline = <P>) {
if ($TGT) {
my @lwords = split /\s+/, $fields[1];
die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[1] = $pline;
- } else {
+ if ($APPEND) {
+ foreach my $i (0..(scalar @pwords-1)) {
+ $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+ }
+ $fields[1] = join ' ', @lwords;
+ } else {
+ $fields[1] = $pline;
+ }
+ } else { # source side
my @lwords = split /\s+/, $fields[0];
die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[0] = $pline;
+ if ($APPEND) {
+ foreach my $i (0..(scalar @pwords-1)) {
+ $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+ }
+ $fields[0] = join ' ', @lwords;
+ } else {
+ $fields[0] = $pline;
+ }
}
print join ' ||| ', @fields;
print "\n";