diff options
Diffstat (limited to 'gi/pipeline/scripts')
| -rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 31 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 51 | 
3 files changed, 99 insertions, 36 deletions
| diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl index 200022bc..c0eec43e 100755 --- a/gi/pipeline/scripts/patch-corpus.pl +++ b/gi/pipeline/scripts/patch-corpus.pl @@ -3,12 +3,17 @@ use strict;  my $PATCH = shift @ARGV;  my $TGT = 1; -if ($PATCH eq "-s") { -    undef $TGT; +my $APPEND; +while ($PATCH eq "-s" || $PATCH eq "-a") { +    if ($PATCH eq "-s") { +        undef $TGT; +    } else { +        $APPEND = 1; +    }      $PATCH = shift @ARGV;  } -die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; +die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;  open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";  my $first=<P>; close P; @@ -33,11 +38,25 @@ while(my $pline = <P>) {    if ($TGT) {        my @lwords = split /\s+/, $fields[1];        die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      $fields[1] = $pline; -  } else { +      if ($APPEND) { +          foreach my $i (0..(scalar @pwords-1)) { +              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; +          } +          $fields[1] = join ' ', @lwords; +      } else { +          $fields[1] = $pline; +      } +  } else { # source side        my @lwords = split /\s+/, $fields[0];        die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      $fields[0] = $pline; +      if ($APPEND) { +          foreach my $i (0..(scalar @pwords-1)) { +              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; +          } +          $fields[0] = join ' ', @lwords; +      } else { +          $fields[0] = $pline; +      }    }    print join ' ||| ', @fields;    print "\n"; diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl new file mode 100755 index 00000000..20698816 --- /dev/null +++ b/gi/pipeline/scripts/remove-tags-from-contexts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long "GetOptions"; + +my $PHRASE = 'tok'; +my $CONTEXT = 'tag'; + +die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"  +    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); + +my $lno = 0; +while(my $line = <>) { +    $lno++; +    chomp $line; +    my @top = split /\t/, $line; +    die unless (scalar @top == 2);  + +    my @pwords = split /\s+/, $top[0]; +    foreach my $token (@pwords) { +        #print $token . "\n"; +        my @parts = split /_(?!.*_)/, $token; +        die unless (scalar @parts == 2);  +        if ($PHRASE eq "tok") { +            $token = $parts[0] +        } elsif ($PHRASE eq "tag") { +            $token = $parts[1] +        } +    } + +    my @fields = split / \|\|\| /, $top[1]; +    foreach my $i (0..((scalar @fields) / 2 - 1)) { +        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; +        my @cwords = split /\s+/, $fields[2*$i]; +        foreach my $token (@cwords) { +            #print $i . ": " . $token . "\n"; +            my @parts = split /_(?!.*_)/, $token; +            if (scalar @parts == 2) { +                if ($CONTEXT eq "tok") { +                    $token = $parts[0] +                } elsif ($CONTEXT eq "tag") { +                    $token = $parts[1] +                } +            } +        } +        $fields[2*$i] = join ' ', @cwords; +    } + +    print join ' ', @pwords; +    print "\t"; +    print join ' ||| ', @fields; +    print "\n"; +} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl index 5460db95..be3e97c0 100755 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl @@ -3,51 +3,42 @@ use strict;  use Getopt::Long "GetOptions"; -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"  -    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); +my $LANGUAGE = shift @ARGV; +$LANGUAGE = 'target' unless ($LANGUAGE);  my $lno = 0;  while(my $line = <>) {      $lno++;      chomp $line; -    my @top = split /\t/, $line; -    die unless (scalar @top == 2);  -    my @pwords = split /\s+/, $top[0]; -    foreach my $token (@pwords) { -        #print $token . "\n"; -        my @parts = split /_(?!_)/, $token; -        die unless (scalar @parts == 2);  -        if ($PHRASE eq "tok") { -            $token = $parts[0] -        } elsif ($PHRASE eq "tag") { -            $token = $parts[1] +    my @fields = split / \|\|\| /, $line; + +    if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { +        my @cwords = split /\s+/, $fields[0]; +        foreach my $token (@cwords) { +            my @parts = split /_(?!.*_)/, $token; +            if (scalar @parts == 2) { +                $token = $parts[0] +            } else { +                print STDERR "WARNING: invalid tagged token $token\n"; +            }          } +        $fields[0] = join ' ', @cwords;      } -    my @fields = split / \|\|\| /, $top[1]; -    foreach my $i (0..((scalar @fields) / 2 - 1)) { -        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; -        my @cwords = split /\s+/, $fields[2*$i]; +    if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { +        my @cwords = split /\s+/, $fields[1];          foreach my $token (@cwords) { -            #print $i . ": " . $token . "\n"; -            my @parts = split /_/, $token; +            my @parts = split /_(?!.*_)/, $token;              if (scalar @parts == 2) { -                if ($CONTEXT eq "tok") { -                    $token = $parts[0] -                } elsif ($CONTEXT eq "tag") { -                    $token = $parts[1] -                } +                $token = $parts[1] +            } else { +                print STDERR "WARNING: invalid tagged token $token\n";              }          } -        $fields[2*$i] = join ' ', @cwords; +        $fields[0] = join ' ', @cwords;      } -    print join ' ', @pwords; -    print "\t";      print join ' ||| ', @fields;      print "\n";  } | 
