diff options
| author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-22 19:05:54 +0000 | 
|---|---|---|
| committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-22 19:05:54 +0000 | 
| commit | 0732fff632e792ae2268e9ef1c9c230624098eb7 (patch) | |
| tree | 344784ef25043f34bf6d9fe74b2579ade072a890 /gi/pipeline/scripts | |
| parent | d6cdcf776b2f4541a7ee80c1a489d4a0fee41be3 (diff) | |
Added a -s option to apply tags to the source side (the default remains the target side)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@367 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts')
| -rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 20 | 
1 file changed, 16 insertions, 4 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl index 2b181837..200022bc 100755 --- a/gi/pipeline/scripts/patch-corpus.pl +++ b/gi/pipeline/scripts/patch-corpus.pl @@ -2,7 +2,13 @@  use strict;  my $PATCH = shift @ARGV; -die "Usage: $0 tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; +my $TGT = 1; +if ($PATCH eq "-s") { +    undef $TGT; +    $PATCH = shift @ARGV; +} + +die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;  open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";  my $first=<P>; close P; @@ -24,9 +30,15 @@ while(my $pline = <P>) {    chomp $line;    @fields = split / \|\|\| /, $line;    my @pwords = split /\s+/, $pline; -  my @lwords = split /\s+/, $fields[1]; -  die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -  $fields[1] = $pline; +  if ($TGT) { +      my @lwords = split /\s+/, $fields[1]; +      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); +      $fields[1] = $pline; +  } else { +      my @lwords = split /\s+/, $fields[0]; +      die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); +      $fields[0] = $pline; +  }    print join ' ||| ', @fields;    print "\n";  }  | 
