diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 16:39:41 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 16:39:41 +0000 |
commit | 3e32deafa6ac5d0904f19c570b0e9dd0521ddbdc (patch) | |
tree | 61d024ea81a9339fac76e10cd766ff1dc7c39a04 /gi/pipeline/scripts/patch-corpus.pl | |
parent | 26c5346a55145028a9171632dcc0defa472ea38f (diff) |
Pipeline code for running with mixing tokens and tags in the clustering.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 31 |
1 files changed, 25 insertions, 6 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
index 200022bc..c0eec43e 100755
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -3,12 +3,17 @@ use strict;
 
 my $PATCH = shift @ARGV;
 my $TGT = 1;
-if ($PATCH eq "-s") {
-    undef $TGT;
+my $APPEND;
+while ($PATCH eq "-s" || $PATCH eq "-a") {
+    if ($PATCH eq "-s") {
+        undef $TGT;
+    } else {
+        $APPEND = 1;
+    }
     $PATCH = shift @ARGV;
 }
 
-die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
 open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
 my $first=<P>;
 close P;
@@ -33,11 +38,25 @@ while(my $pline = <P>) {
     if ($TGT) {
         my @lwords = split /\s+/, $fields[1];
         die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
-        $fields[1] = $pline;
-    } else {
+        if ($APPEND) {
+            foreach my $i (0..(scalar @pwords-1)) {
+                $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+            }
+            $fields[1] = join ' ', @lwords;
+        } else {
+            $fields[1] = $pline;
+        }
+    } else { # source side
         my @lwords = split /\s+/, $fields[0];
         die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
-        $fields[0] = $pline;
+        if ($APPEND) {
+            foreach my $i (0..(scalar @pwords-1)) {
+                $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+            }
+            $fields[0] = join ' ', @lwords;
+        } else {
+            $fields[0] = $pline;
+        }
     }
     print join ' ||| ', @fields;
     print "\n";