summary | refs | log | tree | commit | diff
path: root/gi/pipeline/scripts/patch-corpus.pl
diff options
context:
space:
mode:
author trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-22 19:05:54 +0000
committer trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-22 19:05:54 +0000
commit 113ebafe09fb0474f30d12190b38da74f8b08b4d (patch)
tree bc87d4b5fff2ced3146c13bce13f2e8befd9b733 /gi/pipeline/scripts/patch-corpus.pl
parent e5f89e61768c7a8cc3ac60976d14314de60862f4 (diff)
Added option to apply tags to source-side
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@367 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x gi/pipeline/scripts/patch-corpus.pl 20
1 files changed, 16 insertions, 4 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
index 2b181837..200022bc 100755
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -2,7 +2,13 @@
use strict;
my $PATCH = shift @ARGV;
-die "Usage: $0 tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+my $TGT = 1;
+if ($PATCH eq "-s") {
+ undef $TGT;
+ $PATCH = shift @ARGV;
+}
+
+die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
my $first=<P>; close P;
@@ -24,9 +30,15 @@ while(my $pline = <P>) {
chomp $line;
@fields = split / \|\|\| /, $line;
my @pwords = split /\s+/, $pline;
- my @lwords = split /\s+/, $fields[1];
- die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[1] = $pline;
+ if ($TGT) {
+ my @lwords = split /\s+/, $fields[1];
+ die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
+ $fields[1] = $pline;
+ } else {
+ my @lwords = split /\s+/, $fields[0];
+ die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
+ $fields[0] = $pline;
+ }
print join ' ||| ', @fields;
print "\n";
}