summary | refs | log | tree | commit | diff
path: root/gi/pipeline/scripts/patch-corpus.pl
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-09 18:54:51 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-09 18:54:51 +0000
commit 06a2d0d09edeeea30e7d89b5aafdfe911b89c279 (patch)
tree 27609b95b977ee1eff2a906e3c2a20025bd6dff0 /gi/pipeline/scripts/patch-corpus.pl
parent a1e883e6c2d67549f46e2c26c6d1deff86bf34e0 (diff)
support doing clustering on tagged corpora
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@208 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts/patch-corpus.pl')
-rwxr-xr-x gi/pipeline/scripts/patch-corpus.pl 34
1 files changed, 34 insertions, 0 deletions
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
new file mode 100755
index 00000000..2b181837
--- /dev/null
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+# Replace the 2nd " ||| " field of each lexical-corpus line (read via <>)
+# with the same-numbered line of the tagged corpus named on the command line.
+use strict;
+
+my $PATCH = shift @ARGV;
+die "Usage: $0 tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+
+# 3-arg open + lexical handle: the file name can no longer inject an open mode.
+open my $patch_fh, '<', $PATCH or die "Can't read tagged corpus $PATCH: $!";
+my $first = <$patch_fh>;
+die "Tagged corpus $PATCH is empty!\n" unless defined $first;
+my @fields = split / \|\|\| /, $first;
+die "Bad format!" if (scalar @fields > 2);
+# TODO support this
+die "Patching source and target not supported yet!" if (scalar @fields != 1);
+
+my $lineno = 0;  # 1-based line counter; kept separate from the line text
+# Start from the line already read instead of reopening the file.
+for (my $pline = $first; defined $pline; $pline = <$patch_fh>) {
+ chomp $pline;
+ $lineno++;
+ my $lex = <>;
+ die "Too few lines in lexical corpus!" unless defined $lex;  # "0" is valid
+ chomp $lex;
+ @fields = split / \|\|\| /, $lex;
+ my @pwords = split ' ', $pline;  # ' ' also skips leading whitespace
+ # split drops a trailing empty field, so an empty 2nd field arrives as undef.
+ my @lwords = split ' ', (defined $fields[1] ? $fields[1] : '');
+ die "Length mismatch in line $lineno!\n" unless (scalar @pwords == scalar @lwords);
+ $fields[1] = $pline;
+ print join(' ||| ', @fields), "\n";
+}
+close $patch_fh;