summary refs log tree commit diff
path: root/corpus
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-11-06 00:02:58 -0500
committer Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-11-06 00:02:58 -0500
commit d8d8dad6e63306b3c8e326b22fcfdf90856bb85e (patch)
tree b4dc354bdb2080e110a8f9067d108cbacab5b3b8 /corpus
parent 552793bbd50f634ea755b84d47ddcc6cd4f158f2 (diff)
parent 9e5107a05bfabb76ce547d2849173c5a11aeba60 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/add-sos-eos.pl 24
1 files changed, 24 insertions, 0 deletions
diff --git a/corpus/add-sos-eos.pl b/corpus/add-sos-eos.pl
new file mode 100755
index 00000000..5e2d44cb
--- /dev/null
+++ b/corpus/add-sos-eos.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Wrap both sides of each "foreign ||| target ||| alignments" line in <s> ... </s>
+# and shift every alignment point by one to make room for the leading <s>.
+while (my $line = <>) {
+  chomp $line;
+  my @fields = split / \|\|\| /, $line, -1;  # -1 keeps an empty alignment field
+  die "Expected: foreign ||| target ||| alignments" unless scalar @fields == 3;
+  my ($ff, $ee, $aa) = @fields;
+  # split ' ' ignores leading whitespace, so these are true token lists.
+  my @fs = split ' ', $ff;
+  my @es = split ' ', $ee;
+  my @as = split ' ', $aa;
+  my @oas = ('0-0');  # <s> aligns to <s>
+  for my $ap (@as) {
+    # $fi/$ei instead of $a/$b, which are reserved for sort blocks.
+    my ($fi, $ei) = $ap =~ /\A(\d+)-(\d+)\z/ or die "Bad format in: @as";
+    push @oas, ($fi + 1) . '-' . ($ei + 1);
+  }
+  push @oas, (scalar(@fs) + 1) . '-' . (scalar(@es) + 1);  # </s> aligns to </s>
+  print "<s> $ff </s> ||| <s> $ee </s> ||| @oas\n";
+}
+