diff options
| author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-11-05 21:34:14 -0500 | 
|---|---|---|
| committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-11-05 21:34:14 -0500 | 
| commit | 782fb27af98ed98256cc25c832131c59c8e9ce9c (patch) | |
| tree | 1438be0409a5afe6ce0fde9603c9fe978d50efd2 /corpus | |
| parent | 438f6811342f26c07a6af3bcd413ece7c35ca903 (diff) | |
script to add sos/eos
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/add-sos-eos.pl | 24 | 
1 files changed, 24 insertions, 0 deletions
#!/usr/bin/perl
# add-sos-eos.pl: wrap both sides of a word-aligned sentence pair in
# sentence-boundary markers.
#
# Input (files named on the command line, or STDIN), one pair per line:
#     foreign ||| target ||| alignments
# where alignments is a whitespace-separated list of I-J index pairs.
#
# Output (STDOUT), one line per input line:
#     <s> foreign </s> ||| <s> target </s> ||| new-alignments
# Every I-J link becomes (I+1)-(J+1) because <s> now occupies position 0,
# a 0-0 link is prepended for <s>, and a (F+1)-(E+1) link is appended for
# </s>, where F and E are the token counts of the foreign and target sides.
use strict;
use warnings;

# Transform one chomped input line and return the output line (no newline).
# Dies if the line does not consist of exactly three ' ||| '-separated
# fields, or if an alignment link is not of the form <digits>-<digits>.
sub add_sos_eos {
    my ($line) = @_;

    # LIMIT -1 keeps a trailing empty field, so a pair that has no alignment
    # links ("f ||| e ||| ") still yields three fields instead of two.
    my @fields = split / \|\|\| /, $line, -1;
    die "Expected: foreign ||| target ||| alignments" unless @fields == 3;
    my ($foreign, $target, $alignment) = @fields;

    # split ' ' skips leading whitespace; split /\s+/ would emit an empty
    # first token there and inflate the counts used for the </s> link.
    my @foreign_tokens = split ' ', $foreign;
    my @target_tokens  = split ' ', $target;
    my @links          = split ' ', $alignment;

    my @shifted = ('0-0');    # <s> aligns to <s>
    for my $link (@links) {
        # Names other than $a/$b: those are reserved for sort comparators.
        my ($f_idx, $e_idx) = $link =~ /\A([0-9]+)-([0-9]+)\z/
            or die "Bad format in: @links";
        push @shifted, ($f_idx + 1) . '-' . ($e_idx + 1);
    }

    # </s> sits one past the last shifted token on each side.
    push @shifted,
        (scalar(@foreign_tokens) + 1) . '-' . (scalar(@target_tokens) + 1);

    return "<s> $foreign </s> ||| <s> $target </s> ||| @shifted";
}

# Read every input line from <> and print its transformed form.
sub run {
    while (my $line = <>) {
        chomp $line;
        print add_sos_eos($line), "\n";
    }
    return;
}

# Run only when executed as a script, not when loaded via require/do.
run() unless caller;
