diff options
author | Patrick Simianer <p@simianer.de> | 2015-11-12 13:42:29 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-11-12 13:42:29 +0100 |
commit | 17f5ee803b38a128f9022fff5ee658138f62a0e1 (patch) | |
tree | 9c6ba0e9e593c544903fc722d26fdae74780446f /detruecase.perl | |
parent | d9896c2d4b6f4af0159fc7b16c9c2cedac4826d2 (diff) |
add moses' truecaser
Diffstat (limited to 'detruecase.perl')
-rwxr-xr-x | detruecase.perl | 88 |
1 files changed, 88 insertions, 0 deletions
#!/usr/bin/perl -w

# detruecase.perl: restore capitalization on tokenized text, one sentence
# per line. The first word of every sentence is capitalized; when a headline
# source file is given, every word of a headline line is capitalized except
# for a fixed list of function words.
#
# Usage: detruecase.perl [--headline SRC] [--in FILE] [-b] < in > out
#   --headline SRC    file scanned for <hl> markers and lines starting with
#                     "<seg"; input line N is treated as a headline when the
#                     N-th "<seg" line of SRC was seen inside an <hl> region
#   --in FILE         read from FILE instead of STDIN
#   -b, --unbuffered  turn on autoflush for STDOUT

use strict;
use warnings;
use Getopt::Long "GetOptions";

# Tokens that end a sentence: the token following one is capitalized.
my %SENTENCE_END = map { $_ => 1 } (".", ":", "?", "!");

# Tokens that may stand in front of the first word of a sentence without
# consuming the "sentence start" state (opening brackets and quotes).
# NOTE(review): the entity-escaped forms (&quot; &apos; &#91; &#93;) are
# presumably what the upstream tokenizer emits for quotes and square
# brackets -- confirm against the upstream Moses script.
my %DELAYED_SENTENCE_START = map { $_ => 1 }
    ("(", "[", "\"", "'", "&quot;", "&apos;", "&#91;", "&#93;");

# lowercase even in headline
# The entries are regular-expression fragments ("al-.+" and "el-.+" are
# patterns), so they are compiled into one anchored alternation instead of
# being used as literal hash keys, which could never match a real token.
my @ALWAYS_LOWER = ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with");
my $ALWAYS_LOWER_RE = do {
    my $alternatives = join("|", @ALWAYS_LOWER);
    qr/\A(?:$alternatives)\z/;
};

# Path given with --headline (undef when headline handling is off) and the
# per-segment headline flags read from it; both are filled in by main().
my $SRC;
my @HEADLINE;

# Run the command-line driver only when executed as a script, so the subs
# below can be loaded (e.g. by a test) without reading STDIN.
main() unless caller;

# Command-line entry point: parses options, loads the headline flags and
# streams every input line through process().
sub main {
    binmode(STDIN, ":utf8");
    binmode(STDOUT, ":utf8");

    my ($INFILE, $UNBUFFERED);
    die("detruecase.perl < in > out")
        unless GetOptions('headline=s' => \$SRC,
                          'in=s' => \$INFILE,
                          'b|unbuffered' => \$UNBUFFERED);
    $| = 1 if $UNBUFFERED;

    # find out about the headlines
    @HEADLINE = read_headline_flags($SRC) if defined($SRC);

    my $sentence = 0;
    # "-" keeps meaning STDIN, as it did with the former two-argument open.
    if ($INFILE && $INFILE ne "-") {
        open(my $in, "<:utf8", $INFILE)
            or die("ERROR: could not open file '$INFILE'");
        while (my $line = <$in>) {
            process($line, $sentence++);
        }
        close($in);
    }
    else {
        while (my $line = <STDIN>) {
            process($line, $sentence++);
        }
    }
    return;
}

# Reads the headline source file and returns one flag per line that starts
# with "<seg": 1 if the most recent marker seen was <hl>, 0 otherwise.
# Dies if the file cannot be opened.
sub read_headline_flags {
    my ($path) = @_;
    open(my $src, "<", $path)
        or die("ERROR: could not open file '$path'");
    my @flags;
    my $in_headline = 0;
    while (my $line = <$src>) {
        $in_headline = 1 if $line =~ /<hl>/;
        $in_headline = 0 if $line =~ /<.hl>/;
        next unless $line =~ /^<seg/;
        push @flags, $in_headline;
    }
    close($src);
    return @flags;
}

# Detruecases one line and prints it to the currently selected output handle,
# followed by a newline.
#   $_[0]  the input line (trailing newline and surrounding blanks allowed)
#   $_[1]  0-based index of the line, used to look up its headline flag
# The index comes from the caller, which owns the counter; this sub must not
# advance it, otherwise the headline lookup drifts away from the input line.
sub process {
    my ($line, $index) = @_;
    my $is_headline = defined($SRC) && defined($index) && $HEADLINE[$index];
    print detruecase_line($line, $is_headline), "\n";
    return;
}

# Returns the detruecased form of one line as a single-space-joined string
# (no trailing newline).
#   $line         the input line
#   $is_headline  true to additionally capitalize every word that does not
#                 match the always-lowercase list
sub detruecase_line {
    my ($line, $is_headline) = @_;
    chomp($line);
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my @WORD = split(/\s+/, $line);

    # uppercase at sentence start
    my $sentence_start = 1;
    foreach my $word (@WORD) {
        uppercase(\$word) if $sentence_start;
        if (defined($SENTENCE_END{$word})) {
            $sentence_start = 1;
        }
        elsif (!defined($DELAYED_SENTENCE_START{$word})) {
            # Any ordinary token ends the "sentence start" state; opening
            # quotes and brackets leave it untouched.
            $sentence_start = 0;
        }
    }

    # uppercase headlines
    if ($is_headline) {
        foreach my $word (@WORD) {
            uppercase(\$word) unless $word =~ $ALWAYS_LOWER_RE;
        }
    }

    return join(" ", @WORD);
}

# Uppercases the first character of the string referenced by $_[0], in place.
sub uppercase {
    my ($W) = @_;
    $$W = uc(substr($$W,0,1)).substr($$W,1);
}

1;