diff options
author | Patrick Simianer <p@simianer.de> | 2015-11-12 13:42:29 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-11-12 13:42:29 +0100 |
commit | 17f5ee803b38a128f9022fff5ee658138f62a0e1 (patch) | |
tree | 9c6ba0e9e593c544903fc722d26fdae74780446f /truecase.perl | |
parent | d9896c2d4b6f4af0159fc7b16c9c2cedac4826d2 (diff) |
add moses' truecaser
Diffstat (limited to 'truecase.perl')
-rwxr-xr-x | truecase.perl | 104 |
1 files changed, 104 insertions, 0 deletions
diff --git a/truecase.perl b/truecase.perl new file mode 100755 index 0000000..0a4d366 --- /dev/null +++ b/truecase.perl @@ -0,0 +1,104 @@ +#!/usr/bin/perl -w + +# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ +use strict; +use Getopt::Long "GetOptions"; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +# apply switches +my ($MODEL, $UNBUFFERED); +die("truecase.perl --model MODEL [-b] < in > out") + unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) + && defined($MODEL); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } + +my (%BEST,%KNOWN); +open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); +binmode(MODEL, ":utf8"); +while(<MODEL>) { + my ($word,@OPTIONS) = split; + $BEST{ lc($word) } = $word; + $KNOWN{ $word } = 1; + for(my $i=1;$i<$#OPTIONS;$i+=2) { + $KNOWN{ $OPTIONS[$i] } = 1; + } +} +close(MODEL); + +my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); +my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); + +while(<STDIN>) { + chop; + my ($WORD,$MARKUP) = split_xml($_); + my $sentence_start = 1; + for(my $i=0;$i<=$#$WORD;$i++) { + print " " if $i && $$MARKUP[$i] eq ''; + print $$MARKUP[$i]; + + my ($word,$otherfactors); + if ($$WORD[$i] =~ /^([^\|]+)(.*)/) + { + $word = $1; + $otherfactors = $2; + } + else + { + $word = $$WORD[$i]; + $otherfactors = ""; + } + + if ($sentence_start && defined($BEST{lc($word)})) { + print $BEST{lc($word)}; # truecase sentence start + } + elsif (defined($KNOWN{$word})) { + print $word; # don't change known words + } + elsif (defined($BEST{lc($word)})) { + print $BEST{lc($word)}; # truecase otherwise unknown words + } + else { + print $word; # unknown, nothing to do + } + print $otherfactors; + + if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } + elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } + } + print $$MARKUP[$#$MARKUP]; + print "\n"; +} + +# store away xml markup +sub split_xml { + my ($line) = @_; + my (@WORD,@MARKUP); + my $i = 0; + $MARKUP[0] = ""; + while($line =~ /\S/) { + # XML tag + if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { + $MARKUP[$i] .= $1." "; + $line = $2; + } + # non-XML text + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + else { + die("ERROR: huh? $line\n"); + } + } + chop($MARKUP[$#MARKUP]); + return (\@WORD,\@MARKUP); +} |