From 17f5ee803b38a128f9022fff5ee658138f62a0e1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 12 Nov 2015 13:42:29 +0100 Subject: add moses' truecaser --- detruecase.perl | 88 ++++++++++++++++++++++++++++++++++++++++ train-truecaser.perl | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++ truecase.perl | 104 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 304 insertions(+) create mode 100755 detruecase.perl create mode 100755 train-truecaser.perl create mode 100755 truecase.perl diff --git a/detruecase.perl b/detruecase.perl new file mode 100755 index 0000000..012c143 --- /dev/null +++ b/detruecase.perl @@ -0,0 +1,88 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long "GetOptions"; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +my ($SRC,$INFILE,$UNBUFFERED); +die("detruecase.perl < in > out") + unless &GetOptions('headline=s' => \$SRC, + 'in=s' => \$INFILE, + 'b|unbuffered' => \$UNBUFFERED); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } + +my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); +my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); + +# lowercase even in headline +my %ALWAYS_LOWER; +foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; } + +# find out about the headlines +my @HEADLINE; +if (defined($SRC)) { + open(SRC,$SRC); + my $headline_flag = 0; + while() { + $headline_flag = 1 if //; + $headline_flag = 0 if /<.hl>/; + next unless /^) { + &process($_,$sentence++); + } + close(IN); +} +else { + while() { + &process($_,$sentence++); + } +} + +sub process { + my $line = $_[0]; + chomp($line); + $line =~ s/^\s+//; + $line =~ s/\s+$//; + my @WORD = split(/\s+/,$line); + + # uppercase at sentence start + my $sentence_start = 1; + for(my $i=0;$i \$CORPUS, + 'model=s' => \$MODEL, + 'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0)) + && defined($CORPUS) && defined($MODEL); +my %CASING; +my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); +my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); +open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'"); +binmode(CORPUS, ":utf8"); +while() { + chop; + my ($WORD,$MARKUP) = split_xml($_); + my $start = 0; + while($start<=$#$WORD && defined($DELAYED_SENTENCE_START{$$WORD[$start]})) { $start++; } + my $firstWordOfSentence = 1; + for(my $i=$start;$i<=$#$WORD;$i++) { + my $currentWord = $$WORD[$i]; + if (! $firstWordOfSentence && defined($SENTENCE_END{$$WORD[$i-1]})) { + $firstWordOfSentence = 1; + } + + my $currentWordWeight = 0; + if (! $firstWordOfSentence) { + $currentWordWeight = 1; + } elsif ($possiblyUseFirstToken) { + # gated special handling of first word of sentence + my $firstChar = substr($currentWord, 0, 1); + if (lc($firstChar) eq $firstChar) { + # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial) + $currentWordWeight = 1; + } elsif (scalar(@$WORD) == 1) { + # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case) + $currentWordWeight = 0.1; + } + } + if ($currentWordWeight > 0) { + $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight; + } + + $firstWordOfSentence = 0; + } +} +close(CORPUS); + +open(MODEL,">$MODEL") || die("ERROR: could not create '$MODEL'"); +binmode(MODEL, ":utf8"); +foreach my $type (keys %CASING) { + my ($score,$total,$best) = (-1,0,""); + foreach my $word (keys %{$CASING{$type}}) { + my $count = $CASING{$type}{$word}; + $total += $count; + if ($count > $score) { + $best = $word; + $score = $count; + } + } + print MODEL "$best ($score/$total)"; + foreach my $word (keys %{$CASING{$type}}) { + print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best; + } + print MODEL "\n"; +} +close(MODEL); + + +# store away xml markup +sub split_xml { + my ($line) = @_; + my (@WORD,@MARKUP); + my $i = 0; + $MARKUP[0] = ""; + while($line =~ /\S/) { + # XML tag + if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { + $MARKUP[$i] .= $1." "; + $line = $2; + } + # non-XML text + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + else { + die("ERROR: huh? $line\n"); + } + } + chop($MARKUP[$#MARKUP]); + return (\@WORD,\@MARKUP); +} diff --git a/truecase.perl b/truecase.perl new file mode 100755 index 0000000..0a4d366 --- /dev/null +++ b/truecase.perl @@ -0,0 +1,104 @@ +#!/usr/bin/perl -w + +# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ +use strict; +use Getopt::Long "GetOptions"; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +# apply switches +my ($MODEL, $UNBUFFERED); +die("truecase.perl --model MODEL [-b] < in > out") + unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) + && defined($MODEL); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } + +my (%BEST,%KNOWN); +open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); +binmode(MODEL, ":utf8"); +while() { + my ($word,@OPTIONS) = split; + $BEST{ lc($word) } = $word; + $KNOWN{ $word } = 1; + for(my $i=1;$i<$#OPTIONS;$i+=2) { + $KNOWN{ $OPTIONS[$i] } = 1; + } +} +close(MODEL); + +my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); +my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); + +while() { + chop; + my ($WORD,$MARKUP) = split_xml($_); + my $sentence_start = 1; + for(my $i=0;$i<=$#$WORD;$i++) { + print " " if $i && $$MARKUP[$i] eq ''; + print $$MARKUP[$i]; + + my ($word,$otherfactors); + if ($$WORD[$i] =~ /^([^\|]+)(.*)/) + { + $word = $1; + $otherfactors = $2; + } + else + { + $word = $$WORD[$i]; + $otherfactors = ""; + } + + if ($sentence_start && defined($BEST{lc($word)})) { + print $BEST{lc($word)}; # truecase sentence start + } + elsif (defined($KNOWN{$word})) { + print $word; # don't change known words + } + elsif (defined($BEST{lc($word)})) { + print $BEST{lc($word)}; # truecase otherwise unknown words + } + else { + print $word; # unknown, nothing to do + } + print $otherfactors; + + if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } + elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } + } + print $$MARKUP[$#$MARKUP]; + print "\n"; +} + +# store away xml markup +sub split_xml { + my ($line) = @_; + my (@WORD,@MARKUP); + my $i = 0; + $MARKUP[0] = ""; + while($line =~ /\S/) { + # XML tag + if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { + $MARKUP[$i] .= $1." "; + $line = $2; + } + # non-XML text + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + else { + die("ERROR: huh? $line\n"); + } + } + chop($MARKUP[$#MARKUP]); + return (\@WORD,\@MARKUP); +} -- cgit v1.2.3