diff options
| author | Patrick Simianer <p@simianer.de> | 2015-11-12 13:42:29 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2015-11-12 13:42:29 +0100 | 
| commit | 17f5ee803b38a128f9022fff5ee658138f62a0e1 (patch) | |
| tree | 9c6ba0e9e593c544903fc722d26fdae74780446f /detruecase.perl | |
| parent | d9896c2d4b6f4af0159fc7b16c9c2cedac4826d2 (diff) | |
add moses' truecaser
Diffstat (limited to 'detruecase.perl')
| -rwxr-xr-x | detruecase.perl | 88 | 
1 files changed, 88 insertions, 0 deletions
#!/usr/bin/perl -w

# detruecase.perl -- re-capitalize tokenized, truecased text.
#
# Usage: detruecase.perl [--headline SRC] [--in FILE] [-b|--unbuffered] < in > out
#
#   --headline SRC    file scanned for <hl> ... </hl> markers; every line in it
#                     that starts with "<seg" contributes one headline flag, and
#                     the N-th flag applies to the N-th input sentence
#   --in FILE         read sentences from FILE instead of STDIN ("-" = STDIN)
#   -b, --unbuffered  flush STDOUT after every write
#
# For every input line the first word, and each word that follows a
# sentence-ending token, gets its first character uppercased.  Lines flagged
# as headlines additionally get every word capitalized, except words matching
# the ALWAYS_LOWER patterns.

use strict;
use warnings;
use Getopt::Long "GetOptions";

my ($SRC,$INFILE,$UNBUFFERED);

# One entry per "<seg" line of the --headline file: true if inside <hl>.
my @HEADLINE;

# Tokens after which the next word starts a new sentence.
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);

# Tokens that do not consume a pending sentence start: the word following
# them is still capitalized.  The entity spellings cover input in which
# quotes and brackets have been escaped.
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,
                              "&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);

# lowercase even in headline
# The entries are regular-expression fragments ("al-.+" matches any word with
# the prefix "al-"), so they are compiled into one anchored pattern rather
# than being looked up as literal hash keys.
my @ALWAYS_LOWER = ("a","after","against","al-.+","and","any","as","at","be",
                    "because","between","by","during","el-.+","for","from",
                    "his","in","is","its","last","not","of","off","on","than",
                    "the","their","this","to","was","were","which","will","with");
my $ALWAYS_LOWER_RE = do {
    my $alternatives = join('|', @ALWAYS_LOWER);
    qr/\A(?:$alternatives)\z/;
};

# Scan the headline source file and return one flag per "<seg" line:
# 1 if the line lies at or after a <hl> marker that has not been closed, else 0.
# Dies if the file cannot be opened.
sub read_headline_flags {
    my ($src_file) = @_;
    open(my $src, '<', $src_file)
        or die("ERROR: could not open file '$src_file': $!");
    my @flags;
    my $headline_flag = 0;
    while (my $src_line = <$src>) {
        $headline_flag = 1 if $src_line =~ /<hl>/;
        $headline_flag = 0 if $src_line =~ /<.hl>/;
        next unless $src_line =~ /^<seg/;
        push @flags, $headline_flag;
    }
    close($src);
    return @flags;
}

# Return the detruecased form of one line (without trailing newline).
#   $line        - raw input line; surrounding whitespace is stripped and the
#                  remaining text is split on whitespace
#   $is_headline - true to capitalize every word not matching ALWAYS_LOWER
# Words are joined back with single spaces.
sub detruecase_line {
    my ($line, $is_headline) = @_;
    chomp($line);
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my @words = split(/\s+/, $line);

    # uppercase at sentence start
    my $sentence_start = 1;
    for my $word (@words) {    # $word aliases the array element
        uppercase(\$word) if $sentence_start;
        if (defined($SENTENCE_END{$word})) {
            $sentence_start = 1;
        }
        elsif (!defined($DELAYED_SENTENCE_START{$word})) {
            $sentence_start = 0;
        }
        # otherwise: delayed-start token, keep the pending state unchanged
    }

    # uppercase headlines
    if ($is_headline) {
        for my $word (@words) {
            uppercase(\$word) unless $word =~ $ALWAYS_LOWER_RE;
        }
    }

    return join(" ", @words);
}

# Detruecase one input line and print it to the selected output handle.
#   $line  - raw input line
#   $index - zero-based sentence number, used to look up the headline flag
# The index comes from the caller; this sub does not touch any counter, so
# sentence N is always checked against headline flag N.
sub process {
    my ($line, $index) = @_;
    my $is_headline = defined($SRC) && $HEADLINE[$index];
    print detruecase_line($line, $is_headline), "\n";
}

# Uppercase the first character of the string referenced by $W, in place.
sub uppercase {
    my ($W) = @_;
    $$W = uc(substr($$W,0,1)).substr($$W,1);
}

# Parse options, load headline flags, and run every input line through process().
sub main {
    binmode(STDIN, ":utf8");
    binmode(STDOUT, ":utf8");

    die("detruecase.perl < in > out")
        unless GetOptions('headline=s' => \$SRC,
                          'in=s' => \$INFILE,
                          'b|unbuffered' => \$UNBUFFERED);
    if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }

    # find out about the headlines
    @HEADLINE = read_headline_flags($SRC) if defined($SRC);

    my $sentence = 0;
    # "-" keeps meaning STDIN, as it did with the former two-argument open.
    if (defined($INFILE) && $INFILE ne '' && $INFILE ne '-') {
        open(my $in, '<', $INFILE)
            or die("ERROR: could not open file '$INFILE'");
        binmode($in, ":utf8");
        while (my $line = <$in>) {
            process($line, $sentence++);
        }
        close($in);
    }
    else {
        while (my $line = <STDIN>) {
            process($line, $sentence++);
        }
    }
}

# Run only when executed as a script, so the subs can be loaded for testing.
main() unless caller;

1;
