diff options
author | Patrick Simianer <p@simianer.de> | 2018-04-17 15:43:17 +0000 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2018-04-17 15:43:17 +0000 |
commit | f44539de04b178f3a1b14960407ec683030f927a (patch) | |
tree | 08a6a0c50454e57b8c36e3438a5569ab153aa258 /detruecase.perl | |
parent | 641e80a4ad7bff2bb0cae447cc39da0eccc662dd (diff) | |
parent | e86f8f5139196bc99a55797c255401a0d6a86214 (diff) |
Merge branch 'master' of https://github.com/pks/nlp_scripts
Diffstat (limited to 'detruecase.perl')
-rwxr-xr-x | detruecase.perl | 88 |
1 files changed, 0 insertions, 88 deletions
#!/usr/bin/perl -w

# detruecase.perl -- re-capitalise truecased, tokenised text.
#
# Reads one whitespace-tokenised sentence per line and prints it back with
#   * the first word of every sentence capitalised, and
#   * (optionally) headline-style capitalisation for segments that the
#     --headline source file places between <hl> and </hl>.
#
# Usage:  detruecase.perl [--headline SRC] [--in FILE] [-b|--unbuffered] < in > out
#
# Provenance: this is the script removed by the deletion diff
#   diff --git a/detruecase.perl b/detruecase.perl
#   deleted file mode 100755, index 012c143..0000000, hunk @@ -1,88 +0,0 @@
# laid out again one statement per line, with the fixes described inline.

use strict;
use warnings;
use Getopt::Long "GetOptions";

# Tokens that end a sentence: the following word starts a new one.
my %SENTENCE_END = map { $_ => 1 } (".", ":", "?", "!");

# Tokens that may sit between a sentence boundary and the first real word
# without "using up" the pending capitalisation.
# NOTE(review): the rendered diff showed the last four keys as bare quote and
# bracket characters (`"""=>1`, which is not valid Perl, plus duplicate keys).
# They are presumably the HTML-entity tokens below, decoded by the page
# renderer -- confirm against the original blob 012c143.  The literal
# characters are kept as well, so nothing the diff showed is lost.
my %DELAYED_SENTENCE_START = map { $_ => 1 } (
    "(", "[", "]", "\"", "'",
    "&quot;", "&apos;", "&#91;", "&#93;",
);

# Words that stay lowercase even in a headline.  Some entries ("al-.+",
# "el-.+") are patterns, so the list is compiled into one anchored regex;
# an exact hash lookup could never match those entries.
my $ALWAYS_LOWER_RE = do {
    my $alternatives = join '|', qw(
        a after against al-.+ and any as at be because between by during
        el-.+ for from his in is its last not of off on than the their
        this to was were which will with
    );
    qr/\A(?:$alternatives)\z/;
};

# One flag per <seg> line of the --headline file: true when that segment lies
# inside a headline.  Filled by main(), read by process().
my @HEADLINE;

# Run the command-line driver only when executed as a script, so the
# subroutines can be loaded (e.g. by a test) without consuming STDIN.
main() unless caller;

# Parse options, load the headline flags and detruecase every input line.
sub main {
    binmode(STDIN,  ":utf8");
    binmode(STDOUT, ":utf8");

    my ($SRC, $INFILE, $UNBUFFERED);
    die("detruecase.perl < in > out")
        unless GetOptions('headline=s'   => \$SRC,
                          'in=s'         => \$INFILE,
                          'b|unbuffered' => \$UNBUFFERED);
    $| = 1 if $UNBUFFERED;

    @HEADLINE = read_headline_flags($SRC) if defined $SRC;

    # Read from --in when given, otherwise from STDIN.
    my $in;
    if ($INFILE) {
        open($in, '<', $INFILE) or die("ERROR: could not open file '$INFILE'");
        binmode($in, ":utf8");
    }
    else {
        $in = \*STDIN;
    }

    my $sentence = 0;
    while (my $line = <$in>) {
        process($line, $sentence++);
    }
    close($in) if $INFILE;
    return;
}

# Scan the headline source file and return one flag per line starting with
# "<seg": 1 if the most recent headline marker seen was <hl>, 0 otherwise.
# Dies if the file cannot be opened (previously the failure was ignored).
sub read_headline_flags {
    my ($path) = @_;
    open(my $src, '<', $path)
        or die("ERROR: could not open file '$path': $!");

    my @flags;
    my $in_headline = 0;
    while (my $line = <$src>) {
        $in_headline = 1 if $line =~ /<hl>/;
        $in_headline = 0 if $line =~ /<.hl>/;
        next unless $line =~ /^<seg/;
        push @flags, $in_headline;
    }
    close($src);
    return @flags;
}

# Detruecase one input line and print it followed by a newline.
#   $_[0]  raw input line
#   $_[1]  zero-based sentence index used to look up the headline flag
# The index argument is authoritative.  The earlier code ignored it, read a
# global that the caller had already post-incremented, and incremented it
# again, so headline flags were looked up at indices 1, 3, 5, ...
sub process {
    my ($line, $index) = @_;
    my $is_headline = defined($index) && $HEADLINE[$index];
    print detruecase_line($line, $is_headline), "\n";
    return;
}

# Return the detruecased form of $line (no trailing newline).
#   $line         one tokenised sentence; surrounding whitespace is dropped
#   $is_headline  true to also capitalise every word not matched by
#                 $ALWAYS_LOWER_RE
sub detruecase_line {
    my ($line, $is_headline) = @_;
    chomp($line);
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my @words = split(/\s+/, $line);

    # Uppercase at sentence start.  The loop variable aliases the array
    # element, so uppercase() edits @words in place.
    my $sentence_start = 1;
    for my $word (@words) {
        uppercase(\$word) if $sentence_start;
        if (exists $SENTENCE_END{$word}) {
            $sentence_start = 1;
        }
        elsif (!exists $DELAYED_SENTENCE_START{$word}) {
            $sentence_start = 0;
        }
    }

    # Uppercase headlines, except for the always-lowercase words.
    if ($is_headline) {
        for my $word (@words) {
            uppercase(\$word) unless $word =~ $ALWAYS_LOWER_RE;
        }
    }

    return join(" ", @words);
}

# Capitalise, in place, the first character of the string referenced by the
# argument, and return the new string.  ucfirst is safe on an empty string
# and uses the title-case form of the first character.
sub uppercase {
    my ($W) = @_;
    return $$W = ucfirst($$W);
}