author | Patrick Simianer <pks@pks.rocks> | 2018-04-11 15:21:44 +0200
---|---|---
committer | Patrick Simianer <pks@pks.rocks> | 2018-04-11 15:21:44 +0200
commit | 26c944f3bf8e97e35d9662d8672c9527fb13c45f (patch)
tree | 35f4d2c27e8bdc72e861bb92a3b1294df1ed2785
parent | 31a3b846e2ae174fb68b61a9e9070e32c11509a6 (diff)
rm
Mode | File | Deletions
---|---|---
-rwxr-xr-x | compound-splitter.perl | 291
-rwxr-xr-x | detruecase.perl | 88
-rwxr-xr-x | lowercase.perl | 10
-rwxr-xr-x | multi-bleu.perl | 174
-rwxr-xr-x | tokenizer-no-escape.perl | 348
-rwxr-xr-x | train-truecaser.perl | 112
-rwxr-xr-x | truecase.perl | 104

7 files changed, 0 insertions(+), 1127 deletions(-)
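The deleted files are local copies of the standard Moses support scripts: preprocessing helpers (tokenizer-no-escape.perl, lowercase.perl, train-truecaser.perl, truecase.perl, detruecase.perl, compound-splitter.perl) and the multi-bleu.perl evaluation script. As a pointer to what multi-bleu.perl computed, here is a minimal Perl sketch of its final aggregation step — geometric mean of the modified 1–4-gram precisions times a brevity penalty. The count values are invented for illustration, and the per-sentence clipped-count bookkeeping of the full script is omitted.

```perl
#!/usr/bin/perl -w
use strict;

# Corpus-level BLEU from already-aggregated counts, mirroring the final step
# of the removed multi-bleu.perl: geometric mean of the modified n-gram
# precisions (n = 1..4) times a brevity penalty.
# The counts below are invented example numbers, not output of any real run.
my @CORRECT = (undef, 900, 520, 310, 190);   # clipped n-gram matches per order
my @TOTAL   = (undef, 1000, 950, 900, 850);  # hypothesis n-grams per order
my ($hyp_len, $ref_len) = (1000, 1050);

my $log_sum = 0;
for my $n (1 .. 4) {
    my $p = $TOTAL[$n] ? $CORRECT[$n] / $TOTAL[$n] : 0;
    # multi-bleu.perl maps log(0) to a large negative constant; same here
    $log_sum += $p > 0 ? log($p) : -9999999999;
}
my $bp   = $hyp_len < $ref_len ? exp(1 - $ref_len / $hyp_len) : 1;
my $bleu = $bp * exp($log_sum / 4);
printf "BLEU = %.2f (BP=%.3f, ratio=%.3f)\n",
    100 * $bleu, $bp, $hyp_len / $ref_len;
```

The removed script reports the same quantities in its `BLEU = ..., .../.../.../... (BP=..., ratio=..., hyp_len=..., ref_len=...)` output line.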
diff --git a/compound-splitter.perl b/compound-splitter.perl deleted file mode 100755 index beca70e..0000000 --- a/compound-splitter.perl +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long "GetOptions"; - -my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE); -my $FILLER = ":s:es"; -my $MIN_SIZE = 3; -my $MIN_COUNT = 5; -my $MAX_COUNT = 5; -my $FACTORED = 0; -my $SYNTAX = 0; -my $MARK_SPLIT = 0; -my $BINARIZE = 0; -$HELP = 1 - unless &GetOptions('corpus=s' => \$CORPUS, - 'model=s' => \$MODEL, - 'filler=s' => \$FILLER, - 'factored' => \$FACTORED, - 'min-size=i' => \$MIN_SIZE, - 'min-count=i' => \$MIN_COUNT, - 'max-count=i' => \$MAX_COUNT, - 'help' => \$HELP, - 'verbose' => \$VERBOSE, - 'syntax' => \$SYNTAX, - 'binarize' => \$BINARIZE, - 'mark-split' => \$MARK_SPLIT, - 'train' => \$TRAIN); - -if ($HELP || - ( $TRAIN && !$CORPUS) || - (!$TRAIN && !$MODEL)) { - print "Compound splitter\n"; - print "-----------------\n\n"; - print "train: compound-splitter -train -corpus txt-file -model new-model\n"; - print "apply: compound-splitter -model trained-model < in > out\n"; - print "options: -min-size: minimum word size (default $MIN_SIZE)\n"; - print " -min-count: minimum word count (default $MIN_COUNT)\n"; - print " -filler: filler letters between words (default $FILLER)\n"; - print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n"; - print " -syntax: syntactically parsed data (default $SYNTAX)\n"; - print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n"; - print " -binarize: binarize subtree for split word (default $BINARIZE)\n"; - exit; -} - -if ($TRAIN) { - if ($SYNTAX) { &train_syntax(); } - elsif ($FACTORED) { &train_factored(); } - else { &train(); } -} -else { - &apply(); -} - -sub train { - my %COUNT; - open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); - while(<CORPUS>) { - chop; s/\s+/ /g; s/^ //; s/ $//; - foreach (split) { - $COUNT{$_}++; - } - } - close(CORPUS); - &save_trained_model(\%COUNT); -} - -sub save_trained_model { - my ($COUNT) = @_; - my $id = 0; - open(MODEL,">".$MODEL); - foreach my $word (keys %$COUNT) { - print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n"; - } - close(MODEL); - print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n"; -} - -sub train_factored { - my (%COUNT,%FACTORED_COUNT); - # collect counts for interpretations for each surface word - open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); - while(<CORPUS>) { - chop; s/\s+/ /g; s/^ //; s/ $//; - foreach my $factored_word (split) { - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - $FACTORED_COUNT{$word}{$factored_word}++; - } - } - close(CORPUS); - # only preserve most frequent interpretation, assign sum of counts - foreach my $word (keys %FACTORED_COUNT) { - my ($max,$best,$total) = (0,"",0); - foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) { - my $count = $FACTORED_COUNT{$word}{$factored_word}; - $total += $count; - if ($count > $max) { - $max = $count; - $best = $factored_word; - } - } - $COUNT{$best} = $total; - } - &save_trained_model(\%COUNT); -} - -sub train_syntax { - my (%COUNT,%LABELED_COUNT); - # collect counts for interpretations for each surface word - open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); - while(<CORPUS>) { - chop; s/\s+/ /g; s/^ //; s/ $//; - my $label; - foreach (split) { - if (/^label="([^\"]+)"/) { - $label = $1; - } - elsif (! 
/^</) { - $LABELED_COUNT{$_}{$label}++; - } - } - } - close(CORPUS); - - # only preserve most frequent label, assign sum of counts - foreach my $word (keys %LABELED_COUNT) { - my ($max,$best,$total) = (0,"",0); - foreach my $label (keys %{$LABELED_COUNT{$word}}) { - my $count = $LABELED_COUNT{$word}{$label}; - $total += $count; - if ($count > $max) { - $max = $count; - $best = "$word $label"; - } - } - $COUNT{$best} = $total; - } - &save_trained_model(\%COUNT); -} - -sub apply { - my (%COUNT,%TRUECASE,%LABEL); - open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'"); - while(<MODEL>) { - chomp; - my ($id,$factored_word,$count) = split(/\t/); - my $label; - ($factored_word,$label) = split(/ /,$factored_word); - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - my $lc = lc($word); - # if word exists with multipe casings, only record most frequent - next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; - $COUNT{$lc} = $count; - $TRUECASE{$lc} = $factored_word; - $LABEL{$lc} = $label if $SYNTAX; - } - close(MODEL); - - while(<STDIN>) { - my $first = 1; - chop; s/\s+/ /g; s/^ //; s/ $//; - my @BUFFER; # for xml tags - foreach my $factored_word (split) { - print " " unless $first; - $first = 0; - - # syntax: don't split xml - if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) { - push @BUFFER,$factored_word; - $first = 1; - next; - } - - # get case class - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - my $lc = lc($word); - - print STDERR "considering $word ($lc)...\n" if $VERBOSE; - # don't split frequent words - if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || - $lc !~ /[a-zA-Z]/) {; # has to have at least one letter - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer - print $factored_word; - print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; - next; - } - - # consider possible splits - my $final = length($word)-1; - my %REACHABLE; - for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); } - - print STDERR "splitting $word:\n" if $VERBOSE; - for(my $end=$MIN_SIZE;$end<length($word);$end++) { - for(my $start=0;$start<=$end-$MIN_SIZE;$start++) { - next unless $start == 0 || defined($REACHABLE{$start-1}); - foreach my $filler (split(/:/,$FILLER)) { - next if $start == 0 && $filler ne ""; - next if lc(substr($word,$start,length($filler))) ne $filler; - my $subword = lc(substr($word, - $start+length($filler), - $end-$start+1-length($filler))); - next unless defined($COUNT{$subword}); - next unless $COUNT{$subword} >= $MIN_COUNT; - print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE; - push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}"; - } - } - } - - # no matches at all? - if (!defined($REACHABLE{$final})) { - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer - print $factored_word; - next; - } - - my ($best_split,$best_score) = ("",0); - - my %ITERATOR; - for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; } - my $done = 0; - while(1) { - # read off word - my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0); - while($pos>0) { - last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end? - my ($nextpos,$subword,$count) - = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]); - $decomp = $subword." 
".$decomp; - $score *= $count; - $num++; - push @INDEX,$pos; -# print STDERR "($nextpos-$pos,$decomp,$score,$num)\n"; - $pos = $nextpos-1; - } - - chop($decomp); - print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE; - $score **= 1/$num; - if ($score>$best_score) { - $best_score = $score; - $best_split = $decomp; - } - - # increase iterator - my $increase = -1; - while($increase<$final) { - $increase = pop @INDEX; - $ITERATOR{$increase}++; - last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase}; - } - last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final}; - for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; } - } - if ($best_split !~ / /) { - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer - print $factored_word; # do not change case for unsplit words - next; - } - if (!$SYNTAX) { - print $best_split; - } - else { - $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT; - $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n"); - my $pos = $1; - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer - - my @SPLIT = split(/ /,$best_split); - my @OUT = (); - if ($BINARIZE) { - for(my $w=0;$w<scalar(@SPLIT)-2;$w++) { - push @OUT,"<tree label=\"\@$pos\">"; - } - } - for(my $w=0;$w<scalar(@SPLIT);$w++) { - if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; } - push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>"; - } - print join(" ",@OUT); - } - } - print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer - print "\n"; - } -} diff --git a/detruecase.perl b/detruecase.perl deleted file mode 100755 index 012c143..0000000 --- a/detruecase.perl +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use Getopt::Long "GetOptions"; - -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -my ($SRC,$INFILE,$UNBUFFERED); -die("detruecase.perl < in > out") - unless &GetOptions('headline=s' => \$SRC, - 'in=s' => \$INFILE, - 'b|unbuffered' => \$UNBUFFERED); -if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } - -my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); -my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); - -# lowercase even in headline -my %ALWAYS_LOWER; -foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; } - -# find out about the headlines -my @HEADLINE; -if (defined($SRC)) { - open(SRC,$SRC); - my $headline_flag = 0; - while(<SRC>) { - $headline_flag = 1 if /<hl>/; - $headline_flag = 0 if /<.hl>/; - next unless /^<seg/; - push @HEADLINE, $headline_flag; - } - close(SRC); -} - -my $sentence = 0; -if ($INFILE) { - open(IN,$INFILE) || die("ERROR: could not open file '$INFILE'"); - binmode(IN, ":utf8"); - while(<IN>) { - &process($_,$sentence++); - } - close(IN); -} -else { - while(<STDIN>) { - &process($_,$sentence++); - } -} - -sub process { - my $line = $_[0]; - chomp($line); - $line =~ s/^\s+//; - $line =~ s/\s+$//; - my @WORD = split(/\s+/,$line); - - # uppercase at sentence start - my $sentence_start = 1; - for(my $i=0;$i<scalar(@WORD);$i++) { - &uppercase(\$WORD[$i]) if $sentence_start; - if (defined($SENTENCE_END{ $WORD[$i] })) { $sentence_start = 1; } - elsif (!defined($DELAYED_SENTENCE_START{$WORD[$i] })) { $sentence_start = 0; } - } - - # uppercase 
headlines { - if (defined($SRC) && $HEADLINE[$sentence]) { - foreach (@WORD) { - &uppercase(\$_) unless $ALWAYS_LOWER{$_}; - } - } - - # output - my $first = 1; - foreach (@WORD) { - print " " unless $first; - $first = 0; - print $_; - } - print "\n"; - $sentence++; -} - -sub uppercase { - my ($W) = @_; - $$W = uc(substr($$W,0,1)).substr($$W,1); -} diff --git a/lowercase.perl b/lowercase.perl deleted file mode 100755 index c30e029..0000000 --- a/lowercase.perl +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -while(<STDIN>) { - print lc($_); -} diff --git a/multi-bleu.perl b/multi-bleu.perl deleted file mode 100755 index a25e347..0000000 --- a/multi-bleu.perl +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env perl -# -# This file is part of moses. Its use is licensed under the GNU Lesser General -# Public License version 2.1 or, at your option, any later version. - -# $Id$ -use warnings; -use strict; - -my $lowercase = 0; -if ($ARGV[0] eq "-lc") { - $lowercase = 1; - shift; -} - -my $stem = $ARGV[0]; -if (!defined $stem) { - print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; - print STDERR "Reads the references from reference or reference0, reference1, ...\n"; - exit(1); -} - -$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; - -my @REF; -my $ref=0; -while(-e "$stem$ref") { - &add_to_ref("$stem$ref",\@REF); - $ref++; -} -&add_to_ref($stem,\@REF) if -e $stem; -die("ERROR: could not find reference file $stem") unless scalar @REF; - -# add additional references explicitly specified on the command line -shift; -foreach my $stem (@ARGV) { - &add_to_ref($stem,\@REF) if -e $stem; -} - - - -sub add_to_ref { - my ($file,$REF) = @_; - my $s=0; - if ($file =~ /.gz$/) { - open(REF,"gzip -dc $file|") or die "Can't read $file"; - } else { - open(REF,$file) or die "Can't read $file"; - } - while(<REF>) { - chop; - push @{$$REF[$s++]}, $_; - } - close(REF); -} - -my(@CORRECT,@TOTAL,$length_translation,$length_reference); -my $s=0; -while(<STDIN>) { - chop; - $_ = lc if $lowercase; - my @WORD = split; - my %REF_NGRAM = (); - my $length_translation_this_sentence = scalar(@WORD); - my ($closest_diff,$closest_length) = (9999,9999); - foreach my $reference (@{$REF[$s]}) { -# print "$s $_ <=> $reference\n"; - $reference = lc($reference) if $lowercase; - my @WORD = split(' ',$reference); - my $length = scalar(@WORD); - my $diff = abs($length_translation_this_sentence-$length); - if ($diff < $closest_diff) { - $closest_diff = $diff; - $closest_length = $length; - # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; - } elsif ($diff == $closest_diff) { - $closest_length = $length if $length < $closest_length; - # from two references with the same closeness to me - # take the *shorter* into account, not the "first" one. 
- } - for(my $n=1;$n<=4;$n++) { - my %REF_NGRAM_N = (); - for(my $start=0;$start<=$#WORD-($n-1);$start++) { - my $ngram = "$n"; - for(my $w=0;$w<$n;$w++) { - $ngram .= " ".$WORD[$start+$w]; - } - $REF_NGRAM_N{$ngram}++; - } - foreach my $ngram (keys %REF_NGRAM_N) { - if (!defined($REF_NGRAM{$ngram}) || - $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { - $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; -# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n"; - } - } - } - } - $length_translation += $length_translation_this_sentence; - $length_reference += $closest_length; - for(my $n=1;$n<=4;$n++) { - my %T_NGRAM = (); - for(my $start=0;$start<=$#WORD-($n-1);$start++) { - my $ngram = "$n"; - for(my $w=0;$w<$n;$w++) { - $ngram .= " ".$WORD[$start+$w]; - } - $T_NGRAM{$ngram}++; - } - foreach my $ngram (keys %T_NGRAM) { - $ngram =~ /^(\d+) /; - my $n = $1; - # my $corr = 0; -# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; - $TOTAL[$n] += $T_NGRAM{$ngram}; - if (defined($REF_NGRAM{$ngram})) { - if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { - $CORRECT[$n] += $T_NGRAM{$ngram}; - # $corr = $T_NGRAM{$ngram}; -# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; - } - else { - $CORRECT[$n] += $REF_NGRAM{$ngram}; - # $corr = $REF_NGRAM{$ngram}; -# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; - } - } - # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; - # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" - } - } - $s++; -} -my $brevity_penalty = 1; -my $bleu = 0; - -my @bleu=(); - -for(my $n=1;$n<=4;$n++) { - if (defined ($TOTAL[$n])){ - $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; - # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; - }else{ - $bleu[$n]=0; - } -} - -if ($length_reference==0){ - printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; - exit(1); -} - -if ($length_translation<$length_reference) { - $brevity_penalty = exp(1-$length_reference/$length_translation); -} -$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + - my_log( $bleu[2] ) + - my_log( $bleu[3] ) + - my_log( $bleu[4] ) ) / 4) ; -printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", - 100*$bleu, - 100*$bleu[1], - 100*$bleu[2], - 100*$bleu[3], - 100*$bleu[4], - $brevity_penalty, - $length_translation / $length_reference, - $length_translation, - $length_reference; - -sub my_log { - return -9999999999 unless $_[0]; - return log($_[0]); -} diff --git a/tokenizer-no-escape.perl b/tokenizer-no-escape.perl deleted file mode 100755 index 4397360..0000000 --- a/tokenizer-no-escape.perl +++ /dev/null @@ -1,348 +0,0 @@ -#!/usr/bin/perl -w - -# Sample Tokenizer -### Version 1.1 -# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn -# Version 1.1 updates: -# (1) add multithreading option "-threads NUM_THREADS" (default is 1); -# (2) add a timing option "-time" to calculate the average speed of this tokenizer; -# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); -### Version 1.0 -# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ -# written by Josh Schroeder, based on code by Philipp Koehn - -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -use FindBin qw($RealBin); -use strict; -use Time::HiRes; -#use Thread; - -my $mydir = "$RealBin/nonbreaking_prefixes"; - -my %NONBREAKING_PREFIX = (); -my 
$language = "en"; -my $QUIET = 0; -my $HELP = 0; -my $AGGRESSIVE = 0; -my $SKIP_XML = 0; -my $TIMING = 0; -my $NUM_THREADS = 1; -my $NUM_SENTENCES_PER_THREAD = 2000; - -while (@ARGV) -{ - $_ = shift; - /^-b$/ && ($| = 1, next); - /^-l$/ && ($language = shift, next); - /^-q$/ && ($QUIET = 1, next); - /^-h$/ && ($HELP = 1, next); - /^-x$/ && ($SKIP_XML = 1, next); - /^-a$/ && ($AGGRESSIVE = 1, next); - /^-time$/ && ($TIMING = 1, next); - /^-threads$/ && ($NUM_THREADS = int(shift), next); - /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); -} - -# for time calculation -my $start_time; -if ($TIMING) -{ - $start_time = [ Time::HiRes::gettimeofday( ) ]; -} - -# print help message -if ($HELP) -{ - print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; - print "Options:\n"; - print " -q ... quiet.\n"; - print " -a ... aggressive hyphen splitting.\n"; - print " -b ... disable Perl buffering.\n"; - print " -time ... enable processing time calculation.\n"; - exit; -} - -if (!$QUIET) -{ - print STDERR "Tokenizer Version 1.1\n"; - print STDERR "Language: $language\n"; - print STDERR "Number of threads: $NUM_THREADS\n"; -} - -# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes -load_prefixes($language,\%NONBREAKING_PREFIX); - -if (scalar(%NONBREAKING_PREFIX) eq 0) -{ - print STDERR "Warning: No known abbreviations for language '$language'\n"; -} - -my @batch_sentences = (); -my @thread_list = (); -my $count_sentences = 0; - -if ($NUM_THREADS > 1) -{# multi-threading tokenization - while(<STDIN>) - { - $count_sentences = $count_sentences + 1; - push(@batch_sentences, $_); - if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) - { - # assign each thread work - for (my $i=0; $i<$NUM_THREADS; $i++) - { - my $start_index = $i*$NUM_SENTENCES_PER_THREAD; - my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; - my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; - my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; - push(@thread_list, $new_thread); - } - foreach (@thread_list) - { - my $tokenized_list = $_->join; - foreach (@$tokenized_list) - { - print $_; - } - } - # reset for the new run - @thread_list = (); - @batch_sentences = (); - } - } - # the last batch - if (scalar(@batch_sentences)>0) - { - # assign each thread work - for (my $i=0; $i<$NUM_THREADS; $i++) - { - my $start_index = $i*$NUM_SENTENCES_PER_THREAD; - if ($start_index >= scalar(@batch_sentences)) - { - last; - } - my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; - if ($end_index >= scalar(@batch_sentences)) - { - $end_index = scalar(@batch_sentences)-1; - } - my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; - my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; - push(@thread_list, $new_thread); - } - foreach (@thread_list) - { - my $tokenized_list = $_->join; - foreach (@$tokenized_list) - { - print $_; - } - } - } -} -else -{# single thread only - while(<STDIN>) - { - if (($SKIP_XML && /^<.+>$/) || /^\s*$/) - { - #don't try to tokenize XML/HTML tag lines - print $_; - } - else - { - print &tokenize($_); - } - } -} - -if ($TIMING) -{ - my $duration = Time::HiRes::tv_interval( $start_time ); - print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); - print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); -} - -##################################################################################### -# subroutines afterward - -# tokenize a batch of texts saved in an array -# input: an array containing a batch of texts -# return: another array cotaining a batch of tokenized texts for the input array -sub tokenize_batch -{ - my(@text_list) = @_; - my(@tokenized_list) = (); - foreach (@text_list) - { - if (($SKIP_XML && /^<.+>$/) || /^\s*$/) - { - #don't try to tokenize XML/HTML tag lines - push(@tokenized_list, $_); - } - else - { - push(@tokenized_list, &tokenize($_)); - } - } - return \@tokenized_list; -} - -# the actual tokenize function which tokenizes one input string -# input: one string -# return: the tokenized string for the input string -sub tokenize -{ - my($text) = @_; - chomp($text); - $text = " $text "; - - # remove ASCII junk - $text =~ s/\s+/ /g; - $text =~ s/[\000-\037]//g; - - # seperate out all "other" special characters - $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; - - # aggressive hyphen splitting - if ($AGGRESSIVE) - { - $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; - } - - #multi-dots stay together - $text =~ s/\.([\.]+)/ DOTMULTI$1/g; - while($text =~ /DOTMULTI\./) - { - $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; - $text =~ s/DOTMULTI\./DOTDOTMULTI/g; - } - - # seperate out "," except if within numbers (5,300) - $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; - # separate , pre and post number - $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; - $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; - - # turn `into ' - $text =~ s/\`/\'/g; - - #turn '' into " - $text =~ s/\'\'/ \" /g; - - if ($language eq "en") - { - #split contractions right - $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; - #special case for "1990's" - $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; - } - elsif (($language eq "fr") or ($language eq "it")) - { - #split contractions left - $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; - } - else - { - $text =~ s/\'/ \' /g; - } - - #word token method - my @words = split(/\s/,$text); - $text = ""; - for (my $i=0;$i<(scalar(@words));$i++) - { - my $word = $words[$i]; - if ( $word =~ /^(\S+)\.$/) - { - my $pre = $1; - if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) - { - #no change - } - elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) - { - #no change - } - else - { - $word = $pre." ."; - } - } - $text .= $word." 
"; - } - - # clean up extraneous spaces - $text =~ s/ +/ /g; - $text =~ s/^ //g; - $text =~ s/ $//g; - - #restore multi-dots - while($text =~ /DOTDOTMULTI/) - { - $text =~ s/DOTDOTMULTI/DOTMULTI./g; - } - $text =~ s/DOTMULTI/./g; - - #escape special chars - #$text =~ s/\&/\&/g; # escape escape - #$text =~ s/\|/\|/g; # factor separator - #$text =~ s/\</\</g; # xml - #$text =~ s/\>/\>/g; # xml - #$text =~ s/\'/\'/g; # xml - #$text =~ s/\"/\"/g; # xml - #$text =~ s/\[/\[/g; # syntax non-terminal - #$text =~ s/\]/\]/g; # syntax non-terminal - - #ensure final line break - $text .= "\n" unless $text =~ /\n$/; - - return $text; -} - -sub load_prefixes -{ - my ($language, $PREFIX_REF) = @_; - - my $prefixfile = "$mydir/nonbreaking_prefix.$language"; - - #default back to English if we don't have a language-specific prefix file - if (!(-e $prefixfile)) - { - $prefixfile = "$mydir/nonbreaking_prefix.en"; - print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; - die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); - } - - if (-e "$prefixfile") - { - open(PREFIX, "<:utf8", "$prefixfile"); - while (<PREFIX>) - { - my $item = $_; - chomp($item); - if (($item) && (substr($item,0,1) ne "#")) - { - if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) - { - $PREFIX_REF->{$1} = 2; - } - else - { - $PREFIX_REF->{$item} = 1; - } - } - } - close(PREFIX); - } -} - diff --git a/train-truecaser.perl b/train-truecaser.perl deleted file mode 100755 index 59a83ec..0000000 --- a/train-truecaser.perl +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/perl -w - -# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ - -# -# Options: -# -# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. -# - -use strict; -use Getopt::Long "GetOptions"; - -# apply switches -my ($MODEL,$CORPUS); -die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]") - unless &GetOptions('corpus=s' => \$CORPUS, - 'model=s' => \$MODEL, - 'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0)) - && defined($CORPUS) && defined($MODEL); -my %CASING; -my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); -my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); -open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'"); -binmode(CORPUS, ":utf8"); -while(<CORPUS>) { - chop; - my ($WORD,$MARKUP) = split_xml($_); - my $start = 0; - while($start<=$#$WORD && defined($DELAYED_SENTENCE_START{$$WORD[$start]})) { $start++; } - my $firstWordOfSentence = 1; - for(my $i=$start;$i<=$#$WORD;$i++) { - my $currentWord = $$WORD[$i]; - if (! $firstWordOfSentence && defined($SENTENCE_END{$$WORD[$i-1]})) { - $firstWordOfSentence = 1; - } - - my $currentWordWeight = 0; - if (! 
$firstWordOfSentence) { - $currentWordWeight = 1; - } elsif ($possiblyUseFirstToken) { - # gated special handling of first word of sentence - my $firstChar = substr($currentWord, 0, 1); - if (lc($firstChar) eq $firstChar) { - # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial) - $currentWordWeight = 1; - } elsif (scalar(@$WORD) == 1) { - # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case) - $currentWordWeight = 0.1; - } - } - if ($currentWordWeight > 0) { - $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight; - } - - $firstWordOfSentence = 0; - } -} -close(CORPUS); - -open(MODEL,">$MODEL") || die("ERROR: could not create '$MODEL'"); -binmode(MODEL, ":utf8"); -foreach my $type (keys %CASING) { - my ($score,$total,$best) = (-1,0,""); - foreach my $word (keys %{$CASING{$type}}) { - my $count = $CASING{$type}{$word}; - $total += $count; - if ($count > $score) { - $best = $word; - $score = $count; - } - } - print MODEL "$best ($score/$total)"; - foreach my $word (keys %{$CASING{$type}}) { - print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best; - } - print MODEL "\n"; -} -close(MODEL); - - -# store away xml markup -sub split_xml { - my ($line) = @_; - my (@WORD,@MARKUP); - my $i = 0; - $MARKUP[0] = ""; - while($line =~ /\S/) { - # XML tag - if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { - $MARKUP[$i] .= $1." "; - $line = $2; - } - # non-XML text - elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { - $WORD[$i++] = $1; - $MARKUP[$i] = ""; - $line = $2; - } - # '<' or '>' occurs in word, but it's not an XML tag - elsif ($line =~ /^\s*(\S+)(.*)$/) { - $WORD[$i++] = $1; - $MARKUP[$i] = ""; - $line = $2; - } - else { - die("ERROR: huh? 
$line\n"); - } - } - chop($MARKUP[$#MARKUP]); - return (\@WORD,\@MARKUP); -} diff --git a/truecase.perl b/truecase.perl deleted file mode 100755 index 0a4d366..0000000 --- a/truecase.perl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/perl -w - -# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ -use strict; -use Getopt::Long "GetOptions"; - -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -# apply switches -my ($MODEL, $UNBUFFERED); -die("truecase.perl --model MODEL [-b] < in > out") - unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) - && defined($MODEL); -if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } - -my (%BEST,%KNOWN); -open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); -binmode(MODEL, ":utf8"); -while(<MODEL>) { - my ($word,@OPTIONS) = split; - $BEST{ lc($word) } = $word; - $KNOWN{ $word } = 1; - for(my $i=1;$i<$#OPTIONS;$i+=2) { - $KNOWN{ $OPTIONS[$i] } = 1; - } -} -close(MODEL); - -my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); -my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); - -while(<STDIN>) { - chop; - my ($WORD,$MARKUP) = split_xml($_); - my $sentence_start = 1; - for(my $i=0;$i<=$#$WORD;$i++) { - print " " if $i && $$MARKUP[$i] eq ''; - print $$MARKUP[$i]; - - my ($word,$otherfactors); - if ($$WORD[$i] =~ /^([^\|]+)(.*)/) - { - $word = $1; - $otherfactors = $2; - } - else - { - $word = $$WORD[$i]; - $otherfactors = ""; - } - - if ($sentence_start && defined($BEST{lc($word)})) { - print $BEST{lc($word)}; # truecase sentence start - } - elsif (defined($KNOWN{$word})) { - print $word; # don't change known words - } - elsif (defined($BEST{lc($word)})) { - print $BEST{lc($word)}; # truecase otherwise unknown words - } - else { - print $word; # unknown, nothing to do - } - print $otherfactors; - - if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } - elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } - } - print $$MARKUP[$#$MARKUP]; - print "\n"; -} - -# store away xml markup -sub split_xml { - my ($line) = @_; - my (@WORD,@MARKUP); - my $i = 0; - $MARKUP[0] = ""; - while($line =~ /\S/) { - # XML tag - if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { - $MARKUP[$i] .= $1." "; - $line = $2; - } - # non-XML text - elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { - $WORD[$i++] = $1; - $MARKUP[$i] = ""; - $line = $2; - } - # '<' or '>' occurs in word, but it's not an XML tag - elsif ($line =~ /^\s*(\S+)(.*)$/) { - $WORD[$i++] = $1; - $MARKUP[$i] = ""; - $line = $2; - } - else { - die("ERROR: huh? $line\n"); - } - } - chop($MARKUP[$#MARKUP]); - return (\@WORD,\@MARKUP); -} |