summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2018-04-17 15:43:17 +0000
committerPatrick Simianer <p@simianer.de>2018-04-17 15:43:17 +0000
commitf44539de04b178f3a1b14960407ec683030f927a (patch)
tree08a6a0c50454e57b8c36e3438a5569ab153aa258
parent641e80a4ad7bff2bb0cae447cc39da0eccc662dd (diff)
parente86f8f5139196bc99a55797c255401a0d6a86214 (diff)
Merge branch 'master' of https://github.com/pks/nlp_scripts
-rw-r--r--README.md8
-rwxr-xr-xcompound-splitter.perl291
-rwxr-xr-xdetruecase.perl88
-rwxr-xr-xlowercase.perl10
-rwxr-xr-xmulti-bleu.perl174
-rwxr-xr-xtokenizer-no-escape.perl348
-rwxr-xr-xtrain-truecaser.perl112
-rwxr-xr-xtruecase.perl104
8 files changed, 2 insertions, 1133 deletions
diff --git a/README.md b/README.md
index 1e4bb01..49ed30d 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,5 @@
-A number of NLP related scripts. Some scripts require my zipf gem, see [1].
-\*.perl taken from the moses [2] toolkit.
-mem\_usage taken from [3].
-
+A number of NLP related scripts. Some scripts require my `zipf` ruby gem,
+see [1].
[1] https://github.com/pks/zipf
-[2] https://github.com/moses-smt/mosesdecoder
-[3] https://gist.github.com/netj/526585
diff --git a/compound-splitter.perl b/compound-splitter.perl
deleted file mode 100755
index beca70e..0000000
--- a/compound-splitter.perl
+++ /dev/null
@@ -1,291 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use Getopt::Long "GetOptions";
-
-my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE);
-my $FILLER = ":s:es";
-my $MIN_SIZE = 3;
-my $MIN_COUNT = 5;
-my $MAX_COUNT = 5;
-my $FACTORED = 0;
-my $SYNTAX = 0;
-my $MARK_SPLIT = 0;
-my $BINARIZE = 0;
-$HELP = 1
- unless &GetOptions('corpus=s' => \$CORPUS,
- 'model=s' => \$MODEL,
- 'filler=s' => \$FILLER,
- 'factored' => \$FACTORED,
- 'min-size=i' => \$MIN_SIZE,
- 'min-count=i' => \$MIN_COUNT,
- 'max-count=i' => \$MAX_COUNT,
- 'help' => \$HELP,
- 'verbose' => \$VERBOSE,
- 'syntax' => \$SYNTAX,
- 'binarize' => \$BINARIZE,
- 'mark-split' => \$MARK_SPLIT,
- 'train' => \$TRAIN);
-
-if ($HELP ||
- ( $TRAIN && !$CORPUS) ||
- (!$TRAIN && !$MODEL)) {
- print "Compound splitter\n";
- print "-----------------\n\n";
- print "train: compound-splitter -train -corpus txt-file -model new-model\n";
- print "apply: compound-splitter -model trained-model < in > out\n";
- print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
- print " -min-count: minimum word count (default $MIN_COUNT)\n";
- print " -filler: filler letters between words (default $FILLER)\n";
- print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
- print " -syntax: syntactically parsed data (default $SYNTAX)\n";
- print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
- print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
- exit;
-}
-
-if ($TRAIN) {
- if ($SYNTAX) { &train_syntax(); }
- elsif ($FACTORED) { &train_factored(); }
- else { &train(); }
-}
-else {
- &apply();
-}
-
-sub train {
- my %COUNT;
- open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
- while(<CORPUS>) {
- chop; s/\s+/ /g; s/^ //; s/ $//;
- foreach (split) {
- $COUNT{$_}++;
- }
- }
- close(CORPUS);
- &save_trained_model(\%COUNT);
-}
-
-sub save_trained_model {
- my ($COUNT) = @_;
- my $id = 0;
- open(MODEL,">".$MODEL);
- foreach my $word (keys %$COUNT) {
- print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
- }
- close(MODEL);
- print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
-}
-
-sub train_factored {
- my (%COUNT,%FACTORED_COUNT);
- # collect counts for interpretations for each surface word
- open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
- while(<CORPUS>) {
- chop; s/\s+/ /g; s/^ //; s/ $//;
- foreach my $factored_word (split) {
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- $FACTORED_COUNT{$word}{$factored_word}++;
- }
- }
- close(CORPUS);
- # only preserve most frequent interpretation, assign sum of counts
- foreach my $word (keys %FACTORED_COUNT) {
- my ($max,$best,$total) = (0,"",0);
- foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) {
- my $count = $FACTORED_COUNT{$word}{$factored_word};
- $total += $count;
- if ($count > $max) {
- $max = $count;
- $best = $factored_word;
- }
- }
- $COUNT{$best} = $total;
- }
- &save_trained_model(\%COUNT);
-}
-
-sub train_syntax {
- my (%COUNT,%LABELED_COUNT);
- # collect counts for interpretations for each surface word
- open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
- while(<CORPUS>) {
- chop; s/\s+/ /g; s/^ //; s/ $//;
- my $label;
- foreach (split) {
- if (/^label="([^\"]+)"/) {
- $label = $1;
- }
- elsif (! /^</) {
- $LABELED_COUNT{$_}{$label}++;
- }
- }
- }
- close(CORPUS);
-
- # only preserve most frequent label, assign sum of counts
- foreach my $word (keys %LABELED_COUNT) {
- my ($max,$best,$total) = (0,"",0);
- foreach my $label (keys %{$LABELED_COUNT{$word}}) {
- my $count = $LABELED_COUNT{$word}{$label};
- $total += $count;
- if ($count > $max) {
- $max = $count;
- $best = "$word $label";
- }
- }
- $COUNT{$best} = $total;
- }
- &save_trained_model(\%COUNT);
-}
-
-sub apply {
- my (%COUNT,%TRUECASE,%LABEL);
- open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
- while(<MODEL>) {
- chomp;
- my ($id,$factored_word,$count) = split(/\t/);
- my $label;
- ($factored_word,$label) = split(/ /,$factored_word);
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- my $lc = lc($word);
- # if word exists with multipe casings, only record most frequent
- next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
- $COUNT{$lc} = $count;
- $TRUECASE{$lc} = $factored_word;
- $LABEL{$lc} = $label if $SYNTAX;
- }
- close(MODEL);
-
- while(<STDIN>) {
- my $first = 1;
- chop; s/\s+/ /g; s/^ //; s/ $//;
- my @BUFFER; # for xml tags
- foreach my $factored_word (split) {
- print " " unless $first;
- $first = 0;
-
- # syntax: don't split xml
- if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
- push @BUFFER,$factored_word;
- $first = 1;
- next;
- }
-
- # get case class
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- my $lc = lc($word);
-
- print STDERR "considering $word ($lc)...\n" if $VERBOSE;
- # don't split frequent words
- if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
- $lc !~ /[a-zA-Z]/) {; # has to have at least one letter
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print $factored_word;
- print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
- next;
- }
-
- # consider possible splits
- my $final = length($word)-1;
- my %REACHABLE;
- for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); }
-
- print STDERR "splitting $word:\n" if $VERBOSE;
- for(my $end=$MIN_SIZE;$end<length($word);$end++) {
- for(my $start=0;$start<=$end-$MIN_SIZE;$start++) {
- next unless $start == 0 || defined($REACHABLE{$start-1});
- foreach my $filler (split(/:/,$FILLER)) {
- next if $start == 0 && $filler ne "";
- next if lc(substr($word,$start,length($filler))) ne $filler;
- my $subword = lc(substr($word,
- $start+length($filler),
- $end-$start+1-length($filler)));
- next unless defined($COUNT{$subword});
- next unless $COUNT{$subword} >= $MIN_COUNT;
- print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
- push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
- }
- }
- }
-
- # no matches at all?
- if (!defined($REACHABLE{$final})) {
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print $factored_word;
- next;
- }
-
- my ($best_split,$best_score) = ("",0);
-
- my %ITERATOR;
- for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; }
- my $done = 0;
- while(1) {
- # read off word
- my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0);
- while($pos>0) {
- last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end?
- my ($nextpos,$subword,$count)
- = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]);
- $decomp = $subword." ".$decomp;
- $score *= $count;
- $num++;
- push @INDEX,$pos;
-# print STDERR "($nextpos-$pos,$decomp,$score,$num)\n";
- $pos = $nextpos-1;
- }
-
- chop($decomp);
- print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE;
- $score **= 1/$num;
- if ($score>$best_score) {
- $best_score = $score;
- $best_split = $decomp;
- }
-
- # increase iterator
- my $increase = -1;
- while($increase<$final) {
- $increase = pop @INDEX;
- $ITERATOR{$increase}++;
- last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase};
- }
- last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
- for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
- }
- if ($best_split !~ / /) {
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print $factored_word; # do not change case for unsplit words
- next;
- }
- if (!$SYNTAX) {
- print $best_split;
- }
- else {
- $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
- $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
- my $pos = $1;
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
-
- my @SPLIT = split(/ /,$best_split);
- my @OUT = ();
- if ($BINARIZE) {
- for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
- push @OUT,"<tree label=\"\@$pos\">";
- }
- }
- for(my $w=0;$w<scalar(@SPLIT);$w++) {
- if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
- push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
- }
- print join(" ",@OUT);
- }
- }
- print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print "\n";
- }
-}
diff --git a/detruecase.perl b/detruecase.perl
deleted file mode 100755
index 012c143..0000000
--- a/detruecase.perl
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use Getopt::Long "GetOptions";
-
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
-
-my ($SRC,$INFILE,$UNBUFFERED);
-die("detruecase.perl < in > out")
- unless &GetOptions('headline=s' => \$SRC,
- 'in=s' => \$INFILE,
- 'b|unbuffered' => \$UNBUFFERED);
-if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
-
-my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
-
-# lowercase even in headline
-my %ALWAYS_LOWER;
-foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }
-
-# find out about the headlines
-my @HEADLINE;
-if (defined($SRC)) {
- open(SRC,$SRC);
- my $headline_flag = 0;
- while(<SRC>) {
- $headline_flag = 1 if /<hl>/;
- $headline_flag = 0 if /<.hl>/;
- next unless /^<seg/;
- push @HEADLINE, $headline_flag;
- }
- close(SRC);
-}
-
-my $sentence = 0;
-if ($INFILE) {
- open(IN,$INFILE) || die("ERROR: could not open file '$INFILE'");
- binmode(IN, ":utf8");
- while(<IN>) {
- &process($_,$sentence++);
- }
- close(IN);
-}
-else {
- while(<STDIN>) {
- &process($_,$sentence++);
- }
-}
-
-sub process {
- my $line = $_[0];
- chomp($line);
- $line =~ s/^\s+//;
- $line =~ s/\s+$//;
- my @WORD = split(/\s+/,$line);
-
- # uppercase at sentence start
- my $sentence_start = 1;
- for(my $i=0;$i<scalar(@WORD);$i++) {
- &uppercase(\$WORD[$i]) if $sentence_start;
- if (defined($SENTENCE_END{ $WORD[$i] })) { $sentence_start = 1; }
- elsif (!defined($DELAYED_SENTENCE_START{$WORD[$i] })) { $sentence_start = 0; }
- }
-
- # uppercase headlines {
- if (defined($SRC) && $HEADLINE[$sentence]) {
- foreach (@WORD) {
- &uppercase(\$_) unless $ALWAYS_LOWER{$_};
- }
- }
-
- # output
- my $first = 1;
- foreach (@WORD) {
- print " " unless $first;
- $first = 0;
- print $_;
- }
- print "\n";
- $sentence++;
-}
-
-sub uppercase {
- my ($W) = @_;
- $$W = uc(substr($$W,0,1)).substr($$W,1);
-}
diff --git a/lowercase.perl b/lowercase.perl
deleted file mode 100755
index c30e029..0000000
--- a/lowercase.perl
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
-
-while(<STDIN>) {
- print lc($_);
-}
diff --git a/multi-bleu.perl b/multi-bleu.perl
deleted file mode 100755
index a25e347..0000000
--- a/multi-bleu.perl
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/usr/bin/env perl
-#
-# This file is part of moses. Its use is licensed under the GNU Lesser General
-# Public License version 2.1 or, at your option, any later version.
-
-# $Id$
-use warnings;
-use strict;
-
-my $lowercase = 0;
-if ($ARGV[0] eq "-lc") {
- $lowercase = 1;
- shift;
-}
-
-my $stem = $ARGV[0];
-if (!defined $stem) {
- print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
- print STDERR "Reads the references from reference or reference0, reference1, ...\n";
- exit(1);
-}
-
-$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
-
-my @REF;
-my $ref=0;
-while(-e "$stem$ref") {
- &add_to_ref("$stem$ref",\@REF);
- $ref++;
-}
-&add_to_ref($stem,\@REF) if -e $stem;
-die("ERROR: could not find reference file $stem") unless scalar @REF;
-
-# add additional references explicitly specified on the command line
-shift;
-foreach my $stem (@ARGV) {
- &add_to_ref($stem,\@REF) if -e $stem;
-}
-
-
-
-sub add_to_ref {
- my ($file,$REF) = @_;
- my $s=0;
- if ($file =~ /.gz$/) {
- open(REF,"gzip -dc $file|") or die "Can't read $file";
- } else {
- open(REF,$file) or die "Can't read $file";
- }
- while(<REF>) {
- chop;
- push @{$$REF[$s++]}, $_;
- }
- close(REF);
-}
-
-my(@CORRECT,@TOTAL,$length_translation,$length_reference);
-my $s=0;
-while(<STDIN>) {
- chop;
- $_ = lc if $lowercase;
- my @WORD = split;
- my %REF_NGRAM = ();
- my $length_translation_this_sentence = scalar(@WORD);
- my ($closest_diff,$closest_length) = (9999,9999);
- foreach my $reference (@{$REF[$s]}) {
-# print "$s $_ <=> $reference\n";
- $reference = lc($reference) if $lowercase;
- my @WORD = split(' ',$reference);
- my $length = scalar(@WORD);
- my $diff = abs($length_translation_this_sentence-$length);
- if ($diff < $closest_diff) {
- $closest_diff = $diff;
- $closest_length = $length;
- # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
- } elsif ($diff == $closest_diff) {
- $closest_length = $length if $length < $closest_length;
- # from two references with the same closeness to me
- # take the *shorter* into account, not the "first" one.
- }
- for(my $n=1;$n<=4;$n++) {
- my %REF_NGRAM_N = ();
- for(my $start=0;$start<=$#WORD-($n-1);$start++) {
- my $ngram = "$n";
- for(my $w=0;$w<$n;$w++) {
- $ngram .= " ".$WORD[$start+$w];
- }
- $REF_NGRAM_N{$ngram}++;
- }
- foreach my $ngram (keys %REF_NGRAM_N) {
- if (!defined($REF_NGRAM{$ngram}) ||
- $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
- $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
-# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
- }
- }
- }
- }
- $length_translation += $length_translation_this_sentence;
- $length_reference += $closest_length;
- for(my $n=1;$n<=4;$n++) {
- my %T_NGRAM = ();
- for(my $start=0;$start<=$#WORD-($n-1);$start++) {
- my $ngram = "$n";
- for(my $w=0;$w<$n;$w++) {
- $ngram .= " ".$WORD[$start+$w];
- }
- $T_NGRAM{$ngram}++;
- }
- foreach my $ngram (keys %T_NGRAM) {
- $ngram =~ /^(\d+) /;
- my $n = $1;
- # my $corr = 0;
-# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
- $TOTAL[$n] += $T_NGRAM{$ngram};
- if (defined($REF_NGRAM{$ngram})) {
- if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
- $CORRECT[$n] += $T_NGRAM{$ngram};
- # $corr = $T_NGRAM{$ngram};
-# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
- }
- else {
- $CORRECT[$n] += $REF_NGRAM{$ngram};
- # $corr = $REF_NGRAM{$ngram};
-# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
- }
- }
- # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
- # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
- }
- }
- $s++;
-}
-my $brevity_penalty = 1;
-my $bleu = 0;
-
-my @bleu=();
-
-for(my $n=1;$n<=4;$n++) {
- if (defined ($TOTAL[$n])){
- $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
- # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
- }else{
- $bleu[$n]=0;
- }
-}
-
-if ($length_reference==0){
- printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
- exit(1);
-}
-
-if ($length_translation<$length_reference) {
- $brevity_penalty = exp(1-$length_reference/$length_translation);
-}
-$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
- my_log( $bleu[2] ) +
- my_log( $bleu[3] ) +
- my_log( $bleu[4] ) ) / 4) ;
-printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
- 100*$bleu,
- 100*$bleu[1],
- 100*$bleu[2],
- 100*$bleu[3],
- 100*$bleu[4],
- $brevity_penalty,
- $length_translation / $length_reference,
- $length_translation,
- $length_reference;
-
-sub my_log {
- return -9999999999 unless $_[0];
- return log($_[0]);
-}
diff --git a/tokenizer-no-escape.perl b/tokenizer-no-escape.perl
deleted file mode 100755
index 4397360..0000000
--- a/tokenizer-no-escape.perl
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/usr/bin/perl -w
-
-# Sample Tokenizer
-### Version 1.1
-# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
-# Version 1.1 updates:
-# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
-# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
-# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
-### Version 1.0
-# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
-# written by Josh Schroeder, based on code by Philipp Koehn
-
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
-
-use FindBin qw($RealBin);
-use strict;
-use Time::HiRes;
-#use Thread;
-
-my $mydir = "$RealBin/nonbreaking_prefixes";
-
-my %NONBREAKING_PREFIX = ();
-my $language = "en";
-my $QUIET = 0;
-my $HELP = 0;
-my $AGGRESSIVE = 0;
-my $SKIP_XML = 0;
-my $TIMING = 0;
-my $NUM_THREADS = 1;
-my $NUM_SENTENCES_PER_THREAD = 2000;
-
-while (@ARGV)
-{
- $_ = shift;
- /^-b$/ && ($| = 1, next);
- /^-l$/ && ($language = shift, next);
- /^-q$/ && ($QUIET = 1, next);
- /^-h$/ && ($HELP = 1, next);
- /^-x$/ && ($SKIP_XML = 1, next);
- /^-a$/ && ($AGGRESSIVE = 1, next);
- /^-time$/ && ($TIMING = 1, next);
- /^-threads$/ && ($NUM_THREADS = int(shift), next);
- /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
-}
-
-# for time calculation
-my $start_time;
-if ($TIMING)
-{
- $start_time = [ Time::HiRes::gettimeofday( ) ];
-}
-
-# print help message
-if ($HELP)
-{
- print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
- print "Options:\n";
- print " -q ... quiet.\n";
- print " -a ... aggressive hyphen splitting.\n";
- print " -b ... disable Perl buffering.\n";
- print " -time ... enable processing time calculation.\n";
- exit;
-}
-
-if (!$QUIET)
-{
- print STDERR "Tokenizer Version 1.1\n";
- print STDERR "Language: $language\n";
- print STDERR "Number of threads: $NUM_THREADS\n";
-}
-
-# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
-load_prefixes($language,\%NONBREAKING_PREFIX);
-
-if (scalar(%NONBREAKING_PREFIX) eq 0)
-{
- print STDERR "Warning: No known abbreviations for language '$language'\n";
-}
-
-my @batch_sentences = ();
-my @thread_list = ();
-my $count_sentences = 0;
-
-if ($NUM_THREADS > 1)
-{# multi-threading tokenization
- while(<STDIN>)
- {
- $count_sentences = $count_sentences + 1;
- push(@batch_sentences, $_);
- if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
- {
- # assign each thread work
- for (my $i=0; $i<$NUM_THREADS; $i++)
- {
- my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
- my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
- my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
- my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
- push(@thread_list, $new_thread);
- }
- foreach (@thread_list)
- {
- my $tokenized_list = $_->join;
- foreach (@$tokenized_list)
- {
- print $_;
- }
- }
- # reset for the new run
- @thread_list = ();
- @batch_sentences = ();
- }
- }
- # the last batch
- if (scalar(@batch_sentences)>0)
- {
- # assign each thread work
- for (my $i=0; $i<$NUM_THREADS; $i++)
- {
- my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
- if ($start_index >= scalar(@batch_sentences))
- {
- last;
- }
- my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
- if ($end_index >= scalar(@batch_sentences))
- {
- $end_index = scalar(@batch_sentences)-1;
- }
- my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
- my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
- push(@thread_list, $new_thread);
- }
- foreach (@thread_list)
- {
- my $tokenized_list = $_->join;
- foreach (@$tokenized_list)
- {
- print $_;
- }
- }
- }
-}
-else
-{# single thread only
- while(<STDIN>)
- {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
- {
- #don't try to tokenize XML/HTML tag lines
- print $_;
- }
- else
- {
- print &tokenize($_);
- }
- }
-}
-
-if ($TIMING)
-{
- my $duration = Time::HiRes::tv_interval( $start_time );
- print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
- print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
-}
-
-#####################################################################################
-# subroutines afterward
-
-# tokenize a batch of texts saved in an array
-# input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
-sub tokenize_batch
-{
- my(@text_list) = @_;
- my(@tokenized_list) = ();
- foreach (@text_list)
- {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
- {
- #don't try to tokenize XML/HTML tag lines
- push(@tokenized_list, $_);
- }
- else
- {
- push(@tokenized_list, &tokenize($_));
- }
- }
- return \@tokenized_list;
-}
-
-# the actual tokenize function which tokenizes one input string
-# input: one string
-# return: the tokenized string for the input string
-sub tokenize
-{
- my($text) = @_;
- chomp($text);
- $text = " $text ";
-
- # remove ASCII junk
- $text =~ s/\s+/ /g;
- $text =~ s/[\000-\037]//g;
-
- # seperate out all "other" special characters
- $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
-
- # aggressive hyphen splitting
- if ($AGGRESSIVE)
- {
- $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
- }
-
- #multi-dots stay together
- $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
- while($text =~ /DOTMULTI\./)
- {
- $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
- $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
- }
-
- # seperate out "," except if within numbers (5,300)
- $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- # separate , pre and post number
- $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
-
- # turn `into '
- $text =~ s/\`/\'/g;
-
- #turn '' into "
- $text =~ s/\'\'/ \" /g;
-
- if ($language eq "en")
- {
- #split contractions right
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
- #special case for "1990's"
- $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
- }
- elsif (($language eq "fr") or ($language eq "it"))
- {
- #split contractions left
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
- }
- else
- {
- $text =~ s/\'/ \' /g;
- }
-
- #word token method
- my @words = split(/\s/,$text);
- $text = "";
- for (my $i=0;$i<(scalar(@words));$i++)
- {
- my $word = $words[$i];
- if ( $word =~ /^(\S+)\.$/)
- {
- my $pre = $1;
- if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
- {
- #no change
- }
- elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
- {
- #no change
- }
- else
- {
- $word = $pre." .";
- }
- }
- $text .= $word." ";
- }
-
- # clean up extraneous spaces
- $text =~ s/ +/ /g;
- $text =~ s/^ //g;
- $text =~ s/ $//g;
-
- #restore multi-dots
- while($text =~ /DOTDOTMULTI/)
- {
- $text =~ s/DOTDOTMULTI/DOTMULTI./g;
- }
- $text =~ s/DOTMULTI/./g;
-
- #escape special chars
- #$text =~ s/\&/\&amp;/g; # escape escape
- #$text =~ s/\|/\&#124;/g; # factor separator
- #$text =~ s/\</\&lt;/g; # xml
- #$text =~ s/\>/\&gt;/g; # xml
- #$text =~ s/\'/\&apos;/g; # xml
- #$text =~ s/\"/\&quot;/g; # xml
- #$text =~ s/\[/\&#91;/g; # syntax non-terminal
- #$text =~ s/\]/\&#93;/g; # syntax non-terminal
-
- #ensure final line break
- $text .= "\n" unless $text =~ /\n$/;
-
- return $text;
-}
-
-sub load_prefixes
-{
- my ($language, $PREFIX_REF) = @_;
-
- my $prefixfile = "$mydir/nonbreaking_prefix.$language";
-
- #default back to English if we don't have a language-specific prefix file
- if (!(-e $prefixfile))
- {
- $prefixfile = "$mydir/nonbreaking_prefix.en";
- print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
- die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
- }
-
- if (-e "$prefixfile")
- {
- open(PREFIX, "<:utf8", "$prefixfile");
- while (<PREFIX>)
- {
- my $item = $_;
- chomp($item);
- if (($item) && (substr($item,0,1) ne "#"))
- {
- if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
- {
- $PREFIX_REF->{$1} = 2;
- }
- else
- {
- $PREFIX_REF->{$item} = 1;
- }
- }
- }
- close(PREFIX);
- }
-}
-
diff --git a/train-truecaser.perl b/train-truecaser.perl
deleted file mode 100755
index 59a83ec..0000000
--- a/train-truecaser.perl
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/perl -w
-
-# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
-
-#
-# Options:
-#
-# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
-#
-
-use strict;
-use Getopt::Long "GetOptions";
-
-# apply switches
-my ($MODEL,$CORPUS);
-die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]")
- unless &GetOptions('corpus=s' => \$CORPUS,
- 'model=s' => \$MODEL,
- 'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0))
- && defined($CORPUS) && defined($MODEL);
-my %CASING;
-my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
-open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'");
-binmode(CORPUS, ":utf8");
-while(<CORPUS>) {
- chop;
- my ($WORD,$MARKUP) = split_xml($_);
- my $start = 0;
- while($start<=$#$WORD && defined($DELAYED_SENTENCE_START{$$WORD[$start]})) { $start++; }
- my $firstWordOfSentence = 1;
- for(my $i=$start;$i<=$#$WORD;$i++) {
- my $currentWord = $$WORD[$i];
- if (! $firstWordOfSentence && defined($SENTENCE_END{$$WORD[$i-1]})) {
- $firstWordOfSentence = 1;
- }
-
- my $currentWordWeight = 0;
- if (! $firstWordOfSentence) {
- $currentWordWeight = 1;
- } elsif ($possiblyUseFirstToken) {
- # gated special handling of first word of sentence
- my $firstChar = substr($currentWord, 0, 1);
- if (lc($firstChar) eq $firstChar) {
- # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial)
- $currentWordWeight = 1;
- } elsif (scalar(@$WORD) == 1) {
- # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case)
- $currentWordWeight = 0.1;
- }
- }
- if ($currentWordWeight > 0) {
- $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight;
- }
-
- $firstWordOfSentence = 0;
- }
-}
-close(CORPUS);
-
-open(MODEL,">$MODEL") || die("ERROR: could not create '$MODEL'");
-binmode(MODEL, ":utf8");
-foreach my $type (keys %CASING) {
- my ($score,$total,$best) = (-1,0,"");
- foreach my $word (keys %{$CASING{$type}}) {
- my $count = $CASING{$type}{$word};
- $total += $count;
- if ($count > $score) {
- $best = $word;
- $score = $count;
- }
- }
- print MODEL "$best ($score/$total)";
- foreach my $word (keys %{$CASING{$type}}) {
- print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best;
- }
- print MODEL "\n";
-}
-close(MODEL);
-
-
-# store away xml markup
-sub split_xml {
- my ($line) = @_;
- my (@WORD,@MARKUP);
- my $i = 0;
- $MARKUP[0] = "";
- while($line =~ /\S/) {
- # XML tag
- if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
- $MARKUP[$i] .= $1." ";
- $line = $2;
- }
- # non-XML text
- elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
- $WORD[$i++] = $1;
- $MARKUP[$i] = "";
- $line = $2;
- }
- # '<' or '>' occurs in word, but it's not an XML tag
- elsif ($line =~ /^\s*(\S+)(.*)$/) {
- $WORD[$i++] = $1;
- $MARKUP[$i] = "";
- $line = $2;
- }
- else {
- die("ERROR: huh? $line\n");
- }
- }
- chop($MARKUP[$#MARKUP]);
- return (\@WORD,\@MARKUP);
-}
diff --git a/truecase.perl b/truecase.perl
deleted file mode 100755
index 0a4d366..0000000
--- a/truecase.perl
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/usr/bin/perl -w
-
-# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
-use strict;
-use Getopt::Long "GetOptions";
-
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
-
-# apply switches
-my ($MODEL, $UNBUFFERED);
-die("truecase.perl --model MODEL [-b] < in > out")
- unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
- && defined($MODEL);
-if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
-
-my (%BEST,%KNOWN);
-open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
-binmode(MODEL, ":utf8");
-while(<MODEL>) {
- my ($word,@OPTIONS) = split;
- $BEST{ lc($word) } = $word;
- $KNOWN{ $word } = 1;
- for(my $i=1;$i<$#OPTIONS;$i+=2) {
- $KNOWN{ $OPTIONS[$i] } = 1;
- }
-}
-close(MODEL);
-
-my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
-my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
-
-while(<STDIN>) {
- chop;
- my ($WORD,$MARKUP) = split_xml($_);
- my $sentence_start = 1;
- for(my $i=0;$i<=$#$WORD;$i++) {
- print " " if $i && $$MARKUP[$i] eq '';
- print $$MARKUP[$i];
-
- my ($word,$otherfactors);
- if ($$WORD[$i] =~ /^([^\|]+)(.*)/)
- {
- $word = $1;
- $otherfactors = $2;
- }
- else
- {
- $word = $$WORD[$i];
- $otherfactors = "";
- }
-
- if ($sentence_start && defined($BEST{lc($word)})) {
- print $BEST{lc($word)}; # truecase sentence start
- }
- elsif (defined($KNOWN{$word})) {
- print $word; # don't change known words
- }
- elsif (defined($BEST{lc($word)})) {
- print $BEST{lc($word)}; # truecase otherwise unknown words
- }
- else {
- print $word; # unknown, nothing to do
- }
- print $otherfactors;
-
- if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; }
- elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; }
- }
- print $$MARKUP[$#$MARKUP];
- print "\n";
-}
-
-# store away xml markup
-sub split_xml {
- my ($line) = @_;
- my (@WORD,@MARKUP);
- my $i = 0;
- $MARKUP[0] = "";
- while($line =~ /\S/) {
- # XML tag
- if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
- $MARKUP[$i] .= $1." ";
- $line = $2;
- }
- # non-XML text
- elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
- $WORD[$i++] = $1;
- $MARKUP[$i] = "";
- $line = $2;
- }
- # '<' or '>' occurs in word, but it's not an XML tag
- elsif ($line =~ /^\s*(\S+)(.*)$/) {
- $WORD[$i++] = $1;
- $MARKUP[$i] = "";
- $line = $2;
- }
- else {
- die("ERROR: huh? $line\n");
- }
- }
- chop($MARKUP[$#MARKUP]);
- return (\@WORD,\@MARKUP);
-}