summaryrefslogtreecommitdiff
path: root/compound-splitter.perl
diff options
context:
space:
mode:
authorPatrick Simianer <pks@pks.rocks>2018-04-11 15:21:44 +0200
committerPatrick Simianer <pks@pks.rocks>2018-04-11 15:21:44 +0200
commit26c944f3bf8e97e35d9662d8672c9527fb13c45f (patch)
tree35f4d2c27e8bdc72e861bb92a3b1294df1ed2785 /compound-splitter.perl
parent31a3b846e2ae174fb68b61a9e9070e32c11509a6 (diff)
rm
Diffstat (limited to 'compound-splitter.perl')
-rwxr-xr-xcompound-splitter.perl291
1 files changed, 0 insertions, 291 deletions
diff --git a/compound-splitter.perl b/compound-splitter.perl
deleted file mode 100755
index beca70e..0000000
--- a/compound-splitter.perl
+++ /dev/null
@@ -1,291 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use Getopt::Long "GetOptions";
-
-my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE);
-my $FILLER = ":s:es";
-my $MIN_SIZE = 3;
-my $MIN_COUNT = 5;
-my $MAX_COUNT = 5;
-my $FACTORED = 0;
-my $SYNTAX = 0;
-my $MARK_SPLIT = 0;
-my $BINARIZE = 0;
-$HELP = 1
- unless &GetOptions('corpus=s' => \$CORPUS,
- 'model=s' => \$MODEL,
- 'filler=s' => \$FILLER,
- 'factored' => \$FACTORED,
- 'min-size=i' => \$MIN_SIZE,
- 'min-count=i' => \$MIN_COUNT,
- 'max-count=i' => \$MAX_COUNT,
- 'help' => \$HELP,
- 'verbose' => \$VERBOSE,
- 'syntax' => \$SYNTAX,
- 'binarize' => \$BINARIZE,
- 'mark-split' => \$MARK_SPLIT,
- 'train' => \$TRAIN);
-
-if ($HELP ||
- ( $TRAIN && !$CORPUS) ||
- (!$TRAIN && !$MODEL)) {
- print "Compound splitter\n";
- print "-----------------\n\n";
- print "train: compound-splitter -train -corpus txt-file -model new-model\n";
- print "apply: compound-splitter -model trained-model < in > out\n";
- print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
- print " -min-count: minimum word count (default $MIN_COUNT)\n";
- print " -filler: filler letters between words (default $FILLER)\n";
- print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
- print " -syntax: syntactically parsed data (default $SYNTAX)\n";
- print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
- print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
- exit;
-}
-
-if ($TRAIN) {
- if ($SYNTAX) { &train_syntax(); }
- elsif ($FACTORED) { &train_factored(); }
- else { &train(); }
-}
-else {
- &apply();
-}
-
-sub train {
- my %COUNT;
- open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
- while(<CORPUS>) {
- chop; s/\s+/ /g; s/^ //; s/ $//;
- foreach (split) {
- $COUNT{$_}++;
- }
- }
- close(CORPUS);
- &save_trained_model(\%COUNT);
-}
-
-sub save_trained_model {
- my ($COUNT) = @_;
- my $id = 0;
- open(MODEL,">".$MODEL);
- foreach my $word (keys %$COUNT) {
- print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
- }
- close(MODEL);
- print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
-}
-
-sub train_factored {
- my (%COUNT,%FACTORED_COUNT);
- # collect counts for interpretations for each surface word
- open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
- while(<CORPUS>) {
- chop; s/\s+/ /g; s/^ //; s/ $//;
- foreach my $factored_word (split) {
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- $FACTORED_COUNT{$word}{$factored_word}++;
- }
- }
- close(CORPUS);
- # only preserve most frequent interpretation, assign sum of counts
- foreach my $word (keys %FACTORED_COUNT) {
- my ($max,$best,$total) = (0,"",0);
- foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) {
- my $count = $FACTORED_COUNT{$word}{$factored_word};
- $total += $count;
- if ($count > $max) {
- $max = $count;
- $best = $factored_word;
- }
- }
- $COUNT{$best} = $total;
- }
- &save_trained_model(\%COUNT);
-}
-
-sub train_syntax {
- my (%COUNT,%LABELED_COUNT);
- # collect counts for interpretations for each surface word
- open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'");
- while(<CORPUS>) {
- chop; s/\s+/ /g; s/^ //; s/ $//;
- my $label;
- foreach (split) {
- if (/^label="([^\"]+)"/) {
- $label = $1;
- }
- elsif (! /^</) {
- $LABELED_COUNT{$_}{$label}++;
- }
- }
- }
- close(CORPUS);
-
- # only preserve most frequent label, assign sum of counts
- foreach my $word (keys %LABELED_COUNT) {
- my ($max,$best,$total) = (0,"",0);
- foreach my $label (keys %{$LABELED_COUNT{$word}}) {
- my $count = $LABELED_COUNT{$word}{$label};
- $total += $count;
- if ($count > $max) {
- $max = $count;
- $best = "$word $label";
- }
- }
- $COUNT{$best} = $total;
- }
- &save_trained_model(\%COUNT);
-}
-
-sub apply {
- my (%COUNT,%TRUECASE,%LABEL);
- open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
- while(<MODEL>) {
- chomp;
- my ($id,$factored_word,$count) = split(/\t/);
- my $label;
- ($factored_word,$label) = split(/ /,$factored_word);
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- my $lc = lc($word);
- # if word exists with multipe casings, only record most frequent
- next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
- $COUNT{$lc} = $count;
- $TRUECASE{$lc} = $factored_word;
- $LABEL{$lc} = $label if $SYNTAX;
- }
- close(MODEL);
-
- while(<STDIN>) {
- my $first = 1;
- chop; s/\s+/ /g; s/^ //; s/ $//;
- my @BUFFER; # for xml tags
- foreach my $factored_word (split) {
- print " " unless $first;
- $first = 0;
-
- # syntax: don't split xml
- if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
- push @BUFFER,$factored_word;
- $first = 1;
- next;
- }
-
- # get case class
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- my $lc = lc($word);
-
- print STDERR "considering $word ($lc)...\n" if $VERBOSE;
- # don't split frequent words
- if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
- $lc !~ /[a-zA-Z]/) {; # has to have at least one letter
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print $factored_word;
- print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
- next;
- }
-
- # consider possible splits
- my $final = length($word)-1;
- my %REACHABLE;
- for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); }
-
- print STDERR "splitting $word:\n" if $VERBOSE;
- for(my $end=$MIN_SIZE;$end<length($word);$end++) {
- for(my $start=0;$start<=$end-$MIN_SIZE;$start++) {
- next unless $start == 0 || defined($REACHABLE{$start-1});
- foreach my $filler (split(/:/,$FILLER)) {
- next if $start == 0 && $filler ne "";
- next if lc(substr($word,$start,length($filler))) ne $filler;
- my $subword = lc(substr($word,
- $start+length($filler),
- $end-$start+1-length($filler)));
- next unless defined($COUNT{$subword});
- next unless $COUNT{$subword} >= $MIN_COUNT;
- print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
- push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
- }
- }
- }
-
- # no matches at all?
- if (!defined($REACHABLE{$final})) {
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print $factored_word;
- next;
- }
-
- my ($best_split,$best_score) = ("",0);
-
- my %ITERATOR;
- for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; }
- my $done = 0;
- while(1) {
- # read off word
- my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0);
- while($pos>0) {
- last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end?
- my ($nextpos,$subword,$count)
- = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]);
- $decomp = $subword." ".$decomp;
- $score *= $count;
- $num++;
- push @INDEX,$pos;
-# print STDERR "($nextpos-$pos,$decomp,$score,$num)\n";
- $pos = $nextpos-1;
- }
-
- chop($decomp);
- print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE;
- $score **= 1/$num;
- if ($score>$best_score) {
- $best_score = $score;
- $best_split = $decomp;
- }
-
- # increase iterator
- my $increase = -1;
- while($increase<$final) {
- $increase = pop @INDEX;
- $ITERATOR{$increase}++;
- last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase};
- }
- last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
- for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
- }
- if ($best_split !~ / /) {
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print $factored_word; # do not change case for unsplit words
- next;
- }
- if (!$SYNTAX) {
- print $best_split;
- }
- else {
- $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
- $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
- my $pos = $1;
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
-
- my @SPLIT = split(/ /,$best_split);
- my @OUT = ();
- if ($BINARIZE) {
- for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
- push @OUT,"<tree label=\"\@$pos\">";
- }
- }
- for(my $w=0;$w<scalar(@SPLIT);$w++) {
- if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
- push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
- }
- print join(" ",@OUT);
- }
- }
- print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
- print "\n";
- }
-}