#!/usr/bin/perl -w use strict; use Getopt::Long "GetOptions"; my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE); my $FILLER = ":s:es"; my $MIN_SIZE = 3; my $MIN_COUNT = 5; my $MAX_COUNT = 5; my $FACTORED = 0; my $SYNTAX = 0; my $MARK_SPLIT = 0; my $BINARIZE = 0; $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, 'factored' => \$FACTORED, 'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, 'syntax' => \$SYNTAX, 'binarize' => \$BINARIZE, 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || ( $TRAIN && !$CORPUS) || (!$TRAIN && !$MODEL)) { print "Compound splitter\n"; print "-----------------\n\n"; print "train: compound-splitter -train -corpus txt-file -model new-model\n"; print "apply: compound-splitter -model trained-model < in > out\n"; print "options: -min-size: minimum word size (default $MIN_SIZE)\n"; print " -min-count: minimum word count (default $MIN_COUNT)\n"; print " -filler: filler letters between words (default $FILLER)\n"; print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n"; print " -syntax: syntactically parsed data (default $SYNTAX)\n"; print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n"; print " -binarize: binarize subtree for split word (default $BINARIZE)\n"; exit; } if ($TRAIN) { if ($SYNTAX) { &train_syntax(); } elsif ($FACTORED) { &train_factored(); } else { &train(); } } else { &apply(); } sub train { my %COUNT; open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); while() { chop; s/\s+/ /g; s/^ //; s/ $//; foreach (split) { $COUNT{$_}++; } } close(CORPUS); &save_trained_model(\%COUNT); } sub save_trained_model { my ($COUNT) = @_; my $id = 0; open(MODEL,">".$MODEL); foreach my $word (keys %$COUNT) { print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n"; } close(MODEL); print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n"; } sub train_factored { my (%COUNT,%FACTORED_COUNT); # collect counts for interpretations for each surface word open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); while() { chop; s/\s+/ /g; s/^ //; s/ $//; foreach my $factored_word (split) { my $word = $factored_word; $word =~ s/\|.+//g; # just first factor $FACTORED_COUNT{$word}{$factored_word}++; } } close(CORPUS); # only preserve most frequent interpretation, assign sum of counts foreach my $word (keys %FACTORED_COUNT) { my ($max,$best,$total) = (0,"",0); foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) { my $count = $FACTORED_COUNT{$word}{$factored_word}; $total += $count; if ($count > $max) { $max = $count; $best = $factored_word; } } $COUNT{$best} = $total; } &save_trained_model(\%COUNT); } sub train_syntax { my (%COUNT,%LABELED_COUNT); # collect counts for interpretations for each surface word open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); while() { chop; s/\s+/ /g; s/^ //; s/ $//; my $label; foreach (split) { if (/^label="([^\"]+)"/) { $label = $1; } elsif (! /^ $max) { $max = $count; $best = "$word $label"; } } $COUNT{$best} = $total; } &save_trained_model(\%COUNT); } sub apply { my (%COUNT,%TRUECASE,%LABEL); open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'"); while() { chomp; my ($id,$factored_word,$count) = split(/\t/); my $label; ($factored_word,$label) = split(/ /,$factored_word); my $word = $factored_word; $word =~ s/\|.+//g; # just first factor my $lc = lc($word); # if word exists with multipe casings, only record most frequent next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; $COUNT{$lc} = $count; $TRUECASE{$lc} = $factored_word; $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while() { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; my @BUFFER; # for xml tags foreach my $factored_word (split) { print " " unless $first; $first = 0; # syntax: don't split xml if ($SYNTAX && ($factored_word =~ /^$/)) { push @BUFFER,$factored_word; $first = 1; next; } # get case class my $word = $factored_word; $word =~ s/\|.+//g; # just first factor my $lc = lc($word); print STDERR "considering $word ($lc)...\n" if $VERBOSE; # don't split frequent words if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || $lc !~ /[a-zA-Z]/) {; # has to have at least one letter print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; next; } # consider possible splits my $final = length($word)-1; my %REACHABLE; for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); } print STDERR "splitting $word:\n" if $VERBOSE; for(my $end=$MIN_SIZE;$end= $MIN_COUNT; print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE; push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}"; } } } # no matches at all? if (!defined($REACHABLE{$final})) { print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; next; } my ($best_split,$best_score) = ("",0); my %ITERATOR; for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; } my $done = 0; while(1) { # read off word my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0); while($pos>0) { last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end? my ($nextpos,$subword,$count) = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]); $decomp = $subword." ".$decomp; $score *= $count; $num++; push @INDEX,$pos; # print STDERR "($nextpos-$pos,$decomp,$score,$num)\n"; $pos = $nextpos-1; } chop($decomp); print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE; $score **= 1/$num; if ($score>$best_score) { $best_score = $score; $best_split = $decomp; } # increase iterator my $increase = -1; while($increase<$final) { $increase = pop @INDEX; $ITERATOR{$increase}++; last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase}; } last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final}; for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; } } if ($best_split !~ / /) { print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; # do not change case for unsplit words next; } if (!$SYNTAX) { print $best_split; } else { $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT; $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n"); my $pos = $1; print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer my @SPLIT = split(/ /,$best_split); my @OUT = (); if ($BINARIZE) { for(my $w=0;$w"; } } for(my $w=0;$w=2) { push @OUT, ""; } push @OUT," $SPLIT[$w] "; } print join(" ",@OUT); } } print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer print "\n"; } }