diff options
author | Patrick Simianer <p@simianer.de> | 2014-07-22 17:08:32 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-07-22 17:08:32 +0200 |
commit | bad18ebfd358e4352643831040e45ad5435eff48 (patch) | |
tree | 1325725a26bfc5569b0be88aab22f4daa09c973c /compound-splitter.perl | |
parent | 22b81757a97f89e6b3415782c0556ffaf4625baa (diff) |
compound-splitter.perl (taken from moses v2.1.1)
Diffstat (limited to 'compound-splitter.perl')
-rwxr-xr-x | compound-splitter.perl | 291 |
1 files changed, 291 insertions, 0 deletions
diff --git a/compound-splitter.perl b/compound-splitter.perl new file mode 100755 index 0000000..beca70e --- /dev/null +++ b/compound-splitter.perl @@ -0,0 +1,291 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long "GetOptions"; + +my ($CORPUS,$MODEL,$TRAIN,$HELP,$VERBOSE); +my $FILLER = ":s:es"; +my $MIN_SIZE = 3; +my $MIN_COUNT = 5; +my $MAX_COUNT = 5; +my $FACTORED = 0; +my $SYNTAX = 0; +my $MARK_SPLIT = 0; +my $BINARIZE = 0; +$HELP = 1 + unless &GetOptions('corpus=s' => \$CORPUS, + 'model=s' => \$MODEL, + 'filler=s' => \$FILLER, + 'factored' => \$FACTORED, + 'min-size=i' => \$MIN_SIZE, + 'min-count=i' => \$MIN_COUNT, + 'max-count=i' => \$MAX_COUNT, + 'help' => \$HELP, + 'verbose' => \$VERBOSE, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, + 'train' => \$TRAIN); + +if ($HELP || + ( $TRAIN && !$CORPUS) || + (!$TRAIN && !$MODEL)) { + print "Compound splitter\n"; + print "-----------------\n\n"; + print "train: compound-splitter -train -corpus txt-file -model new-model\n"; + print "apply: compound-splitter -model trained-model < in > out\n"; + print "options: -min-size: minimum word size (default $MIN_SIZE)\n"; + print " -min-count: minimum word count (default $MIN_COUNT)\n"; + print " -filler: filler letters between words (default $FILLER)\n"; + print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n"; + print " -syntax: syntactically parsed data (default $SYNTAX)\n"; + print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n"; + print " -binarize: binarize subtree for split word (default $BINARIZE)\n"; + exit; +} + +if ($TRAIN) { + if ($SYNTAX) { &train_syntax(); } + elsif ($FACTORED) { &train_factored(); } + else { &train(); } +} +else { + &apply(); +} + +sub train { + my %COUNT; + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while(<CORPUS>) { + chop; s/\s+/ /g; s/^ //; s/ $//; + foreach (split) { + $COUNT{$_}++; + } + } + close(CORPUS); + &save_trained_model(\%COUNT); +} + +sub save_trained_model { + my ($COUNT) = @_; + my $id = 0; + open(MODEL,">".$MODEL); + foreach my $word (keys %$COUNT) { + print MODEL "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n"; + } + close(MODEL); + print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n"; +} + +sub train_factored { + my (%COUNT,%FACTORED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while(<CORPUS>) { + chop; s/\s+/ /g; s/^ //; s/ $//; + foreach my $factored_word (split) { + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + $FACTORED_COUNT{$word}{$factored_word}++; + } + } + close(CORPUS); + # only preserve most frequent interpretation, assign sum of counts + foreach my $word (keys %FACTORED_COUNT) { + my ($max,$best,$total) = (0,"",0); + foreach my $factored_word (keys %{$FACTORED_COUNT{$word}}) { + my $count = $FACTORED_COUNT{$word}{$factored_word}; + $total += $count; + if ($count > $max) { + $max = $count; + $best = $factored_word; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); +} + +sub train_syntax { + my (%COUNT,%LABELED_COUNT); + # collect counts for interpretations for each surface word + open(CORPUS,$CORPUS) || die("ERROR: could not open corpus '$CORPUS'"); + while(<CORPUS>) { + chop; s/\s+/ /g; s/^ //; s/ $//; + my $label; + foreach (split) { + if (/^label="([^\"]+)"/) { + $label = $1; + } + elsif (! /^</) { + $LABELED_COUNT{$_}{$label}++; + } + } + } + close(CORPUS); + + # only preserve most frequent label, assign sum of counts + foreach my $word (keys %LABELED_COUNT) { + my ($max,$best,$total) = (0,"",0); + foreach my $label (keys %{$LABELED_COUNT{$word}}) { + my $count = $LABELED_COUNT{$word}{$label}; + $total += $count; + if ($count > $max) { + $max = $count; + $best = "$word $label"; + } + } + $COUNT{$best} = $total; + } + &save_trained_model(\%COUNT); +} + +sub apply { + my (%COUNT,%TRUECASE,%LABEL); + open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'"); + while(<MODEL>) { + chomp; + my ($id,$factored_word,$count) = split(/\t/); + my $label; + ($factored_word,$label) = split(/ /,$factored_word); + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + # if word exists with multipe casings, only record most frequent + next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; + $COUNT{$lc} = $count; + $TRUECASE{$lc} = $factored_word; + $LABEL{$lc} = $label if $SYNTAX; + } + close(MODEL); + + while(<STDIN>) { + my $first = 1; + chop; s/\s+/ /g; s/^ //; s/ $//; + my @BUFFER; # for xml tags + foreach my $factored_word (split) { + print " " unless $first; + $first = 0; + + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + + print STDERR "considering $word ($lc)...\n" if $VERBOSE; + # don't split frequent words + if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || + $lc !~ /[a-zA-Z]/) {; # has to have at least one letter + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; + print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; + next; + } + + # consider possible splits + my $final = length($word)-1; + my %REACHABLE; + for(my $i=0;$i<=$final;$i++) { $REACHABLE{$i} = (); } + + print STDERR "splitting $word:\n" if $VERBOSE; + for(my $end=$MIN_SIZE;$end<length($word);$end++) { + for(my $start=0;$start<=$end-$MIN_SIZE;$start++) { + next unless $start == 0 || defined($REACHABLE{$start-1}); + foreach my $filler (split(/:/,$FILLER)) { + next if $start == 0 && $filler ne ""; + next if lc(substr($word,$start,length($filler))) ne $filler; + my $subword = lc(substr($word, + $start+length($filler), + $end-$start+1-length($filler))); + next unless defined($COUNT{$subword}); + next unless $COUNT{$subword} >= $MIN_COUNT; + print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE; + push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}"; + } + } + } + + # no matches at all? + if (!defined($REACHABLE{$final})) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; + next; + } + + my ($best_split,$best_score) = ("",0); + + my %ITERATOR; + for(my $i=0;$i<=$final;$i++) { $ITERATOR{$i}=0; } + my $done = 0; + while(1) { + # read off word + my ($pos,$decomp,$score,$num,@INDEX) = ($final,"",1,0); + while($pos>0) { + last unless scalar @{$REACHABLE{$pos}} > $ITERATOR{$pos}; # dead end? + my ($nextpos,$subword,$count) + = split(/ /,$REACHABLE{$pos}[ $ITERATOR{$pos} ]); + $decomp = $subword." ".$decomp; + $score *= $count; + $num++; + push @INDEX,$pos; +# print STDERR "($nextpos-$pos,$decomp,$score,$num)\n"; + $pos = $nextpos-1; + } + + chop($decomp); + print STDERR "\tsplit: $decomp ($score ** 1/$num) = ".($score ** (1/$num))."\n" if $VERBOSE; + $score **= 1/$num; + if ($score>$best_score) { + $best_score = $score; + $best_split = $decomp; + } + + # increase iterator + my $increase = -1; + while($increase<$final) { + $increase = pop @INDEX; + $ITERATOR{$increase}++; + last if scalar @{$REACHABLE{$increase}} > $ITERATOR{$increase}; + } + last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final}; + for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; } + } + if ($best_split !~ / /) { + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + print $factored_word; # do not change case for unsplit words + next; + } + if (!$SYNTAX) { + print $best_split; + } + else { + $BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT; + $BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n"); + my $pos = $1; + print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + + my @SPLIT = split(/ /,$best_split); + my @OUT = (); + if ($BINARIZE) { + for(my $w=0;$w<scalar(@SPLIT)-2;$w++) { + push @OUT,"<tree label=\"\@$pos\">"; + } + } + for(my $w=0;$w<scalar(@SPLIT);$w++) { + if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; } + push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>"; + } + print join(" ",@OUT); + } + } + print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer + print "\n"; + } +} |