From 7cc92b65a3185aa242088d830e166e495674efc9 Mon Sep 17 00:00:00 2001 From: redpony Date: Tue, 22 Jun 2010 05:12:27 +0000 Subject: initial checkin git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f --- word-aligner/support/classify.pl | 27 ++ word-aligner/support/extract_grammar.pl | 11 + word-aligner/support/extract_vocab.pl | 20 ++ word-aligner/support/extract_weights.pl | 17 + word-aligner/support/invert_grammar.pl | 8 + word-aligner/support/make_lex_grammar.pl | 435 ++++++++++++++++++++++++ word-aligner/support/merge_corpus.pl | 18 + word-aligner/support/supplement_weights_file.pl | 73 ++++ 8 files changed, 609 insertions(+) create mode 100755 word-aligner/support/classify.pl create mode 100755 word-aligner/support/extract_grammar.pl create mode 100755 word-aligner/support/extract_vocab.pl create mode 100755 word-aligner/support/extract_weights.pl create mode 100755 word-aligner/support/invert_grammar.pl create mode 100755 word-aligner/support/make_lex_grammar.pl create mode 100755 word-aligner/support/merge_corpus.pl create mode 100755 word-aligner/support/supplement_weights_file.pl (limited to 'word-aligner/support') diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl new file mode 100755 index 00000000..893c7b22 --- /dev/null +++ b/word-aligner/support/classify.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2; + +my ($class, $text) = @ARGV; +open C, "<$class" or die "Can't read $class: $!"; +open T, "<$text" or die "Can't read $text: $!"; + +my %dict = (); +my $cc = 0; +while() { + chomp; + my ($word, $cat) = split /\s+/; + die "'$word' '$cat'" unless (defined $word && defined $cat); + $dict{$word} = $cat; + $cc++; +} +close C; +print STDERR "Loaded classes for $cc words\n"; + +while() { + chomp; + my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; + print "@cats\n"; +} + diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl new file mode 100755 index 00000000..d7275ef5 --- /dev/null +++ b/word-aligner/support/extract_grammar.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; + +my $key = shift @ARGV; +die "Usage: $0 KEY\n" unless defined $key; + +while(<>) { + my ($k, @rest) = split / \|\|\| /; + print join(' ||| ', @rest) if ($k eq $key); +} + diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl new file mode 100755 index 00000000..070d4202 --- /dev/null +++ b/word-aligner/support/extract_vocab.pl @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w +use strict; + +print STDERR "Extracting vocabulary...\n"; +my %dict = (); +my $wc = 0; +while(<>) { + chomp; + my @words = split /\s+/; + for my $word (@words) { $wc++; $dict{$word}++; } +} + +my $tc = 0; +for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { + print "$word\n"; + $tc++; +} + +print STDERR "$tc types / $wc tokens\n"; + diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl new file mode 100755 index 00000000..dfedd12e --- /dev/null +++ b/word-aligner/support/extract_weights.pl @@ -0,0 +1,17 @@ +#!/usr/bin/perl -w +use strict; +my %dict=(); +while(<>) { + chomp; + my ($dummy, $a, $b, $wts) = split / \|\|\| /; + my @weights = split /\s+/, $wts; + for my $w (@weights) { + my ($name, $val) = split /=/, $w; + unless ($dict{$name}) { + my $r = (0.5 - rand) / 5; + $r = sprintf ("%0.4f", $r); + print "$name $r\n"; + $dict{$name}= 1; + } + } +} diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl new file mode 100755 index 00000000..3988388d --- /dev/null +++ b/word-aligner/support/invert_grammar.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + my ($f, $e, $scores) = split / \|\|\| /; + print "$e ||| $f ||| $scores"; +} + diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl new file mode 100755 index 00000000..c3e29540 --- /dev/null +++ b/word-aligner/support/make_lex_grammar.pl @@ -0,0 +1,435 @@ +#!/usr/bin/perl -w +use utf8; +use strict; + +my $LIMIT_SIZE=30; + +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; + +my %eclass = (); +my %fclass = (); +load_classes($class_e, \%eclass); +load_classes($class_f, \%fclass); + +our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14); +die unless scalar @IDENT_BINS == 15; +our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS; + +our %cache; +open EF, "<$effile" or die; +open M1, "<$model1" or die; +open IM1, "<$imodel1" or die; +binmode(EF,":utf8"); +binmode(M1,":utf8"); +binmode(IM1,":utf8"); +binmode(STDOUT,":utf8"); +my %model1; +print STDERR "Reading model1...\n"; +my %sizes = (); +while() { + chomp; + my ($f, $e, $lp) = split /\s+/; + $model1{$f}->{$e} = 1; + $sizes{$f}++; +} +close M1; + +my $inv_add = 0; +my %invm1; +print STDERR "Reading inverse model1...\n"; +my %esizes=(); +while() { + chomp; + my ($e, $f, $lp) = split /\s+/; + $invm1{$e}->{$f} = 1; + $esizes{$e}++; + if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) { + $model1{$f}->{$e} = 1; + $sizes{$f}++; + $inv_add++; + } +} +close IM1; +print STDERR "Added $inv_add from inverse model1\n"; + +open M1, "<$model1" or die; +binmode(M1,":utf8"); +my $dir_add = 0; +print STDERR "Reading model1 (again) for extra inverse translations...\n"; +while() { + chomp; + my ($f, $e, $lp) = split /\s+/; + if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) { + $invm1{$e}->{$f} = 1; + $esizes{$e}++; + $dir_add++; + } +} +close M1; +print STDERR "Added $dir_add from model 1\n"; +print STDERR "Generating grammars...\n"; +open OE, "<$orthoe" or die; +binmode(OE,":utf8"); +my %oe_dict; +while() { + chomp; + my ($a, $b) = split / \|\|\| /, $_; + die "BAD: $_" unless defined $a && defined $b; + $oe_dict{$a} = $b; +} +close OE; +open OF, "<$orthof" or die; +binmode(OF,":utf8"); +my %of_dict; +while() { + chomp; + my ($a, $b) = split / \|\|\| /, $_; + die "BAD: $_" unless defined $a && defined $b; + $of_dict{$a} = $b; +} +close OF; +$of_dict{''} = ''; +$oe_dict{''} = ''; + +my $MIN_FEATURE_COUNT = 0; +my $ADD_PREFIX_ID = 0; +my $ADD_CLASS_CLASS = 1; +my $ADD_LEN = 1; +my $ADD_SIM = 1; +my $ADD_DICE = 1; +my $ADD_111 = 1; +my $ADD_ID = 1; +my $ADD_PUNC = 1; +my $ADD_NULL = 0; +my $ADD_STEM_ID = 1; +my $ADD_SYM = 0; +my $BEAM_RATIO = 50; +my $BIN_ORTHO = 1; +my $BIN_DLEN = 1; +my $BIN_IDENT = 1; +my $BIN_DICE = 1; + +my %fdict; +my %fcounts; +my %ecounts; + +my %sdict; + +while() { + chomp; + my ($f, $e) = split /\s*\|\|\|\s*/; + my @es = split /\s+/, $e; + my @fs = split /\s+/, $f; + for my $ew (@es){ + die "E: Empty word" if $ew eq ''; + $ecounts{$ew}++; + } + push @fs, '' if $ADD_NULL; + for my $fw (@fs){ + die "F: Empty word" if $fw eq ''; + $fcounts{$fw}++; + } + for my $fw (@fs){ + for my $ew (@es){ + $fdict{$fw}->{$ew}++; + } + } +} + +my $specials = 0; +my $fc = 1000000; +my $sids = 1000000; +for my $f (sort keys %fdict) { + my $re = $fdict{$f}; + my $max; + for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { + my $efcount = $re->{$e}; + unless (defined $max) { $max = $efcount; } + my $m1 = $model1{$f}->{$e}; + my $im1 = $invm1{$e}->{$f}; + my $is_good_pair = (defined $m1); + my $is_inv_good_pair = (defined $im1); + my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); + my @feats; + if ($efcount > $MIN_FEATURE_COUNT) { + $fc++; + push @feats, "F$fc=1"; + } + if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; } + my $oe = $oe_dict{$e}; + die "Can't find orthonorm form for $e" unless defined $oe; + my $of = $of_dict{$f}; + die "Can't find orthonorm form for $f" unless defined $of; + my $len_e = length($oe); + my $len_f = length($of); + if ($ADD_DICE) { + if ($BIN_DICE) { + push @feats, dicebin($dice) . '=1'; + } else { + push @feats, "Dice=$dice"; + } + } + if ($ADD_CLASS_CLASS) { + my $ce = $eclass{$e} or die "E- no class for: '$e'"; + my $cf = $fclass{$f} or die "F- no class for: '$f'"; + push @feats, "C${cf}_${ce}=1"; + } + my $is_null = undef; + if ($ADD_NULL && $f eq '') { + push @feats, "IsNull=1"; + $is_null = 1; + } + if ($ADD_LEN) { + if (!$is_null) { + my $dlen = abs($len_e - $len_f); + if ($BIN_DLEN) { + push @feats, dlenbin($dlen) . '=1'; + } else { + push @feats, "DLen=$dlen"; + } + } + } + my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); + my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); + my $both_non_numeric = (!$e_num && !$f_num); + if ($ADD_STEM_ID) { + my $el = 4; + my $fl = 4; + if ($oe =~ /^al|re|co/) { $el++; } + if ($of =~ /^al|re|co/) { $fl++; } + if ($oe =~ /^trans|inter/) { $el+=2; } + if ($of =~ /^trans|inter/) { $fl+=2; } + if ($fl > length($of)) { $fl = length($of); } + if ($el > length($oe)) { $el = length($oe); } + my $sf = substr $of, 0, $fl; + my $se = substr $oe, 0, $el; + my $id = $sdict{$sf}->{$se}; + if (!$id) { + $sids++; + $sdict{$sf}->{$se} = $sids; + $id = $sids; + } + push @feats, "S$id=1"; + } + if ($ADD_PREFIX_ID) { + if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { + my $pe = substr $oe, 0, 3; + my $pf = substr $of, 0, 3; + if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } + } + } + if ($ADD_SIM) { + my $ld = 0; + my $eff = $len_e; + if ($eff < $len_f) { $eff = $len_f; } + if (!$is_null) { + $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); + } + if ($ld > 1.5) { $is_good_pair = 1; } + if ($BIN_ORTHO) { + push @feats, orthobin($ld) . '=1'; + } else { + push @feats, "OrthoSim=$ld"; + } + } + my $ident = ($e eq $f); + if ($ident) { $is_good_pair = 1; } + if ($ident && $ADD_ID) { + if ($BIN_IDENT) { + push @feats, identbin($len_e) . '=1'; + } else { + push @feats, "Identical=$len_e"; + } + } + if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { + $is_good_pair = 1; + if ($ADD_111) { + push @feats, "OneOneOne=1"; + } + } + if ($ADD_PUNC) { + if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { + push @feats, "PuncMiss=1"; + } + } + my $is_special = ($is_good_pair && !(defined $m1)); + $specials++ if $is_special; + print STDERR "$f -> $e\n" if $is_special; + print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair; + print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair; + } +} +print STDERR "Added $specials special rules that were not in the M1 set\n"; + + +sub levenshtein +{ + # $s1 and $s2 are the two strings + # $len1 and $len2 are their respective lengths + # + my ($s1, $s2) = @_; + my ($len1, $len2) = (length $s1, length $s2); + + # If one of the strings is empty, the distance is the length + # of the other string + # + return $len2 if ($len1 == 0); + return $len1 if ($len2 == 0); + + my %mat; + + # Init the distance matrix + # + # The first row to 0..$len1 + # The first column to 0..$len2 + # The rest to 0 + # + # The first row and column are initialized so to denote distance + # from the empty string + # + for (my $i = 0; $i <= $len1; ++$i) + { + for (my $j = 0; $j <= $len2; ++$j) + { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + + $mat{$i}{0} = $i; + } + + # Some char-by-char processing is ahead, so prepare + # array of chars from the strings + # + my @ar1 = split(//, $s1); + my @ar2 = split(//, $s2); + + for (my $i = 1; $i <= $len1; ++$i) + { + for (my $j = 1; $j <= $len2; ++$j) + { + # Set the cost to 1 iff the ith char of $s1 + # equals the jth of $s2 + # + # Denotes a substitution cost. When the char are equal + # there is no need to substitute, so the cost is 0 + # + my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; + + # Cell $mat{$i}{$j} equals the minimum of: + # + # - The cell immediately above plus 1 + # - The cell immediately to the left plus 1 + # - The cell diagonally above and to the left plus the cost + # + # We can either insert a new char, delete a char or + # substitute an existing char (with an associated cost) + # + $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + } + } + + # Finally, the Levenshtein distance equals the rightmost bottom cell + # of the matrix + # + # Note that $mat{$x}{$y} denotes the distance between the substrings + # 1..$x and 1..$y + # + return $mat{$len1}{$len2}; +} + + +# minimal element of a list +# +sub min +{ + my @list = @{$_[0]}; + my $min = $list[0]; + + foreach my $i (@list) + { + $min = $i if ($i < $min); + } + + return $min; +} + +sub load_classes { + my ($file, $ref) = @_; + print STDERR "Reading classes from $file...\n"; + open F, "<$file" or die "Can't read $file: $!"; + binmode(F, ":utf8") or die; + while() { + chomp; + my ($word, $class) = split /\s+/; +# print STDERR "'$word' -> $class\n"; + $ref->{$word} = $class; + } + close F; +} + +sub dicebin { + my $x = shift; + if ($x < 0.05) { return 'DiceLT005'; } + elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; } + elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; } + elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; } + elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; } + elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; } + elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; } + elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; } + elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; } + elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; } + elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; } + elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; } + elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; } + elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; } + elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; } + elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; } + elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; } + elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; } + elsif ($x >= 2.3) { return 'DiceGT23'; } +} + +sub orthobin { + my $x = shift; + if ($x < 0.9) { return 'OrthoLT09'; } + elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; } + elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; } + elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; } + elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; } + elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; } + elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; } + elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; } + elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; } + elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; } + elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; } + elsif ($x >= 2.9) { return 'OrthoGT29'; } +} + +sub dlenbin { + my $x = shift; + if ($x == 0) { return 'DLen0'; } + elsif ($x == 1) { return 'DLen1'; } + elsif ($x == 2) { return 'DLen2'; } + elsif ($x == 3) { return 'DLen3'; } + elsif ($x == 4) { return 'DLen4'; } + elsif ($x == 5) { return 'DLen5'; } + elsif ($x == 6) { return 'DLen6'; } + elsif ($x == 7) { return 'DLen7'; } + elsif ($x == 8) { return 'DLen8'; } + elsif ($x == 9) { return 'DLen9'; } + elsif ($x >= 10) { return 'DLenGT10'; } +} + +sub identbin { + my $x = shift; + if ($x == 0) { die; } + if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; } + return $IDENT_BINS[$x]; +} + + diff --git a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl new file mode 100755 index 00000000..02827903 --- /dev/null +++ b/word-aligner/support/merge_corpus.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; +die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2; + +my ($a, $b) = @ARGV; +open A, "<$a" or die "Can't read $a: $!"; +open B, "<$b" or die "Can't read $a: $!"; + +while() { + chomp; + my $e = ; + die "Mismatched lines in $a and $b!" unless defined $e; + print "$_ ||| $e"; +} + +my $e = ; +die "Mismatched lines in $a and $b!" unless !defined $e; + diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl new file mode 100755 index 00000000..06876043 --- /dev/null +++ b/word-aligner/support/supplement_weights_file.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +use strict; + +my $ADD_FCLASS_JUMP = 1; +my $ADD_MODEL2_BINARY = 0; +my $ADD_FC_RELPOS = 1; + +my ($f_classes) = @ARGV; + +die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; + +print <) { + chomp; + my ($x, $cat) = split /\s+/; + $dcats{$cat} = 1; +} + +my @cats = sort keys %dcats; + +my $added = 0; +for (my $i=0; $i < scalar @cats; $i++) { + my $c1 = $cats[$i]; + for (my $j=0; $j < scalar @cats; $j++) { + my $c2 = $cats[$j]; + print "SP:${c1}_${c2} 0\n"; + $added++; + } +} + +for (my $ss=1; $ss < 100; $ss++) { + if ($ADD_FCLASS_JUMP) { + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + for (my $j = -$ss; $j <= $ss; $j++) { + print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n"; + $added++; + } + } + } + if ($ADD_MODEL2_BINARY) { + # M2_FL:8_SI:3_TI:2=1 + for (my $i = 0; $i < $ss; $i++) { + for (my $j = 0; $j < 100; $j++) { + print "M2FL:${ss}:TI:${j}_SI:${i} 0\n"; + $added++; + } + } + } +} +if ($ADD_FC_RELPOS) { + #RelPos_FC:11 + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + print "RelPos_FC:$cat 0\n"; + $added++; + } +} + +print STDERR "Added $added weights\n"; -- cgit v1.2.3