summaryrefslogtreecommitdiff
path: root/word-aligner/support
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2010-02-01 17:38:39 -0500
committerChris Dyer <redpony@gmail.com>2010-02-01 17:38:39 -0500
commitc97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch)
tree3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner/support
parentda222df300e4f87ad185a7decbf119ad56aa34e0 (diff)
word aligner cleanup, new features
Diffstat (limited to 'word-aligner/support')
-rwxr-xr-xword-aligner/support/classify.pl27
-rwxr-xr-xword-aligner/support/extract_grammar.pl11
-rwxr-xr-xword-aligner/support/extract_vocab.pl20
-rwxr-xr-xword-aligner/support/extract_weights.pl17
-rwxr-xr-xword-aligner/support/invert_grammar.pl8
-rwxr-xr-xword-aligner/support/make_lex_grammar.pl339
-rwxr-xr-xword-aligner/support/merge_corpus.pl18
-rwxr-xr-xword-aligner/support/supplement_weights_file.pl73
8 files changed, 513 insertions, 0 deletions
diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl
new file mode 100755
index 00000000..893c7b22
--- /dev/null
+++ b/word-aligner/support/classify.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2;
+
+my ($class, $text) = @ARGV;
+open C, "<$class" or die "Can't read $class: $!";
+open T, "<$text" or die "Can't read $text: $!";
+
+my %dict = ();
+my $cc = 0;
+while(<C>) {
+ chomp;
+ my ($word, $cat) = split /\s+/;
+ die "'$word' '$cat'" unless (defined $word && defined $cat);
+ $dict{$word} = $cat;
+ $cc++;
+}
+close C;
+print STDERR "Loaded classes for $cc words\n";
+
+while(<T>) {
+ chomp;
+ my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/;
+ print "@cats\n";
+}
+
diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl
new file mode 100755
index 00000000..d7275ef5
--- /dev/null
+++ b/word-aligner/support/extract_grammar.pl
@@ -0,0 +1,11 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $key = shift @ARGV;
+die "Usage: $0 KEY\n" unless defined $key;
+
+while(<>) {
+ my ($k, @rest) = split / \|\|\| /;
+ print join(' ||| ', @rest) if ($k eq $key);
+}
+
diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl
new file mode 100755
index 00000000..070d4202
--- /dev/null
+++ b/word-aligner/support/extract_vocab.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+use strict;
+
+print STDERR "Extracting vocabulary...\n";
+my %dict = ();
+my $wc = 0;
+while(<>) {
+ chomp;
+ my @words = split /\s+/;
+ for my $word (@words) { $wc++; $dict{$word}++; }
+}
+
+my $tc = 0;
+for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) {
+ print "$word\n";
+ $tc++;
+}
+
+print STDERR "$tc types / $wc tokens\n";
+
diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl
new file mode 100755
index 00000000..dfedd12e
--- /dev/null
+++ b/word-aligner/support/extract_weights.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/perl -w
+use strict;
+my %dict=();
+while(<>) {
+ chomp;
+ my ($dummy, $a, $b, $wts) = split / \|\|\| /;
+ my @weights = split /\s+/, $wts;
+ for my $w (@weights) {
+ my ($name, $val) = split /=/, $w;
+ unless ($dict{$name}) {
+ my $r = (0.5 - rand) / 5;
+ $r = sprintf ("%0.4f", $r);
+ print "$name $r\n";
+ $dict{$name}= 1;
+ }
+ }
+}
diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl
new file mode 100755
index 00000000..3988388d
--- /dev/null
+++ b/word-aligner/support/invert_grammar.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl -w
+use strict;
+
+while(<>) {
+ my ($f, $e, $scores) = split / \|\|\| /;
+ print "$e ||| $f ||| $scores";
+}
+
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
new file mode 100755
index 00000000..bdb2752c
--- /dev/null
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -0,0 +1,339 @@
+#!/usr/bin/perl -w
+use utf8;
+use strict;
+
+my $LIMIT_SIZE=30;
+
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+
+my %eclass = ();
+my %fclass = ();
+load_classes($class_e, \%eclass);
+load_classes($class_f, \%fclass);
+
+our %cache;
+open EF, "<$effile" or die;
+open M1, "<$model1" or die;
+open IM1, "<$imodel1" or die;
+binmode(EF,":utf8");
+binmode(M1,":utf8");
+binmode(IM1,":utf8");
+binmode(STDOUT,":utf8");
+my %model1;
+print STDERR "Reading model1...\n";
+my %sizes = ();
+while(<M1>) {
+ chomp;
+ my ($f, $e, $lp) = split /\s+/;
+ $model1{$f}->{$e} = 1;
+ $sizes{$f}++;
+}
+close M1;
+
+my $inv_add = 0;
+my %invm1;
+print STDERR "Reading inverse model1...\n";
+my %esizes=();
+while(<IM1>) {
+ chomp;
+ my ($e, $f, $lp) = split /\s+/;
+ $invm1{$e}->{$f} = 1;
+ $esizes{$e}++;
+ if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
+ $model1{$f}->{$e} = 1;
+ $sizes{$f}++;
+ $inv_add++;
+ }
+}
+close IM1;
+print STDERR "Added $inv_add from inverse model1\n";
+
+open M1, "<$model1" or die;
+binmode(M1,":utf8");
+my $dir_add = 0;
+print STDERR "Reading model1 (again) for extra inverse translations...\n";
+while(<M1>) {
+ chomp;
+ my ($f, $e, $lp) = split /\s+/;
+ if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
+ $invm1{$e}->{$f} = 1;
+ $esizes{$e}++;
+ $dir_add++;
+ }
+}
+close M1;
+print STDERR "Added $dir_add from model 1\n";
+print STDERR "Generating grammars...\n";
+open OE, "<$orthoe" or die;
+binmode(OE,":utf8");
+my %oe_dict;
+while(<OE>) {
+ chomp;
+ my ($a, $b) = split / \|\|\| /, $_;
+ die "BAD: $_" unless defined $a && defined $b;
+ $oe_dict{$a} = $b;
+}
+close OE;
+open OF, "<$orthof" or die;
+binmode(OF,":utf8");
+my %of_dict;
+while(<OF>) {
+ chomp;
+ my ($a, $b) = split / \|\|\| /, $_;
+ die "BAD: $_" unless defined $a && defined $b;
+ $of_dict{$a} = $b;
+}
+close OF;
+$of_dict{'<eps>'} = '<eps>';
+$oe_dict{'<eps>'} = '<eps>';
+
+my $MIN_FEATURE_COUNT = 0;
+my $ADD_PREFIX_ID = 0;
+my $ADD_CLASS_CLASS = 1;
+my $ADD_LEN = 1;
+my $ADD_SIM = 1;
+my $ADD_DICE = 1;
+my $ADD_111 = 1;
+my $ADD_ID = 1;
+my $ADD_PUNC = 1;
+my $ADD_NULL = 0;
+my $ADD_STEM_ID = 1;
+my $ADD_SYM = 0;
+my $BEAM_RATIO = 50;
+
+my %fdict;
+my %fcounts;
+my %ecounts;
+
+my %sdict;
+
+while(<EF>) {
+ chomp;
+ my ($f, $e) = split /\s*\|\|\|\s*/;
+ my @es = split /\s+/, $e;
+ my @fs = split /\s+/, $f;
+ for my $ew (@es){ $ecounts{$ew}++; }
+ push @fs, '<eps>' if $ADD_NULL;
+ for my $fw (@fs){ $fcounts{$fw}++; }
+ for my $fw (@fs){
+ for my $ew (@es){
+ $fdict{$fw}->{$ew}++;
+ }
+ }
+}
+
+my $specials = 0;
+my $fc = 1000000;
+my $sids = 1000000;
+for my $f (sort keys %fdict) {
+ my $re = $fdict{$f};
+ my $max;
+ for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
+ my $efcount = $re->{$e};
+ unless (defined $max) { $max = $efcount; }
+ my $m1 = $model1{$f}->{$e};
+ my $im1 = $invm1{$e}->{$f};
+ my $is_good_pair = (defined $m1);
+ my $is_inv_good_pair = (defined $im1);
+ my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
+ my @feats;
+ if ($efcount > $MIN_FEATURE_COUNT) {
+ $fc++;
+ push @feats, "F$fc=1";
+ }
+ if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; }
+ my $oe = $oe_dict{$e};
+ die "Can't find orthonorm form for $e" unless defined $oe;
+ my $of = $of_dict{$f};
+ die "Can't find orthonorm form for $f" unless defined $of;
+ my $len_e = length($oe);
+ my $len_f = length($of);
+ push @feats, "Dice=$dice" if $ADD_DICE;
+ if ($ADD_CLASS_CLASS) {
+ my $ce = $eclass{$e} or die "E- no class for: $e";
+ my $cf = $fclass{$f} or die "F- no class for: $f";
+ push @feats, "C${cf}_${ce}=1";
+ }
+ my $is_null = undef;
+ if ($ADD_NULL && $f eq '<eps>') {
+ push @feats, "IsNull=1";
+ $is_null = 1;
+ }
+ if ($ADD_LEN) {
+ if (!$is_null) {
+ my $dlen = abs($len_e - $len_f);
+ push @feats, "DLen=$dlen";
+ }
+ }
+ my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
+ my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
+ my $both_non_numeric = (!$e_num && !$f_num);
+ if ($ADD_STEM_ID) {
+ my $el = 4;
+ my $fl = 4;
+ if ($oe =~ /^al|re|co/) { $el++; }
+ if ($of =~ /^al|re|co/) { $fl++; }
+ if ($oe =~ /^trans|inter/) { $el+=2; }
+ if ($of =~ /^trans|inter/) { $fl+=2; }
+ if ($fl > length($of)) { $fl = length($of); }
+ if ($el > length($oe)) { $el = length($oe); }
+ my $sf = substr $of, 0, $fl;
+ my $se = substr $oe, 0, $el;
+ my $id = $sdict{$sf}->{$se};
+ if (!$id) {
+ $sids++;
+ $sdict{$sf}->{$se} = $sids;
+ $id = $sids;
+ }
+ push @feats, "S$id=1";
+ }
+ if ($ADD_PREFIX_ID) {
+ if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
+ my $pe = substr $oe, 0, 3;
+ my $pf = substr $of, 0, 3;
+ if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+ }
+ }
+ if ($ADD_SIM) {
+ my $ld = 0;
+ my $eff = $len_e;
+ if ($eff < $len_f) { $eff = $len_f; }
+ if (!$is_null) {
+ $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
+ }
+ if ($ld > 1.5) { $is_good_pair = 1; }
+ push @feats, "OrthoSim=$ld";
+ }
+ my $ident = ($e eq $f);
+ if ($ident) { $is_good_pair = 1; }
+ if ($ident && $ADD_ID) { push @feats, "Identical=$len_e"; }
+ if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
+ $is_good_pair = 1;
+ if ($ADD_111) {
+ push @feats, "OneOneOne=1";
+ }
+ }
+ if ($ADD_PUNC) {
+ if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
+ push @feats, "PuncMiss=1";
+ }
+ }
+ my $is_special = ($is_good_pair && !(defined $m1));
+ $specials++ if $is_special;
+ print STDERR "$f -> $e\n" if $is_special;
+ print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair;
+ print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair;
+ }
+}
+print STDERR "Added $specials special rules that were not in the M1 set\n";
+
+
+sub levenshtein
+{
+ # $s1 and $s2 are the two strings
+ # $len1 and $len2 are their respective lengths
+ #
+ my ($s1, $s2) = @_;
+ my ($len1, $len2) = (length $s1, length $s2);
+
+ # If one of the strings is empty, the distance is the length
+ # of the other string
+ #
+ return $len2 if ($len1 == 0);
+ return $len1 if ($len2 == 0);
+
+ my %mat;
+
+ # Init the distance matrix
+ #
+ # The first row to 0..$len1
+ # The first column to 0..$len2
+ # The rest to 0
+ #
+ # The first row and column are initialized so to denote distance
+ # from the empty string
+ #
+ for (my $i = 0; $i <= $len1; ++$i)
+ {
+ for (my $j = 0; $j <= $len2; ++$j)
+ {
+ $mat{$i}{$j} = 0;
+ $mat{0}{$j} = $j;
+ }
+
+ $mat{$i}{0} = $i;
+ }
+
+ # Some char-by-char processing is ahead, so prepare
+ # array of chars from the strings
+ #
+ my @ar1 = split(//, $s1);
+ my @ar2 = split(//, $s2);
+
+ for (my $i = 1; $i <= $len1; ++$i)
+ {
+ for (my $j = 1; $j <= $len2; ++$j)
+ {
+ # Set the cost to 1 iff the ith char of $s1
+ # equals the jth of $s2
+ #
+ # Denotes a substitution cost. When the char are equal
+ # there is no need to substitute, so the cost is 0
+ #
+ my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
+
+ # Cell $mat{$i}{$j} equals the minimum of:
+ #
+ # - The cell immediately above plus 1
+ # - The cell immediately to the left plus 1
+ # - The cell diagonally above and to the left plus the cost
+ #
+ # We can either insert a new char, delete a char or
+ # substitute an existing char (with an associated cost)
+ #
+ $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
+ $mat{$i}{$j-1} + 1,
+ $mat{$i-1}{$j-1} + $cost]);
+ }
+ }
+
+ # Finally, the Levenshtein distance equals the rightmost bottom cell
+ # of the matrix
+ #
+ # Note that $mat{$x}{$y} denotes the distance between the substrings
+ # 1..$x and 1..$y
+ #
+ return $mat{$len1}{$len2};
+}
+
+
+# minimal element of a list
+#
+sub min
+{
+ my @list = @{$_[0]};
+ my $min = $list[0];
+
+ foreach my $i (@list)
+ {
+ $min = $i if ($i < $min);
+ }
+
+ return $min;
+}
+
+sub load_classes {
+ my ($file, $ref) = @_;
+ print STDERR "Reading classes from $file...\n";
+ open F, "<$file" or die "Can't read $file: $!";
+ binmode(F, ":utf8") or die;
+ while(<F>) {
+ chomp;
+ my ($word, $class) = split /\s+/;
+# print STDERR "'$word' -> $class\n";
+ $ref->{$word} = $class;
+ }
+ close F;
+}
+
diff --git a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl
new file mode 100755
index 00000000..02827903
--- /dev/null
+++ b/word-aligner/support/merge_corpus.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2;
+
+my ($a, $b) = @ARGV;
+open A, "<$a" or die "Can't read $a: $!";
+open B, "<$b" or die "Can't read $a: $!";
+
+while(<A>) {
+ chomp;
+ my $e = <B>;
+ die "Mismatched lines in $a and $b!" unless defined $e;
+ print "$_ ||| $e";
+}
+
+my $e = <B>;
+die "Mismatched lines in $a and $b!" unless !defined $e;
+
diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl
new file mode 100755
index 00000000..7f804b90
--- /dev/null
+++ b/word-aligner/support/supplement_weights_file.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $ADD_FCLASS_JUMP = 1;
+my $ADD_MODEL2_BINARY = 0;
+my $ADD_FC_RELPOS = 1;
+
+my ($f_classes) = @ARGV;
+
+die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes;
+
+print <<EOT;
+MarkovJump 0
+RelativeSentencePosition 0
+EOT
+
+# ! 8
+# " 11
+# 's 18
+
+my %dcats = ();
+$dcats{'BOS'} = 1;
+$dcats{'EOS'} = 1;
+
+open FC, "<$f_classes" or die;
+while(<FC>) {
+ chomp;
+ my ($x, $cat) = split /\s+/;
+ $dcats{$cat} = 1;
+}
+
+my @cats = sort keys %dcats;
+
+my $added = 0;
+for (my $i=0; $i < scalar @cats; $i++) {
+ my $c1 = $cats[$i];
+ for (my $j=0; $j < scalar @cats; $j++) {
+ my $c2 = $cats[$j];
+ print "SP:${c1}_${c2} 0\n";
+ $added++;
+ }
+}
+
+for (my $ss=1; $ss < 100; $ss++) {
+ if ($ADD_FCLASS_JUMP) {
+ for (my $i=0; $i < scalar @cats; $i++) {
+ my $cat = $cats[$i];
+ for (my $j = -$ss; $j <= $ss; $j++) {
+ print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n";
+ $added++;
+ }
+ }
+ }
+ if ($ADD_MODEL2_BINARY) {
+ # M2_FL:8_SI:3_TI:2=1
+ for (my $i = 0; $i < $ss; $i++) {
+ for (my $j = 0; $j < 100; $j++) {
+ print "M2_FL:${ss}_SI:${i}_TI:${j} 0\n";
+ $added++;
+ }
+ }
+ }
+}
+if ($ADD_FC_RELPOS) {
+ #RelPos_FC:11
+ for (my $i=0; $i < scalar @cats; $i++) {
+ my $cat = $cats[$i];
+ print "RelPos_FC:$cat 0\n";
+ $added++;
+ }
+}
+
+print STDERR "Added $added weights\n";