summaryrefslogtreecommitdiff
path: root/word-aligner
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-12-01 05:27:13 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-12-01 05:27:13 +0000
commit5694fc704f0c7b040c28f88a034e67a1ed19d3ba (patch)
tree5ee46a3429414b1c1cdf9712f27a645b7438eed6 /word-aligner
parent083e28a2694df51a4631d81347c45f57a5182560 (diff)
alternative def of neighborhoods
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@739 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rwxr-xr-xword-aligner/aligner.pl3
-rw-r--r--word-aligner/makefiles/makefile.grammars22
-rwxr-xr-xword-aligner/support/generate_word_pair_features.pl432
-rwxr-xr-xword-aligner/support/make_lex_grammar.pl388
4 files changed, 458 insertions, 387 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index f0733449..81ac4198 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -118,15 +118,18 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
# grammar=$align_dir/grammars/freq_grammar.$direction.gz
# per_sentence_grammar_file=$align_dir/grammars/psg.$direction
+feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz
feature_function=LexicalPairIdentity
feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map
feature_function=InputIdentity
feature_function=OutputIdentity
feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first
+# the following two are deprecated
feature_function=MarkovJump +b
feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first
feature_function=SourceBigram
+# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does
feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first
EOT
close CDEC;
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 21f39ac1..60417ec5 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,7 +1,7 @@
-all: corpus.f-e.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
+all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
clean:
- $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg*
+ $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs*
SUPPORT_DIR = $(SCRIPT_DIR)/support
GZIP = /usr/bin/gzip
@@ -9,6 +9,7 @@ ZCAT = zcat
EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl
+GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl
ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
STEM_F = $(SCRIPT_DIR)/stemmers/$(F_LANG).pl
@@ -66,13 +67,22 @@ corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS)
$(MERGE_CORPUS) corpus.e corpus.f > $@
corpus.f-e.model1: corpus.f-e $(MODEL1)
- $(MODEL1) corpus.f-e > $@
+ $(MODEL1) -v -V corpus.f-e > $@
corpus.e-f.model1: corpus.e-f $(MODEL1)
- $(MODEL1) corpus.e-f > $@
+ $(MODEL1) -v -V corpus.e-f > $@
-corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)
- $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) | $(GZIP) -9 > $@
+corpus.f-e.full-model1: corpus.f-e $(MODEL1)
+ $(MODEL1) -t -999999 -v -V corpus.f-e > $@
+
+corpus.e-f.full-model1: corpus.e-f $(MODEL1)
+ $(MODEL1) -t -999999 -v -V corpus.e-f > $@
+
+corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1
+ $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 | $(GZIP) -9 > $@
+
+wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1
+ $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@
corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e
$(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@
diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl
new file mode 100755
index 00000000..b722ee49
--- /dev/null
+++ b/word-aligner/support/generate_word_pair_features.pl
@@ -0,0 +1,432 @@
+#!/usr/bin/perl -w
+use utf8;
+use strict;
+
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.full-model1 corpus.e-f.full-model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f corpus.f-e.model1\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && $sparse_m1 && -f $sparse_m1;
+
+my %eclass = ();
+my %fclass = ();
+load_classes($class_e, \%eclass);
+load_classes($class_f, \%fclass);
+
+our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14);
+die unless scalar @IDENT_BINS == 15;
+our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS;
+
+my $MIN_MAGNITUDE = 0.001; # minimum value of a feature
+
+our %cache;
+open EF, "<$effile" or die;
+open M1, "<$model1" or die;
+open IM1, "<$imodel1" or die;
+open SM1, "<$sparse_m1" or die;
+binmode(EF,":utf8");
+binmode(M1,":utf8");
+binmode(IM1,":utf8");
+binmode(SM1,":utf8");
+binmode(STDOUT,":utf8");
+my %model1;
+print STDERR "Reading model1...\n";
+my %sizes = ();
+while(<M1>) {
+ chomp;
+ my ($f, $e, $lp) = split /\s+/;
+ $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp));
+ $sizes{$f}++;
+}
+close M1;
+
+my $inv_add = 0;
+my %invm1;
+print STDERR "Reading inverse model1...\n";
+my %esizes=();
+while(<IM1>) {
+ chomp;
+ my ($e, $f, $lp) = split /\s+/;
+ $invm1{$e}->{$f} = sprintf("%.5g", 1e-12 + exp($lp));
+}
+close IM1;
+
+open OE, "<$orthoe" or die;
+binmode(OE,":utf8");
+my %oe_dict;
+while(<OE>) {
+ chomp;
+ my ($a, $b) = split / \|\|\| /, $_;
+ die "BAD: $_" unless defined $a && defined $b;
+ $oe_dict{$a} = $b;
+}
+close OE;
+
+print STDERR "Reading sparse model 1 from $sparse_m1...\n";
+my %s_m1;
+while(<SM1>) {
+ chomp;
+ my ($f, $e, $lp) = split /\s+/;
+ die unless defined $e && defined $f;
+ $s_m1{$f}->{$e} = 1;
+}
+close SM1;
+
+open OF, "<$orthof" or die;
+binmode(OF,":utf8");
+my %of_dict;
+while(<OF>) {
+ chomp;
+ my ($a, $b) = split / \|\|\| /, $_;
+ die "BAD: $_" unless defined $a && defined $b;
+ $of_dict{$a} = $b;
+}
+close OF;
+$of_dict{'<eps>'} = '<eps>';
+$oe_dict{'<eps>'} = '<eps>';
+
+my $MIN_FEATURE_COUNT = 0;
+my $ADD_PREFIX_ID = 1;
+my $ADD_LEN = 1;
+my $ADD_SIM = 1;
+my $ADD_DICE = 1;
+my $ADD_111 = 1;
+my $ADD_SPARSE_M1 = 0; # this is a very bad feature
+my $SPARSE_111 = 1; # if 1-1-1, then don't include Model1 & Dice features
+my $ADD_ID = 1;
+my $ADD_PUNC = 1;
+my $ADD_NULL = 1;
+my $ADD_MODEL1 = 1;
+my $ADD_NOMODEL1 = 1;
+my $BEAM_RATIO = 50;
+my $BIN_ORTHO = 1;
+my $BIN_DLEN = 1;
+my $BIN_IDENT = 1;
+my $BIN_DICE = 1;
+
+if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
+
+my %fdict;
+my %fcounts;
+my %ecounts;
+
+my %sdict;
+
+while(<EF>) {
+ chomp;
+ my ($f, $e) = split /\s*\|\|\|\s*/;
+ my @es = split /\s+/, $e;
+ my @fs = split /\s+/, $f;
+ for my $ew (@es){
+ die "E: Empty word" if $ew eq '';
+ $ecounts{$ew}++;
+ }
+ push @fs, '<eps>' if $ADD_NULL;
+ my $i = 0;
+ for my $fw (@fs){
+ $i++;
+ die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
+ $fcounts{$fw}++;
+ }
+ for my $fw (@fs){
+ for my $ew (@es){
+ $fdict{$fw}->{$ew}++;
+ }
+ }
+}
+
+print STDERR "Extracting word pair features...\n";
+my $specials = 0;
+my $fc = 1000000;
+my $sids = 1000000;
+for my $f (sort keys %fdict) {
+ my $re = $fdict{$f};
+ my $max;
+ for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
+ my $efcount = $re->{$e};
+ unless (defined $max) { $max = $efcount; }
+ my $m1 = $model1{$f}->{$e};
+ my $im1 = $invm1{$e}->{$f};
+ my $is_null = undef;
+ if ($f eq '<eps>') {
+ $is_null = 1;
+ $im1 = 0; # probability not calcuated
+ }
+ die "No Model1 probability for $e | $f !" unless defined $m1;
+ die "No inverse Model1 probability for $f | $e !" unless defined $im1;
+ my $ident = ($e eq $f);
+ my $total_eandf = $ecounts{$e} + $fcounts{$f};
+ my $dice = 2 * $efcount / $total_eandf;
+ my @feats;
+ my $is_111 = ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1);
+ if ($is_111 && $ADD_111) {
+ push @feats, "OneOneOne=1";
+ }
+ unless ($is_111 && $SPARSE_111) {
+ if ($ADD_SPARSE_M1 && defined $s_m1{$f}->{$e}) {
+ push @feats, "HighM1=1";
+ }
+ if (defined $m1 && $ADD_MODEL1) {
+ if ($m1 > $MIN_MAGNITUDE) {
+ push @feats, "Model1=$m1";
+ my $m1d = sprintf("%.5g", sqrt($m1 * $dice));
+ push @feats, "M1Dice=$m1d" if $m1d > $MIN_MAGNITUDE;
+ } elsif ($ADD_NOMODEL1) {
+ push @feats, 'NoModel1=1';
+ }
+ if ($im1 > $MIN_MAGNITUDE) {
+ push @feats, "InvModel1=$im1" if $im1;
+ } else {
+ push @feats, 'NoInvModel1=1';
+ }
+ my $am1 = sprintf("%.5g", sqrt($m1 * $im1));
+ push @feats, "AgrModel1=$am1" if $am1 > $MIN_MAGNITUDE;
+ }
+ if ($ADD_DICE) {
+ if ($BIN_DICE) {
+ push @feats, dicebin($dice) . '=1';
+ } else {
+ push @feats, "Dice=$dice";
+ }
+ }
+ }
+ my $oe = $oe_dict{$e};
+ die "Can't find orthonorm form for $e" unless defined $oe;
+ my $of = $of_dict{$f};
+ die "Can't find orthonorm form for $f" unless defined $of;
+ my $len_e = length($oe);
+ my $len_f = length($of);
+ if ($ADD_LEN) {
+ if (!$is_null) {
+ my $dlen = abs($len_e - $len_f);
+ if ($BIN_DLEN) {
+ push @feats, dlenbin($dlen) . '=1';
+ } else {
+ push @feats, "DLen=$dlen";
+ }
+ }
+ }
+ my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
+ my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
+ my $both_non_numeric = (!$e_num && !$f_num);
+
+ unless ($total_eandf > 20) {
+ if ($f_num && $e_num) {
+ my $xf = $of;
+ $xf =~ s/[.,\N{U+0087}]//g;
+ my $xe = $oe;
+ $xe =~ s/[.,\N{U+0087}]//g;
+ if (($of ne $oe) && ($xe eq $xf)) { push @feats, "NumNearIdent=1"; }
+ }
+ }
+
+ if ($ADD_SIM) {
+ my $ld = 0;
+ my $eff = $len_e;
+ if ($eff < $len_f) { $eff = $len_f; }
+ if (!$is_null) {
+ $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
+ }
+ if ($BIN_ORTHO) {
+ push @feats, orthobin($ld) . '=1';
+ } else {
+ push @feats, "OrthoSim=$ld";
+ }
+ }
+ my $f_is_punc = ($f =~ /^[!,\-\/"'`:;&=+?.()\[\]«»]+$/);
+ if ($ident && $ADD_ID) {
+ if ($f_is_punc) { push @feats, "IdentPunc=1"; }
+ else {
+ if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
+ if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
+ if ($BIN_IDENT) {
+ push @feats, identbin($len_e) . '=1';
+ } else {
+ push @feats, "Identical=$len_e";
+ }
+ }
+ }
+ if ($ADD_PREFIX_ID && !$ident) {
+ if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
+ my $pe = substr $oe, 0, 3;
+ my $pf = substr $of, 0, 3;
+ if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+ }
+ }
+ if ($ADD_PUNC) {
+ if ($f_is_punc && $e =~ /[a-z]+/) {
+ push @feats, "PuncMiss=1";
+ }
+ }
+ print "$f ||| $e ||| @feats\n";
+ }
+}
+
+
+sub levenshtein
+{
+ # $s1 and $s2 are the two strings
+ # $len1 and $len2 are their respective lengths
+ #
+ my ($s1, $s2) = @_;
+ my ($len1, $len2) = (length $s1, length $s2);
+
+ # If one of the strings is empty, the distance is the length
+ # of the other string
+ #
+ return $len2 if ($len1 == 0);
+ return $len1 if ($len2 == 0);
+
+ my %mat;
+
+ # Init the distance matrix
+ #
+ # The first row to 0..$len1
+ # The first column to 0..$len2
+ # The rest to 0
+ #
+ # The first row and column are initialized so to denote distance
+ # from the empty string
+ #
+ for (my $i = 0; $i <= $len1; ++$i)
+ {
+ for (my $j = 0; $j <= $len2; ++$j)
+ {
+ $mat{$i}{$j} = 0;
+ $mat{0}{$j} = $j;
+ }
+
+ $mat{$i}{0} = $i;
+ }
+
+ # Some char-by-char processing is ahead, so prepare
+ # array of chars from the strings
+ #
+ my @ar1 = split(//, $s1);
+ my @ar2 = split(//, $s2);
+
+ for (my $i = 1; $i <= $len1; ++$i)
+ {
+ for (my $j = 1; $j <= $len2; ++$j)
+ {
+ # Set the cost to 1 iff the ith char of $s1
+ # equals the jth of $s2
+ #
+ # Denotes a substitution cost. When the char are equal
+ # there is no need to substitute, so the cost is 0
+ #
+ my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
+
+ # Cell $mat{$i}{$j} equals the minimum of:
+ #
+ # - The cell immediately above plus 1
+ # - The cell immediately to the left plus 1
+ # - The cell diagonally above and to the left plus the cost
+ #
+ # We can either insert a new char, delete a char or
+ # substitute an existing char (with an associated cost)
+ #
+ $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
+ $mat{$i}{$j-1} + 1,
+ $mat{$i-1}{$j-1} + $cost]);
+ }
+ }
+
+ # Finally, the Levenshtein distance equals the rightmost bottom cell
+ # of the matrix
+ #
+ # Note that $mat{$x}{$y} denotes the distance between the substrings
+ # 1..$x and 1..$y
+ #
+ return $mat{$len1}{$len2};
+}
+
+
+# minimal element of a list
+#
+sub min
+{
+ my @list = @{$_[0]};
+ my $min = $list[0];
+
+ foreach my $i (@list)
+ {
+ $min = $i if ($i < $min);
+ }
+
+ return $min;
+}
+
+sub load_classes {
+ my ($file, $ref) = @_;
+ print STDERR "Reading classes from $file...\n";
+ open F, "<$file" or die "Can't read $file: $!";
+ binmode(F, ":utf8") or die;
+ while(<F>) {
+ chomp;
+ my ($word, $class) = split /\s+/;
+# print STDERR "'$word' -> $class\n";
+ $ref->{$word} = $class;
+ }
+ close F;
+}
+
+sub dicebin {
+ my $x = shift;
+ if ($x < 0.05) { return 'DiceLT005'; }
+ elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; }
+ elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; }
+ elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; }
+ elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; }
+ elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; }
+ elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; }
+ elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; }
+ elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; }
+ elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; }
+ elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; }
+ elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; }
+ elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; }
+ elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; }
+ elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; }
+ elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; }
+ elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; }
+ elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; }
+ elsif ($x >= 2.3) { return 'DiceGT23'; }
+}
+
+sub orthobin {
+ my $x = shift;
+ if ($x < 0.9) { return 'OrthoLT09'; }
+ elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; }
+ elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; }
+ elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; }
+ elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; }
+ elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; }
+ elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; }
+ elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; }
+ elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; }
+ elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; }
+ elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; }
+ elsif ($x >= 2.9) { return 'OrthoGT29'; }
+}
+
+sub dlenbin {
+ my $x = shift;
+ if ($x == 0) { return 'DLen0'; }
+ elsif ($x == 1) { return 'DLen1'; }
+ elsif ($x == 2) { return 'DLen2'; }
+ elsif ($x == 3) { return 'DLen3'; }
+ elsif ($x == 4) { return 'DLen4'; }
+ elsif ($x == 5) { return 'DLen5'; }
+ elsif ($x == 6) { return 'DLen6'; }
+ elsif ($x == 7) { return 'DLen7'; }
+ elsif ($x == 8) { return 'DLen8'; }
+ elsif ($x == 9) { return 'DLen9'; }
+ elsif ($x >= 10) { return 'DLenGT10'; }
+}
+
+sub identbin {
+ my $x = shift;
+ if ($x == 0) { die; }
+ if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; }
+ return $IDENT_BINS[$x];
+}
+
+
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index c96071bf..47d4d945 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -4,27 +4,14 @@ use strict;
my $LIMIT_SIZE=30;
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+my ($effile, $model1, $imodel1) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1;
+my $ADD_NULL = 1;
-my %eclass = ();
-my %fclass = ();
-load_classes($class_e, \%eclass);
-load_classes($class_f, \%fclass);
-
-our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14);
-die unless scalar @IDENT_BINS == 15;
-our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS;
-
-our %cache;
open EF, "<$effile" or die;
open M1, "<$model1" or die;
open IM1, "<$imodel1" or die;
-#open M4, "<$gizaf2e" or die;
-#open IM4, "<$gizae2f" or die;
-#binmode(M4,":utf8");
-#binmode(IM4,":utf8");
binmode(EF,":utf8");
binmode(M1,":utf8");
binmode(IM1,":utf8");
@@ -35,7 +22,7 @@ my %sizes = ();
while(<M1>) {
chomp;
my ($f, $e, $lp) = split /\s+/;
- $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp));
+ $model1{$f}->{$e} = 1;
$sizes{$f}++;
}
close M1;
@@ -47,10 +34,10 @@ my %esizes=();
while(<IM1>) {
chomp;
my ($e, $f, $lp) = split /\s+/;
- $invm1{$e}->{$f} = sprintf("%.5g", 1e-12 + exp($lp));
+ $invm1{$e}->{$f} = 1;
$esizes{$e}++;
if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
- $model1{$f}->{$e} = 1e-12;
+ $model1{$f}->{$e} = 1;
$sizes{$f}++;
$inv_add++;
}
@@ -58,72 +45,9 @@ while(<IM1>) {
close IM1;
print STDERR "Added $inv_add from inverse model1\n";
-open M1, "<$model1" or die;
-binmode(M1,":utf8");
-my $dir_add = 0;
-print STDERR "Reading model1 (again) for extra inverse translations...\n";
-while(<M1>) {
- chomp;
- my ($f, $e, $lp) = split /\s+/;
- if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
- $invm1{$e}->{$f} = 1e-12;
- $esizes{$e}++;
- $dir_add++;
- }
-}
-close M1;
-print STDERR "Added $dir_add from model 1\n";
print STDERR "Generating grammars...\n";
-open OE, "<$orthoe" or die;
-binmode(OE,":utf8");
-my %oe_dict;
-while(<OE>) {
- chomp;
- my ($a, $b) = split / \|\|\| /, $_;
- die "BAD: $_" unless defined $a && defined $b;
- $oe_dict{$a} = $b;
-}
-close OE;
-open OF, "<$orthof" or die;
-binmode(OF,":utf8");
-my %of_dict;
-while(<OF>) {
- chomp;
- my ($a, $b) = split / \|\|\| /, $_;
- die "BAD: $_" unless defined $a && defined $b;
- $of_dict{$a} = $b;
-}
-close OF;
-$of_dict{'<eps>'} = '<eps>';
-$oe_dict{'<eps>'} = '<eps>';
-
-my $MIN_FEATURE_COUNT = 0;
-my $ADD_PREFIX_ID = 1;
-my $ADD_LEN = 1;
-my $ADD_SIM = 1;
-my $ADD_DICE = 1;
-my $ADD_111 = 1;
-my $ADD_ID = 1;
-my $ADD_PUNC = 1;
-my $ADD_NULL = 1;
-my $ADD_MODEL1 = 1;
-my $ADD_STEM_ID = 0;
-my $ADD_SYM = 0;
-my $BEAM_RATIO = 50;
-my $BIN_ORTHO = 1;
-my $BIN_DLEN = 1;
-my $BIN_IDENT = 1;
-my $BIN_DICE = 1;
-my $ADD_FIDENT = 0;
-
-if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
my %fdict;
-my %fcounts;
-my %ecounts;
-
-my %sdict;
-
while(<EF>) {
chomp;
my ($f, $e) = split /\s*\|\|\|\s*/;
@@ -131,14 +55,12 @@ while(<EF>) {
my @fs = split /\s+/, $f;
for my $ew (@es){
die "E: Empty word" if $ew eq '';
- $ecounts{$ew}++;
}
push @fs, '<eps>' if $ADD_NULL;
my $i = 0;
for my $fw (@fs){
$i++;
die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
- $fcounts{$fw}++;
}
for my $fw (@fs){
for my $ew (@es){
@@ -147,7 +69,6 @@ while(<EF>) {
}
}
-#print STDERR "Loading Giza output...\n";
my %model4;
#while(<M4>) {
# my $en = <M4>; chomp $en;
@@ -181,305 +102,10 @@ for my $f (sort keys %fdict) {
my $m4 = $model4{$f}->{$e};
my $im1 = $invm1{$e}->{$f};
my $is_good_pair = (defined $m1 || defined $m4);
- my $is_inv_good_pair = (defined $im1);
my $ident = ($e eq $f);
if ($ident) { $is_good_pair = 1; }
- my $total_eandf = $ecounts{$e} + $fcounts{$f};
- my $dice = 2 * $efcount / $total_eandf;
- my @feats;
- if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
- $is_good_pair = 1;
- if ($ADD_111) {
- push @feats, "OneOneOne=1";
- }
- }
next unless $is_good_pair;
- if (defined $m1 && $ADD_MODEL1) {
- push @feats, "Model1=$m1";
- my $m1d = sprintf("%.5g", sqrt($m1 * $dice));
- push @feats, "Model1Dice=$m1d";
- }
- if ($ADD_MODEL1 && !defined $m1) { push @feats, "NoModel1=1"; }
- if (defined $im1 && $ADD_MODEL1) {
- push @feats, "InvModel1=$im1";
- }
- if (!defined $im1 && $ADD_MODEL1) {
- push @feats, "NoInvModel1=1";
- }
- if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
- $fc++;
- push @feats, "F$fc=1";
- }
- if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; }
- my $oe = $oe_dict{$e};
- die "Can't find orthonorm form for $e" unless defined $oe;
- my $of = $of_dict{$f};
- die "Can't find orthonorm form for $f" unless defined $of;
- my $len_e = length($oe);
- my $len_f = length($of);
- if ($ADD_DICE) {
- if ($BIN_DICE) {
- push @feats, dicebin($dice) . '=1';
- } else {
- push @feats, "Dice=$dice";
- }
- }
- my $is_null = undef;
- if ($ADD_NULL && $f eq '<eps>') {
- $is_null = 1;
- }
- if ($ADD_LEN) {
- if (!$is_null) {
- my $dlen = abs($len_e - $len_f);
- if ($BIN_DLEN) {
- push @feats, dlenbin($dlen) . '=1';
- } else {
- push @feats, "DLen=$dlen";
- }
- }
- }
- my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
- my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
- my $both_non_numeric = (!$e_num && !$f_num);
- unless ($total_eandf > 20) {
- if ($f_num && $e_num) {
- my $xf = $of;
- $xf =~ s/[.,]//g;
- my $xe = $oe;
- $xe =~ s/[.,]//g;
- if (($of ne $oe) && ($xe eq $xf)) { push @feats, "NumNearIdent=1"; }
- }
- }
- if ($ADD_STEM_ID) {
- my $el = 4;
- my $fl = 4;
- if ($oe =~ /^al|re|co/) { $el++; }
- if ($of =~ /^al|re|co/) { $fl++; }
- if ($oe =~ /^trans|inter/) { $el+=2; }
- if ($of =~ /^trans|inter/) { $fl+=2; }
- if ($fl > length($of)) { $fl = length($of); }
- if ($el > length($oe)) { $el = length($oe); }
- my $sf = substr $of, 0, $fl;
- my $se = substr $oe, 0, $el;
- my $id = $sdict{$sf}->{$se};
- if (!$id) {
- $sids++;
- $sdict{$sf}->{$se} = $sids;
- $id = $sids;
- }
- push @feats, "S$id=1";
- }
- if ($ADD_SIM) {
- my $ld = 0;
- my $eff = $len_e;
- if ($eff < $len_f) { $eff = $len_f; }
- if (!$is_null) {
- $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
- }
- #if ($ld > 1.5) { $is_good_pair = 1; }
- if ($BIN_ORTHO) {
- push @feats, orthobin($ld) . '=1';
- } else {
- push @feats, "OrthoSim=$ld";
- }
- }
- if ($ident && $ADD_ID) {
- if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
- if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
- if ($BIN_IDENT) {
- push @feats, identbin($len_e) . '=1';
- } else {
- push @feats, "Identical=$len_e";
- }
- }
- if ($ADD_PREFIX_ID && !$ident) {
- if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
- my $pe = substr $oe, 0, 3;
- my $pf = substr $of, 0, 3;
- if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
- }
- }
- if ($ADD_PUNC) {
- if ($f =~ /^[!,\-\/"'`:;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
- push @feats, "PuncMiss=1";
- }
- }
- my $is_special = ($is_good_pair && !(defined $m1));
- $specials++ if $is_special;
- print STDERR "$f -> $e\n" if $is_special;
- print "$f ||| $e ||| @feats\n" if $is_good_pair;
+ print "$f ||| $e ||| X=0\n" if $is_good_pair;
}
}
-print STDERR "Added $specials special rules that were not in the M1 set\n";
-
-
-sub levenshtein
-{
- # $s1 and $s2 are the two strings
- # $len1 and $len2 are their respective lengths
- #
- my ($s1, $s2) = @_;
- my ($len1, $len2) = (length $s1, length $s2);
-
- # If one of the strings is empty, the distance is the length
- # of the other string
- #
- return $len2 if ($len1 == 0);
- return $len1 if ($len2 == 0);
-
- my %mat;
-
- # Init the distance matrix
- #
- # The first row to 0..$len1
- # The first column to 0..$len2
- # The rest to 0
- #
- # The first row and column are initialized so to denote distance
- # from the empty string
- #
- for (my $i = 0; $i <= $len1; ++$i)
- {
- for (my $j = 0; $j <= $len2; ++$j)
- {
- $mat{$i}{$j} = 0;
- $mat{0}{$j} = $j;
- }
-
- $mat{$i}{0} = $i;
- }
-
- # Some char-by-char processing is ahead, so prepare
- # array of chars from the strings
- #
- my @ar1 = split(//, $s1);
- my @ar2 = split(//, $s2);
-
- for (my $i = 1; $i <= $len1; ++$i)
- {
- for (my $j = 1; $j <= $len2; ++$j)
- {
- # Set the cost to 1 iff the ith char of $s1
- # equals the jth of $s2
- #
- # Denotes a substitution cost. When the char are equal
- # there is no need to substitute, so the cost is 0
- #
- my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
- # Cell $mat{$i}{$j} equals the minimum of:
- #
- # - The cell immediately above plus 1
- # - The cell immediately to the left plus 1
- # - The cell diagonally above and to the left plus the cost
- #
- # We can either insert a new char, delete a char or
- # substitute an existing char (with an associated cost)
- #
- $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
- $mat{$i}{$j-1} + 1,
- $mat{$i-1}{$j-1} + $cost]);
- }
- }
-
- # Finally, the Levenshtein distance equals the rightmost bottom cell
- # of the matrix
- #
- # Note that $mat{$x}{$y} denotes the distance between the substrings
- # 1..$x and 1..$y
- #
- return $mat{$len1}{$len2};
-}
-
-
-# minimal element of a list
-#
-sub min
-{
- my @list = @{$_[0]};
- my $min = $list[0];
-
- foreach my $i (@list)
- {
- $min = $i if ($i < $min);
- }
-
- return $min;
-}
-
-sub load_classes {
- my ($file, $ref) = @_;
- print STDERR "Reading classes from $file...\n";
- open F, "<$file" or die "Can't read $file: $!";
- binmode(F, ":utf8") or die;
- while(<F>) {
- chomp;
- my ($word, $class) = split /\s+/;
-# print STDERR "'$word' -> $class\n";
- $ref->{$word} = $class;
- }
- close F;
-}
-
-sub dicebin {
- my $x = shift;
- if ($x < 0.05) { return 'DiceLT005'; }
- elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; }
- elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; }
- elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; }
- elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; }
- elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; }
- elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; }
- elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; }
- elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; }
- elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; }
- elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; }
- elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; }
- elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; }
- elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; }
- elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; }
- elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; }
- elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; }
- elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; }
- elsif ($x >= 2.3) { return 'DiceGT23'; }
-}
-
-sub orthobin {
- my $x = shift;
- if ($x < 0.9) { return 'OrthoLT09'; }
- elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; }
- elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; }
- elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; }
- elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; }
- elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; }
- elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; }
- elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; }
- elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; }
- elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; }
- elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; }
- elsif ($x >= 2.9) { return 'OrthoGT29'; }
-}
-
-sub dlenbin {
- my $x = shift;
- if ($x == 0) { return 'DLen0'; }
- elsif ($x == 1) { return 'DLen1'; }
- elsif ($x == 2) { return 'DLen2'; }
- elsif ($x == 3) { return 'DLen3'; }
- elsif ($x == 4) { return 'DLen4'; }
- elsif ($x == 5) { return 'DLen5'; }
- elsif ($x == 6) { return 'DLen6'; }
- elsif ($x == 7) { return 'DLen7'; }
- elsif ($x == 8) { return 'DLen8'; }
- elsif ($x == 9) { return 'DLen9'; }
- elsif ($x >= 10) { return 'DLenGT10'; }
-}
-
-sub identbin {
- my $x = shift;
- if ($x == 0) { die; }
- if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; }
- return $IDENT_BINS[$x];
-}
-