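bit more alignment stuff: load Giza alignment output as extra evidence for good word pairs, add IdentNumber/IdentRare features, and emit PfxIdentical only for non-identical pairs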

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@686 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
commit: ed364db2699dec02f5bd18b9c15147e0a02efee1 (patch)
tree: 80db871258b3d769da3e06cb299857832315c60c /word-aligner/support
parent: eb43ccf1e1a7d59a0f672956230644ebb7a5cbf1 (diff)
1 files changed, 48 insertions, 16 deletions
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index c3e29540..3926fd8d 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -4,8 +4,8 @@ use strict;
 
 my $LIMIT_SIZE=30;
 
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;  # all nine files are required; the last two are the Giza outputs (f2e, e2f)
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthonorm-dict.e class.e class.f giza.f-e giza.e-f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
 
 my %eclass = ();
 my %fclass = ();
@@ -20,8 +20,12 @@ our %cache;
 open EF, "<$effile" or die;
 open M1, "<$model1" or die;
 open IM1, "<$imodel1" or die;
+open M4, '<', $gizaf2e or die "Can't read $gizaf2e: $!\n";   # Giza output, f2e file
+open IM4, '<', $gizae2f or die "Can't read $gizae2f: $!\n";  # Giza output, e2f file
 binmode(EF,":utf8");
 binmode(M1,":utf8");
+binmode(M4,":utf8");
+binmode(IM4,":utf8");
 binmode(IM1,":utf8");
 binmode(STDOUT,":utf8");
 my %model1;
@@ -93,7 +97,7 @@ $of_dict{'<eps>'} = '<eps>';
 $oe_dict{'<eps>'} = '<eps>';
 
 my $MIN_FEATURE_COUNT = 0;
-my $ADD_PREFIX_ID = 0;
+my $ADD_PREFIX_ID = 1;
 my $ADD_CLASS_CLASS = 1;
 my $ADD_LEN = 1;
 my $ADD_SIM = 1;
@@ -102,13 +106,14 @@ my $ADD_111 = 1;
 my $ADD_ID = 1;
 my $ADD_PUNC = 1;
 my $ADD_NULL = 0;
-my $ADD_STEM_ID = 1;
+my $ADD_STEM_ID = 0;
 my $ADD_SYM = 0;
 my $BEAM_RATIO = 50;
 my $BIN_ORTHO = 1;
 my $BIN_DLEN = 1;
 my $BIN_IDENT = 1;
 my $BIN_DICE = 1;
+my $ADD_FIDENT = 0;
 
 my %fdict;
 my %fcounts;
@@ -126,8 +131,10 @@ while(<EF>) {
     $ecounts{$ew}++;
   }
   push @fs, '<eps>' if $ADD_NULL;
+  my $i = 0;
   for my $fw (@fs){
-    die "F: Empty word" if $fw eq '';
+    $i++;
+    die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
     $fcounts{$fw}++;
   }
   for my $fw (@fs){
@@ -137,6 +144,27 @@ while(<EF>) {
   }
 }
 
+# Count Giza alignment links: $model4{src_word}->{e_word} = number of links seen.
+print STDERR "Loading Giza output...\n";
+my %model4;
+while(<M4>) {
+  # Records span three lines; $_ holds the first, which is not used here.
+  my $en = <M4>;
+  my $zh = <M4>;
+  die "$gizaf2e: truncated record at line $.\n" unless defined $en && defined $zh;
+  chomp($en, $zh);
+  die "$gizaf2e: line $. does not start with 'NULL ({'\n" unless $zh =~ /^NULL \(\{/;
+  my @ewords = split ' ', $en;  # ' ' skips leading whitespace so indices stay aligned
+  # Each chunk looks like "word ({ i j ... " where i, j are 1-based indices into @ewords.
+  for my $c (split /\}\) ?/, $zh) {
+    my ($src, $taps) = split / \(\{ /, $c;
+    $src = '<eps>' if $src eq 'NULL';
+    next unless defined $taps;  # word with no alignment points
+    $model4{$src}->{$ewords[$_ - 1]} += 1 for split / /, $taps;
+  }
+}
+close M4;
+
 my $specials = 0;
 my $fc = 1000000;
 my $sids = 1000000;
@@ -147,12 +175,14 @@ for my $f (sort keys %fdict) {
     my $efcount = $re->{$e};
     unless (defined $max) { $max = $efcount; }
     my $m1 = $model1{$f}->{$e};
+    my $m4 = $model4{$f}->{$e};
     my $im1 = $invm1{$e}->{$f};
-    my $is_good_pair = (defined $m1);
+    my $is_good_pair = (defined $m1 || defined $m4);
     my $is_inv_good_pair = (defined $im1);
-    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
+    my $total_eandf = $ecounts{$e} + $fcounts{$f};
+    my $dice = 2 * $efcount / $total_eandf;
     my @feats;
-    if ($efcount > $MIN_FEATURE_COUNT) {
+    if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
       $fc++;
       push @feats, "F$fc=1";
     }
@@ -212,13 +242,6 @@ for my $f (sort keys %fdict) {
       }
       push @feats, "S$id=1";
     }
-    if ($ADD_PREFIX_ID) {
-      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
-        my $pe = substr $oe, 0, 3;
-        my $pf = substr $of, 0, 3;
-        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
-      }
-    }
     if ($ADD_SIM) {
       my $ld = 0;
       my $eff = $len_e;
@@ -226,7 +249,7 @@ for my $f (sort keys %fdict) {
       if (!$is_null) {
         $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
       }
-      if ($ld > 1.5) { $is_good_pair = 1; }
+      #if ($ld > 1.5) { $is_good_pair = 1; }
       if ($BIN_ORTHO) {
         push @feats, orthobin($ld) . '=1';
       } else {
@@ -236,12 +259,21 @@ for my $f (sort keys %fdict) {
     my $ident = ($e eq $f);
     if ($ident) { $is_good_pair = 1; }
     if ($ident && $ADD_ID) {
+      if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
+      if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
       if ($BIN_IDENT) {
         push @feats, identbin($len_e) . '=1';
       } else {
         push @feats, "Identical=$len_e";
       }
     }
+    if ($ADD_PREFIX_ID && !$ident) {
+      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
+        my $pe = substr $oe, 0, 3;
+        my $pf = substr $of, 0, 3;
+        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+      }
+    }
     if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
       $is_good_pair = 1;
       if ($ADD_111) {
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-21 01:27:58 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-21 01:27:58 +0000
commit	ed364db2699dec02f5bd18b9c15147e0a02efee1 (patch)
tree	80db871258b3d769da3e06cb299857832315c60c /word-aligner/support
parent	eb43ccf1e1a7d59a0f672956230644ebb7a5cbf1 (diff)