bit more alignment stuff

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@686 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
commit: c94d18f3c6f71dfc1c23405c61341fe042277c3d (patch)
tree: 7887c41d577f04e890f57d4aa534776cd87f8ca4 /word-aligner
parent: 36faf01602d28d5bb5f030e0e03c8e7dd2078445 (diff)
3 files changed, 70 insertions, 53 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 7eec0e42..7821560f 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -16,17 +16,24 @@ GetOptions("cdec=s" => \$DECODER,
            "pmem=s" => \$pmem,
            "mkcls=s" => \$mkcls,
           ) or usage();
-usage() unless (scalar @ARGV == 1);
+usage() unless (scalar @ARGV == 3);
 die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
 die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
 
 my $in_file = shift @ARGV;
+my $m4 = shift @ARGV;
+my $im4 = shift @ARGV;
+die "Can't find model4: $m4" unless -f $m4;
+die "Can't find inverse model4: $im4" unless -f $im4;
+
 die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
 my $f_lang = $1;
 my $e_lang = $2;
 
 print STDERR "Source language: $f_lang\n";
 print STDERR "Target language: $e_lang\n";
+print STDERR "  Model 4 align: $m4\n";
+print STDERR "InModel 4 align: $im4\n";
 print STDERR " Using mkcls in: $mkcls\n\n";
 die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
 die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
@@ -77,6 +84,8 @@ SCRIPT_DIR = $SCRIPT_DIR
 TRAINING_DIR = $training_dir
 MKCLS = $mkcls
 NCLASSES = $num_classes
+GIZAALIGN = $m4
+INVGIZAALIGN = $im4
 
 TARGETS = @targets
 PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
@@ -105,16 +114,19 @@ exit 0;
 
 sub make_stage {
   my ($stage, $direction, $prev_stage) = @_;
-  my $stage_dir = "$align_dir/$stage-$direction";
+  my $stage_dir = "$align_dir/model-$direction";
   my $first = $direction;
   $first =~ s/^(.+)-.*$/$1/;
   mkdir $stage_dir;
   my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
-  open CDEC, ">$stage_dir/cdec.ini" or die;
+  open CDEC, ">$stage_dir/cdec.$stage.ini" or die;
   print CDEC <<EOT;
-formalism=lexcrf
+formalism=lextrans
 intersection_strategy=full
 grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
+feature_function=LexicalPairIdentity
+feature_function=InputIdentity
+feature_function=OutputIdentity
 EOT
   if ($stage =~ /relpos/) {
     print CDEC "$RELPOS\n";
@@ -122,36 +134,16 @@ EOT
     print CDEC "$RELPOS\n";
     print CDEC "feature_function=MarkovJump\n";
     print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
+    print CDEC "feature_function=SourceBigram\n";
     print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
   }
   close CDEC;
-
-  my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n";
-  if ($prev_stage) {
-    $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n";
-  }
-
-  open MAKE, ">$stage_dir/Makefile" or die;
-  print MAKE <<EOT;
-all: weights.final.gz
-
-clean:
-	\$(RM) -r ptrain weights.init.gz weights.final.gz
-
-$init_weights
-
-weights.final.gz: weights.init.gz cdec.ini
-	\$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz
-	cp ptrain/weights.final.gz weights.final.gz
-	\$(RM) -r ptrain
-EOT
-  close MAKE;
 }
 
 sub usage {
   die <<EOT;
 
-Usage: $0 [OPTIONS] training_corpus.fr-en
+Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3
 
 EOT
 }
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index b89937c1..f4b956bc 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,7 +1,7 @@
-all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz
+all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f
 
 clean:
-	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
+	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
 
 SUPPORT_DIR = $(SCRIPT_DIR)/support
 GZIP = /usr/bin/gzip
@@ -51,9 +51,9 @@ corpus.f-e.model1: corpus.f-e $(MODEL1)
 corpus.e-f.model1: corpus.e-f $(MODEL1)
 	$(MODEL1) corpus.e-f > corpus.e-f.model1
 
-bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f
+bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)
 	$(RM) $@
-	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars
+	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars
 
 corpus.f-e.lex-grammar.gz: bidir.grammars
 	$(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz
@@ -61,10 +61,3 @@ corpus.f-e.lex-grammar.gz: bidir.grammars
 corpus.e-f.lex-grammar.gz: bidir.grammars
 	$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz
 
-weights.init.gz: bidir.grammars voc2class.f voc2class.e
-	$(EXTRACT_WEIGHTS) bidir.grammars > weights.init
-	$(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup
-	$(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup
-	sort -u weights.dup >> weights.init
-	$(GZIP) -9 weights.init
-
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index c3e29540..3926fd8d 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -4,8 +4,8 @@ use strict;
 
 my $LIMIT_SIZE=30;
 
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;  # the last two are the Giza output files, opened further down as M4 and IM4
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthonorm-dict.e class.e class.f giza.f2e.A3 giza.e2f.A3\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && $class_e && -f $class_e && $class_f && -f $class_f && $gizaf2e && -f $gizaf2e && $gizae2f && -f $gizae2f;
 
 my %eclass = ();
 my %fclass = ();
@@ -20,8 +20,12 @@ our %cache;
 open EF, "<$effile" or die;
 open M1, "<$model1" or die;
 open IM1, "<$imodel1" or die;
+open M4, "<$gizaf2e" or die;
+open IM4, "<$gizae2f" or die;
 binmode(EF,":utf8");
 binmode(M1,":utf8");
+binmode(M4,":utf8");
+binmode(IM4,":utf8");
 binmode(IM1,":utf8");
 binmode(STDOUT,":utf8");
 my %model1;
@@ -93,7 +97,7 @@ $of_dict{'<eps>'} = '<eps>';
 $oe_dict{'<eps>'} = '<eps>';
 
 my $MIN_FEATURE_COUNT = 0;
-my $ADD_PREFIX_ID = 0;
+my $ADD_PREFIX_ID = 1;
 my $ADD_CLASS_CLASS = 1;
 my $ADD_LEN = 1;
 my $ADD_SIM = 1;
@@ -102,13 +106,14 @@ my $ADD_111 = 1;
 my $ADD_ID = 1;
 my $ADD_PUNC = 1;
 my $ADD_NULL = 0;
-my $ADD_STEM_ID = 1;
+my $ADD_STEM_ID = 0;
 my $ADD_SYM = 0;
 my $BEAM_RATIO = 50;
 my $BIN_ORTHO = 1;
 my $BIN_DLEN = 1;
 my $BIN_IDENT = 1;
 my $BIN_DICE = 1;
+my $ADD_FIDENT = 0;
 
 my %fdict;
 my %fcounts;
@@ -126,8 +131,10 @@ while(<EF>) {
     $ecounts{$ew}++;
   }
   push @fs, '<eps>' if $ADD_NULL;
+  my $i = 0;
   for my $fw (@fs){
-    die "F: Empty word" if $fw eq '';
+    $i++;
+    die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
     $fcounts{$fw}++;
   }
   for my $fw (@fs){
@@ -137,6 +144,27 @@ while(<EF>) {
   }
 }
 
+print STDERR "Loading Giza output...\n";
+my %model4;  # $model4{$f}->{$e} = number of Giza links between $f ('<eps>' for NULL) and $e
+while(<M4>) {  # records are three lines long; the first line (now in $_) is not used
+  my $en = <M4>; my $zh = <M4>;  # line 2: words the link indices refer to; line 3: "NULL ({ .. }) w ({ i j }) .."
+  die "Truncated Giza record in $gizaf2e near line $.\n" unless defined $en && defined $zh;
+  chomp $en; chomp $zh;
+  die "Malformed Giza alignment in $gizaf2e at line $.\n" unless $zh =~ /^NULL \(\{/;
+  my @ewords = split ' ', $en;  # awk-style split: leading whitespace must not shift the 1-based indices
+  my @chunks = split /\}\) ?/, $zh;
+
+  for my $c (@chunks) {
+    my ($src, $taps) = split / \(\{ /, $c;
+    if ($src eq 'NULL') { $src = '<eps>'; }
+    my @aps = map { $ewords[$_ - 1]; } (defined $taps ? split(' ', $taps) : ());  # $taps is undef for an unaligned word
+    for my $ap (@aps) {
+      $model4{$src}->{$ap} += 1;
+    }
+  }
+}
+close M4;
+
 my $specials = 0;
 my $fc = 1000000;
 my $sids = 1000000;
@@ -147,12 +175,14 @@ for my $f (sort keys %fdict) {
     my $efcount = $re->{$e};
     unless (defined $max) { $max = $efcount; }
     my $m1 = $model1{$f}->{$e};
+    my $m4 = $model4{$f}->{$e};
     my $im1 = $invm1{$e}->{$f};
-    my $is_good_pair = (defined $m1);
+    my $is_good_pair = (defined $m1 || defined $m4);
     my $is_inv_good_pair = (defined $im1);
-    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
+    my $total_eandf = $ecounts{$e} + $fcounts{$f};
+    my $dice = 2 * $efcount / $total_eandf;
     my @feats;
-    if ($efcount > $MIN_FEATURE_COUNT) {
+    if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
       $fc++;
       push @feats, "F$fc=1";
     }
@@ -212,13 +242,6 @@ for my $f (sort keys %fdict) {
       }
       push @feats, "S$id=1";
     }
-    if ($ADD_PREFIX_ID) {
-      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
-        my $pe = substr $oe, 0, 3;
-        my $pf = substr $of, 0, 3;
-        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
-      }
-    }
     if ($ADD_SIM) {
       my $ld = 0;
       my $eff = $len_e;
@@ -226,7 +249,7 @@ for my $f (sort keys %fdict) {
       if (!$is_null) {
         $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
       }
-      if ($ld > 1.5) { $is_good_pair = 1; }
+      #if ($ld > 1.5) { $is_good_pair = 1; }
       if ($BIN_ORTHO) {
         push @feats, orthobin($ld) . '=1';
       } else {
@@ -236,12 +259,21 @@ for my $f (sort keys %fdict) {
     my $ident = ($e eq $f);
     if ($ident) { $is_good_pair = 1; }
     if ($ident && $ADD_ID) {
+      if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
+      if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
       if ($BIN_IDENT) {
         push @feats, identbin($len_e) . '=1';
       } else {
         push @feats, "Identical=$len_e";
       }
     }
+    if ($ADD_PREFIX_ID && !$ident) {
+      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
+        my $pe = substr $oe, 0, 3;
+        my $pf = substr $of, 0, 3;
+        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+      }
+    }
     if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
       $is_good_pair = 1;
       if ($ADD_111) {
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-21 01:27:58 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-21 01:27:58 +0000
commit	c94d18f3c6f71dfc1c23405c61341fe042277c3d (patch)
tree	7887c41d577f04e890f57d4aa534776cd87f8ca4 /word-aligner
parent	36faf01602d28d5bb5f030e0e03c8e7dd2078445 (diff)