handle translation from the null word

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@689 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-22 23:29:11 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-22 23:29:11 +0000
commit: dd886ca6da84970ccb96b2f0155ff672e03f5b58 (patch)
tree: 78b5627347f3953539852cdd6b92053e844e87d4 /word-aligner/support/make_lex_grammar.pl
parent: 550019457302ecaaec6f72e912013a6fa9f2da67 (diff)
1 files changed, 27 insertions, 25 deletions
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index 3926fd8d..fb9d0214 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -5,7 +5,8 @@ use strict;
 my $LIMIT_SIZE=30;
 
 my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+
 
 my %eclass = ();
 my %fclass = ();
@@ -20,12 +21,12 @@ our %cache;
 open EF, "<$effile" or die;
 open M1, "<$model1" or die;
 open IM1, "<$imodel1" or die;
-open M4, "<$gizaf2e" or die;
-open IM4, "<$gizae2f" or die;
+#open M4, "<$gizaf2e" or die;
+#open IM4, "<$gizae2f" or die;
+#binmode(M4,":utf8");
+#binmode(IM4,":utf8");
 binmode(EF,":utf8");
 binmode(M1,":utf8");
-binmode(M4,":utf8");
-binmode(IM4,":utf8");
 binmode(IM1,":utf8");
 binmode(STDOUT,":utf8");
 my %model1;
@@ -105,7 +106,7 @@ my $ADD_DICE = 1;
 my $ADD_111 = 1;
 my $ADD_ID = 1;
 my $ADD_PUNC = 1;
-my $ADD_NULL = 0;
+my $ADD_NULL = 1;
 my $ADD_STEM_ID = 0;
 my $ADD_SYM = 0;
 my $BEAM_RATIO = 50;
@@ -115,6 +116,8 @@ my $BIN_IDENT = 1;
 my $BIN_DICE = 1;
 my $ADD_FIDENT = 0;
 
+if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
+
 my %fdict;
 my %fcounts;
 my %ecounts;
@@ -146,24 +149,24 @@ while(<EF>) {
 
 print STDERR "Loading Giza output...\n";
 my %model4;
-while(<M4>) {
-  my $en = <M4>; chomp $en;
-  my $zh = <M4>; chomp $zh;
-  die unless $zh =~ /^NULL \({/;
-  my @ewords = split /\s+/, $en;
-  my @chunks = split /\}\) ?/, $zh;
-
-  for my $c (@chunks) {
-    my ($zh, $taps) = split / \(\{ /, $c;
-    if ($zh eq 'NULL') { $zh = '<eps>'; }
-    my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
-    #print "$zh -> @aps\n";
-    for my $ap (@aps) {
-      $model4{$zh}->{$ap} += 1;
-    }
-  }
-}
-close M4;
+#while(<M4>) {
+#  my $en = <M4>; chomp $en;
+#  my $zh = <M4>; chomp $zh;
+#  die unless $zh =~ /^NULL \({/;
+#  my @ewords = split /\s+/, $en;
+#  my @chunks = split /\}\) ?/, $zh;
+#
+#  for my $c (@chunks) {
+#    my ($zh, $taps) = split / \(\{ /, $c;
+#    if ($zh eq 'NULL') { $zh = '<eps>'; }
+#    my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
+#    #print "$zh -> @aps\n";
+#    for my $ap (@aps) {
+#      $model4{$zh}->{$ap} += 1;
+#    }
+#  }
+#}
+#close M4;
 
 my $specials = 0;
 my $fc = 1000000;
@@ -207,7 +210,6 @@ for my $f (sort keys %fdict) {
     }
     my $is_null = undef;
     if ($ADD_NULL && $f eq '<eps>') {
-      push @feats, "IsNull=1";
       $is_null = 1;
     }
     if ($ADD_LEN) {
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-22 23:29:11 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-22 23:29:11 +0000
commit	dd886ca6da84970ccb96b2f0155ff672e03f5b58 (patch)
tree	78b5627347f3953539852cdd6b92053e844e87d4 /word-aligner/support/make_lex_grammar.pl
parent	550019457302ecaaec6f72e912013a6fa9f2da67 (diff)