diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-22 23:29:11 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-22 23:29:11 +0000 |
commit | dd886ca6da84970ccb96b2f0155ff672e03f5b58 (patch) | |
tree | 78b5627347f3953539852cdd6b92053e844e87d4 /word-aligner/support/make_lex_grammar.pl | |
parent | 550019457302ecaaec6f72e912013a6fa9f2da67 (diff) |
handle translation from the null word
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@689 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner/support/make_lex_grammar.pl')
-rwxr-xr-x | word-aligner/support/make_lex_grammar.pl | 52 |
1 files changed, 27 insertions, 25 deletions
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index 3926fd8d..fb9d0214 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -5,7 +5,8 @@ use strict;
 my $LIMIT_SIZE=30;
 my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+
 my %eclass = ();
 my %fclass = ();
@@ -20,12 +21,12 @@ our %cache;
 open EF, "<$effile" or die;
 open M1, "<$model1" or die;
 open IM1, "<$imodel1" or die;
-open M4, "<$gizaf2e" or die;
-open IM4, "<$gizae2f" or die;
+#open M4, "<$gizaf2e" or die;
+#open IM4, "<$gizae2f" or die;
+#binmode(M4,":utf8");
+#binmode(IM4,":utf8");
 binmode(EF,":utf8");
 binmode(M1,":utf8");
-binmode(M4,":utf8");
-binmode(IM4,":utf8");
 binmode(IM1,":utf8");
 binmode(STDOUT,":utf8");
 my %model1;
@@ -105,7 +106,7 @@ my $ADD_DICE = 1;
 my $ADD_111 = 1;
 my $ADD_ID = 1;
 my $ADD_PUNC = 1;
-my $ADD_NULL = 0;
+my $ADD_NULL = 1;
 my $ADD_STEM_ID = 0;
 my $ADD_SYM = 0;
 my $BEAM_RATIO = 50;
@@ -115,6 +116,8 @@ my $BIN_IDENT = 1;
 my $BIN_DICE = 1;
 my $ADD_FIDENT = 0;
+if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
+
 my %fdict;
 my %fcounts;
 my %ecounts;
@@ -146,24 +149,24 @@ while(<EF>) {
 print STDERR "Loading Giza output...\n";
 my %model4;
-while(<M4>) {
- my $en = <M4>; chomp $en;
- my $zh = <M4>; chomp $zh;
- die unless $zh =~ /^NULL \({/;
- my @ewords = split /\s+/, $en;
- my @chunks = split /\}\) ?/, $zh;
-
- for my $c (@chunks) {
- my ($zh, $taps) = split / \(\{ /, $c;
- if ($zh eq 'NULL') { $zh = '<eps>'; }
- my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
- #print "$zh -> @aps\n";
- for my $ap (@aps) {
- $model4{$zh}->{$ap} += 1;
- }
- }
-}
-close M4;
+#while(<M4>) {
+# my $en = <M4>; chomp $en;
+# my $zh = <M4>; chomp $zh;
+# die unless $zh =~ /^NULL \({/;
+# my @ewords = split /\s+/, $en;
+# my @chunks = split /\}\) ?/, $zh;
+#
+# for my $c (@chunks) {
+# my ($zh, $taps) = split / \(\{ /, $c;
+# if ($zh eq 'NULL') { $zh = '<eps>'; }
+# my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
+# #print "$zh -> @aps\n";
+# for my $ap (@aps) {
+# $model4{$zh}->{$ap} += 1;
+# }
+# }
+#}
+#close M4;
 my $specials = 0;
 my $fc = 1000000;
@@ -207,7 +210,6 @@ for my $f (sort keys %fdict) {
 }
 my $is_null = undef;
 if ($ADD_NULL && $f eq '<eps>') {
- push @feats, "IsNull=1";
 $is_null = 1;
 }
 if ($ADD_LEN) { |