From c94d18f3c6f71dfc1c23405c61341fe042277c3d Mon Sep 17 00:00:00 2001 From: redpony Date: Thu, 21 Oct 2010 01:27:58 +0000 Subject: bit more alignment stuff git-svn-id: https://ws10smt.googlecode.com/svn/trunk@686 ec762483-ff6d-05da-a07a-a48fb63a330f --- word-aligner/aligner.pl | 44 +++++++++------------- word-aligner/makefiles/makefile.grammars | 15 ++------ word-aligner/support/make_lex_grammar.pl | 64 ++++++++++++++++++++++++-------- 3 files changed, 70 insertions(+), 53 deletions(-) (limited to 'word-aligner') diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 7eec0e42..7821560f 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -16,17 +16,24 @@ GetOptions("cdec=s" => \$DECODER, "pmem=s" => \$pmem, "mkcls=s" => \$mkcls, ) or usage(); -usage() unless (scalar @ARGV == 1); +usage() unless (scalar @ARGV == 3); die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; my $in_file = shift @ARGV; +my $m4 = shift @ARGV; +my $im4 = shift @ARGV; +die "Can't find model4: $m4" unless -f $m4; +die "Can't find inverse model4: $im4" unless -f $im4; + die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; +print STDERR " Model 4 align: $m4\n"; +print STDERR "InModel 4 align: $im4\n"; print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; @@ -77,6 +84,8 @@ SCRIPT_DIR = $SCRIPT_DIR TRAINING_DIR = $training_dir MKCLS = $mkcls NCLASSES = $num_classes +GIZAALIGN = $m4 +INVGIZAALIGN = $im4 TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary @@ -105,16 +114,19 @@ exit 0; sub make_stage { my ($stage, $direction, $prev_stage) = @_; - my $stage_dir = "$align_dir/$stage-$direction"; + my $stage_dir = "$align_dir/model-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; - open CDEC, ">$stage_dir/cdec.ini" or die; + open CDEC, ">$stage_dir/cdec.$stage.ini" or die; print CDEC <$stage_dir/Makefile" or die; - print MAKE < corpus.e-f.model1 -bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f +bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) $(RM) $@ - $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars + $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars corpus.f-e.lex-grammar.gz: bidir.grammars $(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz @@ -61,10 +61,3 @@ corpus.f-e.lex-grammar.gz: bidir.grammars corpus.e-f.lex-grammar.gz: bidir.grammars $(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz -weights.init.gz: bidir.grammars voc2class.f voc2class.e - $(EXTRACT_WEIGHTS) bidir.grammars > weights.init - $(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup - $(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup - sort -u weights.dup >> weights.init - $(GZIP) -9 weights.init - diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index c3e29540..3926fd8d 100755 --- a/word-aligner/support/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl @@ -4,8 +4,8 @@ use strict; my $LIMIT_SIZE=30; -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f; my %eclass = (); my %fclass = (); @@ -20,8 +20,12 @@ our %cache; open EF, "<$effile" or die; open M1, "<$model1" or die; open IM1, "<$imodel1" or die; +open M4, "<$gizaf2e" or die; +open IM4, "<$gizae2f" or die; binmode(EF,":utf8"); binmode(M1,":utf8"); +binmode(M4,":utf8"); +binmode(IM4,":utf8"); binmode(IM1,":utf8"); binmode(STDOUT,":utf8"); my %model1; @@ -93,7 +97,7 @@ $of_dict{''} = ''; $oe_dict{''} = ''; my $MIN_FEATURE_COUNT = 0; -my $ADD_PREFIX_ID = 0; +my $ADD_PREFIX_ID = 1; my $ADD_CLASS_CLASS = 1; my $ADD_LEN = 1; my $ADD_SIM = 1; @@ -102,13 +106,14 @@ my $ADD_111 = 1; my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NULL = 0; -my $ADD_STEM_ID = 1; +my $ADD_STEM_ID = 0; my $ADD_SYM = 0; my $BEAM_RATIO = 50; my $BIN_ORTHO = 1; my $BIN_DLEN = 1; my $BIN_IDENT = 1; my $BIN_DICE = 1; +my $ADD_FIDENT = 0; my %fdict; my %fcounts; @@ -126,8 +131,10 @@ while() { $ecounts{$ew}++; } push @fs, '' if $ADD_NULL; + my $i = 0; for my $fw (@fs){ - die "F: Empty word" if $fw eq ''; + $i++; + die "F: Empty word\nI=$i FS: @fs" if $fw eq ''; $fcounts{$fw}++; } for my $fw (@fs){ @@ -137,6 +144,27 @@ while() { } } +print STDERR "Loading Giza output...\n"; +my %model4; +while() { + my $en = ; chomp $en; + my $zh = ; chomp $zh; + die unless $zh =~ /^NULL \({/; + my @ewords = split /\s+/, $en; + my @chunks = split /\}\) ?/, $zh; + + for my $c (@chunks) { + my ($zh, $taps) = split / \(\{ /, $c; + if ($zh eq 'NULL') { $zh = ''; } + my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); + #print "$zh -> @aps\n"; + for my $ap (@aps) { + $model4{$zh}->{$ap} += 1; + } + } +} +close M4; + my $specials = 0; my $fc = 1000000; my $sids = 1000000; @@ -147,12 +175,14 @@ for my $f (sort keys %fdict) { my $efcount = $re->{$e}; unless (defined $max) { $max = $efcount; } my $m1 = $model1{$f}->{$e}; + my $m4 = $model4{$f}->{$e}; my $im1 = $invm1{$e}->{$f}; - my $is_good_pair = (defined $m1); + my $is_good_pair = (defined $m1 || defined $m4); my $is_inv_good_pair = (defined $im1); - my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); + my $total_eandf = $ecounts{$e} + $fcounts{$f}; + my $dice = 2 * $efcount / $total_eandf; my @feats; - if ($efcount > $MIN_FEATURE_COUNT) { + if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) { $fc++; push @feats, "F$fc=1"; } @@ -212,13 +242,6 @@ for my $f (sort keys %fdict) { } push @feats, "S$id=1"; } - if ($ADD_PREFIX_ID) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } - } - } if ($ADD_SIM) { my $ld = 0; my $eff = $len_e; @@ -226,7 +249,7 @@ for my $f (sort keys %fdict) { if (!$is_null) { $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); } - if ($ld > 1.5) { $is_good_pair = 1; } + #if ($ld > 1.5) { $is_good_pair = 1; } if ($BIN_ORTHO) { push @feats, orthobin($ld) . '=1'; } else { @@ -236,12 +259,21 @@ for my $f (sort keys %fdict) { my $ident = ($e eq $f); if ($ident) { $is_good_pair = 1; } if ($ident && $ADD_ID) { + if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; } + if ($total_eandf < 8) { push @feats, "IdentRare=1"; } if ($BIN_IDENT) { push @feats, identbin($len_e) . '=1'; } else { push @feats, "Identical=$len_e"; } } + if ($ADD_PREFIX_ID && !$ident) { + if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { + my $pe = substr $oe, 0, 3; + my $pf = substr $of, 0, 3; + if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } + } + } if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { $is_good_pair = 1; if ($ADD_111) { -- cgit v1.2.3