diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-21 01:27:58 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-21 01:27:58 +0000 |
commit | ed364db2699dec02f5bd18b9c15147e0a02efee1 (patch) | |
tree | 80db871258b3d769da3e06cb299857832315c60c /word-aligner | |
parent | eb43ccf1e1a7d59a0f672956230644ebb7a5cbf1 (diff) |
bit more alignment stuff
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@686 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rwxr-xr-x | word-aligner/aligner.pl | 44 | ||||
-rw-r--r-- | word-aligner/makefiles/makefile.grammars | 15 | ||||
-rwxr-xr-x | word-aligner/support/make_lex_grammar.pl | 64 |
3 files changed, 70 insertions, 53 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 7eec0e42..7821560f 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -16,17 +16,24 @@ GetOptions("cdec=s" => \$DECODER, "pmem=s" => \$pmem, "mkcls=s" => \$mkcls, ) or usage(); -usage() unless (scalar @ARGV == 1); +usage() unless (scalar @ARGV == 3); die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; my $in_file = shift @ARGV; +my $m4 = shift @ARGV; +my $im4 = shift @ARGV; +die "Can't find model4: $m4" unless -f $m4; +die "Can't find inverse model4: $im4" unless -f $im4; + die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; +print STDERR " Model 4 align: $m4\n"; +print STDERR "InModel 4 align: $im4\n"; print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; @@ -77,6 +84,8 @@ SCRIPT_DIR = $SCRIPT_DIR TRAINING_DIR = $training_dir MKCLS = $mkcls NCLASSES = $num_classes +GIZAALIGN = $m4 +INVGIZAALIGN = $im4 TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary @@ -105,16 +114,19 @@ exit 0; sub make_stage { my ($stage, $direction, $prev_stage) = @_; - my $stage_dir = "$align_dir/$stage-$direction"; + my $stage_dir = "$align_dir/model-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; - open CDEC, ">$stage_dir/cdec.ini" or die; + open CDEC, ">$stage_dir/cdec.$stage.ini" or die; print CDEC <<EOT; -formalism=lexcrf +formalism=lextrans intersection_strategy=full grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz +feature_function=LexicalPairIdentity +feature_function=InputIdentity +feature_function=OutputIdentity EOT if ($stage =~ /relpos/) { print CDEC "$RELPOS\n"; @@ -122,36 +134,16 @@ EOT print CDEC "$RELPOS\n"; print CDEC "feature_function=MarkovJump\n"; print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; + print CDEC "feature_function=SourceBigram\n"; print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n"; } close CDEC; - - my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n"; - if ($prev_stage) { - $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n"; - } - - open MAKE, ">$stage_dir/Makefile" or die; - print MAKE <<EOT; -all: weights.final.gz - -clean: - \$(RM) -r ptrain weights.init.gz weights.final.gz - -$init_weights - -weights.final.gz: weights.init.gz cdec.ini - \$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz - cp ptrain/weights.final.gz weights.final.gz - \$(RM) -r ptrain -EOT - close MAKE; } sub usage { die <<EOT; -Usage: $0 [OPTIONS] training_corpus.fr-en +Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3 EOT } diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index b89937c1..f4b956bc 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -1,7 +1,7 @@ -all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz +all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f clean: - $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* + $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip @@ -51,9 +51,9 @@ corpus.f-e.model1: corpus.f-e $(MODEL1) corpus.e-f.model1: corpus.e-f $(MODEL1) $(MODEL1) corpus.e-f > corpus.e-f.model1 -bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f +bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) $(RM) $@ - $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars + $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars corpus.f-e.lex-grammar.gz: bidir.grammars $(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz @@ -61,10 +61,3 @@ corpus.f-e.lex-grammar.gz: bidir.grammars corpus.e-f.lex-grammar.gz: bidir.grammars $(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz -weights.init.gz: bidir.grammars voc2class.f voc2class.e - $(EXTRACT_WEIGHTS) bidir.grammars > weights.init - $(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup - $(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup - sort -u weights.dup >> weights.init - $(GZIP) -9 weights.init - diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index c3e29540..3926fd8d 100755 --- a/word-aligner/support/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl @@ -4,8 +4,8 @@ use strict; my $LIMIT_SIZE=30; -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f; my %eclass = (); my %fclass = (); @@ -20,8 +20,12 @@ our %cache; open EF, "<$effile" or die; open M1, "<$model1" or die; open IM1, "<$imodel1" or die; +open M4, "<$gizaf2e" or die; +open IM4, "<$gizae2f" or die; binmode(EF,":utf8"); binmode(M1,":utf8"); +binmode(M4,":utf8"); +binmode(IM4,":utf8"); binmode(IM1,":utf8"); binmode(STDOUT,":utf8"); my %model1; @@ -93,7 +97,7 @@ $of_dict{'<eps>'} = '<eps>'; $oe_dict{'<eps>'} = '<eps>'; my $MIN_FEATURE_COUNT = 0; -my $ADD_PREFIX_ID = 0; +my $ADD_PREFIX_ID = 1; my $ADD_CLASS_CLASS = 1; my $ADD_LEN = 1; my $ADD_SIM = 1; @@ -102,13 +106,14 @@ my $ADD_111 = 1; my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NULL = 0; -my $ADD_STEM_ID = 1; +my $ADD_STEM_ID = 0; my $ADD_SYM = 0; my $BEAM_RATIO = 50; my $BIN_ORTHO = 1; my $BIN_DLEN = 1; my $BIN_IDENT = 1; my $BIN_DICE = 1; +my $ADD_FIDENT = 0; my %fdict; my %fcounts; @@ -126,8 +131,10 @@ while(<EF>) { $ecounts{$ew}++; } push @fs, '<eps>' if $ADD_NULL; + my $i = 0; for my $fw (@fs){ - die "F: Empty word" if $fw eq ''; + $i++; + die "F: Empty word\nI=$i FS: @fs" if $fw eq ''; $fcounts{$fw}++; } for my $fw (@fs){ @@ -137,6 +144,27 @@ while(<EF>) { } } +print STDERR "Loading Giza output...\n"; +my %model4; +while(<M4>) { + my $en = <M4>; chomp $en; + my $zh = <M4>; chomp $zh; + die unless $zh =~ /^NULL \({/; + my @ewords = split /\s+/, $en; + my @chunks = split /\}\) ?/, $zh; + + for my $c (@chunks) { + my ($zh, $taps) = split / \(\{ /, $c; + if ($zh eq 'NULL') { $zh = '<eps>'; } + my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); + #print "$zh -> @aps\n"; + for my $ap (@aps) { + $model4{$zh}->{$ap} += 1; + } + } +} +close M4; + my $specials = 0; my $fc = 1000000; my $sids = 1000000; @@ -147,12 +175,14 @@ for my $f (sort keys %fdict) { my $efcount = $re->{$e}; unless (defined $max) { $max = $efcount; } my $m1 = $model1{$f}->{$e}; + my $m4 = $model4{$f}->{$e}; my $im1 = $invm1{$e}->{$f}; - my $is_good_pair = (defined $m1); + my $is_good_pair = (defined $m1 || defined $m4); my $is_inv_good_pair = (defined $im1); - my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); + my $total_eandf = $ecounts{$e} + $fcounts{$f}; + my $dice = 2 * $efcount / $total_eandf; my @feats; - if ($efcount > $MIN_FEATURE_COUNT) { + if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) { $fc++; push @feats, "F$fc=1"; } @@ -212,13 +242,6 @@ for my $f (sort keys %fdict) { } push @feats, "S$id=1"; } - if ($ADD_PREFIX_ID) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } - } - } if ($ADD_SIM) { my $ld = 0; my $eff = $len_e; @@ -226,7 +249,7 @@ for my $f (sort keys %fdict) { if (!$is_null) { $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); } - if ($ld > 1.5) { $is_good_pair = 1; } + #if ($ld > 1.5) { $is_good_pair = 1; } if ($BIN_ORTHO) { push @feats, orthobin($ld) . '=1'; } else { @@ -236,12 +259,21 @@ for my $f (sort keys %fdict) { my $ident = ($e eq $f); if ($ident) { $is_good_pair = 1; } if ($ident && $ADD_ID) { + if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; } + if ($total_eandf < 8) { push @feats, "IdentRare=1"; } if ($BIN_IDENT) { push @feats, identbin($len_e) . '=1'; } else { push @feats, "Identical=$len_e"; } } + if ($ADD_PREFIX_ID && !$ident) { + if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { + my $pe = substr $oe, 0, 3; + my $pf = substr $of, 0, 3; + if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } + } + } if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { $is_good_pair = 1; if ($ADD_111) { |