diff options
Diffstat (limited to 'word-aligner')
| -rwxr-xr-x | word-aligner/aligner.pl | 44 | ||||
| -rw-r--r-- | word-aligner/makefiles/makefile.grammars | 15 | ||||
| -rwxr-xr-x | word-aligner/support/make_lex_grammar.pl | 64 | 
3 files changed, 70 insertions, 53 deletions
| diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 7eec0e42..7821560f 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -16,17 +16,24 @@ GetOptions("cdec=s" => \$DECODER,             "pmem=s" => \$pmem,             "mkcls=s" => \$mkcls,            ) or usage(); -usage() unless (scalar @ARGV == 1); +usage() unless (scalar @ARGV == 3);  die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;  die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;  my $in_file = shift @ARGV; +my $m4 = shift @ARGV; +my $im4 = shift @ARGV; +die "Can't find model4: $m4" unless -f $m4; +die "Can't find inverse model4: $im4" unless -f $im4; +  die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);  my $f_lang = $1;  my $e_lang = $2;  print STDERR "Source language: $f_lang\n";  print STDERR "Target language: $e_lang\n"; +print STDERR "  Model 4 align: $m4\n"; +print STDERR "InModel 4 align: $im4\n";  print STDERR " Using mkcls in: $mkcls\n\n";  die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";  die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; @@ -77,6 +84,8 @@ SCRIPT_DIR = $SCRIPT_DIR  TRAINING_DIR = $training_dir  MKCLS = $mkcls  NCLASSES = $num_classes +GIZAALIGN = $m4 +INVGIZAALIGN = $im4  TARGETS = @targets  PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary @@ -105,16 +114,19 @@ exit 0;  sub make_stage {    my ($stage, $direction, $prev_stage) = @_; -  my $stage_dir = "$align_dir/$stage-$direction"; +  my $stage_dir = "$align_dir/model-$direction";    my $first = $direction;    $first =~ s/^(.+)-.*$/$1/;    mkdir $stage_dir;    my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; -  open CDEC, ">$stage_dir/cdec.ini" or die; +  open CDEC, ">$stage_dir/cdec.$stage.ini" or die;    print CDEC <<EOT; -formalism=lexcrf +formalism=lextrans  intersection_strategy=full  grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz +feature_function=LexicalPairIdentity +feature_function=InputIdentity +feature_function=OutputIdentity  EOT    if ($stage =~ /relpos/) {      print CDEC "$RELPOS\n"; @@ -122,36 +134,16 @@ EOT      print CDEC "$RELPOS\n";      print CDEC "feature_function=MarkovJump\n";      print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; +    print CDEC "feature_function=SourceBigram\n";      print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";    }    close CDEC; - -  my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n"; -  if ($prev_stage) { -    $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n"; -  } - -  open MAKE, ">$stage_dir/Makefile" or die; -  print MAKE <<EOT; -all: weights.final.gz - -clean: -	\$(RM) -r ptrain weights.init.gz weights.final.gz - -$init_weights - -weights.final.gz: weights.init.gz cdec.ini -	\$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz -	cp ptrain/weights.final.gz weights.final.gz -	\$(RM) -r ptrain -EOT -  close MAKE;  }  sub usage {    die <<EOT; -Usage: $0 [OPTIONS] training_corpus.fr-en +Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3  EOT  } diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index b89937c1..f4b956bc 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -1,7 +1,7 @@ -all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz +all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f  clean: -	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* +	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*  SUPPORT_DIR = $(SCRIPT_DIR)/support  GZIP = /usr/bin/gzip @@ -51,9 +51,9 @@ corpus.f-e.model1: corpus.f-e $(MODEL1)  corpus.e-f.model1: corpus.e-f $(MODEL1)  	$(MODEL1) corpus.e-f > corpus.e-f.model1 -bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f +bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)  	$(RM) $@ -	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars +	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars  corpus.f-e.lex-grammar.gz: bidir.grammars  	$(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz @@ -61,10 +61,3 @@ corpus.f-e.lex-grammar.gz: bidir.grammars  corpus.e-f.lex-grammar.gz: bidir.grammars  	$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz -weights.init.gz: bidir.grammars voc2class.f voc2class.e -	$(EXTRACT_WEIGHTS) bidir.grammars > weights.init -	$(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup -	$(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup -	sort -u weights.dup >> weights.init -	$(GZIP) -9 weights.init - diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index c3e29540..3926fd8d 100755 --- a/word-aligner/support/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl @@ -4,8 +4,8 @@ use strict;  my $LIMIT_SIZE=30; -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;  my %eclass = ();  my %fclass = (); @@ -20,8 +20,12 @@ our %cache;  open EF, "<$effile" or die;  open M1, "<$model1" or die;  open IM1, "<$imodel1" or die; +open M4, "<$gizaf2e" or die; +open IM4, "<$gizae2f" or die;  binmode(EF,":utf8");  binmode(M1,":utf8"); +binmode(M4,":utf8"); +binmode(IM4,":utf8");  binmode(IM1,":utf8");  binmode(STDOUT,":utf8");  my %model1; @@ -93,7 +97,7 @@ $of_dict{'<eps>'} = '<eps>';  $oe_dict{'<eps>'} = '<eps>';  my $MIN_FEATURE_COUNT = 0; -my $ADD_PREFIX_ID = 0; +my $ADD_PREFIX_ID = 1;  my $ADD_CLASS_CLASS = 1;  my $ADD_LEN = 1;  my $ADD_SIM = 1; @@ -102,13 +106,14 @@ my $ADD_111 = 1;  my $ADD_ID = 1;  my $ADD_PUNC = 1;  my $ADD_NULL = 0; -my $ADD_STEM_ID = 1; +my $ADD_STEM_ID = 0;  my $ADD_SYM = 0;  my $BEAM_RATIO = 50;  my $BIN_ORTHO = 1;  my $BIN_DLEN = 1;  my $BIN_IDENT = 1;  my $BIN_DICE = 1; +my $ADD_FIDENT = 0;  my %fdict;  my %fcounts; @@ -126,8 +131,10 @@ while(<EF>) {      $ecounts{$ew}++;    }    push @fs, '<eps>' if $ADD_NULL; +  my $i = 0;    for my $fw (@fs){ -    die "F: Empty word" if $fw eq ''; +    $i++; +    die "F: Empty word\nI=$i FS: @fs" if $fw eq '';      $fcounts{$fw}++;    }    for my $fw (@fs){ @@ -137,6 +144,27 @@ while(<EF>) {    }  } +print STDERR "Loading Giza output...\n"; +my %model4; +while(<M4>) { +  my $en = <M4>; chomp $en; +  my $zh = <M4>; chomp $zh; +  die unless $zh =~ /^NULL \({/; +  my @ewords = split /\s+/, $en; +  my @chunks = split /\}\) ?/, $zh; + +  for my $c (@chunks) { +    my ($zh, $taps) = split / \(\{ /, $c; +    if ($zh eq 'NULL') { $zh = '<eps>'; } +    my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); +    #print "$zh -> @aps\n"; +    for my $ap (@aps) { +      $model4{$zh}->{$ap} += 1; +    } +  } +} +close M4; +  my $specials = 0;  my $fc = 1000000;  my $sids = 1000000; @@ -147,12 +175,14 @@ for my $f (sort keys %fdict) {      my $efcount = $re->{$e};      unless (defined $max) { $max = $efcount; }      my $m1 = $model1{$f}->{$e}; +    my $m4 = $model4{$f}->{$e};      my $im1 = $invm1{$e}->{$f}; -    my $is_good_pair = (defined $m1); +    my $is_good_pair = (defined $m1 || defined $m4);      my $is_inv_good_pair = (defined $im1); -    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); +    my $total_eandf = $ecounts{$e} + $fcounts{$f}; +    my $dice = 2 * $efcount / $total_eandf;      my @feats; -    if ($efcount > $MIN_FEATURE_COUNT) { +    if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {        $fc++;        push @feats, "F$fc=1";      } @@ -212,13 +242,6 @@ for my $f (sort keys %fdict) {        }        push @feats, "S$id=1";      } -    if ($ADD_PREFIX_ID) { -      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {  -        my $pe = substr $oe, 0, 3; -        my $pf = substr $of, 0, 3; -        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } -      } -    }      if ($ADD_SIM) {        my $ld = 0;        my $eff = $len_e; @@ -226,7 +249,7 @@ for my $f (sort keys %fdict) {        if (!$is_null) {          $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);        } -      if ($ld > 1.5) { $is_good_pair = 1; } +      #if ($ld > 1.5) { $is_good_pair = 1; }        if ($BIN_ORTHO) {          push @feats, orthobin($ld) . '=1';        } else { @@ -236,12 +259,21 @@ for my $f (sort keys %fdict) {      my $ident = ($e eq $f);      if ($ident) { $is_good_pair = 1; }      if ($ident && $ADD_ID) { +      if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; } +      if ($total_eandf < 8) { push @feats, "IdentRare=1"; }        if ($BIN_IDENT) {          push @feats, identbin($len_e) . '=1';        } else {          push @feats, "Identical=$len_e";        }      } +    if ($ADD_PREFIX_ID && !$ident) { +      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {  +        my $pe = substr $oe, 0, 3; +        my $pf = substr $of, 0, 3; +        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } +      } +    }      if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {        $is_good_pair = 1;        if ($ADD_111) { | 
