summary | refs | log | tree | commit | diff
path: root/word-aligner
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
commit c94d18f3c6f71dfc1c23405c61341fe042277c3d (patch)
tree 7887c41d577f04e890f57d4aa534776cd87f8ca4 /word-aligner
parent 36faf01602d28d5bb5f030e0e03c8e7dd2078445 (diff)
bit more alignment stuff
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@686 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rwxr-xr-x word-aligner/aligner.pl 44
-rw-r--r-- word-aligner/makefiles/makefile.grammars 15
-rwxr-xr-x word-aligner/support/make_lex_grammar.pl 64
3 files changed, 70 insertions, 53 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 7eec0e42..7821560f 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -16,17 +16,24 @@ GetOptions("cdec=s" => \$DECODER,
"pmem=s" => \$pmem,
"mkcls=s" => \$mkcls,
) or usage();
-usage() unless (scalar @ARGV == 1);
+usage() unless (scalar @ARGV == 3);
die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
my $in_file = shift @ARGV;
+my $m4 = shift @ARGV;
+my $im4 = shift @ARGV;
+die "Can't find model4: $m4" unless -f $m4;
+die "Can't find inverse model4: $im4" unless -f $im4;
+
die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
my $f_lang = $1;
my $e_lang = $2;
print STDERR "Source language: $f_lang\n";
print STDERR "Target language: $e_lang\n";
+print STDERR " Model 4 align: $m4\n";
+print STDERR "InModel 4 align: $im4\n";
print STDERR " Using mkcls in: $mkcls\n\n";
die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
@@ -77,6 +84,8 @@ SCRIPT_DIR = $SCRIPT_DIR
TRAINING_DIR = $training_dir
MKCLS = $mkcls
NCLASSES = $num_classes
+GIZAALIGN = $m4
+INVGIZAALIGN = $im4
TARGETS = @targets
PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
@@ -105,16 +114,19 @@ exit 0;
sub make_stage {
my ($stage, $direction, $prev_stage) = @_;
- my $stage_dir = "$align_dir/$stage-$direction";
+ my $stage_dir = "$align_dir/model-$direction";
my $first = $direction;
$first =~ s/^(.+)-.*$/$1/;
mkdir $stage_dir;
my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
- open CDEC, ">$stage_dir/cdec.ini" or die;
+ open CDEC, ">$stage_dir/cdec.$stage.ini" or die;
print CDEC <<EOT;
-formalism=lexcrf
+formalism=lextrans
intersection_strategy=full
grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
+feature_function=LexicalPairIdentity
+feature_function=InputIdentity
+feature_function=OutputIdentity
EOT
if ($stage =~ /relpos/) {
print CDEC "$RELPOS\n";
@@ -122,36 +134,16 @@ EOT
print CDEC "$RELPOS\n";
print CDEC "feature_function=MarkovJump\n";
print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
+ print CDEC "feature_function=SourceBigram\n";
print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
}
close CDEC;
-
- my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n";
- if ($prev_stage) {
- $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n";
- }
-
- open MAKE, ">$stage_dir/Makefile" or die;
- print MAKE <<EOT;
-all: weights.final.gz
-
-clean:
- \$(RM) -r ptrain weights.init.gz weights.final.gz
-
-$init_weights
-
-weights.final.gz: weights.init.gz cdec.ini
- \$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz
- cp ptrain/weights.final.gz weights.final.gz
- \$(RM) -r ptrain
-EOT
- close MAKE;
}
sub usage {
die <<EOT;
-Usage: $0 [OPTIONS] training_corpus.fr-en
+Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3
EOT
}
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index b89937c1..f4b956bc 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,7 +1,7 @@
-all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz
+all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f
clean:
- $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
+ $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
SUPPORT_DIR = $(SCRIPT_DIR)/support
GZIP = /usr/bin/gzip
@@ -51,9 +51,9 @@ corpus.f-e.model1: corpus.f-e $(MODEL1)
corpus.e-f.model1: corpus.e-f $(MODEL1)
$(MODEL1) corpus.e-f > corpus.e-f.model1
-bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f
+bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)
$(RM) $@
- $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars
+ $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars
corpus.f-e.lex-grammar.gz: bidir.grammars
$(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz
@@ -61,10 +61,3 @@ corpus.f-e.lex-grammar.gz: bidir.grammars
corpus.e-f.lex-grammar.gz: bidir.grammars
$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz
-weights.init.gz: bidir.grammars voc2class.f voc2class.e
- $(EXTRACT_WEIGHTS) bidir.grammars > weights.init
- $(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup
- $(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup
- sort -u weights.dup >> weights.init
- $(GZIP) -9 weights.init
-
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index c3e29540..3926fd8d 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -4,8 +4,8 @@ use strict;
my $LIMIT_SIZE=30;
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
my %eclass = ();
my %fclass = ();
@@ -20,8 +20,12 @@ our %cache;
open EF, "<$effile" or die;
open M1, "<$model1" or die;
open IM1, "<$imodel1" or die;
+open M4, "<$gizaf2e" or die;
+open IM4, "<$gizae2f" or die;
binmode(EF,":utf8");
binmode(M1,":utf8");
+binmode(M4,":utf8");
+binmode(IM4,":utf8");
binmode(IM1,":utf8");
binmode(STDOUT,":utf8");
my %model1;
@@ -93,7 +97,7 @@ $of_dict{'<eps>'} = '<eps>';
$oe_dict{'<eps>'} = '<eps>';
my $MIN_FEATURE_COUNT = 0;
-my $ADD_PREFIX_ID = 0;
+my $ADD_PREFIX_ID = 1;
my $ADD_CLASS_CLASS = 1;
my $ADD_LEN = 1;
my $ADD_SIM = 1;
@@ -102,13 +106,14 @@ my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NULL = 0;
-my $ADD_STEM_ID = 1;
+my $ADD_STEM_ID = 0;
my $ADD_SYM = 0;
my $BEAM_RATIO = 50;
my $BIN_ORTHO = 1;
my $BIN_DLEN = 1;
my $BIN_IDENT = 1;
my $BIN_DICE = 1;
+my $ADD_FIDENT = 0;
my %fdict;
my %fcounts;
@@ -126,8 +131,10 @@ while(<EF>) {
$ecounts{$ew}++;
}
push @fs, '<eps>' if $ADD_NULL;
+ my $i = 0;
for my $fw (@fs){
- die "F: Empty word" if $fw eq '';
+ $i++;
+ die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
$fcounts{$fw}++;
}
for my $fw (@fs){
@@ -137,6 +144,27 @@ while(<EF>) {
}
}
+print STDERR "Loading Giza output...\n";
+my %model4;
+while(<M4>) {
+ my $en = <M4>; chomp $en;
+ my $zh = <M4>; chomp $zh;
+ die unless $zh =~ /^NULL \({/;
+ my @ewords = split /\s+/, $en;
+ my @chunks = split /\}\) ?/, $zh;
+
+ for my $c (@chunks) {
+ my ($zh, $taps) = split / \(\{ /, $c;
+ if ($zh eq 'NULL') { $zh = '<eps>'; }
+ my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
+ #print "$zh -> @aps\n";
+ for my $ap (@aps) {
+ $model4{$zh}->{$ap} += 1;
+ }
+ }
+}
+close M4;
+
my $specials = 0;
my $fc = 1000000;
my $sids = 1000000;
@@ -147,12 +175,14 @@ for my $f (sort keys %fdict) {
my $efcount = $re->{$e};
unless (defined $max) { $max = $efcount; }
my $m1 = $model1{$f}->{$e};
+ my $m4 = $model4{$f}->{$e};
my $im1 = $invm1{$e}->{$f};
- my $is_good_pair = (defined $m1);
+ my $is_good_pair = (defined $m1 || defined $m4);
my $is_inv_good_pair = (defined $im1);
- my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
+ my $total_eandf = $ecounts{$e} + $fcounts{$f};
+ my $dice = 2 * $efcount / $total_eandf;
my @feats;
- if ($efcount > $MIN_FEATURE_COUNT) {
+ if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
$fc++;
push @feats, "F$fc=1";
}
@@ -212,13 +242,6 @@ for my $f (sort keys %fdict) {
}
push @feats, "S$id=1";
}
- if ($ADD_PREFIX_ID) {
- if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
- my $pe = substr $oe, 0, 3;
- my $pf = substr $of, 0, 3;
- if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
- }
- }
if ($ADD_SIM) {
my $ld = 0;
my $eff = $len_e;
@@ -226,7 +249,7 @@ for my $f (sort keys %fdict) {
if (!$is_null) {
$ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
}
- if ($ld > 1.5) { $is_good_pair = 1; }
+ #if ($ld > 1.5) { $is_good_pair = 1; }
if ($BIN_ORTHO) {
push @feats, orthobin($ld) . '=1';
} else {
@@ -236,12 +259,21 @@ for my $f (sort keys %fdict) {
my $ident = ($e eq $f);
if ($ident) { $is_good_pair = 1; }
if ($ident && $ADD_ID) {
+ if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
+ if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
if ($BIN_IDENT) {
push @feats, identbin($len_e) . '=1';
} else {
push @feats, "Identical=$len_e";
}
}
+ if ($ADD_PREFIX_ID && !$ident) {
+ if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
+ my $pe = substr $oe, 0, 3;
+ my $pf = substr $of, 0, 3;
+ if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+ }
+ }
if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
$is_good_pair = 1;
if ($ADD_111) {