summary | refs | log | tree | commit | diff
path: root/word-aligner/support
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-21 01:27:58 +0000
commit ed364db2699dec02f5bd18b9c15147e0a02efee1 (patch)
tree 80db871258b3d769da3e06cb299857832315c60c /word-aligner/support
parent eb43ccf1e1a7d59a0f672956230644ebb7a5cbf1 (diff)
bit more alignment stuff
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@686 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner/support')
-rwxr-xr-x word-aligner/support/make_lex_grammar.pl 64
1 files changed, 48 insertions, 16 deletions
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index c3e29540..3926fd8d 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -4,8 +4,8 @@ use strict;
my $LIMIT_SIZE=30;
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
my %eclass = ();
my %fclass = ();
@@ -20,8 +20,12 @@ our %cache;
open EF, "<$effile" or die;
open M1, "<$model1" or die;
open IM1, "<$imodel1" or die;
+open M4, "<$gizaf2e" or die;
+open IM4, "<$gizae2f" or die;
binmode(EF,":utf8");
binmode(M1,":utf8");
+binmode(M4,":utf8");
+binmode(IM4,":utf8");
binmode(IM1,":utf8");
binmode(STDOUT,":utf8");
my %model1;
@@ -93,7 +97,7 @@ $of_dict{'<eps>'} = '<eps>';
$oe_dict{'<eps>'} = '<eps>';
my $MIN_FEATURE_COUNT = 0;
-my $ADD_PREFIX_ID = 0;
+my $ADD_PREFIX_ID = 1;
my $ADD_CLASS_CLASS = 1;
my $ADD_LEN = 1;
my $ADD_SIM = 1;
@@ -102,13 +106,14 @@ my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NULL = 0;
-my $ADD_STEM_ID = 1;
+my $ADD_STEM_ID = 0;
my $ADD_SYM = 0;
my $BEAM_RATIO = 50;
my $BIN_ORTHO = 1;
my $BIN_DLEN = 1;
my $BIN_IDENT = 1;
my $BIN_DICE = 1;
+my $ADD_FIDENT = 0;
my %fdict;
my %fcounts;
@@ -126,8 +131,10 @@ while(<EF>) {
$ecounts{$ew}++;
}
push @fs, '<eps>' if $ADD_NULL;
+ my $i = 0;
for my $fw (@fs){
- die "F: Empty word" if $fw eq '';
+ $i++;
+ die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
$fcounts{$fw}++;
}
for my $fw (@fs){
@@ -137,6 +144,27 @@ while(<EF>) {
}
}
+print STDERR "Loading Giza output...\n";
+my %model4;
+while(<M4>) {
+ my $en = <M4>; chomp $en;
+ my $zh = <M4>; chomp $zh;
+ die unless $zh =~ /^NULL \({/;
+ my @ewords = split /\s+/, $en;
+ my @chunks = split /\}\) ?/, $zh;
+
+ for my $c (@chunks) {
+ my ($zh, $taps) = split / \(\{ /, $c;
+ if ($zh eq 'NULL') { $zh = '<eps>'; }
+ my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
+ #print "$zh -> @aps\n";
+ for my $ap (@aps) {
+ $model4{$zh}->{$ap} += 1;
+ }
+ }
+}
+close M4;
+
my $specials = 0;
my $fc = 1000000;
my $sids = 1000000;
@@ -147,12 +175,14 @@ for my $f (sort keys %fdict) {
my $efcount = $re->{$e};
unless (defined $max) { $max = $efcount; }
my $m1 = $model1{$f}->{$e};
+ my $m4 = $model4{$f}->{$e};
my $im1 = $invm1{$e}->{$f};
- my $is_good_pair = (defined $m1);
+ my $is_good_pair = (defined $m1 || defined $m4);
my $is_inv_good_pair = (defined $im1);
- my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
+ my $total_eandf = $ecounts{$e} + $fcounts{$f};
+ my $dice = 2 * $efcount / $total_eandf;
my @feats;
- if ($efcount > $MIN_FEATURE_COUNT) {
+ if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
$fc++;
push @feats, "F$fc=1";
}
@@ -212,13 +242,6 @@ for my $f (sort keys %fdict) {
}
push @feats, "S$id=1";
}
- if ($ADD_PREFIX_ID) {
- if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
- my $pe = substr $oe, 0, 3;
- my $pf = substr $of, 0, 3;
- if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
- }
- }
if ($ADD_SIM) {
my $ld = 0;
my $eff = $len_e;
@@ -226,7 +249,7 @@ for my $f (sort keys %fdict) {
if (!$is_null) {
$ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
}
- if ($ld > 1.5) { $is_good_pair = 1; }
+ #if ($ld > 1.5) { $is_good_pair = 1; }
if ($BIN_ORTHO) {
push @feats, orthobin($ld) . '=1';
} else {
@@ -236,12 +259,21 @@ for my $f (sort keys %fdict) {
my $ident = ($e eq $f);
if ($ident) { $is_good_pair = 1; }
if ($ident && $ADD_ID) {
+ if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
+ if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
if ($BIN_IDENT) {
push @feats, identbin($len_e) . '=1';
} else {
push @feats, "Identical=$len_e";
}
}
+ if ($ADD_PREFIX_ID && !$ident) {
+ if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
+ my $pe = substr $oe, 0, 3;
+ my $pf = substr $of, 0, 3;
+ if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+ }
+ }
if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
$is_good_pair = 1;
if ($ADD_111) {