summary | refs | log | tree | commit | diff
path: root/word-aligner
diff options
context:
space:
mode:
Diffstat (limited to 'word-aligner')
-rwxr-xr-x  word-aligner/aligner.pl  47
-rwxr-xr-x  word-aligner/support/make_lex_grammar.pl  52
2 files changed, 44 insertions, 55 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 7821560f..e23c2beb 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -6,9 +6,10 @@ use Getopt::Long;
my $training_dir = "$SCRIPT_DIR/../training";
die "Can't find training dir: $training_dir" unless -d $training_dir;
-my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
+my $mkcls = '/Users/cdyer/software/giza-pp/mkcls-v2/mkcls';
my $num_classes = 50;
my $nodes = 40;
+my $TRAINING_ITERATIONS = 2000;
my $pmem = "2500mb";
my $DECODER = "cdec";
GetOptions("cdec=s" => \$DECODER,
@@ -16,15 +17,11 @@ GetOptions("cdec=s" => \$DECODER,
"pmem=s" => \$pmem,
"mkcls=s" => \$mkcls,
) or usage();
-usage() unless (scalar @ARGV == 3);
+usage() unless (scalar @ARGV == 1);
die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
my $in_file = shift @ARGV;
-my $m4 = shift @ARGV;
-my $im4 = shift @ARGV;
-die "Can't find model4: $m4" unless -f $m4;
-die "Can't find inverse model4: $im4" unless -f $im4;
die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
my $f_lang = $1;
@@ -32,13 +29,11 @@ my $e_lang = $2;
print STDERR "Source language: $f_lang\n";
print STDERR "Target language: $e_lang\n";
-print STDERR " Model 4 align: $m4\n";
-print STDERR "InModel 4 align: $im4\n";
print STDERR " Using mkcls in: $mkcls\n\n";
die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
-my @stages = qw(nopos relpos markov);
+my @stages = qw(markov);
my @directions = qw(f-e e-f);
my $corpus = 'c';
@@ -67,12 +62,8 @@ die unless $? == 0;
my @targets = qw(grammars);
for my $direction (@directions) {
- my $prev_stage = undef;
- for my $stage (@stages) {
- push @targets, "$stage-$direction";
- make_stage($stage, $direction, $prev_stage);
- $prev_stage = $stage;
- }
+ push @targets, "model-$direction";
+ make_stage($direction);
}
open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!";
@@ -84,8 +75,6 @@ SCRIPT_DIR = $SCRIPT_DIR
TRAINING_DIR = $training_dir
MKCLS = $mkcls
NCLASSES = $num_classes
-GIZAALIGN = $m4
-INVGIZAALIGN = $im4
TARGETS = @targets
PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
@@ -113,13 +102,12 @@ print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
exit 0;
sub make_stage {
- my ($stage, $direction, $prev_stage) = @_;
+ my ($direction) = @_;
my $stage_dir = "$align_dir/model-$direction";
my $first = $direction;
$first =~ s/^(.+)-.*$/$1/;
mkdir $stage_dir;
- my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
- open CDEC, ">$stage_dir/cdec.$stage.ini" or die;
+ open CDEC, ">$stage_dir/cdec.ini" or die "Can't write $stage_dir/cdec.ini: $!";
print CDEC <<EOT;
formalism=lextrans
intersection_strategy=full
@@ -127,23 +115,22 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
feature_function=LexicalPairIdentity
feature_function=InputIdentity
feature_function=OutputIdentity
+feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first
+feature_function=MarkovJump +b
+feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first
+feature_function=SourceBigram
+feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first
EOT
- if ($stage =~ /relpos/) {
- print CDEC "$RELPOS\n";
- } elsif ($stage =~ /markov/) {
- print CDEC "$RELPOS\n";
- print CDEC "feature_function=MarkovJump\n";
- print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
- print CDEC "feature_function=SourceBigram\n";
- print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
- }
close CDEC;
+ open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!";
+ print AGENDA "cdec.ini $TRAINING_ITERATIONS\n";
+ close AGENDA;
}
sub usage {
die <<EOT;
-Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3
+Usage: $0 [OPTIONS] training_corpus.fr-en
EOT
}
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index 3926fd8d..fb9d0214 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -5,7 +5,8 @@ use strict;
my $LIMIT_SIZE=30;
my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+
my %eclass = ();
my %fclass = ();
@@ -20,12 +21,12 @@ our %cache;
open EF, "<$effile" or die;
open M1, "<$model1" or die;
open IM1, "<$imodel1" or die;
-open M4, "<$gizaf2e" or die;
-open IM4, "<$gizae2f" or die;
+#open M4, "<$gizaf2e" or die;
+#open IM4, "<$gizae2f" or die;
+#binmode(M4,":utf8");
+#binmode(IM4,":utf8");
binmode(EF,":utf8");
binmode(M1,":utf8");
-binmode(M4,":utf8");
-binmode(IM4,":utf8");
binmode(IM1,":utf8");
binmode(STDOUT,":utf8");
my %model1;
@@ -105,7 +106,7 @@ my $ADD_DICE = 1;
my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
-my $ADD_NULL = 0;
+my $ADD_NULL = 1;
my $ADD_STEM_ID = 0;
my $ADD_SYM = 0;
my $BEAM_RATIO = 50;
@@ -115,6 +116,8 @@ my $BIN_IDENT = 1;
my $BIN_DICE = 1;
my $ADD_FIDENT = 0;
+if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
+
my %fdict;
my %fcounts;
my %ecounts;
@@ -146,24 +149,24 @@ while(<EF>) {
print STDERR "Loading Giza output...\n";
my %model4;
-while(<M4>) {
- my $en = <M4>; chomp $en;
- my $zh = <M4>; chomp $zh;
- die unless $zh =~ /^NULL \({/;
- my @ewords = split /\s+/, $en;
- my @chunks = split /\}\) ?/, $zh;
-
- for my $c (@chunks) {
- my ($zh, $taps) = split / \(\{ /, $c;
- if ($zh eq 'NULL') { $zh = '<eps>'; }
- my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
- #print "$zh -> @aps\n";
- for my $ap (@aps) {
- $model4{$zh}->{$ap} += 1;
- }
- }
-}
-close M4;
+#while(<M4>) {
+# my $en = <M4>; chomp $en;
+# my $zh = <M4>; chomp $zh;
+# die unless $zh =~ /^NULL \({/;
+# my @ewords = split /\s+/, $en;
+# my @chunks = split /\}\) ?/, $zh;
+#
+# for my $c (@chunks) {
+# my ($zh, $taps) = split / \(\{ /, $c;
+# if ($zh eq 'NULL') { $zh = '<eps>'; }
+# my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
+# #print "$zh -> @aps\n";
+# for my $ap (@aps) {
+# $model4{$zh}->{$ap} += 1;
+# }
+# }
+#}
+#close M4;
my $specials = 0;
my $fc = 1000000;
@@ -207,7 +210,6 @@ for my $f (sort keys %fdict) {
}
my $is_null = undef;
if ($ADD_NULL && $f eq '<eps>') {
- push @feats, "IsNull=1";
$is_null = 1;
}
if ($ADD_LEN) {