From c0565f28cd45ba7d6f478c1dc36a6f1b8aa47669 Mon Sep 17 00:00:00 2001 From: redpony Date: Fri, 22 Oct 2010 23:29:11 +0000 Subject: handle translation from the null word git-svn-id: https://ws10smt.googlecode.com/svn/trunk@689 ec762483-ff6d-05da-a07a-a48fb63a330f --- word-aligner/aligner.pl | 47 +++++++++++------------------ word-aligner/support/make_lex_grammar.pl | 52 +++++++++++++++++--------------- 2 files changed, 44 insertions(+), 55 deletions(-) (limited to 'word-aligner') diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 7821560f..e23c2beb 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,9 +6,10 @@ use Getopt::Long; my $training_dir = "$SCRIPT_DIR/../training"; die "Can't find training dir: $training_dir" unless -d $training_dir; -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; +my $mkcls = '/Users/cdyer/software/giza-pp/mkcls-v2/mkcls'; my $num_classes = 50; my $nodes = 40; +my $TRAINING_ITERATIONS = 2000; my $pmem = "2500mb"; my $DECODER = "cdec"; GetOptions("cdec=s" => \$DECODER, @@ -16,15 +17,11 @@ GetOptions("cdec=s" => \$DECODER, "pmem=s" => \$pmem, "mkcls=s" => \$mkcls, ) or usage(); -usage() unless (scalar @ARGV == 3); +usage() unless (scalar @ARGV == 1); die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; my $in_file = shift @ARGV; -my $m4 = shift @ARGV; -my $im4 = shift @ARGV; -die "Can't find model4: $m4" unless -f $m4; -die "Can't find inverse model4: $im4" unless -f $im4; die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; @@ -32,13 +29,11 @@ my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; -print STDERR " Model 4 align: $m4\n"; -print STDERR "InModel 4 align: $im4\n"; print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; -my @stages = qw(nopos relpos markov); +my @stages = qw(markov); my @directions = qw(f-e e-f); my $corpus = 'c'; @@ -67,12 +62,8 @@ die unless $? == 0; my @targets = qw(grammars); for my $direction (@directions) { - my $prev_stage = undef; - for my $stage (@stages) { - push @targets, "$stage-$direction"; - make_stage($stage, $direction, $prev_stage); - $prev_stage = $stage; - } + push @targets, "model-$direction"; + make_stage($direction); } open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!"; @@ -84,8 +75,6 @@ SCRIPT_DIR = $SCRIPT_DIR TRAINING_DIR = $training_dir MKCLS = $mkcls NCLASSES = $num_classes -GIZAALIGN = $m4 -INVGIZAALIGN = $im4 TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary @@ -113,13 +102,12 @@ print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; exit 0; sub make_stage { - my ($stage, $direction, $prev_stage) = @_; + my ($direction) = @_; my $stage_dir = "$align_dir/model-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; - my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; - open CDEC, ">$stage_dir/cdec.$stage.ini" or die; + open CDEC, ">$stage_dir/cdec.ini" or die "Can't write $stage_dir/cdec.ini: $!"; print CDEC <$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!"; + print AGENDA "cdec.ini $TRAINING_ITERATIONS\n"; + close AGENDA; } sub usage { die <'}='NUL'; $eclass{''} ='NUL'; } + my %fdict; my %fcounts; my %ecounts; @@ -146,24 +149,24 @@ while() { print STDERR "Loading Giza output...\n"; my %model4; -while() { - my $en = ; chomp $en; - my $zh = ; chomp $zh; - die unless $zh =~ /^NULL \({/; - my @ewords = split /\s+/, $en; - my @chunks = split /\}\) ?/, $zh; - - for my $c (@chunks) { - my ($zh, $taps) = split / \(\{ /, $c; - if ($zh eq 'NULL') { $zh = ''; } - my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); - #print "$zh -> @aps\n"; - for my $ap (@aps) { - $model4{$zh}->{$ap} += 1; - } - } -} -close M4; +#while() { +# my $en = ; chomp $en; +# my $zh = ; chomp $zh; +# die unless $zh =~ /^NULL \({/; +# my @ewords = split /\s+/, $en; +# my @chunks = split /\}\) ?/, $zh; +# +# for my $c (@chunks) { +# my ($zh, $taps) = split / \(\{ /, $c; +# if ($zh eq 'NULL') { $zh = ''; } +# my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); +# #print "$zh -> @aps\n"; +# for my $ap (@aps) { +# $model4{$zh}->{$ap} += 1; +# } +# } +#} +#close M4; my $specials = 0; my $fc = 1000000; @@ -207,7 +210,6 @@ for my $f (sort keys %fdict) { } my $is_null = undef; if ($ADD_NULL && $f eq '') { - push @feats, "IsNull=1"; $is_null = 1; } if ($ADD_LEN) { -- cgit v1.2.3