From 1305f38dba367f31754044cabc7b2bea9b8a2074 Mon Sep 17 00:00:00 2001 From: redpony Date: Fri, 29 Oct 2010 17:14:57 +0000 Subject: more wa git-svn-id: https://ws10smt.googlecode.com/svn/trunk@702 ec762483-ff6d-05da-a07a-a48fb63a330f --- word-aligner/aligner.pl | 8 ++++++-- word-aligner/makefiles/makefile.grammars | 20 +++++--------------- word-aligner/support/make_lex_grammar.pl | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 508dbd8d..f0733449 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -33,8 +33,7 @@ print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; -my @stages = qw(markov); -my @directions = qw(f-e e-f); +my @directions = qw(f-e); my $corpus = 'c'; @@ -112,8 +111,13 @@ sub make_stage { open CDEC, ">$stage_dir/cdec.ini" or die "Can't write $stage_dir/cdec.ini: $!"; print CDEC < $@ corpus.f-e.model1: corpus.f-e $(MODEL1) - $(MODEL1) corpus.f-e > corpus.f-e.model1 + $(MODEL1) corpus.f-e > $@ corpus.e-f.model1: corpus.e-f $(MODEL1) - $(MODEL1) corpus.e-f > corpus.e-f.model1 + $(MODEL1) corpus.e-f > $@ -bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) - $(RM) $@ - $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars - -corpus.f-e.lex-grammar.gz: bidir.grammars - $(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz - -corpus.e-f.lex-grammar.gz: bidir.grammars - $(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz +corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) + $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) | $(GZIP) -9 > $@ corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e $(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@ -corpus.e-f.sgml: e.voc corpus.e-f.lex-grammar.gz corpus.e-f - $(GENERATE_PSG) e.voc corpus.e-f corpus.e-f.lex-grammar.gz freq_grammar.e-f.gz psg.e-f $@ - diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index 8d38abda..2cc1be52 100755 --- a/word-aligner/support/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl @@ -182,9 +182,18 @@ for my $f (sort keys %fdict) { my $im1 = $invm1{$e}->{$f}; my $is_good_pair = (defined $m1 || defined $m4); my $is_inv_good_pair = (defined $im1); + my $ident = ($e eq $f); + if ($ident) { $is_good_pair = 1; } my $total_eandf = $ecounts{$e} + $fcounts{$f}; my $dice = 2 * $efcount / $total_eandf; my @feats; + if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { + $is_good_pair = 1; + if ($ADD_111) { + push @feats, "OneOneOne=1"; + } + } + next unless $is_good_pair; if (defined $m1 && $ADD_MODEL1) { push @feats, "Model1=$m1"; my $m1d = sprintf("%.5g", $m1 * $dice); @@ -259,8 +268,6 @@ for my $f (sort keys %fdict) { push @feats, "OrthoSim=$ld"; } } - my $ident = ($e eq $f); - if ($ident) { $is_good_pair = 1; } if ($ident && $ADD_ID) { if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; } if ($total_eandf < 8) { push @feats, "IdentRare=1"; } @@ -277,22 +284,15 @@ for my $f (sort keys %fdict) { if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } } } - if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { - $is_good_pair = 1; - if ($ADD_111) { - push @feats, "OneOneOne=1"; - } - } if ($ADD_PUNC) { - if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { + if ($f =~ /^[!,\-\/"'`:;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { push @feats, "PuncMiss=1"; } } my $is_special = ($is_good_pair && !(defined $m1)); $specials++ if $is_special; print STDERR "$f -> $e\n" if $is_special; - print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair; - print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair; + print "$f ||| $e ||| @feats\n" if $is_good_pair; } } print STDERR "Added $specials special rules that were not in the M1 set\n"; -- cgit v1.2.3