From 9a8cbe4db88e63378b6d3c4ec96438819f1f1131 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 9 Dec 2010 17:04:29 -0500 Subject: major refactor of markov features for word alignment --- word-aligner/aligner.pl | 14 ++++++++------ word-aligner/makefiles/makefile.grammars | 8 ++------ word-aligner/support/generate_word_pair_features.pl | 4 ++-- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'word-aligner') diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 81ac4198..f5ee5d3f 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -120,17 +120,19 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz feature_function=LexicalPairIdentity -feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second +# stem translation feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map +# POS translation +feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second feature_function=InputIdentity feature_function=OutputIdentity feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first -# the following two are deprecated -feature_function=MarkovJump +b -feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first +feature_function=NewJump +feature_function=NewJump use_binned_log_lengths flen +# jump distance and src and destination class type +feature_function=NewJump use_binned_log_lengths f0 fprev f:$align_dir/grammars/corpus.class.$first feature_function=SourceBigram -# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does -feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first +feature_function=SourceBigram SC $align_dir/grammars/corpus.class.$first EOT close CDEC; open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!"; diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index be0644df..1a069abf 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -1,14 +1,13 @@ -all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml +all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map clean: - $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs* + $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs* SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip ZCAT = zcat EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl -GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl @@ -84,6 +83,3 @@ corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@ -corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e - $(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@ - diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl index b28f6feb..54b89ce1 100755 --- a/word-aligner/support/generate_word_pair_features.pl +++ b/word-aligner/support/generate_word_pair_features.pl @@ -92,7 +92,7 @@ my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NULL = 1; my $ADD_MODEL1 = 1; -my $ADD_NOMODEL1 = 1; +my $ADD_NOMODEL1 = 0; my $BEAM_RATIO = 50; my $BIN_ORTHO = 1; my $BIN_DLEN = 1; @@ -171,7 +171,7 @@ for my $f (sort keys %fdict) { } if ($im1 > $MIN_MAGNITUDE) { push @feats, "InvModel1=$im1" if $im1; - } else { + } elsif ($ADD_NOMODEL1) { push @feats, 'NoInvModel1=1'; } my $am1 = sprintf("%.5g", sqrt($m1 * $im1)); -- cgit v1.2.3