diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-22 23:29:11 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-22 23:29:11 +0000 |
commit | dd886ca6da84970ccb96b2f0155ff672e03f5b58 (patch) | |
tree | 78b5627347f3953539852cdd6b92053e844e87d4 | |
parent | 550019457302ecaaec6f72e912013a6fa9f2da67 (diff) |
handle translation from the null word
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@689 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/aligner.cc | 12 | ||||
-rw-r--r-- | decoder/decoder.cc | 2 | ||||
-rw-r--r-- | decoder/ff_wordalign.cc | 66 | ||||
-rw-r--r-- | decoder/ff_wordalign.h | 6 | ||||
-rwxr-xr-x | word-aligner/aligner.pl | 47 | ||||
-rwxr-xr-x | word-aligner/support/make_lex_grammar.pl | 52 |
6 files changed, 99 insertions, 86 deletions
diff --git a/decoder/aligner.cc b/decoder/aligner.cc index 92431be4..3f0c7347 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -24,8 +24,10 @@ void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g, if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) continue; // aligned to NULL (crf ibm variant only) - if (edge.prev_i_ == -1 || edge.i_ == -1) + if (edge.prev_i_ == -1 || edge.i_ == -1) { + cov.insert(-1); continue; + } assert(edge.j_ >= 0); assert(edge.prev_j_ >= 0); if (edge.Arity() == 0) { @@ -211,7 +213,7 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, // figure out the src and reference size; int src_size = src_sent.size(); int ref_size = trg_sent.size(); - Array2D<prob_t> align(src_size, ref_size, prob_t::Zero()); + Array2D<prob_t> align(src_size + 1, ref_size, prob_t::Zero()); for (int c = 0; c < g->edges_.size(); ++c) { const prob_t& p = edge_posteriors[c]; const set<int>& srcs = src_cov[c]; @@ -220,7 +222,7 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, si != srcs.end(); ++si) { for (set<int>::const_iterator ti = trgs.begin(); ti != trgs.end(); ++ti) { - align(*si, *ti) += p; + align(*si + 1, *ti) += p; } } } @@ -234,12 +236,12 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, for (int j = 0; j < ref_size; ++j) { if (use_soft_threshold) { threshold = prob_t::Zero(); - for (int i = 0; i < src_size; ++i) + for (int i = 0; i <= src_size; ++i) if (align(i, j) > threshold) threshold = align(i, j); //threshold *= prob_t(0.99); } for (int i = 0; i < src_size; ++i) - grid(i, j) = align(i, j) >= threshold; + grid(i, j) = align(i+1, j) >= threshold; } if (out == &cout) { // TODO need to do some sort of verbose flag diff --git a/decoder/decoder.cc b/decoder/decoder.cc index b975a5fc..eb983419 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -364,7 +364,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("beam_prune", po::value<double>(), "Prune paths from +LM forest, keep paths within exp(alpha>=0)") ("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices") ("promise_power",po::value<double>()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams. 0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit). note: for the same pop_limit, this gives more search error unless very close to 0 (recommend disabled; even 0.01 is slightly worse than 0) which is a bad sign and suggests this isn't doing a good job; further it's slightly slower to LM cube rescore with 0.01 compared to 0, as well as giving (very insignificantly) lower BLEU. TODO: test under more conditions, or try idea with different formula, or prob. cube beams.") - ("lexalign_use_null", "Support source-side null words in lexical translation") + ("lextrans_use_null", "Support source-side null words in lexical translation") ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set") ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index f2f07033..5f42b438 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -16,6 +16,8 @@ static const int MAX_SENTENCE_SIZE = 100; +static const int kNULL_i = 255; // -1 as an unsigned char + using namespace std; Model2BinaryFeatures::Model2BinaryFeatures(const string& ) : @@ -149,7 +151,11 @@ void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, int prev_src_pos, int cur_src_pos, SparseVector<double>* features) const { + if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i) + return; + const int jumpsize = cur_src_pos - prev_src_pos; + assert(smeta.GetSentenceID() < pos_.size()); const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; @@ -189,10 +195,13 @@ void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, } } -// std::vector<std::map<int, int> > flen2jump2fid_; MarkovJump::MarkovJump(const string& param) : FeatureFunction(1), fid_(FD::Convert("MarkovJump")), + fid_lex_null_(FD::Convert("JumpLexNull")), + fid_null_lex_(FD::Convert("JumpNullLex")), + fid_null_null_(FD::Convert("JumpNullNull")), + fid_lex_lex_(FD::Convert("JumpLexLex")), binary_params_(false) { cerr << " MarkovJump"; vector<string> argv; @@ -218,7 +227,7 @@ MarkovJump::MarkovJump(const string& param) : cerr << endl; } -// TODO handle NULLs according to Och 2000 +// TODO handle NULLs according to Och 2000? void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector<const void*>& ant_states, @@ -229,19 +238,20 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, const int flen = smeta.GetSourceLength(); if (edge.Arity() == 0) { dpstate = static_cast<unsigned int>(edge.i_); - if (edge.prev_i_ == 0) { - if (binary_params_) { - // NULL will be tricky - // TODO initial state distribution, not normal jumps + if (edge.prev_i_ == 0) { // first word in sentence + if (edge.i_ >= 0 && binary_params_) { const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second; features->set_value(fid, 1.0); + } else if (edge.i_ < 0 && binary_params_) { + // handled by bigram features } } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) { - // NULL will be tricky - if (binary_params_) { + if (edge.i_ >= 0 && binary_params_) { int jumpsize = flen - edge.i_; const int fid = flen2jump2fid_[flen].find(jumpsize)->second; features->set_value(fid, 1.0); + } else if (edge.i_ < 0 && binary_params_) { + // handled by bigram features } } } else if (edge.Arity() == 1) { @@ -253,13 +263,24 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, dpstate = static_cast<unsigned int>(left_index); else dpstate = static_cast<unsigned int>(right_index); - const int jumpsize = right_index - left_index; + if (left_index == kNULL_i || right_index == kNULL_i) { + if (left_index == kNULL_i && right_index == kNULL_i) + features->set_value(fid_null_null_, 1.0); + else if (left_index == kNULL_i) + features->set_value(fid_null_lex_, 1.0); + else + features->set_value(fid_lex_null_, 1.0); - if (binary_params_) { - const int fid = flen2jump2fid_[flen].find(jumpsize)->second; - features->set_value(fid, 1.0); } else { - features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def + features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled + const int jumpsize = right_index - left_index; + + if (binary_params_) { + const int fid = flen2jump2fid_[flen].find(jumpsize)->second; + features->set_value(fid, 1.0); + } else { + features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def + } } } else { assert(!"something really unexpected is happening"); @@ -294,15 +315,6 @@ void SourceBigram::FireFeature(WordID left, if (fid == 0) fid = -1; } if (fid > 0) features->set_value(fid, 1.0); - int& ufid = ufmap_[left]; - if (!ufid) { - ostringstream os; - os << "SU:"; - if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } - ufid = FD::Convert(os.str()); - if (ufid == 0) fid = -1; - } - if (ufid > 0) features->set_value(ufid, 1.0); } void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, @@ -386,8 +398,14 @@ void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, if (arity == 0) { assert(smeta.GetSentenceID() < pos_.size()); const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()]; - assert(edge.i_ < pos_sent.size()); - out_context = pos_sent[edge.i_]; + if (edge.i_ >= 0) { // non-NULL source + assert(edge.i_ < pos_sent.size()); + out_context = pos_sent[edge.i_]; + } else { // NULL source + // should assert that source is kNULL? + static const WordID kNULL = TD::Convert("<eps>"); + out_context = kNULL; + } out_word_count = edge.rule_->EWords(); assert(out_word_count == 1); // this is only defined for lex translation! // revisit this if you want to translate into null words diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index 30ddf7a1..0714229c 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -49,6 +49,11 @@ class MarkovJump : public FeatureFunction { void* out_context) const; private: const int fid_; + const int fid_lex_null_; + const int fid_null_lex_; + const int fid_null_null_; + const int fid_lex_lex_; + bool binary_params_; std::vector<std::map<int, int> > flen2jump2fid_; }; @@ -96,7 +101,6 @@ class SourceBigram : public FeatureFunction { WordID trg, SparseVector<double>* features) const; mutable Class2Class2FID fmap_; - mutable Class2FID ufmap_; }; class SourcePOSBigram : public FeatureFunction { diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 7821560f..e23c2beb 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,9 +6,10 @@ use Getopt::Long; my $training_dir = "$SCRIPT_DIR/../training"; die "Can't find training dir: $training_dir" unless -d $training_dir; -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; +my $mkcls = '/Users/cdyer/software/giza-pp/mkcls-v2/mkcls'; my $num_classes = 50; my $nodes = 40; +my $TRAINING_ITERATIONS = 2000; my $pmem = "2500mb"; my $DECODER = "cdec"; GetOptions("cdec=s" => \$DECODER, @@ -16,15 +17,11 @@ GetOptions("cdec=s" => \$DECODER, "pmem=s" => \$pmem, "mkcls=s" => \$mkcls, ) or usage(); -usage() unless (scalar @ARGV == 3); +usage() unless (scalar @ARGV == 1); die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; my $in_file = shift @ARGV; -my $m4 = shift @ARGV; -my $im4 = shift @ARGV; -die "Can't find model4: $m4" unless -f $m4; -die "Can't find inverse model4: $im4" unless -f $im4; die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; @@ -32,13 +29,11 @@ my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; -print STDERR " Model 4 align: $m4\n"; -print STDERR "InModel 4 align: $im4\n"; print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; -my @stages = qw(nopos relpos markov); +my @stages = qw(markov); my @directions = qw(f-e e-f); my $corpus = 'c'; @@ -67,12 +62,8 @@ die unless $? == 0; my @targets = qw(grammars); for my $direction (@directions) { - my $prev_stage = undef; - for my $stage (@stages) { - push @targets, "$stage-$direction"; - make_stage($stage, $direction, $prev_stage); - $prev_stage = $stage; - } + push @targets, "model-$direction"; + make_stage($direction); } open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!"; @@ -84,8 +75,6 @@ SCRIPT_DIR = $SCRIPT_DIR TRAINING_DIR = $training_dir MKCLS = $mkcls NCLASSES = $num_classes -GIZAALIGN = $m4 -INVGIZAALIGN = $im4 TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary @@ -113,13 +102,12 @@ print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; exit 0; sub make_stage { - my ($stage, $direction, $prev_stage) = @_; + my ($direction) = @_; my $stage_dir = "$align_dir/model-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; - my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; - open CDEC, ">$stage_dir/cdec.$stage.ini" or die; + open CDEC, ">$stage_dir/cdec.ini" or die "Can't write $stage_dir/cdec.ini: $!"; print CDEC <<EOT; formalism=lextrans intersection_strategy=full @@ -127,23 +115,22 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz feature_function=LexicalPairIdentity feature_function=InputIdentity feature_function=OutputIdentity +feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first +feature_function=MarkovJump +b +feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first +feature_function=SourceBigram +feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first EOT - if ($stage =~ /relpos/) { - print CDEC "$RELPOS\n"; - } elsif ($stage =~ /markov/) { - print CDEC "$RELPOS\n"; - print CDEC "feature_function=MarkovJump\n"; - print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; - print CDEC "feature_function=SourceBigram\n"; - print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n"; - } close CDEC; + open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!"; + print AGENDA "cdec.ini $TRAINING_ITERATIONS\n"; + close AGENDA; } sub usage { die <<EOT; -Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3 +Usage: $0 [OPTIONS] training_corpus.fr-en EOT } diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index 3926fd8d..fb9d0214 100755 --- a/word-aligner/support/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl @@ -5,7 +5,8 @@ use strict; my $LIMIT_SIZE=30; my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; + my %eclass = (); my %fclass = (); @@ -20,12 +21,12 @@ our %cache; open EF, "<$effile" or die; open M1, "<$model1" or die; open IM1, "<$imodel1" or die; -open M4, "<$gizaf2e" or die; -open IM4, "<$gizae2f" or die; +#open M4, "<$gizaf2e" or die; +#open IM4, "<$gizae2f" or die; +#binmode(M4,":utf8"); +#binmode(IM4,":utf8"); binmode(EF,":utf8"); binmode(M1,":utf8"); -binmode(M4,":utf8"); -binmode(IM4,":utf8"); binmode(IM1,":utf8"); binmode(STDOUT,":utf8"); my %model1; @@ -105,7 +106,7 @@ my $ADD_DICE = 1; my $ADD_111 = 1; my $ADD_ID = 1; my $ADD_PUNC = 1; -my $ADD_NULL = 0; +my $ADD_NULL = 1; my $ADD_STEM_ID = 0; my $ADD_SYM = 0; my $BEAM_RATIO = 50; @@ -115,6 +116,8 @@ my $BIN_IDENT = 1; my $BIN_DICE = 1; my $ADD_FIDENT = 0; +if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; } + my %fdict; my %fcounts; my %ecounts; @@ -146,24 +149,24 @@ while(<EF>) { print STDERR "Loading Giza output...\n"; my %model4; -while(<M4>) { - my $en = <M4>; chomp $en; - my $zh = <M4>; chomp $zh; - die unless $zh =~ /^NULL \({/; - my @ewords = split /\s+/, $en; - my @chunks = split /\}\) ?/, $zh; - - for my $c (@chunks) { - my ($zh, $taps) = split / \(\{ /, $c; - if ($zh eq 'NULL') { $zh = '<eps>'; } - my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); - #print "$zh -> @aps\n"; - for my $ap (@aps) { - $model4{$zh}->{$ap} += 1; - } - } -} -close M4; +#while(<M4>) { +# my $en = <M4>; chomp $en; +# my $zh = <M4>; chomp $zh; +# die unless $zh =~ /^NULL \({/; +# my @ewords = split /\s+/, $en; +# my @chunks = split /\}\) ?/, $zh; +# +# for my $c (@chunks) { +# my ($zh, $taps) = split / \(\{ /, $c; +# if ($zh eq 'NULL') { $zh = '<eps>'; } +# my @aps = map { $ewords[$_ - 1]; } (split / /, $taps); +# #print "$zh -> @aps\n"; +# for my $ap (@aps) { +# $model4{$zh}->{$ap} += 1; +# } +# } +#} +#close M4; my $specials = 0; my $fc = 1000000; @@ -207,7 +210,6 @@ for my $f (sort keys %fdict) { } my $is_null = undef; if ($ADD_NULL && $f eq '<eps>') { - push @feats, "IsNull=1"; $is_null = 1; } if ($ADD_LEN) { |