From f412aaab3d10fb82b20a2190f2cb1424959c599a Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 29 Sep 2010 20:45:48 +0000 Subject: another feature, another POS git-svn-id: https://ws10smt.googlecode.com/svn/trunk@664 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/ff_wordalign.cc | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'decoder/ff_wordalign.cc') diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index a1968159..da86b714 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -266,6 +266,72 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, } } +// state: src word used, number of trg words generated +SourceBigram::SourceBigram(const std::string& param) : + FeatureFunction(sizeof(WordID) + sizeof(int)) { +} + +void SourceBigram::FinalTraversalFeatures(const void* context, + SparseVector* features) const { + WordID left = *static_cast(context); + int left_wc = *(static_cast(context) + 1); + if (left_wc == 1) + FireFeature(-1, left, features); + FireFeature(left, -1, features); +} + +void SourceBigram::FireFeature(WordID left, + WordID right, + SparseVector* features) const { + int& fid = fmap_[left][right]; + // TODO important important !!! escape strings !!! + if (!fid) { + ostringstream os; + os << "SB:"; + if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } + os << '_'; + if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } + fid = FD::Convert(os.str()); + if (fid == 0) fid = -1; + } + if (fid > 0) features->set_value(fid, 1.0); + int& ufid = ufmap_[left]; + if (!ufid) { + ostringstream os; + os << "SU:"; + if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } + ufid = FD::Convert(os.str()); + if (ufid == 0) fid = -1; + } + if (ufid > 0) features->set_value(ufid, 1.0); +} + +void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* /* estimated_features */, + void* context) const { + WordID& out_context = *static_cast(context); + int& out_word_count = *(static_cast(context) + 1); + const int arity = edge.Arity(); + if (arity == 0) { + out_context = edge.rule_->f()[0]; + out_word_count = edge.rule_->EWords(); + assert(out_word_count == 1); // this is only defined for lex translation! + // revisit this if you want to translate into null words + } else if (arity == 2) { + WordID left = *static_cast(ant_contexts[0]); + WordID right = *static_cast(ant_contexts[1]); + int left_wc = *(static_cast(ant_contexts[0]) + 1); + int right_wc = *(static_cast(ant_contexts[0]) + 1); + if (left_wc == 1 && right_wc == 1) + FireFeature(-1, left, features); + FireFeature(left, right, features); + out_word_count = left_wc + right_wc; + out_context = right; + } +} // state: POS of src word used, number of trg words generated SourcePOSBigram::SourcePOSBigram(const std::string& param) : FeatureFunction(sizeof(WordID) + sizeof(int)) { -- cgit v1.2.3