From ecde8cb600b24c31b062f8f53d57641e3fa23379 Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 27 Oct 2010 13:55:23 +0000 Subject: factored lexicon git-svn-id: https://ws10smt.googlecode.com/svn/trunk@692 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/Makefile.am | 1 + decoder/factored_lexicon_helper.cc | 80 ++++++++++++++++++++++++++++++++++++++ decoder/factored_lexicon_helper.h | 66 +++++++++++++++++++++++++++++++ decoder/ff_tagger.cc | 60 ++++++++++++++++------------ decoder/ff_tagger.h | 7 +++- 5 files changed, 189 insertions(+), 25 deletions(-) create mode 100644 decoder/factored_lexicon_helper.cc create mode 100644 decoder/factored_lexicon_helper.h diff --git a/decoder/Makefile.am b/decoder/Makefile.am index bf368c6d..da0e5987 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -48,6 +48,7 @@ libcdec_a_SOURCES = \ hg_io.cc \ decoder.cc \ hg_intersect.cc \ + factored_lexicon_helper.cc \ viterbi.cc \ lattice.cc \ aligner.cc \ diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc new file mode 100644 index 00000000..7203b325 --- /dev/null +++ b/decoder/factored_lexicon_helper.cc @@ -0,0 +1,80 @@ +#include "factored_lexicon_helper.h" + +#include "filelib.h" +#include "stringlib.h" + +using namespace std; + +FactoredLexiconHelper::FactoredLexiconHelper() : + kNULL(TD::Convert("")), + has_src_(false), + has_trg_(false) { InitEscape(); } + +FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile) : + kNULL(TD::Convert("")), + has_src_(false), + has_trg_(false) { + if (srcfile.size() && srcfile != "*") { + ReadFile rf(srcfile); + has_src_ = true; + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) continue; + vector v; + TD::ConvertSentence(line, &v); + src_.push_back(v); + } + } + if (trgmapfile.size() && trgmapfile != "*") { + ReadFile rf(trgmapfile); + has_trg_ = true; + istream& in = *rf.stream(); + string line; + vector v; + while(in) 
{ + getline(in, line); + if (!in) continue; + SplitOnWhitespace(line, &v); + if (v.size() != 2) { + cerr << "Error reading line in map file: " << line << endl; + abort(); + } + WordID& to = trgmap_[TD::Convert(v[0])]; + if (to != 0) { + cerr << "Duplicate entry for word " << v[0] << endl; + abort(); + } + to = TD::Convert(v[1]); + } + } + InitEscape(); +} + +void FactoredLexiconHelper::InitEscape() { + escape_[TD::Convert("=")] = TD::Convert("__EQ"); + escape_[TD::Convert(";")] = TD::Convert("__SC"); + escape_[TD::Convert(",")] = TD::Convert("__CO"); +} + +void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) { + if (has_src_) { + const int id = smeta.GetSentenceID(); + assert(id < src_.size()); + cur_src_ = src_[id]; + } else { + cur_src_.resize(smeta.GetSourceLength()); + for (int i = 0; i < cur_src_.size(); ++i) { + const vector& arcs = smeta.GetSourceLattice()[i]; + assert(arcs.size() == 1); // only sentences supported for now + cur_src_[i] = arcs[0].label; + } + } + if (cur_src_.size() != smeta.GetSourceLength()) { + cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl; + cerr << " mapped len=" << cur_src_.size() << endl; + cerr << " actual len=" << smeta.GetSourceLength() << endl; + } +} + diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h new file mode 100644 index 00000000..81c75275 --- /dev/null +++ b/decoder/factored_lexicon_helper.h @@ -0,0 +1,66 @@ +#ifndef _FACTORED_LEXICON_HELPER_ +#define _FACTORED_LEXICON_HELPER_ + +#include +#include +#include +#include +#include "tdict.h" +#include "sentence_metadata.h" + +// when computing features, it can be advantageous to: +// 1) back off to less specific forms (e.g., less highly inflected forms, POS tags, etc) +// 2) look at more specific forms (on the source ONLY) +// this class helps you do both by creating a "corpus" view +// should probably add a discussion of why the source can be "refined" by 
this class +// but not the target. basically, this is because the source is on the right side of +// the conditioning line in the model, and the target is on the left. the most specific +// form must always be generated, but the "source" can include arbitrarily large +// context. +// this currently only works for sentence input to maintain simplicity of the code and +// file formats, but there is no reason why it couldn't work with lattices / CFGs +class FactoredLexiconHelper { + public: + // default constructor does no mapping + FactoredLexiconHelper(); + // Either filename can be empty or * to indicate no mapping + FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile); + + void PrepareForInput(const SentenceMetadata& smeta); + + inline WordID SourceWordAtPosition(const int i) const { + if (i < 0) return kNULL; + assert(i < cur_src_.size()); + return Escape(cur_src_[i]); + } + + inline WordID CoarsenedTargetWordForTarget(const WordID surface_target) const { + if (has_trg_) { + const WordWordMap::const_iterator it = trgmap_.find(surface_target); + if (it == trgmap_.end()) return surface_target; + return Escape(it->second); + } else { + return Escape(surface_target); + } + } + + private: + inline WordID Escape(WordID word) const { + const std::map::const_iterator it = escape_.find(word); + if (it == escape_.end()) return word; + return it->second; + } + + void InitEscape(); + + const WordID kNULL; + bool has_src_; + bool has_trg_; + std::vector > src_; + typedef std::map WordWordMap; + WordWordMap trgmap_; + std::vector cur_src_; + std::map escape_; +}; + +#endif diff --git a/decoder/ff_tagger.cc b/decoder/ff_tagger.cc index 05de8ba3..21d0f812 100644 --- a/decoder/ff_tagger.cc +++ b/decoder/ff_tagger.cc @@ -1,9 +1,10 @@ #include "ff_tagger.h" +#include + #include "tdict.h" #include "sentence_metadata.h" - -#include +#include "stringlib.h" using namespace std; @@ -52,23 +53,36 @@ void Tagger_BigramIdentity::TraversalFeaturesImpl(const 
SentenceMetadata& smeta, } } -LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {} +void LexicalPairIdentity::PrepareForInput(const SentenceMetadata& smeta) { + lexmap_->PrepareForInput(smeta); +} + +LexicalPairIdentity::LexicalPairIdentity(const std::string& param) { + name_ = "Id"; + if (param.size()) { + // name corpus.f emap.txt + vector params; + SplitOnWhitespace(param, &params); + if (params.size() != 3) { + cerr << "LexicalPairIdentity takes 3 parameters: \n"; + cerr << " * may be used for corpus.src.txt or trgmap.txt to use surface forms\n"; + cerr << " Received: " << param << endl; + abort(); + } + name_ = params[0]; + lexmap_.reset(new FactoredLexiconHelper(params[1], params[2])); + } else { + lexmap_.reset(new FactoredLexiconHelper); + } +} void LexicalPairIdentity::FireFeature(WordID src, - WordID trg, - SparseVector* features) const { + WordID trg, + SparseVector* features) const { int& fid = fmap_[src][trg]; if (!fid) { - static map escape; - if (escape.empty()) { - escape[TD::Convert("=")] = TD::Convert("__EQ"); - escape[TD::Convert(";")] = TD::Convert("__SC"); - escape[TD::Convert(",")] = TD::Convert("__CO"); - } - if (escape.count(src)) src = escape[src]; - if (escape.count(trg)) trg = escape[trg]; ostringstream os; - os << "Id:" << TD::Convert(src) << ':' << TD::Convert(trg); + os << name_ << ':' << TD::Convert(src) << ':' << TD::Convert(trg); fid = FD::Convert(os.str()); } features->set_value(fid, 1.0); @@ -80,16 +94,14 @@ void LexicalPairIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta, SparseVector* features, SparseVector* estimated_features, void* context) const { - const vector& ew = edge.rule_->e_; - const vector& fw = edge.rule_->f_; - for (int i = 0; i < ew.size(); ++i) { - const WordID& e = ew[i]; - if (e <= 0) continue; - for (int j = 0; j < fw.size(); ++j) { - const WordID& f = fw[j]; - if (f <= 0) continue; - FireFeature(f, e, features); - } + // inline WordID SourceWordAtPosition(const int i); + // inline 
WordID CoarsenedTargetWordForTarget(const WordID surface_target); + if (edge.Arity() == 0) { + const WordID src = lexmap_->SourceWordAtPosition(edge.i_); + const vector& ew = edge.rule_->e_; + assert(ew.size() == 1); + const WordID trg = lexmap_->CoarsenedTargetWordForTarget(ew[0]); + FireFeature(src, trg, features); } } diff --git a/decoder/ff_tagger.h b/decoder/ff_tagger.h index 9e47854e..6adee5ab 100644 --- a/decoder/ff_tagger.h +++ b/decoder/ff_tagger.h @@ -2,7 +2,9 @@ #define _FF_TAGGER_H_ #include +#include #include "ff.h" +#include "factored_lexicon_helper.h" typedef std::map Class2FID; typedef std::map Class2Class2FID; @@ -33,6 +35,7 @@ class Tagger_BigramIdentity : public FeatureFunction { class LexicalPairIdentity : public FeatureFunction { public: LexicalPairIdentity(const std::string& param); + virtual void PrepareForInput(const SentenceMetadata& smeta); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -44,7 +47,9 @@ class LexicalPairIdentity : public FeatureFunction { void FireFeature(WordID src, WordID trg, SparseVector* features) const; - mutable Class2Class2FID fmap_; + std::string name_; // used to construct feature string + boost::scoped_ptr lexmap_; // different view (stemmed, etc) of source/target + mutable Class2Class2FID fmap_; // feature ideas }; -- cgit v1.2.3