diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-27 13:55:23 +0000 | 
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-27 13:55:23 +0000 | 
| commit | ecde8cb600b24c31b062f8f53d57641e3fa23379 (patch) | |
| tree | 71dbc397a8a6aac6209bff294623b3230223fcd2 /decoder | |
| parent | 4e66b377ebb4b73d470c0efc573f5bda773b2972 (diff) | |
factored lexicon
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@692 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
| -rw-r--r-- | decoder/Makefile.am | 1 | ||||
| -rw-r--r-- | decoder/factored_lexicon_helper.cc | 80 | ||||
| -rw-r--r-- | decoder/factored_lexicon_helper.h | 66 | ||||
| -rw-r--r-- | decoder/ff_tagger.cc | 60 | ||||
| -rw-r--r-- | decoder/ff_tagger.h | 7 | 
5 files changed, 189 insertions, 25 deletions
| diff --git a/decoder/Makefile.am b/decoder/Makefile.am index bf368c6d..da0e5987 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -48,6 +48,7 @@ libcdec_a_SOURCES = \    hg_io.cc \    decoder.cc \    hg_intersect.cc \ +  factored_lexicon_helper.cc \    viterbi.cc \    lattice.cc \    aligner.cc \ diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc new file mode 100644 index 00000000..7203b325 --- /dev/null +++ b/decoder/factored_lexicon_helper.cc @@ -0,0 +1,80 @@ +#include "factored_lexicon_helper.h" + +#include "filelib.h" +#include "stringlib.h" + +using namespace std; + +FactoredLexiconHelper::FactoredLexiconHelper() : +    kNULL(TD::Convert("<eps>")), +    has_src_(false), +    has_trg_(false) { InitEscape(); } + +FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile) : +    kNULL(TD::Convert("<eps>")), +    has_src_(false), +    has_trg_(false) { +  if (srcfile.size() && srcfile != "*") { +    ReadFile rf(srcfile); +    has_src_ = true; +    istream& in = *rf.stream(); +    string line; +    while(in) { +      getline(in, line); +      if (!in) continue; +      vector<WordID> v; +      TD::ConvertSentence(line, &v); +      src_.push_back(v); +    } +  } +  if (trgmapfile.size() && trgmapfile != "*") { +    ReadFile rf(trgmapfile); +    has_trg_ = true; +    istream& in = *rf.stream(); +    string line; +    vector<string> v; +    while(in) { +      getline(in, line); +      if (!in) continue; +      SplitOnWhitespace(line, &v); +      if (v.size() != 2) { +        cerr << "Error reading line in map file: " << line << endl; +        abort(); +      } +      WordID& to = trgmap_[TD::Convert(v[0])]; +      if (to != 0) { +        cerr << "Duplicate entry for word " << v[0] << endl; +        abort(); +      } +      to = TD::Convert(v[1]); +    } +  } +  InitEscape(); +} + +void FactoredLexiconHelper::InitEscape() { +  escape_[TD::Convert("=")] = TD::Convert("__EQ"); +  
escape_[TD::Convert(";")] = TD::Convert("__SC"); +  escape_[TD::Convert(",")] = TD::Convert("__CO"); +} + +void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) { +  if (has_src_) { +    const int id = smeta.GetSentenceID(); +    assert(id < src_.size()); +    cur_src_ = src_[id]; +  } else { +    cur_src_.resize(smeta.GetSourceLength()); +    for (int i = 0; i < cur_src_.size(); ++i) { +      const vector<LatticeArc>& arcs = smeta.GetSourceLattice()[i]; +      assert(arcs.size() == 1);    // only sentences supported for now +      cur_src_[i] = arcs[0].label; +    } +  } +  if (cur_src_.size() != smeta.GetSourceLength()) { +    cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl; +    cerr << "  mapped len=" << cur_src_.size() << endl; +    cerr << "  actual len=" << smeta.GetSourceLength() << endl; +  } +} + diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h new file mode 100644 index 00000000..81c75275 --- /dev/null +++ b/decoder/factored_lexicon_helper.h @@ -0,0 +1,66 @@ +#ifndef _FACTORED_LEXICON_HELPER_ +#define _FACTORED_LEXICON_HELPER_ + +#include <cassert> +#include <vector> +#include <string> +#include <map> +#include "tdict.h" +#include "sentence_metadata.h" + +// when computing features, it can be advantageous to: +//   1) back off to less specific forms (e.g., less highly inflected forms, POS tags, etc) +//   2) look at more specific forms (on the source ONLY) +// this class helps you do both by creating a "corpus" view +// should probably add a discussion of why the source can be "refined" by this class +// but not the target. basically, this is because the source is on the right side of +// the conditioning line in the model, and the target is on the left. the most specific +// form must always be generated, but the "source" can include arbitrarily large +// context. 
+// this currently only works for sentence input to maintain simplicity of the code and +// file formats, but there is no reason why it couldn't work with lattices / CFGs +class FactoredLexiconHelper { + public: +  // default constructor does no mapping +  FactoredLexiconHelper(); +  // Either filename can be empty or * to indicate no mapping +  FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile); + +  void PrepareForInput(const SentenceMetadata& smeta); + +  inline WordID SourceWordAtPosition(const int i) const { +    if (i < 0) return kNULL; +    assert(i < cur_src_.size()); +    return Escape(cur_src_[i]); +  } + +  inline WordID CoarsenedTargetWordForTarget(const WordID surface_target) const { +    if (has_trg_) { +      const WordWordMap::const_iterator it = trgmap_.find(surface_target); +      if (it == trgmap_.end()) return surface_target; +      return Escape(it->second); +    } else { +      return Escape(surface_target); +    } +  } + + private: +  inline WordID Escape(WordID word) const { +    const std::map<WordID,WordID>::const_iterator it = escape_.find(word); +    if (it == escape_.end()) return word; +    return it->second; +  } + +  void InitEscape(); + +  const WordID kNULL; +  bool has_src_; +  bool has_trg_; +  std::vector<std::vector<WordID> > src_; +  typedef std::map<WordID, WordID> WordWordMap; +  WordWordMap trgmap_; +  std::vector<WordID> cur_src_; +  std::map<WordID,WordID> escape_; +}; + +#endif diff --git a/decoder/ff_tagger.cc b/decoder/ff_tagger.cc index 05de8ba3..21d0f812 100644 --- a/decoder/ff_tagger.cc +++ b/decoder/ff_tagger.cc @@ -1,9 +1,10 @@  #include "ff_tagger.h" +#include <sstream> +  #include "tdict.h"  #include "sentence_metadata.h" - -#include <sstream> +#include "stringlib.h"  using namespace std; @@ -52,23 +53,36 @@ void Tagger_BigramIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,    }  } -LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {} +void 
LexicalPairIdentity::PrepareForInput(const SentenceMetadata& smeta) { +  lexmap_->PrepareForInput(smeta); +} + +LexicalPairIdentity::LexicalPairIdentity(const std::string& param) { +  name_ = "Id"; +  if (param.size()) { +    // name corpus.f emap.txt +    vector<string> params; +    SplitOnWhitespace(param, &params); +    if (params.size() != 3) { +      cerr << "LexicalPairIdentity takes 3 parameters: <name> <corpus.src.txt> <trgmap.txt>\n"; +      cerr << " * may be used for corpus.src.txt or trgmap.txt to use surface forms\n"; +      cerr << " Received: " << param << endl; +      abort(); +    } +    name_ = params[0]; +    lexmap_.reset(new FactoredLexiconHelper(params[1], params[2])); +  } else { +    lexmap_.reset(new FactoredLexiconHelper); +  } +}  void LexicalPairIdentity::FireFeature(WordID src, -                                 WordID trg, -                                 SparseVector<double>* features) const { +                                      WordID trg, +                                      SparseVector<double>* features) const {    int& fid = fmap_[src][trg];    if (!fid) { -    static map<WordID, WordID> escape; -    if (escape.empty()) { -      escape[TD::Convert("=")] = TD::Convert("__EQ"); -      escape[TD::Convert(";")] = TD::Convert("__SC"); -      escape[TD::Convert(",")] = TD::Convert("__CO"); -    } -    if (escape.count(src)) src = escape[src]; -    if (escape.count(trg)) trg = escape[trg];      ostringstream os; -    os << "Id:" << TD::Convert(src) << ':' << TD::Convert(trg); +    os << name_ << ':' << TD::Convert(src) << ':' << TD::Convert(trg);      fid = FD::Convert(os.str());    }    features->set_value(fid, 1.0); @@ -80,16 +94,14 @@ void LexicalPairIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,                                       SparseVector<double>* features,                                       SparseVector<double>* estimated_features,                                       void* context) const { -  const 
vector<WordID>& ew = edge.rule_->e_; -  const vector<WordID>& fw = edge.rule_->f_; -  for (int i = 0; i < ew.size(); ++i) { -    const WordID& e = ew[i]; -    if (e <= 0) continue; -    for (int j = 0; j < fw.size(); ++j) { -      const WordID& f = fw[j]; -      if (f <= 0) continue; -      FireFeature(f, e, features); -    } +  // inline WordID SourceWordAtPosition(const int i); +  // inline WordID CoarsenedTargetWordForTarget(const WordID surface_target); +  if (edge.Arity() == 0) { +    const WordID src = lexmap_->SourceWordAtPosition(edge.i_); +    const vector<WordID>& ew = edge.rule_->e_; +    assert(ew.size() == 1); +    const WordID trg = lexmap_->CoarsenedTargetWordForTarget(ew[0]); +    FireFeature(src, trg, features);    }  } diff --git a/decoder/ff_tagger.h b/decoder/ff_tagger.h index 9e47854e..6adee5ab 100644 --- a/decoder/ff_tagger.h +++ b/decoder/ff_tagger.h @@ -2,7 +2,9 @@  #define _FF_TAGGER_H_  #include <map> +#include <boost/scoped_ptr.hpp>  #include "ff.h" +#include "factored_lexicon_helper.h"  typedef std::map<WordID, int> Class2FID;  typedef std::map<WordID, Class2FID> Class2Class2FID; @@ -33,6 +35,7 @@ class Tagger_BigramIdentity : public FeatureFunction {  class LexicalPairIdentity : public FeatureFunction {   public:    LexicalPairIdentity(const std::string& param); +  virtual void PrepareForInput(const SentenceMetadata& smeta);   protected:    virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,                                       const Hypergraph::Edge& edge, @@ -44,7 +47,9 @@ class LexicalPairIdentity : public FeatureFunction {    void FireFeature(WordID src,                     WordID trg,                     SparseVector<double>* features) const; -  mutable Class2Class2FID fmap_; +  std::string name_;  // used to construct feature string +  boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source/target +  mutable Class2Class2FID fmap_; // feature ids  }; | 
