diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-27 13:55:23 +0000 | 
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-27 13:55:23 +0000 | 
| commit | ecde8cb600b24c31b062f8f53d57641e3fa23379 (patch) | |
| tree | 71dbc397a8a6aac6209bff294623b3230223fcd2 /decoder | |
| parent | 4e66b377ebb4b73d470c0efc573f5bda773b2972 (diff) | |
factored lexicon
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@692 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
| -rw-r--r-- | decoder/Makefile.am | 1 | ||||
| -rw-r--r-- | decoder/factored_lexicon_helper.cc | 80 | ||||
| -rw-r--r-- | decoder/factored_lexicon_helper.h | 66 | ||||
| -rw-r--r-- | decoder/ff_tagger.cc | 60 | ||||
| -rw-r--r-- | decoder/ff_tagger.h | 7 | 
5 files changed, 189 insertions, 25 deletions
| diff --git a/decoder/Makefile.am b/decoder/Makefile.am index bf368c6d..da0e5987 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -48,6 +48,7 @@ libcdec_a_SOURCES = \    hg_io.cc \    decoder.cc \    hg_intersect.cc \ +  factored_lexicon_helper.cc \    viterbi.cc \    lattice.cc \    aligner.cc \ diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc new file mode 100644 index 00000000..7203b325 --- /dev/null +++ b/decoder/factored_lexicon_helper.cc @@ -0,0 +1,80 @@ +#include "factored_lexicon_helper.h" + +#include "filelib.h" +#include "stringlib.h" + +using namespace std; + +FactoredLexiconHelper::FactoredLexiconHelper() : +    kNULL(TD::Convert("<eps>")), +    has_src_(false), +    has_trg_(false) { InitEscape(); } + +FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile) : +    kNULL(TD::Convert("<eps>")), +    has_src_(false), +    has_trg_(false) { +  if (srcfile.size() && srcfile != "*") { +    ReadFile rf(srcfile); +    has_src_ = true; +    istream& in = *rf.stream(); +    string line; +    while(in) { +      getline(in, line); +      if (!in) continue; +      vector<WordID> v; +      TD::ConvertSentence(line, &v); +      src_.push_back(v); +    } +  } +  if (trgmapfile.size() && trgmapfile != "*") { +    ReadFile rf(trgmapfile); +    has_trg_ = true; +    istream& in = *rf.stream(); +    string line; +    vector<string> v; +    while(in) { +      getline(in, line); +      if (!in) continue; +      SplitOnWhitespace(line, &v); +      if (v.size() != 2) { +        cerr << "Error reading line in map file: " << line << endl; +        abort(); +      } +      WordID& to = trgmap_[TD::Convert(v[0])]; +      if (to != 0) { +        cerr << "Duplicate entry for word " << v[0] << endl; +        abort(); +      } +      to = TD::Convert(v[1]); +    } +  } +  InitEscape(); +} + +void FactoredLexiconHelper::InitEscape() { +  escape_[TD::Convert("=")] = TD::Convert("__EQ"); +  
escape_[TD::Convert(";")] = TD::Convert("__SC"); +  escape_[TD::Convert(",")] = TD::Convert("__CO"); +} + +void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) { +  if (has_src_) { +    const int id = smeta.GetSentenceID(); +    assert(id < src_.size()); +    cur_src_ = src_[id]; +  } else { +    cur_src_.resize(smeta.GetSourceLength()); +    for (int i = 0; i < cur_src_.size(); ++i) { +      const vector<LatticeArc>& arcs = smeta.GetSourceLattice()[i]; +      assert(arcs.size() == 1);    // only sentences supported for now +      cur_src_[i] = arcs[0].label; +    } +  } +  if (cur_src_.size() != smeta.GetSourceLength()) { +    cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl; +    cerr << "  mapped len=" << cur_src_.size() << endl; +    cerr << "  actual len=" << smeta.GetSourceLength() << endl; +  } +} + diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h new file mode 100644 index 00000000..81c75275 --- /dev/null +++ b/decoder/factored_lexicon_helper.h @@ -0,0 +1,66 @@ +#ifndef _FACTORED_LEXICON_HELPER_ +#define _FACTORED_LEXICON_HELPER_ + +#include <cassert> +#include <vector> +#include <string> +#include <map> +#include "tdict.h" +#include "sentence_metadata.h" + +// when computing features, it can be advantageous to: +//   1) back off to less specific forms (e.g., less highly inflected forms, POS tags, etc) +//   2) look at more specific forms (on the source ONLY) +// this class helps you do both by creating a "corpus" view +// should probably add a discussion of why the source can be "refined" by this class +// but not the target. basically, this is because the source is on the right side of +// the conditioning line in the model, and the target is on the left. the most specific +// form must always be generated, but the "source" can include arbitrarily large +// context. 
+// this currently only works for sentence input to maintain simplicity of the code and +// file formats, but there is no reason why it couldn't work with lattices / CFGs +class FactoredLexiconHelper { + public: +  // default constructor does no mapping +  FactoredLexiconHelper(); +  // Either filename can be empty or * to indicate no mapping +  FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile); + +  void PrepareForInput(const SentenceMetadata& smeta); + +  inline WordID SourceWordAtPosition(const int i) const { +    if (i < 0) return kNULL; +    assert(i < cur_src_.size()); +    return Escape(cur_src_[i]); +  } + +  inline WordID CoarsenedTargetWordForTarget(const WordID surface_target) const { +    if (has_trg_) { +      const WordWordMap::const_iterator it = trgmap_.find(surface_target); +      if (it == trgmap_.end()) return surface_target; +      return Escape(it->second); +    } else { +      return Escape(surface_target); +    } +  } + + private: +  inline WordID Escape(WordID word) const { +    const std::map<WordID,WordID>::const_iterator it = escape_.find(word); +    if (it == escape_.end()) return word; +    return it->second; +  } + +  void InitEscape(); + +  const WordID kNULL; +  bool has_src_; +  bool has_trg_; +  std::vector<std::vector<WordID> > src_; +  typedef std::map<WordID, WordID> WordWordMap; +  WordWordMap trgmap_; +  std::vector<WordID> cur_src_; +  std::map<WordID,WordID> escape_; +}; + +#endif diff --git a/decoder/ff_tagger.cc b/decoder/ff_tagger.cc index 05de8ba3..21d0f812 100644 --- a/decoder/ff_tagger.cc +++ b/decoder/ff_tagger.cc @@ -1,9 +1,10 @@  #include "ff_tagger.h" +#include <sstream> +  #include "tdict.h"  #include "sentence_metadata.h" - -#include <sstream> +#include "stringlib.h"  using namespace std; @@ -52,23 +53,36 @@ void Tagger_BigramIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,    }  } -LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {} +void 
LexicalPairIdentity::PrepareForInput(const SentenceMetadata& smeta) { +  lexmap_->PrepareForInput(smeta); +} + +LexicalPairIdentity::LexicalPairIdentity(const std::string& param) { +  name_ = "Id"; +  if (param.size()) { +    // name corpus.f emap.txt +    vector<string> params; +    SplitOnWhitespace(param, &params); +    if (params.size() != 3) { +      cerr << "LexicalPairIdentity takes 3 parameters: <name> <corpus.src.txt> <trgmap.txt>\n"; +      cerr << " * may be used for corpus.src.txt or trgmap.txt to use surface forms\n"; +      cerr << " Received: " << param << endl; +      abort(); +    } +    name_ = params[0]; +    lexmap_.reset(new FactoredLexiconHelper(params[1], params[2])); +  } else { +    lexmap_.reset(new FactoredLexiconHelper); +  } +}  void LexicalPairIdentity::FireFeature(WordID src, -                                 WordID trg, -                                 SparseVector<double>* features) const { +                                      WordID trg, +                                      SparseVector<double>* features) const {    int& fid = fmap_[src][trg];    if (!fid) { -    static map<WordID, WordID> escape; -    if (escape.empty()) { -      escape[TD::Convert("=")] = TD::Convert("__EQ"); -      escape[TD::Convert(";")] = TD::Convert("__SC"); -      escape[TD::Convert(",")] = TD::Convert("__CO"); -    } -    if (escape.count(src)) src = escape[src]; -    if (escape.count(trg)) trg = escape[trg];      ostringstream os; -    os << "Id:" << TD::Convert(src) << ':' << TD::Convert(trg); +    os << name_ << ':' << TD::Convert(src) << ':' << TD::Convert(trg);      fid = FD::Convert(os.str());    }    features->set_value(fid, 1.0); @@ -80,16 +94,14 @@ void LexicalPairIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,                                       SparseVector<double>* features,                                       SparseVector<double>* estimated_features,                                       void* context) const { -  const 
vector<WordID>& ew = edge.rule_->e_; -  const vector<WordID>& fw = edge.rule_->f_; -  for (int i = 0; i < ew.size(); ++i) { -    const WordID& e = ew[i]; -    if (e <= 0) continue; -    for (int j = 0; j < fw.size(); ++j) { -      const WordID& f = fw[j]; -      if (f <= 0) continue; -      FireFeature(f, e, features); -    } +  // inline WordID SourceWordAtPosition(const int i); +  // inline WordID CoarsenedTargetWordForTarget(const WordID surface_target); +  if (edge.Arity() == 0) { +    const WordID src = lexmap_->SourceWordAtPosition(edge.i_); +    const vector<WordID>& ew = edge.rule_->e_; +    assert(ew.size() == 1); +    const WordID trg = lexmap_->CoarsenedTargetWordForTarget(ew[0]); +    FireFeature(src, trg, features);    }  } diff --git a/decoder/ff_tagger.h b/decoder/ff_tagger.h index 9e47854e..6adee5ab 100644 --- a/decoder/ff_tagger.h +++ b/decoder/ff_tagger.h @@ -2,7 +2,9 @@  #define _FF_TAGGER_H_  #include <map> +#include <boost/scoped_ptr.hpp>  #include "ff.h" +#include "factored_lexicon_helper.h"  typedef std::map<WordID, int> Class2FID;  typedef std::map<WordID, Class2FID> Class2Class2FID; @@ -33,6 +35,7 @@ class Tagger_BigramIdentity : public FeatureFunction {  class LexicalPairIdentity : public FeatureFunction {   public:    LexicalPairIdentity(const std::string& param); +  virtual void PrepareForInput(const SentenceMetadata& smeta);   protected:    virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,                                       const Hypergraph::Edge& edge, @@ -44,7 +47,9 @@ class LexicalPairIdentity : public FeatureFunction {    void FireFeature(WordID src,                     WordID trg,                     SparseVector<double>* features) const; -  mutable Class2Class2FID fmap_; +  std::string name_;  // used to construct feature string +  boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source/target +  mutable Class2Class2FID fmap_; // feature ids  }; | 
