summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-10-27 13:55:23 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-10-27 13:55:23 +0000
commitecde8cb600b24c31b062f8f53d57641e3fa23379 (patch)
tree71dbc397a8a6aac6209bff294623b3230223fcd2
parent4e66b377ebb4b73d470c0efc573f5bda773b2972 (diff)
factored lexicon
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@692 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--decoder/Makefile.am1
-rw-r--r--decoder/factored_lexicon_helper.cc80
-rw-r--r--decoder/factored_lexicon_helper.h66
-rw-r--r--decoder/ff_tagger.cc60
-rw-r--r--decoder/ff_tagger.h7
5 files changed, 189 insertions, 25 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index bf368c6d..da0e5987 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -48,6 +48,7 @@ libcdec_a_SOURCES = \
hg_io.cc \
decoder.cc \
hg_intersect.cc \
+ factored_lexicon_helper.cc \
viterbi.cc \
lattice.cc \
aligner.cc \
diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc
new file mode 100644
index 00000000..7203b325
--- /dev/null
+++ b/decoder/factored_lexicon_helper.cc
@@ -0,0 +1,80 @@
+#include "factored_lexicon_helper.h"
+
+#include "filelib.h"
+#include "stringlib.h"
+
+using namespace std;
+
+FactoredLexiconHelper::FactoredLexiconHelper() :  // default: no source corpus and no target map are loaded
+ kNULL(TD::Convert("<eps>")),  // word id handed out for negative source positions
+ has_src_(false),
+ has_trg_(false) { InitEscape(); }  // only the escape table gets populated
+
+FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile) :  // loads the optional source corpus and target map
+ kNULL(TD::Convert("<eps>")),
+ has_src_(false),
+ has_trg_(false) {
+ if (srcfile.size() && srcfile != "*") {  // empty name or "*" means: no source-side corpus
+ ReadFile rf(srcfile);
+ has_src_ = true;
+ istream& in = *rf.stream();
+ string line;
+ while(in) {
+ getline(in, line);
+ if (!in) continue;  // the read failed; the while condition then ends the loop
+ vector<WordID> v;
+ TD::ConvertSentence(line, &v);
+ src_.push_back(v);  // one entry of src_ per line read from srcfile
+ }
+ }
+ if (trgmapfile.size() && trgmapfile != "*") {  // empty name or "*" means: no target-side map
+ ReadFile rf(trgmapfile);
+ has_trg_ = true;
+ istream& in = *rf.stream();
+ string line;
+ vector<string> v;
+ while(in) {
+ getline(in, line);
+ if (!in) continue;  // same end-of-input handling as in the source loop above
+ SplitOnWhitespace(line, &v);
+ if (v.size() != 2) {  // abort unless the split yields exactly two fields
+ cerr << "Error reading line in map file: " << line << endl;
+ abort();
+ }
+ WordID& to = trgmap_[TD::Convert(v[0])];  // operator[] inserts a zero-valued entry for a new key
+ if (to != 0) {  // a non-zero value means v[0] already has a mapping
+ cerr << "Duplicate entry for word " << v[0] << endl;
+ abort();
+ }
+ to = TD::Convert(v[1]);
+ }
+ }
+ InitEscape();
+}
+
+void FactoredLexiconHelper::InitEscape() {  // fills escape_ with replacement words for "=", ";" and ","
+ escape_[TD::Convert("=")] = TD::Convert("__EQ");
+ escape_[TD::Convert(";")] = TD::Convert("__SC");
+ escape_[TD::Convert(",")] = TD::Convert("__CO");
+}
+
+void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) {  // sets cur_src_ for the sentence described by smeta
+ if (has_src_) {
+ const int id = smeta.GetSentenceID();
+ assert(id < src_.size());  // NOTE(review): id is converted to unsigned here, so a negative id also fails; nothing checks it when asserts are compiled out
+ cur_src_ = src_[id];  // take the pre-loaded corpus line for this sentence id
+ } else {
+ cur_src_.resize(smeta.GetSourceLength());
+ for (int i = 0; i < cur_src_.size(); ++i) {
+ const vector<LatticeArc>& arcs = smeta.GetSourceLattice()[i];
+ assert(arcs.size() == 1); // only sentences supported for now
+ cur_src_[i] = arcs[0].label;  // copy the label of the single arc at position i
+ }
+ }
+ if (cur_src_.size() != smeta.GetSourceLength()) {  // NOTE(review): mismatch is only reported on stderr; execution continues - confirm this is intended
+ cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl;
+ cerr << " mapped len=" << cur_src_.size() << endl;
+ cerr << " actual len=" << smeta.GetSourceLength() << endl;
+ }
+}
+
diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h
new file mode 100644
index 00000000..81c75275
--- /dev/null
+++ b/decoder/factored_lexicon_helper.h
@@ -0,0 +1,66 @@
+#ifndef _FACTORED_LEXICON_HELPER_
+#define _FACTORED_LEXICON_HELPER_
+
+#include <cassert>
+#include <vector>
+#include <string>
+#include <map>
+#include "tdict.h"
+#include "sentence_metadata.h"
+
+// when computing features, it can be advantageous to:
+// 1) back off to less specific forms (e.g., less highly inflected forms, POS tags, etc)
+// 2) look at more specific forms (on the source ONLY)
+// this class helps you do both by creating a "corpus" view
+// should probably add a discussion of why the source can be "refined" by this class
+// but not the target. basically, this is because the source is on the right side of
+// the conditioning line in the model, and the target is on the left. the most specific
+// form must always be generated, but the "source" can include arbitrarily large
+// context.
+// this currently only works for sentence input to maintain simplicity of the code and
+// file formats, but there is no reason why it couldn't work with lattices / CFGs
+class FactoredLexiconHelper {
+ public:
+ // default constructor does no mapping
+ FactoredLexiconHelper();
+ // Either filename can be empty or * to indicate no mapping
+ FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile);
+
+ void PrepareForInput(const SentenceMetadata& smeta);  // selects the current source sentence (cur_src_) read by SourceWordAtPosition
+
+ inline WordID SourceWordAtPosition(const int i) const {  // escaped source word at position i of the current sentence
+ if (i < 0) return kNULL;  // negative positions yield the <eps> word
+ assert(i < cur_src_.size());
+ return Escape(cur_src_[i]);
+ }
+
+ inline WordID CoarsenedTargetWordForTarget(const WordID surface_target) const {  // mapped word if a mapping exists, else the surface word; always escaped
+ if (has_trg_) {
+ const WordWordMap::const_iterator it = trgmap_.find(surface_target);
+ if (it == trgmap_.end()) return Escape(surface_target);  // unmapped words must be escaped too, as in the no-map branch below
+ return Escape(it->second);
+ } else {
+ return Escape(surface_target);
+ }
+ }
+
+ private:
+ inline WordID Escape(WordID word) const {  // replacement from escape_ if one exists, otherwise word unchanged
+ const std::map<WordID,WordID>::const_iterator it = escape_.find(word);
+ if (it == escape_.end()) return word;
+ return it->second;
+ }
+
+ void InitEscape();  // populates escape_
+
+ const WordID kNULL;  // id of "<eps>"
+ bool has_src_;  // true once a source corpus file has been loaded
+ bool has_trg_;  // true once a target map file has been loaded
+ std::vector<std::vector<WordID> > src_;  // loaded source corpus, indexed by sentence id
+ typedef std::map<WordID, WordID> WordWordMap;
+ WordWordMap trgmap_;  // surface target word -> coarsened word
+ std::vector<WordID> cur_src_;  // source words of the current sentence
+ std::map<WordID,WordID> escape_;  // words that must not appear verbatim -> their replacements
+};
+
+#endif
diff --git a/decoder/ff_tagger.cc b/decoder/ff_tagger.cc
index 05de8ba3..21d0f812 100644
--- a/decoder/ff_tagger.cc
+++ b/decoder/ff_tagger.cc
@@ -1,9 +1,10 @@
#include "ff_tagger.h"
+#include <sstream>
+
#include "tdict.h"
#include "sentence_metadata.h"
-
-#include <sstream>
+#include "stringlib.h"
using namespace std;
@@ -52,23 +53,36 @@ void Tagger_BigramIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
}
-LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {}
+void LexicalPairIdentity::PrepareForInput(const SentenceMetadata& smeta) {  // forwards per-sentence setup to the lexicon helper
+ lexmap_->PrepareForInput(smeta);
+}
+
+LexicalPairIdentity::LexicalPairIdentity(const std::string& param) {  // param is empty, or "<name> <corpus.src.txt> <trgmap.txt>"
+ name_ = "Id";  // default prefix of the generated feature names
+ if (param.size()) {
+ // name corpus.f emap.txt
+ vector<string> params;
+ SplitOnWhitespace(param, &params);
+ if (params.size() != 3) {  // anything but exactly three fields is a fatal configuration error
+ cerr << "LexicalPairIdentity takes 3 parameters: <name> <corpus.src.txt> <trgmap.txt>\n";
+ cerr << " * may be used for corpus.src.txt or trgmap.txt to use surface forms\n";
+ cerr << " Received: " << param << endl;
+ abort();
+ }
+ name_ = params[0];
+ lexmap_.reset(new FactoredLexiconHelper(params[1], params[2]));
+ } else {
+ lexmap_.reset(new FactoredLexiconHelper);  // no parameters: helper without any mapping files
+ }
+}
void LexicalPairIdentity::FireFeature(WordID src,
- WordID trg,
- SparseVector<double>* features) const {
+ WordID trg,
+ SparseVector<double>* features) const {
int& fid = fmap_[src][trg];
if (!fid) {
- static map<WordID, WordID> escape;
- if (escape.empty()) {
- escape[TD::Convert("=")] = TD::Convert("__EQ");
- escape[TD::Convert(";")] = TD::Convert("__SC");
- escape[TD::Convert(",")] = TD::Convert("__CO");
- }
- if (escape.count(src)) src = escape[src];
- if (escape.count(trg)) trg = escape[trg];
ostringstream os;
- os << "Id:" << TD::Convert(src) << ':' << TD::Convert(trg);
+ os << name_ << ':' << TD::Convert(src) << ':' << TD::Convert(trg);
fid = FD::Convert(os.str());
}
features->set_value(fid, 1.0);
@@ -80,16 +94,14 @@ void LexicalPairIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,
SparseVector<double>* features,
SparseVector<double>* estimated_features,
void* context) const {
- const vector<WordID>& ew = edge.rule_->e_;
- const vector<WordID>& fw = edge.rule_->f_;
- for (int i = 0; i < ew.size(); ++i) {
- const WordID& e = ew[i];
- if (e <= 0) continue;
- for (int j = 0; j < fw.size(); ++j) {
- const WordID& f = fw[j];
- if (f <= 0) continue;
- FireFeature(f, e, features);
- }
+ // inline WordID SourceWordAtPosition(const int i);
+ // inline WordID CoarsenedTargetWordForTarget(const WordID surface_target);
+ if (edge.Arity() == 0) {
+ const WordID src = lexmap_->SourceWordAtPosition(edge.i_);
+ const vector<WordID>& ew = edge.rule_->e_;
+ assert(ew.size() == 1);
+ const WordID trg = lexmap_->CoarsenedTargetWordForTarget(ew[0]);
+ FireFeature(src, trg, features);
}
}
diff --git a/decoder/ff_tagger.h b/decoder/ff_tagger.h
index 9e47854e..6adee5ab 100644
--- a/decoder/ff_tagger.h
+++ b/decoder/ff_tagger.h
@@ -2,7 +2,9 @@
#define _FF_TAGGER_H_
#include <map>
+#include <boost/scoped_ptr.hpp>
#include "ff.h"
+#include "factored_lexicon_helper.h"
typedef std::map<WordID, int> Class2FID;
typedef std::map<WordID, Class2FID> Class2Class2FID;
@@ -33,6 +35,7 @@ class Tagger_BigramIdentity : public FeatureFunction {
class LexicalPairIdentity : public FeatureFunction {
public:
LexicalPairIdentity(const std::string& param);
+ virtual void PrepareForInput(const SentenceMetadata& smeta);
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
@@ -44,7 +47,9 @@ class LexicalPairIdentity : public FeatureFunction {
void FireFeature(WordID src,
WordID trg,
SparseVector<double>* features) const;
- mutable Class2Class2FID fmap_;
+ std::string name_; // used to construct feature string
+ boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source/target
+ mutable Class2Class2FID fmap_; // feature ids
};