From 68fbe21c181d6804ebce52058ebccd1a0d77444c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 14 Mar 2013 23:38:41 -0400 Subject: source path features --- decoder/Makefile.am | 2 ++ decoder/cdec_ff.cc | 2 ++ decoder/ff_source_path.cc | 40 ++++++++++++++++++++++++++++++++++++++++ decoder/ff_source_path.h | 26 ++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100644 decoder/ff_source_path.cc create mode 100644 decoder/ff_source_path.h (limited to 'decoder') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 6499b38b..82b50f19 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -60,6 +60,7 @@ libcdec_a_SOURCES = \ ff_rules.h \ ff_ruleshape.h \ ff_sample_fsa.h \ + ff_source_path.h \ ff_source_syntax.h \ ff_spans.h \ ff_tagger.h \ @@ -140,6 +141,7 @@ libcdec_a_SOURCES = \ ff_wordalign.cc \ ff_csplit.cc \ ff_tagger.cc \ + ff_source_path.cc \ ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 3ab0f9f6..a60f2c33 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -14,6 +14,7 @@ #include "ff_rules.h" #include "ff_ruleshape.h" #include "ff_bleu.h" +#include "ff_source_path.h" #include "ff_source_syntax.h" #include "ff_register.h" #include "ff_charset.h" @@ -70,6 +71,7 @@ void register_feature_functions() { ff_registry.Register("InputIndicator", new FFFactory); ff_registry.Register("LexicalTranslationTrigger", new FFFactory); ff_registry.Register("WordPairFeatures", new FFFactory); + ff_registry.Register("SourthPathFeatures", new FFFactory); ff_registry.Register("WordSet", new FFFactory); ff_registry.Register("Dwarf", new FFFactory); ff_registry.Register("External", new FFFactory); diff --git a/decoder/ff_source_path.cc b/decoder/ff_source_path.cc new file mode 100644 index 00000000..d5fa6bb3 --- /dev/null +++ b/decoder/ff_source_path.cc @@ -0,0 +1,40 @@ +#include "ff_source_path.h" + +#include "hg.h" + +using namespace std; + +SourcePathFeatures::SourcePathFeatures(const string& param) : FeatureFunction(4) {} + +void SourcePathFeatures::FireBigramFeature(WordID prev, WordID cur, SparseVector* features) const { + int& fid = bigram_fids[prev][cur]; + if (!fid) fid = FD::Convert("SB:"+TD::Convert(prev) + "_" + TD::Convert(cur)); + if (fid) features->add_value(fid, 1.0); +} + +void SourcePathFeatures::FireUnigramFeature(WordID cur, SparseVector* features) const { + int& fid = unigram_fids[cur]; + if (!fid) fid = FD::Convert("SU:" + TD::Convert(cur)); + if (fid) features->add_value(fid, 1.0); +} + +void SourcePathFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + WordID* res = reinterpret_cast(context); + const vector& f = edge.rule_->f(); + int prev = 0; + for (unsigned i = 0; i < f.size(); ++i) { + int cur = f[i]; + if (cur <= 0) + cur = *reinterpret_cast(ant_contexts[cur]); + else + FireUnigramFeature(cur, features); + if (prev) FireBigramFeature(prev, cur, features); + prev = cur; + } + *res = prev; +} diff --git a/decoder/ff_source_path.h b/decoder/ff_source_path.h new file mode 100644 index 00000000..03126412 --- /dev/null +++ b/decoder/ff_source_path.h @@ -0,0 +1,26 @@ +#ifndef _FF_SOURCE_PATH_H_ +#define _FF_SOURCE_PATH_H_ + +#include +#include +#include "ff.h" + +class SourcePathFeatures : public FeatureFunction { + public: + SourcePathFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + + private: + void FireBigramFeature(WordID prev, WordID cur, SparseVector* features) const; + void FireUnigramFeature(WordID cur, SparseVector* features) const; + mutable std::map > bigram_fids; + mutable std::map unigram_fids; +}; + +#endif -- cgit v1.2.3 From 37162522a07551b261c445a92245445c5458f759 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 14 Mar 2013 23:46:46 -0400 Subject: fix source path --- decoder/cdec_ff.cc | 2 +- decoder/ff_source_path.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'decoder') diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index a60f2c33..0bf441d4 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -71,7 +71,7 @@ void register_feature_functions() { ff_registry.Register("InputIndicator", new FFFactory); ff_registry.Register("LexicalTranslationTrigger", new FFFactory); ff_registry.Register("WordPairFeatures", new FFFactory); - ff_registry.Register("SourthPathFeatures", new FFFactory); + ff_registry.Register("SourcePathFeatures", new FFFactory); ff_registry.Register("WordSet", new FFFactory); ff_registry.Register("Dwarf", new FFFactory); ff_registry.Register("External", new FFFactory); diff --git a/decoder/ff_source_path.cc b/decoder/ff_source_path.cc index d5fa6bb3..56cbfc48 100644 --- a/decoder/ff_source_path.cc +++ b/decoder/ff_source_path.cc @@ -30,7 +30,7 @@ void SourcePathFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, for (unsigned i = 0; i < f.size(); ++i) { int cur = f[i]; if (cur <= 0) - cur = *reinterpret_cast(ant_contexts[cur]); + cur = *reinterpret_cast(ant_contexts[-cur]); else FireUnigramFeature(cur, features); if (prev) FireBigramFeature(prev, cur, features); -- cgit v1.2.3 From cba324bee086f4dd0fc9df204d33ff7eb9b6c323 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 15 Mar 2013 00:17:55 -0400 Subject: fix bug --- decoder/ff_source_path.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_source_path.cc b/decoder/ff_source_path.cc index 56cbfc48..2a3bee2e 100644 --- a/decoder/ff_source_path.cc +++ b/decoder/ff_source_path.cc @@ -4,7 +4,7 @@ using namespace std; -SourcePathFeatures::SourcePathFeatures(const string& param) : FeatureFunction(4) {} +SourcePathFeatures::SourcePathFeatures(const string& param) : FeatureFunction(sizeof(int)) {} void SourcePathFeatures::FireBigramFeature(WordID prev, WordID cur, SparseVector* features) const { int& fid = bigram_fids[prev][cur]; @@ -27,10 +27,11 @@ void SourcePathFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, WordID* res = reinterpret_cast(context); const vector& f = edge.rule_->f(); int prev = 0; + unsigned ntc = 0; for (unsigned i = 0; i < f.size(); ++i) { int cur = f[i]; - if (cur <= 0) - cur = *reinterpret_cast(ant_contexts[-cur]); + if (cur < 0) + cur = *reinterpret_cast(ant_contexts[ntc++]); else FireUnigramFeature(cur, features); if (prev) FireBigramFeature(prev, cur, features); @@ -38,3 +39,4 @@ void SourcePathFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } *res = prev; } + -- cgit v1.2.3 From 8acded145b7f23b2c57a6ad93487f727b8a19b3b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 20 Mar 2013 12:24:01 -0400 Subject: n-gram word class features --- decoder/ff_ngrams.cc | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 5 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index 9c13fdbb..d337b28b 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -60,7 +60,7 @@ namespace { } } -static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector& prefixes, string& target_separator) { +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector& prefixes, string& target_separator, string* cluster_file) { vector const& argv=SplitOnWhitespace(in); *explicit_markers = false; *order = 3; @@ -103,6 +103,10 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, LMSPEC_NEXTARG; prefixes[5] = *i; break; + case 'c': + LMSPEC_NEXTARG; + *cluster_file = *i; + break; case 'S': LMSPEC_NEXTARG; target_separator = *i; @@ -124,6 +128,7 @@ usage: << "NgramFeatures Usage: \n" << " feature_function=NgramFeatures filename.lm [-x] [-o ] \n" + << " [-c ]\n" << " [-U ] [-B ][-T ]\n" << " [-4 <4-gram-prefix>] [-5 <5-gram-prefix>] [-S ]\n\n" @@ -203,6 +208,12 @@ class NgramDetectorImpl { SetFlag(flag, HAS_FULL_CONTEXT, state); } + WordID MapToClusterIfNecessary(WordID w) const { + if (cluster_map.size() == 0) return w; + if (w >= cluster_map.size()) return kCDEC_UNK; + return cluster_map[w]; + } + void FireFeatures(const State<5>& state, WordID cur, SparseVector* feats) { FidTree* ft = &fidroot_; int n = 0; @@ -285,7 +296,7 @@ class NgramDetectorImpl { context_complete = true; } } else { // handle terminal - const WordID cur_word = e[j]; + const WordID cur_word = MapToClusterIfNecessary(e[j]); SparseVector p; if (cur_word == kSOS_) { state = BeginSentenceState(); @@ -348,9 +359,52 @@ class NgramDetectorImpl { } } + void ReadClusterFile(const string& clusters) { + ReadFile rf(clusters); + istream& in = *rf.stream(); + string line; + int lc = 0; + string cluster; + string word; + while(getline(in, line)) { + ++lc; + if (line.size() == 0) continue; + if (line[0] == '#') continue; + unsigned cend = 1; + while((line[cend] != ' ' && line[cend] != '\t') && cend < line.size()) { + ++cend; + } + if (cend == line.size()) { + cerr << "Line " << lc << " in " << clusters << " malformed: " << line << endl; + abort(); + } + unsigned wbeg = cend + 1; + while((line[wbeg] == ' ' || line[wbeg] == '\t') && wbeg < line.size()) { + ++wbeg; + } + if (wbeg == line.size()) { + cerr << "Line " << lc << " in " << clusters << " malformed: " << line << endl; + abort(); + } + unsigned wend = wbeg + 1; + while((line[wend] != ' ' && line[wend] != '\t') && wend < line.size()) { + ++wend; + } + const WordID clusterid = TD::Convert(line.substr(0, cend)); + const WordID wordid = TD::Convert(line.substr(wbeg, wend - wbeg)); + if (wordid >= cluster_map.size()) + cluster_map.resize(wordid + 10, kCDEC_UNK); + cluster_map[wordid] = clusterid; + } + cluster_map[kSOS_] = kSOS_; + cluster_map[kEOS_] = kEOS_; + } + + vector cluster_map; + public: explicit NgramDetectorImpl(bool explicit_markers, unsigned order, - vector& prefixes, string& target_separator) : + vector& prefixes, string& target_separator, const string& clusters) : kCDEC_UNK(TD::Convert("")) , add_sos_eos_(!explicit_markers) { order_ = order; @@ -369,6 +423,9 @@ class NgramDetectorImpl { dummy_rule_.reset(new TRule("[DUMMY] ||| [BOS] [DUMMY] ||| [1] [2] ||| X=0")); kSOS_ = TD::Convert(""); kEOS_ = TD::Convert(""); + + if (clusters.size()) + ReadClusterFile(clusters); } ~NgramDetectorImpl() { @@ -409,9 +466,10 @@ NgramDetector::NgramDetector(const string& param) { vector prefixes; bool explicit_markers = false; unsigned order = 3; - ParseArgs(param, &explicit_markers, &order, prefixes, target_separator); + string clusters; + ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters); pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes, - target_separator); + target_separator, clusters); SetStateSize(pimpl_->ReserveStateSize()); } -- cgit v1.2.3 From 4db3a0164dcd97d7540ff4239548df39398fd23c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 23 Mar 2013 23:09:37 -0400 Subject: fix rules features --- decoder/ff_rules.cc | 20 ++++++++++++++++---- decoder/ff_rules.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_rules.cc b/decoder/ff_rules.cc index 6716d3da..410e083c 100644 --- a/decoder/ff_rules.cc +++ b/decoder/ff_rules.cc @@ -107,7 +107,12 @@ void RuleSourceBigramFeatures::TraversalFeaturesImpl(const SentenceMetadata& sme (*features) += it->second; } -RuleTargetBigramFeatures::RuleTargetBigramFeatures(const std::string& param) { +RuleTargetBigramFeatures::RuleTargetBigramFeatures(const std::string& param) : inds(1000) { + for (unsigned i = 0; i < inds.size(); ++i) { + ostringstream os; + os << (i + 1); + inds[i] = os.str(); + } } void RuleTargetBigramFeatures::PrepareForInput(const SentenceMetadata& smeta) { @@ -126,11 +131,18 @@ void RuleTargetBigramFeatures::TraversalFeaturesImpl(const SentenceMetadata& sme it = rule2_feats_.insert(make_pair(&rule, SparseVector())).first; SparseVector& f = it->second; string prev = ""; + vector nt_types(rule.Arity()); + unsigned ntc = 0; + for (int i = 0; i < rule.f_.size(); ++i) + if (rule.f_[i] < 0) nt_types[ntc++] = -rule.f_[i]; for (int i = 0; i < rule.e_.size(); ++i) { WordID w = rule.e_[i]; - if (w < 0) w = -w; - if (w == 0) return; - const string& cur = TD::Convert(w); + string cur; + if (w > 0) { + cur = TD::Convert(w); + } else { + cur = TD::Convert(nt_types[-w]) + inds[-w]; + } ostringstream os; os << "RBT:" << prev << '_' << cur; const int fid = FD::Convert(Escape(os.str())); diff --git a/decoder/ff_rules.h b/decoder/ff_rules.h index b100ec34..f210dc65 100644 --- a/decoder/ff_rules.h +++ b/decoder/ff_rules.h @@ -51,6 +51,7 @@ class RuleTargetBigramFeatures : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: + std::vector inds; mutable std::map > rule2_feats_; }; -- cgit v1.2.3 From 0e46089cafa4e8e2f060e370d7afaceeda6b90a9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 22 Apr 2013 22:50:14 -0400 Subject: support emission probabilities in class-based LMs --- decoder/ff_klm.cc | 49 ++++++++++++++++++++++++++++++------------------- decoder/ff_klm.h | 5 +++-- 2 files changed, 33 insertions(+), 21 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index fefa90bd..c8ca917a 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -1,6 +1,7 @@ #include "ff_klm.h" #include +#include #include #include @@ -151,8 +152,9 @@ template class BoundaryRuleScore { template class KLanguageModelImpl { public: - double LookupWords(const TRule& rule, const vector& ant_states, double* oovs, void* remnant) { + double LookupWords(const TRule& rule, const vector& ant_states, double* oovs, double* emit, void* remnant) { *oovs = 0; + *emit = 0; const vector& e = rule.e(); BoundaryRuleScore ruleScore(*ngram_, *static_cast(remnant)); unsigned i = 0; @@ -169,8 +171,9 @@ class KLanguageModelImpl { if (e[i] <= 0) { ruleScore.NonTerminal(*static_cast(ant_states[-e[i]])); } else { - const WordID cdec_word_or_class = ClassifyWordIfNecessary(e[i]); // in future, - // maybe handle emission + float ep = 0.f; + const WordID cdec_word_or_class = ClassifyWordIfNecessary(e[i], &ep); + if (ep) { *emit += ep; } const lm::WordIndex cur_word = MapWord(cdec_word_or_class); // map to LM's id if (cur_word == 0) (*oovs) += 1.0; ruleScore.Terminal(cur_word); @@ -205,12 +208,14 @@ class KLanguageModelImpl { // if this is not a class-based LM, returns w untransformed, // otherwise returns a word class mapping of w, // returns TD::Convert("") if there is no mapping for w - WordID ClassifyWordIfNecessary(WordID w) const { + WordID ClassifyWordIfNecessary(WordID w, float* emitp) const { if (word2class_map_.empty()) return w; if (w >= word2class_map_.size()) return kCDEC_UNK; - else - return word2class_map_[w]; + else { + *emitp = word2class_map_[w].second; + return word2class_map_[w].first; + } } // converts to cdec word id's to KenLM's id space, OOVs and end up at 0 @@ -256,32 +261,32 @@ class KLanguageModelImpl { int lc = 0; if (!SILENT) cerr << " Loading word classes from " << file << " ...\n"; - AddWordToClassMapping_(TD::Convert(""), TD::Convert("")); - AddWordToClassMapping_(TD::Convert(""), TD::Convert("")); - while(in) { - getline(in, line); - if (!in) continue; + AddWordToClassMapping_(TD::Convert(""), TD::Convert(""), 0.0); + AddWordToClassMapping_(TD::Convert(""), TD::Convert(""), 0.0); + while(getline(in, line)) { dummy.clear(); TD::ConvertSentence(line, &dummy); ++lc; - if (dummy.size() != 2) { + if (dummy.size() != 3) { + cerr << " Class map file expects: CLASS WORD logp(WORD|CLASS)\n"; cerr << " Format error in " << file << ", line " << lc << ": " << line << endl; abort(); } - AddWordToClassMapping_(dummy[0], dummy[1]); + AddWordToClassMapping_(dummy[1], dummy[0], strtof(TD::Convert(dummy[2]).c_str(), NULL)); } } - void AddWordToClassMapping_(WordID word, WordID cls) { + void AddWordToClassMapping_(WordID word, WordID cls, float emit) { if (word2class_map_.size() <= word) { - word2class_map_.resize((word + 10) * 1.1, kCDEC_UNK); + word2class_map_.resize((word + 10) * 1.1, pair(kCDEC_UNK,0.f)); assert(word2class_map_.size() > word); } - if(word2class_map_[word] != kCDEC_UNK) { + if(word2class_map_[word].first != kCDEC_UNK) { cerr << "Multiple classes for symbol " << TD::Convert(word) << endl; abort(); } - word2class_map_[word] = cls; + word2class_map_[word].first = cls; + word2class_map_[word].second = emit; } ~KLanguageModelImpl() { @@ -304,7 +309,9 @@ class KLanguageModelImpl { int order_; vector cdec2klm_map_; - vector word2class_map_; // if this is a class-based LM, this is the word->class mapping + vector > word2class_map_; // if this is a class-based LM, + // .first is the word->class mapping + // .second is the emission log probability }; template @@ -322,6 +329,7 @@ KLanguageModel::KLanguageModel(const string& param) { } fid_ = FD::Convert(featname); oov_fid_ = FD::Convert(featname+"_OOV"); + emit_fid_ = FD::Convert(featname+"_Emit"); // cerr << "FID: " << oov_fid_ << endl; SetStateSize(pimpl_->ReserveStateSize()); } @@ -340,9 +348,12 @@ void KLanguageModel::TraversalFeaturesImpl(const SentenceMetadata& /* sme void* state) const { double est = 0; double oovs = 0; - features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, &oovs, state)); + double emit = 0; + features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, &oovs, &emit, state)); if (oovs && oov_fid_) features->set_value(oov_fid_, oovs); + if (emit && emit_fid_) + features->set_value(emit_fid_, emit); } template diff --git a/decoder/ff_klm.h b/decoder/ff_klm.h index b5ceffd0..db4032f7 100644 --- a/decoder/ff_klm.h +++ b/decoder/ff_klm.h @@ -28,8 +28,9 @@ class KLanguageModel : public FeatureFunction { SparseVector* estimated_features, void* out_context) const; private: - int fid_; // conceptually const; mutable only to simplify constructor - int oov_fid_; // will be zero if extra OOV feature is not configured by decoder + int fid_; // LanguageModel + int oov_fid_; // LanguageModel_OOV + int emit_fid_; // LanguageModel_Emit [only used for class-based LMs] KLanguageModelImpl* pimpl_; }; -- cgit v1.2.3