From 6b6a2d966a0d341fe5abee8b332a9d89f6c95bc4 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 13 Jan 2014 17:15:24 +0100 Subject: Felix' https://github.com/felleh lexical word alignment features --- decoder/Makefile.am | 1 + decoder/cdec_ff.cc | 2 + decoder/ff_lexical.h | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 decoder/ff_lexical.h (limited to 'decoder') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index b735756d..c0371081 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -48,6 +48,7 @@ libcdec_a_SOURCES = \ ff_external.h \ ff_factory.h \ ff_klm.h \ + ff_lexical.h \ ff_lm.h \ ff_ngrams.h \ ff_parse_match.h \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index b2541722..80b42d22 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -24,6 +24,7 @@ #include "ff_charset.h" #include "ff_wordset.h" #include "ff_external.h" +#include "ff_lexical.h" void register_feature_functions() { @@ -39,6 +40,7 @@ void register_feature_functions() { RegisterFF(); RegisterFF(); RegisterFF(); + RegisterFF(); //TODO: use for all features the new Register which requires static FF::usage(false,false) give name ff_registry.Register("SpanFeatures", new FFFactory()); diff --git a/decoder/ff_lexical.h b/decoder/ff_lexical.h new file mode 100644 index 00000000..21c85b27 --- /dev/null +++ b/decoder/ff_lexical.h @@ -0,0 +1,128 @@ +#ifndef FF_LEXICAL_H_ +#define FF_LEXICAL_H_ + +#include +#include +#include "trule.h" +#include "ff.h" +#include "hg.h" +#include "array2d.h" +#include "wordid.h" +#include +#include +#include + +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "lattice.h" +#include "fdict.h" +#include "verbose.h" +#include "tdict.h" +#include "hg.h" + +using namespace std; + +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + +class LexicalFeatures : public FeatureFunction { +public: + LexicalFeatures(const std::string& param) { + if (param.empty()) { + cerr << "LexicalFeatures: using T,D,I\n"; + T_ = true; I_ = true; D_ = true; + } else { + const vector argv = SplitOnWhitespace(param); + assert(argv.size() == 3); + T_ = (bool) atoi(argv[0].c_str()); + I_ = (bool) atoi(argv[1].c_str()); + D_ = (bool) atoi(argv[2].c_str()); + cerr << "T=" << T_ << " I=" << I_ << " D=" << D_ << endl; + } + }; + static std::string usage(bool p,bool d) { + return usage_helper("LexicalFeatures","[0/1 0/1 0/1]","Sparse lexical word translation indicator features. If arguments are supplied, specify like this: translations insertions deletions",p,d); + } +protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); +private: + mutable std::map > rule2feats_; + bool T_; + bool I_; + bool D_; +}; + +void LexicalFeatures::PrepareForInput(const SentenceMetadata& smeta) { + rule2feats_.clear(); // std::map > +} + +void LexicalFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + + map >::iterator it = rule2feats_.find(edge.rule_.get()); + if (it == rule2feats_.end()) { + const TRule& rule = *edge.rule_; + it = rule2feats_.insert(make_pair(&rule, SparseVector())).first; + SparseVector& f = it->second; + std::vector sf(edge.rule_->FLength(),false); // stores if source tokens are visited by alignment points + std::vector se(edge.rule_->ELength(),false); // stores if target tokens are visited by alignment points + int fid = 0; + // translations + for (unsigned i=0;i 0) {// if not visited and is terminal + ostringstream os; + os << "LD:" << Escape(TD::Convert(rule.f_[i])); + fid = FD::Convert(os.str()); + if (fid <= 0) continue; + if (D_) + f.add_value(fid, 1.0); + } + } + // word insertions + for (unsigned i=0;i= 1) {// if not visited and is terminal + ostringstream os; + os << "LI:" << Escape(TD::Convert(rule.e_[i])); + fid = FD::Convert(os.str()); + if (fid <= 0) continue; + if (I_) + f.add_value(fid, 1.0); + } + } + } + (*features) += it->second; +} + + +#endif -- cgit v1.2.3 From a1f3f7fb262b6efefb106a7bddaf81d2a3a6df93 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 13 Jan 2014 18:30:58 +0100 Subject: remove duplicate word alignment features --- decoder/cdec_ff.cc | 1 - decoder/ff_rules.cc | 22 ---------------------- decoder/ff_rules.h | 13 ------------- training/dtrain/examples/standard/cdec.ini | 1 - 4 files changed, 37 deletions(-) (limited to 'decoder') diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 80b42d22..8689a615 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -47,7 +47,6 @@ void register_feature_functions() { ff_registry.Register("NgramFeatures", new FFFactory()); ff_registry.Register("RuleContextFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); - ff_registry.Register("RuleWordAlignmentFeatures", new FFFactory()); ff_registry.Register("ParseMatchFeatures", new FFFactory); ff_registry.Register("SoftSyntaxFeatures", new FFFactory); ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory); diff --git a/decoder/ff_rules.cc b/decoder/ff_rules.cc index 7bccf084..9533caed 100644 --- a/decoder/ff_rules.cc +++ b/decoder/ff_rules.cc @@ -69,28 +69,6 @@ void RuleIdentityFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, features->add_value(it->second, 1); } -RuleWordAlignmentFeatures::RuleWordAlignmentFeatures(const std::string& param) { -} - -void RuleWordAlignmentFeatures::PrepareForInput(const SentenceMetadata& smeta) { -} - -void RuleWordAlignmentFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - const TRule& rule = *edge.rule_; - ostringstream os; - vector als = rule.als(); - std::vector::const_iterator xx = als.begin(); - for (; xx != als.end(); ++xx) { - os << "WA:" << TD::Convert(rule.f_[xx->s_]) << ":" << TD::Convert(rule.e_[xx->t_]); - } - features->add_value(FD::Convert(Escape(os.str())), 1); -} - RuleSourceBigramFeatures::RuleSourceBigramFeatures(const std::string& param) { } diff --git a/decoder/ff_rules.h b/decoder/ff_rules.h index 324d7a39..f210dc65 100644 --- a/decoder/ff_rules.h +++ b/decoder/ff_rules.h @@ -24,19 +24,6 @@ class RuleIdentityFeatures : public FeatureFunction { mutable std::map rule2_fid_; }; -class RuleWordAlignmentFeatures : public FeatureFunction { - public: - RuleWordAlignmentFeatures(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); -}; - class RuleSourceBigramFeatures : public FeatureFunction { public: RuleSourceBigramFeatures(const std::string& param); diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini index 044ae2f5..3330dd71 100644 --- a/training/dtrain/examples/standard/cdec.ini +++ b/training/dtrain/examples/standard/cdec.ini @@ -21,7 +21,6 @@ feature_function=RuleIdentityFeatures feature_function=RuleSourceBigramFeatures feature_function=RuleTargetBigramFeatures feature_function=RuleShape -#feature_function=RuleWordAlignmentFeatures feature_function=LexicalFeatures 1 1 1 #feature_function=SourceSpanSizeFeatures #feature_function=SourceWordPenalty -- cgit v1.2.3 From 7cdaf68c0189b9ad0e4ba4482e4c75d7643ba982 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 29 Jul 2014 15:02:22 +0200 Subject: make PassThrough1..N optional --- decoder/decoder.cc | 1 + decoder/scfg_translator.cc | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 6783cad0..2c044192 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -366,6 +366,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("beam_prune3", po::value(), "Optional pass 3") ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") + ("add_extra_pass_through_features,Q",po::value()->default_value(6), "Add PassThrough{1..N} features, capped at N.") ("k_best,k",po::value(),"Extract the k best derivations") ("unique_k_best,r", "Unique k-best translation list") ("aligner,a", "Run as a word/phrase aligner (src & ref required)") diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc index 88f62769..57ad85e2 100644 --- a/decoder/scfg_translator.cc +++ b/decoder/scfg_translator.cc @@ -28,7 +28,7 @@ struct GlueGrammar : public TextGrammar { }; struct PassThroughGrammar : public TextGrammar { - PassThroughGrammar(const Lattice& input, const std::string& cat, const unsigned int ctf_level=0); + PassThroughGrammar(const Lattice& input, const std::string& cat, const unsigned int ctf_level=0, const unsigned int num_pt_features=0); virtual bool HasRuleForSpan(int i, int j, int distance) const; }; @@ -56,7 +56,7 @@ bool GlueGrammar::HasRuleForSpan(int i, int /* j */, int /* distance */) const { return (i == 0); } -PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const unsigned int ctf_level) { +PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const unsigned int ctf_level, const unsigned num_pt_features) { unordered_set ss; for (int i = 0; i < input.size(); ++i) { const vector& alts = input[i]; @@ -64,12 +64,17 @@ PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const int j = alts[k].dist2next + i; const string& src = TD::Convert(alts[k].label); if (ss.count(alts[k].label) == 0) { - int length = static_cast(log(UTF8StringLen(src)) / log(1.6)) + 1; - if (length > 6) length = 6; - string len_feat = "PassThrough_0=1"; - len_feat[12] += length; - TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1 " + len_feat)); - pt->a_.push_back(AlignmentPoint(0,0)); + TRulePtr pt; + if (num_pt_features > 0) { + int length = static_cast(log(UTF8StringLen(src)) / log(1.6)) + 1; + if (length > num_pt_features) length = num_pt_features; + string len_feat = "PassThrough_0=1"; + len_feat[12] += length; + TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1 " + len_feat)); + } else { + TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1 ")); + pt->a_.push_back(AlignmentPoint(0,0)); + } AddRule(pt); RefineRule(pt, ctf_level); ss.insert(alts[k].label); @@ -86,6 +91,7 @@ struct SCFGTranslatorImpl { SCFGTranslatorImpl(const boost::program_options::variables_map& conf) : max_span_limit(conf["scfg_max_span_limit"].as()), add_pass_through_rules(conf.count("add_pass_through_rules")), + num_pt_features(conf["add_pass_through_features"].as()), goal(conf["goal"].as()), default_nt(conf["scfg_default_nt"].as()), use_ctf_(conf.count("coarse_to_fine_beam_prune")) @@ -140,6 +146,7 @@ struct SCFGTranslatorImpl { const int max_span_limit; const bool add_pass_through_rules; + const unsigned int num_pt_features; const string goal; const string default_nt; const bool use_ctf_; @@ -187,7 +194,7 @@ struct SCFGTranslatorImpl { smeta->SetSourceLength(lattice.size()); if (add_pass_through_rules){ if (!SILENT) cerr << "Adding pass through grammar" << endl; - PassThroughGrammar* g = new PassThroughGrammar(lattice, default_nt, ctf_iterations_); + PassThroughGrammar* g = new PassThroughGrammar(lattice, default_nt, ctf_iterations_, num_pt_features); g->SetGrammarName("PassThrough"); glist.push_back(GrammarPtr(g)); } -- cgit v1.2.3 From 014714dfd7d19a9d84bfccf48f71502ba0e7024d Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sat, 2 Aug 2014 14:11:14 +0200 Subject: fix --- decoder/decoder.cc | 2 +- decoder/scfg_translator.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 2c044192..081da8d6 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -366,7 +366,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("beam_prune3", po::value(), "Optional pass 3") ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") - ("add_extra_pass_through_features,Q",po::value()->default_value(6), "Add PassThrough{1..N} features, capped at N.") + ("add_extra_pass_through_features,Q", po::value()->default_value(6), "Add PassThrough{1..N} features, capped at N.") ("k_best,k",po::value(),"Extract the k best derivations") ("unique_k_best,r", "Unique k-best translation list") ("aligner,a", "Run as a word/phrase aligner (src & ref required)") diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc index 57ad85e2..0c3e4cf7 100644 --- a/decoder/scfg_translator.cc +++ b/decoder/scfg_translator.cc @@ -91,7 +91,7 @@ struct SCFGTranslatorImpl { SCFGTranslatorImpl(const boost::program_options::variables_map& conf) : max_span_limit(conf["scfg_max_span_limit"].as()), add_pass_through_rules(conf.count("add_pass_through_rules")), - num_pt_features(conf["add_pass_through_features"].as()), + num_pt_features(conf["add_extra_pass_through_features"].as()), goal(conf["goal"].as()), default_nt(conf["scfg_default_nt"].as()), use_ctf_(conf.count("coarse_to_fine_beam_prune")) -- cgit v1.2.3 From f33ab2ff3f1cf135ffb80721e1f4d71d124bc8f9 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sat, 2 Aug 2014 14:34:57 +0200 Subject: finally --- decoder/decoder.cc | 2 +- decoder/scfg_translator.cc | 8 +++++--- training/dtrain/examples/toy/cdec.ini | 1 + training/dtrain/examples/toy/dtrain.ini | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 081da8d6..c384c33f 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -366,7 +366,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("beam_prune3", po::value(), "Optional pass 3") ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") - ("add_extra_pass_through_features,Q", po::value()->default_value(6), "Add PassThrough{1..N} features, capped at N.") + ("add_extra_pass_through_features,Q", po::value()->default_value(0), "Add PassThrough{1..N} features, capped at N.") ("k_best,k",po::value(),"Extract the k best derivations") ("unique_k_best,r", "Unique k-best translation list") ("aligner,a", "Run as a word/phrase aligner (src & ref required)") diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc index 0c3e4cf7..c3cfcaad 100644 --- a/decoder/scfg_translator.cc +++ b/decoder/scfg_translator.cc @@ -64,19 +64,21 @@ PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat, const int j = alts[k].dist2next + i; const string& src = TD::Convert(alts[k].label); if (ss.count(alts[k].label) == 0) { - TRulePtr pt; if (num_pt_features > 0) { int length = static_cast(log(UTF8StringLen(src)) / log(1.6)) + 1; if (length > num_pt_features) length = num_pt_features; string len_feat = "PassThrough_0=1"; len_feat[12] += length; TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1 " + len_feat)); + pt->a_.push_back(AlignmentPoint(0,0)); + AddRule(pt); + RefineRule(pt, ctf_level); } else { TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1 ")); pt->a_.push_back(AlignmentPoint(0,0)); + AddRule(pt); + RefineRule(pt, ctf_level); } - AddRule(pt); - RefineRule(pt, ctf_level); ss.insert(alts[k].label); } } diff --git a/training/dtrain/examples/toy/cdec.ini b/training/dtrain/examples/toy/cdec.ini index b14f4819..e6c19abe 100644 --- a/training/dtrain/examples/toy/cdec.ini +++ b/training/dtrain/examples/toy/cdec.ini @@ -1,3 +1,4 @@ formalism=scfg add_pass_through_rules=true grammar=grammar.gz +#add_extra_pass_through_features=6 diff --git a/training/dtrain/examples/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini index cd715f26..ef956df7 100644 --- a/training/dtrain/examples/toy/dtrain.ini +++ b/training/dtrain/examples/toy/dtrain.ini @@ -2,7 +2,7 @@ decoder_config=cdec.ini input=src refs=tgt output=- -print_weights=logp shell_rule house_rule small_rule little_rule PassThrough +print_weights=logp shell_rule house_rule small_rule little_rule PassThrough PassThrough_1 PassThrough_2 PassThrough_3 PassThrough_4 PassThrough_5 PassThrough_6 k=4 N=4 epochs=2 -- cgit v1.2.3