From 77b37114061a17edf55dc2abadf484182b04aa8e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 16 Feb 2014 01:11:59 -0500 Subject: new rule shape features --- decoder/cdec_ff.cc | 1 + decoder/ff_ruleshape.cc | 138 ++++++++++++++++++++++++++++++++++++++++++++++++ decoder/ff_ruleshape.h | 46 ++++++++++++++++ 3 files changed, 185 insertions(+) (limited to 'decoder') diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index b2541722..0411908f 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -58,6 +58,7 @@ void register_feature_functions() { ff_registry.Register("KLanguageModel", new KLanguageModelFactory()); ff_registry.Register("NonLatinCount", new FFFactory); ff_registry.Register("RuleShape", new FFFactory); + ff_registry.Register("RuleShape2", new FFFactory); ff_registry.Register("RelativeSentencePosition", new FFFactory); ff_registry.Register("LexNullJump", new FFFactory); ff_registry.Register("NewJump", new FFFactory); diff --git a/decoder/ff_ruleshape.cc b/decoder/ff_ruleshape.cc index 7bb548c4..35b41c46 100644 --- a/decoder/ff_ruleshape.cc +++ b/decoder/ff_ruleshape.cc @@ -1,5 +1,8 @@ #include "ff_ruleshape.h" +#include "filelib.h" +#include "stringlib.h" +#include "verbose.h" #include "trule.h" #include "hg.h" #include "fdict.h" @@ -104,3 +107,138 @@ void RuleShapeFeatures::TraversalFeaturesImpl(const SentenceMetadata& /* smeta * features->set_value(cur->fid_, 1.0); } +namespace { +void ParseRSArgs(string const& in, string* emapfile, string* fmapfile, unsigned *pfxsize) { + vector const& argv=SplitOnWhitespace(in); + *emapfile = ""; + *fmapfile = ""; + *pfxsize = 0; +#define RSSPEC_NEXTARG if (i==argv.end()) { \ + cerr << "Missing argument for "<<*last<<". "; goto usage; \ + } else { ++i; } + + for (vector::const_iterator last,i=argv.begin(),e=argv.end();i!=e;++i) { + string const& s=*i; + if (s[0]=='-') { + if (s.size()>2) goto fail; + switch (s[1]) { + case 'e': + if (emapfile->size() > 0) { cerr << "Multiple -e specifications!\n"; abort(); } + RSSPEC_NEXTARG; *emapfile=*i; + break; + case 'f': + if (fmapfile->size() > 0) { cerr << "Multiple -f specifications!\n"; abort(); } + RSSPEC_NEXTARG; *fmapfile=*i; + break; + case 'p': + RSSPEC_NEXTARG; *pfxsize=atoi(i->c_str()); + break; +#undef RSSPEC_NEXTARG + default: + fail: + cerr<<"Unknown RuleShape2 option "<* pv, unsigned f, unsigned t, unsigned pfx_size) { + if (pfx_size) { + const string& ts = TD::Convert(t); + if (pfx_size < ts.size()) + t = TD::Convert(ts.substr(0, pfx_size)); + } + if (f >= pv->size()) + pv->resize((f + 1) * 1.2); + (*pv)[f] = t; +} +} + +RuleShapeFeatures2::~RuleShapeFeatures2() {} + +RuleShapeFeatures2::RuleShapeFeatures2(const string& param) : kNT(TD::Convert("NT")), kUNK(TD::Convert("")) { + string emap; + string fmap; + unsigned pfxsize = 0; + ParseRSArgs(param, &emap, &fmap, &pfxsize); + has_src_ = fmap.size(); + has_trg_ = emap.size(); + if (has_trg_) LoadWordClasses(emap, pfxsize, &e2class_); + if (has_src_) LoadWordClasses(fmap, pfxsize, &f2class_); + if (!has_trg_ && !has_src_) { + cerr << "RuleShapeFeatures2 requires [-e trg_map.gz] or [-f src_map.gz] or both, and optional [-p pfxsize]\n"; + abort(); + } +} + +void RuleShapeFeatures2::LoadWordClasses(const string& file, const unsigned pfx_size, vector* pv) { + ReadFile rf(file); + istream& in = *rf.stream(); + string line; + vector dummy; + int lc = 0; + if (!SILENT) + cerr << " Loading word classes from " << file << " ...\n"; + AddWordToClassMapping_(pv, TD::Convert(""), TD::Convert(""), 0); + AddWordToClassMapping_(pv, TD::Convert(""), TD::Convert(""), 0); + while(getline(in, line)) { + dummy.clear(); + TD::ConvertSentence(line, &dummy); + ++lc; + if (dummy.size() != 2 && dummy.size() != 3) { + cerr << " Class map file expects: CLASS WORD [freq]\n"; + cerr << " Format error in " << file << ", line " << lc << ": " << line << endl; + abort(); + } + AddWordToClassMapping_(pv, dummy[1], dummy[0], pfx_size); + } + if (!SILENT) + cerr << " Loaded word " << lc << " mapping rules.\n"; +} + +void RuleShapeFeatures2::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */, + const Hypergraph::Edge& edge, + const vector& /* ant_contexts */, + SparseVector* features, + SparseVector* /* estimated_features */, + void* /* context */) const { + const vector& f = edge.rule_->f(); + const vector& e = edge.rule_->e(); + Node* fid = &fidtree_; + if (has_src_) { + for (unsigned i = 0; i < f.size(); ++i) + fid = &fid->next_[MapF(f[i])]; + } + if (has_trg_) { + for (unsigned i = 0; i < e.size(); ++i) + fid = &fid->next_[MapE(e[i])]; + } + if (!fid->fid_) { + ostringstream os; + os << "RS:"; + if (has_src_) { + for (unsigned i = 0; i < f.size(); ++i) { + if (i) os << '_'; + os << TD::Convert(MapF(f[i])); + } + if (has_trg_) os << "__"; + } + if (has_trg_) { + for (unsigned i = 0; i < e.size(); ++i) { + if (i) os << '_'; + os << TD::Convert(MapE(e[i])); + } + } + fid->fid_ = FD::Convert(os.str()); + } + features->set_value(fid->fid_, 1); +} + diff --git a/decoder/ff_ruleshape.h b/decoder/ff_ruleshape.h index 9f20faf3..488cfd84 100644 --- a/decoder/ff_ruleshape.h +++ b/decoder/ff_ruleshape.h @@ -2,6 +2,7 @@ #define _FF_RULESHAPE_H_ #include +#include #include "ff.h" class RuleShapeFeatures : public FeatureFunction { @@ -28,4 +29,49 @@ class RuleShapeFeatures : public FeatureFunction { } }; +class RuleShapeFeatures2 : public FeatureFunction { + public: + ~RuleShapeFeatures2(); + RuleShapeFeatures2(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const HG::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + private: + struct Node { + int fid_; + Node() : fid_() {} + std::map next_; + }; + mutable Node fidtree_; + + inline WordID MapE(WordID w) const { + if (w <= 0) return kNT; + unsigned res = 0; + if (w < e2class_.size()) res = e2class_[w]; + if (!res) res = kUNK; + return res; + } + + inline WordID MapF(WordID w) const { + if (w <= 0) return kNT; + unsigned res = 0; + if (w < f2class_.size()) res = f2class_[w]; + if (!res) res = kUNK; + return res; + } + + // prfx_size=0 => use full word classes otherwise truncate to specified length + void LoadWordClasses(const std::string& fname, unsigned pfxsize, std::vector* pv); + const WordID kNT; + const WordID kUNK; + std::vector e2class_; + std::vector f2class_; + bool has_src_; + bool has_trg_; +}; + #endif -- cgit v1.2.3 From a3aa460b375b8d0c3db59c40fc7060fc5e634c14 Mon Sep 17 00:00:00 2001 From: armatthews Date: Thu, 20 Feb 2014 22:21:21 -0500 Subject: Allow NGramFeatures to be named in order to avoid conflicts when using more than one set of them --- decoder/decoder.cc | 2 +- decoder/ff_ngrams.cc | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index e02c7730..7b49fcfa 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -408,7 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("max_translation_sample,X", po::value(), "Sample the max translation from the chart") ("pb_max_distortion,D", po::value()->default_value(4), "Phrase-based decoder: maximum distortion") ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)") - ("get_oracle_forest,o", "Calculate rescored hypregraph using approximate BLEU scoring of rules") + ("get_oracle_forest,o", "Calculate rescored hypergraph using approximate BLEU scoring of rules") ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") ("vector_format",po::value()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") ("combine_size,C",po::value()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index d337b28b..0bc14e5a 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -60,8 +60,9 @@ namespace { } } -static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector& prefixes, string& target_separator, string* cluster_file) { +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector& prefixes, string& target_separator, string* cluster_file, string* featname) { vector const& argv=SplitOnWhitespace(in); + *featname = ""; *explicit_markers = false; *order = 3; prefixes.push_back("NOT-USED"); @@ -83,6 +84,9 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, case 'x': *explicit_markers = true; break; + case 'n': + LMSPEC_NEXTARG; *featname=*i; + break; case 'U': LMSPEC_NEXTARG; prefixes[1] = *i; @@ -226,6 +230,7 @@ class NgramDetectorImpl { ++n; if (!fid) { ostringstream os; + os << featname_; os << prefixes_[n]; for (int i = n-1; i >= 0; --i) { os << (i != n-1 ? target_separator_ : ""); @@ -404,7 +409,8 @@ class NgramDetectorImpl { public: explicit NgramDetectorImpl(bool explicit_markers, unsigned order, - vector& prefixes, string& target_separator, const string& clusters) : + vector& prefixes, string& target_separator, const string& clusters, + const string& featname) : kCDEC_UNK(TD::Convert("")) , add_sos_eos_(!explicit_markers) { order_ = order; @@ -414,6 +420,7 @@ class NgramDetectorImpl { unscored_words_offset_ = is_complete_offset_ + 1; prefixes_ = prefixes; target_separator_ = target_separator; + featname_ = featname; // special handling of beginning / ending sentence markers dummy_state_ = new char[state_size_]; @@ -454,6 +461,7 @@ class NgramDetectorImpl { TRulePtr dummy_rule_; vector prefixes_; string target_separator_; + string featname_; struct FidTree { map fids; map levels; @@ -467,9 +475,9 @@ NgramDetector::NgramDetector(const string& param) { bool explicit_markers = false; unsigned order = 3; string clusters; - ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters); + ParseArgs(param, &explicit_markers, &order, prefixes, target_separator, &clusters, &featname); pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes, - target_separator, clusters); + target_separator, clusters, featname); SetStateSize(pimpl_->ReserveStateSize()); } -- cgit v1.2.3 From 3ec30b72f47e063d94648a9823653e6ec3e17401 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 23 Feb 2014 02:13:32 -0500 Subject: fix rule emission behavior --- decoder/decoder.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'decoder') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index e02c7730..f8104c5e 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -662,11 +662,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream oracle.show_derivation=conf.count("show_derivations"); remove_intersected_rule_annotations = conf.count("remove_intersected_rule_annotations"); - if (conf.count("extract_rules")) { - stringstream ss; - ss << sent_id; - extract_file.reset(new WriteFile(str("extract_rules",conf)+"/"+ss.str())); - } combine_size = conf["combine_size"].as(); if (combine_size < 1) combine_size = 1; sent_id = -1; @@ -720,6 +715,11 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } cerr << " id = " << sent_id << endl; } + if (conf.count("extract_rules")) { + stringstream ss; + ss << sent_id << ".gz"; + extract_file.reset(new WriteFile(str("extract_rules",conf)+"/"+ss.str())); + } string to_translate; Lattice ref; ParseTranslatorInputLattice(buf, &to_translate, &ref); -- cgit v1.2.3 From d843587027d815f3a1c9b8dd5394f3fe04ac85fa Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 23 Feb 2014 17:32:59 -0500 Subject: ngrams fix for unigram models --- decoder/ff_ngrams.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'decoder') diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index 0bc14e5a..0a97cba5 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -36,7 +36,7 @@ struct State { } explicit State(const State& other, unsigned order, WordID extend) { char om1 = order - 1; - assert(om1 > 0); + if (!om1) { memset(state, 0, sizeof(state)); return; } for (char i = 1; i < om1; ++i) state[i - 1]= other.state[i]; state[om1 - 1] = extend; } @@ -152,7 +152,7 @@ usage: << "Example feature instantiation: \n" << " tri:a|b|c \n\n"; - return false; + abort(); } class NgramDetectorImpl { -- cgit v1.2.3