From 353ff197b082b71c623c7851e0df0ac60d770533 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 23 Jul 2010 21:02:44 +0000 Subject: implicit first param to ffs: "debug". fsa final traversal set feature=0 first. set FF_FSA_DEBUG (also FSA_DEBUG). git-svn-id: https://ws10smt.googlecode.com/svn/trunk@387 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/apply_models.cc | 2 ++ decoder/ff.cc | 2 ++ decoder/ff.h | 1 + decoder/ff_factory.cc | 16 ++++++++- decoder/ff_from_fsa.h | 38 ++++++++++++++++++-- decoder/ff_fsa.h | 95 ++++++++++++++++++++++++++++++++++++------------- decoder/ff_sample_fsa.h | 21 ++++++----- decoder/sparse_vector.h | 2 +- decoder/stringlib.h | 25 +++++++++++++ 9 files changed, 163 insertions(+), 39 deletions(-) diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 2b518d62..11d43e93 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -1,3 +1,5 @@ +////TODO: keep model state in forest? + //TODO: (for many nonterminals, or multi-rescoring pass) either global //best-first, or group by (NT,span) - use prev forest outside as a (admissable, //if models are a subset and weights are same) heuristic diff --git a/decoder/ff.cc b/decoder/ff.cc index b6a541e3..28d6f732 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -1,3 +1,5 @@ +//TODO: actually score rule_feature()==true features once only, hash keyed on rule or modify TRule directly? need to keep clear in forest which features come from models vs. rules; then rescoring could drop all the old models features at once + //TODO: 0 size state != rule-local feature, i.e. still may depend on source span loc/context. identify truly rule-local features so if we want they can be added to grammar rules (minor speedup) #include diff --git a/decoder/ff.h b/decoder/ff.h index 0bfc8582..a0b39c26 100644 --- a/decoder/ff.h +++ b/decoder/ff.h @@ -17,6 +17,7 @@ typedef std::vector Features; // set of features ids class FeatureFunction { public: std::string name; // set by FF factory using usage() + bool debug; // also set by FF factory checking param for immediate initial "debug" FeatureFunction() : state_size_() {} explicit FeatureFunction(int state_size) : state_size_(state_size) {} virtual ~FeatureFunction(); diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index fe733ca5..a6d834e0 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -1,6 +1,7 @@ #include "ff_factory.h" #include "ff.h" +#include "stringlib.h" using boost::shared_ptr; using namespace std; @@ -21,14 +22,27 @@ string FFRegistry::usage(string const& ffname,bool params,bool verbose) const { : it->second->usage(params,verbose); } +namespace { +std::string const& debug_pre="debug"; +} + shared_ptr FFRegistry::Create(const string& ffname, const string& param) const { map >::const_iterator it = reg_.find(ffname); shared_ptr res; if (it == reg_.end()) { cerr << "I don't know how to create feature " << ffname << endl; } else { - res = it->second->Create(param); + int pl=debug_pre.size(); + bool space=false; + std::string p=param; + bool debug=match_begin(p,debug_pre)&&(p.size()==pl||(space=p[pl]==' ')); + if (debug) { + p.erase(0,debug_pre.size()+space); + cerr<<"debug enabled for "<second->Create(p); res->name=ffname; + res->debug=debug; } return res; } diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index f84bda31..42fa1e80 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -3,6 +3,13 @@ #include "ff_fsa.h" +#define FSA_FF_DEBUG +#ifdef FSA_FF_DEBUG +# define FSAFFDBG(x) do { if (debug) { std::cerr << x; } } while(0) +#else +# define FSAFFDBG(x) +#endif + /* regular bottom up scorer from Fsa feature uses guarantee about markov order=N to score ASAP encoding of state: if less than N-1 (ctxlen) words @@ -39,6 +46,7 @@ public: FeatureVector* estimated_features, void* out_state) const { + FSAFFDBG("(FromFsa) "< -# define FSADBG(x) do { std::cerr << x; } while(0) +# define FSADBG(x) do { if (d().debug()) { std::cerr << x; } } while(0) #else # define FSADBG(x) #endif +#include +#include #include //C99 #include #include "ff.h" @@ -30,20 +33,28 @@ typedef ValueArray Bytes; */ // it's not necessary to inherit from this, but you probably should to save yourself some boilerplate. defaults to no-state + +// usage: +// struct FsaFeat : public FsaTypedBase +// i.e. Impl is a CRTP +template struct FsaFeatureFunctionBase { + Impl const& d() const { return static_cast(*this); } + Impl & d() { return static_cast(*this); } protected: + int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course) Bytes start,h_start; // start state and estimated-features (heuristic) start state. set these. default empty. Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "" for lm. - int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course) void set_state_bytes(int sb=0) { if (start.size()!=sb) start.resize(sb); if (h_start.size()!=sb) h_start.resize(sb); state_bytes_=sb; } int fid_; // you can have more than 1 feature of course. - void init_fid(std::string const& name) { // call this, though, if you have a single feature - fid_=FD::Convert(name); + void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param + fid_=FD::Convert(d().name()); } + inline void static to_state(void *state,char const* begin,char const* end) { std::memcpy(state,begin,end-begin); } @@ -58,7 +69,34 @@ protected: inline void static to_state(void *state,T const* begin,T const* end) { to_state(state,(char const*)begin,(char const*)end); } + inline static char hexdigit(int i) { + return '0'+i; + } + inline static void print_hex_byte(std::ostream &o,unsigned c) { + o<>4); + o<set_value(fid_,0); @@ -93,7 +131,7 @@ public: } // don't set state-bytes etc. in ctor because it may depend on parsing param string - FsaFeatureFunctionBase(int statesz=0) : start(statesz),h_start(statesz),state_bytes_(statesz) { } + FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : state_bytes_(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {} }; @@ -160,9 +198,15 @@ void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, W } // if State is pod. sets state size and allocs start, h_start -template -struct FsaTypedBase : public FsaFeatureFunctionBase { +// usage: +// struct ShorterThanPrev : public FsaTypedBase +// i.e. Impl is a CRTP +template +struct FsaTypedBase : public FsaFeatureFunctionBase { + Impl const& d() const { return static_cast(*this); } + Impl & d() { return static_cast(*this); } protected: + typedef FsaFeatureFunctionBase Base; typedef St State; static inline State & state(void *state) { return *(State*)state; @@ -172,32 +216,33 @@ protected: } void set_starts(State const& s,State const& heuristic_s) { if (0) { // already in ctor - start.resize(sizeof(State)); - h_start.resize(sizeof(State)); + Base::start.resize(sizeof(State)); + Base::h_start.resize(sizeof(State)); } - state(start.begin())=s; - state(h_start.begin())=heuristic_s; + state(Base::start.begin())=s; + state(Base::h_start.begin())=heuristic_s; } - void set_h_start(State const& s) { + FsaTypedBase(St const& start_st=St() + ,St const& h_start_st=St() + ,Sentence const& end_sentence_phrase=Sentence()) + : Base(sizeof(State),end_sentence_phrase) { + set_starts(start_st,h_start_st); } public: + void print_state(std::ostream &o,void const*st) const { + o<"<fid_]<<" = "<state(st)<<" ->"<ScanTyped(smeta,w,impl->state(st),impl->state(next_state),features); - FSADBG(impl->state(next_state)<<" = "<<(*features)[impl->fid_]< { static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "WordPenaltyFsa","","-1 per target word" @@ -12,7 +12,7 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase { } WordPenaltyFsa(std::string const& param) { - init_fid(usage(false,false)); + Init(); return; //below are all defaults: set_state_bytes(0); @@ -30,7 +30,7 @@ typedef FeatureFunctionFromFsa WordPenaltyFromFsa; // -struct LongerThanPrev : public FsaFeatureFunctionBase { +struct LongerThanPrev : public FsaFeatureFunctionBase { static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "LongerThanPrev", @@ -50,7 +50,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { } int markov_order() const { return 1; } LongerThanPrev(std::string const& param) { - init_fid(usage(false,false)); + Init(); set_state_bytes(sizeof(int)); // start.resize(state_bytes()); // this is done by set_state_bytes already. // h_start.resize(state_bytes()); @@ -73,7 +73,8 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { }; // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State) -struct ShorterThanPrev : FsaTypedScan { +struct ShorterThanPrev : FsaTypedBase { + typedef FsaTypedBase Base; static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "ShorterThanPrev", @@ -85,10 +86,12 @@ struct ShorterThanPrev : FsaTypedScan { static inline int wordlen(WordID w) { return std::strlen(TD::Convert(w)); } - ShorterThanPrev(std::string const& param) { - init_fid(usage(false,false)); -// end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing - set_starts(-1,4); // estimate: anything <4 chars is usually shorter than previous + ShorterThanPrev(std::string const& param) + : Base(-1,4,Sentence(1,TD::Convert(""))) + // start, h_start, end_phrase + // estimate: anything <4 chars is usually shorter than previous + { + Init(); } static const float val_per_target_word=-1; diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index 0f3724f0..285e84a7 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -420,7 +420,7 @@ private: List p; }; - +typedef SparseVectorList FeatureVectorList; typedef SparseVector FeatureVector; typedef SparseVector WeightVector; typedef std::vector DenseWeightVector; diff --git a/decoder/stringlib.h b/decoder/stringlib.h index a0e03624..9efe3f36 100644 --- a/decoder/stringlib.h +++ b/decoder/stringlib.h @@ -14,6 +14,31 @@ #include #include +template inline +bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub) +{ + while (bsub != esub) { + if (bstr == estr) + return false; + if (*bsub++ != *bstr++) + return false; + } + return true; +} + +template inline +bool match_begin(Istr bstr,Istr estr,Prefix prefix) +{ + return match_begin(bstr,estr,prefix.begin(),prefix.end()); +} + +template inline +bool match_begin(Str const& str,Prefix const& prefix) +{ + return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end()); +} + + // read line in the form of either: // source // source ||| target -- cgit v1.2.3