diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 21:02:44 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 21:02:44 +0000 |
commit | 353ff197b082b71c623c7851e0df0ac60d770533 (patch) | |
tree | 3d55c79807f86cbe579df3a9d1aebf4464331423 | |
parent | 37f4f68effe3b5ab823985798e77fa64dd66088d (diff) |
implicit first param to ffs: "debug". fsa final traversal sets feature=0 first. set FSA_FF_DEBUG (also FSA_DEBUG).
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@387 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/apply_models.cc | 2 | ||||
-rw-r--r-- | decoder/ff.cc | 2 | ||||
-rw-r--r-- | decoder/ff.h | 1 | ||||
-rw-r--r-- | decoder/ff_factory.cc | 16 | ||||
-rwxr-xr-x | decoder/ff_from_fsa.h | 38 | ||||
-rwxr-xr-x | decoder/ff_fsa.h | 95 | ||||
-rwxr-xr-x | decoder/ff_sample_fsa.h | 21 | ||||
-rw-r--r-- | decoder/sparse_vector.h | 2 | ||||
-rw-r--r-- | decoder/stringlib.h | 25 |
9 files changed, 163 insertions, 39 deletions
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 2b518d62..11d43e93 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -1,3 +1,5 @@ +////TODO: keep model state in forest? + //TODO: (for many nonterminals, or multi-rescoring pass) either global //best-first, or group by (NT,span) - use prev forest outside as a (admissable, //if models are a subset and weights are same) heuristic diff --git a/decoder/ff.cc b/decoder/ff.cc index b6a541e3..28d6f732 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -1,3 +1,5 @@ +//TODO: actually score rule_feature()==true features once only, hash keyed on rule or modify TRule directly? need to keep clear in forest which features come from models vs. rules; then rescoring could drop all the old models features at once + //TODO: 0 size state != rule-local feature, i.e. still may depend on source span loc/context. identify truly rule-local features so if we want they can be added to grammar rules (minor speedup) #include <boost/lexical_cast.hpp> diff --git a/decoder/ff.h b/decoder/ff.h index 0bfc8582..a0b39c26 100644 --- a/decoder/ff.h +++ b/decoder/ff.h @@ -17,6 +17,7 @@ typedef std::vector<WordID> Features; // set of features ids class FeatureFunction { public: std::string name; // set by FF factory using usage() + bool debug; // also set by FF factory checking param for immediate initial "debug" FeatureFunction() : state_size_() {} explicit FeatureFunction(int state_size) : state_size_(state_size) {} virtual ~FeatureFunction(); diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index fe733ca5..a6d834e0 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -1,6 +1,7 @@ #include "ff_factory.h" #include "ff.h" +#include "stringlib.h" using boost::shared_ptr; using namespace std; @@ -21,14 +22,27 @@ string FFRegistry::usage(string const& ffname,bool params,bool verbose) const { : it->second->usage(params,verbose); } +namespace { +std::string const& debug_pre="debug"; +} + 
shared_ptr<FeatureFunction> FFRegistry::Create(const string& ffname, const string& param) const { map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname); shared_ptr<FeatureFunction> res; if (it == reg_.end()) { cerr << "I don't know how to create feature " << ffname << endl; } else { - res = it->second->Create(param); + int pl=debug_pre.size(); + bool space=false; + std::string p=param; + bool debug=match_begin(p,debug_pre)&&(p.size()==pl||(space=p[pl]==' ')); + if (debug) { + p.erase(0,debug_pre.size()+space); + cerr<<"debug enabled for "<<ffname<< " - rest of param='"<<p<<"'\n"; + } + res = it->second->Create(p); res->name=ffname; + res->debug=debug; } return res; } diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index f84bda31..42fa1e80 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -3,6 +3,13 @@ #include "ff_fsa.h" +#define FSA_FF_DEBUG +#ifdef FSA_FF_DEBUG +# define FSAFFDBG(x) do { if (debug) { std::cerr << x; } } while(0) +#else +# define FSAFFDBG(x) +#endif + /* regular bottom up scorer from Fsa feature uses guarantee about markov order=N to score ASAP encoding of state: if less than N-1 (ctxlen) words @@ -39,6 +46,7 @@ public: FeatureVector* estimated_features, void* out_state) const { + FSAFFDBG("(FromFsa) "<<name); ff.init_features(features); // estimated_features is fresh if (!ssz) { TRule const& rule=*edge.rule_; @@ -47,9 +55,11 @@ public: if (e[j] < 1) { // variable } else { WordID ew=e[j]; + FSAFFDBG(' '<<TD::Convert(ew)); ff.Scan(smeta,ew,0,0,features); } } + FSAFFDBG('\n'); return; } @@ -63,6 +73,7 @@ public: for (int j = 0; j < e.size(); ++j) { // items in target side of rule if (e[j] < 1) { // variable SP a = ant_contexts[-e[j]]; + FSAFFDBG(' '<<describe_state(a)); WP al=(WP)a; WP ale=left_end(a); // scan(al,le) these - the same as below else. macro for now; pull into closure object later? 
@@ -87,6 +98,7 @@ public: fsa.reset(fsa_state(a)); } else { // single word WordID ew=e[j]; + FSAFFDBG(' '<<TD::Convert(ew)); // some redundancy: non-vectorized version of above handling of left words of child item if (left_out<left_full) { *left_out++=ew; @@ -105,13 +117,31 @@ public: clear_fsa_state(out_state); // 0 bytes so we compare / hash correctly. don't know state yet while(left_out<left_full) *left_out++=TD::none; // mark as partial left word seq } else // or else store final right-state. heuristic was already assigned - fstatecpy(fsa_state(out_state),fsa.cs); + fstatecpy(out_state,fsa.cs); + FSAFFDBG(" = " << describe_state(out_state)<<" "<<(*features)[ff.fid()]<<" h="<<(*estimated_features)[ff.fid()]<<'\n'); + } + + void print_state(std::ostream &o,void const*ant) const { + WP l=(WP)ant,le=left_end(ant),lf=left_end_full(ant); + o<<'['<<Sentence(l,le); + if (le==lf) { + o<<" : "; + ff.print_state(o,lf); + } + o << ']'; + } + + std::string describe_state(void const*ant) const { + std::ostringstream o; + print_state(o,ant); + return o.str(); } virtual void FinalTraversalFeatures(const SentenceMetadata& smeta, const void* residual_state, FeatureVector* final_features) const { + ff.init_features(final_features); // estimated_features is fresh Sentence const& ends=ff.end_phrase(); if (!ssz) { AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,0); @@ -132,6 +162,7 @@ public: // whole = left-words + end-phrase AccumFeatures(ff,smeta,w,end(whole),final_features,ss); } + FSAFFDBG("Final "<<name<<" = "<<*final_features<<'\n'); } bool rule_feature() const { @@ -190,8 +221,8 @@ private: std::memset(fsa_state(ant),0,ssz); } - inline void fstatecpy(void *dest,void const* src) const { - std::memcpy(dest,src,ssz); + inline void fstatecpy(void *ant,void const* src) const { + std::memcpy(fsa_state(ant),src,ssz); } }; @@ -201,6 +232,7 @@ private: # include "ff_sample_fsa.h" int main() { std::cerr<<"Testing left_end...\n"; + 
std::cerr<<"sizeof(FeatureVector)="<<sizeof(FeatureVector)<<"\nsizeof(FeatureVectorList)="<<sizeof(FeatureVectorList)<<"\n"; WordPenaltyFromFsa::test(); return 0; } diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index 4e40f51b..8ca1951f 100755 --- a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -3,14 +3,17 @@ //SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h -#define FSA_DEBUG +//#define FSA_DEBUG + #ifdef FSA_DEBUG # include <iostream> -# define FSADBG(x) do { std::cerr << x; } while(0) +# define FSADBG(x) do { if (d().debug()) { std::cerr << x; } } while(0) #else # define FSADBG(x) #endif +#include <boost/lexical_cast.hpp> +#include <sstream> #include <stdint.h> //C99 #include <string> #include "ff.h" @@ -30,20 +33,28 @@ typedef ValueArray<uint8_t> Bytes; */ // it's not necessary to inherit from this, but you probably should to save yourself some boilerplate. defaults to no-state + +// usage: +// struct FsaFeat : public FsaTypedBase<int,FsaFeat> +// i.e. Impl is a CRTP +template <class Impl> struct FsaFeatureFunctionBase { + Impl const& d() const { return static_cast<Impl const&>(*this); } + Impl & d() { return static_cast<Impl &>(*this); } protected: + int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course) Bytes start,h_start; // start state and estimated-features (heuristic) start state. set these. default empty. Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm. - int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course) void set_state_bytes(int sb=0) { if (start.size()!=sb) start.resize(sb); if (h_start.size()!=sb) h_start.resize(sb); state_bytes_=sb; } int fid_; // you can have more than 1 feature of course. 
- void init_fid(std::string const& name) { // call this, though, if you have a single feature - fid_=FD::Convert(name); + void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param + fid_=FD::Convert(d().name()); } + inline void static to_state(void *state,char const* begin,char const* end) { std::memcpy(state,begin,end-begin); } @@ -58,7 +69,34 @@ protected: inline void static to_state(void *state,T const* begin,T const* end) { to_state(state,(char const*)begin,(char const*)end); } + inline static char hexdigit(int i) { + return '0'+i; + } + inline static void print_hex_byte(std::ostream &o,unsigned c) { + o<<hexdigit(c>>4); + o<<hexdigit(c&0x0f); + } + public: + bool debug() const { return true; } + int fid() const { return fid_; } // return the one most important feature (for debugging) + std::string name() const { + return Impl::usage(false,false); + } + + void print_state(std::ostream &o,void const*state) const { + char const* i=(char const*)state; + char const* e=i+state_bytes_; + for (;i!=e;++i) + print_hex_byte(o,*i); + } + + std::string describe_state(void const* state) const { + std::ostringstream o; + d().print_state(o,state); + return o.str(); + } + //edges may have old features on them. override if you have more than 1 fid. we need to call this explicitly because edges may have old feature values already, and I chose to use add_value (+=) to simplify scanning a phrase, rather than set_value (=) for fsa ffs. could revisit this and use set_value and therefore sum void init_features(FeatureVector *fv) const { fv->set_value(fid_,0); @@ -93,7 +131,7 @@ public: } // don't set state-bytes etc. 
in ctor because it may depend on parsing param string - FsaFeatureFunctionBase(int statesz=0) : start(statesz),h_start(statesz),state_bytes_(statesz) { } + FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : state_bytes_(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {} }; @@ -160,9 +198,15 @@ void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, W } // if State is pod. sets state size and allocs start, h_start -template <class St> -struct FsaTypedBase : public FsaFeatureFunctionBase { +// usage: +// struct ShorterThanPrev : public FsaTypedBase<int,ShorterThanPrev> +// i.e. Impl is a CRTP +template <class St,class Impl> +struct FsaTypedBase : public FsaFeatureFunctionBase<Impl> { + Impl const& d() const { return static_cast<Impl const&>(*this); } + Impl & d() { return static_cast<Impl &>(*this); } protected: + typedef FsaFeatureFunctionBase<Impl> Base; typedef St State; static inline State & state(void *state) { return *(State*)state; @@ -172,32 +216,33 @@ protected: } void set_starts(State const& s,State const& heuristic_s) { if (0) { // already in ctor - start.resize(sizeof(State)); - h_start.resize(sizeof(State)); + Base::start.resize(sizeof(State)); + Base::h_start.resize(sizeof(State)); } - state(start.begin())=s; - state(h_start.begin())=heuristic_s; + state(Base::start.begin())=s; + state(Base::h_start.begin())=heuristic_s; } - void set_h_start(State const& s) { + FsaTypedBase(St const& start_st=St() + ,St const& h_start_st=St() + ,Sentence const& end_sentence_phrase=Sentence()) + : Base(sizeof(State),end_sentence_phrase) { + set_starts(start_st,h_start_st); } public: + void print_state(std::ostream &o,void const*st) const { + o<<state(st); + } int markov_order() const { return 1; } - FsaTypedBase() : FsaFeatureFunctionBase(sizeof(State)) { + void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { + Impl const& 
im=d(); + FSADBG("Scan "<<FD::Convert(im.fid_)<<" = "<<(*features)[im.fid_]<<" "<<im.state(st)<<" ->"<<TD::Convert(w)<<" "); + im.ScanTyped(smeta,w,im.state(st),im.state(next_state),features); + FSADBG(im.state(next_state)<<" = "<<(*features)[im.fid_]<<std::endl); } + }; -// usage (if you're lazy): -// struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev> -template <class St,class Impl> -struct FsaTypedScan : public FsaTypedBase<St> { - void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { - Impl const* impl=static_cast<Impl const*>(this); - FSADBG("Scan "<<(*features)[impl->fid_]<<" = "<<impl->state(st)<<" ->"<<TD::Convert(w)<<" "); - impl->ScanTyped(smeta,w,impl->state(st),impl->state(next_state),features); - FSADBG(impl->state(next_state)<<" = "<<(*features)[impl->fid_]<<std::endl); - } -}; diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h index 947ad21c..8befc0bb 100755 --- a/decoder/ff_sample_fsa.h +++ b/decoder/ff_sample_fsa.h @@ -4,7 +4,7 @@ #include "ff_from_fsa.h" // example: feature val = -1 * # of target words -struct WordPenaltyFsa : public FsaFeatureFunctionBase { +struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> { static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "WordPenaltyFsa","","-1 per target word" @@ -12,7 +12,7 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase { } WordPenaltyFsa(std::string const& param) { - init_fid(usage(false,false)); + Init(); return; //below are all defaults: set_state_bytes(0); @@ -30,7 +30,7 @@ typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa; // -struct LongerThanPrev : public FsaFeatureFunctionBase { +struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> { static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "LongerThanPrev", @@ -50,7 +50,7 @@ struct LongerThanPrev : public 
FsaFeatureFunctionBase { } int markov_order() const { return 1; } LongerThanPrev(std::string const& param) { - init_fid(usage(false,false)); + Init(); set_state_bytes(sizeof(int)); // start.resize(state_bytes()); // this is done by set_state_bytes already. // h_start.resize(state_bytes()); @@ -73,7 +73,8 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { }; // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State) -struct ShorterThanPrev : FsaTypedScan<int,ShorterThanPrev> { +struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> { + typedef FsaTypedBase<int,ShorterThanPrev> Base; static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "ShorterThanPrev", @@ -85,10 +86,12 @@ struct ShorterThanPrev : FsaTypedScan<int,ShorterThanPrev> { static inline int wordlen(WordID w) { return std::strlen(TD::Convert(w)); } - ShorterThanPrev(std::string const& param) { - init_fid(usage(false,false)); -// end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing - set_starts(-1,4); // estimate: anything <4 chars is usually shorter than previous + ShorterThanPrev(std::string const& param) + : Base(-1,4,Sentence(1,TD::Convert(""))) + // start, h_start, end_phrase + // estimate: anything <4 chars is usually shorter than previous + { + Init(); } static const float val_per_target_word=-1; diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index 0f3724f0..285e84a7 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -420,7 +420,7 @@ private: List p; }; - +typedef SparseVectorList<double> FeatureVectorList; typedef SparseVector<double> FeatureVector; typedef SparseVector<double> WeightVector; typedef std::vector<double> DenseWeightVector; diff --git a/decoder/stringlib.h b/decoder/stringlib.h index a0e03624..9efe3f36 100644 --- a/decoder/stringlib.h +++ b/decoder/stringlib.h @@ -14,6 +14,31 @@ #include <cstring> #include <string> +template 
<class Istr, class Isubstr> inline +bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub) +{ + while (bsub != esub) { + if (bstr == estr) + return false; + if (*bsub++ != *bstr++) + return false; + } + return true; +} + +template <class Istr, class Prefix> inline +bool match_begin(Istr bstr,Istr estr,Prefix prefix) +{ + return match_begin(bstr,estr,prefix.begin(),prefix.end()); +} + +template <class Str, class Prefix> inline +bool match_begin(Str const& str,Prefix const& prefix) +{ + return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end()); +} + + // read line in the form of either: // source // source ||| target |