From aaac9f8ee73ba59b72609af9a78b167312a6dac7 Mon Sep 17 00:00:00 2001 From: graehl Date: Sat, 7 Aug 2010 02:24:51 +0000 Subject: propagation of feature name+debug from factory, return correct features array for fsa ffs git-svn-id: https://ws10smt.googlecode.com/svn/trunk@486 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cdec_ff.cc | 6 ++- decoder/fdict.cc | 14 +++++++ decoder/fdict.h | 3 ++ decoder/ff.cc | 2 +- decoder/ff.h | 17 +++++++- decoder/ff_factory.h | 12 ++++-- decoder/ff_from_fsa.h | 21 +++++++--- decoder/ff_fsa.h | 27 +++++++++---- decoder/ff_fsa_data.h | 26 +++++++++++-- decoder/ff_fsa_dynamic.h | 29 +++++++++++--- decoder/ff_lm.cc | 3 +- decoder/ff_lm_fsa.h | 7 ++++ decoder/ff_sample_fsa.h | 12 ++++-- decoder/perro.sh | 2 +- graehl/NOTES.beam | 9 +++++ graehl/NOTES.earley | 73 +++++++++++++++++++++++++++++++++++ vest/mr_vest_generate_mapper_input.cc | 16 ++++++-- 17 files changed, 239 insertions(+), 40 deletions(-) create mode 100755 graehl/NOTES.earley diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 6edac126..1ef50eb1 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -13,8 +13,12 @@ #include "ff_register.h" void register_feature_functions() { + RegisterFsaImpl(true,false); + RegisterFsaImpl(true,false); RegisterFF(); RegisterFsaImpl(true,false); // same as LM but using fsa wrapper + ff_registry.Register("LanguageModelFsaDynamic",new FFFactory > >); // test correctness of FsaFeatureFunctionDynamic erasure + RegisterFsaDynToFF(); RegisterFF(); RegisterFF(); @@ -23,8 +27,6 @@ void register_feature_functions() { //TODO: worthless example target FSA ffs. remove later ff_registry.Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa - RegisterFsaImpl(true,false); - ff_registry.Register(new FFFactory >); ff_registry.Register(new FFFactory >); //TODO: use for all features the new Register which requires static FF::usage(false,false) give name diff --git a/decoder/fdict.cc b/decoder/fdict.cc index 65187685..baa0b552 100644 --- a/decoder/fdict.cc +++ b/decoder/fdict.cc @@ -2,12 +2,26 @@ #include "stdlib.h" //for malloc (need on cygwin); todo and std::malloc #include +#include using namespace std; Dict FD::dict_; bool FD::frozen_ = false; +std::string FD::Convert(std::vector const& v) { + return Convert(&*v.begin(),&*v.end()); +} + +std::string FD::Convert(WordID const *b,WordID const* e) { + ostringstream o; + for (WordID const* i=b;ib) o << ' '; + o << FD::Convert(*i); + } + return o.str(); +} + static int HexPairValue(const char * code) { int value = 0; const char * pch = code; diff --git a/decoder/fdict.h b/decoder/fdict.h index c4236580..f9673023 100644 --- a/decoder/fdict.h +++ b/decoder/fdict.h @@ -20,6 +20,9 @@ struct FD { static inline const std::string& Convert(const WordID& w) { return dict_.Convert(w); } + static std::string Convert(WordID const *i,WordID const* e); + static std::string Convert(std::vector const& v); + // Escape any string to a form that can be used as the name // of a weight in a weights file static std::string Escape(const std::string& s); diff --git a/decoder/ff.cc b/decoder/ff.cc index a23f1655..5c52ca2b 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -46,7 +46,7 @@ Features ModelSet::all_features(std::ostream *warn,bool warn0) { string const& ffname=ff.name_; FFS si=ff.features(); if (si.empty()) { - WARNFF(ffname<<" doesn't yet report any feature IDs - implement features() method?"); + WARNFF(ffname<<" doesn't yet report any feature IDs - either supply feature weight, or use --no_freeze_feature_set, or implement features() method"); } unsigned n0=0; for (unsigned j=0;j +# define DBGINIT(a) do { std::cerr< #include #include "fdict.h" @@ -19,6 +27,12 @@ class FeatureFunction { public: std::string name_; // set by FF factory using usage() bool debug_; // also set by FF factory checking param for immediate initial "debug" + //called after constructor, but before name_ and debug_ have been set + virtual void Init() { DBGINIT("default FF::Init name="< and . + protected: virtual void FinalTraversalFeatures(const void* residual_state, FeatureVector* final_features) const; diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index 102e709c..5eb68c8b 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -42,7 +42,9 @@ struct FactoryBase : public UntypedFactory { template struct FFFactory : public FactoryBase { FP Create(std::string param) const { - return FP(new FF(param)); + FF *ret=new FF(param); + ret->Init(); + return FP(ret); } virtual std::string usage(bool params,bool verbose) const { return FF::usage(params,verbose); @@ -54,7 +56,9 @@ struct FFFactory : public FactoryBase { template struct FsaFactory : public FactoryBase { FP Create(std::string param) const { - return FP(new FF(param)); + FF *ret=new FF(param); + ret->Init(); + return FP(ret); } virtual std::string usage(bool params,bool verbose) const { return FF::usage(params,verbose); @@ -94,8 +98,8 @@ struct FactoryRegistry : public UntypedFactoryRegistry { if (debug) cerr<<"debug enabled for "<(*it->second).Create(param); - res->name_ = ffname; - res->debug_ = debug; + res->init_name_debug(ffname,debug); + // could add a res->Init() here instead of in Create if we wanted feature id to potentially differ based on the registered name rather than static usage() - of course, specific feature ids can be computed on the basis of feature param as well; this only affects the default single feature id=name return res; } }; diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index 2c339aa8..ec1a28fa 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -30,14 +30,21 @@ class FeatureFunctionFromFsa : public FeatureFunction { public: FeatureFunctionFromFsa(std::string const& param) : ff(param) { debug_=true; // because factory won't set until after we construct. - Init(); } static std::string usage(bool args,bool verbose) { return Impl::usage(args,verbose); } + void init_name_debug(std::string const& n,bool debug) { + FeatureFunction::init_name_debug(n,debug); + ff.init_name_debug(n,debug); + } - Features features() const { return ff.features(); } + // this should override + Features features() const { + DBGINIT("FeatureFunctionFromFsa features() name="<(*this); } + Impl & d() { return static_cast(*this); } + + // this will get called by factory - override if you have multiple or dynamically named features. note: may be called repeatedly + void Init() { + Init(name()); + DBGINIT("base (single feature) FsaFeatureFunctionBase::Init name="< Bytes; -// stuff I see no reason to have virtual. but there's a diamond inheritance problem to solve now when type erasing the CRTP impl wrapper. virtual inheritance would slow things? +// stuff I see no reason to have virtual. but because it's impossible (w/o virtual inheritance to have dynamic fsa ff know where the impl's data starts, implemented a sync (copy) method that needs to be called. init_name_debug was already necessary to keep state in sync between ff and ff_from_fsa, so no sync should be needed after it. supposing all modifications were through setters, then no explicit sync call would ever be needed; updates could be mirrored. struct FsaFeatureFunctionData { + void init_name_debug(std::string const& n,bool debug) { + name_=n; + debug_=debug; + } //HACK for diamond inheritance (w/o costing performance) FsaFeatureFunctionData *sync_to_; void sync() const { // call this if you modify any fields after your constructor is done - if (sync_to_) *sync_to_=*this; + + if (sync_to_) { + DBGINIT("sync to "<<*sync_to_); + *sync_to_=*this; + DBGINIT("synced result="<<*sync_to_<< " from this="<<*this); + } else { + DBGINIT("nobody to sync to - from FeatureFunctionData this="<<*this); + } + } + + friend std::ostream &operator<<(std::ostream &o,FsaFeatureFunctionData const& d) { + o << "[FSA "<" for lm. diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h index 2a26676d..d03fddee 100755 --- a/decoder/ff_fsa_dynamic.h +++ b/decoder/ff_fsa_dynamic.h @@ -26,10 +26,14 @@ struct FsaFeatureFunction : public FsaFeatureFunctionData { virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const = 0; virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { return 0; } + // called after constructor, before use + virtual void Init() = 0; virtual std::string usage_v(bool param,bool verbose) const { return FeatureFunction::usage_helper("unnamed_dynamic_fsa_feature","","",param,verbose); } - + virtual void init_name_debug(std::string const& n,bool debug) { + FsaFeatureFunctionData::init_name_debug(n,debug); + } virtual void print_state(std::ostream &o,void const*state) const { FsaFeatureFunctionData::print_state(o,state); @@ -49,10 +53,13 @@ struct FsaFeatureFunction : public FsaFeatureFunctionData { // you might be wondering: why do this? answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch // usage: typedef FsaFeatureFunctionDynamic MyFsaDyn; template -struct FsaFeatureFunctionDynamic : public FsaFeatureFunction,Impl { +struct FsaFeatureFunctionDynamic : public FsaFeatureFunction { static const bool simple_phrase_score=Impl::simple_phrase_score; - Impl& d() { return static_cast(*this); } - Impl const& d() const { return static_cast(*this); } + Impl& d() { return impl;//static_cast(*this); + } + Impl const& d() const { return impl; + //static_cast(*this); + } int markov_order() const { return d().markov_order(); } virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, @@ -92,11 +99,23 @@ struct FsaFeatureFunctionDynamic : public FsaFeatureFunction,Impl { return d().print_state(o,state); } - FsaFeatureFunctionDynamic(std::string const& param) : Impl(param) { + void init_name_debug(std::string const& n,bool debug) { + FsaFeatureFunction::init_name_debug(n,debug); + d().init_name_debug(n,debug); + } + + virtual void Init() { d().sync_to_=(FsaFeatureFunction*)this; + d().Init(); d().sync(); } + FsaFeatureFunctionDynamic(std::string const& param) : impl(param) { + Init(); + } +private: + Impl impl; + }; diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index 61d845d6..f3e65cb7 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -609,12 +609,13 @@ void LanguageModelFsa::set_ngram_order(int i) { //TODO: reevaluate whether state space comes cleared by allocator or not. } } + sync(); // for dynamic markov_order copy etc } LanguageModelFsa::LanguageModelFsa(string const& param) { int lmorder; pimpl_ = make_lm_impl(param,&lmorder,&fid_); - InitHaveFid(); + Init(); floor_=pimpl_->floor_; set_ngram_order(lmorder); } diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index 1c8ebdad..d2df943e 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h @@ -115,8 +115,15 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase { // impl details: void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor + // note: if you adjust ngram_order, ff_from_fsa won't notice. + double floor_; // log10prob minimum used (e.g. unk words) + // because we might have a custom fid due to lm name option: + void Init() { + InitHaveFid(); + } + private: int ngram_order_; int ctxlen_; // 1 less than above diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h index 40a32bae..20d64b16 100755 --- a/decoder/ff_sample_fsa.h +++ b/decoder/ff_sample_fsa.h @@ -14,12 +14,14 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase { } WordPenaltyFsa(std::string const& param) { - Init(); +#if 0 + Init(); // gets called by factory already return; //below are all defaults: set_state_bytes(0); start.clear(); h_start.clear(); - } +#endif + } // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates. state and next_state may be same memory. Featval Scan1(WordID w,void const* state,void *next_state) const { return 1; @@ -32,7 +34,8 @@ struct SameFirstLetter : public FsaFeatureFunctionBase { SameFirstLetter(std::string const& param) : FsaFeatureFunctionBase(1,singleton_sentence("END")) // 1 byte of state, scan final (single) symbol "END" to get final state cost { - start[0]='a'; h_start[0]=0; Init(); + start[0]='a'; h_start[0]=0; +// Init(); } int markov_order() const { return 1; } Featval Scan1(WordID w,void const* old_state,void *new_state) const { @@ -82,6 +85,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { } int markov_order() const { return 1; } LongerThanPrev(std::string const& param) : Base(sizeof(int)/* ,singleton_sentence(TD::se) */) { +#if 0 Init(); if (0) { // all this is done in constructor already set_state_bytes(sizeof(int)); @@ -96,7 +100,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { assert(h_start.size()==sizeof(int)); state(start.begin())=999999; state(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous - +#endif } Featval Scan1(WordID w,void const* from,void *next_state) const { diff --git a/decoder/perro.sh b/decoder/perro.sh index 8df71855..836ad07f 100755 --- a/decoder/perro.sh +++ b/decoder/perro.sh @@ -1 +1 @@ -$gdb $cdec -k 30 --show_features -c fsa-hiero.ini -i perro.ur +$gdb $cdec -k 30 --show_features -c fsa-hiero.ini -i perro.ur "$@" diff --git a/graehl/NOTES.beam b/graehl/NOTES.beam index 59314439..a48d1ab7 100755 --- a/graehl/NOTES.beam +++ b/graehl/NOTES.beam @@ -18,3 +18,12 @@ randlm worth using? guess not. actually get all 0-state models in 1st pass parse and prune passive edges per span. allocate cube pruning budget per prev pass + +(has been tried in ISI decoder) models with nonstandard state comparison, +typically (partially) greedy forest scoring, some part of the state is excluded +from equality/hashing. Within virtual ff interface, would just add equals, hash +to vtable (rather than the faster raw state compare). If this is done often, +then add a nonvirtual flag to interface instead, saying whether to use the +virt. methods or not. or: simple flag by user of ApplyModels (per feature?) +saying whether to be 100% greedy or 0% - no halfway e.g. item name uses bigram +context, but score using 5gram state. diff --git a/graehl/NOTES.earley b/graehl/NOTES.earley new file mode 100755 index 00000000..73727a54 --- /dev/null +++ b/graehl/NOTES.earley @@ -0,0 +1,73 @@ + +the target CFG (tcfg) is finite - absolutely no cycles. conceptually we're intersecting it with a wfsa (weights are feature vectors), which is a lot like parsing a lattice, in that states are (source state, dest state) pairs and you're covering some string(s) that go from source->dest in the wfsa. + +Chris' paper http://www.ling.umd.edu/~redpony/forest-reordering.pdf - apparently (figure 5) already contains the exact concept we're going for, albeit with only inside scores. http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz describes a nice way of computing sums over derivations given a string by keeping a tuple of ("forward","inner") scores while Earley parsing. I'm not sure yet if this is applicable (because we'll already have the exact outside scores from the -LM forest already, and plan on using cost pushing toward the top so we don't have to explicitly consider them). + +normally in earley, one word is consumed at a time, left to right. completions happen from shortest to longest, then (repeated) predictions, and finally scans. i'm sure this has the usual obvious extension to parsing lattices (proceed in some topological order). + +but because the wfsa (ngram lm) has cycles and forgets the length of the string (at some point), it's slightly more complicated than lattice parsing the tcfg - there's no topological order over the wfsa states and so you can't finish all the items [x,j] for j from left->right. best first with monotonic total scores (admissable heuristics) is an easy way to avoid generating the whole space; otherwise I can't think of a fixed order that would allow for alternative beaming. as we discussed, arbitrary predicates filtering candidate items can be added if exact best-first is too slow + +if the wfsa were just a single string t[0...n-1], then any time you have an item [i,j]X->a.b that means that there's some derivation in the tCFG of S =>* t[0...i-1]Xc => t[0....i-1]abc =>* t[0...j-1]bc , for a FSA, the analog is S =>* W(0,i)Xc => W(0,i)abc =>* W(0,i)W(i,j)bc where W(a,b) means any string in the wfsa language with a as the start state and b as the final state. + + +http://www.isi.edu/natural-language/teaching/cs544/cs544-huang-3-Earley.pdf + +http://www.isi.edu/~lhuang/dp-final.pdf (variation on stolcke 1995 prefix cost) + +http://acl.ldc.upenn.edu/P/P07/P07-1019.pdf - phrase based lazy priority queue "cube growing" descendants (p149) + + + + + +http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz + +http://www.icsi.berkeley.edu/~stolcke/papers/cl95/node10.html#SECTION00042000000000000000 + +a) An (unconstrained) Earley path, or simply path, is a sequence of Earley +states linked by prediction, scanning, or completion. For the purpose of +this definition, we allow scanning to operate in “generation mode,” i.e., all +states with terminals to the right of the dot can be scanned, not just those +matching the input. (For completed states, the predecessor state is defined +to be the complete state from the same state set contributing to the +completion.) +b) A path is said to be constrained by, or generate a string x if the terminals +immediately to the left of the dot in all scanned states, in sequence, form +the string x. +c) A path is complete if the last state on it matches the first, except that the +dot has moved to the end of the RHS. +d) We say that a path starts with nonterminal X if the first state on it is a +predicted statewith X on the LHS. +e) The length of a path is defined as the number of scanned states on it. + +Note that the definition of path length is somewhat counter-intuitive, but is motivated +by the fact that only scanned states correspond directly to input symbols. Thus, +the length of a path is always the same as the length of the input string it generates. + +A constrained path starting with the initial state contains a sequence of states from +state set 0 derived by repeated prediction, followed by a single state from set 1 produced +by scanning the first symbol, followed by a sequence of states produced by completion, +followed by a sequence of predicted states, followed by a state scanning the second +symbol, and so on. The significance of Earley paths is that they are in a one-to-one +correspondence with left-most derivations. + + +============= + +The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all +constrained paths of length that end in state X[k]->x.y + +b) The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all +paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] + +(forward,inner) [i.e. (outside,inside)?] + unchanged by scan (rule cost is paid up front when predicting) + +if X[k,i] -> x.Yz (a,b) and rule Y -> r (p) +then Y[i,i] -> .r (a',p) with a' += a*p + +if Y[j,i]->y. (a'',b'') and X[k,j]->r.Ys (a,b) +then X[k,i]->rY.s (a',b') with a' += a*b'', b' += b*b'' + +(this is summing over all derivations) + diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index f1179839..14342a43 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -18,15 +18,21 @@ const bool DEBUG_ORACLE=true; -boost::shared_ptr global_ff_registry; +//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle +//void register_feature_functions(); +//FFRegistry ff_registry; namespace { +void init_bleumodel() { + ff_registry.clear(); + ff_registry.Register(new FFFactory); +} + struct init_ff { init_ff() { - global_ff_registry.reset(new FFRegistry); - global_ff_registry->Register(new FFFactory); + init_bleumodel(); } }; -init_ff reg; +//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead. } using namespace std; @@ -146,6 +152,8 @@ struct oracle_directions { } bool verbose() const { return oracle.verbose; } void Run() { + init_bleumodel(); +// register_feature_functions(); AddPrimaryAndRandomDirections(); AddOracleDirections(); compress_similar(directions,max_similarity,&cerr,true,verbose()); -- cgit v1.2.3