From 83654f58e1f5f5518ac0e30ad354eebda67fa774 Mon Sep 17 00:00:00 2001 From: graehl Date: Sat, 7 Aug 2010 00:07:45 +0000 Subject: dynamic fsa ff, factory for fsa and ff shares code, factory moved to ff_factory.cc git-svn-id: https://ws10smt.googlecode.com/svn/trunk@483 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/apply_fsa_models.h | 2 +- decoder/cdec.cc | 51 ++++++++++++++++++---- decoder/cdec_ff.cc | 54 ++++++++++++----------- decoder/do.tests.sh | 1 + decoder/ff.cc | 2 +- decoder/ff.h | 2 +- decoder/ff_factory.cc | 80 ++++++++++++++++++++-------------- decoder/ff_factory.h | 105 ++++++++++++++++++++++++++++++++++----------- decoder/ff_from_fsa.h | 8 +--- decoder/ff_fsa.h | 2 + decoder/ff_fsa_data.h | 11 ++++- decoder/ff_fsa_dynamic.h | 26 ++++++++--- decoder/ff_lm_fsa.h | 23 ++++------ decoder/ff_sample_fsa.h | 33 ++++++-------- decoder/oracle_bleu.h | 2 +- decoder/static_utoa.h | 56 +++++++++++++++++++++++- decoder/value_array.h | 4 +- 17 files changed, 318 insertions(+), 144 deletions(-) create mode 100755 decoder/do.tests.sh (limited to 'decoder') diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h index d22397e3..0a8615b5 100755 --- a/decoder/apply_fsa_models.h +++ b/decoder/apply_fsa_models.h @@ -1,7 +1,7 @@ #ifndef _APPLY_FSA_MODELS_H_ #define _APPLY_FSA_MODELS_H_ -//#include "ff_fsa_dynamic.h" +#include "ff_fsa_dynamic.h" struct FsaFeatureFunction; struct Hypergraph; diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 876dee18..e896a484 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -34,6 +34,7 @@ #include "exp_semiring.h" #include "sentence_metadata.h" #include "../vest/scorer.h" +#include "apply_fsa_models.h" using namespace std; using namespace std::tr1; @@ -69,15 +70,27 @@ shared_ptr make_ff(string const& ffp,bool verbose_feature_funct cerr << "Feature: " << ff; if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; else cerr << " (no config parameters)\n"; - shared_ptr pf = 
global_ff_registry->Create(ff, param); - if (!pf) - exit(1); + shared_ptr pf = ff_registry.Create(ff, param); + if (!pf) exit(1); int nbyte=pf->NumBytesContext(); if (verbose_feature_functions) cerr<<"State is "< make_fsa_ff(string const& ffp,bool verbose_feature_functions,char const* pre="") { + string ff, param; + SplitCommandAndParam(ffp, &ff, &param); + cerr << "FSA Feature: " << ff; + if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; + else cerr << " (no config parameters)\n"; + shared_ptr pf = fsa_ff_registry.Create(ff, param); + if (!pf) exit(1); + if (verbose_feature_functions) + cerr<<"State is "<state_bytes()<<" bytes for "< > Ds; @@ -106,6 +119,7 @@ void InitCommandLine(int argc, char** argv, OracleBleu &ob, po::variables_map* c ("warn_0_weight","Warn about any feature id that has a 0 weight (this is perfectly safe if you intend 0 weight, though)") ("no_freeze_feature_set,Z", "Do not freeze feature set after reading feature weights file") ("feature_function,F",po::value >()->composing(), "Additional feature function(s) (-L for list)") + ("fsa_feature_function",po::value >()->composing(), "Additional FSA feature function(s) (-L for list)") ("list_feature_functions,L","List available feature functions") ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") ("k_best,k",po::value(),"Extract the k best derivations") @@ -185,13 +199,15 @@ void InitCommandLine(int argc, char** argv, OracleBleu &ob, po::variables_map* c if (conf.count("list_feature_functions")) { cerr << "Available feature functions (specify with -F; describe with -u FeatureName):\n"; - global_ff_registry->DisplayList(); + ff_registry.DisplayList(); + cerr << "Available feature functions (specify with --fsa_feature_function):\n"; + fsa_ff_registry.DisplayList(); cerr << endl; exit(1); } if (conf.count("usage")) { - cout<usage(str("usage",conf),true,true)< +bool store_conf(po::variables_map const& conf,std::string const& name,V *v) { + if 
(conf.count(name)) { + *v=conf[name].as(); + return true; + } + return false; +} + + int main(int argc, char** argv) { - global_ff_registry.reset(new FFRegistry); register_feature_functions(); po::variables_map conf; OracleBleu oracle; @@ -441,7 +466,6 @@ int main(int argc, char** argv) { // set up additional scoring features vector > pffs,prelm_only_ffs; - vector late_ffs,prelm_ffs; if (conf.count("feature_function") > 0) { const vector& add_ffs = conf["feature_function"].as >(); @@ -454,7 +478,7 @@ int main(int argc, char** argv) { prelm_ffs.push_back(p); else cerr << "Excluding stateful feature from prelm pruning: "< 0) { @@ -465,6 +489,17 @@ int main(int argc, char** argv) { } } + vector > fsa_ffs; + vector fsa_names; + store_conf(conf,"fsa_feature_function",&fsa_names); + if (fsa_ffs.size()>1) { + //FIXME: support N fsa ffs. + cerr<<"Only the first fsa FF will be used (FIXME).\n"; + fsa_names.resize(1); + for (int i=0;i global_ff_registry; +#include "ff_register.h" void register_feature_functions() { - global_ff_registry->Register(new FFFactory); - global_ff_registry->Register(new FFFactory >); // same as LM but using fsa wrapper + RegisterFF(); + RegisterFsaImpl(true,false); // same as LM but using fsa wrapper + + RegisterFF(); + RegisterFF(); + RegisterFF(); + RegisterFF(); - global_ff_registry->Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa - global_ff_registry->Register(new FFFactory >); - global_ff_registry->Register(new FFFactory >); + //TODO: worthless example target FSA ffs. remove later + ff_registry.Register(new FFFactory); // same as WordPenalty, but implemented using ff_fsa + ff_registry.Register(new FFFactory >); + ff_registry.Register(new FFFactory >); + ff_registry.Register(new FFFactory >); - //TODO: use for all features the new Register which requires usage(...) 
+ //TODO: use for all features the new Register which requires static FF::usage(false,false) give name #ifdef HAVE_RANDLM - global_ff_registry->Register("RandLM", new FFFactory); + ff_registry.Register("RandLM", new FFFactory); #endif - global_ff_registry->Register(new FFFactory); - global_ff_registry->Register(new FFFactory); - global_ff_registry->Register(new FFFactory); - global_ff_registry->Register(new FFFactory); - global_ff_registry->Register("RuleShape", new FFFactory); - global_ff_registry->Register("RelativeSentencePosition", new FFFactory); - global_ff_registry->Register("Model2BinaryFeatures", new FFFactory); - global_ff_registry->Register("MarkovJump", new FFFactory); - global_ff_registry->Register("MarkovJumpFClass", new FFFactory); - global_ff_registry->Register("SourcePOSBigram", new FFFactory); - global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory); - global_ff_registry->Register("AlignerResults", new FFFactory); - global_ff_registry->Register("CSplit_BasicFeatures", new FFFactory); - global_ff_registry->Register("CSplit_ReverseCharLM", new FFFactory); - global_ff_registry->Register("Tagger_BigramIdentity", new FFFactory); - global_ff_registry->Register("LexicalPairIdentity", new FFFactory); + ff_registry.Register("RuleShape", new FFFactory); + ff_registry.Register("RelativeSentencePosition", new FFFactory); + ff_registry.Register("Model2BinaryFeatures", new FFFactory); + ff_registry.Register("MarkovJump", new FFFactory); + ff_registry.Register("MarkovJumpFClass", new FFFactory); + ff_registry.Register("SourcePOSBigram", new FFFactory); + ff_registry.Register("BlunsomSynchronousParseHack", new FFFactory); + ff_registry.Register("AlignerResults", new FFFactory); + ff_registry.Register("CSplit_BasicFeatures", new FFFactory); + ff_registry.Register("CSplit_ReverseCharLM", new FFFactory); + ff_registry.Register("Tagger_BigramIdentity", new FFFactory); + ff_registry.Register("LexicalPairIdentity", new FFFactory); + } + diff --git 
a/decoder/do.tests.sh b/decoder/do.tests.sh new file mode 100755 index 00000000..b3ddeb18 --- /dev/null +++ b/decoder/do.tests.sh @@ -0,0 +1 @@ +for f in *_test; do ./$f; done diff --git a/decoder/ff.cc b/decoder/ff.cc index 28620bab..a23f1655 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -43,7 +43,7 @@ Features ModelSet::all_features(std::ostream *warn,bool warn0) { FFM ff_from; for (unsigned i=0;i Features; // set of features ids // FinalTraversalFeatures(...) class FeatureFunction { public: - std::string name; // set by FF factory using usage() + std::string name_; // set by FF factory using usage() bool debug_; // also set by FF factory checking param for immediate initial "debug" bool debug() const { return debug_; } FeatureFunction() : state_size_() {} diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index 88991fbf..b3aeeac1 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -6,49 +6,42 @@ using boost::shared_ptr; using namespace std; -FFFactoryBase::~FFFactoryBase() {} +UntypedFactory::~UntypedFactory() { } -void FFRegistry::DisplayList() const { - for (map >::const_iterator it = reg_.begin(); +namespace { +std::string const& debug_pre="debug"; +} + +void UntypedFactoryRegistry::clear() { + reg_.clear(); +} + +bool UntypedFactoryRegistry::parse_debug(std::string & param) { + int pl=debug_pre.size(); + bool space=false; + std::string p=param; + bool debug=match_begin(p,debug_pre)&& + (p.size()==pl || (space=(p[pl]==' '))); + if (debug) + p.erase(0,debug_pre.size()+space); + return debug; +} + +void UntypedFactoryRegistry::DisplayList() const { + for (Factmap::const_iterator it = reg_.begin(); it != reg_.end(); ++it) { cerr << " " << it->first << endl; } } -string FFRegistry::usage(string const& ffname,bool params,bool verbose) const { - map >::const_iterator it = reg_.find(ffname); +string UntypedFactoryRegistry::usage(string const& ffname,bool params,bool verbose) const { + Factmap::const_iterator it = reg_.find(ffname); return 
it == reg_.end() ? "Unknown feature " + ffname : it->second->usage(params,verbose); } -namespace { -std::string const& debug_pre="debug"; -} - -shared_ptr FFRegistry::Create(const string& ffname, const string& param) const { - map >::const_iterator it = reg_.find(ffname); - shared_ptr res; - if (it == reg_.end()) { - cerr << "I don't know how to create feature " << ffname << endl; - } else { - int pl=debug_pre.size(); - bool space=false; - std::string p=param; - bool debug=match_begin(p,debug_pre)&& - (p.size()==pl || (space=(p[pl]==' '))); - if (debug) { - p.erase(0,debug_pre.size()+space); - cerr<<"debug enabled for "<second->Create(p); - res->name=ffname; - res->debug_=debug; - } - return res; -} - -void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { +void UntypedFactoryRegistry::Register(const string& ffname, UntypedFactory* factory) { if (reg_.find(ffname) != reg_.end()) { cerr << "Duplicate registration of FeatureFunction with name " << ffname << "!\n"; abort(); @@ -57,7 +50,28 @@ void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { } -void FFRegistry::Register(FFFactoryBase* factory) +void UntypedFactoryRegistry::Register(UntypedFactory* factory) { Register(factory->usage(false,false),factory); } + + +/*FIXME: I want these to go in ff_factory.cc, but extern etc. 
isn't working right: + ../decoder/libcdec.a(ff_factory.o): In function `~UntypedFactory': +/nfs/topaz/graehl/ws10smt/decoder/ff_factory.cc:9: multiple definition of `global_ff_registry' +mr_vest_generate_mapper_input.o:/nfs/topaz/graehl/ws10smt/vest/mr_vest_generate_mapper_input.cc:307: first defined here +*/ +FsaFFRegistry fsa_ff_registry; +FFRegistry ff_registry; + +/* +namespace { +struct null_deleter +{ + template + void operator()(F const& f) const { } +}; + +boost::shared_ptr global_fsa_ff_registry(&fsa_ff_registry,null_deleter()); +boost::shared_ptr global_ff_registry(&ff_registry,null_deleter()); +*/ diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index 93681c5e..e5821d44 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -1,6 +1,10 @@ #ifndef _FF_FACTORY_H_ #define _FF_FACTORY_H_ +// FsaF* vs F* (regular ff/factory). + +//TODO: use http://www.boost.org/doc/libs/1_43_0/libs/functional/factory/doc/html/index.html ? + /*TODO: register state identity separately from feature function identity? as * in: string registry for name of state somewhere, assert that same result is * computed by all users? or, we can just require that ff sharing same state @@ -8,49 +12,102 @@ * once. that's fine. */ -//TODO: use http://www.boost.org/doc/libs/1_43_0/libs/functional/factory/doc/html/index.html ? 
#include #include #include +#include #include class FeatureFunction; -class FFRegistry; -class FFFactoryBase; -extern boost::shared_ptr global_ff_registry; -class FFRegistry { - friend int main(int argc, char** argv); - friend class FFFactoryBase; - public: - boost::shared_ptr Create(const std::string& ffname, const std::string& param) const; - std::string usage(std::string const& ffname,bool params=true,bool verbose=true) const; - void DisplayList() const; - void Register(const std::string& ffname, FFFactoryBase* factory); - void Register(FFFactoryBase* factory); - FFRegistry() {} - private: - std::map > reg_; -}; +class FsaFeatureFunction; -struct FFFactoryBase { - virtual ~FFFactoryBase(); - virtual boost::shared_ptr Create(const std::string& param) const = 0; + +struct UntypedFactory { + virtual ~UntypedFactory(); virtual std::string usage(bool params,bool verbose) const = 0; }; +template +struct FactoryBase : public UntypedFactory { + typedef FF F; + typedef boost::shared_ptr FP; + + virtual FP Create(std::string param) const = 0; +}; + +/* see cdec_ff.cc for example usage: this create concrete factories to be registered */ template -class FFFactory : public FFFactoryBase { - boost::shared_ptr Create(const std::string& param) const { - return boost::shared_ptr(new FF(param)); +struct FFFactory : public FactoryBase { + FP Create(std::string param) const { + return FP(new FF(param)); } - // called with false,false just gives feature name virtual std::string usage(bool params,bool verbose) const { return FF::usage(params,verbose); } +}; + +// same as above, but we didn't want to require a typedef e.g. 
Parent in FF class, and template typedef isn't available +template +struct FsaFactory : public FactoryBase { + FP Create(std::string param) const { + return FP(new FF(param)); + } + virtual std::string usage(bool params,bool verbose) const { + return FF::usage(params,verbose); + } +}; + +struct UntypedFactoryRegistry { + std::string usage(std::string const& ffname,bool params=true,bool verbose=true) const; + void DisplayList() const; + void Register(const std::string& ffname, UntypedFactory* factory); + void Register(UntypedFactory* factory); + void clear(); + static bool parse_debug(std::string & param_in_out); // returns true iff param starts w/ debug (and remove that prefix from param) + protected: + typedef boost::shared_ptr FactoryP; + typedef std::map Factmap; + Factmap reg_; + friend int main(int argc, char** argv); + friend class UntypedFactory; }; + + +template +struct FactoryRegistry : public UntypedFactoryRegistry { + typedef Feat F; + typedef boost::shared_ptr FP; + typedef FactoryBase FB; + + FP Create(const std::string& ffname, std::string param) const { + using namespace std; + Factmap::const_iterator it = reg_.find(ffname); + if (it == reg_.end()) + throw std::runtime_error("I don't know how to create feature "+ffname); + bool debug=parse_debug(param); + if (debug) + cerr<<"debug enabled for "<(*it->second).Create(param); + res->name_ = ffname; + res->debug_ = debug; + return res; + } +}; + +typedef FactoryRegistry FFRegistry; +typedef FactoryRegistry FsaFFRegistry; + +extern FsaFFRegistry fsa_ff_registry; +inline FsaFFRegistry & global_fsa_ff_registry() { return fsa_ff_registry; } +extern FFRegistry ff_registry; +inline FFRegistry & global_ff_registry() { return ff_registry; } +/* +extern boost::shared_ptr global_fsa_ff_registry; +extern boost::shared_ptr global_ff_registry; +*/ #endif diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h index 10ccfe6d..2c339aa8 100755 --- a/decoder/ff_from_fsa.h +++ b/decoder/ff_from_fsa.h @@ -18,11 +18,8 @@ 
uses guarantee about markov order=N to score ASAP encoding of state: if less than N-1 (ctxlen) words - either: - struct FF : public FsaImpl,FeatureFunctionFromFsa (more efficient) - - or: - struct FF : public FsaFeatureFunctionDynamic,FeatureFunctionFromFsa (code sharing, but double dynamic dispatch) + usage: + typedef FeatureFunctionFromFsa LanguageModelFromFsa; */ template @@ -271,7 +268,6 @@ private: } }; - #ifdef TEST_FSA # include "tdict.cc" # include "ff_sample_fsa.h" diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index 6c1294f8..e7877dd5 100755 --- a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -8,6 +8,8 @@ state is some fixed width byte array. could actually be a void *, WordID sequence, whatever. + TODO: specify Scan return code or feature value = -inf for failure state (e.g. for hard intersection with desired target lattice?) + TODO: maybe ff that wants to know about SentenceMetadata should store a ref to it permanently rather than get passed it for every operation. we're never decoding more than 1 sentence at once and it's annoying to pass it. same diff --git a/decoder/ff_fsa_data.h b/decoder/ff_fsa_data.h index 66d2cca8..3252c5ac 100755 --- a/decoder/ff_fsa_data.h +++ b/decoder/ff_fsa_data.h @@ -9,11 +9,19 @@ typedef ValueArray Bytes; -// stuff I see no reason to have virtual. +// stuff I see no reason to have virtual. but there's a diamond inheritance problem to solve now when type erasing the CRTP impl wrapper. virtual inheritance would slow things? 
struct FsaFeatureFunctionData { + //HACK for diamond inheritance (w/o costing performance) + FsaFeatureFunctionData *sync_to_; + + void sync() const { // call this if you modify any fields after your constructor is done + if (sync_to_) *sync_to_=*this; + } + FsaFeatureFunctionData(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : ssz(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) { debug_=true; + sync_to_=0; } std::string name_; @@ -65,6 +73,7 @@ protected: int ssz; // don't forget to set this. default 0 (it may depend on params of course) Bytes start,h_start; // start state and estimated-features (heuristic) start state. set these. default empty. Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "" for lm. + // this can be called instead or after constructor (also set bytes and end_phrase_) void set_state_bytes(int sb=0) { if (start.size()!=sb) start.resize(sb); if (h_start.size()!=sb) h_start.resize(sb); diff --git a/decoder/ff_fsa_dynamic.h b/decoder/ff_fsa_dynamic.h index 2703b305..2a26676d 100755 --- a/decoder/ff_fsa_dynamic.h +++ b/decoder/ff_fsa_dynamic.h @@ -2,13 +2,14 @@ #define FF_FSA_DYNAMIC_H struct SentenceMetadata; + #include "ff_fsa_data.h" #include "hg.h" // can't forward declare nested Hypergraph::Edge class #include - // the type-erased interface +//FIXME: diamond inheritance problem. make a copy of the fixed data? 
or else make the dynamic version not wrap but rather be templated CRTP base (yuck) struct FsaFeatureFunction : public FsaFeatureFunctionData { static const bool simple_phrase_score=false; virtual int markov_order() const = 0; @@ -25,10 +26,11 @@ struct FsaFeatureFunction : public FsaFeatureFunctionData { virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *accum) const = 0; virtual int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,Accum *accum) const { return 0; } - virtual std::string usage(bool param,bool verbose) const { + virtual std::string usage_v(bool param,bool verbose) const { return FeatureFunction::usage_helper("unnamed_dynamic_fsa_feature","","",param,verbose); } + virtual void print_state(std::ostream &o,void const*state) const { FsaFeatureFunctionData::print_state(o,state); } @@ -45,12 +47,12 @@ struct FsaFeatureFunction : public FsaFeatureFunctionData { // conforming to above interface, type erases FsaImpl // you might be wondering: why do this? 
answer: it's cool, and it means that the bottom-up ff over ff_fsa wrapper doesn't go through multiple layers of dynamic dispatch -// usage: struct My : public FsaFeatureFunctionDynamic +// usage: typedef FsaFeatureFunctionDynamic MyFsaDyn; template -struct FsaFeatureFunctionDynamic : public FsaFeatureFunction { +struct FsaFeatureFunctionDynamic : public FsaFeatureFunction,Impl { static const bool simple_phrase_score=Impl::simple_phrase_score; Impl& d() { return static_cast(*this); } - Impl const& d() { return static_cast(*this); } + Impl const& d() const { return static_cast(*this); } int markov_order() const { return d().markov_order(); } virtual void ScanAccum(SentenceMetadata const& smeta,Hypergraph::Edge const& edge, @@ -68,6 +70,7 @@ struct FsaFeatureFunctionDynamic : public FsaFeatureFunction { WordID const* i, WordID const* end, void const* state,Accum *a) const { return d().ScanPhraseAccumOnly(smeta,edge,i,end,state,a); + } virtual void *ScanPhraseAccumBounce(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i, WordID const* end,void *cs,void *ns,Accum *a) const { return d().ScanPhraseAccumBounce(smeta,edge,i,end,cs,ns,a); @@ -77,15 +80,26 @@ struct FsaFeatureFunctionDynamic : public FsaFeatureFunction { return d().early_score_words(smeta,edge,i,end,accum); } - virtual std::string usage(bool param,bool verbose) const { + static std::string usage(bool param,bool verbose) { + return Impl::usage(param,verbose); + } + + std::string usage_v(bool param,bool verbose) const { return Impl::usage(param,verbose); } virtual void print_state(std::ostream &o,void const*state) const { return d().print_state(o,state); } + + FsaFeatureFunctionDynamic(std::string const& param) : Impl(param) { + d().sync_to_=(FsaFeatureFunction*)this; + d().sync(); + } + }; + //TODO: combine 2 (or N) FsaFeatureFunction (type erased) diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index b95fde02..1c8ebdad 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h 
@@ -4,6 +4,9 @@ //FIXME: when FSA_LM_PHRASE 1, 3gram fsa has differences, especially with unk words, in about the 4th decimal digit (about .05%), compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0 // enabling for now - retest unigram+ more, solve above puzzle + +// some impls in ff_lm.cc + #define FSA_LM_PHRASE 1 #define FSA_LM_DEBUG 0 @@ -15,8 +18,8 @@ # define FSALMDBGnl(e) #endif +#include "ff_fsa.h" #include "ff_lm.h" -#include "ff_from_fsa.h" namespace { WordID empty_context=TD::none; @@ -49,17 +52,13 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase { #endif if (!ctxlen_) { Add(floored(pimpl_->WordProb(w,&empty_context)),a); - return; - } - //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name) - if (ctxlen_) { + } else { WordID ctx[ngram_order_]; //alloca if you don't have C99 state_copy(ctx,old_st); - ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies. 
+ ctx[ctxlen_]=TD::none; Featval p=floored(pimpl_->WordProb(w,ctx)); - FSALMDBG(de,"p("<l("<l("<ctx;--ctx_score_end) @@ -128,6 +124,5 @@ private: }; -typedef FeatureFunctionFromFsa LanguageModelFromFsa; #endif diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h index 9f44f1a4..40a32bae 100755 --- a/decoder/ff_sample_fsa.h +++ b/decoder/ff_sample_fsa.h @@ -29,7 +29,11 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase { typedef FeatureFunctionFromFsa WordPenaltyFromFsa; struct SameFirstLetter : public FsaFeatureFunctionBase { - SameFirstLetter(std::string const& param) : FsaFeatureFunctionBase(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost + SameFirstLetter(std::string const& param) : FsaFeatureFunctionBase(1,singleton_sentence("END")) + // 1 byte of state, scan final (single) symbol "END" to get final state cost + { + start[0]='a'; h_start[0]=0; Init(); + } int markov_order() const { return 1; } Featval Scan1(WordID w,void const* old_state,void *new_state) const { char cw=TD::Convert(w)[0]; @@ -41,12 +45,13 @@ struct SameFirstLetter : public FsaFeatureFunctionBase { o<<*(char const*)st; } static std::string usage(bool param,bool verbose) { - return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose); + return FeatureFunction::usage_helper("SameFirstLetter", + "[no args]", + "1 each time 2 consecutive words start with the same letter", + param,verbose); } }; - -// appears to be buggy right now: give it a bonus weight (+) struct LongerThanPrev : public FsaFeatureFunctionBase { typedef FsaFeatureFunctionBase Base; static std::string usage(bool param,bool verbose) { @@ -104,7 +109,10 @@ struct LongerThanPrev : public FsaFeatureFunctionBase { // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State) struct ShorterThanPrev : 
FsaTypedBase { - typedef FsaTypedBase Base; + ShorterThanPrev(std::string const& param) + : FsaTypedBase(-1,4,singleton_sentence(TD::se)) // start, h_start, end_phrase + // h_start estimate state: anything <4 chars is usually shorter than previous + { Init(); } static std::string usage(bool param,bool verbose) { return FeatureFunction::usage_helper( "ShorterThanPrev", @@ -112,28 +120,13 @@ struct ShorterThanPrev : FsaTypedBase { "stupid example stateful (bigram) feature: 1 per target word that's shorter than the previous word (end of sentence considered '')", param,verbose); } - static inline int wordlen(WordID w) { return std::strlen(TD::Convert(w)); } - ShorterThanPrev(std::string const& param) - : Base(-1,4,singleton_sentence(TD::se)) // start, h_start, end_phrase - // estimate: anything <4 chars is usually shorter than previous - { - Init(); - } - - -/* Featval ScanT1(WordID w,int prevlen,int &len) const; - // alternative to below: - */ - - // evil anti-google int & len out-param: Featval ScanT1(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,int prevlen,int &len) const { len=wordlen(w); return (lenCreate("BLEUModel",param.str()); + pff=ff_registry.Create("BLEUModel",param.str()); } bool is_null() const { diff --git a/decoder/static_utoa.h b/decoder/static_utoa.h index 0dbe111f..fe5f6d92 100755 --- a/decoder/static_utoa.h +++ b/decoder/static_utoa.h @@ -2,6 +2,9 @@ #define STATIC_UTOA_H #include "threadlocal.h" + + +#include #include #define DIGIT_LOOKUP_TABLE 0 @@ -24,8 +27,7 @@ inline char digit_to_char(int d) { #endif } - -// returns n in string [return,num); *num=0 yourself calling if you want a c_str +// returns n in string [return,num); *num=0 yourself before calling if you want a c_str inline char *utoa(char *num,unsigned n) { if ( !n ) { *--num='0'; @@ -59,5 +61,55 @@ inline char* append_utoa(char *to,unsigned n) { return to+ns; } +// so named to avoid gcc segfault when named itoa +inline char *itoa(char *p,int n) { + if 
(n<0) { + p=utoa(p,-n); // TODO: check that (unsigned)(-INT_MIN) == 0x80000000 (32-bit int) in 2s complement and not == 0 + *--p='-'; + return p; + } else + return utoa(p,n); +} + +inline char *static_itoa(int n) { + return itoa(utoa_buf+utoa_bufsizem1,n); +} + + +inline std::string utos(unsigned n) { + const int bufsz=20; + char buf[bufsz]; + char *end=buf+bufsz; + char *p=utoa(end,n); + return std::string(p,end); +} + +inline std::string itos(int n) { + const int bufsz=20; + char buf[bufsz]; + char *end=buf+bufsz; + char *p=itoa(end,n); + return std::string(p,end); +} + +#ifdef ITOA_SAMPLE +# include +# include +# include +using namespace std; + +int main(int argc,char *argv[]) { + printf("d U d U d U\n"); + for (int i=1;i