From 52a8d49e81c14b6f7ed3afb5bdb50b17391995a8 Mon Sep 17 00:00:00 2001 From: graehl Date: Thu, 8 Jul 2010 19:21:02 +0000 Subject: actually use -n feature_name in LanguageModel. FF factory usage facility, FF feature ids facility (not used yet) git-svn-id: https://ws10smt.googlecode.com/svn/trunk@186 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/ff.cc | 13 +++++++ decoder/ff.h | 13 ++++++- decoder/ff_factory.cc | 12 ++++++ decoder/ff_factory.h | 8 ++++ decoder/ff_lm.cc | 97 ++++++++++++++++++++++++++++++------------------- decoder/ff_lm.h | 1 + decoder/sparse_vector.h | 4 +- 7 files changed, 107 insertions(+), 41 deletions(-) (limited to 'decoder') diff --git a/decoder/ff.cc b/decoder/ff.cc index 261e9a17..73dbbdc9 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -16,6 +16,19 @@ void FeatureFunction::FinalTraversalFeatures(const void* ant_state, (void) features; } +string FeatureFunction::usage_helper(std::string const& name,std::string const& params,std::string const& details,bool sp,bool sd) { + string r=name; + if (sp) { + r+=": "; + r+=params; + } + if (sd) { + r+="\n"; + r+=details; + } + return r; +} + // Hiero and Joshua use log_10(e) as the value, so I do to WordPenalty::WordPenalty(const string& param) : fid_(FD::Convert("WordPenalty")), diff --git a/decoder/ff.h b/decoder/ff.h index 630b3208..c6c9cf8f 100644 --- a/decoder/ff.h +++ b/decoder/ff.h @@ -19,6 +19,17 @@ class FeatureFunction { explicit FeatureFunction(int state_size) : state_size_(state_size) {} virtual ~FeatureFunction(); + // override this. not virtual because we want to expose this to factory template for help before creating a FF + static std::string usage(bool show_params,bool show_details) { + return usage_helper("FIXME_feature_needs_name","[no parameters]","[no documentation yet]",show_params,show_details); + } + + static std::string usage_helper(std::string const& name,std::string const& params,std::string const& details,bool show_params,bool show_details); + +public: + + typedef std::vector Features; + virtual Features features() { return Features(); } // returns the number of bytes of context that this feature function will // (maximally) use. By default, 0 ("stateless" models in Hiero/Joshua). // NOTE: this value is fixed for the instance of your class, you cannot @@ -144,7 +155,7 @@ class ModelSet { bool empty() const { return models_.empty(); } private: std::vector models_; - std::vector weights_; + std::vector weights_; int state_size_; std::vector model_state_pos_; }; diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index 1854e0bb..d66cd883 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -14,6 +14,13 @@ void FFRegistry::DisplayList() const { } } +string FFRegistry::usage(string const& ffname,bool params,bool verbose) const { + map >::const_iterator it = reg_.find(ffname); + return it == reg_.end() + ? "Unknown feature " + ffname + : it->second->usage(params,verbose); +} + shared_ptr FFRegistry::Create(const string& ffname, const string& param) const { map >::const_iterator it = reg_.find(ffname); shared_ptr res; @@ -33,3 +40,8 @@ void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { reg_[ffname].reset(factory); } + +void FFRegistry::Register(FFFactoryBase* factory) +{ + Register(factory->usage(false,false),factory); +} diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index bc586567..75911f38 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -17,8 +17,10 @@ class FFRegistry { friend class FFFactoryBase; public: boost::shared_ptr Create(const std::string& ffname, const std::string& param) const; + std::string usage(std::string const& ffname,bool params=true,bool verbose=true) const; void DisplayList() const; void Register(const std::string& ffname, FFFactoryBase* factory); + void Register(FFFactoryBase* factory); private: FFRegistry() {} std::map > reg_; @@ -27,6 +29,7 @@ class FFRegistry { struct FFFactoryBase { virtual ~FFFactoryBase(); virtual boost::shared_ptr Create(const std::string& param) const = 0; + virtual std::string usage(bool params,bool verbose) const = 0; }; template @@ -34,6 +37,11 @@ class FFFactory : public FFFactoryBase { boost::shared_ptr Create(const std::string& param) const { return boost::shared_ptr(new FF(param)); } + // called with false,false just gives feature name + virtual std::string usage(bool params,bool verbose) const { + return FF::usage(params,verbose); + } + }; #endif diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index e6f7912e..9e6f02b7 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -1,3 +1,7 @@ +char const* usage_name="LanguageModel"; +char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]"; +char const* usage_verbose="-n determines the name of the feature (and its weight). -o defaults to 3. -m defaults to effectively infinite, otherwise says what order lm probs to use (up to). you could use -o > -m but that would be wasteful. -o < -m means some ngrams are scored longer (whenever a word is inserted by a rule next to a variable) than the state would ordinarily allow. NOTE: multiple LanguageModel features are allowed, but they will wastefully duplicate state, except in the special case of -o 1 (which uses no state). subsequent references to the same a.lm.gz. unless they specify -m, will reuse the same SRI LM in memory; this means that the -m used in the first load of a.lm.gz will take effect."; + //TODO: backoff wordclasses for named entity xltns, esp. numbers. e.g. digits -> @. idealy rule features would specify replacement lm tokens/classes //TODO: extra int in state to hold "GAP" token is not needed. if there are less than (N-1) words, then null terminate the e.g. left words. however, this would mean treating gapless items differently. not worth the potential bugs right now. @@ -38,7 +42,12 @@ using namespace std; -// intend to have a 0-state prelm-pass heuristic LM that is better than 1gram (like how estimated_features are lower order estimates). NgramShare will keep track of all loaded lms and reuse them. +string LanguageModel::usage(bool param,bool verbose) { + return usage_helper(usage_name,usage_short,usage_verbose,param,verbose); +} + + +// NgramShare will keep track of all loaded lms and reuse them. //TODO: ref counting by shared_ptr? for now, first one to load LM needs to stick around as long as all subsequent users. #include @@ -167,27 +176,31 @@ struct LMClient { }; class LanguageModelImpl { + void init(int order) { + //all these used to be const members, but that has no performance implication, and now there's less duplication. + order_=order; + state_size_ = OrderToStateSize(order)-1; + unigram=(order<=1); + floor_=-100; + kSTART = TD::Convert(""); + kSTOP = TD::Convert(""); + kUNKNOWN = TD::Convert(""); + kNONE = -1; + kSTAR = TD::Convert("<{STAR}>"); + } + public: - explicit LanguageModelImpl(int order) : - ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - kSTART(TD::Convert("")), - kSTOP(TD::Convert("")), - kUNKNOWN(TD::Convert("")), - kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) - , unigram(order<=1) {} + explicit LanguageModelImpl(int order) : ngram_(*TD::dict_, order) + { + init(order); + } + + //TODO: show that unigram special case (0 state) computes what it should. - LanguageModelImpl(int order, const string& f) : - ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - kSTART(TD::Convert("")), - kSTOP(TD::Convert("")), - kUNKNOWN(TD::Convert("")), - kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) - , unigram(order<=1) + LanguageModelImpl(int order, const string& f, int load_order=0) : + ngram_(*TD::dict_, load_order ? load_order : order) { + init(order); File file(f.c_str(), "r", 0); assert(file); cerr << "Reading " << order_ << "-gram LM from " << f << endl; @@ -407,16 +420,16 @@ public: protected: Ngram ngram_; vector buffer_; - const int order_; - const int state_size_; - const double floor_; + int order_; + int state_size_; + double floor_; public: - const WordID kSTART; - const WordID kSTOP; - const WordID kUNKNOWN; - const WordID kNONE; - const WordID kSTAR; - const bool unigram; + WordID kSTART; + WordID kSTOP; + WordID kUNKNOWN; + WordID kNONE; + WordID kSTAR; + bool unigram; }; struct ClientLMI : public LanguageModelImpl @@ -436,32 +449,33 @@ struct ReuseLMI : public LanguageModelImpl { ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) {} - virtual double WordProb(int word, int* context) { + double WordProb(int word, int* context) { return ng->wordProb(word, (VocabIndex*)context); } protected: Ngram *ng; }; -LanguageModelImpl *make_lm_impl(int order, string const& f) +LanguageModelImpl *make_lm_impl(int order, string const& f, int load_order) { if (f.find("lm://") == 0) { return new ClientLMI(order,f.substr(5)); - } else if (ngs.have(f)) { + } else if (load_order==0 && ngs.have(f)) { cerr<<"Reusing already loaded Ngram LM: "<get_lm()); return r; } } -bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename) +bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename, int &load_order) { vector const& argv=SplitOnWhitespace(in); featurename="LanguageModel"; order=3; + load_order=0; #define LMSPEC_NEXTARG if (i==argv.end()) { \ cerr << "Missing argument for "<<*last<<". "; goto usage; \ } else { ++i; } @@ -477,6 +491,9 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string case 'n': LMSPEC_NEXTARG; featurename=*i; break; + case 'm': + LMSPEC_NEXTARG; load_order=lexical_cast(*i); + break; #undef LMSPEC_NEXTARG default: fail: @@ -495,18 +512,22 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string if (order > 0 && !filename.empty()) return true; usage: - cerr<<"LanguageModel specification should be: [-o order>0] [-n featurename] filename"<* features) const; std::string DebugStateToString(const void* state) const; + static std::string usage(bool param,bool verbose); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index 7794fd5e..be91f324 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -19,9 +19,9 @@ public: typedef typename std::map::const_iterator const_iterator; SparseVector() {} explicit SparseVector(std::vector const& v) { - MapType::iterator p=values_.end(); + typename MapType::iterator p=values_.begin(); for (unsigned i=0;i