diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 19:21:02 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 19:21:02 +0000 |
commit | 52a8d49e81c14b6f7ed3afb5bdb50b17391995a8 (patch) | |
tree | 2cd84e2840204de6fbfaa70be5b2818b9c805b51 | |
parent | 71b39bcf60182d1686966db34225a670d13e3594 (diff) |
actually use -n feature_name in LanguageModel. FF factory usage facility, FF feature ids facility (not used yet)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@186 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/ff.cc | 13 | ||||
-rw-r--r-- | decoder/ff.h | 13 | ||||
-rw-r--r-- | decoder/ff_factory.cc | 12 | ||||
-rw-r--r-- | decoder/ff_factory.h | 8 | ||||
-rw-r--r-- | decoder/ff_lm.cc | 97 | ||||
-rw-r--r-- | decoder/ff_lm.h | 1 | ||||
-rw-r--r-- | decoder/sparse_vector.h | 4 |
7 files changed, 107 insertions(+), 41 deletions(-)
diff --git a/decoder/ff.cc b/decoder/ff.cc index 261e9a17..73dbbdc9 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -16,6 +16,19 @@ void FeatureFunction::FinalTraversalFeatures(const void* ant_state, (void) features; } +string FeatureFunction::usage_helper(std::string const& name,std::string const& params,std::string const& details,bool sp,bool sd) { + string r=name; + if (sp) { + r+=": "; + r+=params; + } + if (sd) { + r+="\n"; + r+=details; + } + return r; +} + // Hiero and Joshua use log_10(e) as the value, so I do to WordPenalty::WordPenalty(const string& param) : fid_(FD::Convert("WordPenalty")), diff --git a/decoder/ff.h b/decoder/ff.h index 630b3208..c6c9cf8f 100644 --- a/decoder/ff.h +++ b/decoder/ff.h @@ -19,6 +19,17 @@ class FeatureFunction { explicit FeatureFunction(int state_size) : state_size_(state_size) {} virtual ~FeatureFunction(); + // override this. not virtual because we want to expose this to factory template for help before creating a FF + static std::string usage(bool show_params,bool show_details) { + return usage_helper("FIXME_feature_needs_name","[no parameters]","[no documentation yet]",show_params,show_details); + } + + static std::string usage_helper(std::string const& name,std::string const& params,std::string const& details,bool show_params,bool show_details); + +public: + + typedef std::vector<WordID> Features; + virtual Features features() { return Features(); } // returns the number of bytes of context that this feature function will // (maximally) use. By default, 0 ("stateless" models in Hiero/Joshua). 
// NOTE: this value is fixed for the instance of your class, you cannot @@ -144,7 +155,7 @@ class ModelSet { bool empty() const { return models_.empty(); } private: std::vector<const FeatureFunction*> models_; - std::vector<double> weights_; + std::vector<double> weights_; int state_size_; std::vector<int> model_state_pos_; }; diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index 1854e0bb..d66cd883 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -14,6 +14,13 @@ void FFRegistry::DisplayList() const { } } +string FFRegistry::usage(string const& ffname,bool params,bool verbose) const { + map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname); + return it == reg_.end() + ? "Unknown feature " + ffname + : it->second->usage(params,verbose); +} + shared_ptr<FeatureFunction> FFRegistry::Create(const string& ffname, const string& param) const { map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname); shared_ptr<FeatureFunction> res; @@ -33,3 +40,8 @@ void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { reg_[ffname].reset(factory); } + +void FFRegistry::Register(FFFactoryBase* factory) +{ + Register(factory->usage(false,false),factory); +} diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index bc586567..75911f38 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -17,8 +17,10 @@ class FFRegistry { friend class FFFactoryBase; public: boost::shared_ptr<FeatureFunction> Create(const std::string& ffname, const std::string& param) const; + std::string usage(std::string const& ffname,bool params=true,bool verbose=true) const; void DisplayList() const; void Register(const std::string& ffname, FFFactoryBase* factory); + void Register(FFFactoryBase* factory); private: FFRegistry() {} std::map<std::string, boost::shared_ptr<FFFactoryBase> > reg_; @@ -27,6 +29,7 @@ class FFRegistry { struct FFFactoryBase { virtual ~FFFactoryBase(); virtual 
boost::shared_ptr<FeatureFunction> Create(const std::string& param) const = 0; + virtual std::string usage(bool params,bool verbose) const = 0; }; template<class FF> @@ -34,6 +37,11 @@ class FFFactory : public FFFactoryBase { boost::shared_ptr<FeatureFunction> Create(const std::string& param) const { return boost::shared_ptr<FeatureFunction>(new FF(param)); } + // called with false,false just gives feature name + virtual std::string usage(bool params,bool verbose) const { + return FF::usage(params,verbose); + } + }; #endif diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index e6f7912e..9e6f02b7 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -1,3 +1,7 @@ +char const* usage_name="LanguageModel"; +char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]"; +char const* usage_verbose="-n determines the name of the feature (and its weight). -o defaults to 3. -m defaults to effectively infinite, otherwise says what order lm probs to use (up to). you could use -o > -m but that would be wasteful. -o < -m means some ngrams are scored longer (whenever a word is inserted by a rule next to a variable) than the state would ordinarily allow. NOTE: multiple LanguageModel features are allowed, but they will wastefully duplicate state, except in the special case of -o 1 (which uses no state). subsequent references to the same a.lm.gz. unless they specify -m, will reuse the same SRI LM in memory; this means that the -m used in the first load of a.lm.gz will take effect."; + //TODO: backoff wordclasses for named entity xltns, esp. numbers. e.g. digits -> @. idealy rule features would specify replacement lm tokens/classes //TODO: extra int in state to hold "GAP" token is not needed. if there are less than (N-1) words, then null terminate the e.g. left words. however, this would mean treating gapless items differently. not worth the potential bugs right now. 
@@ -38,7 +42,12 @@ using namespace std; -// intend to have a 0-state prelm-pass heuristic LM that is better than 1gram (like how estimated_features are lower order estimates). NgramShare will keep track of all loaded lms and reuse them. +string LanguageModel::usage(bool param,bool verbose) { + return usage_helper(usage_name,usage_short,usage_verbose,param,verbose); +} + + +// NgramShare will keep track of all loaded lms and reuse them. //TODO: ref counting by shared_ptr? for now, first one to load LM needs to stick around as long as all subsequent users. #include <boost/shared_ptr.hpp> @@ -167,27 +176,31 @@ struct LMClient { }; class LanguageModelImpl { + void init(int order) { + //all these used to be const members, but that has no performance implication, and now there's less duplication. + order_=order; + state_size_ = OrderToStateSize(order)-1; + unigram=(order<=1); + floor_=-100; + kSTART = TD::Convert("<s>"); + kSTOP = TD::Convert("</s>"); + kUNKNOWN = TD::Convert("<unk>"); + kNONE = -1; + kSTAR = TD::Convert("<{STAR}>"); + } + public: - explicit LanguageModelImpl(int order) : - ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - kSTART(TD::Convert("<s>")), - kSTOP(TD::Convert("</s>")), - kUNKNOWN(TD::Convert("<unk>")), - kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) - , unigram(order<=1) {} + explicit LanguageModelImpl(int order) : ngram_(*TD::dict_, order) + { + init(order); + } + + //TODO: show that unigram special case (0 state) computes what it should. - LanguageModelImpl(int order, const string& f) : - ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - kSTART(TD::Convert("<s>")), - kSTOP(TD::Convert("</s>")), - kUNKNOWN(TD::Convert("<unk>")), - kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) - , unigram(order<=1) + LanguageModelImpl(int order, const string& f, int load_order=0) : + ngram_(*TD::dict_, load_order ? 
load_order : order) { + init(order); File file(f.c_str(), "r", 0); assert(file); cerr << "Reading " << order_ << "-gram LM from " << f << endl; @@ -407,16 +420,16 @@ public: protected: Ngram ngram_; vector<WordID> buffer_; - const int order_; - const int state_size_; - const double floor_; + int order_; + int state_size_; + double floor_; public: - const WordID kSTART; - const WordID kSTOP; - const WordID kUNKNOWN; - const WordID kNONE; - const WordID kSTAR; - const bool unigram; + WordID kSTART; + WordID kSTOP; + WordID kUNKNOWN; + WordID kNONE; + WordID kSTAR; + bool unigram; }; struct ClientLMI : public LanguageModelImpl @@ -436,32 +449,33 @@ struct ReuseLMI : public LanguageModelImpl { ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) {} - virtual double WordProb(int word, int* context) { + double WordProb(int word, int* context) { return ng->wordProb(word, (VocabIndex*)context); } protected: Ngram *ng; }; -LanguageModelImpl *make_lm_impl(int order, string const& f) +LanguageModelImpl *make_lm_impl(int order, string const& f, int load_order) { if (f.find("lm://") == 0) { return new ClientLMI(order,f.substr(5)); - } else if (ngs.have(f)) { + } else if (load_order==0 && ngs.have(f)) { cerr<<"Reusing already loaded Ngram LM: "<<f<<endl; return new ReuseLMI(order,ngs.get(f)); } else { - LanguageModelImpl *r=new LanguageModelImpl(order,f); + LanguageModelImpl *r=new LanguageModelImpl(order,f,load_order); ngs.add(f,r->get_lm()); return r; } } -bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename) +bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename, int &load_order) { vector<string> const& argv=SplitOnWhitespace(in); featurename="LanguageModel"; order=3; + load_order=0; #define LMSPEC_NEXTARG if (i==argv.end()) { \ cerr << "Missing argument for "<<*last<<". 
"; goto usage; \ } else { ++i; } @@ -477,6 +491,9 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string case 'n': LMSPEC_NEXTARG; featurename=*i; break; + case 'm': + LMSPEC_NEXTARG; load_order=lexical_cast<int>(*i); + break; #undef LMSPEC_NEXTARG default: fail: @@ -495,18 +512,22 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string if (order > 0 && !filename.empty()) return true; usage: - cerr<<"LanguageModel specification should be: [-o order>0] [-n featurename] filename"<<endl<<" you provided: "<<in<<endl; + cerr<<usage_name<<" specification should be: "<<usage_short<<"; you provided: "<<in<<usage_verbose<<endl; return false; } LanguageModel::LanguageModel(const string& param) { - int order; + int order,load_order; string featurename,filename; - if (!parse_lmspec(param,order,featurename,filename)) + if (!parse_lmspec(param,order,featurename,filename,load_order)) abort(); - fid_=FD::Convert("LanguageModel"); - pimpl_ = make_lm_impl(order,filename); + cerr<<"LM feature name: "<<featurename<<" from file "<<filename<<" order "<<order; + if (load_order) + cerr<<" loading LM as order "<<load_order; + cerr<<endl; + fid_=FD::Convert(featurename); + pimpl_ = make_lm_impl(order,filename,load_order); //TODO: see if it's actually possible to set order_ later to mutate an already used FF for e.g. multipass. comment in ff.h says only to change state size in constructor. clone instead? differently -n named ones from same lm filename are already possible, so no urgency. 
SetStateSize(LanguageModelImpl::OrderToStateSize(order)); } diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h index 10a3e9a3..5ea41068 100644 --- a/decoder/ff_lm.h +++ b/decoder/ff_lm.h @@ -18,6 +18,7 @@ class LanguageModel : public FeatureFunction { virtual void FinalTraversalFeatures(const void* context, SparseVector<double>* features) const; std::string DebugStateToString(const void* state) const; + static std::string usage(bool param,bool verbose); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index 7794fd5e..be91f324 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -19,9 +19,9 @@ public: typedef typename std::map<int, T>::const_iterator const_iterator; SparseVector() {} explicit SparseVector(std::vector<T> const& v) { - MapType::iterator p=values_.end(); + typename MapType::iterator p=values_.begin(); for (unsigned i=0;i<v.size();++i) - p=values_.insert(p,MapType::value_type(i,v[i])); //faster + p=values_.insert(p,typename MapType::value_type(i,v[i])); //faster } const T operator[](int index) const { |