| author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 19:21:02 +0000 |
|---|---|---|
| committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 19:21:02 +0000 |
| commit | 52a8d49e81c14b6f7ed3afb5bdb50b17391995a8 (patch) | |
| tree | 2cd84e2840204de6fbfaa70be5b2818b9c805b51 /decoder/ff_lm.cc | |
| parent | 71b39bcf60182d1686966db34225a670d13e3594 (diff) | |
actually use -n feature_name in LanguageModel. FF factory usage facility, FF feature ids facility (not used yet)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@186 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/ff_lm.cc')
-rw-r--r-- | decoder/ff_lm.cc | 97 |
1 file changed, 59 insertions, 38 deletions
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index e6f7912e..9e6f02b7 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,7 @@
+char const* usage_name="LanguageModel";
+char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
+char const* usage_verbose="-n determines the name of the feature (and its weight). -o defaults to 3. -m defaults to effectively infinite, otherwise says what order lm probs to use (up to). you could use -o > -m but that would be wasteful. -o < -m means some ngrams are scored longer (whenever a word is inserted by a rule next to a variable) than the state would ordinarily allow. NOTE: multiple LanguageModel features are allowed, but they will wastefully duplicate state, except in the special case of -o 1 (which uses no state). subsequent references to the same a.lm.gz. unless they specify -m, will reuse the same SRI LM in memory; this means that the -m used in the first load of a.lm.gz will take effect.";
+
 //TODO: backoff wordclasses for named entity xltns, esp. numbers. e.g. digits -> @. idealy rule features would specify replacement lm tokens/classes
 
 //TODO: extra int in state to hold "GAP" token is not needed. if there are less than (N-1) words, then null terminate the e.g. left words. however, this would mean treating gapless items differently. not worth the potential bugs right now.
@@ -38,7 +42,12 @@ using namespace std;
 
-// intend to have a 0-state prelm-pass heuristic LM that is better than 1gram (like how estimated_features are lower order estimates). NgramShare will keep track of all loaded lms and reuse them.
+string LanguageModel::usage(bool param,bool verbose) {
+  return usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
+}
+
+
+// NgramShare will keep track of all loaded lms and reuse them.
 //TODO: ref counting by shared_ptr? for now, first one to load LM needs to stick around as long as all subsequent users.
 
 #include <boost/shared_ptr.hpp>
@@ -167,27 +176,31 @@ struct LMClient {
 };
 
 class LanguageModelImpl {
+  void init(int order) {
+    //all these used to be const members, but that has no performance implication, and now there's less duplication.
+    order_=order;
+    state_size_ = OrderToStateSize(order)-1;
+    unigram=(order<=1);
+    floor_=-100;
+    kSTART = TD::Convert("<s>");
+    kSTOP = TD::Convert("</s>");
+    kUNKNOWN = TD::Convert("<unk>");
+    kNONE = -1;
+    kSTAR = TD::Convert("<{STAR}>");
+  }
+
 public:
-  explicit LanguageModelImpl(int order) :
-      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
-      floor_(-100.0),
-      kSTART(TD::Convert("<s>")),
-      kSTOP(TD::Convert("</s>")),
-      kUNKNOWN(TD::Convert("<unk>")),
-      kNONE(-1),
-      kSTAR(TD::Convert("<{STAR}>"))
-      , unigram(order<=1) {}
+  explicit LanguageModelImpl(int order) : ngram_(*TD::dict_, order)
+  {
+    init(order);
+  }
+
+  //TODO: show that unigram special case (0 state) computes what it should.
 
-  LanguageModelImpl(int order, const string& f) :
-      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
-      floor_(-100.0),
-      kSTART(TD::Convert("<s>")),
-      kSTOP(TD::Convert("</s>")),
-      kUNKNOWN(TD::Convert("<unk>")),
-      kNONE(-1),
-      kSTAR(TD::Convert("<{STAR}>"))
-      , unigram(order<=1)
+  LanguageModelImpl(int order, const string& f, int load_order=0) :
+      ngram_(*TD::dict_, load_order ? load_order : order)
   {
+    init(order);
     File file(f.c_str(), "r", 0);
     assert(file);
     cerr << "Reading " << order_ << "-gram LM from " << f << endl;
@@ -407,16 +420,16 @@ public:
 protected:
   Ngram ngram_;
   vector<WordID> buffer_;
-  const int order_;
-  const int state_size_;
-  const double floor_;
+  int order_;
+  int state_size_;
+  double floor_;
 public:
-  const WordID kSTART;
-  const WordID kSTOP;
-  const WordID kUNKNOWN;
-  const WordID kNONE;
-  const WordID kSTAR;
-  const bool unigram;
+  WordID kSTART;
+  WordID kSTOP;
+  WordID kUNKNOWN;
+  WordID kNONE;
+  WordID kSTAR;
+  bool unigram;
 };
 
 struct ClientLMI : public LanguageModelImpl
@@ -436,32 +449,33 @@ struct ReuseLMI : public LanguageModelImpl
 {
   ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) {}
-  virtual double WordProb(int word, int* context) {
+  double WordProb(int word, int* context) {
     return ng->wordProb(word, (VocabIndex*)context);
   }
 protected:
   Ngram *ng;
 };
 
-LanguageModelImpl *make_lm_impl(int order, string const& f)
+LanguageModelImpl *make_lm_impl(int order, string const& f, int load_order)
 {
   if (f.find("lm://") == 0) {
     return new ClientLMI(order,f.substr(5));
-  } else if (ngs.have(f)) {
+  } else if (load_order==0 && ngs.have(f)) {
     cerr<<"Reusing already loaded Ngram LM: "<<f<<endl;
     return new ReuseLMI(order,ngs.get(f));
   } else {
-    LanguageModelImpl *r=new LanguageModelImpl(order,f);
+    LanguageModelImpl *r=new LanguageModelImpl(order,f,load_order);
     ngs.add(f,r->get_lm());
     return r;
   }
 }
 
-bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename)
+bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename, int &load_order)
 {
   vector<string> const& argv=SplitOnWhitespace(in);
   featurename="LanguageModel";
   order=3;
+  load_order=0;
 #define LMSPEC_NEXTARG if (i==argv.end()) { \
     cerr << "Missing argument for "<<*last<<". "; goto usage; \
     } else { ++i; }
@@ -477,6 +491,9 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string
       case 'n':
        LMSPEC_NEXTARG; featurename=*i;
        break;
+      case 'm':
+        LMSPEC_NEXTARG; load_order=lexical_cast<int>(*i);
+        break;
 #undef LMSPEC_NEXTARG
      default:
      fail:
@@ -495,18 +512,22 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string
   if (order > 0 && !filename.empty())
     return true;
 usage:
-  cerr<<"LanguageModel specification should be: [-o order>0] [-n featurename] filename"<<endl<<" you provided: "<<in<<endl;
+  cerr<<usage_name<<" specification should be: "<<usage_short<<"; you provided: "<<in<<usage_verbose<<endl;
   return false;
 }
 
 LanguageModel::LanguageModel(const string& param) {
-  int order;
+  int order,load_order;
   string featurename,filename;
-  if (!parse_lmspec(param,order,featurename,filename))
+  if (!parse_lmspec(param,order,featurename,filename,load_order))
    abort();
-  fid_=FD::Convert("LanguageModel");
-  pimpl_ = make_lm_impl(order,filename);
+  cerr<<"LM feature name: "<<featurename<<" from file "<<filename<<" order "<<order;
+  if (load_order)
+    cerr<<" loading LM as order "<<load_order;
+  cerr<<endl;
+  fid_=FD::Convert(featurename);
+  pimpl_ = make_lm_impl(order,filename,load_order);
   //TODO: see if it's actually possible to set order_ later to mutate an already used FF for e.g. multipass. comment in ff.h says only to change state size in constructor. clone instead? differently -n named ones from same lm filename are already possible, so no urgency.
   SetStateSize(LanguageModelImpl::OrderToStateSize(order));
 }
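
To make the new options concrete, here is a minimal usage sketch (not part of the commit) of feature specifications that the updated parse_lmspec accepts. The header name ff_lm.h, the file name srilm.gz, and the feature names LM_big and LM_unigram are assumptions for illustration; the -n, -o, and -m flags and the reuse behavior come from the usage strings and make_lm_impl changes above.

```cpp
// Minimal sketch, assuming it is compiled inside the decoder tree where
// ff_lm.h declares LanguageModel; "srilm.gz" is a placeholder SRI LM file.
#include "ff_lm.h"

void example_lm_features() {
  // Default behavior: 3-gram state, feature (and weight) named "LanguageModel".
  LanguageModel lm_default("srilm.gz");

  // -n names the feature and its weight; -m 5 loads the file as a 5-gram
  // while -o 3 keeps only 3-gram state during search.
  LanguageModel lm_big("-o 3 -m 5 -n LM_big srilm.gz");

  // A second feature over the same file gives no -m, so make_lm_impl reuses
  // the already loaded SRI LM; -o 1 is the special case that keeps no state.
  LanguageModel lm_unigram("-o 1 -n LM_unigram srilm.gz");
}
```

Because each spec can carry its own -n, the differently named features above get separate weights even though they may share one LM in memory, which is exactly what the commit message's "-n feature_name" refers to.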