From 7520e730d47155bb05e1fcf2d69f1ed45b96e876 Mon Sep 17 00:00:00 2001 From: graehl Date: Mon, 26 Jul 2010 05:07:07 +0000 Subject: shorten fsa lm state using contextID git-svn-id: https://ws10smt.googlecode.com/svn/trunk@415 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/ff_lm.cc | 35 ++++++++++++++++++++++++++++++++--- decoder/ff_lm_fsa.h | 3 ++- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index a5f43867..e9c172da 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -1,3 +1,6 @@ +#define LM_FSA_SHORTEN_CONTEXT 1 +// seems to work great - just not sure if it actually speeds anything up + namespace { char const* usage_name="LanguageModel"; char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]"; @@ -235,6 +238,21 @@ class LanguageModelImpl { return ngram_.wordProb(word, (VocabIndex*)context); } + // may be shorter than actual null-terminated length. context must be null terminated. len is just to save effort for subclasses that don't support contextID + virtual int ContextSize(WordID const* context,int len) { + unsigned ret; + ngram_.contextID((VocabIndex*)context,ret); + return ret; + } + + void ShortenContext(WordID * context,int len) { + int slen=ContextSize(context,len); + while (len>slen) { + --len; + context[len]=TD::none; + } + } + /// NOT a negative logp, i.e. should be worse prob = more negative. that's what SRI wordProb returns, fortunately. inline double clamp(double logp) const { return logp < floor_ ? floor_ : logp; @@ -448,7 +466,9 @@ struct ClientLMI : public LanguageModelImpl virtual double WordProb(int word, WordID const* context) { return client_.wordProb(word, context); } - + virtual int ContextSize(WordID const* const, int len) { + return len; + } protected: LMClient client_; }; @@ -460,6 +480,11 @@ struct ReuseLMI : public LanguageModelImpl double WordProb(int word, WordID const* context) { return ng->wordProb(word, (VocabIndex*)context); } + virtual int ContextSize(WordID const* context, int len) { + unsigned ret; + ng->contextID((VocabIndex*)context,ret); + return ret; + } protected: Ngram *ng; }; @@ -553,10 +578,11 @@ void LanguageModelFsa::set_ngram_order(int i) { ngram_order_=i; ctxlen_=i-1; set_state_bytes(ctxlen_*sizeof(WordID)); - set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - bu WordID *ss=(WordID*)start.begin(); + WordID *ss=(WordID*)start.begin(); WordID *hs=(WordID*)h_start.begin(); -t for compat. with non-fsa version, leave it if (ctxlen_) { // avoid segfault in case of unigram lm (0 state) + set_end_phrase(TD::se); +// se is pretty boring in unigram case, just adds constant prob. check that this is what we want ss[0]=TD::ss; // start-sentence context (length 1) hs[0]=TD::none; // empty context for (int i=1;iShortenContext(nst,ctxlen_); +#endif } else { p=pimpl_->WordProb(w,&empty_context); } diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h index 6a4e8201..55a8b497 100755 --- a/decoder/ff_lm_fsa.h +++ b/decoder/ff_lm_fsa.h @@ -1,7 +1,8 @@ #ifndef FF_LM_FSA_H #define FF_LM_FSA_H -//TODO: use SRI LM::contextBOW, LM::contextID to shorten state +//TODO: use SRI LM::contextID to shorten state +//TODO: expose ScanPhrase interface to achieve > ngram probs (e.g. unigram) with higher order lm - but that wouldn't apply to L->R maximal hook/sharing decoding #include "ff_lm.h" #include "ff_from_fsa.h" -- cgit v1.2.3