-rw-r--r--   decoder/ff_lm.cc    | 35 ++++++++++++++++++++++++++++++++---
-rwxr-xr-x   decoder/ff_lm_fsa.h |  3 ++-
2 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index a5f43867..e9c172da 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,6 @@
+#define LM_FSA_SHORTEN_CONTEXT 1
+// seems to work great - just not sure if it actually speeds anything up
+
 namespace {
 char const* usage_name="LanguageModel";
 char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
@@ -235,6 +238,21 @@ class LanguageModelImpl {
     return ngram_.wordProb(word, (VocabIndex*)context);
   }
 
+  // may be shorter than actual null-terminated length.  context must be null terminated.  len is just to save effort for subclasses that don't support contextID
+  virtual int ContextSize(WordID const* context,int len) {
+    unsigned ret;
+    ngram_.contextID((VocabIndex*)context,ret);
+    return ret;
+  }
+
+  void ShortenContext(WordID * context,int len) {
+    int slen=ContextSize(context,len);
+    while (len>slen) {
+      --len;
+      context[len]=TD::none;
+    }
+  }
+
   /// NOT a negative logp, i.e. should be worse prob = more negative.  that's what SRI wordProb returns, fortunately.
   inline double clamp(double logp) const {
     return logp < floor_ ? floor_ : logp;
@@ -448,7 +466,9 @@ struct ClientLMI : public LanguageModelImpl
   virtual double WordProb(int word, WordID const* context) {
     return client_.wordProb(word, context);
   }
-
+  virtual int ContextSize(WordID const* const, int len) {
+    return len;
+  }
 protected:
   LMClient client_;
 };
@@ -460,6 +480,11 @@ struct ReuseLMI : public LanguageModelImpl
   double WordProb(int word, WordID const* context) {
     return ng->wordProb(word, (VocabIndex*)context);
   }
+  virtual int ContextSize(WordID const* context, int len) {
+    unsigned ret;
+    ng->contextID((VocabIndex*)context,ret);
+    return ret;
+  }
 protected:
   Ngram *ng;
 };
@@ -553,10 +578,11 @@ void LanguageModelFsa::set_ngram_order(int i) {
   ngram_order_=i;
   ctxlen_=i-1;
   set_state_bytes(ctxlen_*sizeof(WordID));
-  set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - bu WordID *ss=(WordID*)start.begin();
+  WordID *ss=(WordID*)start.begin();
   WordID *hs=(WordID*)h_start.begin();
-t for compat. with non-fsa version, leave it
   if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+    set_end_phrase(TD::se);
+// se is pretty boring in unigram case, just adds constant prob.  check that this is what we want
     ss[0]=TD::ss; // start-sentence context (length 1)
     hs[0]=TD::none; // empty context
     for (int i=1;i<ctxlen_;++i) {
@@ -589,6 +615,9 @@ void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph
     WordID *nst=(WordID *)new_st;
     nst[0]=w; // new most recent word
     to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+#if LM_FSA_SHORTEN_CONTEXT
+    pimpl_->ShortenContext(nst,ctxlen_);
+#endif
   } else {
     p=pimpl_->WordProb(w,&empty_context);
   }
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 6a4e8201..55a8b497 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,7 +1,8 @@
 #ifndef FF_LM_FSA_H
 #define FF_LM_FSA_H
 
-//TODO: use SRI LM::contextBOW, LM::contextID to shorten state
+//TODO: use SRI LM::contextID to shorten state
+//TODO: expose ScanPhrase interface to achieve > ngram probs (e.g. unigram) with higher order lm - but that wouldn't apply to L->R maximal hook/sharing decoding
 
 #include "ff_lm.h"
 #include "ff_from_fsa.h"
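The idea behind LM_FSA_SHORTEN_CONTEXT: SRILM's Ngram::contextID(context, len) reports how many of the most-recent context words the model actually distinguishes; anything older cannot affect future word probabilities, so ShortenContext() overwrites it with TD::none. Right-padding the FSA state this way makes more hypotheses bytewise equal, so the decoder can recombine them. Below is a minimal, self-contained C++ sketch of that idea, not part of the commit; KnownContextSize() is a hypothetical stand-in for the SRILM contextID call.

#include <cassert>
#include <cstdio>

typedef int WordID;
static const WordID kNone = -1; // stands in for TD::none

// Hypothetical stand-in for SRILM's Ngram::contextID(): pretend the model
// never distinguishes more than the 2 most-recent context words.
static int KnownContextSize(const WordID* /*context*/, int len) {
  return len < 2 ? len : 2;
}

// Same loop as ShortenContext() in the diff: truncate the state in place,
// padding the now-unused tail with the sentinel value.
static void ShortenContext(WordID* context, int len) {
  int slen = KnownContextSize(context, len);
  while (len > slen) {
    --len;
    context[len] = kNone;
  }
}

int main() {
  // Two trigram-context states that differ only in the oldest word...
  WordID a[] = {7, 3, 9}; // most-recent word first, as in the FSA state
  WordID b[] = {7, 3, 4};
  ShortenContext(a, 3);
  ShortenContext(b, 3);
  // ...become bytewise equal after shortening, so the hypotheses that
  // carry them can recombine.
  for (int i = 0; i < 3; ++i) assert(a[i] == b[i]);
  std::printf("shortened state: %d %d %d\n", a[0], a[1], a[2]);
  return 0;
}

Note how the virtual ContextSize() fits the three backends in the diff: the owned-SRILM and ReuseLMI implementations ask contextID(), while ClientLMI (the network LM client) has no such query and conservatively returns len, which makes ShortenContext() a no-op there.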