| author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-26 05:07:07 +0000 |
|---|---|---|
| committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-26 05:07:07 +0000 |
| commit | 7520e730d47155bb05e1fcf2d69f1ed45b96e876 (patch) | |
| tree | 54c091ab5af69ebcf136352df639321cd9ebbdc6 /decoder | |
| parent | f4b4aade473f9463dda6fac4baf9c0502d004deb (diff) | |
shorten fsa lm state using contextID
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@415 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
| -rw-r--r-- | decoder/ff_lm.cc | 35 |
| -rwxr-xr-x | decoder/ff_lm_fsa.h | 3 |
2 files changed, 34 insertions, 4 deletions
```diff
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index a5f43867..e9c172da 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,6 @@
+#define LM_FSA_SHORTEN_CONTEXT 1
+// seems to work great - just not sure if it actually speeds anything up
+
 namespace {
 char const* usage_name="LanguageModel";
 char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
@@ -235,6 +238,21 @@ class LanguageModelImpl {
     return ngram_.wordProb(word, (VocabIndex*)context);
   }
 
+  // may be shorter than actual null-terminated length. context must be null terminated. len is just to save effort for subclasses that don't support contextID
+  virtual int ContextSize(WordID const* context,int len) {
+    unsigned ret;
+    ngram_.contextID((VocabIndex*)context,ret);
+    return ret;
+  }
+
+  void ShortenContext(WordID * context,int len) {
+    int slen=ContextSize(context,len);
+    while (len>slen) {
+      --len;
+      context[len]=TD::none;
+    }
+  }
+
   /// NOT a negative logp, i.e. should be worse prob = more negative. that's what SRI wordProb returns, fortunately.
   inline double clamp(double logp) const {
     return logp < floor_ ? floor_ : logp;
@@ -448,7 +466,9 @@ struct ClientLMI : public LanguageModelImpl
   virtual double WordProb(int word, WordID const* context) {
     return client_.wordProb(word, context);
   }
-
+  virtual int ContextSize(WordID const* const, int len) {
+    return len;
+  }
 protected:
   LMClient client_;
 };
@@ -460,6 +480,11 @@ struct ReuseLMI : public LanguageModelImpl
   double WordProb(int word, WordID const* context) {
     return ng->wordProb(word, (VocabIndex*)context);
   }
+  virtual int ContextSize(WordID const* context, int len) {
+    unsigned ret;
+    ng->contextID((VocabIndex*)context,ret);
+    return ret;
+  }
 protected:
   Ngram *ng;
 };
@@ -553,10 +578,11 @@ void LanguageModelFsa::set_ngram_order(int i) {
   ngram_order_=i;
   ctxlen_=i-1;
   set_state_bytes(ctxlen_*sizeof(WordID));
-  set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - bu
   WordID *ss=(WordID*)start.begin();
   WordID *hs=(WordID*)h_start.begin();
-t for compat. with non-fsa version, leave it
   if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+    set_end_phrase(TD::se);
+// se is pretty boring in unigram case, just adds constant prob. check that this is what we want
     ss[0]=TD::ss; // start-sentence context (length 1)
     hs[0]=TD::none; // empty context
     for (int i=1;i<ctxlen_;++i) {
@@ -589,6 +615,9 @@ void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph
     WordID *nst=(WordID *)new_st;
     nst[0]=w; // new most recent word
     to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+#if LM_FSA_SHORTEN_CONTEXT
+    pimpl_->ShortenContext(nst,ctxlen_);
+#endif
   } else {
     p=pimpl_->WordProb(w,&empty_context);
   }
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 6a4e8201..55a8b497 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,7 +1,8 @@
 #ifndef FF_LM_FSA_H
 #define FF_LM_FSA_H
 
-//TODO: use SRI LM::contextBOW, LM::contextID to shorten state
+//TODO: use SRI LM::contextID to shorten state
+//TODO: expose ScanPhrase interface to achieve > ngram probs (e.g. unigram) with higher order lm - but that wouldn't apply to L->R maximal hook/sharing decoding
 
 #include "ff_lm.h"
 #include "ff_from_fsa.h"
```
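The heart of the patch is `ShortenContext`: SRILM's `Ngram::contextID` reports how many of the most-recent words the model actually conditions on (backoff can make the older words irrelevant), and the remaining slots of the fixed-width FSA state are overwritten with `TD::none`, so LM-equivalent states become bitwise equal and can be merged during decoding. `ClientLMI::ContextSize` returns `len` unchanged because the network LM client exposes no `contextID`, so shortening is a no-op there. A minimal standalone sketch of the idea, where `kNone` and `StubContextSize` are illustrative stand-ins for `TD::none` and `Ngram::contextID`, not the real cdec/SRILM APIs:

```cpp
#include <cstdio>

typedef int WordID;
const WordID kNone = -1; // stand-in for cdec's TD::none padding value

// Hypothetical stand-in for SRILM's Ngram::contextID: pretend the model
// backs off so far that only the most recent word matters.
int StubContextSize(WordID const* /*context*/, int /*len*/) { return 1; }

// Same shape as the ShortenContext added in the patch: overwrite every
// slot past the LM-relevant prefix with the padding value so that states
// the LM cannot distinguish become bitwise identical.
void ShortenContext(WordID* context, int len) {
  int slen = StubContextSize(context, len);
  while (len > slen) {
    --len;
    context[len] = kNone;
  }
}

int main() {
  WordID a[3] = {7, 12, 42};
  WordID b[3] = {7, 99, 13}; // differs only in the LM-irrelevant suffix
  ShortenContext(a, 3);
  ShortenContext(b, 3);
  // Both print "7 -1 -1": the two states now compare equal, so the
  // decoder can merge the hypergraph nodes that carry them.
  printf("%d %d %d\n", a[0], a[1], a[2]);
  printf("%d %d %d\n", b[0], b[1], b[2]);
  return 0;
}
```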