author     graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>   2010-07-26 05:07:07 +0000
committer  graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>   2010-07-26 05:07:07 +0000
commit     556354b6e8189d1964ea4477929753d0045485ae (patch)
tree       82a22994b56832ae00beb1443ec35f3dce9f2bf5 /decoder/ff_lm.cc
parent     5e786c1b9e465e88763b44db10f3b42f41c2a990 (diff)
shorten fsa lm state using contextID
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@415 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/ff_lm.cc')
-rw-r--r--  decoder/ff_lm.cc  35
1 file changed, 32 insertions(+), 3 deletions(-)
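
The idea behind the patch: an n-gram LM usually conditions on fewer than the full ctxlen_ = order-1 context words, and SRILM's contextID reports how many of them actually matter. Blanking the useless trailing words canonicalizes the FSA state, so hypotheses the LM cannot distinguish hash and compare equal and can recombine. A minimal standalone sketch of that effect (NONE_WORD and the always-one ContextSize stub are invented for illustration; the real code asks SRILM):

    #include <iostream>

    typedef int WordID;
    static const WordID NONE_WORD = -1; // stand-in for TD::none

    // Stub for illustration: pretend the LM only ever distinguishes the
    // most recent context word (the patch asks SRILM's contextID instead).
    static int ContextSize(const WordID* /*context*/, int /*len*/) { return 1; }

    // Same logic as the patch's ShortenContext: erase context words beyond
    // what the LM distinguishes, so equivalent states become bitwise equal.
    static void ShortenContext(WordID* context, int len) {
      int slen = ContextSize(context, len);
      while (len > slen) context[--len] = NONE_WORD;
    }

    int main() {
      WordID a[2] = {7, 3}; // context "cat the" (most recent word first)
      WordID b[2] = {7, 9}; // context "cat dog"
      ShortenContext(a, 2);
      ShortenContext(b, 2);
      // both states are now {7, NONE_WORD}; the decoder can recombine them
      std::cout << a[1] << " " << b[1] << "\n";
      return 0;
    }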
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index a5f43867..e9c172da 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,6 @@
+#define LM_FSA_SHORTEN_CONTEXT 1
+// seems to work great - just not sure if it actually speeds anything up
+
namespace {
char const* usage_name="LanguageModel";
char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
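
A note on the gate: LM_FSA_SHORTEN_CONTEXT is hard-coded to 1 at the top of the file, so the open speed question in the comment can only be tested by editing the source. A hedged sketch of how such a gate could instead be left overridable from the build line (the compile invocation is hypothetical, not part of the patch or the build system):

    // sketch: default on, but let the compiler command line override it
    #ifndef LM_FSA_SHORTEN_CONTEXT
    #define LM_FSA_SHORTEN_CONTEXT 1
    #endif

    // hypothetical A/B test of the "does it speed anything up" question:
    //   g++ ... -DLM_FSA_SHORTEN_CONTEXT=0 -c decoder/ff_lm.cc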
@@ -235,6 +238,21 @@ class LanguageModelImpl {
return ngram_.wordProb(word, (VocabIndex*)context);
}
+ // Returned length may be shorter than the actual null-terminated length. context must be null-terminated; len just saves effort for subclasses that don't support contextID.
+ virtual int ContextSize(WordID const* context,int len) {
+ unsigned ret;
+ ngram_.contextID((VocabIndex*)context,ret);
+ return ret;
+ }
+
+ void ShortenContext(WordID * context,int len) {
+ int slen=ContextSize(context,len);
+ while (len>slen) {
+ --len;
+ context[len]=TD::none;
+ }
+ }
+
/// NOT a negative logp, i.e. worse prob = more negative; that's what SRI wordProb returns, fortunately.
inline double clamp(double logp) const {
return logp < floor_ ? floor_ : logp;
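
ContextSize above wraps SRILM's Ngram::contextID(const VocabIndex*, unsigned&), which returns an opaque context-node pointer and writes back how many leading words of the most-recent-first, null-terminated context the model actually conditions on. A hedged sketch of that call pattern against the SRILM API (the helper name is invented; Ngram.h is SRILM's header):

    #include <Ngram.h> // SRILM

    // Ask SRILM how many leading words of a null-terminated,
    // most-recent-first context the model actually uses.
    static unsigned UsedContextWords(Ngram &ngram, const VocabIndex *context) {
      unsigned used = 0;
      ngram.contextID(context, used); // opaque node-pointer return ignored
      return used;
    }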
@@ -448,7 +466,9 @@ struct ClientLMI : public LanguageModelImpl
virtual double WordProb(int word, WordID const* context) {
return client_.wordProb(word, context);
}
-
+ virtual int ContextSize(WordID const* const, int len) {
+ return len;
+ }
protected:
LMClient client_;
};
@@ -460,6 +480,11 @@ struct ReuseLMI : public LanguageModelImpl
double WordProb(int word, WordID const* context) {
return ng->wordProb(word, (VocabIndex*)context);
}
+ virtual int ContextSize(WordID const* context, int len) {
+ unsigned ret;
+ ng->contextID((VocabIndex*)context,ret);
+ return ret;
+ }
protected:
Ngram *ng;
};
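
The two subclasses pick opposite answers: ClientLMI talks to a remote LM server whose protocol only exposes wordProb, so its ContextSize claims the whole context is needed, which makes ShortenContext's loop a no-op; ReuseLMI holds an in-process SRILM Ngram and can delegate to contextID. A sketch of that safe-default pattern (types and names invented):

    typedef int WordID;

    // Base contract as in the patch: report how many context words matter.
    struct LMBase {
      virtual int ContextSize(const WordID* context, int len) = 0;
      virtual ~LMBase() {}
    };

    // Backend with no contextID support: claiming all len words are needed
    // disables shortening and leaves behavior exactly as before the patch.
    struct RemoteLM : public LMBase {
      virtual int ContextSize(const WordID*, int len) { return len; }
    };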
@@ -553,10 +578,11 @@ void LanguageModelFsa::set_ngram_order(int i) {
ngram_order_=i;
ctxlen_=i-1;
set_state_bytes(ctxlen_*sizeof(WordID));
- set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - but for compat. with non-fsa version, leave it
WordID *ss=(WordID*)start.begin();
WordID *hs=(WordID*)h_start.begin();
if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+ set_end_phrase(TD::se);
+// se is pretty boring in unigram case, just adds constant prob. check that this is what we want
ss[0]=TD::ss; // start-sentence context (length 1)
hs[0]=TD::none; // empty context
for (int i=1;i<ctxlen_;++i) {
@@ -589,6 +615,9 @@ void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph
WordID *nst=(WordID *)new_st;
nst[0]=w; // new most recent word
to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+#if LM_FSA_SHORTEN_CONTEXT
+ pimpl_->ShortenContext(nst,ctxlen_);
+#endif
} else {
p=pimpl_->WordProb(w,&empty_context);
}
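
Shortening happens exactly where the successor state is built in Scan: the new word is shifted in at position 0, the old context rotates right by one, and the state is canonicalized before anything downstream hashes or compares it (the unigram branch carries zero-length state, so there is nothing to shorten). A standalone sketch of that shift-then-shorten update (names invented; ShortenContext as in the sketch near the top of this page):

    #include <cstring>

    typedef int WordID;
    static const WordID NONE_WORD = -1; // stand-in for TD::none

    static int ContextSize(const WordID*, int) { return 1; } // illustration stub

    static void ShortenContext(WordID* context, int len) {
      int slen = ContextSize(context, len);
      while (len > slen) context[--len] = NONE_WORD;
    }

    // Successor-state construction as in Scan: newest word first, old
    // context shifted right by one, then canonicalized via shortening.
    // (Assumes nst and old_ctx are distinct buffers, as in the real code.)
    static void NextState(WordID* nst, const WordID* old_ctx, WordID w, int ctxlen) {
      nst[0] = w;
      std::memcpy(nst + 1, old_ctx, (ctxlen - 1) * sizeof(WordID));
      ShortenContext(nst, ctxlen);
    }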