author     graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-26 05:07:07 +0000
committer  graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-26 05:07:07 +0000
commit     7520e730d47155bb05e1fcf2d69f1ed45b96e876 (patch)
tree       54c091ab5af69ebcf136352df639321cd9ebbdc6
parent     f4b4aade473f9463dda6fac4baf9c0502d004deb (diff)
shorten fsa lm state using contextID
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@415 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--  decoder/ff_lm.cc     | 35
-rwxr-xr-x  decoder/ff_lm_fsa.h  |  3
2 files changed, 34 insertions(+), 4 deletions(-)
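Background for the patch below (a sketch, not code from the commit): the FSA LM feature keys decoder states on the last n-1 words, but once the model has backed off, trailing context words no longer influence any future probability. SRILM's LM::contextID(const VocabIndex *context, unsigned &length) reports how many context words are actually used, so the unused tail can be zeroed. A minimal illustration of why that matters, with toy stand-ins for WordID and TD::none; used_len plays the role of what contextID would report:

```cpp
// Toy illustration: shortening makes backed-off states byte-identical,
// so the decoder can recombine hypotheses it previously kept apart.
#include <cstdio>
#include <cstring>
typedef int WordID;
const WordID NONE = 0; // stand-in for TD::none

int main() {
  // Trigram-LM states: 2 words each, most recent word first.
  WordID s1[2] = {7 /*cat*/, 12 /*green*/};
  WordID s2[2] = {7 /*cat*/, 33 /*the*/};
  int used_len = 1; // pretend contextID says only 1 word survives backoff
  WordID* states[] = {s1, s2};
  for (WordID* s : states)
    for (int i = used_len; i < 2; ++i) s[i] = NONE; // the shortening step
  printf("equal states: %d\n", memcmp(s1, s2, sizeof s1) == 0); // prints 1
}
```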
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index a5f43867..e9c172da 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,6 @@
+#define LM_FSA_SHORTEN_CONTEXT 1
+// seems to work great - just not sure if it actually speeds anything up
+
namespace {
char const* usage_name="LanguageModel";
char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
@@ -235,6 +238,21 @@ class LanguageModelImpl {
return ngram_.wordProb(word, (VocabIndex*)context);
}
+ // returned length may be shorter than the actual null-terminated length. context must be null-terminated; len just saves effort for subclasses that don't support contextID
+ virtual int ContextSize(WordID const* context,int len) {
+ unsigned ret;
+ ngram_.contextID((VocabIndex*)context,ret);
+ return ret;
+ }
+
+ void ShortenContext(WordID * context,int len) {
+ int slen=ContextSize(context,len);
+ while (len>slen) {
+ --len;
+ context[len]=TD::none;
+ }
+ }
+
/// NOT a negative logp, i.e. should be worse prob = more negative. that's what SRI wordProb returns, fortunately.
inline double clamp(double logp) const {
return logp < floor_ ? floor_ : logp;
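The two methods added above follow SRILM's out-parameter convention, Ngram::contextID(const VocabIndex *context, unsigned &length). A self-contained sketch of that call shape, with a hypothetical FakeNgram standing in for SRILM so it compiles on its own; note the context buffer stays null-terminated, as the comment in the patch requires:

```cpp
#include <cassert>
typedef unsigned VocabIndex;
typedef int WordID;
const WordID NONE = 0; // stand-in for TD::none (doubles as the terminator)

// Hypothetical stand-in for SRILM's Ngram: contextID sets `length` to the
// number of leading context words the model actually conditions on.
struct FakeNgram {
  void* contextID(const VocabIndex* context, unsigned& length) {
    length = context[0] ? 1 : 0; // pretend one word survives backoff
    return 0; // real SRILM returns an opaque context-node pointer
  }
};

int ContextSize(FakeNgram& ng, const WordID* context) {
  unsigned ret;
  ng.contextID((const VocabIndex*)context, ret);
  return ret;
}

void ShortenContext(FakeNgram& ng, WordID* context, int len) {
  int slen = ContextSize(ng, context);
  while (len > slen) context[--len] = NONE; // zero the ignored tail
}

int main() {
  FakeNgram ng;
  WordID ctx[4] = {7, 12, 9, NONE}; // most recent first, null-terminated
  ShortenContext(ng, ctx, 3);
  assert(ctx[0] == 7 && ctx[1] == NONE && ctx[2] == NONE);
  return 0;
}
```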
@@ -448,7 +466,9 @@ struct ClientLMI : public LanguageModelImpl
virtual double WordProb(int word, WordID const* context) {
return client_.wordProb(word, context);
}
-
+ virtual int ContextSize(WordID const* const, int len) {
+ return len;
+ }
protected:
LMClient client_;
};
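ClientLMI fronts a remote LM through LMClient, which only exposes wordProb, so there is no contextID to consult; returning len unchanged makes ShortenContext a no-op and preserves the old behavior for that backend. The pattern, sketched with hypothetical minimal types rather than the real class hierarchy:

```cpp
typedef int WordID;

// Backends that cannot ask the model override ContextSize to report the
// full length, which turns shortening into a safe no-op.
struct LMIBase {
  virtual ~LMIBase() {}
  virtual int ContextSize(WordID const* context, int len) = 0;
};

struct NoShortenLMI : LMIBase {
  int ContextSize(WordID const*, int len) { return len; } // keep everything
};

int main() {
  NoShortenLMI lmi;
  WordID ctx[3] = {7, 12, 9};
  return lmi.ContextSize(ctx, 3) == 3 ? 0 : 1; // full length: nothing zeroed
}
```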
@@ -460,6 +480,11 @@ struct ReuseLMI : public LanguageModelImpl
double WordProb(int word, WordID const* context) {
return ng->wordProb(word, (VocabIndex*)context);
}
+ virtual int ContextSize(WordID const* context, int len) {
+ unsigned ret;
+ ng->contextID((VocabIndex*)context,ret);
+ return ret;
+ }
protected:
Ngram *ng;
};
@@ -553,10 +578,11 @@ void LanguageModelFsa::set_ngram_order(int i) {
ngram_order_=i;
ctxlen_=i-1;
set_state_bytes(ctxlen_*sizeof(WordID));
- set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - but for compat. with non-fsa version, leave it
WordID *ss=(WordID*)start.begin();
WordID *hs=(WordID*)h_start.begin();
if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+ set_end_phrase(TD::se);
+// se is pretty boring in unigram case, just adds constant prob. check that this is what we want
ss[0]=TD::ss; // start-sentence context (length 1)
hs[0]=TD::none; // empty context
for (int i=1;i<ctxlen_;++i) {
@@ -589,6 +615,9 @@ void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph
WordID *nst=(WordID *)new_st;
nst[0]=w; // new most recent word
to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+#if LM_FSA_SHORTEN_CONTEXT
+ pimpl_->ShortenContext(nst,ctxlen_);
+#endif
} else {
p=pimpl_->WordProb(w,&empty_context);
}
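Putting the Scan hunk together: the new word is written to slot 0 of the successor state, the old context rotates right (the oldest word falls off), and only then is the state shortened. A compile-and-run sketch of that update, with used_len standing in for the contextID result:

```cpp
#include <cstdio>
typedef int WordID;
const WordID NONE = 0; // stand-in for TD::none

// Successor state: new word in front, previous context shifted right,
// then the model-ignored tail zeroed (the LM_FSA_SHORTEN_CONTEXT step).
void UpdateState(WordID* nst, const WordID* ctx, int ctxlen, WordID w,
                 int used_len /* what contextID would report */) {
  nst[0] = w;
  for (int i = 1; i < ctxlen; ++i) nst[i] = ctx[i - 1]; // rotate right
  for (int i = used_len; i < ctxlen; ++i) nst[i] = NONE; // shorten
}

int main() {
  WordID ctx[2] = {12, 9}; // old state, most recent word first
  WordID nst[2];
  UpdateState(nst, ctx, 2, /*w=*/7, /*used_len=*/1);
  printf("%d %d\n", nst[0], nst[1]); // 7 0: only the new word is kept
}
```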
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 6a4e8201..55a8b497 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,7 +1,8 @@
#ifndef FF_LM_FSA_H
#define FF_LM_FSA_H
-//TODO: use SRI LM::contextBOW, LM::contextID to shorten state
+//TODO: use SRI LM::contextID to shorten state
+//TODO: expose ScanPhrase interface to achieve > ngram probs (e.g. unigram) with higher order lm - but that wouldn't apply to L->R maximal hook/sharing decoding
#include "ff_lm.h"
#include "ff_from_fsa.h"