author     graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>   2010-07-26 05:07:07 +0000
committer  graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>   2010-07-26 05:07:07 +0000
commit     556354b6e8189d1964ea4477929753d0045485ae (patch)
tree       82a22994b56832ae00beb1443ec35f3dce9f2bf5 /decoder/ff_lm.cc
parent     5e786c1b9e465e88763b44db10f3b42f41c2a990 (diff)
shorten fsa lm state using contextID
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@415 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/ff_lm.cc')
-rw-r--r--  decoder/ff_lm.cc  35
1 file changed, 32 insertions(+), 3 deletions(-)
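
The idea behind the patch: an n-gram LM usually conditions on fewer than the full ctxlen_ = order-1 context words, and SRILM's contextID reports how many of them actually matter. Blanking the useless trailing words canonicalizes the FSA state, so hypotheses the LM cannot distinguish hash and compare equal and can recombine. A minimal standalone sketch of that effect (NONE_WORD and the always-one ContextSize stub are invented for illustration; the real code asks SRILM):

    #include <iostream>

    typedef int WordID;
    static const WordID NONE_WORD = -1; // stand-in for TD::none

    // Stub for illustration: pretend the LM only ever distinguishes the
    // most recent context word (the patch asks SRILM's contextID instead).
    static int ContextSize(const WordID* /*context*/, int /*len*/) { return 1; }

    // Same logic as the patch's ShortenContext: erase context words beyond
    // what the LM distinguishes, so equivalent states become bitwise equal.
    static void ShortenContext(WordID* context, int len) {
      int slen = ContextSize(context, len);
      while (len > slen) context[--len] = NONE_WORD;
    }

    int main() {
      WordID a[2] = {7, 3}; // context "cat the" (most recent word first)
      WordID b[2] = {7, 9}; // context "cat dog"
      ShortenContext(a, 2);
      ShortenContext(b, 2);
      // both states are now {7, NONE_WORD}; the decoder can recombine them
      std::cout << a[1] << " " << b[1] << "\n";
      return 0;
    }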
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index a5f43867..e9c172da 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,6 @@
+#define LM_FSA_SHORTEN_CONTEXT 1
+// seems to work great - just not sure if it actually speeds anything up
+
namespace {
char const* usage_name="LanguageModel";
char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
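
A note on the gate: LM_FSA_SHORTEN_CONTEXT is hard-coded to 1 at the top of the file, so the open speed question in the comment can only be tested by editing the source. A hedged sketch of how such a gate could instead be left overridable from the build line (the compile invocation is hypothetical, not part of the patch or the build system):

    // sketch: default on, but let the compiler command line override it
    #ifndef LM_FSA_SHORTEN_CONTEXT
    #define LM_FSA_SHORTEN_CONTEXT 1
    #endif

    // hypothetical A/B test of the "does it speed anything up" question:
    //   g++ ... -DLM_FSA_SHORTEN_CONTEXT=0 -c decoder/ff_lm.cc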
@@ -235,6 +238,21 @@ class LanguageModelImpl {
return ngram_.wordProb(word, (VocabIndex*)context);
}
+ // Returned length may be shorter than the actual null-terminated length. context must be null-terminated; len just saves effort for subclasses that don't support contextID.
+ virtual int ContextSize(WordID const* context,int len) {
+ unsigned ret;
+ ngram_.contextID((VocabIndex*)context,ret);
+ return ret;
+ }
+
+ void ShortenContext(WordID * context,int len) {
+ int slen=ContextSize(context,len);
+ while (len>slen) {
+ --len;
+ context[len]=TD::none;
+ }
+ }
+
/// NOT a negative logp, i.e. worse prob = more negative; that's what SRI wordProb returns, fortunately.
inline double clamp(double logp) const {
return logp < floor_ ? floor_ : logp;
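
ContextSize above wraps SRILM's Ngram::contextID(const VocabIndex*, unsigned&), which returns an opaque context-node pointer and writes back how many leading words of the most-recent-first, null-terminated context the model actually conditions on. A hedged sketch of that call pattern against the SRILM API (the helper name is invented; Ngram.h is SRILM's header):

    #include <Ngram.h> // SRILM

    // Ask SRILM how many leading words of a null-terminated,
    // most-recent-first context the model actually uses.
    static unsigned UsedContextWords(Ngram &ngram, const VocabIndex *context) {
      unsigned used = 0;
      ngram.contextID(context, used); // opaque node-pointer return ignored
      return used;
    }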
@@ -448,7 +466,9 @@ struct ClientLMI : public LanguageModelImpl
virtual double WordProb(int word, WordID const* context) {
return client_.wordProb(word, context);
}
-
+ virtual int ContextSize(WordID const* const, int len) {
+ return len;
+ }
protected:
LMClient client_;
};
@@ -460,6 +480,11 @@ struct ReuseLMI : public LanguageModelImpl
double WordProb(int word, WordID const* context) {
return ng->wordProb(word, (VocabIndex*)context);
}
+ virtual int ContextSize(WordID const* context, int len) {
+ unsigned ret;
+ ng->contextID((VocabIndex*)context,ret);
+ return ret;
+ }
protected:
Ngram *ng;
};
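
The two subclasses pick opposite answers: ClientLMI talks to a remote LM server whose protocol only exposes wordProb, so its ContextSize claims the whole context is needed, which makes ShortenContext's loop a no-op; ReuseLMI holds an in-process SRILM Ngram and can delegate to contextID. A sketch of that safe-default pattern (types and names invented):

    typedef int WordID;

    // Base contract as in the patch: report how many context words matter.
    struct LMBase {
      virtual int ContextSize(const WordID* context, int len) = 0;
      virtual ~LMBase() {}
    };

    // Backend with no contextID support: claiming all len words are needed
    // disables shortening and leaves behavior exactly as before the patch.
    struct RemoteLM : public LMBase {
      virtual int ContextSize(const WordID*, int len) { return len; }
    };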
@@ -553,10 +578,11 @@ void LanguageModelFsa::set_ngram_order(int i) {
ngram_order_=i;
ctxlen_=i-1;
set_state_bytes(ctxlen_*sizeof(WordID));
- set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - but for compat. with non-fsa version, leave it
WordID *ss=(WordID*)start.begin();
WordID *hs=(WordID*)h_start.begin();
if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+ set_end_phrase(TD::se);
+// se is pretty boring in unigram case, just adds constant prob. check that this is what we want
ss[0]=TD::ss; // start-sentence context (length 1)
hs[0]=TD::none; // empty context
for (int i=1;i<ctxlen_;++i) {
@@ -589,6 +615,9 @@ void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph
WordID *nst=(WordID *)new_st;
nst[0]=w; // new most recent word
to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+#if LM_FSA_SHORTEN_CONTEXT
+ pimpl_->ShortenContext(nst,ctxlen_);
+#endif
} else {
p=pimpl_->WordProb(w,&empty_context);
}
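
Shortening happens exactly where the successor state is built in Scan: the new word is shifted in at position 0, the old context rotates right by one, and the state is canonicalized before anything downstream hashes or compares it (the unigram branch carries zero-length state, so there is nothing to shorten). A standalone sketch of that shift-then-shorten update (names invented; ShortenContext as in the sketch near the top of this page):

    #include <cstring>

    typedef int WordID;
    static const WordID NONE_WORD = -1; // stand-in for TD::none

    static int ContextSize(const WordID*, int) { return 1; } // illustration stub

    static void ShortenContext(WordID* context, int len) {
      int slen = ContextSize(context, len);
      while (len > slen) context[--len] = NONE_WORD;
    }

    // Successor-state construction as in Scan: newest word first, old
    // context shifted right by one, then canonicalized via shortening.
    // (Assumes nst and old_ctx are distinct buffers, as in the real code.)
    static void NextState(WordID* nst, const WordID* old_ctx, WordID w, int ctxlen) {
      nst[0] = w;
      std::memcpy(nst + 1, old_ctx, (ctxlen - 1) * sizeof(WordID));
      ShortenContext(nst, ctxlen);
    }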