author     graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-28 07:10:09 +0000
committer  graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-28 07:10:09 +0000
commit     202295eec8656a79115072d113afeb82ed660d78 (patch)
tree       fb5da2976ec0c44b54ba7aa1fa1850520cd15bb3 /decoder/ff_lm_fsa.h
parent     6912768e34f12b615355d32b6976a56dca58a398 (diff)
debugging print - still no idea on .05% difference scoring 3gram using phrases
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@446 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/ff_lm_fsa.h')
-rwxr-xr-x  decoder/ff_lm_fsa.h | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 108698ec..9ba7b2c5 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,9 +1,10 @@
#ifndef FF_LM_FSA_H
#define FF_LM_FSA_H
-//FIXME: when FSA_LM_PHRASE 1, 3gram has differences in 4th decimal digit, compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
+//FIXME: when FSA_LM_PHRASE 1, 3gram fsa has differences, especially with unk words, in about the 4th decimal digit (about .05%), compared to regular ff_lm. this is USUALLY a bug (there's way more actual precision in there). this was with #define LM_FSA_SHORTEN_CONTEXT 1 and 0 (so it's not that). also, LM_FSA_SHORTEN_CONTEXT gives identical scores with FSA_LM_PHRASE 0
-#define FSA_LM_PHRASE 0
+// enabling for now - retest unigram+ more, solve above puzzle
+#define FSA_LM_PHRASE 1
#define FSA_LM_DEBUG 0
#if FSA_LM_DEBUG
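
(Note on the FIXME above: a fourth-decimal-digit / ~.05% gap is orders of magnitude too large to be floating-point accumulation noise, which is what makes it "USUALLY a bug". A standalone check with made-up log10 probabilities, not the decoder's API, shows that merely reordering a sum of doubles perturbs it only near the 16th significant digit:

    #include <cstdio>
    int main() {
      // hypothetical per-word log10 probabilities for a 5-word span
      double p[] = {-1.23456, -2.34567, -0.98765, -3.14159, -1.11111};
      double lr = 0, rl = 0;
      for (int i = 0; i < 5; ++i)  lr += p[i];   // word-by-word (ScanAccum order)
      for (int i = 4; i >= 0; --i) rl += p[i];   // right-to-left (phrase-scan order)
      std::printf("l->r=%.17g r->l=%.17g diff=%g\n", lr, rl, lr - rl);
      // diff comes out ~1e-16 relative at worst -- nowhere near ~5e-4, so the
      // ScanAccum/ScanPhraseAccum disagreement must come from different
      // contexts or words being scored, not from summation order.
      return 0;
    }
)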
@@ -42,7 +43,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
}
template <class Accum>
- void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& /* edge */,WordID w,void const* old_st,void *new_st,Accum *a) const {
+ void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const {
+ Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
if (!ctxlen_) {
Add(floored(pimpl_->WordProb(w,&empty_context)),a);
return;
@@ -53,6 +55,8 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
state_copy(ctx,old_st);
ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies.
Featval p=floored(pimpl_->WordProb(w,ctx));
+ FSALMDBG(de,"p("<<TD::Convert(w)<<"|"<<TD::Convert(ctx,ctx+ctxlen_)<<")="<<p);
+ FSALMDBGnl(de);
// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
WordID *nst=(WordID *)new_st;
nst[0]=w; // new most recent word
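
(For readers of the hunk above: an FSA LM state is the last ctxlen_ words stored most-recent-first (SRI context order), so scanning a word means scoring it against the old state and shifting it in at slot 0. A minimal sketch of that update, with a stubbed WordProb and a hypothetical ScanOneWord standing in for the real ScanAccum:

    #include <algorithm>
    typedef int WordID;                 // TD word ids in the real code
    const WordID kNone = -1;            // stand-in for TD::none

    // stub for pimpl_->WordProb(w, ctx): log10 p(w | ctx), ctx most-recent-first
    double WordProb(WordID, WordID const*) { return -2.0; }

    // score w given old_st, write the successor state into new_st
    double ScanOneWord(WordID w, WordID const* old_st, WordID* new_st, int ctxlen) {
      WordID ctx[8];                                // assumes ctxlen < 8
      std::copy(old_st, old_st + ctxlen, ctx);
      ctx[ctxlen] = kNone;                          // terminate, as in the code above
      double p = WordProb(w, ctx);                  // the real code also floors at floor_
      new_st[0] = w;                                // w is now the most recent word
      std::copy(ctx, ctx + ctxlen - 1, new_st + 1); // older words shift; oldest drops off
      return p;
    }
)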
@@ -68,6 +72,7 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
//FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what LM has?)
template <class Accum>
void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge&edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
+ Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
/* // this is forcing unigram prob always. we will instead build the phrase
if (!ctxlen_) {
@@ -85,27 +90,29 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
ctx[nboth]=TD::none;
// reverse order - state at very end of context, then [i,end) in rev order ending at ctx[0]
W ctx_score_end=wordcpy_reverse(ctx,begin,end);
- assert(ctx_score_end==ctx+nw);
wordcpy(ctx_score_end,st,st_end); // st already reversed.
+ assert(ctx_score_end==ctx+nw);
// we could just copy the filled state words, but it probably doesn't save much time (and might cost some to scan to find the nones). most contexts are full except for the shortest source spans.
-// FSALMDBG(edge," Scan("<<TD::GetString(ctx,ctx+nboth)<<')');
+ FSALMDBG(de," scan.r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
+ FSAFFDBG(de," r->l("<<TD::GetString(ctx,ctx_score_end)<<"|"<<TD::GetString(ctx_score_end,ctx+nboth)<<')');
Featval p=0;
FSALMDBGnl(edge);
for(;ctx_score_end>ctx;--ctx_score_end)
p+=floored(pimpl_->WordProb(ctx_score_end[-1],ctx_score_end));
//TODO: look for score discrepancy -
- // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same 4th digit disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that. perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
+ // i had some idea that maybe shortencontext would return a different prob if the length provided was > ctxlen_; however, since the same disagreement happens with LM_FSA_SHORTEN_CONTEXT 0 anyway, it's not that. perhaps look to SCAN_PHRASE_ACCUM_OVERRIDE - make sure they do the right thing.
#if LM_FSA_SHORTEN_CONTEXT
p+=pimpl_->ShortenContext(ctx,nboth<ctxlen_?nboth:ctxlen_);
#endif
state_copy(new_st,ctx);
- FSALMDBG(edge," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
+ FSALMDBG(de," lm.Scan("<<TD::GetString(begin,end)<<"|"<<describe_state(old_st)<<")"<<"="<<p<<","<<describe_state(new_st));
FSALMDBGnl(edge);
Add(p,a);
}
SCAN_PHRASE_ACCUM_OVERRIDE
#endif
+
// impl details:
void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor
double floor_; // log10prob minimum used (e.g. unk words)
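
(A note on ScanPhraseAccum above: it lays out one buffer holding the reversed phrase followed by the already-reversed incoming state, then scores each phrase word right-to-left against everything older than it; the LM_FSA_SHORTEN_CONTEXT branch afterwards folds back-off weights into p when the surviving context is longer than the model needs. A self-contained sketch of just the buffer layout and scoring loop, where the WordProb stub and ScanPhrase are illustrative, not the decoder's API:

    #include <algorithm>
    typedef int WordID;
    const WordID kNone = -1;

    double WordProb(WordID, WordID const*) { return -2.0; }  // stub: log10 p(w | ctx)

    // score words [begin,end) given reversed old state st of ctxlen words
    double ScanPhrase(WordID const* begin, WordID const* end,
                      WordID const* st, int ctxlen) {
      int nw = end - begin;
      WordID buf[16];                         // assumes nw + ctxlen < 16
      std::reverse_copy(begin, end, buf);     // buf[0] = last phrase word (newest)
      std::copy(st, st + ctxlen, buf + nw);   // old state, already reversed
      buf[nw + ctxlen] = kNone;
      double p = 0;
      for (int i = nw; i > 0; --i)            // leftmost phrase word scored first
        p += WordProb(buf[i - 1], buf + i);   // p(word | all words older than it)
      return p;  // the successor state is the first ctxlen entries of buf
    }
)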