From c4ade3091b812ca135ae6520fa7173e1bbf28754 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 25 Jan 2011 22:30:48 +0200 Subject: update kenlm --- klm/lm/blank.hh | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) (limited to 'klm/lm/blank.hh') diff --git a/klm/lm/blank.hh b/klm/lm/blank.hh index 639bc98b..4615a09e 100644 --- a/klm/lm/blank.hh +++ b/klm/lm/blank.hh @@ -1,12 +1,52 @@ #ifndef LM_BLANK__ #define LM_BLANK__ + #include +#include +#include + namespace lm { namespace ngram { -const float kBlankProb = -std::numeric_limits::quiet_NaN(); -const float kBlankBackoff = std::numeric_limits::infinity(); +/* Suppose "foo bar" appears with zero backoff but there is no trigram + * beginning with these words. Then, when scoring "foo bar", the model could + * return out_state containing "bar" or even null context if "bar" also has no + * backoff and is never followed by another word. Then the backoff is set to + * kNoExtensionBackoff. If the n-gram might be extended, then out_state must + * contain the full n-gram, in which case kExtensionBackoff is set. In any + * case, if an n-gram has non-zero backoff, the full state is returned so + * backoff can be properly charged. + * These differ only in sign bit because the backoff is in fact zero in either + * case. + */ +const float kNoExtensionBackoff = -0.0; +const float kExtensionBackoff = 0.0; + +inline void SetExtension(float &backoff) { + if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; +} + +// This compiles down nicely. +inline bool HasExtension(const float &backoff) { + typedef union { float f; uint32_t i; } UnionValue; + UnionValue compare, interpret; + compare.f = kNoExtensionBackoff; + interpret.f = backoff; + return compare.i != interpret.i; +} + +/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or + * "baz quux" (because they were pruned). 
1.2% of n-grams generated by SRI + * with default settings on the benchmark data set are like this. Since search + * proceeds by finding "quux", "baz quux", "bar baz quux", and finally + * "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are + * inserted. The blanks have probability kBlankProb and backoff kBlankBackoff. + * A blank is recognized by kBlankProb in the probability field; kBlankBackoff + * must be 0 so that inference assesses zero backoff from these blanks. + */ +const float kBlankProb = -std::numeric_limits::infinity(); +const float kBlankBackoff = kNoExtensionBackoff; } // namespace ngram } // namespace lm -- cgit v1.2.3