summaryrefslogtreecommitdiff
path: root/klm/lm/blank.hh
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2011-01-25 22:30:48 +0200
committerChris Dyer <cdyer@cs.cmu.edu>2011-01-25 22:30:48 +0200
commitc4ade3091b812ca135ae6520fa7173e1bbf28754 (patch)
tree2528af208f6dafd0c27dcbec0d2da291a9c93ca2 /klm/lm/blank.hh
parentd04c0ca2d9df0e147239b18e90650ca8bd51d594 (diff)
update kenlm
Diffstat (limited to 'klm/lm/blank.hh')
-rw-r--r--klm/lm/blank.hh44
1 files changed, 42 insertions, 2 deletions
diff --git a/klm/lm/blank.hh b/klm/lm/blank.hh
index 639bc98b..4615a09e 100644
--- a/klm/lm/blank.hh
+++ b/klm/lm/blank.hh
@@ -1,12 +1,52 @@
#ifndef LM_BLANK__
#define LM_BLANK__
+
#include <limits>
+#include <inttypes.h>
+#include <math.h>
+
namespace lm {
namespace ngram {
-const float kBlankProb = -std::numeric_limits<float>::quiet_NaN();
-const float kBlankBackoff = std::numeric_limits<float>::infinity();
+/* Suppose "foo bar" appears with zero backoff but there is no trigram
+ * beginning with these words. Then, when scoring "foo bar", the model could
+ * return out_state containing "bar" or even null context if "bar" also has no
+ * backoff and is never followed by another word. Then the backoff is set to
+ * kNoExtensionBackoff. If the n-gram might be extended, then out_state must
+ * contain the full n-gram, in which case kExtensionBackoff is set. In any
+ * case, if an n-gram has non-zero backoff, the full state is returned so
+ * backoff can be properly charged.
+ * These differ only in sign bit because the backoff is in fact zero in either
+ * case.
+ */
+const float kNoExtensionBackoff = -0.0;  // negative zero: zero backoff, n-gram has no extension
+const float kExtensionBackoff = 0.0;  // positive zero: zero backoff, n-gram might be extended
+
+inline void SetExtension(float &backoff) {  // Flags a zero backoff as "might be extended"; non-zero values are left alone.
+ if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;  // float == treats -0.0 and 0.0 as equal, so either zero becomes +0.0
+}
+
+// Returns true unless backoff is bit-for-bit equal to kNoExtensionBackoff (-0.0). This compiles down nicely.
+inline bool HasExtension(const float &backoff) {
+ typedef union { float f; uint32_t i; } UnionValue;  // lets a float's bits be read as an integer; NOTE(review): assumes float is 32 bits wide - confirm
+ UnionValue compare, interpret;
+ compare.f = kNoExtensionBackoff;
+ interpret.f = backoff;
+ return compare.i != interpret.i;  // integer compare tells -0.0 from 0.0, which float == cannot; NOTE(review): reading the other union member relies on compiler support for type punning - confirm
+}
+
+/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
+ * "baz quux" (because they were pruned). 1.2% of n-grams generated by SRI
+ * with default settings on the benchmark data set are like this. Since search
+ * proceeds by finding "quux", "baz quux", "bar baz quux", and finally
+ * "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
+ * inserted. The blanks have probability kBlankProb and backoff kBlankBackoff.
+ * A blank is recognized by kBlankProb in the probability field; kBlankBackoff
+ * must be 0 so that inference assesses zero backoff from these blanks.
+ */
+const float kBlankProb = -std::numeric_limits<float>::infinity();  // sentinel stored in the probability field to mark a blank
+const float kBlankBackoff = kNoExtensionBackoff;  // a zero (-0.0), so a blank charges no backoff
} // namespace ngram
} // namespace lm