diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-01-25 22:30:48 +0200 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-01-25 22:30:48 +0200 |
commit | c4ade3091b812ca135ae6520fa7173e1bbf28754 (patch) | |
tree | 2528af208f6dafd0c27dcbec0d2da291a9c93ca2 /klm/lm/read_arpa.cc | |
parent | d04c0ca2d9df0e147239b18e90650ca8bd51d594 (diff) |
update kenlm
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r-- | klm/lm/read_arpa.cc | 17 |
1 files changed, 14 insertions, 3 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index 262a9c6a..d0fe67f0 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -1,5 +1,7 @@ #include "lm/read_arpa.hh" +#include "lm/blank.hh" + #include <cstdlib> #include <vector> @@ -8,6 +10,9 @@ namespace lm { +// 1 for '\t', '\n', and ' '. This is stricter than isspace. +const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + namespace { bool IsEntirelyWhiteSpace(const StringPiece &line) { @@ -116,21 +121,27 @@ void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { case '\n': break; default: - UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); + UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff"); } } void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) { + // Always make zero negative. + // Negative zero means that no (n+1)-gram has this n-gram as context. + // Therefore the hypothesis state can be shorter. Of course, many n-grams + // are context for (n+1)-grams. An algorithm in the data structure will go + // back and set the backoff to positive zero in these cases. switch (in.get()) { case '\t': weights.backoff = in.ReadFloat(); + if (weights.backoff == ngram::kExtensionBackoff) weights.backoff = ngram::kNoExtensionBackoff; if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); break; case '\n': - weights.backoff = 0.0; + weights.backoff = ngram::kNoExtensionBackoff; break; default: - UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); + UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff"); } } |