summary | refs | log | tree | commit | diff
path: root/klm/lm/read_arpa.cc
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2011-01-25 22:30:48 +0200
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-01-25 22:30:48 +0200
commit: c4ade3091b812ca135ae6520fa7173e1bbf28754 (patch)
tree: 2528af208f6dafd0c27dcbec0d2da291a9c93ca2 /klm/lm/read_arpa.cc
parent: d04c0ca2d9df0e147239b18e90650ca8bd51d594 (diff)
update kenlm
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r-- klm/lm/read_arpa.cc 17
1 files changed, 14 insertions, 3 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index 262a9c6a..d0fe67f0 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -1,5 +1,7 @@
#include "lm/read_arpa.hh"
+#include "lm/blank.hh"
+
#include <cstdlib>
#include <vector>
@@ -8,6 +10,9 @@
namespace lm {
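+// Lookup table indexed by byte value: 1 for '\t', '\n', and ' ', 0 for everything else. This is stricter than isspace, which also accepts '\v', '\f', and '\r'.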
+const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
namespace {
bool IsEntirelyWhiteSpace(const StringPiece &line) {
@@ -116,21 +121,27 @@ void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
case '\n':
break;
default:
- UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram");
+ UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
}
}
void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
+ // Reads the optional backoff field from `in` into weights.backoff. Always make zero negative.
+ // Negative zero means that no (n+1)-gram has this n-gram as context.
+ // Therefore the hypothesis state can be shorter. Of course, many n-grams
+ // are context for (n+1)-grams. An algorithm in the data structure will go
+ // back and set the backoff to positive zero in these cases.
switch (in.get()) {
case '\t':  // a tab introduces an explicit backoff value
weights.backoff = in.ReadFloat();
+ if (weights.backoff == ngram::kExtensionBackoff) weights.backoff = ngram::kNoExtensionBackoff;  // presumably +0.0 vs -0.0 (constants are declared outside this view) -- confirm
if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");  // the backoff must be immediately followed by a newline
break;
case '\n':  // the line ended without a backoff field
- weights.backoff = 0.0;
+ weights.backoff = ngram::kNoExtensionBackoff;  // missing backoff defaults to the "no extension" value
break;
default:  // any other character is rejected as malformed input
- UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram");
+ UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
}
}