summaryrefslogtreecommitdiff
path: root/klm/lm/read_arpa.cc
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2013-01-18 17:12:51 +0000
committerKenneth Heafield <github@kheafield.com>2013-01-18 17:12:51 +0000
commitd884099e0db8b4510847ec106b59ef7dca3c245b (patch)
treeb45a3f17eb002e224a7b728e0f985a15e2503196 /klm/lm/read_arpa.cc
parentbae5fe99037ae7e101953ad0df118127191c711c (diff)
KenLM dffafbf with lmplz source (but not built)
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r--klm/lm/read_arpa.cc11
1 files changed, 9 insertions, 2 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index b709fef9..9ea08798 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -1,6 +1,7 @@
#include "lm/read_arpa.hh"
#include "lm/blank.hh"
+#include "util/file.hh"
#include <cmath>
#include <cstdlib>
@@ -45,8 +46,14 @@ uint64_t ReadCount(const std::string &from) {
void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
number.clear();
- StringPiece line;
- while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
+ StringPiece line = in.ReadLine();
+ // In general, ARPA files can have arbitrary text before "\data\"
+ // But in KenLM, we require such lines to start with "#", so that
+ // we can do stricter error checking
+ while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) {
+ line = in.ReadLine();
+ }
+
if (line != "\\data\\") {
if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");