KenLM dffafbf with lmplz source (but not built)

author: Kenneth Heafield <github@kheafield.com> 2013-01-18 17:12:51 +0000
committer: Kenneth Heafield <github@kheafield.com> 2013-01-18 17:12:51 +0000
commit: d884099e0db8b4510847ec106b59ef7dca3c245b (patch)
tree: b45a3f17eb002e224a7b728e0f985a15e2503196 /klm/lm/read_arpa.cc
parent: bae5fe99037ae7e101953ad0df118127191c711c (diff)
1 files changed, 9 insertions, 2 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index b709fef9..9ea08798 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -1,6 +1,7 @@
 #include "lm/read_arpa.hh"
 
 #include "lm/blank.hh"
+#include "util/file.hh"
 
 #include <cmath>
 #include <cstdlib>
@@ -45,8 +46,14 @@ uint64_t ReadCount(const std::string &from) {
 
 void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
   number.clear();
-  StringPiece line;
-  while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
+  StringPiece line = in.ReadLine();
+  // In general, ARPA files can have arbitrary text before "\data\".
+  // KenLM, however, requires every such line to start with "#" (blank
+  // lines are skipped as well) so that stricter error checking is possible.
+  while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) {
+    line = in.ReadLine();
+  }
+
   if (line != "\\data\\") {
     if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
       UTIL_THROW(FormatLoadException, "Looks like a gzip file.  If this is an ARPA file, pipe " << in.FileName() << " through zcat.  If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
author	Kenneth Heafield <github@kheafield.com>	2013-01-18 17:12:51 +0000
committer	Kenneth Heafield <github@kheafield.com>	2013-01-18 17:12:51 +0000
commit	d884099e0db8b4510847ec106b59ef7dca3c245b (patch)
tree	b45a3f17eb002e224a7b728e0f985a15e2503196 /klm/lm/read_arpa.cc
parent	bae5fe99037ae7e101953ad0df118127191c711c (diff)