diff options
author | Paul Baltescu <pauldb89@gmail.com> | 2013-02-21 14:13:55 +0000 |
---|---|---|
committer | Paul Baltescu <pauldb89@gmail.com> | 2013-02-21 14:13:55 +0000 |
commit | bca26d953a774b8efca12f30407390b3f5eef9d0 (patch) | |
tree | fe922de5c89b1844f677d550dcc24e87edd67a55 /klm/lm/read_arpa.cc | |
parent | 54a1c0e2bde259e3acc9c0a8ec8da3c7704e80ca (diff) | |
parent | 95c364f2cb002241c4a62bedb1c5ef6f1e9a7f22 (diff) |
Merge branch 'master' of https://github.com/pauldb89/cdec
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r-- | klm/lm/read_arpa.cc | 11 |
1 files changed, 9 insertions, 2 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index b709fef9..9ea08798 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -1,6 +1,7 @@ #include "lm/read_arpa.hh" #include "lm/blank.hh" +#include "util/file.hh" #include <cmath> #include <cstdlib> @@ -45,8 +46,14 @@ uint64_t ReadCount(const std::string &from) { void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { number.clear(); - StringPiece line; - while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} + StringPiece line = in.ReadLine(); + // In general, ARPA files can have arbitrary text before "\data\" + // But in KenLM, we require such lines to start with "#", so that + // we can do stricter error checking + while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) { + line = in.ReadLine(); + } + if (line != "\\data\\") { if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) { UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); |