summary | refs | log | tree | commit | diff
path: root/klm/lm/read_arpa.cc
diff options
context:
space:
mode:
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r-- klm/lm/read_arpa.cc | 83
1 file changed, 18 insertions, 65 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc
index d0fe67f0..0e90196d 100644
--- a/klm/lm/read_arpa.cc
+++ b/klm/lm/read_arpa.cc
@@ -6,6 +6,7 @@
#include <vector>
#include <ctype.h>
+#include <string.h>
#include <inttypes.h>
namespace lm {
@@ -22,14 +23,20 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) {
return true;
}
-template <class F> void GenericReadARPACounts(F &in, std::vector<uint64_t> &number) {
+const char kBinaryMagic[] = "mmap lm http://kheafield.com/code";
+
+} // namespace
+
+void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
number.clear();
StringPiece line;
if (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
- UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, run\nzcat " << in.FileName() << " |kenlm/build_binary /dev/stdin " << in.FileName() << ".binary\nIf this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
+ UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
}
- UTIL_THROW(FormatLoadException, "First line was \"" << static_cast<int>(line.data()[1]) << "\" not blank");
+ if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic)
+ UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?");
+ UTIL_THROW(FormatLoadException, "First line was \"" << line.data() << "\" not blank");
}
if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "second line was \"" << line << "\" not \\data\\.");
while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
@@ -49,66 +56,14 @@ template <class F> void GenericReadARPACounts(F &in, std::vector<uint64_t> &numb
}
}
-template <class F> void GenericReadNGramHeader(F &in, unsigned int length) {
- StringPiece line;
+void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
+ StringPiece line;
while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
std::stringstream expected;
expected << '\\' << length << "-grams:";
if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead");
}
-template <class F> void GenericReadEnd(F &in) {
- StringPiece line;
- do {
- line = in.ReadLine();
- } while (IsEntirelyWhiteSpace(line));
- if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);
-}
-
-class FakeFilePiece {
- public:
- explicit FakeFilePiece(std::istream &in) : in_(in) {
- in_.exceptions(std::ios::failbit | std::ios::badbit | std::ios::eofbit);
- }
-
- StringPiece ReadLine() throw(util::EndOfFileException) {
- getline(in_, buffer_);
- return StringPiece(buffer_);
- }
-
- float ReadFloat() {
- float ret;
- in_ >> ret;
- return ret;
- }
-
- const char *FileName() const {
- // This only used for error messages and we don't know the file name. . .
- return "$file";
- }
-
- private:
- std::istream &in_;
- std::string buffer_;
-};
-
-} // namespace
-
-void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
- GenericReadARPACounts(in, number);
-}
-void ReadARPACounts(std::istream &in, std::vector<uint64_t> &number) {
- FakeFilePiece fake(in);
- GenericReadARPACounts(fake, number);
-}
-void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
- GenericReadNGramHeader(in, length);
-}
-void ReadNGramHeader(std::istream &in, unsigned int length) {
- FakeFilePiece fake(in);
- GenericReadNGramHeader(fake, length);
-}
-
void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
switch (in.get()) {
case '\t':
@@ -146,20 +101,18 @@ void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
}
void ReadEnd(util::FilePiece &in) {
- GenericReadEnd(in);
StringPiece line;
+ do {
+ line = in.ReadLine();
+ } while (IsEntirelyWhiteSpace(line));
+ if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);
+
try {
while (true) {
line = in.ReadLine();
if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line);
}
- } catch (const util::EndOfFileException &e) {
- return;
- }
-}
-void ReadEnd(std::istream &in) {
- FakeFilePiece fake(in);
- GenericReadEnd(fake);
+ } catch (const util::EndOfFileException &e) {}
}
} // namespace lm