diff options
author | Kenneth Heafield <github@kheafield.com> | 2014-01-27 17:42:19 -0800 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2014-01-27 17:42:19 -0800 |
commit | 783c57b2d3312738ddcf992ac55ff750afe7cb47 (patch) | |
tree | c4811dab0d916836b8631f3c7df94f284a490b9b /klm/lm/read_arpa.cc | |
parent | f7e051a05d65ef25c2ada0b84cd82bfb375ef265 (diff) |
KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r-- | klm/lm/read_arpa.cc | 14 |
1 files changed, 7 insertions, 7 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index 9ea08798..fb8bbfa2 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -19,7 +19,7 @@ namespace lm { -// 1 for '\t', '\n', and ' '. This is stricter than isspace. +// 1 for '\t', '\n', and ' '. This is stricter than isspace. const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; namespace { @@ -50,7 +50,7 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { // In general, ARPA files can have arbitrary text before "\data\" // But in KenLM, we require such lines to start with "#", so that // we can do stricter error checking - while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) { + while (IsEntirelyWhiteSpace(line) || starts_with(line, "#")) { line = in.ReadLine(); } @@ -58,7 +58,7 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) { UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); } - if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic) + if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic) UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?"); UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?"); UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst."); @@ -66,7 +66,7 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { } while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \""); - // So strtol doesn't go off the end of line. + // So strtol doesn't go off the end of line. std::string remaining(line.data() + 6, line.size() - 6); char *end_ptr; unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10); @@ -102,8 +102,8 @@ void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { } void ReadBackoff(util::FilePiece &in, float &backoff) { - // Always make zero negative. - // Negative zero means that no (n+1)-gram has this n-gram as context. + // Always make zero negative. + // Negative zero means that no (n+1)-gram has this n-gram as context. // Therefore the hypothesis state can be shorter. Of course, many n-grams // are context for (n+1)-grams. An algorithm in the data structure will go // back and set the backoff to positive zero in these cases. @@ -150,7 +150,7 @@ void PositiveProbWarn::Warn(float prob) { case THROW_UP: UTIL_THROW(FormatLoadException, "Positive log probability " << prob << " in the model. This is a bug in IRSTLM; you can set config.positive_log_probability = SILENT or pass -i to build_binary to substitute 0.0 for the log probability. Error"); case COMPLAIN: - std::cerr << "There's a positive log probability " << prob << " in the APRA file, probably because of a bug in IRSTLM. This and subsequent entires will be mapepd to 0 log probability." << std::endl; + std::cerr << "There's a positive log probability " << prob << " in the APRA file, probably because of a bug in IRSTLM. This and subsequent entires will be mapped to 0 log probability." << std::endl; action_ = SILENT; break; case SILENT: |