diff options
| author | Avneesh Saluja <asaluja@gmail.com> | 2013-03-28 18:28:16 -0700 | 
|---|---|---|
| committer | Avneesh Saluja <asaluja@gmail.com> | 2013-03-28 18:28:16 -0700 | 
| commit | 3d8d656fa7911524e0e6885647173474524e0784 (patch) | |
| tree | 81b1ee2fcb67980376d03f0aa48e42e53abff222 /klm/lm/read_arpa.cc | |
| parent | be7f57fdd484e063775d7abf083b9fa4c403b610 (diff) | |
| parent | 96fedabebafe7a38a6d5928be8fff767e411d705 (diff) | |
fixed conflicts
Diffstat (limited to 'klm/lm/read_arpa.cc')
| -rw-r--r-- | klm/lm/read_arpa.cc | 33 | 
1 files changed, 23 insertions, 10 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index 70727e4c..9ea08798 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -1,13 +1,15 @@  #include "lm/read_arpa.hh"  #include "lm/blank.hh" +#include "util/file.hh" +#include <cmath>  #include <cstdlib>  #include <iostream> +#include <sstream>  #include <vector>  #include <ctype.h> -#include <math.h>  #include <string.h>  #include <stdint.h> @@ -31,12 +33,27 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) {  const char kBinaryMagic[] = "mmap lm http://kheafield.com/code"; +// strtoull isn't portable enough :-( +uint64_t ReadCount(const std::string &from) { +  std::stringstream stream(from); +  uint64_t ret; +  stream >> ret; +  UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from); +  return ret; +} +  } // namespace  void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {    number.clear(); -  StringPiece line; -  while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} +  StringPiece line = in.ReadLine(); +  // In general, ARPA files can have arbitrary text before "\data\" +  // But in KenLM, we require such lines to start with "#", so that +  // we can do stricter error checking +  while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) { +    line = in.ReadLine(); +  } +    if (line != "\\data\\") {      if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {        UTIL_THROW(FormatLoadException, "Looks like a gzip file.  If this is an ARPA file, pipe " << in.FileName() << " through zcat.  If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); @@ -52,15 +69,11 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {      // So strtol doesn't go off the end of line.        std::string remaining(line.data() + 6, line.size() - 6);      char *end_ptr; -    unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10); +    unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10);      if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line);      if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line);      ++end_ptr; -    const char *start = end_ptr; -    long int count = std::strtol(start, &end_ptr, 10); -    if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count); -    if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line); -    number.push_back(count); +    number.push_back(ReadCount(end_ptr));    }  } @@ -103,7 +116,7 @@ void ReadBackoff(util::FilePiece &in, float &backoff) {  		int float_class = _fpclass(backoff);          UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff);  #else -        int float_class = fpclassify(backoff); +        int float_class = std::fpclassify(backoff);          UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);  #endif        }  | 
