From be98f29f51350c24136c191f01af3fbfe340ef78 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Mon, 13 Dec 2010 16:18:34 -0500
Subject: new version of kenlm

---
 klm/util/bit_packing.cc      | 13 ++++++
 klm/util/bit_packing.hh      | 48 +++++++++++++--------
 klm/util/bit_packing_test.cc | 46 ++++++++++++++++++++
 klm/util/exception.cc        |  5 +++
 klm/util/file_piece.cc       | 99 ++++++++++++++++++++++++++++++--------------
 klm/util/file_piece.hh       |  5 +++
 klm/util/file_piece_test.cc  | 29 +++++++++----
 klm/util/mmap.cc             | 24 ++++++++---
 klm/util/murmur_hash.hh      |  2 +-
 klm/util/scoped.cc           | 14 +++++--
 klm/util/string_piece.hh     |  2 +-
 11 files changed, 218 insertions(+), 69 deletions(-)
 create mode 100644 klm/util/bit_packing_test.cc

(limited to 'klm/util')

diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc
index dd14ffe1..9d4fdf27 100644
--- a/klm/util/bit_packing.cc
+++ b/klm/util/bit_packing.cc
@@ -1,12 +1,15 @@
 #include "util/bit_packing.hh"
 #include "util/exception.hh"
 
+#include <string.h>
+
 namespace util {
 
 namespace {
 template <bool> struct StaticCheck {};
 template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };
 
+// If your float isn't 4 bytes, we're hosed.  
 typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;
 
 } // namespace
@@ -21,6 +24,16 @@ uint8_t RequiredBits(uint64_t max_value) {
 void BitPackingSanity() {
   const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
   if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
+  char mem[57+8];
+  memset(mem, 0, sizeof(mem));
+  const uint64_t test57 = 0x123456789abcdefULL;
+  for (uint64_t b = 0; b < 57 * 8; b += 57) {
+    WriteInt57(mem + b / 8, b % 8, 57, test57);
+  }
+  for (uint64_t b = 0; b < 57 * 8; b += 57) {
+    if (test57 != ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1))
+      UTIL_THROW(Exception, "The bit packing routines are failing for your architecture.  Please send a bug report with your architecture, operating system, and compiler.");
+  }
   // TODO: more checks.  
 }
 
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 422ed873..0fd39d7f 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -6,56 +6,68 @@
 #include <assert.h>
 #ifdef __APPLE__
 #include <architecture/byte_order.h>
-#else
+#elif __linux__
 #include <endian.h>
-#endif
+#else
+#include <arpa/nameser_compat.h>
+#endif 
 
 #include <inttypes.h>
 
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-#error The bit aligned storage functions assume little endian architecture
-#endif
-
 namespace util {
 
 /* WARNING WARNING WARNING:
  * The write functions assume that memory is zero initially.  This makes them
  * faster and is the appropriate case for mmapped language model construction.
  * These routines assume that unaligned access to uint64_t is fast and that
- * storage is little endian.  This is the case on x86_64.  It may not be the
- * case on 32-bit x86 but my target audience is large language models for which
- * 64-bit is necessary.  
+ * byte order is little or big endian (see BitPackShift).  I'm not sure how 
+ * fast unaligned 64-bit access is on x86 but my target audience is large
+ * language models for which 64-bit is necessary.  
+ *
+ * Call the BitPackingSanity function to sanity check.  Calling once suffices,
+ * but repeated calls are fine if arranging a single call is inconvenient.  
  */
 
+inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
+// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.  
+#if BYTE_ORDER == LITTLE_ENDIAN
+  return bit;
+#elif BYTE_ORDER == BIG_ENDIAN
+  return 64 - length - bit;
+#else
+#error "Bit packing code isn't written for your byte order."
+#endif
+}
+
 /* Pack integers up to 57 bits using their least significant digits. 
  * The length is specified using mask:
  * Assumes mask == (1 << length) - 1 where length <= 57.   
  */
-inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) {
-  return (*reinterpret_cast<const uint64_t*>(base) >> bit) & mask;
+inline uint64_t ReadInt57(const void *base, uint8_t bit, uint8_t length, uint64_t mask) {
+  return (*reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, length)) & mask;
 }
-/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57.
+/* Assumes value < (1 << length) and length <= 57.
  * Assumes the memory is zero initially. 
  */
-inline void WriteInt57(void *base, uint8_t bit, uint64_t value) {
-  *reinterpret_cast<uint64_t*>(base) |= (value << bit);
+inline void WriteInt57(void *base, uint8_t bit, uint8_t length, uint64_t value) {
+  *reinterpret_cast<uint64_t*>(base) |= (value << BitPackShift(bit, length));
 }
 
 namespace detail { typedef union { float f; uint32_t i; } FloatEnc; }
 inline float ReadFloat32(const void *base, uint8_t bit) {
   detail::FloatEnc encoded;
-  encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit;
+  encoded.i = *reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, 32);
   return encoded.f;
 }
 inline void WriteFloat32(void *base, uint8_t bit, float value) {
   detail::FloatEnc encoded;
   encoded.f = value;
-  WriteInt57(base, bit, encoded.i);
+  WriteInt57(base, bit, 32, encoded.i);
 }
 
 inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) {
   detail::FloatEnc encoded;
-  encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit;
+  encoded.i = *reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, 31);
   // Sign bit set means negative.  
   encoded.i |= 0x80000000;
   return encoded.f;
@@ -65,7 +77,7 @@ inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) {
   detail::FloatEnc encoded;
   encoded.f = value;
   encoded.i &= ~0x80000000;
-  WriteInt57(base, bit, encoded.i);
+  WriteInt57(base, bit, 31, encoded.i);
 }
 
 void BitPackingSanity();
diff --git a/klm/util/bit_packing_test.cc b/klm/util/bit_packing_test.cc
new file mode 100644
index 00000000..c578ddd1
--- /dev/null
+++ b/klm/util/bit_packing_test.cc
@@ -0,0 +1,46 @@
+#include "util/bit_packing.hh"
+
+#define BOOST_TEST_MODULE BitPackingTest
+#include <boost/test/unit_test.hpp>
+
+#include <string.h>
+
+namespace util {
+namespace {
+
+const uint64_t test57 = 0x123456789abcdefULL;
+
+BOOST_AUTO_TEST_CASE(ZeroBit) {
+  char mem[16];
+  memset(mem, 0, sizeof(mem));
+  WriteInt57(mem, 0, 57, test57);
+  BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
+}
+
+BOOST_AUTO_TEST_CASE(EachBit) {
+  char mem[16];
+  for (uint8_t b = 0; b < 8; ++b) {
+    memset(mem, 0, sizeof(mem));
+    WriteInt57(mem, b, 57, test57);
+    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
+  }
+}
+
+BOOST_AUTO_TEST_CASE(Consecutive) {
+  char mem[57+8];
+  memset(mem, 0, sizeof(mem));
+  for (uint64_t b = 0; b < 57 * 8; b += 57) {
+    WriteInt57(mem + (b / 8), b % 8, 57, test57);
+    BOOST_CHECK_EQUAL(test57, ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1));
+  }
+  for (uint64_t b = 0; b < 57 * 8; b += 57) {
+    BOOST_CHECK_EQUAL(test57, ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1));
+  }
+}
+
+BOOST_AUTO_TEST_CASE(Sanity) {
+  BitPackingSanity();
+}
+
+} // namespace
+} // namespace util
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index dd337a76..de6dd43c 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -24,7 +24,12 @@ const char *HandleStrerror(const char *ret, const char *buf) {
 ErrnoException::ErrnoException() throw() : errno_(errno) {
   char buf[200];
   buf[0] = 0;
+#ifdef sun
+  const char *add = strerror(errno);
+#else
   const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
+#endif
+
   if (add) {
     *this << add << ' ';
   }
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index e7bd8659..5a667ebb 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -2,12 +2,12 @@
 
 #include "util/exception.hh"
 
+#include <iostream>
 #include <string>
 #include <limits>
 
 #include <assert.h>
 #include <ctype.h>
-#include <err.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/mman.h>
@@ -27,7 +27,7 @@ EndOfFileException::EndOfFileException() throw() {
 EndOfFileException::~EndOfFileException() throw() {}
 
 ParseNumberException::ParseNumberException(StringPiece value) throw() {
-  *this << "Could not parse \"" << value << "\" into a float";
+  *this << "Could not parse \"" << value << "\" into a number";
 }
 
 GZException::GZException(void *file) {
@@ -68,12 +68,52 @@ FilePiece::~FilePiece() {
     file_.release();
     int ret;
     if (Z_OK != (ret = gzclose(gz_file_))) {
-      errx(1, "could not close file %s using zlib", file_name_.c_str());
+      std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl;
+      abort();
     }
   }
 #endif
 }
 
+StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
+  const char *start = position_;
+  do {
+    for (const char *i = start; i < position_end_; ++i) {
+      if (*i == delim) {
+        StringPiece ret(position_, i - position_);
+        position_ = i + 1;
+        return ret;
+      }
+    }
+    size_t skip = position_end_ - position_;
+    Shift();
+    start = position_ + skip;
+  } while (!at_end_);
+  StringPiece ret(position_, position_end_ - position_);
+  position_ = position_end_;
+  return ret;
+}
+
+float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
+  return ReadNumber<float>();
+}
+double FilePiece::ReadDouble() throw(GZException, EndOfFileException, ParseNumberException) {
+  return ReadNumber<double>();
+}
+long int FilePiece::ReadLong() throw(GZException, EndOfFileException, ParseNumberException) {
+  return ReadNumber<long int>();
+}
+unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, ParseNumberException) {
+  return ReadNumber<unsigned long int>();
+}
+
+void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {  // Advances position_ to the next non-space character.
+  for (; ; ++position_) {
+    if (position_ == position_end_) Shift();  // No buffered characters left to examine.
+    if (!isspace(static_cast<unsigned char>(*position_))) return;  // Cast: isspace on a negative char is undefined.
+  }
+}
+
 void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
 #ifdef HAVE_ZLIB
   gz_file_ = NULL;
@@ -108,14 +148,34 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t
   }
 }
 
-float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
+namespace {
+void ParseNumber(const char *begin, char *&end, float &out) {
+#ifdef sun
+  out = static_cast<float>(strtod(begin, &end));
+#else
+  out = strtof(begin, &end);
+#endif
+}
+void ParseNumber(const char *begin, char *&end, double &out) {
+  out = strtod(begin, &end);
+}
+void ParseNumber(const char *begin, char *&end, long int &out) {
+  out = strtol(begin, &end, 10);
+}
+void ParseNumber(const char *begin, char *&end, unsigned long int &out) {
+  out = strtoul(begin, &end, 10);
+}
+} // namespace
+
+template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileException, ParseNumberException) {
   SkipSpaces();
   while (last_space_ < position_) {
     if (at_end_) {
       // Hallucinate a null off the end of the file.
       std::string buffer(position_, position_end_);
       char *end;
-      float ret = strtof(buffer.c_str(), &end);
+      T ret;
+      ParseNumber(buffer.c_str(), end, ret);
       if (buffer.c_str() == end) throw ParseNumberException(buffer);
       position_ += end - buffer.c_str();
       return ret;
@@ -123,19 +183,13 @@ float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberE
     Shift();
   }
   char *end;
-  float ret = strtof(position_, &end);
+  T ret;
+  ParseNumber(position_, end, ret);
   if (end == position_) throw ParseNumberException(ReadDelimited());
   position_ = end;
   return ret;
 }
 
-void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
-  for (; ; ++position_) {
-    if (position_ == position_end_) Shift();
-    if (!isspace(*position_)) return;
-  }
-}
-
 const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) {
   for (const char *i = position_; i <= last_space_; ++i) {
     if (isspace(*i)) return i;
@@ -150,25 +204,6 @@ const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileExcepti
   return position_end_;
 }
 
-StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
-  const char *start = position_;
-  do {
-    for (const char *i = start; i < position_end_; ++i) {
-      if (*i == delim) {
-        StringPiece ret(position_, i - position_);
-        position_ = i + 1;
-        return ret;
-      }
-    }
-    size_t skip = position_end_ - position_;
-    Shift();
-    start = position_ + skip;
-  } while (!at_end_);
-  StringPiece ret(position_, position_end_ - position_);
-  position_ = position_end_;
-  return ret;
-}
-
 void FilePiece::Shift() throw(GZException, EndOfFileException) {
   if (at_end_) {
     progress_.Finished();
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index 11d4a751..b7697e71 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -68,6 +68,9 @@ class FilePiece {
     StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
 
     float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException);
+    double ReadDouble() throw(GZException, EndOfFileException, ParseNumberException);
+    long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException);
+    unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException);
 
     void SkipSpaces() throw (GZException, EndOfFileException);
 
@@ -80,6 +83,8 @@ class FilePiece {
   private:
     void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException);
 
+    template <class T> T ReadNumber() throw(GZException, EndOfFileException, ParseNumberException);
+
     StringPiece Consume(const char *to) {
       StringPiece ret(position_, to - position_);
       position_ = to;
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index 23e79fe0..dc9ec7e7 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -8,6 +8,8 @@
 #include <iostream>
 
 #include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 
 namespace util {
 namespace {
@@ -27,14 +29,18 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {
   BOOST_CHECK_THROW(test.get(), EndOfFileException);
 }
 
+#ifndef __APPLE__
+/* Apple isn't happy with the combination of popen, fileno, and dup, and I
+ * don't want to reimplement popen.  This is an issue with the test.  
+ */
 /* read() implementation */
 BOOST_AUTO_TEST_CASE(StreamReadLine) {
   std::fstream ref("file_piece.cc", std::ios::in);
 
-  scoped_FILE catter(popen("cat file_piece.cc", "r"));
-  BOOST_REQUIRE(catter.get());
+  FILE *catter = popen("cat file_piece.cc", "r");
+  BOOST_REQUIRE(catter);
   
-  FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
+  FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
   std::string ref_line;
   while (getline(ref, ref_line)) {
     StringPiece test_line(test.ReadLine());
@@ -44,7 +50,9 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {
     }
   }
   BOOST_CHECK_THROW(test.get(), EndOfFileException);
+  BOOST_REQUIRE(!pclose(catter));
 }
+#endif // __APPLE__
 
 #ifdef HAVE_ZLIB
 
@@ -64,14 +72,17 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
   }
   BOOST_CHECK_THROW(test.get(), EndOfFileException);
 }
-// gzip stream
+
+// gzip stream.  Apple doesn't like popen, fileno, dup.  This is an issue with
+// the test.  
+#ifndef __APPLE__
 BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
   std::fstream ref("file_piece.cc", std::ios::in);
 
-  scoped_FILE catter(popen("gzip <file_piece.cc", "r"));
-  BOOST_REQUIRE(catter.get());
+  FILE * catter = popen("gzip <file_piece.cc", "r");
+  BOOST_REQUIRE(catter);
   
-  FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
+  FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
   std::string ref_line;
   while (getline(ref, ref_line)) {
     StringPiece test_line(test.ReadLine());
@@ -81,9 +92,11 @@ BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
     }
   }
   BOOST_CHECK_THROW(test.get(), EndOfFileException);
+  BOOST_REQUIRE(!pclose(catter));
 }
+#endif // __APPLE__
 
-#endif
+#endif // HAVE_ZLIB
 
 } // namespace
 } // namespace util
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 8685170f..5a810c64 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -2,8 +2,9 @@
 #include "util/mmap.hh"
 #include "util/scoped.hh"
 
+#include <iostream>
+
 #include <assert.h>
-#include <err.h>
 #include <fcntl.h>
 #include <sys/types.h>
 #include <sys/mman.h>
@@ -14,8 +15,10 @@ namespace util {
 
 scoped_mmap::~scoped_mmap() {
   if (data_ != (void*)-1) {
-    if (munmap(data_, size_))
-      err(1, "munmap failed ");
+    if (munmap(data_, size_)) {
+      std::cerr << "munmap failed for " << size_ << " bytes." << std::endl;
+      abort();
+    }
   }
 }
 
@@ -73,18 +76,27 @@ void ReadAll(int fd, void *to_void, std::size_t amount) {
     to += ret;
   }
 }
+
+const int kFileFlags =
+#ifdef MAP_FILE
+  MAP_FILE | MAP_SHARED
+#else
+  MAP_SHARED
+#endif
+  ;
+
 } // namespace
 
 void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) {
   switch (method) {
     case LAZY:
-      out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
+      out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
       break;
     case POPULATE_OR_LAZY:
 #ifdef MAP_POPULATE
     case POPULATE_OR_READ:
 #endif
-      out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
+      out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
       break;
 #ifndef MAP_POPULATE
     case POPULATE_OR_READ:
@@ -115,7 +127,7 @@ void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
   if (-1 == ftruncate(file.get(), size))
     UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
   try {
-    return MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0);
+    return MapOrThrow(size, true, kFileFlags, false, file.get(), 0);
   } catch (ErrnoException &e) {
     e << " in file " << name;
     throw;
diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh
index 638aaeb2..78fe583f 100644
--- a/klm/util/murmur_hash.hh
+++ b/klm/util/murmur_hash.hh
@@ -1,7 +1,7 @@
 #ifndef UTIL_MURMUR_HASH__
 #define UTIL_MURMUR_HASH__
 #include <cstddef>
-#include <stdint.h>
+#include <inttypes.h>
 
 namespace util {
 
diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc
index 2c6d5394..a4cc5016 100644
--- a/klm/util/scoped.cc
+++ b/klm/util/scoped.cc
@@ -1,16 +1,24 @@
 #include "util/scoped.hh"
 
-#include <err.h>
+#include <iostream>
+
+#include <stdlib.h>
 #include <unistd.h>
 
 namespace util {
 
 scoped_fd::~scoped_fd() {
-  if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_);
+  if (fd_ != -1 && close(fd_)) {
+    std::cerr << "Could not close file " << fd_ << std::endl;
+    abort();
+  }
 }
 
 scoped_FILE::~scoped_FILE() {
-  if (file_ && fclose(file_)) err(1, "Could not close file");
+  if (file_ && fclose(file_)) {
+    std::cerr << "Could not close file " << std::endl;
+    abort();
+  }
 }
 
 } // namespace util
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 4557173b..3ac2f8a7 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -51,7 +51,7 @@
 //Uncomment this line if you use ICU in your code.  
 //#define HAVE_ICU
 //Uncomment this line if you want boost hashing for your StringPieces.
-#define HAVE_BOOST
+//#define HAVE_BOOST
 
 #include <cstring>
 #include <iosfwd>
-- 
cgit v1.2.3