From be98f29f51350c24136c191f01af3fbfe340ef78 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Dec 2010 16:18:34 -0500 Subject: new version of kenlm --- klm/util/bit_packing.cc | 13 ++++++ klm/util/bit_packing.hh | 48 +++++++++++++-------- klm/util/bit_packing_test.cc | 46 ++++++++++++++++++++ klm/util/exception.cc | 5 +++ klm/util/file_piece.cc | 99 ++++++++++++++++++++++++++++++-------------- klm/util/file_piece.hh | 5 +++ klm/util/file_piece_test.cc | 29 +++++++++---- klm/util/mmap.cc | 24 ++++++++--- klm/util/murmur_hash.hh | 2 +- klm/util/scoped.cc | 14 +++++-- klm/util/string_piece.hh | 2 +- 11 files changed, 218 insertions(+), 69 deletions(-) create mode 100644 klm/util/bit_packing_test.cc (limited to 'klm/util') diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc index dd14ffe1..9d4fdf27 100644 --- a/klm/util/bit_packing.cc +++ b/klm/util/bit_packing.cc @@ -1,12 +1,15 @@ #include "util/bit_packing.hh" #include "util/exception.hh" +#include + namespace util { namespace { template struct StaticCheck {}; template <> struct StaticCheck { typedef bool StaticAssertionPassed; }; +// If your float isn't 4 bytes, we're hosed. typedef StaticCheck::StaticAssertionPassed FloatSize; } // namespace @@ -21,6 +24,16 @@ uint8_t RequiredBits(uint64_t max_value) { void BitPackingSanity() { const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); + char mem[57+8]; + memset(mem, 0, sizeof(mem)); + const uint64_t test57 = 0x123456789abcdefULL; + for (uint64_t b = 0; b < 57 * 8; b += 57) { + WriteInt57(mem + b / 8, b % 8, 57, test57); + } + for (uint64_t b = 0; b < 57 * 8; b += 57) { + if (test57 != ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1)) + UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler."); + } // TODO: more checks. } diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 422ed873..0fd39d7f 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -6,56 +6,68 @@ #include #ifdef __APPLE__ #include -#else +#elif __linux__ #include -#endif +#else +#include +#endif #include -#if __BYTE_ORDER != __LITTLE_ENDIAN -#error The bit aligned storage functions assume little endian architecture -#endif - namespace util { /* WARNING WARNING WARNING: * The write functions assume that memory is zero initially. This makes them * faster and is the appropriate case for mmapped language model construction. * These routines assume that unaligned access to uint64_t is fast and that - * storage is little endian. This is the case on x86_64. It may not be the - * case on 32-bit x86 but my target audience is large language models for which - * 64-bit is necessary. + * storage is little endian. This is the case on x86_64. I'm not sure how + * fast unaligned 64-bit access is on x86 but my target audience is large + * language models for which 64-bit is necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. */ +inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { +// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. +#if BYTE_ORDER == LITTLE_ENDIAN + return bit; +#elif BYTE_ORDER == BIG_ENDIAN + return 64 - length - bit; +#else +#error "Bit packing code isn't written for your byte order." +#endif +} + /* Pack integers up to 57 bits using their least significant digits. * The length is specified using mask: * Assumes mask == (1 << length) - 1 where length <= 57. */ -inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) { - return (*reinterpret_cast(base) >> bit) & mask; +inline uint64_t ReadInt57(const void *base, uint8_t bit, uint8_t length, uint64_t mask) { + return (*reinterpret_cast(base) >> BitPackShift(bit, length)) & mask; } -/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57. +/* Assumes value < (1 << length) and length <= 57. * Assumes the memory is zero initially. */ -inline void WriteInt57(void *base, uint8_t bit, uint64_t value) { - *reinterpret_cast(base) |= (value << bit); +inline void WriteInt57(void *base, uint8_t bit, uint8_t length, uint64_t value) { + *reinterpret_cast(base) |= (value << BitPackShift(bit, length)); } namespace detail { typedef union { float f; uint32_t i; } FloatEnc; } inline float ReadFloat32(const void *base, uint8_t bit) { detail::FloatEnc encoded; - encoded.i = *reinterpret_cast(base) >> bit; + encoded.i = *reinterpret_cast(base) >> BitPackShift(bit, 32); return encoded.f; } inline void WriteFloat32(void *base, uint8_t bit, float value) { detail::FloatEnc encoded; encoded.f = value; - WriteInt57(base, bit, encoded.i); + WriteInt57(base, bit, 32, encoded.i); } inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) { detail::FloatEnc encoded; - encoded.i = *reinterpret_cast(base) >> bit; + encoded.i = *reinterpret_cast(base) >> BitPackShift(bit, 31); // Sign bit set means negative. encoded.i |= 0x80000000; return encoded.f; @@ -65,7 +77,7 @@ inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) { detail::FloatEnc encoded; encoded.f = value; encoded.i &= ~0x80000000; - WriteInt57(base, bit, encoded.i); + WriteInt57(base, bit, 31, encoded.i); } void BitPackingSanity(); diff --git a/klm/util/bit_packing_test.cc b/klm/util/bit_packing_test.cc new file mode 100644 index 00000000..c578ddd1 --- /dev/null +++ b/klm/util/bit_packing_test.cc @@ -0,0 +1,46 @@ +#include "util/bit_packing.hh" + +#define BOOST_TEST_MODULE BitPackingTest +#include + +#include + +namespace util { +namespace { + +const uint64_t test57 = 0x123456789abcdefULL; + +BOOST_AUTO_TEST_CASE(ZeroBit) { + char mem[16]; + memset(mem, 0, sizeof(mem)); + WriteInt57(mem, 0, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1)); +} + +BOOST_AUTO_TEST_CASE(EachBit) { + char mem[16]; + for (uint8_t b = 0; b < 8; ++b) { + memset(mem, 0, sizeof(mem)); + WriteInt57(mem, b, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Consecutive) { + char mem[57+8]; + memset(mem, 0, sizeof(mem)); + for (uint64_t b = 0; b < 57 * 8; b += 57) { + WriteInt57(mem + (b / 8), b % 8, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1)); + } + for (uint64_t b = 0; b < 57 * 8; b += 57) { + BOOST_CHECK_EQUAL(test57, ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Sanity) { + BitPackingSanity(); +} + +} // namespace +} // namespace util diff --git a/klm/util/exception.cc b/klm/util/exception.cc index dd337a76..de6dd43c 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -24,7 +24,12 @@ const char *HandleStrerror(const char *ret, const char *buf) { ErrnoException::ErrnoException() throw() : errno_(errno) { char buf[200]; buf[0] = 0; +#ifdef sun + const char *add = strerror(errno); +#else const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); +#endif + if (add) { *this << add << ' '; } diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index e7bd8659..5a667ebb 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,12 +2,12 @@ #include "util/exception.hh" +#include #include #include #include #include -#include #include #include #include @@ -27,7 +27,7 @@ EndOfFileException::EndOfFileException() throw() { EndOfFileException::~EndOfFileException() throw() {} ParseNumberException::ParseNumberException(StringPiece value) throw() { - *this << "Could not parse \"" << value << "\" into a float"; + *this << "Could not parse \"" << value << "\" into a number"; } GZException::GZException(void *file) { @@ -68,12 +68,52 @@ FilePiece::~FilePiece() { file_.release(); int ret; if (Z_OK != (ret = gzclose(gz_file_))) { - errx(1, "could not close file %s using zlib", file_name_.c_str()); + std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl; + abort(); } } #endif } +StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) { + const char *start = position_; + do { + for (const char *i = start; i < position_end_; ++i) { + if (*i == delim) { + StringPiece ret(position_, i - position_); + position_ = i + 1; + return ret; + } + } + size_t skip = position_end_ - position_; + Shift(); + start = position_ + skip; + } while (!at_end_); + StringPiece ret(position_, position_end_ - position_); + position_ = position_end_; + return ret; +} + +float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) { + return ReadNumber(); +} +double FilePiece::ReadDouble() throw(GZException, EndOfFileException, ParseNumberException) { + return ReadNumber(); +} +long int FilePiece::ReadLong() throw(GZException, EndOfFileException, ParseNumberException) { + return ReadNumber(); +} +unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, ParseNumberException) { + return ReadNumber(); +} + +void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) { + for (; ; ++position_) { + if (position_ == position_end_) Shift(); + if (!isspace(*position_)) return; + } +} + void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) { #ifdef HAVE_ZLIB gz_file_ = NULL; @@ -108,14 +148,34 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t } } -float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) { +namespace { +void ParseNumber(const char *begin, char *&end, float &out) { +#ifdef sun + out = static_cast(strtod(begin, &end)); +#else + out = strtof(begin, &end); +#endif +} +void ParseNumber(const char *begin, char *&end, double &out) { + out = strtod(begin, &end); +} +void ParseNumber(const char *begin, char *&end, long int &out) { + out = strtol(begin, &end, 10); +} +void ParseNumber(const char *begin, char *&end, unsigned long int &out) { + out = strtoul(begin, &end, 10); +} +} // namespace + +template T FilePiece::ReadNumber() throw(GZException, EndOfFileException, ParseNumberException) { SkipSpaces(); while (last_space_ < position_) { if (at_end_) { // Hallucinate a null off the end of the file. std::string buffer(position_, position_end_); char *end; - float ret = strtof(buffer.c_str(), &end); + T ret; + ParseNumber(buffer.c_str(), end, ret); if (buffer.c_str() == end) throw ParseNumberException(buffer); position_ += end - buffer.c_str(); return ret; @@ -123,19 +183,13 @@ float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberE Shift(); } char *end; - float ret = strtof(position_, &end); + T ret; + ParseNumber(position_, end, ret); if (end == position_) throw ParseNumberException(ReadDelimited()); position_ = end; return ret; } -void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) { - for (; ; ++position_) { - if (position_ == position_end_) Shift(); - if (!isspace(*position_)) return; - } -} - const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) { for (const char *i = position_; i <= last_space_; ++i) { if (isspace(*i)) return i; @@ -150,25 +204,6 @@ const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileExcepti return position_end_; } -StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) { - const char *start = position_; - do { - for (const char *i = start; i < position_end_; ++i) { - if (*i == delim) { - StringPiece ret(position_, i - position_); - position_ = i + 1; - return ret; - } - } - size_t skip = position_end_ - position_; - Shift(); - start = position_ + skip; - } while (!at_end_); - StringPiece ret(position_, position_end_ - position_); - position_ = position_end_; - return ret; -} - void FilePiece::Shift() throw(GZException, EndOfFileException) { if (at_end_) { progress_.Finished(); diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 11d4a751..b7697e71 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -68,6 +68,9 @@ class FilePiece { StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException); float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException); + double ReadDouble() throw(GZException, EndOfFileException, ParseNumberException); + long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException); + unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException); void SkipSpaces() throw (GZException, EndOfFileException); @@ -80,6 +83,8 @@ class FilePiece { private: void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException); + template T ReadNumber() throw(GZException, EndOfFileException, ParseNumberException); + StringPiece Consume(const char *to) { StringPiece ret(position_, to - position_); position_ = to; diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index 23e79fe0..dc9ec7e7 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -8,6 +8,8 @@ #include #include +#include +#include namespace util { namespace { @@ -27,14 +29,18 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) { BOOST_CHECK_THROW(test.get(), EndOfFileException); } +#ifndef __APPLE__ +/* Apple isn't happy with the popen, fileno, dup. And I don't want to + * reimplement popen. This is an issue with the test. + */ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { std::fstream ref("file_piece.cc", std::ios::in); - scoped_FILE catter(popen("cat file_piece.cc", "r")); - BOOST_REQUIRE(catter.get()); + FILE *catter = popen("cat file_piece.cc", "r"); + BOOST_REQUIRE(catter); - FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1); + FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -44,7 +50,9 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) { } } BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); } +#endif // __APPLE__ #ifdef HAVE_ZLIB @@ -64,14 +72,17 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) { } BOOST_CHECK_THROW(test.get(), EndOfFileException); } -// gzip stream + +// gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with +// the test. +#ifndef __APPLE__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { std::fstream ref("file_piece.cc", std::ios::in); - scoped_FILE catter(popen("gzip + #include -#include #include #include #include @@ -14,8 +15,10 @@ namespace util { scoped_mmap::~scoped_mmap() { if (data_ != (void*)-1) { - if (munmap(data_, size_)) - err(1, "munmap failed "); + if (munmap(data_, size_)) { + std::cerr << "munmap failed for " << size_ << " bytes." << std::endl; + abort(); + } } } @@ -73,18 +76,27 @@ void ReadAll(int fd, void *to_void, std::size_t amount) { to += ret; } } + +const int kFileFlags = +#ifdef MAP_FILE + MAP_FILE | MAP_SHARED +#else + MAP_SHARED +#endif + ; + } // namespace void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) { switch (method) { case LAZY: - out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); break; case POPULATE_OR_LAZY: #ifdef MAP_POPULATE case POPULATE_OR_READ: #endif - out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); break; #ifndef MAP_POPULATE case POPULATE_OR_READ: @@ -115,7 +127,7 @@ void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { if (-1 == ftruncate(file.get(), size)) UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed"); try { - return MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0); + return MapOrThrow(size, true, kFileFlags, false, file.get(), 0); } catch (ErrnoException &e) { e << " in file " << name; throw; diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index 638aaeb2..78fe583f 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -1,7 +1,7 @@ #ifndef UTIL_MURMUR_HASH__ #define UTIL_MURMUR_HASH__ #include -#include +#include namespace util { diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc index 2c6d5394..a4cc5016 100644 --- a/klm/util/scoped.cc +++ b/klm/util/scoped.cc @@ -1,16 +1,24 @@ #include "util/scoped.hh" -#include +#include + +#include #include namespace util { scoped_fd::~scoped_fd() { - if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_); + if (fd_ != -1 && close(fd_)) { + std::cerr << "Could not close file " << fd_ << std::endl; + abort(); + } } scoped_FILE::~scoped_FILE() { - if (file_ && fclose(file_)) err(1, "Could not close file"); + if (file_ && fclose(file_)) { + std::cerr << "Could not close file " << std::endl; + abort(); + } } } // namespace util diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 4557173b..3ac2f8a7 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -51,7 +51,7 @@ //Uncomment this line if you use ICU in your code. //#define HAVE_ICU //Uncomment this line if you want boost hashing for your StringPieces. -#define HAVE_BOOST +//#define HAVE_BOOST #include #include -- cgit v1.2.3