From be98f29f51350c24136c191f01af3fbfe340ef78 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Dec 2010 16:18:34 -0500 Subject: new version of kenlm --- klm/util/bit_packing.hh | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'klm/util/bit_packing.hh') diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 422ed873..0fd39d7f 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -6,56 +6,68 @@ #include #ifdef __APPLE__ #include -#else +#elif __linux__ #include -#endif +#else +#include +#endif #include -#if __BYTE_ORDER != __LITTLE_ENDIAN -#error The bit aligned storage functions assume little endian architecture -#endif - namespace util { /* WARNING WARNING WARNING: * The write functions assume that memory is zero initially. This makes them * faster and is the appropriate case for mmapped language model construction. * These routines assume that unaligned access to uint64_t is fast and that - * storage is little endian. This is the case on x86_64. It may not be the - * case on 32-bit x86 but my target audience is large language models for which - * 64-bit is necessary. + * storage is little endian. This is the case on x86_64. I'm not sure how + * fast unaligned 64-bit access is on x86 but my target audience is large + * language models for which 64-bit is necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. */ +inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { +// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. +#if BYTE_ORDER == LITTLE_ENDIAN + return bit; +#elif BYTE_ORDER == BIG_ENDIAN + return 64 - length - bit; +#else +#error "Bit packing code isn't written for your byte order." +#endif +} + /* Pack integers up to 57 bits using their least significant digits. * The length is specified using mask: * Assumes mask == (1 << length) - 1 where length <= 57. */ -inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) { - return (*reinterpret_cast(base) >> bit) & mask; +inline uint64_t ReadInt57(const void *base, uint8_t bit, uint8_t length, uint64_t mask) { + return (*reinterpret_cast(base) >> BitPackShift(bit, length)) & mask; } -/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57. +/* Assumes value < (1 << length) and length <= 57. * Assumes the memory is zero initially. */ -inline void WriteInt57(void *base, uint8_t bit, uint64_t value) { - *reinterpret_cast(base) |= (value << bit); +inline void WriteInt57(void *base, uint8_t bit, uint8_t length, uint64_t value) { + *reinterpret_cast(base) |= (value << BitPackShift(bit, length)); } namespace detail { typedef union { float f; uint32_t i; } FloatEnc; } inline float ReadFloat32(const void *base, uint8_t bit) { detail::FloatEnc encoded; - encoded.i = *reinterpret_cast(base) >> bit; + encoded.i = *reinterpret_cast(base) >> BitPackShift(bit, 32); return encoded.f; } inline void WriteFloat32(void *base, uint8_t bit, float value) { detail::FloatEnc encoded; encoded.f = value; - WriteInt57(base, bit, encoded.i); + WriteInt57(base, bit, 32, encoded.i); } inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) { detail::FloatEnc encoded; - encoded.i = *reinterpret_cast(base) >> bit; + encoded.i = *reinterpret_cast(base) >> BitPackShift(bit, 31); // Sign bit set means negative. encoded.i |= 0x80000000; return encoded.f; @@ -65,7 +77,7 @@ inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) { detail::FloatEnc encoded; encoded.f = value; encoded.i &= ~0x80000000; - WriteInt57(base, bit, encoded.i); + WriteInt57(base, bit, 31, encoded.i); } void BitPackingSanity(); -- cgit v1.2.3