summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-13 16:18:34 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-13 16:18:34 -0500
commit: be98f29f51350c24136c191f01af3fbfe340ef78 (patch)
tree: 2e104152110ca76b527147458050a41934e031f2 /klm/util
parent: 063c0623aaf5dad8d02e5eae5793c123cd7fc3fe (diff)
new version of kenlm
Diffstat (limited to 'klm/util')
-rw-r--r-- klm/util/bit_packing.cc 13
-rw-r--r-- klm/util/bit_packing.hh 48
-rw-r--r-- klm/util/bit_packing_test.cc 46
-rw-r--r-- klm/util/exception.cc 5
-rw-r--r-- klm/util/file_piece.cc 99
-rw-r--r-- klm/util/file_piece.hh 5
-rw-r--r-- klm/util/file_piece_test.cc 29
-rw-r--r-- klm/util/mmap.cc 24
-rw-r--r-- klm/util/murmur_hash.hh 2
-rw-r--r-- klm/util/scoped.cc 14
-rw-r--r-- klm/util/string_piece.hh 2
11 files changed, 218 insertions, 69 deletions
diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc
index dd14ffe1..9d4fdf27 100644
--- a/klm/util/bit_packing.cc
+++ b/klm/util/bit_packing.cc
@@ -1,12 +1,15 @@
#include "util/bit_packing.hh"
#include "util/exception.hh"
+#include <string.h>
+
namespace util {
namespace {
template <bool> struct StaticCheck {};
template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };
+// Compile-time check: the typedef below only exists when sizeof(float) == 4, so the build fails on any other float size.
typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;
} // namespace
@@ -21,6 +24,16 @@ uint8_t RequiredBits(uint64_t max_value) {
void BitPackingSanity() {
const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
+ char mem[57+8];
+ memset(mem, 0, sizeof(mem));
+ const uint64_t test57 = 0x123456789abcdefULL;
+ for (uint64_t b = 0; b < 57 * 8; b += 57) {
+ WriteInt57(mem + b / 8, b % 8, 57, test57);
+ }
+ for (uint64_t b = 0; b < 57 * 8; b += 57) {
+ if (test57 != ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1))
+ UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler.");
+ }
// TODO: more checks.
}
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 422ed873..0fd39d7f 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -6,56 +6,68 @@
#include <assert.h>
#ifdef __APPLE__
#include <architecture/byte_order.h>
-#else
+#elif __linux__
#include <endian.h>
-#endif
+#else
+#include <arpa/nameser_compat.h>
+#endif
#include <inttypes.h>
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-#error The bit aligned storage functions assume little endian architecture
-#endif
-
namespace util {
/* WARNING WARNING WARNING:
* The write functions assume that memory is zero initially. This makes them
* faster and is the appropriate case for mmapped language model construction.
* These routines assume that unaligned access to uint64_t is fast and that
- * storage is little endian. This is the case on x86_64. It may not be the
- * case on 32-bit x86 but my target audience is large language models for which
- * 64-bit is necessary.
+ * the byte order is little endian (as on x86_64) or big endian; BitPackShift
+ * handles both and any other order is a compile error. I'm not sure how fast
+ * unaligned 64-bit access is on 32-bit x86, but my target audience is large
+ * language models, for which 64-bit is necessary.
+ *
+ * Call BitPackingSanity() to verify the routines on this platform. One call
+ * suffices; repeated calls are harmless when a single call is inconvenient.
*/
+inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
+// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
+#if BYTE_ORDER == LITTLE_ENDIAN
+ return bit;
+#elif BYTE_ORDER == BIG_ENDIAN
+ return 64 - length - bit;
+#else
+#error "Bit packing code isn't written for your byte order."
+#endif
+}
+
/* Pack integers up to 57 bits using their least significant digits.
* The length is specified using mask:
* Assumes mask == (1 << length) - 1 where length <= 57.
*/
-inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) {
- return (*reinterpret_cast<const uint64_t*>(base) >> bit) & mask;
+inline uint64_t ReadInt57(const void *base, uint8_t bit, uint8_t length, uint64_t mask) {
+ return (*reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, length)) & mask;
}
-/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57.
+/* Assumes value < (1 << length) and length <= 57.
* Assumes the memory is zero initially.
*/
-inline void WriteInt57(void *base, uint8_t bit, uint64_t value) {
- *reinterpret_cast<uint64_t*>(base) |= (value << bit);
+inline void WriteInt57(void *base, uint8_t bit, uint8_t length, uint64_t value) {
+ *reinterpret_cast<uint64_t*>(base) |= (value << BitPackShift(bit, length));
}
namespace detail { typedef union { float f; uint32_t i; } FloatEnc; }
inline float ReadFloat32(const void *base, uint8_t bit) {
detail::FloatEnc encoded;
- encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit;
+ encoded.i = *reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, 32);
return encoded.f;
}
inline void WriteFloat32(void *base, uint8_t bit, float value) {
detail::FloatEnc encoded;
encoded.f = value;
- WriteInt57(base, bit, encoded.i);
+ WriteInt57(base, bit, 32, encoded.i);
}
inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) {
detail::FloatEnc encoded;
- encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit;
+ encoded.i = *reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, 31);
// Sign bit set means negative.
encoded.i |= 0x80000000;
return encoded.f;
@@ -65,7 +77,7 @@ inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) {
detail::FloatEnc encoded;
encoded.f = value;
encoded.i &= ~0x80000000;
- WriteInt57(base, bit, encoded.i);
+ WriteInt57(base, bit, 31, encoded.i);
}
void BitPackingSanity();
diff --git a/klm/util/bit_packing_test.cc b/klm/util/bit_packing_test.cc
new file mode 100644
index 00000000..c578ddd1
--- /dev/null
+++ b/klm/util/bit_packing_test.cc
@@ -0,0 +1,46 @@
+#include "util/bit_packing.hh"
+
+#define BOOST_TEST_MODULE BitPackingTest
+#include <boost/test/unit_test.hpp>
+
+#include <string.h>
+
+namespace util {
+namespace {
+
+const uint64_t test57 = 0x123456789abcdefULL;
+
+BOOST_AUTO_TEST_CASE(ZeroBit) {
+ char mem[16];
+ memset(mem, 0, sizeof(mem));
+ WriteInt57(mem, 0, 57, test57);
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
+}
+
+BOOST_AUTO_TEST_CASE(EachBit) {
+ char mem[16];
+ for (uint8_t b = 0; b < 8; ++b) {
+ memset(mem, 0, sizeof(mem));
+ WriteInt57(mem, b, 57, test57);
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
+ }
+}
+
+BOOST_AUTO_TEST_CASE(Consecutive) {
+ char mem[57+8];
+ memset(mem, 0, sizeof(mem));
+ for (uint64_t b = 0; b < 57 * 8; b += 57) {
+ WriteInt57(mem + (b / 8), b % 8, 57, test57);
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1));
+ }
+ for (uint64_t b = 0; b < 57 * 8; b += 57) {
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem + b / 8, b % 8, 57, (1ULL << 57) - 1));
+ }
+}
+
+BOOST_AUTO_TEST_CASE(Sanity) {
+ BitPackingSanity();
+}
+
+} // namespace
+} // namespace util
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index dd337a76..de6dd43c 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -24,7 +24,12 @@ const char *HandleStrerror(const char *ret, const char *buf) {
ErrnoException::ErrnoException() throw() : errno_(errno) {
char buf[200];
buf[0] = 0;
+#ifdef sun
+ const char *add = strerror(errno);
+#else
const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
+#endif
+
if (add) {
*this << add << ' ';
}
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index e7bd8659..5a667ebb 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -2,12 +2,12 @@
#include "util/exception.hh"
+#include <iostream>
#include <string>
#include <limits>
#include <assert.h>
#include <ctype.h>
-#include <err.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
@@ -27,7 +27,7 @@ EndOfFileException::EndOfFileException() throw() {
EndOfFileException::~EndOfFileException() throw() {}
ParseNumberException::ParseNumberException(StringPiece value) throw() {
- *this << "Could not parse \"" << value << "\" into a float";
+ *this << "Could not parse \"" << value << "\" into a number";
}
GZException::GZException(void *file) {
@@ -68,12 +68,52 @@ FilePiece::~FilePiece() {
file_.release();
int ret;
if (Z_OK != (ret = gzclose(gz_file_))) {
- errx(1, "could not close file %s using zlib", file_name_.c_str());
+ std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl;
+ abort();
}
}
#endif
}
+StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
+ const char *start = position_;
+ do {
+ for (const char *i = start; i < position_end_; ++i) {
+ if (*i == delim) {
+ StringPiece ret(position_, i - position_);
+ position_ = i + 1;
+ return ret;
+ }
+ }
+ size_t skip = position_end_ - position_;
+ Shift();
+ start = position_ + skip;
+ } while (!at_end_);
+ StringPiece ret(position_, position_end_ - position_);
+ position_ = position_end_;
+ return ret;
+}
+
+float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
+ return ReadNumber<float>();
+}
+double FilePiece::ReadDouble() throw(GZException, EndOfFileException, ParseNumberException) {
+ return ReadNumber<double>();
+}
+long int FilePiece::ReadLong() throw(GZException, EndOfFileException, ParseNumberException) {
+ return ReadNumber<long int>();
+}
+unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, ParseNumberException) {
+ return ReadNumber<unsigned long int>();
+}
+
+void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
+ for (; ; ++position_) {
+ if (position_ == position_end_) Shift();
+ if (!isspace(*position_)) return;
+ }
+}
+
void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
@@ -108,14 +148,34 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t
}
}
-float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
+namespace {
+void ParseNumber(const char *begin, char *&end, float &out) {
+#ifdef sun
+ out = static_cast<float>(strtod(begin, &end));
+#else
+ out = strtof(begin, &end);
+#endif
+}
+void ParseNumber(const char *begin, char *&end, double &out) {
+ out = strtod(begin, &end);
+}
+void ParseNumber(const char *begin, char *&end, long int &out) {
+ out = strtol(begin, &end, 10);
+}
+void ParseNumber(const char *begin, char *&end, unsigned long int &out) {
+ out = strtoul(begin, &end, 10);
+}
+} // namespace
+
+template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileException, ParseNumberException) {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
// Hallucinate a null off the end of the file.
std::string buffer(position_, position_end_);
char *end;
- float ret = strtof(buffer.c_str(), &end);
+ T ret;
+ ParseNumber(buffer.c_str(), end, ret);
if (buffer.c_str() == end) throw ParseNumberException(buffer);
position_ += end - buffer.c_str();
return ret;
@@ -123,19 +183,13 @@ float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberE
Shift();
}
char *end;
- float ret = strtof(position_, &end);
+ T ret;
+ ParseNumber(position_, end, ret);
if (end == position_) throw ParseNumberException(ReadDelimited());
position_ = end;
return ret;
}
-void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
- for (; ; ++position_) {
- if (position_ == position_end_) Shift();
- if (!isspace(*position_)) return;
- }
-}
-
const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) {
for (const char *i = position_; i <= last_space_; ++i) {
if (isspace(*i)) return i;
@@ -150,25 +204,6 @@ const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileExcepti
return position_end_;
}
-StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
- const char *start = position_;
- do {
- for (const char *i = start; i < position_end_; ++i) {
- if (*i == delim) {
- StringPiece ret(position_, i - position_);
- position_ = i + 1;
- return ret;
- }
- }
- size_t skip = position_end_ - position_;
- Shift();
- start = position_ + skip;
- } while (!at_end_);
- StringPiece ret(position_, position_end_ - position_);
- position_ = position_end_;
- return ret;
-}
-
void FilePiece::Shift() throw(GZException, EndOfFileException) {
if (at_end_) {
progress_.Finished();
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index 11d4a751..b7697e71 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -68,6 +68,9 @@ class FilePiece {
StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException);
+ double ReadDouble() throw(GZException, EndOfFileException, ParseNumberException);
+ long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException);
+ unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException);
void SkipSpaces() throw (GZException, EndOfFileException);
@@ -80,6 +83,8 @@ class FilePiece {
private:
void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException);
+ template <class T> T ReadNumber() throw(GZException, EndOfFileException, ParseNumberException);
+
StringPiece Consume(const char *to) {
StringPiece ret(position_, to - position_);
position_ = to;
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index 23e79fe0..dc9ec7e7 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -8,6 +8,8 @@
#include <iostream>
#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
namespace util {
namespace {
@@ -27,14 +29,18 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {
BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
+#ifndef __APPLE__
+/* The popen/fileno/dup combination used below does not work on Apple
+ * platforms, and I don't want to reimplement popen. This is an issue with
+ * the test itself.
+ */
/* read() implementation */
BOOST_AUTO_TEST_CASE(StreamReadLine) {
std::fstream ref("file_piece.cc", std::ios::in);
- scoped_FILE catter(popen("cat file_piece.cc", "r"));
- BOOST_REQUIRE(catter.get());
+ FILE *catter = popen("cat file_piece.cc", "r");
+ BOOST_REQUIRE(catter);
- FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
+ FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
@@ -44,7 +50,9 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {
}
}
BOOST_CHECK_THROW(test.get(), EndOfFileException);
+ BOOST_REQUIRE(!pclose(catter));
}
+#endif // __APPLE__
#ifdef HAVE_ZLIB
@@ -64,14 +72,17 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
}
BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
-// gzip stream
+
+// gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with
+// the test.
+#ifndef __APPLE__
BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
std::fstream ref("file_piece.cc", std::ios::in);
- scoped_FILE catter(popen("gzip <file_piece.cc", "r"));
- BOOST_REQUIRE(catter.get());
+ FILE * catter = popen("gzip <file_piece.cc", "r");
+ BOOST_REQUIRE(catter);
- FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
+ FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
@@ -81,9 +92,11 @@ BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
}
}
BOOST_CHECK_THROW(test.get(), EndOfFileException);
+ BOOST_REQUIRE(!pclose(catter));
}
+#endif // __APPLE__
-#endif
+#endif // HAVE_ZLIB
} // namespace
} // namespace util
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 8685170f..5a810c64 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -2,8 +2,9 @@
#include "util/mmap.hh"
#include "util/scoped.hh"
+#include <iostream>
+
#include <assert.h>
-#include <err.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
@@ -14,8 +15,10 @@ namespace util {
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
- if (munmap(data_, size_))
- err(1, "munmap failed ");
+ if (munmap(data_, size_)) {
+ std::cerr << "munmap failed for " << size_ << " bytes." << std::endl;
+ abort();
+ }
}
}
@@ -73,18 +76,27 @@ void ReadAll(int fd, void *to_void, std::size_t amount) {
to += ret;
}
}
+
+const int kFileFlags =
+#ifdef MAP_FILE
+ MAP_FILE | MAP_SHARED
+#else
+ MAP_SHARED
+#endif
+ ;
+
} // namespace
void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) {
switch (method) {
case LAZY:
- out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
+ out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
break;
case POPULATE_OR_LAZY:
#ifdef MAP_POPULATE
case POPULATE_OR_READ:
#endif
- out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
+ out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
break;
#ifndef MAP_POPULATE
case POPULATE_OR_READ:
@@ -115,7 +127,7 @@ void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
if (-1 == ftruncate(file.get(), size))
UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
try {
- return MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0);
+ return MapOrThrow(size, true, kFileFlags, false, file.get(), 0);
} catch (ErrnoException &e) {
e << " in file " << name;
throw;
diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh
index 638aaeb2..78fe583f 100644
--- a/klm/util/murmur_hash.hh
+++ b/klm/util/murmur_hash.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_MURMUR_HASH__
#define UTIL_MURMUR_HASH__
#include <cstddef>
-#include <stdint.h>
+#include <inttypes.h>
namespace util {
diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc
index 2c6d5394..a4cc5016 100644
--- a/klm/util/scoped.cc
+++ b/klm/util/scoped.cc
@@ -1,16 +1,24 @@
#include "util/scoped.hh"
-#include <err.h>
+#include <iostream>
+
+#include <stdlib.h>
#include <unistd.h>
namespace util {
scoped_fd::~scoped_fd() {
- if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_);
+ if (fd_ != -1 && close(fd_)) {
+ std::cerr << "Could not close file " << fd_ << std::endl;
+ abort();
+ }
}
scoped_FILE::~scoped_FILE() {
- if (file_ && fclose(file_)) err(1, "Could not close file");
+ if (file_ && fclose(file_)) {
+ std::cerr << "Could not close file " << std::endl;
+ abort();
+ }
}
} // namespace util
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 4557173b..3ac2f8a7 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -51,7 +51,7 @@
//Uncomment this line if you use ICU in your code.
//#define HAVE_ICU
//Uncomment this line if you want boost hashing for your StringPieces.
-#define HAVE_BOOST
+//#define HAVE_BOOST
#include <cstring>
#include <iosfwd>