From 89238977fc9d8f8d9a6421b0d4f35afc200f08e7 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 28 Feb 2012 17:23:55 -0500 Subject: Subject: where's my kenlm update?? From: Chris Dyer --- klm/util/bit_packing.hh | 66 ++++++++--- klm/util/exception.cc | 2 +- klm/util/file.cc | 222 +++++++++++++++++++++++++++++++++--- klm/util/file.hh | 42 ++++++- klm/util/file_piece.cc | 62 +++++----- klm/util/file_piece.hh | 31 +++-- klm/util/file_piece_test.cc | 41 +++++-- klm/util/getopt.c | 78 +++++++++++++ klm/util/getopt.hh | 33 ++++++ klm/util/key_value_packing.hh | 126 -------------------- klm/util/key_value_packing_test.cc | 75 ------------ klm/util/mmap.cc | 123 ++++++++++++++++---- klm/util/mmap.hh | 14 ++- klm/util/murmur_hash.cc | 39 ++++++- klm/util/murmur_hash.hh | 2 +- klm/util/probing_hash_table.hh | 33 +++--- klm/util/probing_hash_table_test.cc | 27 ++++- klm/util/sized_iterator.hh | 2 +- klm/util/sorted_uniform.hh | 95 +-------------- klm/util/sorted_uniform_test.cc | 85 ++++++++------ klm/util/tokenize_piece.hh | 64 +++-------- klm/util/tokenize_piece_test.cc | 50 +------- 22 files changed, 739 insertions(+), 573 deletions(-) create mode 100644 klm/util/getopt.c create mode 100644 klm/util/getopt.hh delete mode 100644 klm/util/key_value_packing.hh delete mode 100644 klm/util/key_value_packing_test.cc (limited to 'klm/util') diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 33266b94..73a5cb22 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -1,33 +1,37 @@ #ifndef UTIL_BIT_PACKING__ #define UTIL_BIT_PACKING__ -/* Bit-level packing routines */ +/* Bit-level packing routines + * + * WARNING WARNING WARNING: + * The write functions assume that memory is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast. This is + * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on + * x86 but my target audience is large language models for which 64-bit is + * necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. + * + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ #include #ifdef __APPLE__ #include #elif __linux__ #include -#else +#elif !defined(_WIN32) && !defined(_WIN64) #include #endif -#include - -namespace util { +#include -/* WARNING WARNING WARNING: - * The write functions assume that memory is zero initially. This makes them - * faster and is the appropriate case for mmapped language model construction. - * These routines assume that unaligned access to uint64_t is fast and that - * storage is little endian. This is the case on x86_64. I'm not sure how - * fast unaligned 64-bit access is on x86 but my target audience is large - * language models for which 64-bit is necessary. - * - * Call the BitPackingSanity function to sanity check. Calling once suffices, - * but it may be called multiple times when that's inconvenient. - */ +#include +namespace util { // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. #if BYTE_ORDER == LITTLE_ENDIAN @@ -43,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { #endif inline uint64_t ReadOff(const void *base, uint64_t bit_off) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + return value64; +#else return *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)); +#endif } /* Pack integers up to 57 bits using their least significant digits. @@ -57,18 +68,41 @@ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, ui * Assumes the memory is zero initially. */ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + value64 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value64, sizeof(value64)); +#else *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); +#endif } /* Same caveats as above, but for a 25 bit limit. */ inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + return (value32 >> BitPackShift(bit_off & 7, length)) & mask; +#else return (*reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; +#endif } inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + value32 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value32, sizeof(value32)); +#else *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); +#endif } typedef union { float f; uint32_t i; } FloatEnc; diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 96951495..c4f8c04c 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -66,7 +66,7 @@ const char *HandleStrerror(const char *ret, const char * /*buf*/) { ErrnoException::ErrnoException() throw() : errno_(errno) { char buf[200]; buf[0] = 0; -#ifdef sun +#if defined(sun) || defined(_WIN32) || defined(_WIN64) const char *add = strerror(errno); #else const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); diff --git a/klm/util/file.cc b/klm/util/file.cc index d707568e..aee7c77a 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -9,8 +9,12 @@ #include #include #include -#include -#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#endif namespace util { @@ -30,33 +34,61 @@ scoped_FILE::~scoped_FILE() { int OpenReadOrThrow(const char *name) { int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); +#else UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); +#endif return ret; } -int CreateOrThrow(const char *name) { - int ret; - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name); - return ret; -} - -off_t SizeFile(int fd) { +uint64_t SizeFile(int fd) { +#if defined(_WIN32) || defined(_WIN64) + __int64 ret = _filelengthi64(fd); + return (ret == -1) ? kBadSize : ret; +#else struct stat sb; if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; return sb.st_size; +#endif +} + +void ResizeOrThrow(int fd, uint64_t to) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(_chsize_s(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#else + UTIL_THROW_IF(ftruncate(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void ReadOrThrow(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast(to_void); while (amount) { ssize_t ret = read(fd, to, amount); - if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); - if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); amount -= ret; to += ret; } } +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast(to_void); + std::size_t remaining = amount; + while (remaining) { + ssize_t ret = read(fd, to, remaining); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed."); + if (!ret) return amount - remaining; + remaining -= ret; + to += ret; + } + return amount; +} + void WriteOrThrow(int fd, const void *data_void, std::size_t size) { const uint8_t *data = static_cast(data_void); while (size) { @@ -67,8 +99,172 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) { } } -void RemoveOrThrow(const char *name) { - UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name); +void FSyncOrThrow(int fd) { +// Apparently windows doesn't have fsync? +#if !defined(_WIN32) && !defined(_WIN64) + UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed."); +#endif +} + +namespace { +void InternalSeek(int fd, off_t off, int whence) { + UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); +} +} // namespace + +void SeekOrThrow(int fd, uint64_t off) { + InternalSeek(fd, off, SEEK_SET); +} + +void AdvanceOrThrow(int fd, int64_t off) { + InternalSeek(fd, off, SEEK_CUR); +} + +void SeekEnd(int fd) { + InternalSeek(fd, 0, SEEK_END); +} + +std::FILE *FDOpenOrThrow(scoped_fd &file) { + std::FILE *ret = fdopen(file.get(), "r+b"); + if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen"); + file.release(); + return ret; +} + +TempMaker::TempMaker(const std::string &prefix) : base_(prefix) { + base_ += "XXXXXX"; +} + +// Sigh. Windows temporary file creation is full of race conditions. +#if defined(_WIN32) || defined(_WIN64) +/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright + (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. */ + +/* This has been modified from the original version to rename the function and + * set the Windows temporary flag. */ + +static const char letters[] = +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +/* Generate a temporary file name based on TMPL. TMPL must match the + rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed + does not exist at the time of the call to mkstemp. TMPL is + overwritten with the result. */ +int +mkstemp_and_unlink(char *tmpl) +{ + int len; + char *XXXXXX; + static unsigned long long value; + unsigned long long random_time_bits; + unsigned int count; + int fd = -1; + int save_errno = errno; + + /* A lower bound on the number of temporary files to attempt to + generate. The maximum total number of temporary file names that + can exist for a given template is 62**6. It should never be + necessary to try all these combinations. Instead if a reasonable + number of names is tried (we define reasonable as 62**3) fail to + give the system administrator the chance to remove the problems. */ +#define ATTEMPTS_MIN (62 * 62 * 62) + + /* The number of times to attempt to generate a temporary file. To + conform to POSIX, this must be no smaller than TMP_MAX. */ +#if ATTEMPTS_MIN < TMP_MAX + unsigned int attempts = TMP_MAX; +#else + unsigned int attempts = ATTEMPTS_MIN; +#endif + + len = strlen (tmpl); + if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) + { + errno = EINVAL; + return -1; + } + +/* This is where the Xs start. */ + XXXXXX = &tmpl[len - 6]; + + /* Get some more or less random data. */ + { + SYSTEMTIME stNow; + FILETIME ftNow; + + // get system time + GetSystemTime(&stNow); + stNow.wMilliseconds = 500; + if (!SystemTimeToFileTime(&stNow, &ftNow)) + { + errno = -1; + return -1; + } + + random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) + | (unsigned long long)ftNow.dwLowDateTime); + } + value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); + + for (count = 0; count < attempts; value += 7777, ++count) + { + unsigned long long v = value; + + /* Fill in the random bits. */ + XXXXXX[0] = letters[v % 62]; + v /= 62; + XXXXXX[1] = letters[v % 62]; + v /= 62; + XXXXXX[2] = letters[v % 62]; + v /= 62; + XXXXXX[3] = letters[v % 62]; + v /= 62; + XXXXXX[4] = letters[v % 62]; + v /= 62; + XXXXXX[5] = letters[v % 62]; + + /* Modified for windows and to unlink */ + // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); + fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE); + if (fd >= 0) + { + errno = save_errno; + return fd; + } + else if (errno != EEXIST) + return -1; + } + + /* We got out of the loop because we ran out of combinations to try. */ + errno = EEXIST; + return -1; +} +#else +int +mkstemp_and_unlink(char *tmpl) { + int ret = mkstemp(tmpl); + if (ret == -1) return -1; + UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); + return ret; +} +#endif + +int TempMaker::Make() const { + std::string copy(base_); + copy.push_back(0); + int ret; + UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(©[0])), util::ErrnoException, "Failed to make a temporary based on " << base_); + return ret; +} + +std::FILE *TempMaker::MakeFile() const { + util::scoped_fd file(Make()); + return FDOpenOrThrow(file); } } // namespace util diff --git a/klm/util/file.hh b/klm/util/file.hh index d6cca41d..5c57e2a9 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -1,8 +1,11 @@ #ifndef UTIL_FILE__ #define UTIL_FILE__ +#include #include -#include +#include + +#include namespace util { @@ -52,22 +55,49 @@ class scoped_FILE { file_ = to; } + std::FILE *release() { + std::FILE *ret = file_; + file_ = NULL; + return ret; + } + private: std::FILE *file_; }; int OpenReadOrThrow(const char *name); -int CreateOrThrow(const char *name); - // Return value for SizeFile when it can't size properly. -const off_t kBadSize = -1; -off_t SizeFile(int fd); +const uint64_t kBadSize = (uint64_t)-1; +uint64_t SizeFile(int fd); + +void ResizeOrThrow(int fd, uint64_t to); void ReadOrThrow(int fd, void *to, std::size_t size); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount); + void WriteOrThrow(int fd, const void *data_void, std::size_t size); -void RemoveOrThrow(const char *name); +void FSyncOrThrow(int fd); + +// Seeking +void SeekOrThrow(int fd, uint64_t off); +void AdvanceOrThrow(int fd, int64_t off); +void SeekEnd(int fd); + +std::FILE *FDOpenOrThrow(scoped_fd &file); + +class TempMaker { + public: + explicit TempMaker(const std::string &prefix); + + int Make() const; + + std::FILE *MakeFile() const; + + private: + std::string base_; +}; } // namespace util diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index b57582a0..081e662b 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,6 +2,10 @@ #include "util/exception.hh" #include "util/file.hh" +#include "util/mmap.hh" +#ifdef WIN32 +#include +#endif // WIN32 #include #include @@ -11,14 +15,8 @@ #include #include #include -#include #include #include -#include - -#ifdef HAVE_ZLIB -#include -#endif namespace util { @@ -26,24 +24,24 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } -GZException::GZException(void *file) { #ifdef HAVE_ZLIB +GZException::GZException(gzFile file) { int num; - *this << gzerror(file, &num) << " from zlib"; -#endif // HAVE_ZLIB + *this << gzerror( file, &num) << " from zlib"; } +#endif // HAVE_ZLIB // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } @@ -63,7 +61,7 @@ FilePiece::~FilePiece() { } StringPiece FilePiece::ReadLine(char delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (*i == delim) { @@ -94,13 +92,13 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) { +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { #ifdef HAVE_ZLIB gz_file_ = NULL; #endif file_name_ = name; - default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); + default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; @@ -130,7 +128,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t namespace { void ParseNumber(const char *begin, char *&end, float &out) { -#ifdef sun +#if defined(sun) || defined(WIN32) out = static_cast(strtod(begin, &end)); #else out = strtof(begin, &end); @@ -171,7 +169,7 @@ template T FilePiece::ReadNumber() { } const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (delim[static_cast(*i)]) return i; @@ -190,7 +188,7 @@ void FilePiece::Shift() { progress_.Finished(); throw EndOfFileException(); } - off_t desired_begin = position_ - data_.begin() + mapped_offset_; + uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. @@ -201,18 +199,18 @@ void FilePiece::Shift() { } } -void FilePiece::MMapShift(off_t desired_begin) { +void FilePiece::MMapShift(uint64_t desired_begin) { // Use mmap. - off_t ignore = desired_begin % page_; + uint64_t ignore = desired_begin % page_; // Duplicate request for Shift means give more data. if (position_ == data_.begin() + ignore) { default_map_size_ *= 2; } // Local version so that in case of failure it doesn't overwrite the class variable. - off_t mapped_offset = desired_begin - ignore; + uint64_t mapped_offset = desired_begin - ignore; - off_t mapped_size; - if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { + uint64_t mapped_size; + if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { at_end_ = true; mapped_size = total_size_ - mapped_offset; } else { @@ -221,15 +219,11 @@ void FilePiece::MMapShift(off_t desired_begin) { // Forcibly clear the existing mmap first. data_.reset(); - data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED - // Populate where available on linux -#ifdef MAP_POPULATE - | MAP_POPULATE -#endif - , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED); - if (data_.get() == MAP_FAILED) { + try { + MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); + } catch (const util::ErrnoException &e) { if (desired_begin) { - if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either."); + SeekOrThrow(*file_, desired_begin); } // The mmap was scheduled to end the file, but now we're going to read it. at_end_ = false; @@ -259,6 +253,10 @@ void FilePiece::TransitionToRead() { #endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index a627f38c..af93d8aa 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -8,9 +8,14 @@ #include "util/mmap.hh" #include "util/string_piece.hh" +#include #include -#include +#include + +#ifdef HAVE_ZLIB +#include +#endif namespace util { @@ -22,7 +27,9 @@ class ParseNumberException : public Exception { class GZException : public Exception { public: - explicit GZException(void *file); +#ifdef HAVE_ZLIB + explicit GZException(gzFile file); +#endif GZException() throw() {} ~GZException() throw() {} }; @@ -33,9 +40,9 @@ extern const bool kSpaces[256]; class FilePiece { public: // 32 MB default. - explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); // Takes ownership of fd. name is used for messages. - explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); ~FilePiece(); @@ -70,14 +77,14 @@ class FilePiece { } } - off_t Offset() const { + uint64_t Offset() const { return position_ - data_.begin() + mapped_offset_; } const std::string &FileName() const { return file_name_; } private: - void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); + void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template T ReadNumber(); @@ -91,7 +98,7 @@ class FilePiece { void Shift(); // Backends to Shift(). - void MMapShift(off_t desired_begin); + void MMapShift(uint64_t desired_begin); void TransitionToRead(); void ReadShift(); @@ -99,11 +106,11 @@ class FilePiece { const char *position_, *last_space_, *position_end_; scoped_fd file_; - const off_t total_size_; - const off_t page_; + const uint64_t total_size_; + const uint64_t page_; - size_t default_map_size_; - off_t mapped_offset_; + std::size_t default_map_size_; + uint64_t mapped_offset_; // Order matters: file_ should always be destroyed after this. scoped_memory data_; @@ -116,7 +123,7 @@ class FilePiece { std::string file_name_; #ifdef HAVE_ZLIB - void *gz_file_; + gzFile gz_file_; #endif // HAVE_ZLIB }; diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index dc9ec7e7..f912e18a 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,3 +1,4 @@ +// Tests might fail if you have creative characters in your path. Sue me. #include "util/file_piece.hh" #include "util/scoped.hh" @@ -14,10 +15,18 @@ namespace util { namespace { +std::string FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "file_piece.cc"; + } + std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); + return ret; +} + /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); - FilePiece test("file_piece.cc", NULL, 1); + std::fstream ref(FileLocation().c_str(), std::ios::in); + FilePiece test(FileLocation().c_str(), NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -35,9 +44,13 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) { */ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string popen_args = "cat \""; + popen_args += FileLocation(); + popen_args += '"'; - FILE *catter = popen("cat file_piece.cc", "r"); + FILE *catter = popen(popen_args.c_str(), "r"); BOOST_REQUIRE(catter); FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); @@ -58,10 +71,15 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) { // gzip file BOOST_AUTO_TEST_CASE(PlainZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::string location(FileLocation()); + std::fstream ref(location.c_str(), std::ios::in); - BOOST_REQUIRE_EQUAL(0, system("gzip file_piece.cc.gz")); - FilePiece test("file_piece.cc.gz", NULL, 1); + std::string command("gzip <\""); + command += location + "\" >\"" + location + "\".gz"; + + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + FilePiece test((location + ".gz").c_str(), NULL, 1); + unlink((location + ".gz").c_str()); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -77,12 +95,15 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) { // the test. #ifndef __APPLE__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string command("gzip <\""); + command += FileLocation() + "\""; - FILE * catter = popen("gzip +#include + +#define NULL 0 +#define EOF (-1) +#define ERR(s, c) if(opterr){\ + char errbuf[2];\ + errbuf[0] = c; errbuf[1] = '\n';\ + fputs(argv[0], stderr);\ + fputs(s, stderr);\ + fputc(c, stderr);} + //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\ + //(void) write(2, s, (unsigned)strlen(s));\ + //(void) write(2, errbuf, 2);} + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int +getopt(argc, argv, opts) +int argc; +char **argv, *opts; +{ + static int sp = 1; + register int c; + register char *cp; + + if(sp == 1) + if(optind >= argc || + argv[optind][0] != '-' || argv[optind][1] == '\0') + return(EOF); + else if(strcmp(argv[optind], "--") == NULL) { + optind++; + return(EOF); + } + optopt = c = argv[optind][sp]; + if(c == ':' || (cp=strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if(argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return('?'); + } + if(*++cp == ':') { + if(argv[optind][sp+1] != '\0') + optarg = &argv[optind++][sp+1]; + else if(++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return('?'); + } else + optarg = argv[optind++]; + sp = 1; + } else { + if(argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return(c); +} + +#endif /* __GNUC__ */ diff --git a/klm/util/getopt.hh b/klm/util/getopt.hh new file mode 100644 index 00000000..6ad97732 --- /dev/null +++ b/klm/util/getopt.hh @@ -0,0 +1,33 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifdef __GNUC__ +#include +#endif +#ifndef __GNUC__ + +#ifndef _WINGETOPT_H_ +#define _WINGETOPT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int opterr; +extern int optind; +extern int optopt; +extern char *optarg; +extern int getopt(int argc, char **argv, char *opts); + +#ifdef __cplusplus +} +#endif + +#endif /* _GETOPT_H_ */ +#endif /* __GNUC__ */ + diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh deleted file mode 100644 index b84a5aad..00000000 --- a/klm/util/key_value_packing.hh +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef UTIL_KEY_VALUE_PACKING__ -#define UTIL_KEY_VALUE_PACKING__ - -/* Why such a general interface? I'm planning on doing bit-level packing. */ - -#include -#include -#include - -#include - -namespace util { - -template struct Entry { - Key key; - Value value; - - const Key &GetKey() const { return key; } - const Value &GetValue() const { return value; } - - Value &MutableValue() { return value; } - - void Set(const Key &key_in, const Value &value_in) { - SetKey(key_in); - SetValue(value_in); - } - void SetKey(const Key &key_in) { key = key_in; } - void SetValue(const Value &value_in) { value = value_in; } - - bool operator<(const Entry &other) const { return GetKey() < other.GetKey(); } -}; - -// And now for a brief interlude to specialize std::swap. -} // namespace util -namespace std { -template void swap(util::Entry &first, util::Entry &second) { - swap(first.key, second.key); - swap(first.value, second.value); -} -}// namespace std -namespace util { - -template class AlignedPacking { - public: - typedef KeyT Key; - typedef ValueT Value; - - public: - static const std::size_t kBytes = sizeof(Entry); - static const std::size_t kBits = kBytes * 8; - - typedef Entry * MutableIterator; - typedef const Entry * ConstIterator; - typedef const Entry & ConstReference; - - static MutableIterator FromVoid(void *start) { - return reinterpret_cast(start); - } - - static Entry Make(const Key &key, const Value &value) { - Entry ret; - ret.Set(key, value); - return ret; - } -}; - -template class ByteAlignedPacking { - public: - typedef KeyT Key; - typedef ValueT Value; - - private: -#pragma pack(push) -#pragma pack(1) - struct RawEntry { - Key key; - Value value; - - const Key &GetKey() const { return key; } - const Value &GetValue() const { return value; } - - Value &MutableValue() { return value; } - - void Set(const Key &key_in, const Value &value_in) { - SetKey(key_in); - SetValue(value_in); - } - void SetKey(const Key &key_in) { key = key_in; } - void SetValue(const Value &value_in) { value = value_in; } - - bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); } - }; -#pragma pack(pop) - - friend void std::swap<>(RawEntry&, RawEntry&); - - public: - typedef RawEntry *MutableIterator; - typedef const RawEntry *ConstIterator; - typedef RawEntry &ConstReference; - - static const std::size_t kBytes = sizeof(RawEntry); - static const std::size_t kBits = kBytes * 8; - - static MutableIterator FromVoid(void *start) { - return MutableIterator(reinterpret_cast(start)); - } - - static RawEntry Make(const Key &key, const Value &value) { - RawEntry ret; - ret.Set(key, value); - return ret; - } -}; - -} // namespace util -namespace std { -template void swap( - typename util::ByteAlignedPacking::RawEntry &first, - typename util::ByteAlignedPacking::RawEntry &second) { - swap(first.key, second.key); - swap(first.value, second.value); -} -}// namespace std - -#endif // UTIL_KEY_VALUE_PACKING__ diff --git a/klm/util/key_value_packing_test.cc b/klm/util/key_value_packing_test.cc deleted file mode 100644 index a0d33fd7..00000000 --- a/klm/util/key_value_packing_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include "util/key_value_packing.hh" - -#include -#include -#include -#include -#define BOOST_TEST_MODULE KeyValueStoreTest -#include - -#include -#include - -namespace util { -namespace { - -BOOST_AUTO_TEST_CASE(basic_in_out) { - typedef ByteAlignedPacking Packing; - void *backing = malloc(Packing::kBytes * 2); - Packing::MutableIterator i(Packing::FromVoid(backing)); - i->SetKey(10); - BOOST_CHECK_EQUAL(10, i->GetKey()); - i->SetValue(3); - BOOST_CHECK_EQUAL(3, i->GetValue()); - ++i; - i->SetKey(5); - BOOST_CHECK_EQUAL(5, i->GetKey()); - i->SetValue(42); - BOOST_CHECK_EQUAL(42, i->GetValue()); - - Packing::ConstIterator c(i); - BOOST_CHECK_EQUAL(5, c->GetKey()); - --c; - BOOST_CHECK_EQUAL(10, c->GetKey()); - BOOST_CHECK_EQUAL(42, i->GetValue()); - - BOOST_CHECK_EQUAL(5, i->GetKey()); - free(backing); -} - -BOOST_AUTO_TEST_CASE(simple_sort) { - typedef ByteAlignedPacking Packing; - char foo[Packing::kBytes * 4]; - Packing::MutableIterator begin(Packing::FromVoid(foo)); - Packing::MutableIterator i = begin; - i->SetKey(0); ++i; - i->SetKey(2); ++i; - i->SetKey(3); ++i; - i->SetKey(1); ++i; - std::sort(begin, i); - BOOST_CHECK_EQUAL(0, begin[0].GetKey()); - BOOST_CHECK_EQUAL(1, begin[1].GetKey()); - BOOST_CHECK_EQUAL(2, begin[2].GetKey()); - BOOST_CHECK_EQUAL(3, begin[3].GetKey()); -} - -BOOST_AUTO_TEST_CASE(big_sort) { - typedef ByteAlignedPacking Packing; - boost::scoped_array memory(new char[Packing::kBytes * 1000]); - Packing::MutableIterator begin(Packing::FromVoid(memory.get())); - - boost::mt19937 rng; - boost::uniform_int range(0, std::numeric_limits::max()); - boost::variate_generator > gen(rng, range); - - for (size_t i = 0; i < 1000; ++i) { - (begin + i)->SetKey(gen()); - } - std::sort(begin, begin + 1000); - for (size_t i = 0; i < 999; ++i) { - BOOST_CHECK(begin[i] < begin[i+1]); - } -} - -} // namespace -} // namespace util diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 279bafa8..a329ce4e 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -1,23 +1,63 @@ +/* Memory mapping wrappers. + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ +#include "util/mmap.hh" + #include "util/exception.hh" #include "util/file.hh" -#include "util/mmap.hh" #include #include #include #include -#include +#include #include -#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#else +#include +#endif namespace util { +long SizePage() { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +void SyncOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap"); +#else + UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap"); +#endif +} + +void UnmapOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); +#else + UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed"); +#endif +} + scoped_mmap::~scoped_mmap() { if (data_ != (void*)-1) { - // Thanks Denis Filimonov for pointing out NFS likes msync first. - if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) { - std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl; + try { + // Thanks Denis Filimonov for pointing out NFS likes msync first. + SyncOrThrow(data_, size_); + UnmapOrThrow(data_, size_); + } catch (const util::ErrnoException &e) { + std::cerr << e.what(); abort(); } } @@ -52,29 +92,40 @@ void scoped_memory::call_realloc(std::size_t size) { } } -void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset) { +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) { #ifdef MAP_POPULATE // Linux specific if (prefault) { flags |= MAP_POPULATE; } #endif +#if defined(_WIN32) || defined(_WIN64) + int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY; + int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ; + uint64_t total_size = size + offset; + HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast(total_size), NULL); + UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); + LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size); + CloseHandle(hMapping); + UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); +#else int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; void *ret = mmap(NULL, size, protect, flags, fd, offset); - if (ret == MAP_FAILED) { - UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset); - } + UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset); +#endif return ret; } const int kFileFlags = -#ifdef MAP_FILE +#if defined(_WIN32) || defined(_WIN64) + 0 // MapOrThrow ignores flags on windows +#elif defined(MAP_FILE) MAP_FILE | MAP_SHARED #else MAP_SHARED #endif ; -void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) { +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) { switch (method) { case LAZY: out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); @@ -91,30 +142,52 @@ void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_m case READ: out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED); if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc"); - if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed."); + SeekOrThrow(fd, offset); ReadOrThrow(fd, out.get(), size); break; } } -void *MapAnonymous(std::size_t size) { - return MapOrThrow(size, true, -#ifdef MAP_ANONYMOUS - MAP_ANONYMOUS // Linux +// Allocates zeroed memory in to. +void MapAnonymous(std::size_t size, util::scoped_memory &to) { + to.reset(); +#if defined(_WIN32) || defined(_WIN64) + to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED); +#else + to.reset(MapOrThrow(size, true, +# if defined(MAP_ANONYMOUS) + MAP_ANONYMOUS | MAP_PRIVATE // Linux +# else + MAP_ANON | MAP_PRIVATE // BSD +# endif + , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED); +#endif +} + +void *MapZeroedWrite(int fd, std::size_t size) { + ResizeOrThrow(fd, 0); + ResizeOrThrow(fd, size); + return MapOrThrow(size, true, kFileFlags, false, fd, 0); +} + +namespace { + +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); #else - MAP_ANON // BSD + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); #endif - | MAP_PRIVATE, false, -1, 0); + return ret; } +} // namespace + void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { - file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); - if (-1 == file.get()) - UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing"); - if (-1 == ftruncate(file.get(), size)) - UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed"); + file.reset(CreateOrThrow(name)); try { - return MapOrThrow(size, true, kFileFlags, false, file.get(), 0); + return MapZeroedWrite(file.get(), size); } catch (ErrnoException &e) { e << " in file " << name; throw; diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh index b0eb6672..b218c4d1 100644 --- a/klm/util/mmap.hh +++ b/klm/util/mmap.hh @@ -4,13 +4,15 @@ #include -#include +#include #include namespace util { class scoped_fd; +long SizePage(); + // (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. class scoped_mmap { public: @@ -94,15 +96,19 @@ typedef enum { extern const int kFileFlags; // Wrapper around mmap to check it worked and hide some platform macros. -void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0); +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); -void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out); +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); -void *MapAnonymous(std::size_t size); +void MapAnonymous(std::size_t size, scoped_memory &to); // Open file name with mmap of size bytes, all of which are initially zero. +void *MapZeroedWrite(int fd, std::size_t size); void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); +// msync wrapper +void SyncOrThrow(void *start, size_t length); + } // namespace util #endif // UTIL_MMAP__ diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index ef5783fe..6accc21a 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -7,9 +7,11 @@ * placed in namespace util * add MurmurHashNative * default option = 0 for seed + * ARM port from NICT */ #include "util/murmur_hash.hh" +#include namespace util { @@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) uint64_t h = seed ^ (len * m); +#if defined(__arm) || defined(__arm__) + const size_t ksize = sizeof(uint64_t); + const unsigned char * data = (const unsigned char *)key; + const unsigned char * end = data + (std::size_t)(len/8) * ksize; +#else const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); +#endif while(data != end) { +#if defined(__arm) || defined(__arm__) + uint64_t k; + memcpy(&k, data, ksize); + data += ksize; +#else uint64_t k = *data++; +#endif k *= m; k ^= k >> r; @@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) unsigned int h1 = seed ^ len; unsigned int h2 = 0; +#if defined(__arm) || defined(__arm__) + size_t ksize = sizeof(unsigned int); + const unsigned char * data = (const unsigned char *)key; +#else const unsigned int * data = (const unsigned int *)key; +#endif + unsigned int k1, k2; while(len >= 8) { - unsigned int k1 = *data++; +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; + memcpy(&k2, data, ksize); + data += ksize; +#else + k1 = *data++; + k2 = *data++; +#endif + k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; - unsigned int k2 = *data++; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; @@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) if(len >= 4) { - unsigned int k1 = *data++; +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; +#else + k1 = *data++; +#endif k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index 78fe583f..638aaeb2 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -1,7 +1,7 @@ #ifndef UTIL_MURMUR_HASH__ #define UTIL_MURMUR_HASH__ #include -#include +#include namespace util { diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 8122d69c..f466cebc 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -18,27 +18,33 @@ class ProbingSizeException : public Exception { ~ProbingSizeException() throw() {} }; +// std::identity is an SGI extension :-( +struct IdentityHash { + template T operator()(T arg) const { return arg; } +}; + /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number - * of elements, else an infinite loop happens. + * of elements, else it throws ProbingSizeException. * Memory management and initialization is externalized to make it easier to * serialize these to disk and load them quickly. * Uses linear probing to find value. * Only insert and lookup operations. */ -template > class ProbingHashTable { +template > class ProbingHashTable { public: - typedef PackingT Packing; - typedef typename Packing::Key Key; - typedef typename Packing::MutableIterator MutableIterator; - typedef typename Packing::ConstIterator ConstIterator; - + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; + public: static std::size_t Size(std::size_t entries, float multiplier) { - return std::max(entries + 1, static_cast(multiplier * static_cast(entries))) * Packing::kBytes; + std::size_t buckets = std::max(entries + 1, static_cast(multiplier * static_cast(entries))); + return buckets * sizeof(Entry); } // Must be assigned to later. @@ -49,9 +55,9 @@ template (start)), + buckets_(allocated / sizeof(Entry)), + end_(begin_ + buckets_), invalid_(invalid), hash_(hash_func), equal_(equal_func), @@ -62,11 +68,10 @@ template MutableIterator Insert(const T &t) { - if (++entries_ >= buckets_) - UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); #ifdef DEBUG assert(initialized_); #endif + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } if (++i == end_) { i = begin_; } @@ -84,7 +89,7 @@ template bool Find(const Key key, ConstIterator &out) const { diff --git a/klm/util/probing_hash_table_test.cc b/klm/util/probing_hash_table_test.cc index ff2f5af3..ef68e5f2 100644 --- a/klm/util/probing_hash_table_test.cc +++ b/klm/util/probing_hash_table_test.cc @@ -1,6 +1,6 @@ #include "util/probing_hash_table.hh" -#include "util/key_value_packing.hh" +#include #define BOOST_TEST_MODULE ProbingHashTableTest #include @@ -9,17 +9,34 @@ namespace util { namespace { -typedef AlignedPacking Packing; -typedef ProbingHashTable > Table; +struct Entry { + unsigned char key; + typedef unsigned char Key; + + unsigned char GetKey() const { + return key; + } + + uint64_t GetValue() const { + return value; + } + + uint64_t value; +}; + +typedef ProbingHashTable > Table; BOOST_AUTO_TEST_CASE(simple) { char mem[Table::Size(10, 1.2)]; memset(mem, 0, sizeof(mem)); Table table(mem, sizeof(mem)); - Packing::ConstIterator i = Packing::ConstIterator(); + const Entry *i = NULL; BOOST_CHECK(!table.Find(2, i)); - table.Insert(Packing::Make(3, 328920)); + Entry to_ins; + to_ins.key = 3; + to_ins.value = 328920; + table.Insert(to_ins); BOOST_REQUIRE(table.Find(3, i)); BOOST_CHECK_EQUAL(3, i->GetKey()); BOOST_CHECK_EQUAL(static_cast(328920), i->GetValue()); diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh index 47dfc245..aabcc531 100644 --- a/klm/util/sized_iterator.hh +++ b/klm/util/sized_iterator.hh @@ -6,7 +6,7 @@ #include #include -#include +#include #include namespace util { diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh index 0d6ecbbd..7700d9e6 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -5,7 +5,7 @@ #include #include -#include +#include namespace util { @@ -122,99 +122,6 @@ template Iterator BinaryBelow( return begin - 1; } -// To use this template, you need to define a Pivot function to match Key. -template class SortedUniformMap { - public: - typedef PackingT Packing; - typedef typename Packing::ConstIterator ConstIterator; - typedef typename Packing::MutableIterator MutableIterator; - - struct Accessor { - public: - typedef typename Packing::Key Key; - const Key &operator()(const ConstIterator &i) const { return i->GetKey(); } - Key &operator()(const MutableIterator &i) const { return i->GetKey(); } - }; - - // Offer consistent API with probing hash. - static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { - return sizeof(uint64_t) + entries * Packing::kBytes; - } - - SortedUniformMap() -#ifdef DEBUG - : initialized_(false), loaded_(false) -#endif - {} - - SortedUniformMap(void *start, std::size_t /*allocated*/) : - begin_(Packing::FromVoid(reinterpret_cast(start) + 1)), - end_(begin_), size_ptr_(reinterpret_cast(start)) -#ifdef DEBUG - , initialized_(true), loaded_(false) -#endif - {} - - void LoadedBinary() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - // Restore the size. - end_ = begin_ + *size_ptr_; - } - - // Caller responsible for not exceeding specified size. Do not call after FinishedInserting. - template void Insert(const T &t) { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); -#endif - *end_ = t; - ++end_; - } - - void FinishedInserting() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - std::sort(begin_, end_); - *size_ptr_ = (end_ - begin_); - } - - // Don't use this to change the key. - template bool UnsafeMutableFind(const Key key, MutableIterator &out) { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind(begin_, end_, key, out); - } - - // Do not call before FinishedInserting. - template bool Find(const Key key, ConstIterator &out) const { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind(Accessor(), ConstIterator(begin_), ConstIterator(end_), key, out); - } - - ConstIterator begin() const { return begin_; } - ConstIterator end() const { return end_; } - - private: - typename Packing::MutableIterator begin_, end_; - uint64_t *size_ptr_; -#ifdef DEBUG - bool initialized_; - bool loaded_; -#endif -}; - } // namespace util #endif // UTIL_SORTED_UNIFORM__ diff --git a/klm/util/sorted_uniform_test.cc b/klm/util/sorted_uniform_test.cc index 4aa4c8aa..d9f6fad1 100644 --- a/klm/util/sorted_uniform_test.cc +++ b/klm/util/sorted_uniform_test.cc @@ -1,12 +1,11 @@ #include "util/sorted_uniform.hh" -#include "util/key_value_packing.hh" - #include #include #include #include #include + #define BOOST_TEST_MODULE SortedUniformTest #include @@ -17,74 +16,86 @@ namespace util { namespace { -template void Check(const Map &map, const boost::unordered_map &reference, const Key key) { +template struct Entry { + typedef KeyT Key; + typedef ValueT Value; + + Key key; + Value value; + + Key GetKey() const { + return key; + } + + Value GetValue() const { + return value; + } + + bool operator<(const Entry &other) const { + return key < other.key; + } +}; + +template struct Accessor { + typedef KeyT Key; + template Key operator()(const Entry *entry) const { + return entry->GetKey(); + } +}; + +template void Check(const Entry *begin, const Entry *end, const boost::unordered_map &reference, const Key key) { typename boost::unordered_map::const_iterator ref = reference.find(key); - typename Map::ConstIterator i = typename Map::ConstIterator(); + typedef const Entry *It; + // g++ can't tell that require will crash and burn. + It i = NULL; + bool ret = SortedUniformFind, Pivot64>(Accessor(), begin, end, key, i); if (ref == reference.end()) { - BOOST_CHECK(!map.Find(key, i)); + BOOST_CHECK(!ret); } else { - // g++ can't tell that require will crash and burn. - BOOST_REQUIRE(map.Find(key, i)); + BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(ref->second, i->GetValue()); } } -typedef SortedUniformMap > TestMap; - BOOST_AUTO_TEST_CASE(empty) { - char buf[TestMap::Size(0)]; - TestMap map(buf, TestMap::Size(0)); - map.FinishedInserting(); - TestMap::ConstIterator i; - BOOST_CHECK(!map.Find(42, i)); -} - -BOOST_AUTO_TEST_CASE(one) { - char buf[TestMap::Size(1)]; - TestMap map(buf, sizeof(buf)); - Entry e; - e.Set(42,2); - map.Insert(e); - map.FinishedInserting(); - TestMap::ConstIterator i = TestMap::ConstIterator(); - BOOST_REQUIRE(map.Find(42, i)); - BOOST_CHECK(i == map.begin()); - BOOST_CHECK(!map.Find(43, i)); - BOOST_CHECK(!map.Find(41, i)); + typedef const Entry T; + const T *i; + bool ret = SortedUniformFind, Pivot64>(Accessor(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); + BOOST_CHECK(!ret); } template void RandomTest(Key upper, size_t entries, size_t queries) { typedef unsigned char Value; - typedef SortedUniformMap > Map; - boost::scoped_array buffer(new char[Map::Size(entries)]); - Map map(buffer.get(), entries); boost::mt19937 rng; boost::uniform_int range_key(0, upper); boost::uniform_int range_value(0, 255); boost::variate_generator > gen_key(rng, range_key); boost::variate_generator > gen_value(rng, range_value); + typedef Entry Ent; + std::vector backing; boost::unordered_map reference; - Entry ent; + Ent ent; for (size_t i = 0; i < entries; ++i) { Key key = gen_key(); unsigned char value = gen_value(); if (reference.insert(std::make_pair(key, value)).second) { - ent.Set(key, value); - map.Insert(Entry(ent)); + ent.key = key; + ent.value = value; + backing.push_back(ent); } } - map.FinishedInserting(); + std::sort(backing.begin(), backing.end()); // Random queries. for (size_t i = 0; i < queries; ++i) { const Key key = gen_key(); - Check(map, reference, key); + Check(&*backing.begin(), &*backing.end(), reference, key); } typename boost::unordered_map::const_iterator it = reference.begin(); for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { - Check(map, reference, it->second); + Check(&*backing.begin(), &*backing.end(), reference, it->second); } } diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index 413bda0b..c7e1c863 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -1,6 +1,7 @@ #ifndef UTIL_TOKENIZE_PIECE__ #define UTIL_TOKENIZE_PIECE__ +#include "util/exception.hh" #include "util/string_piece.hh" #include @@ -8,63 +9,25 @@ #include #include -/* Usage: - * - * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { - * std::cout << *i << "\n"; - * } - * - */ - namespace util { -// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece. -template class PieceIterator : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { +// Thrown on dereference when out of tokens to parse +class OutOfTokens : public Exception { public: - // Default construct is end, which is also accessed by kEndPieceIterator; - PieceIterator() {} - - explicit PieceIterator(const StringPiece &str) - : after_(str) { - increment(); - } + OutOfTokens() throw() {} + ~OutOfTokens() throw() {} +}; - bool operator!() const { - return after_.data() == 0; - } - operator bool() const { - return after_.data() != 0; - } +class SingleCharacter { + public: + explicit SingleCharacter(char delim) : delim_(delim) {} - static PieceIterator end() { - return PieceIterator(); + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); } private: - friend class boost::iterator_core_access; - - void increment() { - const char *start = after_.data(); - for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {} - if (start == after_.data() + after_.size()) { - // End condition. - after_.clear(); - return; - } - const char *finish = start; - for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {} - current_ = StringPiece(start, finish - start); - after_ = StringPiece(finish, after_.data() + after_.size() - finish); - } - - bool equal(const PieceIterator &other) const { - return after_.data() == other.after_.data(); - } - - const StringPiece &dereference() const { return current_; } - - StringPiece current_; - StringPiece after_; + char delim_; }; class MultiCharacter { @@ -95,7 +58,7 @@ template class TokenIter : public boost::it public: TokenIter() {} - TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + template TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { increment(); } @@ -130,6 +93,7 @@ template class TokenIter : public boost::it } const StringPiece &dereference() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); return current_; } diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc index e07ebcf5..d856018f 100644 --- a/klm/util/tokenize_piece_test.cc +++ b/klm/util/tokenize_piece_test.cc @@ -9,53 +9,7 @@ namespace util { namespace { -BOOST_AUTO_TEST_CASE(simple) { - PieceIterator<' '> it("single spaced words."); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("single"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("words."), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_delimiter) { - const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; - PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("first"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("second"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("third"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_entries) { - const char str[] = "\0split\0\0 \0me\0 "; - PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - const char first[] = "\0split\0\0"; - BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); - ++it; - BOOST_REQUIRE(it); - const char second[] = "\0me\0"; - BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); - ++it; - BOOST_CHECK(!it); -} - -/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { +BOOST_AUTO_TEST_CASE(pipe_pipe_none) { const char str[] = "nodelimit at all"; TokenIter it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); @@ -79,7 +33,7 @@ BOOST_AUTO_TEST_CASE(remove_empty) { const char str[] = "|||"; TokenIter it(str, MultiCharacter("|||")); BOOST_CHECK(!it); -}*/ +} BOOST_AUTO_TEST_CASE(remove_empty_keep) { const char str[] = " |||"; -- cgit v1.2.3