summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
authorKenneth Heafield <kenlm@kheafield.com>2012-02-28 17:23:55 -0500
committerKenneth Heafield <kenlm@kheafield.com>2012-02-28 17:23:55 -0500
commit89238977fc9d8f8d9a6421b0d4f35afc200f08e7 (patch)
treef60871db033d20faaf406af2736f17631f490b44 /klm/util
parent1f0ded1e7f59b13d7512111dd910d0f4b2f82d02 (diff)
Subject: where's my kenlm update?? From: Chris Dyer <cdyer@cs.cmu.edu>
Diffstat (limited to 'klm/util')
-rw-r--r--klm/util/bit_packing.hh66
-rw-r--r--klm/util/exception.cc2
-rw-r--r--klm/util/file.cc222
-rw-r--r--klm/util/file.hh42
-rw-r--r--klm/util/file_piece.cc62
-rw-r--r--klm/util/file_piece.hh31
-rw-r--r--klm/util/file_piece_test.cc41
-rw-r--r--klm/util/getopt.c78
-rw-r--r--klm/util/getopt.hh33
-rw-r--r--klm/util/key_value_packing.hh126
-rw-r--r--klm/util/key_value_packing_test.cc75
-rw-r--r--klm/util/mmap.cc123
-rw-r--r--klm/util/mmap.hh14
-rw-r--r--klm/util/murmur_hash.cc39
-rw-r--r--klm/util/murmur_hash.hh2
-rw-r--r--klm/util/probing_hash_table.hh33
-rw-r--r--klm/util/probing_hash_table_test.cc27
-rw-r--r--klm/util/sized_iterator.hh2
-rw-r--r--klm/util/sorted_uniform.hh95
-rw-r--r--klm/util/sorted_uniform_test.cc85
-rw-r--r--klm/util/tokenize_piece.hh64
-rw-r--r--klm/util/tokenize_piece_test.cc50
22 files changed, 739 insertions, 573 deletions
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 33266b94..73a5cb22 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -1,33 +1,37 @@
#ifndef UTIL_BIT_PACKING__
#define UTIL_BIT_PACKING__
-/* Bit-level packing routines */
+/* Bit-level packing routines
+ *
+ * WARNING WARNING WARNING:
+ * The write functions assume that memory is zero initially. This makes them
+ * faster and is the appropriate case for mmapped language model construction.
+ * These routines assume that unaligned access to uint64_t is fast. This is
+ * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on
+ * x86 but my target audience is large language models for which 64-bit is
+ * necessary.
+ *
+ * Call the BitPackingSanity function to sanity check. Calling once suffices,
+ * but it may be called multiple times when that's inconvenient.
+ *
+ * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
+ * NICT.
+ */
#include <assert.h>
#ifdef __APPLE__
#include <architecture/byte_order.h>
#elif __linux__
#include <endian.h>
-#else
+#elif !defined(_WIN32) && !defined(_WIN64)
#include <arpa/nameser_compat.h>
#endif
-#include <inttypes.h>
-
-namespace util {
+#include <stdint.h>
-/* WARNING WARNING WARNING:
- * The write functions assume that memory is zero initially. This makes them
- * faster and is the appropriate case for mmapped language model construction.
- * These routines assume that unaligned access to uint64_t is fast and that
- * storage is little endian. This is the case on x86_64. I'm not sure how
- * fast unaligned 64-bit access is on x86 but my target audience is large
- * language models for which 64-bit is necessary.
- *
- * Call the BitPackingSanity function to sanity check. Calling once suffices,
- * but it may be called multiple times when that's inconvenient.
- */
+#include <string.h>
+namespace util {
// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
#if BYTE_ORDER == LITTLE_ENDIAN
@@ -43,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
#endif
inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
+#if defined(__arm) || defined(__arm__)
+ const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
+ uint64_t value64;
+ memcpy(&value64, base_off, sizeof(value64));
+ return value64;
+#else
return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3));
+#endif
}
/* Pack integers up to 57 bits using their least significant digits.
@@ -57,18 +68,41 @@ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, ui
* Assumes the memory is zero initially.
*/
inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) {
+#if defined(__arm) || defined(__arm__)
+ uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
+ uint64_t value64;
+ memcpy(&value64, base_off, sizeof(value64));
+ value64 |= (value << BitPackShift(bit_off & 7, length));
+ memcpy(base_off, &value64, sizeof(value64));
+#else
*reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
+#endif
}
/* Same caveats as above, but for a 25 bit limit. */
inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) {
+#if defined(__arm) || defined(__arm__)
+ const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
+ uint32_t value32;
+ memcpy(&value32, base_off, sizeof(value32));
+ return (value32 >> BitPackShift(bit_off & 7, length)) & mask;
+#else
return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask;
+#endif
}
inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
+#if defined(__arm) || defined(__arm__)
+ uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
+ uint32_t value32;
+ memcpy(&value32, base_off, sizeof(value32));
+ value32 |= (value << BitPackShift(bit_off & 7, length));
+ memcpy(base_off, &value32, sizeof(value32));
+#else
*reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
(value << BitPackShift(bit_off & 7, length));
+#endif
}
typedef union { float f; uint32_t i; } FloatEnc;
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 96951495..c4f8c04c 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -66,7 +66,7 @@ const char *HandleStrerror(const char *ret, const char * /*buf*/) {
ErrnoException::ErrnoException() throw() : errno_(errno) {
char buf[200];
buf[0] = 0;
-#ifdef sun
+#if defined(sun) || defined(_WIN32) || defined(_WIN64)
const char *add = strerror(errno);
#else
const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
diff --git a/klm/util/file.cc b/klm/util/file.cc
index d707568e..aee7c77a 100644
--- a/klm/util/file.cc
+++ b/klm/util/file.cc
@@ -9,8 +9,12 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-#include <unistd.h>
-#include <inttypes.h>
+#include <stdint.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#endif
namespace util {
@@ -30,33 +34,61 @@ scoped_FILE::~scoped_FILE() {
int OpenReadOrThrow(const char *name) {
int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name);
+#else
UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
+#endif
return ret;
}
-int CreateOrThrow(const char *name) {
- int ret;
- UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name);
- return ret;
-}
-
-off_t SizeFile(int fd) {
+uint64_t SizeFile(int fd) {
+#if defined(_WIN32) || defined(_WIN64)
+ __int64 ret = _filelengthi64(fd);
+ return (ret == -1) ? kBadSize : ret;
+#else
struct stat sb;
if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
return sb.st_size;
+#endif
+}
+
+void ResizeOrThrow(int fd, uint64_t to) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(_chsize_s(fd, to), ErrnoException, "Resizing to " << to << " bytes failed");
+#else
+ UTIL_THROW_IF(ftruncate(fd, to), ErrnoException, "Resizing to " << to << " bytes failed");
+#endif
}
+#ifdef WIN32
+typedef int ssize_t;
+#endif
+
void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
ssize_t ret = read(fd, to, amount);
- if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
- if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+ UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+ UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
}
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
+ uint8_t *to = static_cast<uint8_t*>(to_void);
+ std::size_t remaining = amount;
+ while (remaining) {
+ ssize_t ret = read(fd, to, remaining);
+ UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed.");
+ if (!ret) return amount - remaining;
+ remaining -= ret;
+ to += ret;
+ }
+ return amount;
+}
+
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
@@ -67,8 +99,172 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
}
}
-void RemoveOrThrow(const char *name) {
- UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name);
+void FSyncOrThrow(int fd) {
+// Apparently windows doesn't have fsync?
+#if !defined(_WIN32) && !defined(_WIN64)
+ UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed.");
+#endif
+}
+
+namespace {
+void InternalSeek(int fd, off_t off, int whence) {
+ UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
+}
+} // namespace
+
+void SeekOrThrow(int fd, uint64_t off) {
+ InternalSeek(fd, off, SEEK_SET);
+}
+
+void AdvanceOrThrow(int fd, int64_t off) {
+ InternalSeek(fd, off, SEEK_CUR);
+}
+
+void SeekEnd(int fd) {
+ InternalSeek(fd, 0, SEEK_END);
+}
+
+std::FILE *FDOpenOrThrow(scoped_fd &file) {
+ std::FILE *ret = fdopen(file.get(), "r+b");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen");
+ file.release();
+ return ret;
+}
+
+TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {
+ base_ += "XXXXXX";
+}
+
+// Sigh. Windows temporary file creation is full of race conditions.
+#if defined(_WIN32) || defined(_WIN64)
+/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright
+ (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version. */
+
+/* This has been modified from the original version to rename the function and
+ * set the Windows temporary flag. */
+
+static const char letters[] =
+"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+/* Generate a temporary file name based on TMPL. TMPL must match the
+ rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed
+ does not exist at the time of the call to mkstemp. TMPL is
+ overwritten with the result. */
+int
+mkstemp_and_unlink(char *tmpl)
+{
+ int len;
+ char *XXXXXX;
+ static unsigned long long value;
+ unsigned long long random_time_bits;
+ unsigned int count;
+ int fd = -1;
+ int save_errno = errno;
+
+ /* A lower bound on the number of temporary files to attempt to
+ generate. The maximum total number of temporary file names that
+ can exist for a given template is 62**6. It should never be
+ necessary to try all these combinations. Instead if a reasonable
+ number of names is tried (we define reasonable as 62**3) fail to
+ give the system administrator the chance to remove the problems. */
+#define ATTEMPTS_MIN (62 * 62 * 62)
+
+ /* The number of times to attempt to generate a temporary file. To
+ conform to POSIX, this must be no smaller than TMP_MAX. */
+#if ATTEMPTS_MIN < TMP_MAX
+ unsigned int attempts = TMP_MAX;
+#else
+ unsigned int attempts = ATTEMPTS_MIN;
+#endif
+
+ len = strlen (tmpl);
+ if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX"))
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+/* This is where the Xs start. */
+ XXXXXX = &tmpl[len - 6];
+
+ /* Get some more or less random data. */
+ {
+ SYSTEMTIME stNow;
+ FILETIME ftNow;
+
+ // get system time
+ GetSystemTime(&stNow);
+ stNow.wMilliseconds = 500;
+ if (!SystemTimeToFileTime(&stNow, &ftNow))
+ {
+ errno = -1;
+ return -1;
+ }
+
+ random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32)
+ | (unsigned long long)ftNow.dwLowDateTime);
+ }
+ value += random_time_bits ^ (unsigned long long)GetCurrentThreadId ();
+
+ for (count = 0; count < attempts; value += 7777, ++count)
+ {
+ unsigned long long v = value;
+
+ /* Fill in the random bits. */
+ XXXXXX[0] = letters[v % 62];
+ v /= 62;
+ XXXXXX[1] = letters[v % 62];
+ v /= 62;
+ XXXXXX[2] = letters[v % 62];
+ v /= 62;
+ XXXXXX[3] = letters[v % 62];
+ v /= 62;
+ XXXXXX[4] = letters[v % 62];
+ v /= 62;
+ XXXXXX[5] = letters[v % 62];
+
+ /* Modified for windows and to unlink */
+ // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE);
+ fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE);
+ if (fd >= 0)
+ {
+ errno = save_errno;
+ return fd;
+ }
+ else if (errno != EEXIST)
+ return -1;
+ }
+
+ /* We got out of the loop because we ran out of combinations to try. */
+ errno = EEXIST;
+ return -1;
+}
+#else
+int
+mkstemp_and_unlink(char *tmpl) {
+ int ret = mkstemp(tmpl);
+ if (ret == -1) return -1;
+ UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
+ return ret;
+}
+#endif
+
+int TempMaker::Make() const {
+ std::string copy(base_);
+ copy.push_back(0);
+ int ret;
+ UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&copy[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
+ return ret;
+}
+
+std::FILE *TempMaker::MakeFile() const {
+ util::scoped_fd file(Make());
+ return FDOpenOrThrow(file);
}
} // namespace util
diff --git a/klm/util/file.hh b/klm/util/file.hh
index d6cca41d..5c57e2a9 100644
--- a/klm/util/file.hh
+++ b/klm/util/file.hh
@@ -1,8 +1,11 @@
#ifndef UTIL_FILE__
#define UTIL_FILE__
+#include <cstddef>
#include <cstdio>
-#include <unistd.h>
+#include <string>
+
+#include <stdint.h>
namespace util {
@@ -52,22 +55,49 @@ class scoped_FILE {
file_ = to;
}
+ std::FILE *release() {
+ std::FILE *ret = file_;
+ file_ = NULL;
+ return ret;
+ }
+
private:
std::FILE *file_;
};
int OpenReadOrThrow(const char *name);
-int CreateOrThrow(const char *name);
-
// Return value for SizeFile when it can't size properly.
-const off_t kBadSize = -1;
-off_t SizeFile(int fd);
+const uint64_t kBadSize = (uint64_t)-1;
+uint64_t SizeFile(int fd);
+
+void ResizeOrThrow(int fd, uint64_t to);
void ReadOrThrow(int fd, void *to, std::size_t size);
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
+
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
-void RemoveOrThrow(const char *name);
+void FSyncOrThrow(int fd);
+
+// Seeking
+void SeekOrThrow(int fd, uint64_t off);
+void AdvanceOrThrow(int fd, int64_t off);
+void SeekEnd(int fd);
+
+std::FILE *FDOpenOrThrow(scoped_fd &file);
+
+class TempMaker {
+ public:
+ explicit TempMaker(const std::string &prefix);
+
+ int Make() const;
+
+ std::FILE *MakeFile() const;
+
+ private:
+ std::string base_;
+};
} // namespace util
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index b57582a0..081e662b 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -2,6 +2,10 @@
#include "util/exception.hh"
#include "util/file.hh"
+#include "util/mmap.hh"
+#ifdef WIN32
+#include <io.h>
+#endif // WIN32
#include <iostream>
#include <string>
@@ -11,14 +15,8 @@
#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
-#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
-#include <unistd.h>
-
-#ifdef HAVE_ZLIB
-#include <zlib.h>
-#endif
namespace util {
@@ -26,24 +24,24 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
-GZException::GZException(void *file) {
#ifdef HAVE_ZLIB
+GZException::GZException(gzFile file) {
int num;
- *this << gzerror(file, &num) << " from zlib";
-#endif // HAVE_ZLIB
+ *this << gzerror( file, &num) << " from zlib";
}
+#endif // HAVE_ZLIB
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
@@ -63,7 +61,7 @@ FilePiece::~FilePiece() {
}
StringPiece FilePiece::ReadLine(char delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (*i == delim) {
@@ -94,13 +92,13 @@ unsigned long int FilePiece::ReadULong() {
return ReadNumber<unsigned long int>();
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
#endif
file_name_ = name;
- default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
+ default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = 0;
@@ -130,7 +128,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t
namespace {
void ParseNumber(const char *begin, char *&end, float &out) {
-#ifdef sun
+#if defined(sun) || defined(WIN32)
out = static_cast<float>(strtod(begin, &end));
#else
out = strtof(begin, &end);
@@ -171,7 +169,7 @@ template <class T> T FilePiece::ReadNumber() {
}
const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
@@ -190,7 +188,7 @@ void FilePiece::Shift() {
progress_.Finished();
throw EndOfFileException();
}
- off_t desired_begin = position_ - data_.begin() + mapped_offset_;
+ uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
@@ -201,18 +199,18 @@ void FilePiece::Shift() {
}
}
-void FilePiece::MMapShift(off_t desired_begin) {
+void FilePiece::MMapShift(uint64_t desired_begin) {
// Use mmap.
- off_t ignore = desired_begin % page_;
+ uint64_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
if (position_ == data_.begin() + ignore) {
default_map_size_ *= 2;
}
// Local version so that in case of failure it doesn't overwrite the class variable.
- off_t mapped_offset = desired_begin - ignore;
+ uint64_t mapped_offset = desired_begin - ignore;
- off_t mapped_size;
- if (default_map_size_ >= static_cast<size_t>(total_size_ - mapped_offset)) {
+ uint64_t mapped_size;
+ if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) {
at_end_ = true;
mapped_size = total_size_ - mapped_offset;
} else {
@@ -221,15 +219,11 @@ void FilePiece::MMapShift(off_t desired_begin) {
// Forcibly clear the existing mmap first.
data_.reset();
- data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED
- // Populate where available on linux
-#ifdef MAP_POPULATE
- | MAP_POPULATE
-#endif
- , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
- if (data_.get() == MAP_FAILED) {
+ try {
+ MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
+ } catch (const util::ErrnoException &e) {
if (desired_begin) {
- if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
+ SeekOrThrow(*file_, desired_begin);
}
// The mmap was scheduled to end the file, but now we're going to read it.
at_end_ = false;
@@ -259,6 +253,10 @@ void FilePiece::TransitionToRead() {
#endif
}
+#ifdef WIN32
+typedef int ssize_t;
+#endif
+
void FilePiece::ReadShift() {
assert(fallback_to_read_);
// Bytes [data_.begin(), position_) have been consumed.
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index a627f38c..af93d8aa 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -8,9 +8,14 @@
#include "util/mmap.hh"
#include "util/string_piece.hh"
+#include <cstddef>
#include <string>
-#include <cstddef>
+#include <stdint.h>
+
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
namespace util {
@@ -22,7 +27,9 @@ class ParseNumberException : public Exception {
class GZException : public Exception {
public:
- explicit GZException(void *file);
+#ifdef HAVE_ZLIB
+ explicit GZException(gzFile file);
+#endif
GZException() throw() {}
~GZException() throw() {}
};
@@ -33,9 +40,9 @@ extern const bool kSpaces[256];
class FilePiece {
public:
// 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
// Takes ownership of fd. name is used for messages.
- explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
~FilePiece();
@@ -70,14 +77,14 @@ class FilePiece {
}
}
- off_t Offset() const {
+ uint64_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
}
const std::string &FileName() const { return file_name_; }
private:
- void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
+ void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
template <class T> T ReadNumber();
@@ -91,7 +98,7 @@ class FilePiece {
void Shift();
// Backends to Shift().
- void MMapShift(off_t desired_begin);
+ void MMapShift(uint64_t desired_begin);
void TransitionToRead();
void ReadShift();
@@ -99,11 +106,11 @@ class FilePiece {
const char *position_, *last_space_, *position_end_;
scoped_fd file_;
- const off_t total_size_;
- const off_t page_;
+ const uint64_t total_size_;
+ const uint64_t page_;
- size_t default_map_size_;
- off_t mapped_offset_;
+ std::size_t default_map_size_;
+ uint64_t mapped_offset_;
// Order matters: file_ should always be destroyed after this.
scoped_memory data_;
@@ -116,7 +123,7 @@ class FilePiece {
std::string file_name_;
#ifdef HAVE_ZLIB
- void *gz_file_;
+ gzFile gz_file_;
#endif // HAVE_ZLIB
};
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index dc9ec7e7..f912e18a 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -1,3 +1,4 @@
+// Tests might fail if you have creative characters in your path. Sue me.
#include "util/file_piece.hh"
#include "util/scoped.hh"
@@ -14,10 +15,18 @@
namespace util {
namespace {
+std::string FileLocation() {
+ if (boost::unit_test::framework::master_test_suite().argc < 2) {
+ return "file_piece.cc";
+ }
+ std::string ret(boost::unit_test::framework::master_test_suite().argv[1]);
+ return ret;
+}
+
/* mmap implementation */
BOOST_AUTO_TEST_CASE(MMapReadLine) {
- std::fstream ref("file_piece.cc", std::ios::in);
- FilePiece test("file_piece.cc", NULL, 1);
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
+ FilePiece test(FileLocation().c_str(), NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
@@ -35,9 +44,13 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {
*/
/* read() implementation */
BOOST_AUTO_TEST_CASE(StreamReadLine) {
- std::fstream ref("file_piece.cc", std::ios::in);
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
+
+ std::string popen_args = "cat \"";
+ popen_args += FileLocation();
+ popen_args += '"';
- FILE *catter = popen("cat file_piece.cc", "r");
+ FILE *catter = popen(popen_args.c_str(), "r");
BOOST_REQUIRE(catter);
FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
@@ -58,10 +71,15 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {
// gzip file
BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
- std::fstream ref("file_piece.cc", std::ios::in);
+ std::string location(FileLocation());
+ std::fstream ref(location.c_str(), std::ios::in);
- BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz"));
- FilePiece test("file_piece.cc.gz", NULL, 1);
+ std::string command("gzip <\"");
+ command += location + "\" >\"" + location + "\".gz";
+
+ BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
+ FilePiece test((location + ".gz").c_str(), NULL, 1);
+ unlink((location + ".gz").c_str());
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
@@ -77,12 +95,15 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
// the test.
#ifndef __APPLE__
BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
- std::fstream ref("file_piece.cc", std::ios::in);
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
+
+ std::string command("gzip <\"");
+ command += FileLocation() + "\"";
- FILE * catter = popen("gzip <file_piece.cc", "r");
+ FILE * catter = popen(command.c_str(), "r");
BOOST_REQUIRE(catter);
- FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
+ FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
diff --git a/klm/util/getopt.c b/klm/util/getopt.c
new file mode 100644
index 00000000..992c96b0
--- /dev/null
+++ b/klm/util/getopt.c
@@ -0,0 +1,78 @@
+/*
+POSIX getopt for Windows
+
+AT&T Public License
+
+Code given out at the 1985 UNIFORUM conference in Dallas.
+*/
+
+#ifndef __GNUC__
+
+#include "getopt.hh"
+#include <stdio.h>
+#include <string.h>
+
+#define NULL 0
+#define EOF (-1)
+#define ERR(s, c) if(opterr){\
+ char errbuf[2];\
+ errbuf[0] = c; errbuf[1] = '\n';\
+ fputs(argv[0], stderr);\
+ fputs(s, stderr);\
+ fputc(c, stderr);}
+ //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
+ //(void) write(2, s, (unsigned)strlen(s));\
+ //(void) write(2, errbuf, 2);}
+
+int opterr = 1;
+int optind = 1;
+int optopt;
+char *optarg;
+
+int
+getopt(argc, argv, opts)
+int argc;
+char **argv, *opts;
+{
+ static int sp = 1;
+ register int c;
+ register char *cp;
+
+ if(sp == 1)
+ if(optind >= argc ||
+ argv[optind][0] != '-' || argv[optind][1] == '\0')
+ return(EOF);
+ else if(strcmp(argv[optind], "--") == NULL) {
+ optind++;
+ return(EOF);
+ }
+ optopt = c = argv[optind][sp];
+ if(c == ':' || (cp=strchr(opts, c)) == NULL) {
+ ERR(": illegal option -- ", c);
+ if(argv[optind][++sp] == '\0') {
+ optind++;
+ sp = 1;
+ }
+ return('?');
+ }
+ if(*++cp == ':') {
+ if(argv[optind][sp+1] != '\0')
+ optarg = &argv[optind++][sp+1];
+ else if(++optind >= argc) {
+ ERR(": option requires an argument -- ", c);
+ sp = 1;
+ return('?');
+ } else
+ optarg = argv[optind++];
+ sp = 1;
+ } else {
+ if(argv[optind][++sp] == '\0') {
+ sp = 1;
+ optind++;
+ }
+ optarg = NULL;
+ }
+ return(c);
+}
+
+#endif /* __GNUC__ */
diff --git a/klm/util/getopt.hh b/klm/util/getopt.hh
new file mode 100644
index 00000000..6ad97732
--- /dev/null
+++ b/klm/util/getopt.hh
@@ -0,0 +1,33 @@
+/*
+POSIX getopt for Windows
+
+AT&T Public License
+
+Code given out at the 1985 UNIFORUM conference in Dallas.
+*/
+
+#ifdef __GNUC__
+#include <getopt.h>
+#endif
+#ifndef __GNUC__
+
+#ifndef _WINGETOPT_H_
+#define _WINGETOPT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int opterr;
+extern int optind;
+extern int optopt;
+extern char *optarg;
+extern int getopt(int argc, char **argv, char *opts);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GETOPT_H_ */
+#endif /* __GNUC__ */
+
diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh
deleted file mode 100644
index b84a5aad..00000000
--- a/klm/util/key_value_packing.hh
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef UTIL_KEY_VALUE_PACKING__
-#define UTIL_KEY_VALUE_PACKING__
-
-/* Why such a general interface? I'm planning on doing bit-level packing. */
-
-#include <algorithm>
-#include <cstddef>
-#include <cstring>
-
-#include <inttypes.h>
-
-namespace util {
-
-template <class Key, class Value> struct Entry {
- Key key;
- Value value;
-
- const Key &GetKey() const { return key; }
- const Value &GetValue() const { return value; }
-
- Value &MutableValue() { return value; }
-
- void Set(const Key &key_in, const Value &value_in) {
- SetKey(key_in);
- SetValue(value_in);
- }
- void SetKey(const Key &key_in) { key = key_in; }
- void SetValue(const Value &value_in) { value = value_in; }
-
- bool operator<(const Entry<Key, Value> &other) const { return GetKey() < other.GetKey(); }
-};
-
-// And now for a brief interlude to specialize std::swap.
-} // namespace util
-namespace std {
-template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) {
- swap(first.key, second.key);
- swap(first.value, second.value);
-}
-}// namespace std
-namespace util {
-
-template <class KeyT, class ValueT> class AlignedPacking {
- public:
- typedef KeyT Key;
- typedef ValueT Value;
-
- public:
- static const std::size_t kBytes = sizeof(Entry<Key, Value>);
- static const std::size_t kBits = kBytes * 8;
-
- typedef Entry<Key, Value> * MutableIterator;
- typedef const Entry<Key, Value> * ConstIterator;
- typedef const Entry<Key, Value> & ConstReference;
-
- static MutableIterator FromVoid(void *start) {
- return reinterpret_cast<MutableIterator>(start);
- }
-
- static Entry<Key, Value> Make(const Key &key, const Value &value) {
- Entry<Key, Value> ret;
- ret.Set(key, value);
- return ret;
- }
-};
-
-template <class KeyT, class ValueT> class ByteAlignedPacking {
- public:
- typedef KeyT Key;
- typedef ValueT Value;
-
- private:
-#pragma pack(push)
-#pragma pack(1)
- struct RawEntry {
- Key key;
- Value value;
-
- const Key &GetKey() const { return key; }
- const Value &GetValue() const { return value; }
-
- Value &MutableValue() { return value; }
-
- void Set(const Key &key_in, const Value &value_in) {
- SetKey(key_in);
- SetValue(value_in);
- }
- void SetKey(const Key &key_in) { key = key_in; }
- void SetValue(const Value &value_in) { value = value_in; }
-
- bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); }
- };
-#pragma pack(pop)
-
- friend void std::swap<>(RawEntry&, RawEntry&);
-
- public:
- typedef RawEntry *MutableIterator;
- typedef const RawEntry *ConstIterator;
- typedef RawEntry &ConstReference;
-
- static const std::size_t kBytes = sizeof(RawEntry);
- static const std::size_t kBits = kBytes * 8;
-
- static MutableIterator FromVoid(void *start) {
- return MutableIterator(reinterpret_cast<RawEntry*>(start));
- }
-
- static RawEntry Make(const Key &key, const Value &value) {
- RawEntry ret;
- ret.Set(key, value);
- return ret;
- }
-};
-
-} // namespace util
-namespace std {
-template <class Key, class Value> void swap(
- typename util::ByteAlignedPacking<Key, Value>::RawEntry &first,
- typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) {
- swap(first.key, second.key);
- swap(first.value, second.value);
-}
-}// namespace std
-
-#endif // UTIL_KEY_VALUE_PACKING__
diff --git a/klm/util/key_value_packing_test.cc b/klm/util/key_value_packing_test.cc
deleted file mode 100644
index a0d33fd7..00000000
--- a/klm/util/key_value_packing_test.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "util/key_value_packing.hh"
-
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/random/uniform_int.hpp>
-#include <boost/random/variate_generator.hpp>
-#include <boost/scoped_array.hpp>
-#define BOOST_TEST_MODULE KeyValueStoreTest
-#include <boost/test/unit_test.hpp>
-
-#include <limits>
-#include <stdlib.h>
-
-namespace util {
-namespace {
-
-BOOST_AUTO_TEST_CASE(basic_in_out) {
- typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
- void *backing = malloc(Packing::kBytes * 2);
- Packing::MutableIterator i(Packing::FromVoid(backing));
- i->SetKey(10);
- BOOST_CHECK_EQUAL(10, i->GetKey());
- i->SetValue(3);
- BOOST_CHECK_EQUAL(3, i->GetValue());
- ++i;
- i->SetKey(5);
- BOOST_CHECK_EQUAL(5, i->GetKey());
- i->SetValue(42);
- BOOST_CHECK_EQUAL(42, i->GetValue());
-
- Packing::ConstIterator c(i);
- BOOST_CHECK_EQUAL(5, c->GetKey());
- --c;
- BOOST_CHECK_EQUAL(10, c->GetKey());
- BOOST_CHECK_EQUAL(42, i->GetValue());
-
- BOOST_CHECK_EQUAL(5, i->GetKey());
- free(backing);
-}
-
-BOOST_AUTO_TEST_CASE(simple_sort) {
- typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
- char foo[Packing::kBytes * 4];
- Packing::MutableIterator begin(Packing::FromVoid(foo));
- Packing::MutableIterator i = begin;
- i->SetKey(0); ++i;
- i->SetKey(2); ++i;
- i->SetKey(3); ++i;
- i->SetKey(1); ++i;
- std::sort(begin, i);
- BOOST_CHECK_EQUAL(0, begin[0].GetKey());
- BOOST_CHECK_EQUAL(1, begin[1].GetKey());
- BOOST_CHECK_EQUAL(2, begin[2].GetKey());
- BOOST_CHECK_EQUAL(3, begin[3].GetKey());
-}
-
-BOOST_AUTO_TEST_CASE(big_sort) {
- typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
- boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]);
- Packing::MutableIterator begin(Packing::FromVoid(memory.get()));
-
- boost::mt19937 rng;
- boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max());
- boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range);
-
- for (size_t i = 0; i < 1000; ++i) {
- (begin + i)->SetKey(gen());
- }
- std::sort(begin, begin + 1000);
- for (size_t i = 0; i < 999; ++i) {
- BOOST_CHECK(begin[i] < begin[i+1]);
- }
-}
-
-} // namespace
-} // namespace util
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 279bafa8..a329ce4e 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -1,23 +1,63 @@
+/* Memory mapping wrappers.
+ * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
+ * NICT.
+ */
+#include "util/mmap.hh"
+
#include "util/exception.hh"
#include "util/file.hh"
-#include "util/mmap.hh"
#include <iostream>
#include <assert.h>
#include <fcntl.h>
#include <sys/types.h>
-#include <sys/mman.h>
+#include <sys/stat.h>
#include <stdlib.h>
-#include <unistd.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
+#endif
namespace util {
+long SizePage() {
+#if defined(_WIN32) || defined(_WIN64)
+ SYSTEM_INFO si;
+ GetSystemInfo(&si);
+ return si.dwAllocationGranularity;
+#else
+ return sysconf(_SC_PAGE_SIZE);
+#endif
+}
+
+void SyncOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
+#else
+ UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
+#endif
+}
+
+void UnmapOrThrow(void *start, size_t length) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
+#else
+ UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed");
+#endif
+}
+
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
- // Thanks Denis Filimonov for pointing out NFS likes msync first.
- if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) {
- std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl;
+ try {
+ // Thanks Denis Filimonov for pointing out NFS likes msync first.
+ SyncOrThrow(data_, size_);
+ UnmapOrThrow(data_, size_);
+ } catch (const util::ErrnoException &e) {
+ std::cerr << e.what();
abort();
}
}
@@ -52,29 +92,40 @@ void scoped_memory::call_realloc(std::size_t size) {
}
}
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset) {
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
#ifdef MAP_POPULATE // Linux specific
if (prefault) {
flags |= MAP_POPULATE;
}
#endif
+#if defined(_WIN32) || defined(_WIN64)
+ int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
+ int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
+ uint64_t total_size = size + offset;
+ HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
+ UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
+ LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
+ CloseHandle(hMapping);
+ UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
+#else
int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
void *ret = mmap(NULL, size, protect, flags, fd, offset);
- if (ret == MAP_FAILED) {
- UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset);
- }
+ UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
+#endif
return ret;
}
const int kFileFlags =
-#ifdef MAP_FILE
+#if defined(_WIN32) || defined(_WIN64)
+ 0 // MapOrThrow ignores flags on windows
+#elif defined(MAP_FILE)
MAP_FILE | MAP_SHARED
#else
MAP_SHARED
#endif
;
-void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) {
+void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
switch (method) {
case LAZY:
out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
@@ -91,30 +142,52 @@ void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_m
case READ:
out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
- if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed.");
+ SeekOrThrow(fd, offset);
ReadOrThrow(fd, out.get(), size);
break;
}
}
-void *MapAnonymous(std::size_t size) {
- return MapOrThrow(size, true,
-#ifdef MAP_ANONYMOUS
- MAP_ANONYMOUS // Linux
+// Allocates zeroed memory in to.
+void MapAnonymous(std::size_t size, util::scoped_memory &to) {
+ to.reset();
+#if defined(_WIN32) || defined(_WIN64)
+ to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED);
+#else
+ to.reset(MapOrThrow(size, true,
+# if defined(MAP_ANONYMOUS)
+ MAP_ANONYMOUS | MAP_PRIVATE // Linux
+# else
+ MAP_ANON | MAP_PRIVATE // BSD
+# endif
+ , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED);
+#endif
+}
+
+void *MapZeroedWrite(int fd, std::size_t size) {
+ ResizeOrThrow(fd, 0);
+ ResizeOrThrow(fd, size);
+ return MapOrThrow(size, true, kFileFlags, false, fd, 0);
+}
+
+namespace {
+
+int CreateOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
- MAP_ANON // BSD
+ UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
- | MAP_PRIVATE, false, -1, 0);
+ return ret;
}
+} // namespace
+
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
- file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
- if (-1 == file.get())
- UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing");
- if (-1 == ftruncate(file.get(), size))
- UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
+ file.reset(CreateOrThrow(name));
try {
- return MapOrThrow(size, true, kFileFlags, false, file.get(), 0);
+ return MapZeroedWrite(file.get(), size);
} catch (ErrnoException &e) {
e << " in file " << name;
throw;
diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh
index b0eb6672..b218c4d1 100644
--- a/klm/util/mmap.hh
+++ b/klm/util/mmap.hh
@@ -4,13 +4,15 @@
#include <cstddef>
-#include <inttypes.h>
+#include <stdint.h>
#include <sys/types.h>
namespace util {
class scoped_fd;
+long SizePage();
+
// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
public:
@@ -94,15 +96,19 @@ typedef enum {
extern const int kFileFlags;
// Wrapper around mmap to check it worked and hide some platform macros.
-void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0);
+void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
-void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out);
+void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
-void *MapAnonymous(std::size_t size);
+void MapAnonymous(std::size_t size, scoped_memory &to);
// Open file name with mmap of size bytes, all of which are initially zero.
+void *MapZeroedWrite(int fd, std::size_t size);
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
+// msync wrapper
+void SyncOrThrow(void *start, size_t length);
+
} // namespace util
#endif // UTIL_MMAP__
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index ef5783fe..6accc21a 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -7,9 +7,11 @@
* placed in namespace util
* add MurmurHashNative
* default option = 0 for seed
+ * ARM port from NICT
*/
#include "util/murmur_hash.hh"
+#include <string.h>
namespace util {
@@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
uint64_t h = seed ^ (len * m);
+#if defined(__arm) || defined(__arm__)
+ const size_t ksize = sizeof(uint64_t);
+ const unsigned char * data = (const unsigned char *)key;
+ const unsigned char * end = data + (std::size_t)(len/8) * ksize;
+#else
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
+#endif
while(data != end)
{
+#if defined(__arm) || defined(__arm__)
+ uint64_t k;
+ memcpy(&k, data, ksize);
+ data += ksize;
+#else
uint64_t k = *data++;
+#endif
k *= m;
k ^= k >> r;
@@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
unsigned int h1 = seed ^ len;
unsigned int h2 = 0;
+#if defined(__arm) || defined(__arm__)
+ size_t ksize = sizeof(unsigned int);
+ const unsigned char * data = (const unsigned char *)key;
+#else
const unsigned int * data = (const unsigned int *)key;
+#endif
+ unsigned int k1, k2;
while(len >= 8)
{
- unsigned int k1 = *data++;
+#if defined(__arm) || defined(__arm__)
+ memcpy(&k1, data, ksize);
+ data += ksize;
+ memcpy(&k2, data, ksize);
+ data += ksize;
+#else
+ k1 = *data++;
+ k2 = *data++;
+#endif
+
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
- unsigned int k2 = *data++;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
@@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
if(len >= 4)
{
- unsigned int k1 = *data++;
+#if defined(__arm) || defined(__arm__)
+ memcpy(&k1, data, ksize);
+ data += ksize;
+#else
+ k1 = *data++;
+#endif
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh
index 78fe583f..638aaeb2 100644
--- a/klm/util/murmur_hash.hh
+++ b/klm/util/murmur_hash.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_MURMUR_HASH__
#define UTIL_MURMUR_HASH__
#include <cstddef>
-#include <inttypes.h>
+#include <stdint.h>
namespace util {
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 8122d69c..f466cebc 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -18,27 +18,33 @@ class ProbingSizeException : public Exception {
~ProbingSizeException() throw() {}
};
+// std::identity is an SGI extension :-(
+struct IdentityHash {
+ template <class T> T operator()(T arg) const { return arg; }
+};
+
/* Non-standard hash table
* Buckets must be set at the beginning and must be greater than maximum number
- * of elements, else an infinite loop happens.
+ * of elements, else it throws ProbingSizeException.
* Memory management and initialization is externalized to make it easier to
* serialize these to disk and load them quickly.
* Uses linear probing to find value.
* Only insert and lookup operations.
*/
-template <class PackingT, class HashT, class EqualT = std::equal_to<typename PackingT::Key> > class ProbingHashTable {
+template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable {
public:
- typedef PackingT Packing;
- typedef typename Packing::Key Key;
- typedef typename Packing::MutableIterator MutableIterator;
- typedef typename Packing::ConstIterator ConstIterator;
-
+ typedef EntryT Entry;
+ typedef typename Entry::Key Key;
+ typedef const Entry *ConstIterator;
+ typedef Entry *MutableIterator;
typedef HashT Hash;
typedef EqualT Equal;
+ public:
static std::size_t Size(std::size_t entries, float multiplier) {
- return std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))) * Packing::kBytes;
+ std::size_t buckets = std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries)));
+ return buckets * sizeof(Entry);
}
// Must be assigned to later.
@@ -49,9 +55,9 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
{}
ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
- : begin_(Packing::FromVoid(start)),
- buckets_(allocated / Packing::kBytes),
- end_(begin_ + (allocated / Packing::kBytes)),
+ : begin_(reinterpret_cast<MutableIterator>(start)),
+ buckets_(allocated / sizeof(Entry)),
+ end_(begin_ + buckets_),
invalid_(invalid),
hash_(hash_func),
equal_(equal_func),
@@ -62,11 +68,10 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
{}
template <class T> MutableIterator Insert(const T &t) {
- if (++entries_ >= buckets_)
- UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
#ifdef DEBUG
assert(initialized_);
#endif
+ UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
if (++i == end_) { i = begin_; }
@@ -84,7 +89,7 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
if (++i == end_) i = begin_;
- }
+ }
}
template <class Key> bool Find(const Key key, ConstIterator &out) const {
diff --git a/klm/util/probing_hash_table_test.cc b/klm/util/probing_hash_table_test.cc
index ff2f5af3..ef68e5f2 100644
--- a/klm/util/probing_hash_table_test.cc
+++ b/klm/util/probing_hash_table_test.cc
@@ -1,6 +1,6 @@
#include "util/probing_hash_table.hh"
-#include "util/key_value_packing.hh"
+#include <stdint.h>
#define BOOST_TEST_MODULE ProbingHashTableTest
#include <boost/test/unit_test.hpp>
@@ -9,17 +9,34 @@
namespace util {
namespace {
-typedef AlignedPacking<char, uint64_t> Packing;
-typedef ProbingHashTable<Packing, boost::hash<char> > Table;
+struct Entry {
+ unsigned char key;
+ typedef unsigned char Key;
+
+ unsigned char GetKey() const {
+ return key;
+ }
+
+ uint64_t GetValue() const {
+ return value;
+ }
+
+ uint64_t value;
+};
+
+typedef ProbingHashTable<Entry, boost::hash<unsigned char> > Table;
BOOST_AUTO_TEST_CASE(simple) {
char mem[Table::Size(10, 1.2)];
memset(mem, 0, sizeof(mem));
Table table(mem, sizeof(mem));
- Packing::ConstIterator i = Packing::ConstIterator();
+ const Entry *i = NULL;
BOOST_CHECK(!table.Find(2, i));
- table.Insert(Packing::Make(3, 328920));
+ Entry to_ins;
+ to_ins.key = 3;
+ to_ins.value = 328920;
+ table.Insert(to_ins);
BOOST_REQUIRE(table.Find(3, i));
BOOST_CHECK_EQUAL(3, i->GetKey());
BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), i->GetValue());
diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh
index 47dfc245..aabcc531 100644
--- a/klm/util/sized_iterator.hh
+++ b/klm/util/sized_iterator.hh
@@ -6,7 +6,7 @@
#include <functional>
#include <string>
-#include <inttypes.h>
+#include <stdint.h>
#include <string.h>
namespace util {
diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh
index 0d6ecbbd..7700d9e6 100644
--- a/klm/util/sorted_uniform.hh
+++ b/klm/util/sorted_uniform.hh
@@ -5,7 +5,7 @@
#include <cstddef>
#include <assert.h>
-#include <inttypes.h>
+#include <stdint.h>
namespace util {
@@ -122,99 +122,6 @@ template <class Iterator, class Accessor> Iterator BinaryBelow(
return begin - 1;
}
-// To use this template, you need to define a Pivot function to match Key.
-template <class PackingT> class SortedUniformMap {
- public:
- typedef PackingT Packing;
- typedef typename Packing::ConstIterator ConstIterator;
- typedef typename Packing::MutableIterator MutableIterator;
-
- struct Accessor {
- public:
- typedef typename Packing::Key Key;
- const Key &operator()(const ConstIterator &i) const { return i->GetKey(); }
- Key &operator()(const MutableIterator &i) const { return i->GetKey(); }
- };
-
- // Offer consistent API with probing hash.
- static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) {
- return sizeof(uint64_t) + entries * Packing::kBytes;
- }
-
- SortedUniformMap()
-#ifdef DEBUG
- : initialized_(false), loaded_(false)
-#endif
- {}
-
- SortedUniformMap(void *start, std::size_t /*allocated*/) :
- begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)),
- end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start))
-#ifdef DEBUG
- , initialized_(true), loaded_(false)
-#endif
- {}
-
- void LoadedBinary() {
-#ifdef DEBUG
- assert(initialized_);
- assert(!loaded_);
- loaded_ = true;
-#endif
- // Restore the size.
- end_ = begin_ + *size_ptr_;
- }
-
- // Caller responsible for not exceeding specified size. Do not call after FinishedInserting.
- template <class T> void Insert(const T &t) {
-#ifdef DEBUG
- assert(initialized_);
- assert(!loaded_);
-#endif
- *end_ = t;
- ++end_;
- }
-
- void FinishedInserting() {
-#ifdef DEBUG
- assert(initialized_);
- assert(!loaded_);
- loaded_ = true;
-#endif
- std::sort(begin_, end_);
- *size_ptr_ = (end_ - begin_);
- }
-
- // Don't use this to change the key.
- template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
-#ifdef DEBUG
- assert(initialized_);
- assert(loaded_);
-#endif
- return SortedUniformFind<MutableIterator, Accessor, Pivot64>(begin_, end_, key, out);
- }
-
- // Do not call before FinishedInserting.
- template <class Key> bool Find(const Key key, ConstIterator &out) const {
-#ifdef DEBUG
- assert(initialized_);
- assert(loaded_);
-#endif
- return SortedUniformFind<ConstIterator, Accessor, Pivot64>(Accessor(), ConstIterator(begin_), ConstIterator(end_), key, out);
- }
-
- ConstIterator begin() const { return begin_; }
- ConstIterator end() const { return end_; }
-
- private:
- typename Packing::MutableIterator begin_, end_;
- uint64_t *size_ptr_;
-#ifdef DEBUG
- bool initialized_;
- bool loaded_;
-#endif
-};
-
} // namespace util
#endif // UTIL_SORTED_UNIFORM__
diff --git a/klm/util/sorted_uniform_test.cc b/klm/util/sorted_uniform_test.cc
index 4aa4c8aa..d9f6fad1 100644
--- a/klm/util/sorted_uniform_test.cc
+++ b/klm/util/sorted_uniform_test.cc
@@ -1,12 +1,11 @@
#include "util/sorted_uniform.hh"
-#include "util/key_value_packing.hh"
-
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/scoped_array.hpp>
#include <boost/unordered_map.hpp>
+
#define BOOST_TEST_MODULE SortedUniformTest
#include <boost/test/unit_test.hpp>
@@ -17,74 +16,86 @@
namespace util {
namespace {
-template <class Map, class Key, class Value> void Check(const Map &map, const boost::unordered_map<Key, Value> &reference, const Key key) {
+template <class KeyT, class ValueT> struct Entry {
+ typedef KeyT Key;
+ typedef ValueT Value;
+
+ Key key;
+ Value value;
+
+ Key GetKey() const {
+ return key;
+ }
+
+ Value GetValue() const {
+ return value;
+ }
+
+ bool operator<(const Entry<Key,Value> &other) const {
+ return key < other.key;
+ }
+};
+
+template <class KeyT> struct Accessor {
+ typedef KeyT Key;
+ template <class Value> Key operator()(const Entry<Key, Value> *entry) const {
+ return entry->GetKey();
+ }
+};
+
+template <class Key, class Value> void Check(const Entry<Key, Value> *begin, const Entry<Key, Value> *end, const boost::unordered_map<Key, Value> &reference, const Key key) {
typename boost::unordered_map<Key, Value>::const_iterator ref = reference.find(key);
- typename Map::ConstIterator i = typename Map::ConstIterator();
+ typedef const Entry<Key, Value> *It;
+ // g++ can't tell that require will crash and burn.
+ It i = NULL;
+ bool ret = SortedUniformFind<It, Accessor<Key>, Pivot64>(Accessor<Key>(), begin, end, key, i);
if (ref == reference.end()) {
- BOOST_CHECK(!map.Find(key, i));
+ BOOST_CHECK(!ret);
} else {
- // g++ can't tell that require will crash and burn.
- BOOST_REQUIRE(map.Find(key, i));
+ BOOST_REQUIRE(ret);
BOOST_CHECK_EQUAL(ref->second, i->GetValue());
}
}
-typedef SortedUniformMap<AlignedPacking<uint64_t, uint32_t> > TestMap;
-
BOOST_AUTO_TEST_CASE(empty) {
- char buf[TestMap::Size(0)];
- TestMap map(buf, TestMap::Size(0));
- map.FinishedInserting();
- TestMap::ConstIterator i;
- BOOST_CHECK(!map.Find(42, i));
-}
-
-BOOST_AUTO_TEST_CASE(one) {
- char buf[TestMap::Size(1)];
- TestMap map(buf, sizeof(buf));
- Entry<uint64_t, uint32_t> e;
- e.Set(42,2);
- map.Insert(e);
- map.FinishedInserting();
- TestMap::ConstIterator i = TestMap::ConstIterator();
- BOOST_REQUIRE(map.Find(42, i));
- BOOST_CHECK(i == map.begin());
- BOOST_CHECK(!map.Find(43, i));
- BOOST_CHECK(!map.Find(41, i));
+ typedef const Entry<uint64_t, float> T;
+ const T *i;
+ bool ret = SortedUniformFind<const T*, Accessor<uint64_t>, Pivot64>(Accessor<uint64_t>(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i);
+ BOOST_CHECK(!ret);
}
template <class Key> void RandomTest(Key upper, size_t entries, size_t queries) {
typedef unsigned char Value;
- typedef SortedUniformMap<AlignedPacking<Key, unsigned char> > Map;
- boost::scoped_array<char> buffer(new char[Map::Size(entries)]);
- Map map(buffer.get(), entries);
boost::mt19937 rng;
boost::uniform_int<Key> range_key(0, upper);
boost::uniform_int<Value> range_value(0, 255);
boost::variate_generator<boost::mt19937&, boost::uniform_int<Key> > gen_key(rng, range_key);
boost::variate_generator<boost::mt19937&, boost::uniform_int<unsigned char> > gen_value(rng, range_value);
+ typedef Entry<Key, Value> Ent;
+ std::vector<Ent> backing;
boost::unordered_map<Key, unsigned char> reference;
- Entry<Key, unsigned char> ent;
+ Ent ent;
for (size_t i = 0; i < entries; ++i) {
Key key = gen_key();
unsigned char value = gen_value();
if (reference.insert(std::make_pair(key, value)).second) {
- ent.Set(key, value);
- map.Insert(Entry<Key, unsigned char>(ent));
+ ent.key = key;
+ ent.value = value;
+ backing.push_back(ent);
}
}
- map.FinishedInserting();
+ std::sort(backing.begin(), backing.end());
// Random queries.
for (size_t i = 0; i < queries; ++i) {
const Key key = gen_key();
- Check<Map, Key, unsigned char>(map, reference, key);
+ Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, key);
}
typename boost::unordered_map<Key, unsigned char>::const_iterator it = reference.begin();
for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) {
- Check<Map, Key, unsigned char>(map, reference, it->second);
+ Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, it->second);
}
}
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index 413bda0b..c7e1c863 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -1,6 +1,7 @@
#ifndef UTIL_TOKENIZE_PIECE__
#define UTIL_TOKENIZE_PIECE__
+#include "util/exception.hh"
#include "util/string_piece.hh"
#include <boost/iterator/iterator_facade.hpp>
@@ -8,63 +9,25 @@
#include <algorithm>
#include <iostream>
-/* Usage:
- *
- * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) {
- * std::cout << *i << "\n";
- * }
- *
- */
-
namespace util {
-// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece.
-template <char d> class PieceIterator : public boost::iterator_facade<PieceIterator<d>, const StringPiece, boost::forward_traversal_tag> {
+// Thrown on dereference when out of tokens to parse
+class OutOfTokens : public Exception {
public:
- // Default construct is end, which is also accessed by kEndPieceIterator;
- PieceIterator() {}
-
- explicit PieceIterator(const StringPiece &str)
- : after_(str) {
- increment();
- }
+ OutOfTokens() throw() {}
+ ~OutOfTokens() throw() {}
+};
- bool operator!() const {
- return after_.data() == 0;
- }
- operator bool() const {
- return after_.data() != 0;
- }
+class SingleCharacter {
+ public:
+ explicit SingleCharacter(char delim) : delim_(delim) {}
- static PieceIterator<d> end() {
- return PieceIterator<d>();
+ StringPiece Find(const StringPiece &in) const {
+ return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1);
}
private:
- friend class boost::iterator_core_access;
-
- void increment() {
- const char *start = after_.data();
- for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {}
- if (start == after_.data() + after_.size()) {
- // End condition.
- after_.clear();
- return;
- }
- const char *finish = start;
- for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {}
- current_ = StringPiece(start, finish - start);
- after_ = StringPiece(finish, after_.data() + after_.size() - finish);
- }
-
- bool equal(const PieceIterator &other) const {
- return after_.data() == other.after_.data();
- }
-
- const StringPiece &dereference() const { return current_; }
-
- StringPiece current_;
- StringPiece after_;
+ char delim_;
};
class MultiCharacter {
@@ -95,7 +58,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
public:
TokenIter() {}
- TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) {
+ template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) {
increment();
}
@@ -130,6 +93,7 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
}
const StringPiece &dereference() const {
+ UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens");
return current_;
}
diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc
index e07ebcf5..d856018f 100644
--- a/klm/util/tokenize_piece_test.cc
+++ b/klm/util/tokenize_piece_test.cc
@@ -9,53 +9,7 @@
namespace util {
namespace {
-BOOST_AUTO_TEST_CASE(simple) {
- PieceIterator<' '> it("single spaced words.");
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("single"), *it);
- ++it;
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("spaced"), *it);
- ++it;
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("words."), *it);
- ++it;
- BOOST_CHECK(!it);
-}
-
-BOOST_AUTO_TEST_CASE(null_delimiter) {
- const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0";
- PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1));
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("first"), *it);
- ++it;
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("second"), *it);
- ++it;
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("third"), *it);
- ++it;
- BOOST_REQUIRE(it);
- BOOST_CHECK_EQUAL(StringPiece("fourth"), *it);
- ++it;
- BOOST_CHECK(!it);
-}
-
-BOOST_AUTO_TEST_CASE(null_entries) {
- const char str[] = "\0split\0\0 \0me\0 ";
- PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1));
- BOOST_REQUIRE(it);
- const char first[] = "\0split\0\0";
- BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it);
- ++it;
- BOOST_REQUIRE(it);
- const char second[] = "\0me\0";
- BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it);
- ++it;
- BOOST_CHECK(!it);
-}
-
-/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
+BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
const char str[] = "nodelimit at all";
TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
BOOST_REQUIRE(it);
@@ -79,7 +33,7 @@ BOOST_AUTO_TEST_CASE(remove_empty) {
const char str[] = "|||";
TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
BOOST_CHECK(!it);
-}*/
+}
BOOST_AUTO_TEST_CASE(remove_empty_keep) {
const char str[] = " |||";