summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
Diffstat (limited to 'klm/util')
-rw-r--r--klm/util/Makefile.am1
-rw-r--r--klm/util/bit_packing.cc27
-rw-r--r--klm/util/bit_packing.hh88
-rw-r--r--klm/util/ersatz_progress.cc9
-rw-r--r--klm/util/ersatz_progress.hh6
-rw-r--r--klm/util/file_piece.cc141
-rw-r--r--klm/util/file_piece.hh51
-rw-r--r--klm/util/file_piece_test.cc56
-rw-r--r--klm/util/joint_sort.hh6
-rw-r--r--klm/util/mmap.cc44
-rw-r--r--klm/util/mmap.hh31
-rw-r--r--klm/util/proxy_iterator.hh2
-rw-r--r--klm/util/scoped.cc4
-rw-r--r--klm/util/scoped.hh19
-rw-r--r--klm/util/sorted_uniform.hh4
-rw-r--r--klm/util/string_piece.cc8
-rw-r--r--klm/util/string_piece.hh12
17 files changed, 415 insertions, 94 deletions
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index be84736c..9e38e0f1 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -20,6 +20,7 @@ noinst_LIBRARIES = libklm_util.a
libklm_util_a_SOURCES = \
ersatz_progress.cc \
+ bit_packing.cc \
exception.cc \
file_piece.cc \
mmap.cc \
diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc
new file mode 100644
index 00000000..dd14ffe1
--- /dev/null
+++ b/klm/util/bit_packing.cc
@@ -0,0 +1,27 @@
+#include "util/bit_packing.hh"
+#include "util/exception.hh"
+
+namespace util {
+
+namespace {
+template <bool> struct StaticCheck {};
+template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };
+
+typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;
+
+} // namespace
+
+uint8_t RequiredBits(uint64_t max_value) {
+ if (!max_value) return 0;
+ uint8_t ret = 1;
+ while (max_value >>= 1) ++ret;
+ return ret;
+}
+
+void BitPackingSanity() {
+ const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
+ if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
+ // TODO: more checks.
+}
+
+} // namespace util
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
new file mode 100644
index 00000000..422ed873
--- /dev/null
+++ b/klm/util/bit_packing.hh
@@ -0,0 +1,88 @@
+#ifndef UTIL_BIT_PACKING__
+#define UTIL_BIT_PACKING__
+
+/* Bit-level packing routines */
+
+#include <assert.h>
+#ifdef __APPLE__
+#include <architecture/byte_order.h>
+#else
+#include <endian.h>
+#endif
+
+#include <inttypes.h>
+
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+#error The bit aligned storage functions assume little endian architecture
+#endif
+
+namespace util {
+
+/* WARNING WARNING WARNING:
+ * The write functions assume that memory is zero initially. This makes them
+ * faster and is the appropriate case for mmapped language model construction.
+ * These routines assume that unaligned access to uint64_t is fast and that
+ * storage is little endian. This is the case on x86_64. It may not be the
+ * case on 32-bit x86 but my target audience is large language models for which
+ * 64-bit is necessary.
+ */
+
+/* Pack integers up to 57 bits using their least significant digits.
+ * The length is specified using mask:
+ * Assumes mask == (1 << length) - 1 where length <= 57.
+ */
+inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) {
+ return (*reinterpret_cast<const uint64_t*>(base) >> bit) & mask;
+}
+/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57.
+ * Assumes the memory is zero initially.
+ */
+inline void WriteInt57(void *base, uint8_t bit, uint64_t value) {
+ *reinterpret_cast<uint64_t*>(base) |= (value << bit);
+}
+
+namespace detail { typedef union { float f; uint32_t i; } FloatEnc; }
+inline float ReadFloat32(const void *base, uint8_t bit) {
+ detail::FloatEnc encoded;
+ encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit;
+ return encoded.f;
+}
+inline void WriteFloat32(void *base, uint8_t bit, float value) {
+ detail::FloatEnc encoded;
+ encoded.f = value;
+ WriteInt57(base, bit, encoded.i);
+}
+
+inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) {
+ detail::FloatEnc encoded;
+ encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit;
+ // Sign bit set means negative.
+ encoded.i |= 0x80000000;
+ return encoded.f;
+}
+inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) {
+ assert(value <= 0.0);
+ detail::FloatEnc encoded;
+ encoded.f = value;
+ encoded.i &= ~0x80000000;
+ WriteInt57(base, bit, encoded.i);
+}
+
+void BitPackingSanity();
+
+// Return bits required to store integers upto max_value. Not the most
+// efficient implementation, but this is only called a few times to size tries.
+uint8_t RequiredBits(uint64_t max_value);
+
+struct BitsMask {
+ void FromMax(uint64_t max_value) {
+ bits = RequiredBits(max_value);
+ mask = (1 << bits) - 1;
+ }
+ uint8_t bits;
+ uint64_t mask;
+};
+
+} // namespace util
+
+#endif // UTIL_BIT_PACKING__
diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc
index 09e3a106..55c182bd 100644
--- a/klm/util/ersatz_progress.cc
+++ b/klm/util/ersatz_progress.cc
@@ -13,10 +13,7 @@ ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::s
ErsatzProgress::~ErsatzProgress() {
if (!out_) return;
- for (; stones_written_ < kWidth; ++stones_written_) {
- (*out_) << '*';
- }
- *out_ << '\n';
+ Finished();
}
ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete)
@@ -36,8 +33,8 @@ void ErsatzProgress::Milestone() {
for (; stones_written_ < stone; ++stones_written_) {
(*out_) << '*';
}
-
- if (current_ >= complete_) {
+ if (stone == kWidth) {
+ (*out_) << std::endl;
next_ = std::numeric_limits<std::size_t>::max();
} else {
next_ = std::max(next_, (stone * complete_) / kWidth);
diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh
index ea6c3bb9..92c345fe 100644
--- a/klm/util/ersatz_progress.hh
+++ b/klm/util/ersatz_progress.hh
@@ -19,7 +19,7 @@ class ErsatzProgress {
~ErsatzProgress();
ErsatzProgress &operator++() {
- if (++current_ == next_) Milestone();
+ if (++current_ >= next_) Milestone();
return *this;
}
@@ -33,6 +33,10 @@ class ErsatzProgress {
Milestone();
}
+ void Finished() {
+ Set(complete_);
+ }
+
private:
void Milestone();
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index 2b439499..e7bd8659 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -2,19 +2,23 @@
#include "util/exception.hh"
-#include <iostream>
#include <string>
#include <limits>
#include <assert.h>
-#include <cstdlib>
#include <ctype.h>
+#include <err.h>
#include <fcntl.h>
+#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
+
namespace util {
EndOfFileException::EndOfFileException() throw() {
@@ -26,6 +30,13 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a float";
}
+GZException::GZException(void *file) {
+#ifdef HAVE_ZLIB
+ int num;
+ *this << gzerror(file, &num) << " from zlib";
+#endif // HAVE_ZLIB
+}
+
int OpenReadOrThrow(const char *name) {
int ret = open(name, O_RDONLY);
if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading");
@@ -38,42 +49,73 @@ off_t SizeFile(int fd) {
return sb.st_size;
}
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :
file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_t min_buffer) :
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :
file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
- if (total_size_ == kBadSize) {
- fallback_to_read_ = true;
- if (show_progress)
- *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
- } else {
- fallback_to_read_ = false;
+FilePiece::~FilePiece() {
+#ifdef HAVE_ZLIB
+ if (gz_file_) {
+ // zlib took ownership
+ file_.release();
+ int ret;
+ if (Z_OK != (ret = gzclose(gz_file_))) {
+ errx(1, "could not close file %s using zlib", file_name_.c_str());
+ }
}
+#endif
+}
+
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
+#ifdef HAVE_ZLIB
+ gz_file_ = NULL;
+#endif
+ file_name_ = name;
+
default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = 0;
at_end_ = false;
+
+ if (total_size_ == kBadSize) {
+ // So the assertion passes.
+ fallback_to_read_ = false;
+ if (show_progress)
+ *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
+ TransitionToRead();
+ } else {
+ fallback_to_read_ = false;
+ }
Shift();
+ // gzip detect.
+ if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) {
+#ifndef HAVE_ZLIB
+ UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in.");
+#endif
+ if (!fallback_to_read_) {
+ at_end_ = false;
+ TransitionToRead();
+ }
+ }
}
-float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
+float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
// Hallucinate a null off the end of the file.
std::string buffer(position_, position_end_);
char *end;
- float ret = std::strtof(buffer.c_str(), &end);
+ float ret = strtof(buffer.c_str(), &end);
if (buffer.c_str() == end) throw ParseNumberException(buffer);
position_ += end - buffer.c_str();
return ret;
@@ -81,20 +123,20 @@ float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
Shift();
}
char *end;
- float ret = std::strtof(position_, &end);
+ float ret = strtof(position_, &end);
if (end == position_) throw ParseNumberException(ReadDelimited());
position_ = end;
return ret;
}
-void FilePiece::SkipSpaces() throw (EndOfFileException) {
+void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
for (; ; ++position_) {
if (position_ == position_end_) Shift();
if (!isspace(*position_)) return;
}
}
-const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
+const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) {
for (const char *i = position_; i <= last_space_; ++i) {
if (isspace(*i)) return i;
}
@@ -108,7 +150,7 @@ const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
return position_end_;
}
-StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
+StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
const char *start = position_;
do {
for (const char *i = start; i < position_end_; ++i) {
@@ -124,17 +166,19 @@ StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
} while (!at_end_);
StringPiece ret(position_, position_end_ - position_);
position_ = position_end_;
- return position_;
+ return ret;
}
-void FilePiece::Shift() throw(EndOfFileException) {
- if (at_end_) throw EndOfFileException();
+void FilePiece::Shift() throw(GZException, EndOfFileException) {
+ if (at_end_) {
+ progress_.Finished();
+ throw EndOfFileException();
+ }
off_t desired_begin = position_ - data_.begin() + mapped_offset_;
- progress_.Set(desired_begin);
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
- if (fallback_to_read_) ReadShift(desired_begin);
+ if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
if (isspace(*last_space_)) break;
@@ -163,28 +207,41 @@ void FilePiece::MMapShift(off_t desired_begin) throw() {
data_.reset();
data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
if (data_.get() == MAP_FAILED) {
- fallback_to_read_ = true;
if (desired_begin) {
if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
}
+ // The mmap was scheduled to end the file, but now we're going to read it.
+ at_end_ = false;
+ TransitionToRead();
return;
}
mapped_offset_ = mapped_offset;
position_ = data_.begin() + ignore;
position_end_ = data_.begin() + mapped_size;
+
+ progress_.Set(desired_begin);
+}
+
+void FilePiece::TransitionToRead() throw (GZException) {
+ assert(!fallback_to_read_);
+ fallback_to_read_ = true;
+ data_.reset();
+ data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
+ if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
+ position_ = data_.begin();
+ position_end_ = position_;
+
+#ifdef HAVE_ZLIB
+ assert(!gz_file_);
+ gz_file_ = gzdopen(file_.get(), "r");
+ if (!gz_file_) {
+ UTIL_THROW(GZException, "zlib failed to open " << file_name_);
+ }
+#endif
}
-void FilePiece::ReadShift(off_t desired_begin) throw() {
+void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
assert(fallback_to_read_);
- if (data_.source() != scoped_memory::MALLOC_ALLOCATED) {
- // First call.
- data_.reset();
- data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
- if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
- position_ = data_.begin();
- position_end_ = position_;
- }
-
// Bytes [data_.begin(), position_) have been consumed.
// Bytes [position_, position_end_) have been read into the buffer.
@@ -215,9 +272,23 @@ void FilePiece::ReadShift(off_t desired_begin) throw() {
}
}
- ssize_t read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
+ ssize_t read_return;
+#ifdef HAVE_ZLIB
+ read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
+ if (read_return == -1) throw GZException(gz_file_);
+ if (total_size_ != kBadSize) {
+ // Just get the position, don't actually seek. Apparently this is how you do it. . .
+ off_t ret = lseek(file_.get(), 0, SEEK_CUR);
+ if (ret != -1) progress_.Set(ret);
+ }
+#else
+ read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
- if (read_return == 0) at_end_ = true;
+ progress_.Set(mapped_offset_);
+#endif
+ if (read_return == 0) {
+ at_end_ = true;
+ }
position_end_ += read_return;
}
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index 704f0ac6..11d4a751 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -11,6 +11,8 @@
#include <cstddef>
+#define HAVE_ZLIB
+
namespace util {
class EndOfFileException : public Exception {
@@ -25,6 +27,13 @@ class ParseNumberException : public Exception {
~ParseNumberException() throw() {}
};
+class GZException : public Exception {
+ public:
+ explicit GZException(void *file);
+ GZException() throw() {}
+ ~GZException() throw() {}
+};
+
int OpenReadOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
@@ -34,40 +43,42 @@ off_t SizeFile(int fd);
class FilePiece {
public:
// 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
// Takes ownership of fd. name is used for messages.
- explicit FilePiece(const char *name, int fd, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException);
+
+ ~FilePiece();
- char get() throw(EndOfFileException) {
- if (position_ == position_end_) Shift();
+ char get() throw(GZException, EndOfFileException) {
+ if (position_ == position_end_) {
+ Shift();
+ if (at_end_) throw EndOfFileException();
+ }
return *(position_++);
}
// Memory backing the returned StringPiece may vanish on the next call.
// Leaves the delimiter, if any, to be returned by get().
- StringPiece ReadDelimited() throw(EndOfFileException) {
+ StringPiece ReadDelimited() throw(GZException, EndOfFileException) {
SkipSpaces();
return Consume(FindDelimiterOrEOF());
}
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
// It is similar to getline in that way.
- StringPiece ReadLine(char delim = '\n') throw(EndOfFileException);
+ StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
- float ReadFloat() throw(EndOfFileException, ParseNumberException);
+ float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException);
- void SkipSpaces() throw (EndOfFileException);
+ void SkipSpaces() throw (GZException, EndOfFileException);
off_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
}
- // Only for testing.
- void ForceFallbackToRead() {
- fallback_to_read_ = true;
- }
+ const std::string &FileName() const { return file_name_; }
private:
- void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
+ void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException);
StringPiece Consume(const char *to) {
StringPiece ret(position_, to - position_);
@@ -75,12 +86,14 @@ class FilePiece {
return ret;
}
- const char *FindDelimiterOrEOF() throw(EndOfFileException);
+ const char *FindDelimiterOrEOF() throw(EndOfFileException, GZException);
- void Shift() throw (EndOfFileException);
+ void Shift() throw (EndOfFileException, GZException);
// Backends to Shift().
void MMapShift(off_t desired_begin) throw ();
- void ReadShift(off_t desired_begin) throw ();
+
+ void TransitionToRead() throw (GZException);
+ void ReadShift() throw (GZException, EndOfFileException);
const char *position_, *last_space_, *position_end_;
@@ -98,6 +111,12 @@ class FilePiece {
bool fallback_to_read_;
ErsatzProgress progress_;
+
+ std::string file_name_;
+
+#ifdef HAVE_ZLIB
+ void *gz_file_;
+#endif // HAVE_ZLIB
};
} // namespace util
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index befb7866..23e79fe0 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -1,15 +1,19 @@
#include "util/file_piece.hh"
+#include "util/scoped.hh"
+
#define BOOST_TEST_MODULE FilePieceTest
#include <boost/test/unit_test.hpp>
#include <fstream>
#include <iostream>
+#include <stdio.h>
+
namespace util {
namespace {
/* mmap implementation */
-BOOST_AUTO_TEST_CASE(MMapLine) {
+BOOST_AUTO_TEST_CASE(MMapReadLine) {
std::fstream ref("file_piece.cc", std::ios::in);
FilePiece test("file_piece.cc", NULL, 1);
std::string ref_line;
@@ -20,13 +24,17 @@ BOOST_AUTO_TEST_CASE(MMapLine) {
BOOST_CHECK_EQUAL(ref_line, test_line);
}
}
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
/* read() implementation */
-BOOST_AUTO_TEST_CASE(ReadLine) {
+BOOST_AUTO_TEST_CASE(StreamReadLine) {
std::fstream ref("file_piece.cc", std::ios::in);
- FilePiece test("file_piece.cc", NULL, 1);
- test.ForceFallbackToRead();
+
+ scoped_FILE catter(popen("cat file_piece.cc", "r"));
+ BOOST_REQUIRE(catter.get());
+
+ FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
@@ -35,7 +43,47 @@ BOOST_AUTO_TEST_CASE(ReadLine) {
BOOST_CHECK_EQUAL(ref_line, test_line);
}
}
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
+#ifdef HAVE_ZLIB
+
+// gzip file
+BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
+ std::fstream ref("file_piece.cc", std::ios::in);
+
+ BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz"));
+ FilePiece test("file_piece.cc.gz", NULL, 1);
+ std::string ref_line;
+ while (getline(ref, ref_line)) {
+ StringPiece test_line(test.ReadLine());
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+ if (!test_line.empty() || !ref_line.empty()) {
+ BOOST_CHECK_EQUAL(ref_line, test_line);
+ }
+ }
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
+}
+// gzip stream
+BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
+ std::fstream ref("file_piece.cc", std::ios::in);
+
+ scoped_FILE catter(popen("gzip <file_piece.cc", "r"));
+ BOOST_REQUIRE(catter.get());
+
+ FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1);
+ std::string ref_line;
+ while (getline(ref, ref_line)) {
+ StringPiece test_line(test.ReadLine());
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+ if (!test_line.empty() || !ref_line.empty()) {
+ BOOST_CHECK_EQUAL(ref_line, test_line);
+ }
+ }
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
+}
+
+#endif
+
} // namespace
} // namespace util
diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh
index a2f1c01d..cf3d8432 100644
--- a/klm/util/joint_sort.hh
+++ b/klm/util/joint_sort.hh
@@ -119,6 +119,12 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi
} // namespace detail
+template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
+ public:
+ PairedIterator(const KeyIter &key, const ValueIter &value) :
+ ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
+};
+
template <class KeyIter, class ValueIter, class Less> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) {
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > full_begin(detail::JointProxy<KeyIter, ValueIter>(key_begin, value_begin));
detail::LessWrapper<detail::JointProxy<KeyIter, ValueIter>, Less> less_wrap(less);
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 648b5d0a..8685170f 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -53,10 +53,8 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
if (prefault) {
flags |= MAP_POPULATE;
}
- int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
-#else
- int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
#endif
+ int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
void *ret = mmap(NULL, size, protect, flags, fd, offset);
if (ret == MAP_FAILED) {
UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset);
@@ -64,8 +62,40 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
return ret;
}
-void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset) {
- return MapOrThrow(size, false, MAP_FILE | MAP_PRIVATE, prefault, fd, offset);
+namespace {
+void ReadAll(int fd, void *to_void, std::size_t amount) {
+ uint8_t *to = static_cast<uint8_t*>(to_void);
+ while (amount) {
+ ssize_t ret = read(fd, to, amount);
+ if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+ if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+ amount -= ret;
+ to += ret;
+ }
+}
+} // namespace
+
+void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) {
+ switch (method) {
+ case LAZY:
+ out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
+ break;
+ case POPULATE_OR_LAZY:
+#ifdef MAP_POPULATE
+ case POPULATE_OR_READ:
+#endif
+ out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
+ break;
+#ifndef MAP_POPULATE
+ case POPULATE_OR_READ:
+#endif
+ case READ:
+ out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
+ if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
+ if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed.");
+ ReadAll(fd, out.get(), size);
+ break;
+ }
}
void *MapAnonymous(std::size_t size) {
@@ -78,14 +108,14 @@ void *MapAnonymous(std::size_t size) {
| MAP_PRIVATE, false, -1, 0);
}
-void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem) {
+void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
if (-1 == file.get())
UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing");
if (-1 == ftruncate(file.get(), size))
UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed");
try {
- mem.reset(MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0), size);
+ return MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0);
} catch (ErrnoException &e) {
e << " in file " << name;
throw;
diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh
index c9068ec9..0a504d89 100644
--- a/klm/util/mmap.hh
+++ b/klm/util/mmap.hh
@@ -6,6 +6,7 @@
#include <cstddef>
+#include <inttypes.h>
#include <sys/types.h>
namespace util {
@@ -19,8 +20,8 @@ class scoped_mmap {
void *get() const { return data_; }
- const char *begin() const { return reinterpret_cast<char*>(data_); }
- const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
+ const uint8_t *begin() const { return reinterpret_cast<uint8_t*>(data_); }
+ const uint8_t *end() const { return reinterpret_cast<uint8_t*>(data_) + size_; }
std::size_t size() const { return size_; }
void reset(void *data, std::size_t size) {
@@ -79,23 +80,27 @@ class scoped_memory {
scoped_memory &operator=(const scoped_memory &);
};
-struct scoped_mapped_file {
- scoped_fd fd;
- scoped_mmap mem;
-};
+typedef enum {
+ // mmap with no prepopulate
+ LAZY,
+ // On linux, pass MAP_POPULATE to mmap.
+ POPULATE_OR_LAZY,
+ // Populate on Linux. malloc and read on non-Linux.
+ POPULATE_OR_READ,
+ // malloc and read.
+ READ
+} LoadMethod;
+
// Wrapper around mmap to check it worked and hide some platform macros.
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0);
-void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset = 0);
+void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out);
void *MapAnonymous(std::size_t size);
// Open file name with mmap of size bytes, all of which are initially zero.
-void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem);
-inline void MapZeroedWrite(const char *name, std::size_t size, scoped_mapped_file &out) {
- MapZeroedWrite(name, size, out.fd, out.mem);
-}
-
+void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
+
} // namespace util
-#endif // UTIL_SCOPED__
+#endif // UTIL_MMAP__
diff --git a/klm/util/proxy_iterator.hh b/klm/util/proxy_iterator.hh
index 1c5b7089..121a45fa 100644
--- a/klm/util/proxy_iterator.hh
+++ b/klm/util/proxy_iterator.hh
@@ -78,6 +78,8 @@ template <class Proxy> class ProxyIterator {
const Proxy *operator->() const { return &p_; }
Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); }
+ const InnerIterator &Inner() { return p_.Inner(); }
+
private:
InnerIterator &I() { return p_.Inner(); }
const InnerIterator &I() const { return p_.Inner(); }
diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc
index 61394ffc..2c6d5394 100644
--- a/klm/util/scoped.cc
+++ b/klm/util/scoped.cc
@@ -9,4 +9,8 @@ scoped_fd::~scoped_fd() {
if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_);
}
+scoped_FILE::~scoped_FILE() {
+ if (file_ && fclose(file_)) err(1, "Could not close file");
+}
+
} // namespace util
diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh
index ef62a74f..52864481 100644
--- a/klm/util/scoped.hh
+++ b/klm/util/scoped.hh
@@ -4,6 +4,7 @@
/* Other scoped objects in the style of scoped_ptr. */
#include <cstddef>
+#include <cstdio>
namespace util {
@@ -61,6 +62,24 @@ class scoped_fd {
scoped_fd &operator=(const scoped_fd &);
};
+class scoped_FILE {
+ public:
+ explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
+
+ ~scoped_FILE();
+
+ std::FILE *get() { return file_; }
+ const std::FILE *get() const { return file_; }
+
+ void reset(std::FILE *to = NULL) {
+ scoped_FILE other(file_);
+ file_ = to;
+ }
+
+ private:
+ std::FILE *file_;
+};
+
} // namespace util
#endif // UTIL_SCOPED__
diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh
index 96ec4866..a8e208fb 100644
--- a/klm/util/sorted_uniform.hh
+++ b/klm/util/sorted_uniform.hh
@@ -65,7 +65,7 @@ template <class PackingT> class SortedUniformMap {
public:
// Offer consistent API with probing hash.
- static std::size_t Size(std::size_t entries, float ignore = 0.0) {
+ static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) {
return sizeof(uint64_t) + entries * Packing::kBytes;
}
@@ -75,7 +75,7 @@ template <class PackingT> class SortedUniformMap {
#endif
{}
- SortedUniformMap(void *start, std::size_t allocated) :
+ SortedUniformMap(void *start, std::size_t /*allocated*/) :
begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)),
end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start))
#ifdef DEBUG
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc
index 6917a6bc..5b4e98f5 100644
--- a/klm/util/string_piece.cc
+++ b/klm/util/string_piece.cc
@@ -30,14 +30,14 @@
#include "util/string_piece.hh"
-#ifdef USE_BOOST
+#ifdef HAVE_BOOST
#include <boost/functional/hash/hash.hpp>
#endif
#include <algorithm>
#include <iostream>
-#ifdef USE_ICU
+#ifdef HAVE_ICU
U_NAMESPACE_BEGIN
#endif
@@ -46,12 +46,12 @@ std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
return o;
}
-#ifdef USE_BOOST
+#ifdef HAVE_BOOST
size_t hash_value(const StringPiece &str) {
return boost::hash_range(str.data(), str.data() + str.length());
}
#endif
-#ifdef USE_ICU
+#ifdef HAVE_ICU
U_NAMESPACE_END
#endif
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 58008d13..3ac2f8a7 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -1,4 +1,4 @@
-/* If you use ICU in your program, then compile with -DUSE_ICU -licui18n. If
+/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
* you don't use ICU, then this will use the Google implementation from Chrome.
* This has been modified from the original version to let you choose.
*/
@@ -49,14 +49,14 @@
#define BASE_STRING_PIECE_H__
//Uncomment this line if you use ICU in your code.
-//#define USE_ICU
+//#define HAVE_ICU
//Uncomment this line if you want boost hashing for your StringPieces.
-//#define USE_BOOST
+//#define HAVE_BOOST
#include <cstring>
#include <iosfwd>
-#ifdef USE_ICU
+#ifdef HAVE_ICU
#include <unicode/stringpiece.h>
U_NAMESPACE_BEGIN
#else
@@ -230,7 +230,7 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
// allow StringPiece to be logged (needed for unit testing).
extern std::ostream& operator<<(std::ostream& o, const StringPiece& piece);
-#ifdef USE_BOOST
+#ifdef HAVE_BOOST
size_t hash_value(const StringPiece &str);
/* Support for lookup of StringPiece in boost::unordered_map<std::string> */
@@ -253,7 +253,7 @@ template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece
}
#endif
-#ifdef USE_ICU
+#ifdef HAVE_ICU
U_NAMESPACE_END
#endif