From 47a656c0b6fdba8f91f2c5808234cbb1de682652 Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 10 Nov 2010 02:02:04 +0000 Subject: new version of klm git-svn-id: https://ws10smt.googlecode.com/svn/trunk@706 ec762483-ff6d-05da-a07a-a48fb63a330f --- klm/util/Makefile.am | 1 + klm/util/bit_packing.cc | 27 +++++++++ klm/util/bit_packing.hh | 88 +++++++++++++++++++++++++++ klm/util/ersatz_progress.cc | 9 +-- klm/util/ersatz_progress.hh | 6 +- klm/util/file_piece.cc | 141 +++++++++++++++++++++++++++++++++----------- klm/util/file_piece.hh | 51 +++++++++++----- klm/util/file_piece_test.cc | 56 ++++++++++++++++-- klm/util/joint_sort.hh | 6 ++ klm/util/mmap.cc | 44 +++++++++++--- klm/util/mmap.hh | 31 ++++++---- klm/util/proxy_iterator.hh | 2 + klm/util/scoped.cc | 4 ++ klm/util/scoped.hh | 19 ++++++ klm/util/sorted_uniform.hh | 4 +- klm/util/string_piece.cc | 8 +-- klm/util/string_piece.hh | 12 ++-- 17 files changed, 415 insertions(+), 94 deletions(-) create mode 100644 klm/util/bit_packing.cc create mode 100644 klm/util/bit_packing.hh (limited to 'klm/util') diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index be84736c..9e38e0f1 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -20,6 +20,7 @@ noinst_LIBRARIES = libklm_util.a libklm_util_a_SOURCES = \ ersatz_progress.cc \ + bit_packing.cc \ exception.cc \ file_piece.cc \ mmap.cc \ diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc new file mode 100644 index 00000000..dd14ffe1 --- /dev/null +++ b/klm/util/bit_packing.cc @@ -0,0 +1,27 @@ +#include "util/bit_packing.hh" +#include "util/exception.hh" + +namespace util { + +namespace { +template struct StaticCheck {}; +template <> struct StaticCheck { typedef bool StaticAssertionPassed; }; + +typedef StaticCheck::StaticAssertionPassed FloatSize; + +} // namespace + +uint8_t RequiredBits(uint64_t max_value) { + if (!max_value) return 0; + uint8_t ret = 1; + while (max_value >>= 1) ++ret; + return ret; +} + +void BitPackingSanity() { + const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; + if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); + // TODO: more checks. +} + +} // namespace util diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh new file mode 100644 index 00000000..422ed873 --- /dev/null +++ b/klm/util/bit_packing.hh @@ -0,0 +1,88 @@ +#ifndef UTIL_BIT_PACKING__ +#define UTIL_BIT_PACKING__ + +/* Bit-level packing routines */ + +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include + +#if __BYTE_ORDER != __LITTLE_ENDIAN +#error The bit aligned storage functions assume little endian architecture +#endif + +namespace util { + +/* WARNING WARNING WARNING: + * The write functions assume that memory is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast and that + * storage is little endian. This is the case on x86_64. It may not be the + * case on 32-bit x86 but my target audience is large language models for which + * 64-bit is necessary. + */ + +/* Pack integers up to 57 bits using their least significant digits. + * The length is specified using mask: + * Assumes mask == (1 << length) - 1 where length <= 57. + */ +inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) { + return (*reinterpret_cast(base) >> bit) & mask; +} +/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57. + * Assumes the memory is zero initially. + */ +inline void WriteInt57(void *base, uint8_t bit, uint64_t value) { + *reinterpret_cast(base) |= (value << bit); +} + +namespace detail { typedef union { float f; uint32_t i; } FloatEnc; } +inline float ReadFloat32(const void *base, uint8_t bit) { + detail::FloatEnc encoded; + encoded.i = *reinterpret_cast(base) >> bit; + return encoded.f; +} +inline void WriteFloat32(void *base, uint8_t bit, float value) { + detail::FloatEnc encoded; + encoded.f = value; + WriteInt57(base, bit, encoded.i); +} + +inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) { + detail::FloatEnc encoded; + encoded.i = *reinterpret_cast(base) >> bit; + // Sign bit set means negative. + encoded.i |= 0x80000000; + return encoded.f; +} +inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) { + assert(value <= 0.0); + detail::FloatEnc encoded; + encoded.f = value; + encoded.i &= ~0x80000000; + WriteInt57(base, bit, encoded.i); +} + +void BitPackingSanity(); + +// Return bits required to store integers upto max_value. Not the most +// efficient implementation, but this is only called a few times to size tries. +uint8_t RequiredBits(uint64_t max_value); + +struct BitsMask { + void FromMax(uint64_t max_value) { + bits = RequiredBits(max_value); + mask = (1 << bits) - 1; + } + uint8_t bits; + uint64_t mask; +}; + +} // namespace util + +#endif // UTIL_BIT_PACKING__ diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc index 09e3a106..55c182bd 100644 --- a/klm/util/ersatz_progress.cc +++ b/klm/util/ersatz_progress.cc @@ -13,10 +13,7 @@ ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits= complete_) { + if (stone == kWidth) { + (*out_) << std::endl; next_ = std::numeric_limits::max(); } else { next_ = std::max(next_, (stone * complete_) / kWidth); diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh index ea6c3bb9..92c345fe 100644 --- a/klm/util/ersatz_progress.hh +++ b/klm/util/ersatz_progress.hh @@ -19,7 +19,7 @@ class ErsatzProgress { ~ErsatzProgress(); ErsatzProgress &operator++() { - if (++current_ == next_) Milestone(); + if (++current_ >= next_) Milestone(); return *this; } @@ -33,6 +33,10 @@ class ErsatzProgress { Milestone(); } + void Finished() { + Set(complete_); + } + private: void Milestone(); diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index 2b439499..e7bd8659 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,19 +2,23 @@ #include "util/exception.hh" -#include #include #include #include -#include #include +#include #include +#include #include #include #include #include +#ifdef HAVE_ZLIB +#include +#endif + namespace util { EndOfFileException::EndOfFileException() throw() { @@ -26,6 +30,13 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a float"; } +GZException::GZException(void *file) { +#ifdef HAVE_ZLIB + int num; + *this << gzerror(file, &num) << " from zlib"; +#endif // HAVE_ZLIB +} + int OpenReadOrThrow(const char *name) { int ret = open(name, O_RDONLY); if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading"); @@ -38,42 +49,73 @@ off_t SizeFile(int fd) { return sb.st_size; } -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : +FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) : file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_t min_buffer) : +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) : file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) { - if (total_size_ == kBadSize) { - fallback_to_read_ = true; - if (show_progress) - *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; - } else { - fallback_to_read_ = false; +FilePiece::~FilePiece() { +#ifdef HAVE_ZLIB + if (gz_file_) { + // zlib took ownership + file_.release(); + int ret; + if (Z_OK != (ret = gzclose(gz_file_))) { + errx(1, "could not close file %s using zlib", file_name_.c_str()); + } } +#endif +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) { +#ifdef HAVE_ZLIB + gz_file_ = NULL; +#endif + file_name_ = name; + default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; at_end_ = false; + + if (total_size_ == kBadSize) { + // So the assertion passes. + fallback_to_read_ = false; + if (show_progress) + *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; + TransitionToRead(); + } else { + fallback_to_read_ = false; + } Shift(); + // gzip detect. + if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast(*(position_ + 1)) == 0x8b) { +#ifndef HAVE_ZLIB + UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in."); +#endif + if (!fallback_to_read_) { + at_end_ = false; + TransitionToRead(); + } + } } -float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) { +float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) { SkipSpaces(); while (last_space_ < position_) { if (at_end_) { // Hallucinate a null off the end of the file. std::string buffer(position_, position_end_); char *end; - float ret = std::strtof(buffer.c_str(), &end); + float ret = strtof(buffer.c_str(), &end); if (buffer.c_str() == end) throw ParseNumberException(buffer); position_ += end - buffer.c_str(); return ret; @@ -81,20 +123,20 @@ float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) { Shift(); } char *end; - float ret = std::strtof(position_, &end); + float ret = strtof(position_, &end); if (end == position_) throw ParseNumberException(ReadDelimited()); position_ = end; return ret; } -void FilePiece::SkipSpaces() throw (EndOfFileException) { +void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) { for (; ; ++position_) { if (position_ == position_end_) Shift(); if (!isspace(*position_)) return; } } -const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) { +const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) { for (const char *i = position_; i <= last_space_; ++i) { if (isspace(*i)) return i; } @@ -108,7 +150,7 @@ const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) { return position_end_; } -StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) { +StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) { const char *start = position_; do { for (const char *i = start; i < position_end_; ++i) { @@ -124,17 +166,19 @@ StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) { } while (!at_end_); StringPiece ret(position_, position_end_ - position_); position_ = position_end_; - return position_; + return ret; } -void FilePiece::Shift() throw(EndOfFileException) { - if (at_end_) throw EndOfFileException(); +void FilePiece::Shift() throw(GZException, EndOfFileException) { + if (at_end_) { + progress_.Finished(); + throw EndOfFileException(); + } off_t desired_begin = position_ - data_.begin() + mapped_offset_; - progress_.Set(desired_begin); if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. - if (fallback_to_read_) ReadShift(desired_begin); + if (fallback_to_read_) ReadShift(); for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { if (isspace(*last_space_)) break; @@ -163,28 +207,41 @@ void FilePiece::MMapShift(off_t desired_begin) throw() { data_.reset(); data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED); if (data_.get() == MAP_FAILED) { - fallback_to_read_ = true; if (desired_begin) { if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either."); } + // The mmap was scheduled to end the file, but now we're going to read it. + at_end_ = false; + TransitionToRead(); return; } mapped_offset_ = mapped_offset; position_ = data_.begin() + ignore; position_end_ = data_.begin() + mapped_size; + + progress_.Set(desired_begin); +} + +void FilePiece::TransitionToRead() throw (GZException) { + assert(!fallback_to_read_); + fallback_to_read_ = true; + data_.reset(); + data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); + if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_); + position_ = data_.begin(); + position_end_ = position_; + +#ifdef HAVE_ZLIB + assert(!gz_file_); + gz_file_ = gzdopen(file_.get(), "r"); + if (!gz_file_) { + UTIL_THROW(GZException, "zlib failed to open " << file_name_); + } +#endif } -void FilePiece::ReadShift(off_t desired_begin) throw() { +void FilePiece::ReadShift() throw(GZException, EndOfFileException) { assert(fallback_to_read_); - if (data_.source() != scoped_memory::MALLOC_ALLOCATED) { - // First call. - data_.reset(); - data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); - if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_); - position_ = data_.begin(); - position_end_ = position_; - } - // Bytes [data_.begin(), position_) have been consumed. // Bytes [position_, position_end_) have been read into the buffer. @@ -215,9 +272,23 @@ void FilePiece::ReadShift(off_t desired_begin) throw() { } } - ssize_t read_return = read(file_.get(), static_cast(data_.get()) + already_read, default_map_size_ - already_read); + ssize_t read_return; +#ifdef HAVE_ZLIB + read_return = gzread(gz_file_, static_cast(data_.get()) + already_read, default_map_size_ - already_read); + if (read_return == -1) throw GZException(gz_file_); + if (total_size_ != kBadSize) { + // Just get the position, don't actually seek. Apparently this is how you do it. . . + off_t ret = lseek(file_.get(), 0, SEEK_CUR); + if (ret != -1) progress_.Set(ret); + } +#else + read_return = read(file_.get(), static_cast(data_.get()) + already_read, default_map_size_ - already_read); if (read_return == -1) UTIL_THROW(ErrnoException, "read failed"); - if (read_return == 0) at_end_ = true; + progress_.Set(mapped_offset_); +#endif + if (read_return == 0) { + at_end_ = true; + } position_end_ += read_return; } diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 704f0ac6..11d4a751 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -11,6 +11,8 @@ #include +#define HAVE_ZLIB + namespace util { class EndOfFileException : public Exception { @@ -25,6 +27,13 @@ class ParseNumberException : public Exception { ~ParseNumberException() throw() {} }; +class GZException : public Exception { + public: + explicit GZException(void *file); + GZException() throw() {} + ~GZException() throw() {} +}; + int OpenReadOrThrow(const char *name); // Return value for SizeFile when it can't size properly. @@ -34,40 +43,42 @@ off_t SizeFile(int fd); class FilePiece { public: // 32 MB default. - explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException); // Takes ownership of fd. name is used for messages. - explicit FilePiece(const char *name, int fd, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException); + + ~FilePiece(); - char get() throw(EndOfFileException) { - if (position_ == position_end_) Shift(); + char get() throw(GZException, EndOfFileException) { + if (position_ == position_end_) { + Shift(); + if (at_end_) throw EndOfFileException(); + } return *(position_++); } // Memory backing the returned StringPiece may vanish on the next call. // Leaves the delimiter, if any, to be returned by get(). - StringPiece ReadDelimited() throw(EndOfFileException) { + StringPiece ReadDelimited() throw(GZException, EndOfFileException) { SkipSpaces(); return Consume(FindDelimiterOrEOF()); } // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. // It is similar to getline in that way. - StringPiece ReadLine(char delim = '\n') throw(EndOfFileException); + StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException); - float ReadFloat() throw(EndOfFileException, ParseNumberException); + float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException); - void SkipSpaces() throw (EndOfFileException); + void SkipSpaces() throw (GZException, EndOfFileException); off_t Offset() const { return position_ - data_.begin() + mapped_offset_; } - // Only for testing. - void ForceFallbackToRead() { - fallback_to_read_ = true; - } + const std::string &FileName() const { return file_name_; } private: - void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); + void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException); StringPiece Consume(const char *to) { StringPiece ret(position_, to - position_); @@ -75,12 +86,14 @@ class FilePiece { return ret; } - const char *FindDelimiterOrEOF() throw(EndOfFileException); + const char *FindDelimiterOrEOF() throw(EndOfFileException, GZException); - void Shift() throw (EndOfFileException); + void Shift() throw (EndOfFileException, GZException); // Backends to Shift(). void MMapShift(off_t desired_begin) throw (); - void ReadShift(off_t desired_begin) throw (); + + void TransitionToRead() throw (GZException); + void ReadShift() throw (GZException, EndOfFileException); const char *position_, *last_space_, *position_end_; @@ -98,6 +111,12 @@ class FilePiece { bool fallback_to_read_; ErsatzProgress progress_; + + std::string file_name_; + +#ifdef HAVE_ZLIB + void *gz_file_; +#endif // HAVE_ZLIB }; } // namespace util diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index befb7866..23e79fe0 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,15 +1,19 @@ #include "util/file_piece.hh" +#include "util/scoped.hh" + #define BOOST_TEST_MODULE FilePieceTest #include #include #include +#include + namespace util { namespace { /* mmap implementation */ -BOOST_AUTO_TEST_CASE(MMapLine) { +BOOST_AUTO_TEST_CASE(MMapReadLine) { std::fstream ref("file_piece.cc", std::ios::in); FilePiece test("file_piece.cc", NULL, 1); std::string ref_line; @@ -20,13 +24,17 @@ BOOST_AUTO_TEST_CASE(MMapLine) { BOOST_CHECK_EQUAL(ref_line, test_line); } } + BOOST_CHECK_THROW(test.get(), EndOfFileException); } /* read() implementation */ -BOOST_AUTO_TEST_CASE(ReadLine) { +BOOST_AUTO_TEST_CASE(StreamReadLine) { std::fstream ref("file_piece.cc", std::ios::in); - FilePiece test("file_piece.cc", NULL, 1); - test.ForceFallbackToRead(); + + scoped_FILE catter(popen("cat file_piece.cc", "r")); + BOOST_REQUIRE(catter.get()); + + FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -35,7 +43,47 @@ BOOST_AUTO_TEST_CASE(ReadLine) { BOOST_CHECK_EQUAL(ref_line, test_line); } } + BOOST_CHECK_THROW(test.get(), EndOfFileException); } +#ifdef HAVE_ZLIB + +// gzip file +BOOST_AUTO_TEST_CASE(PlainZipReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + BOOST_REQUIRE_EQUAL(0, system("gzip file_piece.cc.gz")); + FilePiece test("file_piece.cc.gz", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} +// gzip stream +BOOST_AUTO_TEST_CASE(StreamZipReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + scoped_FILE catter(popen("gzip class LessWrapper : public std::binary_functi } // namespace detail +template class PairedIterator : public ProxyIterator > { + public: + PairedIterator(const KeyIter &key, const ValueIter &value) : + ProxyIterator >(detail::JointProxy(key, value)) {} +}; + template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { ProxyIterator > full_begin(detail::JointProxy(key_begin, value_begin)); detail::LessWrapper, Less> less_wrap(less); diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 648b5d0a..8685170f 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -53,10 +53,8 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int if (prefault) { flags |= MAP_POPULATE; } - int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; -#else - int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; #endif + int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; void *ret = mmap(NULL, size, protect, flags, fd, offset); if (ret == MAP_FAILED) { UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset); @@ -64,8 +62,40 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int return ret; } -void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset) { - return MapOrThrow(size, false, MAP_FILE | MAP_PRIVATE, prefault, fd, offset); +namespace { +void ReadAll(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast(to_void); + while (amount) { + ssize_t ret = read(fd, to, amount); + if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); + amount -= ret; + to += ret; + } +} +} // namespace + +void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) { + switch (method) { + case LAZY: + out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; + case POPULATE_OR_LAZY: +#ifdef MAP_POPULATE + case POPULATE_OR_READ: +#endif + out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; +#ifndef MAP_POPULATE + case POPULATE_OR_READ: +#endif + case READ: + out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED); + if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc"); + if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed."); + ReadAll(fd, out.get(), size); + break; + } } void *MapAnonymous(std::size_t size) { @@ -78,14 +108,14 @@ void *MapAnonymous(std::size_t size) { | MAP_PRIVATE, false, -1, 0); } -void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem) { +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); if (-1 == file.get()) UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing"); if (-1 == ftruncate(file.get(), size)) UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed"); try { - mem.reset(MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0), size); + return MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0); } catch (ErrnoException &e) { e << " in file " << name; throw; diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh index c9068ec9..0a504d89 100644 --- a/klm/util/mmap.hh +++ b/klm/util/mmap.hh @@ -6,6 +6,7 @@ #include +#include #include namespace util { @@ -19,8 +20,8 @@ class scoped_mmap { void *get() const { return data_; } - const char *begin() const { return reinterpret_cast(data_); } - const char *end() const { return reinterpret_cast(data_) + size_; } + const uint8_t *begin() const { return reinterpret_cast(data_); } + const uint8_t *end() const { return reinterpret_cast(data_) + size_; } std::size_t size() const { return size_; } void reset(void *data, std::size_t size) { @@ -79,23 +80,27 @@ class scoped_memory { scoped_memory &operator=(const scoped_memory &); }; -struct scoped_mapped_file { - scoped_fd fd; - scoped_mmap mem; -}; +typedef enum { + // mmap with no prepopulate + LAZY, + // On linux, pass MAP_POPULATE to mmap. + POPULATE_OR_LAZY, + // Populate on Linux. malloc and read on non-Linux. + POPULATE_OR_READ, + // malloc and read. + READ +} LoadMethod; + // Wrapper around mmap to check it worked and hide some platform macros. void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0); -void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset = 0); +void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out); void *MapAnonymous(std::size_t size); // Open file name with mmap of size bytes, all of which are initially zero. -void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem); -inline void MapZeroedWrite(const char *name, std::size_t size, scoped_mapped_file &out) { - MapZeroedWrite(name, size, out.fd, out.mem); -} - +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); + } // namespace util -#endif // UTIL_SCOPED__ +#endif // UTIL_MMAP__ diff --git a/klm/util/proxy_iterator.hh b/klm/util/proxy_iterator.hh index 1c5b7089..121a45fa 100644 --- a/klm/util/proxy_iterator.hh +++ b/klm/util/proxy_iterator.hh @@ -78,6 +78,8 @@ template class ProxyIterator { const Proxy *operator->() const { return &p_; } Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } + const InnerIterator &Inner() { return p_.Inner(); } + private: InnerIterator &I() { return p_.Inner(); } const InnerIterator &I() const { return p_.Inner(); } diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc index 61394ffc..2c6d5394 100644 --- a/klm/util/scoped.cc +++ b/klm/util/scoped.cc @@ -9,4 +9,8 @@ scoped_fd::~scoped_fd() { if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_); } +scoped_FILE::~scoped_FILE() { + if (file_ && fclose(file_)) err(1, "Could not close file"); +} + } // namespace util diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh index ef62a74f..52864481 100644 --- a/klm/util/scoped.hh +++ b/klm/util/scoped.hh @@ -4,6 +4,7 @@ /* Other scoped objects in the style of scoped_ptr. */ #include +#include namespace util { @@ -61,6 +62,24 @@ class scoped_fd { scoped_fd &operator=(const scoped_fd &); }; +class scoped_FILE { + public: + explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {} + + ~scoped_FILE(); + + std::FILE *get() { return file_; } + const std::FILE *get() const { return file_; } + + void reset(std::FILE *to = NULL) { + scoped_FILE other(file_); + file_ = to; + } + + private: + std::FILE *file_; +}; + } // namespace util #endif // UTIL_SCOPED__ diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh index 96ec4866..a8e208fb 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -65,7 +65,7 @@ template class SortedUniformMap { public: // Offer consistent API with probing hash. - static std::size_t Size(std::size_t entries, float ignore = 0.0) { + static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { return sizeof(uint64_t) + entries * Packing::kBytes; } @@ -75,7 +75,7 @@ template class SortedUniformMap { #endif {} - SortedUniformMap(void *start, std::size_t allocated) : + SortedUniformMap(void *start, std::size_t /*allocated*/) : begin_(Packing::FromVoid(reinterpret_cast(start) + 1)), end_(begin_), size_ptr_(reinterpret_cast(start)) #ifdef DEBUG diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc index 6917a6bc..5b4e98f5 100644 --- a/klm/util/string_piece.cc +++ b/klm/util/string_piece.cc @@ -30,14 +30,14 @@ #include "util/string_piece.hh" -#ifdef USE_BOOST +#ifdef HAVE_BOOST #include #endif #include #include -#ifdef USE_ICU +#ifdef HAVE_ICU U_NAMESPACE_BEGIN #endif @@ -46,12 +46,12 @@ std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { return o; } -#ifdef USE_BOOST +#ifdef HAVE_BOOST size_t hash_value(const StringPiece &str) { return boost::hash_range(str.data(), str.data() + str.length()); } #endif -#ifdef USE_ICU +#ifdef HAVE_ICU U_NAMESPACE_END #endif diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 58008d13..3ac2f8a7 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -1,4 +1,4 @@ -/* If you use ICU in your program, then compile with -DUSE_ICU -licui18n. If +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If * you don't use ICU, then this will use the Google implementation from Chrome. * This has been modified from the original version to let you choose. */ @@ -49,14 +49,14 @@ #define BASE_STRING_PIECE_H__ //Uncomment this line if you use ICU in your code. -//#define USE_ICU +//#define HAVE_ICU //Uncomment this line if you want boost hashing for your StringPieces. -//#define USE_BOOST +//#define HAVE_BOOST #include #include -#ifdef USE_ICU +#ifdef HAVE_ICU #include U_NAMESPACE_BEGIN #else @@ -230,7 +230,7 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) { // allow StringPiece to be logged (needed for unit testing). extern std::ostream& operator<<(std::ostream& o, const StringPiece& piece); -#ifdef USE_BOOST +#ifdef HAVE_BOOST size_t hash_value(const StringPiece &str); /* Support for lookup of StringPiece in boost::unordered_map */ @@ -253,7 +253,7 @@ template typename T::iterator FindStringPiece(T &t, const StringPiece } #endif -#ifdef USE_ICU +#ifdef HAVE_ICU U_NAMESPACE_END #endif -- cgit v1.2.3