diff options
Diffstat (limited to 'klm/util/file_piece.cc')
-rw-r--r-- | klm/util/file_piece.cc | 181 |
1 files changed, 103 insertions, 78 deletions
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index af341d6d..9de30fc4 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -1,19 +1,21 @@ #include "util/file_piece.hh" +#include "util/double-conversion/double-conversion.h" #include "util/exception.hh" #include "util/file.hh" #include "util/mmap.hh" -#ifdef WIN32 + +#if defined(_WIN32) || defined(_WIN64) #include <io.h> -#endif // WIN32 +#else +#include <unistd.h> +#endif #include <iostream> #include <string> #include <limits> -#include <unistd.h> #include <assert.h> -#include <ctype.h> #include <fcntl.h> #include <stdlib.h> #include <sys/types.h> @@ -25,13 +27,6 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } -#ifdef HAVE_ZLIB -GZException::GZException(gzFile file) { - int num; - *this << gzerror(file, &num) << " from zlib"; -} -#endif // HAVE_ZLIB - // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; @@ -41,26 +36,33 @@ FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : +namespace { +std::string NamePossiblyFind(int fd, const char *name) { + if (name) return name; + return NameFromFD(fd); +} +} // namespace + +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), - progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { - Initialize(name, show_progress, min_buffer); + progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) { + Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); } -FilePiece::~FilePiece() { -#ifdef HAVE_ZLIB - if (gz_file_) { - // zlib took ownership - file_.release(); - int ret; - if (Z_OK != (ret = gzclose(gz_file_))) { - std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl; - abort(); - } - } -#endif +FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : + total_size_(kBadSize), page_(SizePage()) { + InitializeNoRead(name ? name : "istream", min_buffer); + + fallback_to_read_ = true; + data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); + position_ = data_.begin(); + position_end_ = position_; + + fell_back_.Reset(stream); } +FilePiece::~FilePiece() {} + StringPiece FilePiece::ReadLine(char delim) { std::size_t skip = 0; while (true) { @@ -93,10 +95,34 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber<unsigned long int>(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { -#ifdef HAVE_ZLIB - gz_file_ = NULL; -#endif +std::size_t FilePiece::Raw(void *to, std::size_t limit) { + if (!limit) return 0; + std::size_t in_buf = static_cast<std::size_t>(position_end_ - position_); + if (in_buf) { + std::size_t amount = std::min(in_buf, limit); + memcpy(to, position_, amount); + position_ += amount; + return amount; + } + + std::size_t read_return; + if (fallback_to_read_) { + read_return = fell_back_.Read(to, limit); + progress_.Set(fell_back_.RawAmount()); + } else { + uint64_t desired_begin = mapped_offset_ + static_cast<uint64_t>(position_ - data_.begin()); + SeekOrThrow(file_.get(), desired_begin); + read_return = ReadOrEOF(file_.get(), to, limit); + // Good thing we never rewind. This makes desired_begin calculate the right way the next time. + mapped_offset_ += static_cast<uint64_t>(read_return); + progress_ += read_return; + } + at_end_ |= (read_return == 0); + return read_return; +} + +// Factored out so that istream can call this. +void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { file_name_ = name; default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2); @@ -104,6 +130,10 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s position_end_ = NULL; mapped_offset_ = 0; at_end_ = false; +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { + InitializeNoRead(name, min_buffer); if (total_size_ == kBadSize) { // So the assertion passes. @@ -116,10 +146,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s } Shift(); // gzip detect. - if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) { -#ifndef HAVE_ZLIB - UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in."); -#endif + if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) { if (!fallback_to_read_) { at_end_ = false; TransitionToRead(); @@ -128,21 +155,33 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s } namespace { -void ParseNumber(const char *begin, char *&end, float &out) { -#if defined(sun) || defined(WIN32) - out = static_cast<float>(strtod(begin, &end)); -#else - out = strtof(begin, &end); -#endif + +static const double_conversion::StringToDoubleConverter kConverter( + double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES, + std::numeric_limits<double>::quiet_NaN(), + std::numeric_limits<double>::quiet_NaN(), + "inf", + "NaN"); + +void ParseNumber(const char *begin, const char *&end, float &out) { + int count; + out = kConverter.StringToFloat(begin, end - begin, &count); + end = begin + count; } -void ParseNumber(const char *begin, char *&end, double &out) { - out = strtod(begin, &end); +void ParseNumber(const char *begin, const char *&end, double &out) { + int count; + out = kConverter.StringToDouble(begin, end - begin, &count); + end = begin + count; } -void ParseNumber(const char *begin, char *&end, long int &out) { - out = strtol(begin, &end, 10); +void ParseNumber(const char *begin, const char *&end, long int &out) { + char *silly_end; + out = strtol(begin, &silly_end, 10); + end = silly_end; } -void ParseNumber(const char *begin, char *&end, unsigned long int &out) { - out = strtoul(begin, &end, 10); +void ParseNumber(const char *begin, const char *&end, unsigned long int &out) { + char *silly_end; + out = strtoul(begin, &silly_end, 10); + end = silly_end; } } // namespace @@ -152,16 +191,17 @@ template <class T> T FilePiece::ReadNumber() { if (at_end_) { // Hallucinate a null off the end of the file. std::string buffer(position_, position_end_); - char *end; + const char *buf = buffer.c_str(); + const char *end = buf + buffer.size(); T ret; - ParseNumber(buffer.c_str(), end, ret); - if (buffer.c_str() == end) throw ParseNumberException(buffer); - position_ += end - buffer.c_str(); + ParseNumber(buf, end, ret); + if (buf == end) throw ParseNumberException(buffer); + position_ += end - buf; return ret; } Shift(); } - char *end; + const char *end = last_space_; T ret; ParseNumber(position_, end, ret); if (end == position_) throw ParseNumberException(ReadDelimited()); @@ -196,7 +236,7 @@ void FilePiece::Shift() { if (fallback_to_read_) ReadShift(); for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { - if (isspace(*last_space_)) break; + if (kSpaces[static_cast<unsigned char>(*last_space_)]) break; } } @@ -242,22 +282,18 @@ void FilePiece::TransitionToRead() { assert(!fallback_to_read_); fallback_to_read_ = true; data_.reset(); - data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); - UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_); + data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); position_ = data_.begin(); position_end_ = position_; -#ifdef HAVE_ZLIB - assert(!gz_file_); - gz_file_ = gzdopen(file_.get(), "r"); - UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_); -#endif + try { + fell_back_.Reset(file_.release()); + } catch (util::Exception &e) { + e << " in file " << file_name_; + throw; + } } -#ifdef WIN32 -typedef int ssize_t; -#endif - void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. @@ -282,7 +318,7 @@ void FilePiece::ReadShift() { position_ = data_.begin(); position_end_ = position_ + valid_length; } else { - size_t moving = position_end_ - position_; + std::size_t moving = position_end_ - position_; memmove(data_.get(), position_, moving); position_ = data_.begin(); position_end_ = position_ + moving; @@ -290,20 +326,9 @@ void FilePiece::ReadShift() { } } - ssize_t read_return; -#ifdef HAVE_ZLIB - read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); - if (read_return == -1) throw GZException(gz_file_); - if (total_size_ != kBadSize) { - // Just get the position, don't actually seek. Apparently this is how you do it. . . - off_t ret = lseek(file_.get(), 0, SEEK_CUR); - if (ret != -1) progress_.Set(ret); - } -#else - read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); - UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed"); - progress_.Set(mapped_offset_); -#endif + std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read); + progress_.Set(fell_back_.RawAmount()); + if (read_return == 0) { at_end_ = true; } |