summaryrefslogtreecommitdiff
path: root/klm/util/file_piece.cc
diff options
context:
space:
mode:
Diffstat (limited to 'klm/util/file_piece.cc')
-rw-r--r--klm/util/file_piece.cc181
1 files changed, 103 insertions, 78 deletions
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index af341d6d..9de30fc4 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -1,19 +1,21 @@
#include "util/file_piece.hh"
+#include "util/double-conversion/double-conversion.h"
#include "util/exception.hh"
#include "util/file.hh"
#include "util/mmap.hh"
-#ifdef WIN32
+
+#if defined(_WIN32) || defined(_WIN64)
#include <io.h>
-#endif // WIN32
+#else
+#include <unistd.h>
+#endif
#include <iostream>
#include <string>
#include <limits>
-#include <unistd.h>
#include <assert.h>
-#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/types.h>
@@ -25,13 +27,6 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
-#ifdef HAVE_ZLIB
-GZException::GZException(gzFile file) {
- int num;
- *this << gzerror(file, &num) << " from zlib";
-}
-#endif // HAVE_ZLIB
-
// Lookup table indexed by unsigned char value: true for ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). Spelled out as a literal because this is the only way I could come up with to do a _const_ bool array.
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
@@ -41,26 +36,33 @@ FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+namespace {
+std::string NamePossiblyFind(int fd, const char *name) { // Choose the name to report for fd: the caller-supplied name if there is one, otherwise whatever NameFromFD(fd) returns.
+ if (name) return name; // A non-NULL name always wins.
+ return NameFromFD(fd); // NameFromFD is defined outside this file; presumably it derives a name from the descriptor -- confirm.
+}
+} // namespace
+
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : // Construct from an already-open descriptor; name may be NULL, in which case NamePossiblyFind supplies one.
file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
- progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
- Initialize(name, show_progress, min_buffer);
+ progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) { // progress_ gets a NULL stream when total_size_ is kBadSize.
+ Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); // The temporary std::string lives until the end of this statement, so the c_str() pointer is valid for the duration of the call.
}
-FilePiece::~FilePiece() {
-#ifdef HAVE_ZLIB
- if (gz_file_) {
- // zlib took ownership
- file_.release();
- int ret;
- if (Z_OK != (ret = gzclose(gz_file_))) {
- std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl;
- abort();
- }
- }
-#endif
+FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : // Construct a FilePiece whose input comes from stream instead of a file descriptor.
+ total_size_(kBadSize), page_(SizePage()) { // total_size_ is set to kBadSize; no size is queried for a stream.
+ InitializeNoRead(name ? name : "istream", min_buffer); // Uses the label "istream" when name is NULL.
+
+ fallback_to_read_ = true; // Start directly in the mode where data is pulled through fell_back_.
+ data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); // Buffer of default_map_size_ bytes (computed by InitializeNoRead above).
+ position_ = data_.begin();
+ position_end_ = position_; // Empty range: nothing has been read into the buffer yet.
+
+ fell_back_.Reset(stream); // Hand the stream to fell_back_; Raw and ReadShift later call fell_back_.Read to fetch bytes.
}
+FilePiece::~FilePiece() {} // Empty body: no explicit cleanup is performed here.
+
StringPiece FilePiece::ReadLine(char delim) {
std::size_t skip = 0;
while (true) {
@@ -93,10 +95,34 @@ unsigned long int FilePiece::ReadULong() {
return ReadNumber<unsigned long int>();
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
-#ifdef HAVE_ZLIB
- gz_file_ = NULL;
-#endif
+std::size_t FilePiece::Raw(void *to, std::size_t limit) { // Copy up to limit bytes into to and return how many were copied; returns 0 when limit is 0 or when the underlying read yields nothing.
+ if (!limit) return 0;
+ std::size_t in_buf = static_cast<std::size_t>(position_end_ - position_); // Bytes sitting in the buffer that have not been consumed yet.
+ if (in_buf) { // Buffered bytes are served first; this branch does not top up from the source, so it can return fewer than limit bytes.
+ std::size_t amount = std::min(in_buf, limit);
+ memcpy(to, position_, amount);
+ position_ += amount;
+ return amount;
+ }
+
+ std::size_t read_return; // Buffer is empty: read directly into the caller's memory.
+ if (fallback_to_read_) {
+ read_return = fell_back_.Read(to, limit);
+ progress_.Set(fell_back_.RawAmount());
+ } else {
+ uint64_t desired_begin = mapped_offset_ + static_cast<uint64_t>(position_ - data_.begin()); // Offset of position_ relative to mapped_offset_, used as the seek target.
+ SeekOrThrow(file_.get(), desired_begin);
+ read_return = ReadOrEOF(file_.get(), to, limit);
+ // Good thing we never rewind. This makes desired_begin calculate the right way the next time.
+ mapped_offset_ += static_cast<uint64_t>(read_return);
+ progress_ += read_return;
+ }
+ at_end_ |= (read_return == 0); // A zero-byte read sets at_end_; the flag is never cleared here.
+ return read_return;
+}
+
+// Factored out of Initialize so that the istream constructor can call this.
+void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) {
file_name_ = name;
default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
@@ -104,6 +130,10 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
position_end_ = NULL;
mapped_offset_ = 0;
at_end_ = false;
+}
+
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
+ InitializeNoRead(name, min_buffer);
if (total_size_ == kBadSize) {
// So the assertion passes.
@@ -116,10 +146,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
}
Shift();
// gzip detect.
- if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) {
-#ifndef HAVE_ZLIB
- UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in.");
-#endif
+ if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) {
if (!fallback_to_read_) {
at_end_ = false;
TransitionToRead();
@@ -128,21 +155,33 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
}
namespace {
-void ParseNumber(const char *begin, char *&end, float &out) {
-#if defined(sun) || defined(WIN32)
- out = static_cast<float>(strtod(begin, &end));
-#else
- out = strtof(begin, &end);
-#endif
+
+static const double_conversion::StringToDoubleConverter kConverter( // Shared converter used by the float and double ParseNumber overloads below.
+ double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES, // Flags: accept text after the number and whitespace before it.
+ std::numeric_limits<double>::quiet_NaN(), // Per the double-conversion API this argument is the value returned for empty input -- confirm against the bundled header.
+ std::numeric_limits<double>::quiet_NaN(), // Per the double-conversion API this argument is the value returned for unparseable input -- confirm against the bundled header.
+ "inf", // Spelling accepted for infinity.
+ "NaN"); // Spelling accepted for not-a-number.
+
+void ParseNumber(const char *begin, const char *&end, float &out) { // In: [begin, end) is the text available to parse. Out: end is moved to begin + the number of characters consumed.
+ int count; // Filled in by the converter with the number of characters it processed.
+ out = kConverter.StringToFloat(begin, end - begin, &count);
+ end = begin + count; // Callers compare end with begin to detect that nothing was parsed.
}
-void ParseNumber(const char *begin, char *&end, double &out) {
- out = strtod(begin, &end);
+void ParseNumber(const char *begin, const char *&end, double &out) { // Same contract as the float overload, producing a double.
+ int count;
+ out = kConverter.StringToDouble(begin, end - begin, &count);
+ end = begin + count;
}
-void ParseNumber(const char *begin, char *&end, long int &out) {
- out = strtol(begin, &end, 10);
+void ParseNumber(const char *begin, const char *&end, long int &out) { // Base-10 integer parse; the incoming value of end is ignored, so strtol relies on the text being terminated.
+ char *silly_end; // strtol takes a char **, so a non-const temporary is needed before assigning to the const char *& parameter.
+ out = strtol(begin, &silly_end, 10);
+ end = silly_end;
}
-void ParseNumber(const char *begin, char *&end, unsigned long int &out) {
- out = strtoul(begin, &end, 10);
+void ParseNumber(const char *begin, const char *&end, unsigned long int &out) { // Unsigned counterpart of the long int overload; the incoming value of end is likewise ignored.
+ char *silly_end; // Same const workaround as in the long int overload.
+ out = strtoul(begin, &silly_end, 10);
+ end = silly_end;
}
} // namespace
@@ -152,16 +191,17 @@ template <class T> T FilePiece::ReadNumber() {
if (at_end_) {
// Hallucinate a null off the end of the file.
std::string buffer(position_, position_end_);
- char *end;
+ const char *buf = buffer.c_str();
+ const char *end = buf + buffer.size();
T ret;
- ParseNumber(buffer.c_str(), end, ret);
- if (buffer.c_str() == end) throw ParseNumberException(buffer);
- position_ += end - buffer.c_str();
+ ParseNumber(buf, end, ret);
+ if (buf == end) throw ParseNumberException(buffer);
+ position_ += end - buf;
return ret;
}
Shift();
}
- char *end;
+ const char *end = last_space_;
T ret;
ParseNumber(position_, end, ret);
if (end == position_) throw ParseNumberException(ReadDelimited());
@@ -196,7 +236,7 @@ void FilePiece::Shift() {
if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
- if (isspace(*last_space_)) break;
+ if (kSpaces[static_cast<unsigned char>(*last_space_)]) break;
}
}
@@ -242,22 +282,18 @@ void FilePiece::TransitionToRead() {
assert(!fallback_to_read_);
fallback_to_read_ = true;
data_.reset();
- data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
- UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_);
+ data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
position_ = data_.begin();
position_end_ = position_;
-#ifdef HAVE_ZLIB
- assert(!gz_file_);
- gz_file_ = gzdopen(file_.get(), "r");
- UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
-#endif
+ try {
+ fell_back_.Reset(file_.release());
+ } catch (util::Exception &e) {
+ e << " in file " << file_name_;
+ throw;
+ }
}
-#ifdef WIN32
-typedef int ssize_t;
-#endif
-
void FilePiece::ReadShift() {
assert(fallback_to_read_);
// Bytes [data_.begin(), position_) have been consumed.
@@ -282,7 +318,7 @@ void FilePiece::ReadShift() {
position_ = data_.begin();
position_end_ = position_ + valid_length;
} else {
- size_t moving = position_end_ - position_;
+ std::size_t moving = position_end_ - position_;
memmove(data_.get(), position_, moving);
position_ = data_.begin();
position_end_ = position_ + moving;
@@ -290,20 +326,9 @@ void FilePiece::ReadShift() {
}
}
- ssize_t read_return;
-#ifdef HAVE_ZLIB
- read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
- if (read_return == -1) throw GZException(gz_file_);
- if (total_size_ != kBadSize) {
- // Just get the position, don't actually seek. Apparently this is how you do it. . .
- off_t ret = lseek(file_.get(), 0, SEEK_CUR);
- if (ret != -1) progress_.Set(ret);
- }
-#else
- read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
- UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed");
- progress_.Set(mapped_offset_);
-#endif
+ std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read);
+ progress_.Set(fell_back_.RawAmount());
+
if (read_return == 0) {
at_end_ = true;
}