From b35a7f3a96ff8ae42e15922dd6949bf9f5d15501 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 22 Jan 2013 21:37:49 +0000 Subject: KenLM 58da338b --- klm/util/file_piece.hh | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'klm/util/file_piece.hh') diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 53310976..c07c6011 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -9,6 +9,7 @@ #include "util/string_piece.hh" #include +#include #include #include @@ -31,6 +32,13 @@ class FilePiece { // Takes ownership of fd. name is used for messages. explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is + * much faster. But sometimes you just have an istream like Boost's HTTP + * server and want to parse it the same way. + * name is just used for messages and FileName(). + */ + explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); + ~FilePiece(); char get() { @@ -71,6 +79,8 @@ class FilePiece { const std::string &FileName() const { return file_name_; } private: + void InitializeNoRead(const char *name, std::size_t min_buffer); + // Calls InitializeNoRead, so don't call both. 
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template <class T> T ReadNumber(); -- cgit v1.2.3 From abf044fb2e5dcea5ba4dd54ca16c4e75e1c1295b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 23 Jan 2013 21:46:03 +0000 Subject: Completely untested Raw read call --- klm/util/file_piece.cc | 28 +++++++++++++++++++++++++++- klm/util/file_piece.hh | 5 ++++- 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'klm/util/file_piece.hh') diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index 4d143857..9de30fc4 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -51,7 +51,7 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std: FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : total_size_(kBadSize), page_(SizePage()) { - InitializeNoRead("istream", min_buffer); + InitializeNoRead(name ? name : "istream", min_buffer); fallback_to_read_ = true; data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); @@ -95,6 +95,32 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber<unsigned long int>(); } +std::size_t FilePiece::Raw(void *to, std::size_t limit) { + if (!limit) return 0; + std::size_t in_buf = static_cast<std::size_t>(position_end_ - position_); + if (in_buf) { + std::size_t amount = std::min(in_buf, limit); + memcpy(to, position_, amount); + position_ += amount; + return amount; + } + + std::size_t read_return; + if (fallback_to_read_) { + read_return = fell_back_.Read(to, limit); + progress_.Set(fell_back_.RawAmount()); + } else { + uint64_t desired_begin = mapped_offset_ + static_cast<uint64_t>(position_ - data_.begin()); + SeekOrThrow(file_.get(), desired_begin); + read_return = ReadOrEOF(file_.get(), to, limit); + // Good thing we never rewind. This makes desired_begin calculate the right way the next time. 
+ mapped_offset_ += static_cast<uint64_t>(read_return); + progress_ += read_return; + } + at_end_ |= (read_return == 0); + return read_return; +} + // Factored out so that istream can call this. void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { file_name_ = name; diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index c07c6011..1b110287 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -64,7 +64,10 @@ class FilePiece { long int ReadLong(); unsigned long int ReadULong(); - // Skip spaces defined by isspace. + // Fake read() function. Reads up to limit bytes, returning the amount read. Returns 0 on EOF || limit == 0. + std::size_t Raw(void *to, std::size_t limit); + + // Skip spaces defined by being in delim. void SkipSpaces(const bool *delim = kSpaces) { for (; ; ++position_) { if (position_ == position_end_) Shift(); -- cgit v1.2.3