From 00d0b3b462bd9fd230b25b3ab6021c197980bdff Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 28 Feb 2012 17:23:55 -0500 Subject: Subject: where's my kenlm update?? From: Chris Dyer --- klm/util/file_piece.cc | 62 ++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'klm/util/file_piece.cc') diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index b57582a0..081e662b 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,6 +2,10 @@ #include "util/exception.hh" #include "util/file.hh" +#include "util/mmap.hh" +#ifdef WIN32 +#include +#endif // WIN32 #include #include @@ -11,14 +15,8 @@ #include #include #include -#include #include #include -#include - -#ifdef HAVE_ZLIB -#include -#endif namespace util { @@ -26,24 +24,24 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } -GZException::GZException(void *file) { #ifdef HAVE_ZLIB +GZException::GZException(gzFile file) { int num; - *this << gzerror(file, &num) << " from zlib"; -#endif // HAVE_ZLIB + *this << gzerror( file, &num) << " from zlib"; } +#endif // HAVE_ZLIB // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } @@ -63,7 +61,7 @@ FilePiece::~FilePiece() { } StringPiece FilePiece::ReadLine(char delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (*i == delim) { @@ -94,13 +92,13 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) { +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { #ifdef HAVE_ZLIB gz_file_ = NULL; #endif file_name_ = name; - default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); + default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; @@ -130,7 +128,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t namespace { void ParseNumber(const char *begin, char *&end, float &out) { -#ifdef sun +#if defined(sun) || defined(WIN32) out = static_cast(strtod(begin, &end)); #else out = strtof(begin, &end); @@ -171,7 +169,7 @@ template T FilePiece::ReadNumber() { } const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (delim[static_cast(*i)]) return i; @@ -190,7 +188,7 @@ void FilePiece::Shift() { progress_.Finished(); throw EndOfFileException(); } - off_t desired_begin = position_ - data_.begin() + mapped_offset_; + uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. @@ -201,18 +199,18 @@ void FilePiece::Shift() { } } -void FilePiece::MMapShift(off_t desired_begin) { +void FilePiece::MMapShift(uint64_t desired_begin) { // Use mmap. - off_t ignore = desired_begin % page_; + uint64_t ignore = desired_begin % page_; // Duplicate request for Shift means give more data. if (position_ == data_.begin() + ignore) { default_map_size_ *= 2; } // Local version so that in case of failure it doesn't overwrite the class variable. - off_t mapped_offset = desired_begin - ignore; + uint64_t mapped_offset = desired_begin - ignore; - off_t mapped_size; - if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { + uint64_t mapped_size; + if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { at_end_ = true; mapped_size = total_size_ - mapped_offset; } else { @@ -221,15 +219,11 @@ void FilePiece::MMapShift(off_t desired_begin) { // Forcibly clear the existing mmap first. data_.reset(); - data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED - // Populate where available on linux -#ifdef MAP_POPULATE - | MAP_POPULATE -#endif - , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED); - if (data_.get() == MAP_FAILED) { + try { + MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); + } catch (const util::ErrnoException &e) { if (desired_begin) { - if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either."); + SeekOrThrow(*file_, desired_begin); } // The mmap was scheduled to end the file, but now we're going to read it. at_end_ = false; @@ -259,6 +253,10 @@ void FilePiece::TransitionToRead() { #endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. -- cgit v1.2.3