From d3e2ec203a5cf550320caa8023ac3dd103b0be7d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Oct 2014 00:42:37 -0400 Subject: new kenlm --- klm/util/file.cc | 142 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 49 deletions(-) (limited to 'klm/util/file.cc') diff --git a/klm/util/file.cc b/klm/util/file.cc index 51eaf972..aa61cf9a 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -5,28 +5,29 @@ #include "util/exception.hh" +#include #include #include -#include #include +#include +#include + #include #include +#include #include #include #include #include -#if defined __MINGW32__ +#if defined(__MINGW32__) #include #include #warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1. Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix" #elif defined(_WIN32) || defined(_WIN64) #include #include -#include -#include -#include #else #include #endif @@ -40,9 +41,9 @@ scoped_fd::~scoped_fd() { } } -scoped_FILE::~scoped_FILE() { - if (file_ && std::fclose(file_)) { - std::cerr << "Could not close file " << std::endl; +void scoped_FILE_closer::Close(std::FILE *file) { + if (file && std::fclose(file)) { + std::cerr << "Could not close file " << file << std::endl; std::abort(); } } @@ -111,7 +112,7 @@ uint64_t SizeOrThrow(int fd) { void ResizeOrThrow(int fd, uint64_t to) { #if defined __MINGW32__ - // Does this handle 64-bit? + // Does this handle 64-bit? int ret = ftruncate #elif defined(_WIN32) || defined(_WIN64) errno_t ret = _chsize_s @@ -128,8 +129,10 @@ namespace { std::size_t GuardLarge(std::size_t size) { // The following operating systems have broken read/write/pread/pwrite that // only supports up to 2^31. + // OS X man pages claim to support 64-bit, but Kareem M. Darwish had problems + // building with larger files, so APPLE is also here. #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__) - return std::min(static_cast(static_cast(-1)), size); + return size < INT_MAX ? size : INT_MAX; #else return size; #endif @@ -172,13 +175,44 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { return amount; } -void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { - uint8_t *to = static_cast(to_void); +void WriteOrThrow(int fd, const void *data_void, std::size_t size) { + const uint8_t *data = static_cast(data_void); + while (size) { #if defined(_WIN32) || defined(_WIN64) - UTIL_THROW(Exception, "This pread implementation for windows is broken. Please send me a patch that does not change the file pointer. Atomically. Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread."); - const std::size_t kMaxDWORD = static_cast(4294967295UL); + int ret; +#else + ssize_t ret; #endif - for (;size ;) { + errno = 0; + do { + ret = +#if defined(_WIN32) || defined(_WIN64) + _write +#else + write +#endif + (fd, data, GuardLarge(size)); + } while (ret == -1 && errno == EINTR); + UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); + data += ret; + size -= ret; + } +} + +void WriteOrThrow(FILE *to, const void *data, std::size_t size) { + if (!size) return; + UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size); +} + +#if defined(_WIN32) || defined(_WIN64) +namespace { +const std::size_t kMaxDWORD = static_cast(4294967295UL); +} // namespace +#endif + +void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) { + uint8_t *to = static_cast(to_void); + while (size) { #if defined(_WIN32) || defined(_WIN64) /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */ // size_t might be 64-bit. DWORD is always 32. @@ -192,16 +226,15 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { #else ssize_t ret; errno = 0; - do { - ret = + ret = #ifdef OS_ANDROID - pread64 + pread64 #else - pread + pread #endif - (fd, to, GuardLarge(size), off); - } while (ret == -1 && errno == EINTR); + (fd, to, GuardLarge(size), off); if (ret <= 0) { + if (ret == -1 && errno == EINTR) continue; UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); } @@ -212,34 +245,41 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { } } -void WriteOrThrow(int fd, const void *data_void, std::size_t size) { - const uint8_t *data = static_cast(data_void); - while (size) { +void ErsatzPWrite(int fd, const void *from_void, std::size_t size, uint64_t off) { + const uint8_t *from = static_cast(from_void); + while(size) { #if defined(_WIN32) || defined(_WIN64) - int ret; + /* Changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() */ + // size_t might be 64-bit. DWORD is always 32. + DWORD writing = static_cast(std::min(kMaxDWORD, size)); + DWORD ret; + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.Offset = static_cast(off); + overlapped.OffsetHigh = static_cast(off >> 32); + UTIL_THROW_IF(!WriteFile((HANDLE)_get_osfhandle(fd), from, writing, &ret, &overlapped), Exception, "WriteFile failed for offset " << off); #else ssize_t ret; -#endif errno = 0; - do { - ret = -#if defined(_WIN32) || defined(_WIN64) - _write + ret = +#ifdef OS_ANDROID + pwrite64 #else - write + pwrite +#endif + (fd, from, GuardLarge(size), off); + if (ret <= 0) { + if (ret == -1 && errno == EINTR) continue; + UTIL_THROW_IF(ret == 0, EndOfFileException, " for writing " << size << " bytes at " << off << " from " << NameFromFD(fd)); + UTIL_THROW_ARG(FDException, (fd), "while writing " << size << " bytes at offset " << off); + } #endif - (fd, data, GuardLarge(size)); - } while (ret == -1 && errno == EINTR); - UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); - data += ret; size -= ret; + off += ret; + from += ret; } } -void WriteOrThrow(FILE *to, const void *data, std::size_t size) { - if (!size) return; - UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size); -} void FSyncOrThrow(int fd) { // Apparently windows doesn't have fsync? @@ -443,8 +483,8 @@ void NormalizeTempPrefix(std::string &base) { ) base += '/'; } -int MakeTemp(const std::string &base) { - std::string name(base); +int MakeTemp(const StringPiece &base) { + std::string name(base.data(), base.size()); name += "XXXXXX"; name.push_back(0); int ret; @@ -452,7 +492,7 @@ int MakeTemp(const std::string &base) { return ret; } -std::FILE *FMakeTemp(const std::string &base) { +std::FILE *FMakeTemp(const StringPiece &base) { util::scoped_fd file(MakeTemp(base)); return FDOpenOrThrow(file); } @@ -478,14 +518,18 @@ bool TryName(int fd, std::string &out) { if (-1 == lstat(name.c_str(), &sb)) return false; out.resize(sb.st_size + 1); - ssize_t ret = readlink(name.c_str(), &out[0], sb.st_size + 1); - if (-1 == ret) - return false; - if (ret > sb.st_size) { - // Increased in size?! - return false; + // lstat gave us a size, but I've seen it grow, possibly due to symlinks on top of symlinks. + while (true) { + ssize_t ret = readlink(name.c_str(), &out[0], out.size()); + if (-1 == ret) + return false; + if ((size_t)ret < out.size()) { + out.resize(ret); + break; + } + // Exponential growth. + out.resize(out.size() * 2); } - out.resize(ret); // Don't use the non-file names. if (!out.empty() && out[0] != '/') return false; -- cgit v1.2.3