summaryrefslogtreecommitdiff
path: root/klm/util/file_piece.cc
diff options
context:
space:
mode:
authorKenneth Heafield <kenlm@kheafield.com>2012-02-28 17:23:55 -0500
committerKenneth Heafield <kenlm@kheafield.com>2012-02-28 17:23:55 -0500
commit89238977fc9d8f8d9a6421b0d4f35afc200f08e7 (patch)
treef60871db033d20faaf406af2736f17631f490b44 /klm/util/file_piece.cc
parent1f0ded1e7f59b13d7512111dd910d0f4b2f82d02 (diff)
Subject: where's my kenlm update?? From: Chris Dyer <cdyer@cs.cmu.edu>
Diffstat (limited to 'klm/util/file_piece.cc')
-rw-r--r--klm/util/file_piece.cc62
1 files changed, 30 insertions, 32 deletions
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index b57582a0..081e662b 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -2,6 +2,10 @@
#include "util/exception.hh"
#include "util/file.hh"
+#include "util/mmap.hh"
+#ifdef WIN32
+#include <io.h>
+#endif // WIN32
#include <iostream>
#include <string>
@@ -11,14 +15,8 @@
#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
-#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
-#include <unistd.h>
-
-#ifdef HAVE_ZLIB
-#include <zlib.h>
-#endif
namespace util {
@@ -26,24 +24,24 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
-GZException::GZException(void *file) {
#ifdef HAVE_ZLIB
+GZException::GZException(gzFile file) {
int num;
- *this << gzerror(file, &num) << " from zlib";
-#endif // HAVE_ZLIB
+ *this << gzerror( file, &num) << " from zlib";
}
+#endif // HAVE_ZLIB
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) :
- file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
+ file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
@@ -63,7 +61,7 @@ FilePiece::~FilePiece() {
}
StringPiece FilePiece::ReadLine(char delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (*i == delim) {
@@ -94,13 +92,13 @@ unsigned long int FilePiece::ReadULong() {
return ReadNumber<unsigned long int>();
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
#endif
file_name_ = name;
- default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
+ default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = 0;
@@ -130,7 +128,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t
namespace {
void ParseNumber(const char *begin, char *&end, float &out) {
-#ifdef sun
+#if defined(sun) || defined(WIN32)
out = static_cast<float>(strtod(begin, &end));
#else
out = strtof(begin, &end);
@@ -171,7 +169,7 @@ template <class T> T FilePiece::ReadNumber() {
}
const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
- size_t skip = 0;
+ std::size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
@@ -190,7 +188,7 @@ void FilePiece::Shift() {
progress_.Finished();
throw EndOfFileException();
}
- off_t desired_begin = position_ - data_.begin() + mapped_offset_;
+ uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
@@ -201,18 +199,18 @@ void FilePiece::Shift() {
}
}
-void FilePiece::MMapShift(off_t desired_begin) {
+void FilePiece::MMapShift(uint64_t desired_begin) {
// Use mmap.
- off_t ignore = desired_begin % page_;
+ uint64_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
if (position_ == data_.begin() + ignore) {
default_map_size_ *= 2;
}
// Local version so that in case of failure it doesn't overwrite the class variable.
- off_t mapped_offset = desired_begin - ignore;
+ uint64_t mapped_offset = desired_begin - ignore;
- off_t mapped_size;
- if (default_map_size_ >= static_cast<size_t>(total_size_ - mapped_offset)) {
+ uint64_t mapped_size;
+ if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) {
at_end_ = true;
mapped_size = total_size_ - mapped_offset;
} else {
@@ -221,15 +219,11 @@ void FilePiece::MMapShift(off_t desired_begin) {
// Forcibly clear the existing mmap first.
data_.reset();
- data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED
- // Populate where available on linux
-#ifdef MAP_POPULATE
- | MAP_POPULATE
-#endif
- , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
- if (data_.get() == MAP_FAILED) {
+ try {
+ MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
+ } catch (const util::ErrnoException &e) {
if (desired_begin) {
- if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
+ SeekOrThrow(*file_, desired_begin);
}
// The mmap was scheduled to end the file, but now we're going to read it.
at_end_ = false;
@@ -259,6 +253,10 @@ void FilePiece::TransitionToRead() {
#endif
}
+#ifdef WIN32
+typedef int ssize_t;
+#endif
+
void FilePiece::ReadShift() {
assert(fallback_to_read_);
// Bytes [data_.begin(), position_) have been consumed.