summaryrefslogtreecommitdiff
path: root/klm/util/file_piece.cc
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-11-10 02:02:04 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-11-10 02:02:04 +0000
commit15b03336564d5e57e50693f19dd81b45076af5d4 (patch)
treec2072893a43f4c75f0ad5ebe3080bfa901faf18f /klm/util/file_piece.cc
parent1336aecfe930546f8836ffe65dd5ff78434084eb (diff)
new version of klm
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@706 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'klm/util/file_piece.cc')
-rw-r--r--klm/util/file_piece.cc141
1 files changed, 106 insertions, 35 deletions
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index 2b439499..e7bd8659 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -2,19 +2,23 @@
#include "util/exception.hh"
-#include <iostream>
#include <string>
#include <limits>
#include <assert.h>
-#include <cstdlib>
#include <ctype.h>
+#include <err.h>
#include <fcntl.h>
+#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
+
namespace util {
EndOfFileException::EndOfFileException() throw() {
@@ -26,6 +30,13 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a float";
}
+GZException::GZException(void *file) {
+#ifdef HAVE_ZLIB
+ int num;
+ *this << gzerror(file, &num) << " from zlib";
+#endif // HAVE_ZLIB
+}
+
int OpenReadOrThrow(const char *name) {
int ret = open(name, O_RDONLY);
if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading");
@@ -38,42 +49,73 @@ off_t SizeFile(int fd) {
return sb.st_size;
}
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
+FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :
file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_t min_buffer) :
+FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) :
file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
Initialize(name, show_progress, min_buffer);
}
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) {
- if (total_size_ == kBadSize) {
- fallback_to_read_ = true;
- if (show_progress)
- *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
- } else {
- fallback_to_read_ = false;
+FilePiece::~FilePiece() {
+#ifdef HAVE_ZLIB
+ if (gz_file_) {
+ // zlib took ownership
+ file_.release();
+ int ret;
+ if (Z_OK != (ret = gzclose(gz_file_))) {
+ errx(1, "could not close file %s using zlib", file_name_.c_str());
+ }
}
+#endif
+}
+
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
+#ifdef HAVE_ZLIB
+ gz_file_ = NULL;
+#endif
+ file_name_ = name;
+
default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = 0;
at_end_ = false;
+
+ if (total_size_ == kBadSize) {
+ // So the assertion passes.
+ fallback_to_read_ = false;
+ if (show_progress)
+ *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
+ TransitionToRead();
+ } else {
+ fallback_to_read_ = false;
+ }
Shift();
+ // gzip detect.
+ if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) {
+#ifndef HAVE_ZLIB
+ UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in.");
+#endif
+ if (!fallback_to_read_) {
+ at_end_ = false;
+ TransitionToRead();
+ }
+ }
}
-float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
+float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
// Hallucinate a null off the end of the file.
std::string buffer(position_, position_end_);
char *end;
- float ret = std::strtof(buffer.c_str(), &end);
+ float ret = strtof(buffer.c_str(), &end);
if (buffer.c_str() == end) throw ParseNumberException(buffer);
position_ += end - buffer.c_str();
return ret;
@@ -81,20 +123,20 @@ float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
Shift();
}
char *end;
- float ret = std::strtof(position_, &end);
+ float ret = strtof(position_, &end);
if (end == position_) throw ParseNumberException(ReadDelimited());
position_ = end;
return ret;
}
-void FilePiece::SkipSpaces() throw (EndOfFileException) {
+void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
for (; ; ++position_) {
if (position_ == position_end_) Shift();
if (!isspace(*position_)) return;
}
}
-const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
+const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) {
for (const char *i = position_; i <= last_space_; ++i) {
if (isspace(*i)) return i;
}
@@ -108,7 +150,7 @@ const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) {
return position_end_;
}
-StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
+StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
const char *start = position_;
do {
for (const char *i = start; i < position_end_; ++i) {
@@ -124,17 +166,19 @@ StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) {
} while (!at_end_);
StringPiece ret(position_, position_end_ - position_);
position_ = position_end_;
- return position_;
+ return ret;
}
-void FilePiece::Shift() throw(EndOfFileException) {
- if (at_end_) throw EndOfFileException();
+void FilePiece::Shift() throw(GZException, EndOfFileException) {
+ if (at_end_) {
+ progress_.Finished();
+ throw EndOfFileException();
+ }
off_t desired_begin = position_ - data_.begin() + mapped_offset_;
- progress_.Set(desired_begin);
if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
- if (fallback_to_read_) ReadShift(desired_begin);
+ if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
if (isspace(*last_space_)) break;
@@ -163,28 +207,41 @@ void FilePiece::MMapShift(off_t desired_begin) throw() {
data_.reset();
data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
if (data_.get() == MAP_FAILED) {
- fallback_to_read_ = true;
if (desired_begin) {
if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either.");
}
+ // The mmap was scheduled to end the file, but now we're going to read it.
+ at_end_ = false;
+ TransitionToRead();
return;
}
mapped_offset_ = mapped_offset;
position_ = data_.begin() + ignore;
position_end_ = data_.begin() + mapped_size;
+
+ progress_.Set(desired_begin);
+}
+
+void FilePiece::TransitionToRead() throw (GZException) {
+ assert(!fallback_to_read_);
+ fallback_to_read_ = true;
+ data_.reset();
+ data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
+ if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
+ position_ = data_.begin();
+ position_end_ = position_;
+
+#ifdef HAVE_ZLIB
+ assert(!gz_file_);
+ gz_file_ = gzdopen(file_.get(), "r");
+ if (!gz_file_) {
+ UTIL_THROW(GZException, "zlib failed to open " << file_name_);
+ }
+#endif
}
-void FilePiece::ReadShift(off_t desired_begin) throw() {
+void FilePiece::ReadShift() throw(GZException, EndOfFileException) {
assert(fallback_to_read_);
- if (data_.source() != scoped_memory::MALLOC_ALLOCATED) {
- // First call.
- data_.reset();
- data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
- if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
- position_ = data_.begin();
- position_end_ = position_;
- }
-
// Bytes [data_.begin(), position_) have been consumed.
// Bytes [position_, position_end_) have been read into the buffer.
@@ -215,9 +272,23 @@ void FilePiece::ReadShift(off_t desired_begin) throw() {
}
}
- ssize_t read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
+ ssize_t read_return;
+#ifdef HAVE_ZLIB
+ read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
+ if (read_return == -1) throw GZException(gz_file_);
+ if (total_size_ != kBadSize) {
+ // Just get the position, don't actually seek. Apparently this is how you do it. . .
+ off_t ret = lseek(file_.get(), 0, SEEK_CUR);
+ if (ret != -1) progress_.Set(ret);
+ }
+#else
+ read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
- if (read_return == 0) at_end_ = true;
+ progress_.Set(mapped_offset_);
+#endif
+ if (read_return == 0) {
+ at_end_ = true;
+ }
position_end_ += read_return;
}