summary | refs | log | tree | commit | diff
path: root/klm/util
diff options
context:
space:
mode:
author: Kenneth Heafield <github@kheafield.com> 2012-12-14 12:48:26 -0800
committer: Kenneth Heafield <github@kheafield.com> 2012-12-14 12:48:26 -0800
commit: dd0fdabb1db41a4230e487c80b61ace9697f150d (patch)
tree: 82686a3461645f83343a1926482007946d5a542c /klm/util
parent: 5d42134cc676278189b0f77708908542fbb5ccc9 (diff)
Updated kenlm
Diffstat (limited to 'klm/util')
-rw-r--r-- klm/util/Makefile.am | 1
-rw-r--r-- klm/util/exception.hh | 8
-rw-r--r-- klm/util/file.cc | 38
-rw-r--r-- klm/util/file.hh | 8
-rw-r--r-- klm/util/file_piece.cc | 66
-rw-r--r-- klm/util/file_piece.hh | 41
-rw-r--r-- klm/util/file_piece_test.cc | 4
-rw-r--r-- klm/util/have.hh | 12
-rw-r--r-- klm/util/joint_sort.hh | 4
-rw-r--r-- klm/util/read_compressed.cc | 403
-rw-r--r-- klm/util/read_compressed.hh | 74
-rw-r--r-- klm/util/read_compressed_test.cc | 94
-rw-r--r-- klm/util/scoped.hh | 65
-rw-r--r-- klm/util/string_piece.hh | 19
-rw-r--r-- klm/util/tokenize_piece.hh | 14
15 files changed, 698 insertions, 153 deletions
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index 5306850f..a676bdb3 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -27,6 +27,7 @@ libklm_util_a_SOURCES = \
mmap.cc \
murmur_hash.cc \
pool.cc \
+ read_compressed.cc \
string_piece.cc \
usage.cc
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
index 053a850b..0165a7a3 100644
--- a/klm/util/exception.hh
+++ b/klm/util/exception.hh
@@ -87,8 +87,14 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep
throw UTIL_e; \
} while (0)
+#if __GNUC__ >= 3
+#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
+#else
+#define UTIL_UNLIKELY(x) (x)
+#endif
+
#define UTIL_THROW_IF(Condition, Exception, Modify) do { \
- if (Condition) { \
+ if (UTIL_UNLIKELY(Condition)) { \
Exception UTIL_e; \
UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \
UTIL_e << Modify; \
diff --git a/klm/util/file.cc b/klm/util/file.cc
index 6bf879ac..b9a77cf9 100644
--- a/klm/util/file.cc
+++ b/klm/util/file.cc
@@ -15,6 +15,8 @@
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <io.h>
+#include <algorithm>
+#include <limits.h>
#else
#include <unistd.h>
#endif
@@ -48,7 +50,7 @@ int OpenReadOrThrow(const char *name) {
int CreateOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
- UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
@@ -74,16 +76,22 @@ void ResizeOrThrow(int fd, uint64_t to) {
#endif
}
-#ifdef WIN32
-typedef int ssize_t;
+std::size_t PartialRead(int fd, void *to, std::size_t amount) {
+#if defined(_WIN32) || defined(_WIN64)
+ amount = min(static_cast<std::size_t>(INT_MAX), amount);
+ int ret = _read(fd, to, amount);
+#else
+ ssize_t ret = read(fd, to, amount);
#endif
+ UTIL_THROW_IF(ret < 0, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+ return static_cast<std::size_t>(ret);
+}
void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) {
- ssize_t ret = read(fd, to, amount);
- UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
- UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+ std::size_t ret = PartialRead(fd, to, amount);
+ UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret;
to += ret;
}
@@ -93,8 +101,7 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void);
std::size_t remaining = amount;
while (remaining) {
- ssize_t ret = read(fd, to, remaining);
- UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed.");
+ std::size_t ret = PartialRead(fd, to, remaining);
if (!ret) return amount - remaining;
remaining -= ret;
to += ret;
@@ -105,7 +112,11 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
+#if defined(_WIN32) || defined(_WIN64)
+ int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
+#else
ssize_t ret = write(fd, data, size);
+#endif
if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
data += ret;
size -= ret;
@@ -114,7 +125,7 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
assert(size);
- if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size);
+ UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), util::ErrnoException, "Short write; requested size " << size);
}
void FSyncOrThrow(int fd) {
@@ -149,14 +160,15 @@ void SeekEnd(int fd) {
std::FILE *FDOpenOrThrow(scoped_fd &file) {
std::FILE *ret = fdopen(file.get(), "r+b");
- if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
file.release();
return ret;
}
-std::FILE *FOpenOrThrow(const char *path, const char *mode) {
- std::FILE *ret;
- UTIL_THROW_IF(!(ret = fopen(path, mode)), util::ErrnoException, "Could not fopen " << path << " for " << mode);
+std::FILE *FDOpenReadOrThrow(scoped_fd &file) {
+ std::FILE *ret = fdopen(file.get(), "rb");
+ if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen descriptor " << file.get());
+ file.release();
return ret;
}
diff --git a/klm/util/file.hh b/klm/util/file.hh
index 185cb1f3..c24580d6 100644
--- a/klm/util/file.hh
+++ b/klm/util/file.hh
@@ -32,8 +32,6 @@ class scoped_fd {
return ret;
}
- operator bool() { return fd_ != -1; }
-
private:
int fd_;
@@ -76,8 +74,9 @@ uint64_t SizeFile(int fd);
void ResizeOrThrow(int fd, uint64_t to);
+std::size_t PartialRead(int fd, void *to, std::size_t size);
void ReadOrThrow(int fd, void *to, std::size_t size);
-std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
+std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size);
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
void WriteOrThrow(FILE *to, const void *data, std::size_t size);
@@ -90,8 +89,7 @@ void AdvanceOrThrow(int fd, int64_t off);
void SeekEnd(int fd);
std::FILE *FDOpenOrThrow(scoped_fd &file);
-
-std::FILE *FOpenOrThrow(const char *path, const char *mode);
+std::FILE *FDOpenReadOrThrow(scoped_fd &file);
class TempMaker {
public:
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index 280f438c..5a208eff 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -14,7 +14,6 @@
#include <limits>
#include <assert.h>
-#include <ctype.h>
#include <fcntl.h>
#include <stdlib.h>
#include <sys/types.h>
@@ -26,13 +25,6 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
-#ifdef HAVE_ZLIB
-GZException::GZException(gzFile file) {
- int num;
- *this << gzerror(file, &num) << " from zlib";
-}
-#endif // HAVE_ZLIB
-
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
@@ -48,19 +40,7 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std:
Initialize(name, show_progress, min_buffer);
}
-FilePiece::~FilePiece() {
-#ifdef HAVE_ZLIB
- if (gz_file_) {
- // zlib took ownership
- file_.release();
- int ret;
- if (Z_OK != (ret = gzclose(gz_file_))) {
- std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl;
- abort();
- }
- }
-#endif
-}
+FilePiece::~FilePiece() {}
StringPiece FilePiece::ReadLine(char delim) {
std::size_t skip = 0;
@@ -95,9 +75,6 @@ unsigned long int FilePiece::ReadULong() {
}
void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
-#ifdef HAVE_ZLIB
- gz_file_ = NULL;
-#endif
file_name_ = name;
default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
@@ -117,10 +94,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
}
Shift();
// gzip detect.
- if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) {
-#ifndef HAVE_ZLIB
- UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in.");
-#endif
+ if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) {
if (!fallback_to_read_) {
at_end_ = false;
TransitionToRead();
@@ -197,7 +171,7 @@ void FilePiece::Shift() {
if (fallback_to_read_) ReadShift();
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
- if (isspace(*last_space_)) break;
+ if (kSpaces[static_cast<unsigned char>(*last_space_)]) break;
}
}
@@ -248,17 +222,14 @@ void FilePiece::TransitionToRead() {
position_ = data_.begin();
position_end_ = position_;
-#ifdef HAVE_ZLIB
- assert(!gz_file_);
- gz_file_ = gzdopen(file_.get(), "r");
- UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_);
-#endif
+ try {
+ fell_back_.Reset(file_.release());
+ } catch (util::Exception &e) {
+ e << " in file " << file_name_;
+ throw;
+ }
}
-#ifdef WIN32
-typedef int ssize_t;
-#endif
-
void FilePiece::ReadShift() {
assert(fallback_to_read_);
// Bytes [data_.begin(), position_) have been consumed.
@@ -283,7 +254,7 @@ void FilePiece::ReadShift() {
position_ = data_.begin();
position_end_ = position_ + valid_length;
} else {
- size_t moving = position_end_ - position_;
+ std::size_t moving = position_end_ - position_;
memmove(data_.get(), position_, moving);
position_ = data_.begin();
position_end_ = position_ + moving;
@@ -291,20 +262,9 @@ void FilePiece::ReadShift() {
}
}
- ssize_t read_return;
-#ifdef HAVE_ZLIB
- read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
- if (read_return == -1) throw GZException(gz_file_);
- if (total_size_ != kBadSize) {
- // Just get the position, don't actually seek. Apparently this is how you do it. . .
- off_t ret = lseek(file_.get(), 0, SEEK_CUR);
- if (ret != -1) progress_.Set(ret);
- }
-#else
- read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
- UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed");
- progress_.Set(mapped_offset_);
-#endif
+ std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read);
+ progress_.Set(fell_back_.RawAmount());
+
if (read_return == 0) {
at_end_ = true;
}
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index af93d8aa..39bd1581 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -4,8 +4,8 @@
#include "util/ersatz_progress.hh"
#include "util/exception.hh"
#include "util/file.hh"
-#include "util/have.hh"
#include "util/mmap.hh"
+#include "util/read_compressed.hh"
#include "util/string_piece.hh"
#include <cstddef>
@@ -13,10 +13,6 @@
#include <stdint.h>
-#ifdef HAVE_ZLIB
-#include <zlib.h>
-#endif
-
namespace util {
class ParseNumberException : public Exception {
@@ -25,28 +21,19 @@ class ParseNumberException : public Exception {
~ParseNumberException() throw() {}
};
-class GZException : public Exception {
- public:
-#ifdef HAVE_ZLIB
- explicit GZException(gzFile file);
-#endif
- GZException() throw() {}
- ~GZException() throw() {}
-};
-
extern const bool kSpaces[256];
-// Memory backing the returned StringPiece may vanish on the next call.
+// Memory backing the returned StringPiece may vanish on the next call.
class FilePiece {
public:
- // 32 MB default.
- explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
- // Takes ownership of fd. name is used for messages.
- explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432);
+ // 1 MB default.
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
+ // Takes ownership of fd. name is used for messages.
+ explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
~FilePiece();
-
- char get() {
+
+ char get() {
if (position_ == position_end_) {
Shift();
if (at_end_) throw EndOfFileException();
@@ -54,14 +41,14 @@ class FilePiece {
return *(position_++);
}
- // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
+ // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
StringPiece ReadDelimited(const bool *delim = kSpaces) {
SkipSpaces(delim);
return Consume(FindDelimiterOrEOF(delim));
}
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
- // It is similar to getline in that way.
+ // It is similar to getline in that way.
StringPiece ReadLine(char delim = '\n');
float ReadFloat();
@@ -69,7 +56,7 @@ class FilePiece {
long int ReadLong();
unsigned long int ReadULong();
- // Skip spaces defined by isspace.
+ // Skip spaces defined by isspace.
void SkipSpaces(const bool *delim = kSpaces) {
for (; ; ++position_) {
if (position_ == position_end_) Shift();
@@ -82,7 +69,7 @@ class FilePiece {
}
const std::string &FileName() const { return file_name_; }
-
+
private:
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
@@ -122,9 +109,7 @@ class FilePiece {
std::string file_name_;
-#ifdef HAVE_ZLIB
- gzFile gz_file_;
-#endif // HAVE_ZLIB
+ ReadCompressed fell_back_;
};
} // namespace util
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index f912e18a..e79ece7a 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -38,7 +38,7 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {
BOOST_CHECK_THROW(test.get(), EndOfFileException);
}
-#ifndef __APPLE__
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
/* Apple isn't happy with the popen, fileno, dup. And I don't want to
* reimplement popen. This is an issue with the test.
*/
@@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {
BOOST_CHECK_THROW(test.get(), EndOfFileException);
BOOST_REQUIRE(!pclose(catter));
}
-#endif // __APPLE__
+#endif
#ifdef HAVE_ZLIB
diff --git a/klm/util/have.hh b/klm/util/have.hh
index b8181e99..1523c0c5 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -2,22 +2,12 @@
#ifndef UTIL_HAVE__
#define UTIL_HAVE__
-#ifndef HAVE_ZLIB
-#if !defined(_WIN32) && !defined(_WIN64)
-#define HAVE_ZLIB
-#endif
-#endif
-
#ifndef HAVE_ICU
//#define HAVE_ICU
#endif
#ifndef HAVE_BOOST
-#define HAVE_BOOST
-#endif
-
-#ifndef HAVE_THREADS
-//#define HAVE_THREADS
+//#define HAVE_BOOST
#endif
#endif // UTIL_HAVE__
diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh
index cf3d8432..1b43ddcf 100644
--- a/klm/util/joint_sort.hh
+++ b/klm/util/joint_sort.hh
@@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
- operator const value_type() const {
+ operator value_type() const {
value_type ret;
ret.key = *inner_.key_;
ret.value = *inner_.value_;
@@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
public:
- PairedIterator(const KeyIter &key, const ValueIter &value) :
+ PairedIterator(const KeyIter &key, const ValueIter &value) :
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
};
diff --git a/klm/util/read_compressed.cc b/klm/util/read_compressed.cc
new file mode 100644
index 00000000..4ec94c4e
--- /dev/null
+++ b/klm/util/read_compressed.cc
@@ -0,0 +1,403 @@
+#include "util/read_compressed.hh"
+
+#include "util/file.hh"
+#include "util/have.hh"
+#include "util/scoped.hh"
+
+#include <algorithm>
+#include <iostream>
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef HAVE_BZLIB
+#include <bzlib.h>
+#endif
+
+#ifdef HAVE_XZLIB
+#include <lzma.h>
+#endif
+
+namespace util {
+
+CompressedException::CompressedException() throw() {}
+CompressedException::~CompressedException() throw() {}
+
+GZException::GZException() throw() {}
+GZException::~GZException() throw() {}
+
+BZException::BZException() throw() {}
+BZException::~BZException() throw() {}
+
+XZException::XZException() throw() {}
+XZException::~XZException() throw() {}
+
+class ReadBase {
+ public:
+ virtual ~ReadBase() {}
+
+ virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;
+
+ protected:
+ static void ReplaceThis(ReadBase *with, ReadCompressed &thunk) {
+ thunk.internal_.reset(with);
+ }
+
+ static uint64_t &ReadCount(ReadCompressed &thunk) {
+ return thunk.raw_amount_;
+ }
+};
+
+namespace {
+
+// Completed file that other classes can thunk to.
+class Complete : public ReadBase {
+ public:
+ std::size_t Read(void *, std::size_t, ReadCompressed &) {
+ return 0;
+ }
+};
+
+class Uncompressed : public ReadBase {
+ public:
+ explicit Uncompressed(int fd) : fd_(fd) {}
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ std::size_t got = PartialRead(fd_.get(), to, amount);
+ ReadCount(thunk) += got;
+ return got;
+ }
+
+ private:
+ scoped_fd fd_;
+};
+
+class UncompressedWithHeader : public ReadBase {
+ public:
+ UncompressedWithHeader(int fd, void *already_data, std::size_t already_size) : fd_(fd) {
+ assert(already_size);
+ buf_.reset(malloc(already_size));
+ if (!buf_.get()) throw std::bad_alloc();
+ memcpy(buf_.get(), already_data, already_size);
+ remain_ = static_cast<uint8_t*>(buf_.get());
+ end_ = remain_ + already_size;
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ assert(buf_.get());
+ std::size_t sending = std::min<std::size_t>(amount, end_ - remain_);
+ memcpy(to, remain_, sending);
+ remain_ += sending;
+ if (remain_ == end_) {
+ ReplaceThis(new Uncompressed(fd_.release()), thunk);
+ }
+ return sending;
+ }
+
+ private:
+ scoped_malloc buf_;
+ uint8_t *remain_;
+ uint8_t *end_;
+
+ scoped_fd fd_;
+};
+
+#ifdef HAVE_ZLIB
+class GZip : public ReadBase {
+ private:
+ static const std::size_t kInputBuffer = 16384;
+ public:
+ GZip(int fd, void *already_data, std::size_t already_size)
+ : file_(fd), in_buffer_(malloc(kInputBuffer)) {
+ if (!in_buffer_.get()) throw std::bad_alloc();
+ assert(already_size < kInputBuffer);
+ if (already_size) {
+ memcpy(in_buffer_.get(), already_data, already_size);
+ stream_.next_in = static_cast<Bytef *>(in_buffer_.get());
+ stream_.avail_in = already_size;
+ stream_.avail_in += ReadOrEOF(file_.get(), static_cast<uint8_t*>(in_buffer_.get()) + already_size, kInputBuffer - already_size);
+ } else {
+ stream_.avail_in = 0;
+ }
+ stream_.zalloc = Z_NULL;
+ stream_.zfree = Z_NULL;
+ stream_.opaque = Z_NULL;
+ stream_.msg = NULL;
+ // 32 for zlib and gzip decoding with automatic header detection.
+ // 15 for maximum window size.
+ UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib.");
+ }
+
+ ~GZip() {
+ if (Z_OK != inflateEnd(&stream_)) {
+ std::cerr << "zlib could not close properly." << std::endl;
+ abort();
+ }
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ if (amount == 0) return 0;
+ stream_.next_out = static_cast<Bytef*>(to);
+ stream_.avail_out = std::min<std::size_t>(std::numeric_limits<uInt>::max(), amount);
+ do {
+ if (!stream_.avail_in) ReadInput(thunk);
+ int result = inflate(&stream_, 0);
+ switch (result) {
+ case Z_OK:
+ break;
+ case Z_STREAM_END:
+ {
+ std::size_t ret = static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ }
+ case Z_ERRNO:
+ UTIL_THROW(ErrnoException, "zlib error");
+ default:
+ UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result);
+ }
+ } while (stream_.next_out == to);
+ return static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ }
+
+ private:
+ void ReadInput(ReadCompressed &thunk) {
+ assert(!stream_.avail_in);
+ stream_.next_in = static_cast<Bytef *>(in_buffer_.get());
+ stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer);
+ ReadCount(thunk) += stream_.avail_in;
+ }
+
+ scoped_fd file_;
+ scoped_malloc in_buffer_;
+ z_stream stream_;
+};
+#endif // HAVE_ZLIB
+
+#ifdef HAVE_BZLIB
+class BZip : public ReadBase {
+ public:
+ explicit BZip(int fd, void *already_data, std::size_t already_size) {
+ scoped_fd hold(fd);
+ closer_.reset(FDOpenReadOrThrow(hold));
+ int bzerror = BZ_OK;
+ file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size);
+ switch (bzerror) {
+ case BZ_OK:
+ return;
+ case BZ_CONFIG_ERROR:
+ UTIL_THROW(BZException, "Looks like bzip2 was miscompiled.");
+ case BZ_PARAM_ERROR:
+ UTIL_THROW(BZException, "Parameter error");
+ case BZ_IO_ERROR:
+ UTIL_THROW(BZException, "IO error reading file");
+ case BZ_MEM_ERROR:
+ throw std::bad_alloc();
+ }
+ }
+
+ ~BZip() {
+ int bzerror = BZ_OK;
+ BZ2_bzReadClose(&bzerror, file_);
+ if (bzerror != BZ_OK) {
+ std::cerr << "bz2 readclose error" << std::endl;
+ abort();
+ }
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ int bzerror = BZ_OK;
+ int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
+ long pos;
+ switch (bzerror) {
+ case BZ_STREAM_END:
+ pos = ftell(closer_.get());
+ if (pos != -1) ReadCount(thunk) = pos;
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ case BZ_OK:
+ pos = ftell(closer_.get());
+ if (pos != -1) ReadCount(thunk) = pos;
+ return ret;
+ default:
+ UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+ }
+ }
+
+ private:
+ scoped_FILE closer_;
+ BZFILE *file_;
+};
+#endif // HAVE_BZLIB
+
+#ifdef HAVE_XZLIB
+class XZip : public ReadBase {
+ private:
+ static const std::size_t kInputBuffer = 16384;
+ public:
+ XZip(int fd, void *already_data, std::size_t already_size)
+ : file_(fd), in_buffer_(malloc(kInputBuffer)), stream_(), action_(LZMA_RUN) {
+ if (!in_buffer_.get()) throw std::bad_alloc();
+ assert(already_size < kInputBuffer);
+ if (already_size) {
+ memcpy(in_buffer_.get(), already_data, already_size);
+ stream_.next_in = static_cast<const uint8_t*>(in_buffer_.get());
+ stream_.avail_in = already_size;
+ stream_.avail_in += ReadOrEOF(file_.get(), static_cast<uint8_t*>(in_buffer_.get()) + already_size, kInputBuffer - already_size);
+ } else {
+ stream_.avail_in = 0;
+ }
+ stream_.allocator = NULL;
+ lzma_ret ret = lzma_stream_decoder(&stream_, UINT64_MAX, LZMA_CONCATENATED);
+ switch (ret) {
+ case LZMA_OK:
+ break;
+ case LZMA_MEM_ERROR:
+ UTIL_THROW(ErrnoException, "xz open error");
+ default:
+ UTIL_THROW(XZException, "xz error code " << ret);
+ }
+ }
+
+ ~XZip() {
+ lzma_end(&stream_);
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ if (amount == 0) return 0;
+ stream_.next_out = static_cast<uint8_t*>(to);
+ stream_.avail_out = amount;
+ do {
+ if (!stream_.avail_in) ReadInput(thunk);
+ lzma_ret status = lzma_code(&stream_, action_);
+ switch (status) {
+ case LZMA_OK:
+ break;
+ case LZMA_STREAM_END:
+ UTIL_THROW_IF(action_ != LZMA_FINISH, XZException, "Input not finished yet.");
+ {
+ std::size_t ret = static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ }
+ case LZMA_MEM_ERROR:
+ throw std::bad_alloc();
+ case LZMA_FORMAT_ERROR:
+ UTIL_THROW(XZException, "xzlib says file format not recognized");
+ case LZMA_OPTIONS_ERROR:
+ UTIL_THROW(XZException, "xzlib says unsupported compression options");
+ case LZMA_DATA_ERROR:
+ UTIL_THROW(XZException, "xzlib says this file is corrupt");
+ case LZMA_BUF_ERROR:
+ UTIL_THROW(XZException, "xzlib says unexpected end of input");
+ default:
+ UTIL_THROW(XZException, "unrecognized xzlib error " << status);
+ }
+ } while (stream_.next_out == to);
+ return static_cast<uint8_t*>(stream_.next_out) - static_cast<uint8_t*>(to);
+ }
+
+ private:
+ void ReadInput(ReadCompressed &thunk) {
+ assert(!stream_.avail_in);
+ stream_.next_in = static_cast<const uint8_t*>(in_buffer_.get());
+ stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer);
+ if (!stream_.avail_in) action_ = LZMA_FINISH;
+ ReadCount(thunk) += stream_.avail_in;
+ }
+
+ scoped_fd file_;
+ scoped_malloc in_buffer_;
+ lzma_stream stream_;
+
+ lzma_action action_;
+};
+#endif // HAVE_XZLIB
+
+enum MagicResult {
+ UNKNOWN, GZIP, BZIP, XZIP
+};
+
+MagicResult DetectMagic(const void *from_void) {
+ const uint8_t *header = static_cast<const uint8_t*>(from_void);
+ if (header[0] == 0x1f && header[1] == 0x8b) {
+ return GZIP;
+ }
+ if (header[0] == 'B' && header[1] == 'Z') {
+ return BZIP;
+ }
+ const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+ if (!memcmp(header, xzmagic, 6)) {
+ return XZIP;
+ }
+ return UNKNOWN;
+}
+
+ReadBase *ReadFactory(int fd, uint64_t &raw_amount) {
+ scoped_fd hold(fd);
+ unsigned char header[ReadCompressed::kMagicSize];
+ raw_amount = ReadOrEOF(fd, header, ReadCompressed::kMagicSize);
+ if (!raw_amount)
+ return new Uncompressed(hold.release());
+ if (raw_amount != ReadCompressed::kMagicSize)
+ return new UncompressedWithHeader(hold.release(), header, raw_amount);
+ switch (DetectMagic(header)) {
+ case GZIP:
+#ifdef HAVE_ZLIB
+ return new GZip(hold.release(), header, ReadCompressed::kMagicSize);
+#else
+ UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in.");
+#endif
+ case BZIP:
+#ifdef HAVE_BZLIB
+ return new BZip(hold.release(), header, ReadCompressed::kMagicSize);
+#else
+ UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in.");
+#endif
+ case XZIP:
+#ifdef HAVE_XZLIB
+ return new XZip(hold.release(), header, ReadCompressed::kMagicSize);
+#else
+ UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in.");
+#endif
+ case UNKNOWN:
+ break;
+ }
+ try {
+ AdvanceOrThrow(fd, -ReadCompressed::kMagicSize);
+ } catch (const util::ErrnoException &e) {
+ return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
+ }
+ return new Uncompressed(hold.release());
+}
+
+} // namespace
+
+bool ReadCompressed::DetectCompressedMagic(const void *from_void) {
+ return DetectMagic(from_void) != UNKNOWN;
+}
+
+ReadCompressed::ReadCompressed(int fd) {
+ Reset(fd);
+}
+
+ReadCompressed::ReadCompressed() {}
+
+ReadCompressed::~ReadCompressed() {}
+
+void ReadCompressed::Reset(int fd) {
+ internal_.reset();
+ internal_.reset(ReadFactory(fd, raw_amount_));
+}
+
+std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
+ return internal_->Read(to, amount, *this);
+}
+
+} // namespace util
diff --git a/klm/util/read_compressed.hh b/klm/util/read_compressed.hh
new file mode 100644
index 00000000..83ca9fb2
--- /dev/null
+++ b/klm/util/read_compressed.hh
@@ -0,0 +1,74 @@
+#ifndef UTIL_READ_COMPRESSED__
+#define UTIL_READ_COMPRESSED__
+
+#include "util/exception.hh"
+#include "util/scoped.hh"
+
+#include <cstddef>
+
+#include <stdint.h>
+
+namespace util {
+
+class CompressedException : public Exception {
+ public:
+ CompressedException() throw();
+ virtual ~CompressedException() throw();
+};
+
+class GZException : public CompressedException {
+ public:
+ GZException() throw();
+ ~GZException() throw();
+};
+
+class BZException : public CompressedException {
+ public:
+ BZException() throw();
+ ~BZException() throw();
+};
+
+class XZException : public CompressedException {
+ public:
+ XZException() throw();
+ ~XZException() throw();
+};
+
+class ReadBase;
+
+class ReadCompressed {
+ public:
+ static const std::size_t kMagicSize = 6;
+ // Must have at least kMagicSize bytes.
+ static bool DetectCompressedMagic(const void *from);
+
+ // Takes ownership of fd.
+ explicit ReadCompressed(int fd);
+
+ // Must call Reset later.
+ ReadCompressed();
+
+ ~ReadCompressed();
+
+ // Takes ownership of fd.
+ void Reset(int fd);
+
+ std::size_t Read(void *to, std::size_t amount);
+
+ uint64_t RawAmount() const { return raw_amount_; }
+
+ private:
+ friend class ReadBase;
+
+ scoped_ptr<ReadBase> internal_;
+
+ uint64_t raw_amount_;
+
+ // No copying.
+ ReadCompressed(const ReadCompressed &);
+ void operator=(const ReadCompressed &);
+};
+
+} // namespace util
+
+#endif // UTIL_READ_COMPRESSED__
diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc
new file mode 100644
index 00000000..6fd97e5e
--- /dev/null
+++ b/klm/util/read_compressed_test.cc
@@ -0,0 +1,94 @@
+#include "util/read_compressed.hh"
+
+#include "util/file.hh"
+#include "util/have.hh"
+
+#define BOOST_TEST_MODULE ReadCompressedTest
+#include <boost/test/unit_test.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include <fstream>
+#include <string>
+
+#include <stdlib.h>
+
+namespace util {
+namespace {
+
+void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) {
+ uint8_t *to = static_cast<uint8_t*>(to_void);
+ while (amount) {
+ std::size_t ret = reader.Read(to, amount);
+ BOOST_REQUIRE(ret);
+ to += ret;
+ amount -= ret;
+ }
+}
+
+void TestRandom(const char *compressor) {
+ const uint32_t kSize4 = 100000 / 4;
+ char name[] = "tempXXXXXX";
+
+ // Write test file.
+ {
+ scoped_fd original(mkstemp(name));
+ BOOST_REQUIRE(original.get() > 0);
+ for (uint32_t i = 0; i < kSize4; ++i) {
+ WriteOrThrow(original.get(), &i, sizeof(uint32_t));
+ }
+ }
+
+ char gzname[] = "tempXXXXXX";
+ scoped_fd gzipped(mkstemp(gzname));
+
+ std::string command(compressor);
+#ifdef __CYGWIN__
+ command += ".exe";
+#endif
+ command += " <\"";
+ command += name;
+ command += "\" >\"";
+ command += gzname;
+ command += "\"";
+ BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
+
+ BOOST_CHECK_EQUAL(0, unlink(name));
+ BOOST_CHECK_EQUAL(0, unlink(gzname));
+
+ ReadCompressed reader(gzipped.release());
+ for (uint32_t i = 0; i < kSize4; ++i) {
+ uint32_t got;
+ ReadLoop(reader, &got, sizeof(uint32_t));
+ BOOST_CHECK_EQUAL(i, got);
+ }
+
+ char ignored;
+ BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+ // Test double EOF call.
+ BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+}
+
+BOOST_AUTO_TEST_CASE(Uncompressed) {
+ TestRandom("cat");
+}
+
+#ifdef HAVE_ZLIB
+BOOST_AUTO_TEST_CASE(ReadGZ) {
+ TestRandom("gzip");
+}
+#endif // HAVE_ZLIB
+
+#ifdef HAVE_BZLIB
+BOOST_AUTO_TEST_CASE(ReadBZ) {
+ TestRandom("bzip2");
+}
+#endif // HAVE_BZLIB
+
+#ifdef HAVE_XZLIB
+BOOST_AUTO_TEST_CASE(ReadXZ) {
+ TestRandom("xz");
+}
+#endif
+
+} // namespace
+} // namespace util
diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh
index 93e2e817..d62c6df1 100644
--- a/klm/util/scoped.hh
+++ b/klm/util/scoped.hh
@@ -1,40 +1,13 @@
#ifndef UTIL_SCOPED__
#define UTIL_SCOPED__
+/* Other scoped objects in the style of scoped_ptr. */
#include "util/exception.hh"
-
-/* Other scoped objects in the style of scoped_ptr. */
#include <cstddef>
#include <cstdlib>
namespace util {
-template <class T, class R, R (*Free)(T*)> class scoped_thing {
- public:
- explicit scoped_thing(T *c = static_cast<T*>(0)) : c_(c) {}
-
- ~scoped_thing() { if (c_) Free(c_); }
-
- void reset(T *c) {
- if (c_) Free(c_);
- c_ = c;
- }
-
- T &operator*() { return *c_; }
- const T&operator*() const { return *c_; }
- T &operator->() { return *c_; }
- const T&operator->() const { return *c_; }
-
- T *get() { return c_; }
- const T *get() const { return c_; }
-
- private:
- T *c_;
-
- scoped_thing(const scoped_thing &);
- scoped_thing &operator=(const scoped_thing &);
-};
-
class scoped_malloc {
public:
scoped_malloc() : p_(NULL) {}
@@ -77,9 +50,6 @@ template <class T> class scoped_array {
T &operator*() { return *c_; }
const T&operator*() const { return *c_; }
- T &operator->() { return *c_; }
- const T&operator->() const { return *c_; }
-
T &operator[](std::size_t idx) { return c_[idx]; }
const T &operator[](std::size_t idx) const { return c_[idx]; }
@@ -90,6 +60,39 @@ template <class T> class scoped_array {
private:
T *c_;
+
+ scoped_array(const scoped_array &);
+ void operator=(const scoped_array &);
+};
+
+// Owning pointer wrapper: stores a T* and calls `delete` on it in the
+// destructor.  The copy constructor and assignment operator are declared
+// private, so instances cannot be copied from outside the class.
+template <class T> class scoped_ptr {
+ public:
+ // Takes the pointer to manage; defaults to NULL (nothing held).
+ explicit scoped_ptr(T *content = NULL) : c_(content) {}
+
+ // Scalar delete; deleting NULL is a no-op.
+ ~scoped_ptr() { delete c_; }
+
+ // Access the held pointer without giving it up.
+ T *get() { return c_; }
+ const T* get() const { return c_; }
+
+ // Dereference the held pointer; no NULL check is performed.
+ T &operator*() { return *c_; }
+ const T&operator*() const { return *c_; }
+
+ T *operator->() { return c_; }
+ const T*operator->() const { return c_; }
+
+ // Indexes off the held pointer.
+ // NOTE(review): the destructor uses scalar delete rather than delete[], so
+ // this looks unsuitable for new[] storage -- confirm the intended use.
+ T &operator[](std::size_t idx) { return c_[idx]; }
+ const T &operator[](std::size_t idx) const { return c_[idx]; }
+
+ // Replace the held pointer with `to`.  The previous pointer is handed to a
+ // temporary scoped_ptr, whose destructor deletes it when this function
+ // returns.
+ // NOTE(review): reset(get()) would delete the object and leave c_ pointing
+ // at it -- callers must not pass the pointer already held.
+ void reset(T *to = NULL) {
+ scoped_ptr<T> other(c_);
+ c_ = to;
+ }
+
+ private:
+ T *c_;
+
+ // Declared private to forbid copying.
+ scoped_ptr(const scoped_ptr &);
+ void operator=(const scoped_ptr &);
};
} // namespace util
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index be6a643d..51481646 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -1,6 +1,6 @@
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
* you don't use ICU, then this will use the Google implementation from Chrome.
- * This has been modified from the original version to let you choose.
+ * This has been modified from the original version to let you choose.
*/
// Copyright 2008, Google Inc.
@@ -62,9 +62,9 @@
#include <unicode/stringpiece.h>
#include <unicode/uversion.h>
-// Old versions of ICU don't define operator== and operator!=.
+// Old versions of ICU don't define operator== and operator!=.
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
-#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
+#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
if (x.size() != y.size())
return false;
@@ -274,15 +274,28 @@ struct StringPieceCompatibleEquals : public std::binary_function<const StringPie
}
};
+// Look up `key` in the const container `t` and return the const_iterator that
+// t.find produces.  When BOOST_VERSION is below 104200 the key bytes are first
+// copied into a std::string and the one-argument find is used; otherwise the
+// three-argument find is called with StringPieceCompatibleHash and
+// StringPieceCompatibleEquals, so no std::string is built.
template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
+#if BOOST_VERSION < 104200
+ std::string temp(key.data(), key.size());
+ return t.find(temp);
+#else
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+#endif
}
+
+// Mutable-container overload: same lookup, but returns T::iterator.  When
+// BOOST_VERSION is below 104200 the key bytes are copied into a std::string
+// for the one-argument find; otherwise the three-argument find is called with
+// StringPieceCompatibleHash and StringPieceCompatibleEquals.
template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
+#if BOOST_VERSION < 104200
+ std::string temp(key.data(), key.size());
+ return t.find(temp);
+#else
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+#endif
}
#endif
#ifdef HAVE_ICU
U_NAMESPACE_END
+using U_NAMESPACE_QUALIFIER StringPiece;
#endif
+
#endif // BASE_STRING_PIECE_H__
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index 4a7f5460..a588c3fc 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -20,6 +20,7 @@ class OutOfTokens : public Exception {
class SingleCharacter {
public:
+ SingleCharacter() {}
explicit SingleCharacter(char delim) : delim_(delim) {}
StringPiece Find(const StringPiece &in) const {
@@ -32,6 +33,8 @@ class SingleCharacter {
class MultiCharacter {
public:
+ MultiCharacter() {}
+
explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
StringPiece Find(const StringPiece &in) const {
@@ -44,6 +47,7 @@ class MultiCharacter {
class AnyCharacter {
public:
+ AnyCharacter() {}
explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
StringPiece Find(const StringPiece &in) const {
@@ -56,6 +60,8 @@ class AnyCharacter {
class AnyCharacterLast {
public:
+ AnyCharacterLast() {}
+
explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}
StringPiece Find(const StringPiece &in) const {
@@ -81,8 +87,8 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
return current_.data() != 0;
}
- static TokenIter<Find> end() {
- return TokenIter<Find>();
+ // Returns a default-constructed iterator of this exact instantiation, so
+ // the SkipEmpty setting of the result matches the iterator it is compared
+ // against.
+ static TokenIter<Find, SkipEmpty> end() {
+ return TokenIter<Find, SkipEmpty>();
}
private:
@@ -100,8 +106,8 @@ template <class Find, bool SkipEmpty = false> class TokenIter : public boost::it
} while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
}
- bool equal(const TokenIter<Find> &other) const {
- return after_.data() == other.after_.data();
+ // Two iterators compare equal when current_.data() yields the same pointer
+ // in both; only iterators of the same Find/SkipEmpty instantiation are
+ // accepted.
+ bool equal(const TokenIter<Find, SkipEmpty> &other) const {
+ return current_.data() == other.current_.data();
}
const StringPiece &dereference() const {