From 516c132fb683b5bf77ae3230a1b3709beb57618e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 22 Jan 2013 21:37:49 +0000 Subject: KenLM 58da338b --- klm/util/Makefile.am | 1 + klm/util/double-conversion/strtod.cc | 4 +++ klm/util/file.cc | 47 +++++++++++++++++++++--------- klm/util/file_piece.cc | 22 +++++++++++++-- klm/util/file_piece.hh | 10 +++++++ klm/util/file_piece_test.cc | 14 +++++++++ klm/util/have.hh | 4 --- klm/util/read_compressed.cc | 28 +++++++++++++++++- klm/util/read_compressed.hh | 7 +++++ klm/util/read_compressed_test.cc | 55 +++++++++++++++++++++++------------- klm/util/stream/io.cc | 8 ++++-- klm/util/stream/sort.hh | 12 +++++--- klm/util/string_piece.cc | 3 +- klm/util/string_piece.hh | 41 --------------------------- klm/util/string_piece_hash.hh | 43 ++++++++++++++++++++++++++++ klm/util/usage.cc | 2 +- 16 files changed, 209 insertions(+), 92 deletions(-) create mode 100644 klm/util/string_piece_hash.hh (limited to 'klm/util') diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 248cc844..7f873e96 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -38,6 +38,7 @@ libklm_util_a_SOURCES = \ sized_iterator.hh \ sorted_uniform.hh \ string_piece.hh \ + string_piece_hash.hh \ thread_pool.hh \ tokenize_piece.hh \ usage.hh \ diff --git a/klm/util/double-conversion/strtod.cc b/klm/util/double-conversion/strtod.cc index 9758989f..e298766a 100644 --- a/klm/util/double-conversion/strtod.cc +++ b/klm/util/double-conversion/strtod.cc @@ -506,7 +506,9 @@ float Strtof(Vector buffer, int exponent) { double double_previous = Double(double_guess).PreviousDouble(); float f1 = static_cast(double_previous); +#ifndef NDEBUG float f2 = float_guess; +#endif float f3 = static_cast(double_next); float f4; if (is_correct) { @@ -515,7 +517,9 @@ float Strtof(Vector buffer, int exponent) { double double_next2 = Double(double_next).NextDouble(); f4 = static_cast(double_next2); } +#ifndef NDEBUG ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); +#endif // If the guess doesn't lie near a single-precision boundary we can simply // return its float-value. diff --git a/klm/util/file.cc b/klm/util/file.cc index 9a6d2e64..86d9b12d 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -22,6 +22,7 @@ #include #include #include +#include #else #include #endif @@ -99,15 +100,15 @@ uint64_t SizeOrThrow(int fd) { } void ResizeOrThrow(int fd, uint64_t to) { - UTIL_THROW_IF_ARG( #if defined(_WIN32) || defined(_WIN64) - _chsize_s + errno_t ret = _chsize_s #elif defined(OS_ANDROID) - ftruncate64 + int ret = ftruncate64 #else - ftruncate + int ret = ftruncate #endif - (fd, to), FDException, (fd), "while resizing to " << to << " bytes"); + (fd, to); + UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes"); } std::size_t PartialRead(int fd, void *to, std::size_t amount) { @@ -150,9 +151,21 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { uint8_t *to = static_cast(to_void); #if defined(_WIN32) || defined(_WIN64) - UTIL_THROW(Exception, "TODO: PReadOrThrow for windows using ReadFile http://stackoverflow.com/questions/766477/are-there-equivalents-to-pread-on-different-platforms"); -#else + UTIL_THROW(Exception, "This pread implementation for windows is broken. Please send me a patch that does not change the file pointer. Atomically. Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread."); + const std::size_t kMaxDWORD = static_cast(4294967295UL); +#endif for (;size ;) { +#if defined(_WIN32) || defined(_WIN64) + /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */ + // size_t might be 64-bit. DWORD is always 32. + DWORD reading = static_cast(std::min(kMaxDWORD, size)); + DWORD ret; + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.Offset = static_cast(off); + overlapped.OffsetHigh = static_cast(off >> 32); + UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), Exception, "ReadFile failed for offset " << off); +#else ssize_t ret; errno = 0; do { @@ -166,11 +179,11 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); } +#endif size -= ret; off += ret; to += ret; } -#endif } void WriteOrThrow(int fd, const void *data_void, std::size_t size) { @@ -218,15 +231,15 @@ typedef CheckOffT::True IgnoredType; // Can't we all just get along? void InternalSeek(int fd, int64_t off, int whence) { - UTIL_THROW_IF_ARG( + if ( #if defined(_WIN32) || defined(_WIN64) - (__int64)-1 == _lseeki64(fd, off, whence), + (__int64)-1 == _lseeki64(fd, off, whence) #elif defined(OS_ANDROID) - (off64_t)-1 == lseek64(fd, off, whence), + (off64_t)-1 == lseek64(fd, off, whence) #else - (off_t)-1 == lseek(fd, off, whence), + (off_t)-1 == lseek(fd, off, whence) #endif - FDException, (fd), "while seeking to " << off << " whence " << whence); + ) UTIL_THROW_ARG(FDException, (fd), "while seeking to " << off << " whence " << whence); } } // namespace @@ -386,7 +399,13 @@ void NormalizeTempPrefix(std::string &base) { struct stat sb; // It's fine for it to not exist. if (-1 == stat(base.c_str(), &sb)) return; - if (S_ISDIR(sb.st_mode)) base += '/'; + if ( +#if defined(_WIN32) || defined(_WIN64) + sb.st_mode & _S_IFDIR +#else + S_ISDIR(sb.st_mode) +#endif + ) base += '/'; } int MakeTemp(const std::string &base) { diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index fbfa0e0e..4d143857 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -49,6 +49,18 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std: Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); } +FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : + total_size_(kBadSize), page_(SizePage()) { + InitializeNoRead("istream", min_buffer); + + fallback_to_read_ = true; + data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); + position_ = data_.begin(); + position_end_ = position_; + + fell_back_.Reset(stream); +} + FilePiece::~FilePiece() {} StringPiece FilePiece::ReadLine(char delim) { @@ -83,7 +95,8 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { +// Factored out so that istream can call this. +void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { file_name_ = name; default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); @@ -91,6 +104,10 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s position_end_ = NULL; mapped_offset_ = 0; at_end_ = false; +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { + InitializeNoRead(name, min_buffer); if (total_size_ == kBadSize) { // So the assertion passes. @@ -239,8 +256,7 @@ void FilePiece::TransitionToRead() { assert(!fallback_to_read_); fallback_to_read_ = true; data_.reset(); - data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); - UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_); + data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); position_ = data_.begin(); position_end_ = position_; diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 53310976..c07c6011 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -9,6 +9,7 @@ #include "util/string_piece.hh" #include +#include #include #include @@ -31,6 +32,13 @@ class FilePiece { // Takes ownership of fd. name is used for messages. explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is + * much faster. But sometimes you just have an istream like Boost's HTTP + * server and want to parse it the same way. + * name is just used for messages and FileName(). + */ + explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); + ~FilePiece(); char get() { @@ -71,6 +79,8 @@ class FilePiece { const std::string &FileName() const { return file_name_; } private: + void InitializeNoRead(const char *name, std::size_t min_buffer); + // Calls InitializeNoRead, so don't call both. void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template T ReadNumber(); diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index 91e4c559..7336007d 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -24,6 +24,20 @@ std::string FileLocation() { return ret; } +/* istream */ +BOOST_AUTO_TEST_CASE(IStream) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + std::fstream backing(FileLocation().c_str(), std::ios::in); + FilePiece test(backing); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + BOOST_CHECK_EQUAL(ref_line, test_line); + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { std::fstream ref(FileLocation().c_str(), std::ios::in); diff --git a/klm/util/have.hh b/klm/util/have.hh index e9a4d946..6e18529d 100644 --- a/klm/util/have.hh +++ b/klm/util/have.hh @@ -10,8 +10,4 @@ //#define HAVE_ICU #endif -#ifndef HAVE_BOOST -//#define HAVE_BOOST -#endif - #endif // UTIL_HAVE__ diff --git a/klm/util/read_compressed.cc b/klm/util/read_compressed.cc index 7a1a8fb5..b81549e4 100644 --- a/klm/util/read_compressed.cc +++ b/klm/util/read_compressed.cc @@ -320,6 +320,23 @@ class XZip : public ReadBase { }; #endif // HAVE_XZLIB +class IStreamReader : public ReadBase { + public: + explicit IStreamReader(std::istream &stream) : stream_(stream) {} + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + if (!stream_.read(static_cast(to), amount)) { + UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error"); + amount = stream_.gcount(); + } + ReadCount(thunk) += amount; + return amount; + } + + private: + std::istream &stream_; +}; + enum MagicResult { UNKNOWN, GZIP, BZIP, XZIP }; @@ -329,7 +346,7 @@ MagicResult DetectMagic(const void *from_void) { if (header[0] == 0x1f && header[1] == 0x8b) { return GZIP; } - if (header[0] == 'B' && header[1] == 'Z') { + if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') { return BZIP; } const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; @@ -387,6 +404,10 @@ ReadCompressed::ReadCompressed(int fd) { Reset(fd); } +ReadCompressed::ReadCompressed(std::istream &in) { + Reset(in); +} + ReadCompressed::ReadCompressed() {} ReadCompressed::~ReadCompressed() {} @@ -396,6 +417,11 @@ void ReadCompressed::Reset(int fd) { internal_.reset(ReadFactory(fd, raw_amount_)); } +void ReadCompressed::Reset(std::istream &in) { + internal_.reset(); + internal_.reset(new IStreamReader(in)); +} + std::size_t ReadCompressed::Read(void *to, std::size_t amount) { return internal_->Read(to, amount, *this); } diff --git a/klm/util/read_compressed.hh b/klm/util/read_compressed.hh index 83ca9fb2..8b54c9e8 100644 --- a/klm/util/read_compressed.hh +++ b/klm/util/read_compressed.hh @@ -45,6 +45,10 @@ class ReadCompressed { // Takes ownership of fd. explicit ReadCompressed(int fd); + // Try to avoid using this. Use the fd instead. + // There is no decompression support for istreams. + explicit ReadCompressed(std::istream &in); + // Must call Reset later. ReadCompressed(); @@ -53,6 +57,9 @@ class ReadCompressed { // Takes ownership of fd. void Reset(int fd); + // Same advice as the constructor. + void Reset(std::istream &in); + std::size_t Read(void *to, std::size_t amount); uint64_t RawAmount() const { return raw_amount_; } diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc index 6fd97e5e..9cb4a4b9 100644 --- a/klm/util/read_compressed_test.cc +++ b/klm/util/read_compressed_test.cc @@ -25,19 +25,34 @@ void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) { } } -void TestRandom(const char *compressor) { - const uint32_t kSize4 = 100000 / 4; +const uint32_t kSize4 = 100000 / 4; + +std::string WriteRandom() { char name[] = "tempXXXXXX"; + scoped_fd original(mkstemp(name)); + BOOST_REQUIRE(original.get() > 0); + for (uint32_t i = 0; i < kSize4; ++i) { + WriteOrThrow(original.get(), &i, sizeof(uint32_t)); + } + return name; +} - // Write test file. - { - scoped_fd original(mkstemp(name)); - BOOST_REQUIRE(original.get() > 0); - for (uint32_t i = 0; i < kSize4; ++i) { - WriteOrThrow(original.get(), &i, sizeof(uint32_t)); - } +void VerifyRead(ReadCompressed &reader) { + for (uint32_t i = 0; i < kSize4; ++i) { + uint32_t got; + ReadLoop(reader, &got, sizeof(uint32_t)); + BOOST_CHECK_EQUAL(i, got); } + char ignored; + BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); + // Test double EOF call. + BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); +} + +void TestRandom(const char *compressor) { + std::string name(WriteRandom()); + char gzname[] = "tempXXXXXX"; scoped_fd gzipped(mkstemp(gzname)); @@ -52,20 +67,11 @@ void TestRandom(const char *compressor) { command += "\""; BOOST_REQUIRE_EQUAL(0, system(command.c_str())); - BOOST_CHECK_EQUAL(0, unlink(name)); + BOOST_CHECK_EQUAL(0, unlink(name.c_str())); BOOST_CHECK_EQUAL(0, unlink(gzname)); ReadCompressed reader(gzipped.release()); - for (uint32_t i = 0; i < kSize4; ++i) { - uint32_t got; - ReadLoop(reader, &got, sizeof(uint32_t)); - BOOST_CHECK_EQUAL(i, got); - } - - char ignored; - BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); - // Test double EOF call. - BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); + VerifyRead(reader); } BOOST_AUTO_TEST_CASE(Uncompressed) { @@ -90,5 +96,14 @@ BOOST_AUTO_TEST_CASE(ReadXZ) { } #endif +BOOST_AUTO_TEST_CASE(IStream) { + std::string name(WriteRandom()); + std::fstream stream(name.c_str(), std::ios::in); + BOOST_CHECK_EQUAL(0, unlink(name.c_str())); + ReadCompressed reader; + reader.Reset(stream); + VerifyRead(reader); +} + } // namespace } // namespace util diff --git a/klm/util/stream/io.cc b/klm/util/stream/io.cc index c7ad2980..0459f706 100644 --- a/klm/util/stream/io.cc +++ b/klm/util/stream/io.cc @@ -29,15 +29,17 @@ void Read::Run(const ChainPosition &position) { void PRead::Run(const ChainPosition &position) { scoped_fd owner; if (own_) owner.reset(file_); - uint64_t size = SizeOrThrow(file_); + const uint64_t size = SizeOrThrow(file_); UTIL_THROW_IF(size % static_cast(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize()); - std::size_t block_size = position.GetChain().BlockSize(); + const std::size_t block_size = position.GetChain().BlockSize(); + const uint64_t block_size64 = static_cast(block_size); Link link(position); uint64_t offset = 0; - for (; offset + block_size < size; offset += block_size, ++link) { + for (; offset + block_size64 < size; offset += block_size64, ++link) { PReadOrThrow(file_, link->Get(), block_size, offset); link->SetValidSize(block_size); } + // size - offset is <= block_size, so it casts to 32-bit fine. if (size - offset) { PReadOrThrow(file_, link->Get(), size - offset, offset); link->SetValidSize(size - offset); diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index a86f160f..16aa6a03 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -365,10 +365,14 @@ template class BlockSorter { // Record the size of each block in a separate file. offsets_->Append(link->ValidSize()); void *end = static_cast(link->Get()) + link->ValidSize(); - std::sort( - SizedIt(link->Get(), entry_size), - SizedIt(end, entry_size), - compare_); +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (SizedIt(link->Get(), entry_size), + SizedIt(end, entry_size), + compare_); } offsets_->FinishedAppending(); } diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc index b422cefc..ec394b96 100644 --- a/klm/util/string_piece.cc +++ b/klm/util/string_piece.cc @@ -17,7 +17,8 @@ void StringPiece::CopyToString(std::string* target) const { } size_type StringPiece::find(const StringPiece& s, size_type pos) const { - if (length_ < 0 || pos > static_cast(length_)) + // Not sure why length_ < 0 was here since it's std::size_t. + if (/*length_ < 0 || */pos > static_cast(length_)) return npos; const char* result = std::search(ptr_ + pos, ptr_ + length_, diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 51481646..9cf4c7f6 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -50,10 +50,6 @@ #include "util/have.hh" -#ifdef HAVE_BOOST -#include -#endif // HAVE_BOOST - #include #include #include @@ -256,46 +252,9 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { return o.write(piece.data(), static_cast(piece.size())); } -#ifdef HAVE_BOOST -inline size_t hash_value(const StringPiece &str) { - return boost::hash_range(str.data(), str.data() + str.length()); -} - -/* Support for lookup of StringPiece in boost::unordered_map */ -struct StringPieceCompatibleHash : public std::unary_function { - size_t operator()(const StringPiece &str) const { - return hash_value(str); - } -}; - -struct StringPieceCompatibleEquals : public std::binary_function { - bool operator()(const StringPiece &first, const StringPiece &second) const { - return first == second; - } -}; -template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { -#if BOOST_VERSION < 104200 - std::string temp(key.data(), key.size()); - return t.find(temp); -#else - return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); -#endif -} - -template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { -#if BOOST_VERSION < 104200 - std::string temp(key.data(), key.size()); - return t.find(temp); -#else - return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); -#endif -} -#endif - #ifdef HAVE_ICU U_NAMESPACE_END using U_NAMESPACE_QUALIFIER StringPiece; #endif - #endif // BASE_STRING_PIECE_H__ diff --git a/klm/util/string_piece_hash.hh b/klm/util/string_piece_hash.hh new file mode 100644 index 00000000..f206b1d8 --- /dev/null +++ b/klm/util/string_piece_hash.hh @@ -0,0 +1,43 @@ +#ifndef UTIL_STRING_PIECE_HASH__ +#define UTIL_STRING_PIECE_HASH__ + +#include "util/string_piece.hh" + +#include +#include + +inline size_t hash_value(const StringPiece &str) { + return boost::hash_range(str.data(), str.data() + str.length()); +} + +/* Support for lookup of StringPiece in boost::unordered_map */ +struct StringPieceCompatibleHash : public std::unary_function { + size_t operator()(const StringPiece &str) const { + return hash_value(str); + } +}; + +struct StringPieceCompatibleEquals : public std::binary_function { + bool operator()(const StringPiece &first, const StringPiece &second) const { + return first == second; + } +}; +template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +#endif // UTIL_STRING_PIECE_HASH__ diff --git a/klm/util/usage.cc b/klm/util/usage.cc index 16a004bb..b8e125d0 100644 --- a/klm/util/usage.cc +++ b/klm/util/usage.cc @@ -81,7 +81,7 @@ template uint64_t ParseNum(const std::string &arg) { UTIL_THROW_IF_ARG(stream >> throwaway, SizeParseError, (arg), "because there was more cruft " << throwaway << " after the number."); // Silly sort, using kilobytes as your default unit. - if (after.empty()) after == "K"; + if (after.empty()) after = "K"; if (after == "%") { uint64_t mem = GuessPhysicalMemory(); UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined."); -- cgit v1.2.3