From 2b63fa0755954edf467a2421997eaf72771260cf Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 16 May 2012 13:24:08 -0700 Subject: Big kenlm change includes lower order models for probing only. And other stuff. --- klm/util/Jamfile | 2 +- klm/util/Makefile.am | 3 ++- klm/util/bit_packing.hh | 7 +++++++ klm/util/ersatz_progress.cc | 8 ++++---- klm/util/ersatz_progress.hh | 4 ++-- klm/util/file.cc | 10 --------- klm/util/file.hh | 3 --- klm/util/file_piece.cc | 17 ++++++++++------ klm/util/file_piece.hh | 10 ++------- klm/util/have.hh | 10 ++++++++- klm/util/mmap.cc | 14 +++++++++++++ klm/util/murmur_hash.cc | 11 +++++----- klm/util/murmur_hash.hh | 6 +++--- klm/util/probing_hash_table.hh | 21 +++++++++++++++++++ klm/util/usage.cc | 46 ++++++++++++++++++++++++++++++++++++++++++ klm/util/usage.hh | 8 ++++++++ 16 files changed, 136 insertions(+), 44 deletions(-) create mode 100644 klm/util/usage.cc create mode 100644 klm/util/usage.hh (limited to 'klm/util') diff --git a/klm/util/Jamfile b/klm/util/Jamfile index b8c14347..3ee2c2c2 100644 --- a/klm/util/Jamfile +++ b/klm/util/Jamfile @@ -1,4 +1,4 @@ -lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc ../..//z : .. : : .. ; +lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ../..//z : .. : : .. ; import testing ; diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index a8d6299b..5ceccf2c 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -25,6 +25,7 @@ libklm_util_a_SOURCES = \ file.cc \ file_piece.cc \ mmap.cc \ - murmur_hash.cc + murmur_hash.cc \ + usage.cc AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 73a5cb22..dcbd814c 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -174,6 +174,13 @@ struct BitsMask { uint64_t mask; }; +struct BitAddress { + BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {} + + void *base; + uint64_t offset; +}; + } // namespace util #endif // UTIL_BIT_PACKING__ diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc index a82ce672..07b14e26 100644 --- a/klm/util/ersatz_progress.cc +++ b/klm/util/ersatz_progress.cc @@ -12,17 +12,17 @@ namespace { const unsigned char kWidth = 100; } ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits::max()), complete_(next_), out_(NULL) {} ErsatzProgress::~ErsatzProgress() { - if (!out_) return; - Finished(); + if (out_) Finished(); } -ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete) +ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std::string &message) : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { if (!out_) { next_ = std::numeric_limits::max(); return; } - *out_ << message << "\n----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; + if (!message.empty()) *out_ << message << '\n'; + *out_ << "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; } void ErsatzProgress::Milestone() { diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh index 92c345fe..f709dc51 100644 --- a/klm/util/ersatz_progress.hh +++ b/klm/util/ersatz_progress.hh @@ -1,7 +1,7 @@ #ifndef UTIL_ERSATZ_PROGRESS__ #define UTIL_ERSATZ_PROGRESS__ -#include +#include #include // Ersatz version of boost::progress so core language model doesn't depend on @@ -14,7 +14,7 @@ class ErsatzProgress { ErsatzProgress(); // Null means no output. The null value is useful for passing along the ostream pointer from another caller. - ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete); + explicit ErsatzProgress(std::size_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); ~ErsatzProgress(); diff --git a/klm/util/file.cc b/klm/util/file.cc index de206bc8..1bd056fc 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -43,16 +43,6 @@ int OpenReadOrThrow(const char *name) { return ret; } -int CreateOrThrow(const char *name) { - int ret; -#if defined(_WIN32) || defined(_WIN64) - UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); -#else - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); -#endif - return ret; -} - uint64_t SizeFile(int fd) { #if defined(_WIN32) || defined(_WIN64) __int64 ret = _filelengthi64(fd); diff --git a/klm/util/file.hh b/klm/util/file.hh index 72c8ea76..5c57e2a9 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -65,10 +65,7 @@ class scoped_FILE { std::FILE *file_; }; -// Open for read only. int OpenReadOrThrow(const char *name); -// Create file if it doesn't exist, truncate if it does. Opened for write. -int CreateOrThrow(const char *name); // Return value for SizeFile when it can't size properly. const uint64_t kBadSize = (uint64_t)-1; diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index 081e662b..7b6a01dd 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -18,31 +18,35 @@ #include #include +#ifdef HAVE_ZLIB +#include +#endif + namespace util { ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } +GZException::GZException(void *file) { #ifdef HAVE_ZLIB -GZException::GZException(gzFile file) { int num; - *this << gzerror( file, &num) << " from zlib"; -} + *this << gzerror(file, &num) << " from zlib"; #endif // HAVE_ZLIB +} // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), - progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { + progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { Initialize(name, show_progress, min_buffer); } FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), - progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { + progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { Initialize(name, show_progress, min_buffer); } @@ -149,8 +153,9 @@ template T FilePiece::ReadNumber() { SkipSpaces(); while (last_space_ < position_) { if (at_end_) { + if (position_ >= position_end_) throw EndOfFileException(); // Hallucinate a null off the end of the file. - std::string buffer(position_, position_end_); + std::string buffer(position_, position_end_ - position_); char *end; T ret; ParseNumber(buffer.c_str(), end, ret); diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index af93d8aa..b81ac0e2 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -13,10 +13,6 @@ #include -#ifdef HAVE_ZLIB -#include -#endif - namespace util { class ParseNumberException : public Exception { @@ -27,9 +23,7 @@ class ParseNumberException : public Exception { class GZException : public Exception { public: -#ifdef HAVE_ZLIB - explicit GZException(gzFile file); -#endif + explicit GZException(void *file); GZException() throw() {} ~GZException() throw() {} }; @@ -123,7 +117,7 @@ class FilePiece { std::string file_name_; #ifdef HAVE_ZLIB - gzFile gz_file_; + void *gz_file_; #endif // HAVE_ZLIB }; diff --git a/klm/util/have.hh b/klm/util/have.hh index f2f0cf90..b8181e99 100644 --- a/klm/util/have.hh +++ b/klm/util/have.hh @@ -3,13 +3,21 @@ #define UTIL_HAVE__ #ifndef HAVE_ZLIB +#if !defined(_WIN32) && !defined(_WIN64) #define HAVE_ZLIB #endif +#endif -// #define HAVE_ICU +#ifndef HAVE_ICU +//#define HAVE_ICU +#endif #ifndef HAVE_BOOST #define HAVE_BOOST #endif +#ifndef HAVE_THREADS +//#define HAVE_THREADS +#endif + #endif // UTIL_HAVE__ diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 2db35b56..e0d2570b 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -171,6 +171,20 @@ void *MapZeroedWrite(int fd, std::size_t size) { return MapOrThrow(size, true, kFileFlags, false, fd, 0); } +namespace { + +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif + return ret; +} + +} // namespace + void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { file.reset(CreateOrThrow(name)); try { diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index 6accc21a..4f519312 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -23,7 +23,7 @@ namespace util { // 64-bit hash for 64-bit platforms -uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) +uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed ) { const uint64_t m = 0xc6a4a7935bd1e995ULL; const int r = 47; @@ -81,7 +81,7 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) // 64-bit hash for 32-bit platforms -uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) +uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed ) { const unsigned int m = 0x5bd1e995; const int r = 24; @@ -150,17 +150,18 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) return h; } + // Trick to test for 64-bit architecture at compile time. namespace { -template uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) { +template inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64A(key, len, seed); } -template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) { +template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64B(key, len, seed); } } // namespace -uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) { +uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { return MurmurHashNativeBackend(key, len, seed); } diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index 638aaeb2..ae7e88de 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -5,9 +5,9 @@ namespace util { -uint64_t MurmurHash64A(const void * key, std::size_t len, unsigned int seed = 0); -uint64_t MurmurHash64B(const void * key, std::size_t len, unsigned int seed = 0); -uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed = 0); +uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); +uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); } // namespace util diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index f466cebc..3354b68e 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -78,12 +78,33 @@ template bool FindOrInsert(const T &t, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); +#endif + for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, t.GetKey())) { out = i; return true; } + if (equal_(got, invalid_)) { + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); + *i = t; + out = i; + return false; + } + if (++i == end_) i = begin_; + } + } + void FinishedInserting() {} void LoadedBinary() {} // Don't change anything related to GetKey, template bool UnsafeMutableFind(const Key key, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); +#endif for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { out = i; return true; } diff --git a/klm/util/usage.cc b/klm/util/usage.cc new file mode 100644 index 00000000..e5cf76f0 --- /dev/null +++ b/klm/util/usage.cc @@ -0,0 +1,46 @@ +#include "util/usage.hh" + +#include +#include + +#include +#include +#if !defined(_WIN32) && !defined(_WIN64) +#include +#include +#endif + +namespace util { + +namespace { +#if !defined(_WIN32) && !defined(_WIN64) +float FloatSec(const struct timeval &tv) { + return static_cast(tv.tv_sec) + (static_cast(tv.tv_usec) / 1000000.0); +} +#endif +} // namespace + +void PrintUsage(std::ostream &out) { +#if !defined(_WIN32) && !defined(_WIN64) + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) { + perror("getrusage"); + return; + } + out << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n'; + + // Linux doesn't set memory usage :-(. + std::ifstream status("/proc/self/status", std::ios::in); + std::string line; + while (getline(status, line)) { + if (!strncmp(line.c_str(), "VmRSS:\t", 7)) { + out << "VmRSS: " << (line.c_str() + 7) << '\n'; + break; + } else if (!strncmp(line.c_str(), "VmPeak:\t", 8)) { + out << "VmPeak: " << (line.c_str() + 8) << '\n'; + } + } +#endif +} + +} // namespace util diff --git a/klm/util/usage.hh b/klm/util/usage.hh new file mode 100644 index 00000000..d331ff74 --- /dev/null +++ b/klm/util/usage.hh @@ -0,0 +1,8 @@ +#ifndef UTIL_USAGE__ +#define UTIL_USAGE__ +#include + +namespace util { +void PrintUsage(std::ostream &to); +} // namespace util +#endif // UTIL_USAGE__ -- cgit v1.2.3