diff options
author | Kenneth Heafield <github@kheafield.com> | 2014-01-27 17:42:19 -0800 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2014-01-27 17:42:19 -0800 |
commit | 783c57b2d3312738ddcf992ac55ff750afe7cb47 (patch) | |
tree | c4811dab0d916836b8631f3c7df94f284a490b9b /klm/util | |
parent | f7e051a05d65ef25c2ada0b84cd82bfb375ef265 (diff) |
KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591
Diffstat (limited to 'klm/util')
-rw-r--r-- | klm/util/exception.cc | 8 | ||||
-rw-r--r-- | klm/util/exception.hh | 10 | ||||
-rw-r--r-- | klm/util/file.cc | 26 | ||||
-rw-r--r-- | klm/util/file_piece.cc | 4 | ||||
-rw-r--r-- | klm/util/file_piece.hh | 10 | ||||
-rw-r--r-- | klm/util/file_piece_test.cc | 12 | ||||
-rw-r--r-- | klm/util/joint_sort.hh | 26 | ||||
-rw-r--r-- | klm/util/mmap.cc | 4 | ||||
-rw-r--r-- | klm/util/multi_intersection.hh | 2 | ||||
-rw-r--r-- | klm/util/murmur_hash.cc | 7 | ||||
-rw-r--r-- | klm/util/murmur_hash.hh | 4 | ||||
-rw-r--r-- | klm/util/pcqueue.hh | 58 | ||||
-rw-r--r-- | klm/util/pcqueue_test.cc | 20 | ||||
-rw-r--r-- | klm/util/pool.cc | 4 | ||||
-rw-r--r-- | klm/util/probing_hash_table.hh | 84 | ||||
-rw-r--r-- | klm/util/proxy_iterator.hh | 12 | ||||
-rw-r--r-- | klm/util/read_compressed_test.cc | 17 | ||||
-rw-r--r-- | klm/util/sized_iterator.hh | 14 | ||||
-rw-r--r-- | klm/util/sized_iterator_test.cc | 16 | ||||
-rw-r--r-- | klm/util/string_piece.hh | 12 | ||||
-rw-r--r-- | klm/util/tokenize_piece.hh | 17 | ||||
-rw-r--r-- | klm/util/usage.cc | 201 | ||||
-rw-r--r-- | klm/util/usage.hh | 3 |
23 files changed, 464 insertions, 107 deletions
diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 557c3986..083bac20 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -51,6 +51,11 @@ void Exception::SetLocation(const char *file, unsigned int line, const char *fun } namespace { +// At least one of these functions will not be called. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif // The XOPEN version. const char *HandleStrerror(int ret, const char *buf) { if (!ret) return buf; @@ -61,6 +66,9 @@ const char *HandleStrerror(int ret, const char *buf) { const char *HandleStrerror(const char *ret, const char * /*buf*/) { return ret; } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif } // namespace ErrnoException::ErrnoException() throw() : errno_(errno) { diff --git a/klm/util/exception.hh b/klm/util/exception.hh index 74046cf9..0298272b 100644 --- a/klm/util/exception.hh +++ b/klm/util/exception.hh @@ -98,6 +98,9 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep #define UTIL_THROW_IF(Condition, Exception, Modify) \ UTIL_THROW_IF_ARG(Condition, Exception, , Modify) +#define UTIL_THROW_IF2(Condition, Modify) \ + UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) + // Exception that records errno and adds it to the message. class ErrnoException : public Exception { public: @@ -111,6 +114,13 @@ class ErrnoException : public Exception { int errno_; }; +// file wasn't there, or couldn't be open for some reason +class FileOpenException : public Exception { + public: + FileOpenException() throw() {} + ~FileOpenException() throw() {} +}; + // Utilities for overflow checking. class OverflowException : public Exception { public: diff --git a/klm/util/file.cc b/klm/util/file.cc index bef04cb1..51eaf972 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -17,7 +17,11 @@ #include <fcntl.h> #include <stdint.h> -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ +#include <windows.h> +#include <unistd.h> +#warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1. Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix" +#elif defined(_WIN32) || defined(_WIN64) #include <windows.h> #include <io.h> #include <algorithm> @@ -76,7 +80,13 @@ int CreateOrThrow(const char *name) { } uint64_t SizeFile(int fd) { -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ + struct stat sb; + // Does this handle 64-bit? + int ret = fstat(fd, &sb); + if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; + return sb.st_size; +#elif defined(_WIN32) || defined(_WIN64) __int64 ret = _filelengthi64(fd); return (ret == -1) ? kBadSize : ret; #else // Not windows. @@ -100,7 +110,10 @@ uint64_t SizeOrThrow(int fd) { } void ResizeOrThrow(int fd, uint64_t to) { -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ + // Does this handle 64-bit? + int ret = ftruncate +#elif defined(_WIN32) || defined(_WIN64) errno_t ret = _chsize_s #elif defined(OS_ANDROID) int ret = ftruncate64 @@ -115,7 +128,7 @@ namespace { std::size_t GuardLarge(std::size_t size) { // The following operating systems have broken read/write/pread/pwrite that // only supports up to 2^31. -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) +#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__) return std::min(static_cast<std::size_t>(static_cast<unsigned>(-1)), size); #else return size; @@ -251,7 +264,10 @@ typedef CheckOffT<sizeof(off_t)>::True IgnoredType; // Can't we all just get along? void InternalSeek(int fd, int64_t off, int whence) { if ( -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ + // Does this handle 64-bit? + (off_t)-1 == lseek(fd, off, whence) +#elif defined(_WIN32) || defined(_WIN64) (__int64)-1 == _lseeki64(fd, off, whence) #elif defined(OS_ANDROID) (off64_t)-1 == lseek64(fd, off, whence) diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index b5961bea..9c7e00c4 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -74,7 +74,9 @@ StringPiece FilePiece::ReadLine(char delim) { } } if (at_end_) { - if (position_ == position_end_) Shift(); + if (position_ == position_end_) { + Shift(); + } return Consume(position_end_); } skip = position_end_ - position_; diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index c07c6011..ed3dc5ad 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -12,6 +12,7 @@ #include <iosfwd> #include <string> +#include <assert.h> #include <stdint.h> namespace util { @@ -66,8 +67,14 @@ class FilePiece { // Skip spaces defined by isspace. void SkipSpaces(const bool *delim = kSpaces) { + assert(position_ <= position_end_); for (; ; ++position_) { - if (position_ == position_end_) Shift(); + if (position_ == position_end_) { + Shift(); + // And break out at end of file. + if (position_ == position_end_) return; + } + assert(position_ < position_end_); if (!delim[static_cast<unsigned char>(*position_)]) return; } } @@ -86,6 +93,7 @@ class FilePiece { template <class T> T ReadNumber(); StringPiece Consume(const char *to) { + assert(to >= position_); StringPiece ret(position_, to - position_); position_ = to; return ret; diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index 7336007d..4361877f 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,4 +1,4 @@ -// Tests might fail if you have creative characters in your path. Sue me. +// Tests might fail if you have creative characters in your path. Sue me. #include "util/file_piece.hh" #include "util/file.hh" @@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) { #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) /* Apple isn't happy with the popen, fileno, dup. And I don't want to - * reimplement popen. This is an issue with the test. + * reimplement popen. This is an issue with the test. */ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { @@ -67,7 +67,7 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) { FILE *catter = popen(popen_args.c_str(), "r"); BOOST_REQUIRE(catter); - + FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { @@ -107,8 +107,8 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) { } // gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with -// the test. -#ifndef __APPLE__ +// the test. +#if !defined __APPLE__ && !defined __MINGW32__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { std::fstream ref(FileLocation().c_str(), std::ios::in); @@ -117,7 +117,7 @@ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { FILE * catter = popen(command.c_str(), "r"); BOOST_REQUIRE(catter); - + FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh index 1b43ddcf..13a52b67 100644 --- a/klm/util/joint_sort.hh +++ b/klm/util/joint_sort.hh @@ -9,7 +9,6 @@ #include <algorithm> #include <functional> -#include <iostream> namespace util { @@ -35,9 +34,10 @@ template <class KeyIter, class ValueIter> class JointIter { return *this; } - void swap(const JointIter &other) { - std::swap(key_, other.key_); - std::swap(value_, other.value_); + friend void swap(JointIter &first, JointIter &second) { + using std::swap; + swap(first.key_, second.key_); + swap(first.value_, second.value_); } private: @@ -83,9 +83,11 @@ template <class KeyIter, class ValueIter> class JointProxy { return *(inner_.key_); } - void swap(JointProxy<KeyIter, ValueIter> &other) { - std::swap(*inner_.key_, *other.inner_.key_); - std::swap(*inner_.value_, *other.inner_.value_); + friend void swap(JointProxy<KeyIter, ValueIter> first, JointProxy<KeyIter, ValueIter> second) { + // Allow argument-dependent lookup. + using std::swap; + swap(*first.inner_.key_, *second.inner_.key_); + swap(*first.inner_.value_, *second.inner_.value_); } private: @@ -138,14 +140,4 @@ template <class KeyIter, class ValueIter> void JointSort(const KeyIter &key_begi } // namespace util -namespace std { -template <class KeyIter, class ValueIter> void swap(util::detail::JointIter<KeyIter, ValueIter> &left, util::detail::JointIter<KeyIter, ValueIter> &right) { - left.swap(right); -} - -template <class KeyIter, class ValueIter> void swap(util::detail::JointProxy<KeyIter, ValueIter> &left, util::detail::JointProxy<KeyIter, ValueIter> &right) { - left.swap(right); -} -} // namespace std - #endif // UTIL_JOINT_SORT__ diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 6f79f26f..cee6a970 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -90,7 +90,9 @@ void scoped_memory::call_realloc(std::size_t size) { if (!new_data) { reset(); } else { - reset(new_data, size, MALLOC_ALLOCATED); + data_ = new_data; + size_ = size; + source_ = MALLOC_ALLOCATED; } } diff --git a/klm/util/multi_intersection.hh b/klm/util/multi_intersection.hh index 8334d39d..04678352 100644 --- a/klm/util/multi_intersection.hh +++ b/klm/util/multi_intersection.hh @@ -66,7 +66,7 @@ template <class Iterator, class Output, class Less> void AllIntersection(std::ve std::sort(sets.begin(), sets.end(), detail::RangeLessBySize<boost::iterator_range<Iterator> >()); boost::optional<Value> ret; - for (boost::optional<Value> ret; ret = detail::FirstIntersectionSorted(sets, less); sets.front().advance_begin(1)) { + for (boost::optional<Value> ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) { out(*ret); } } diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index 4f519312..189668c0 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -153,12 +153,19 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed ) // Trick to test for 64-bit architecture at compile time. namespace { +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif template <unsigned L> inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64A(key, len, seed); } template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64B(key, len, seed); } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif } // namespace uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index ae7e88de..4891833e 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -5,8 +5,12 @@ namespace util { +// 64-bit machine version uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +// 32-bit machine version (not the same function as above) uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); +// Use the version for this arch. Because the values differ across +// architectures, really only use it for in-memory structures. uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); } // namespace util diff --git a/klm/util/pcqueue.hh b/klm/util/pcqueue.hh index 3df8749b..07e4146f 100644 --- a/klm/util/pcqueue.hh +++ b/klm/util/pcqueue.hh @@ -1,6 +1,8 @@ #ifndef UTIL_PCQUEUE__ #define UTIL_PCQUEUE__ +#include "util/exception.hh" + #include <boost/interprocess/sync/interprocess_semaphore.hpp> #include <boost/scoped_array.hpp> #include <boost/thread/mutex.hpp> @@ -8,20 +10,68 @@ #include <errno.h> +#ifdef __APPLE__ +#include <mach/semaphore.h> +#include <mach/task.h> +#include <mach/mach_traps.h> +#include <mach/mach.h> +#endif // __APPLE__ + namespace util { -inline void WaitSemaphore (boost::interprocess::interprocess_semaphore &on) { +/* OS X Maverick and Boost interprocess were doing "Function not implemented." + * So this is my own wrapper around the mach kernel APIs. + */ +#ifdef __APPLE__ + +#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure") + +class Semaphore { + public: + explicit Semaphore(int value) : task_(mach_task_self()) { + MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value)); + } + + ~Semaphore() { + MACH_CALL(semaphore_destroy(task_, back_)); + } + + void wait() { + MACH_CALL(semaphore_wait(back_)); + } + + void post() { + MACH_CALL(semaphore_signal(back_)); + } + + private: + semaphore_t back_; + task_t task_; +}; + +inline void WaitSemaphore(Semaphore &semaphore) { + semaphore.wait(); +} + +#else +typedef boost::interprocess::interprocess_semaphore Semaphore; + +inline void WaitSemaphore (Semaphore &on) { while (1) { try { on.wait(); break; } catch (boost::interprocess::interprocess_exception &e) { - if (e.get_native_error() != EINTR) throw; + if (e.get_native_error() != EINTR) { + throw; + } } } } +#endif // __APPLE__ + /* Producer consumer queue safe for multiple producers and multiple consumers. * T must be default constructable and have operator=. * The value is copied twice for Consume(T &out) or three times for Consume(), @@ -82,9 +132,9 @@ template <class T> class PCQueue : boost::noncopyable { private: // Number of empty spaces in storage_. - boost::interprocess::interprocess_semaphore empty_; + Semaphore empty_; // Number of occupied spaces in storage_. - boost::interprocess::interprocess_semaphore used_; + Semaphore used_; boost::scoped_array<T> storage_; diff --git a/klm/util/pcqueue_test.cc b/klm/util/pcqueue_test.cc new file mode 100644 index 00000000..22ed2c6f --- /dev/null +++ b/klm/util/pcqueue_test.cc @@ -0,0 +1,20 @@ +#include "util/pcqueue.hh" + +#define BOOST_TEST_MODULE PCQueueTest +#include <boost/test/unit_test.hpp> + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(SingleThread) { + PCQueue<int> queue(10); + for (int i = 0; i < 10; ++i) { + queue.Produce(i); + } + for (int i = 0; i < 10; ++i) { + BOOST_CHECK_EQUAL(i, queue.Consume()); + } +} + +} +} // namespace util diff --git a/klm/util/pool.cc b/klm/util/pool.cc index db72a8ec..429ba158 100644 --- a/klm/util/pool.cc +++ b/klm/util/pool.cc @@ -25,9 +25,7 @@ void Pool::FreeAll() { } void *Pool::More(std::size_t size) { - // Double until we hit 2^21 (2 MB). Then grow in 2 MB blocks. - std::size_t desired_size = static_cast<size_t>(32) << std::min(static_cast<std::size_t>(16), free_list_.size()); - std::size_t amount = std::max(desired_size, size); + std::size_t amount = std::max(static_cast<size_t>(32) << free_list_.size(), size); uint8_t *ret = static_cast<uint8_t*>(MallocOrThrow(amount)); free_list_.push_back(ret); current_ = ret + size; diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 51a2944d..38524806 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -2,6 +2,7 @@ #define UTIL_PROBING_HASH_TABLE__ #include "util/exception.hh" +#include "util/scoped.hh" #include <algorithm> #include <cstddef> @@ -25,6 +26,8 @@ struct IdentityHash { template <class T> T operator()(T arg) const { return arg; } }; +template <class EntryT, class HashT, class EqualT> class AutoProbing; + /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number * of elements, else it throws ProbingSizeException. @@ -33,7 +36,6 @@ struct IdentityHash { * Uses linear probing to find value. * Only insert and lookup operations. */ - template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable { public: typedef EntryT Entry; @@ -43,7 +45,6 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry typedef HashT Hash; typedef EqualT Equal; - public: static uint64_t Size(uint64_t entries, float multiplier) { uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries))); return buckets * sizeof(Entry); @@ -69,6 +70,11 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry #endif {} + void Relocate(void *new_base) { + begin_ = reinterpret_cast<MutableIterator>(new_base); + end_ = begin_ + buckets_; + } + template <class T> MutableIterator Insert(const T &t) { #ifdef DEBUG assert(initialized_); @@ -82,7 +88,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry #ifdef DEBUG assert(initialized_); #endif - for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { + for (MutableIterator i = Ideal(t);;) { Key got(i->GetKey()); if (equal_(got, t.GetKey())) { out = i; return true; } if (equal_(got, invalid_)) { @@ -97,8 +103,6 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry void FinishedInserting() {} - void LoadedBinary() {} - // Don't change anything related to GetKey, template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { #ifdef DEBUG @@ -224,6 +228,8 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry } private: + friend class AutoProbing<Entry, Hash, Equal>; + template <class T> MutableIterator Ideal(const T &t) { return begin_ + (hash_(t.GetKey()) % buckets_); } @@ -247,6 +253,74 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry #endif }; +// Resizable linear probing hash table. This owns the memory. +template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing { + private: + typedef ProbingHashTable<EntryT, HashT, EqualT> Backend; + public: + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; + typedef HashT Hash; + typedef EqualT Equal; + + AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : + allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) { + threshold_ = initial_size * 1.2; + } + + // Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup. + template <class T> MutableIterator Insert(const T &t) { + DoubleIfNeeded(); + return backend_.UncheckedInsert(t); + } + + template <class T> bool FindOrInsert(const T &t, MutableIterator &out) { + DoubleIfNeeded(); + return backend_.FindOrInsert(t, out); + } + + template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { + return backend_.UnsafeMutableFind(key, out); + } + + template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) { + return backend_.UnsafeMutableMustFind(key); + } + + template <class Key> bool Find(const Key key, ConstIterator &out) const { + return backend_.Find(key, out); + } + + template <class Key> ConstIterator MustFind(const Key key) const { + return backend_.MustFind(key); + } + + std::size_t Size() const { + return backend_.SizeNoSerialization(); + } + + void Clear() { + backend_.Clear(); + } + + private: + void DoubleIfNeeded() { + if (Size() < threshold_) + return; + mem_.call_realloc(backend_.DoubleTo()); + allocated_ = backend_.DoubleTo(); + backend_.Double(mem_.get()); + threshold_ *= 2; + } + + std::size_t allocated_; + util::scoped_malloc mem_; + Backend backend_; + std::size_t threshold_; +}; + } // namespace util #endif // UTIL_PROBING_HASH_TABLE__ diff --git a/klm/util/proxy_iterator.hh b/klm/util/proxy_iterator.hh index 0ee1716f..a2810a47 100644 --- a/klm/util/proxy_iterator.hh +++ b/klm/util/proxy_iterator.hh @@ -38,8 +38,8 @@ template <class Proxy> class ProxyIterator { typedef std::random_access_iterator_tag iterator_category; typedef typename Proxy::value_type value_type; typedef std::ptrdiff_t difference_type; - typedef Proxy & reference; - typedef Proxy * pointer; + typedef Proxy reference; + typedef ProxyIterator<Proxy> * pointer; ProxyIterator() {} @@ -47,10 +47,10 @@ template <class Proxy> class ProxyIterator { template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {} explicit ProxyIterator(const Proxy &p) : p_(p) {} - // p_'s swap does value swapping, but here we want iterator swapping +/* // p_'s swap does value swapping, but here we want iterator swapping friend inline void swap(ProxyIterator<Proxy> &first, ProxyIterator<Proxy> &second) { swap(first.I(), second.I()); - } + }*/ // p_'s operator= does value copying, but here we want iterator copying. S &operator=(const S &other) { @@ -77,8 +77,8 @@ template <class Proxy> class ProxyIterator { std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } - Proxy &operator*() { return p_; } - const Proxy &operator*() const { return p_; } + Proxy operator*() { return p_; } + const Proxy operator*() const { return p_; } Proxy *operator->() { return &p_; } const Proxy *operator->() const { return &p_; } Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc index 9cb4a4b9..50450a02 100644 --- a/klm/util/read_compressed_test.cc +++ b/klm/util/read_compressed_test.cc @@ -12,6 +12,23 @@ #include <stdlib.h> +#if defined __MINGW32__ +#include <time.h> +#include <fcntl.h> + +#if !defined mkstemp +// TODO insecure +int mkstemp(char * stemplate) +{ + char *filename = mktemp(stemplate); + if (filename == NULL) + return -1; + return open(filename, O_RDWR | O_CREAT, 0600); +} +#endif + +#endif // defined + namespace util { namespace { diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh index dce8f229..a72657b5 100644 --- a/klm/util/sized_iterator.hh +++ b/klm/util/sized_iterator.hh @@ -36,7 +36,7 @@ class SizedInnerIterator { void *Data() { return ptr_; } std::size_t EntrySize() const { return size_; } - friend inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) { + friend void swap(SizedInnerIterator &first, SizedInnerIterator &second) { std::swap(first.ptr_, second.ptr_); std::swap(first.size_, second.size_); } @@ -69,17 +69,7 @@ class SizedProxy { const void *Data() const { return inner_.Data(); } void *Data() { return inner_.Data(); } - /** - // TODO: this (deep) swap was recently added. why? if any std heap sort etc - // algs are using swap, that's going to be worse performance than using - // =. i'm not sure why we *want* a deep swap. if C++11 compilers are - // choosing between move constructor and swap, then we'd better implement a - // (deep) move constructor. it may also be that this is moot since i made - // ProxyIterator a reference and added a shallow ProxyIterator swap? (I - // need Ken or someone competent to judge whether that's correct also. - - // let me know at graehl@gmail.com - */ - friend void swap(SizedProxy &first, SizedProxy &second) { + friend void swap(SizedProxy first, SizedProxy second) { std::swap_ranges( static_cast<char*>(first.inner_.Data()), static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(), diff --git a/klm/util/sized_iterator_test.cc b/klm/util/sized_iterator_test.cc new file mode 100644 index 00000000..c36bcb2d --- /dev/null +++ b/klm/util/sized_iterator_test.cc @@ -0,0 +1,16 @@ +#include "util/sized_iterator.hh" + +#define BOOST_TEST_MODULE SizedIteratorTest +#include <boost/test/unit_test.hpp> + +namespace util { namespace { + +BOOST_AUTO_TEST_CASE(swap_works) { + char str[2] = { 0, 1 }; + SizedProxy first(str, 1), second(str + 1, 1); + swap(first, second); + BOOST_CHECK_EQUAL(1, str[0]); + BOOST_CHECK_EQUAL(0, str[1]); +} + +}} // namespace anonymous util diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 9cf4c7f6..84431db1 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -74,6 +74,12 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) { #endif // old version of ICU U_NAMESPACE_BEGIN + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + int longersize = longer.size(), prefixsize = prefix.size(); + return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; +} + #else #include <algorithm> @@ -212,7 +218,7 @@ class StringPiece { StringPiece substr(size_type pos, size_type n = npos) const; static int wordmemcmp(const char* p, const char* p2, size_type N) { - return memcmp(p, p2, N); + return std::memcmp(p, p2, N); } }; @@ -227,6 +233,10 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) { return !(x == y); } +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + return longer.starts_with(prefix); +} + #endif // HAVE_ICU undefined inline bool operator<(const StringPiece& x, const StringPiece& y) { diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index a588c3fc..24eae8fb 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -58,6 +58,23 @@ class AnyCharacter { StringPiece chars_; }; +class BoolCharacter { + public: + BoolCharacter() {} + + explicit BoolCharacter(const bool *delimiter) { delimiter_ = delimiter; } + + StringPiece Find(const StringPiece &in) const { + for (const char *i = in.data(); i != in.data() + in.size(); ++i) { + if (delimiter_[static_cast<unsigned char>(*i)]) return StringPiece(i, 1); + } + return StringPiece(in.data() + in.size(), 0); + } + + private: + const bool *delimiter_; +}; + class AnyCharacterLast { public: AnyCharacterLast() {} diff --git a/klm/util/usage.cc b/klm/util/usage.cc index 8db375e1..e68d7c7c 100644 --- a/klm/util/usage.cc +++ b/klm/util/usage.cc @@ -10,61 +10,117 @@ #include <string.h> #include <ctype.h> -#if !defined(_WIN32) && !defined(_WIN64) +#include <time.h> +#if defined(_WIN32) || defined(_WIN64) +// This code lifted from physmem.c in gnulib. See the copyright statement +// below. +# define WIN32_LEAN_AND_MEAN +# include <windows.h> +/* MEMORYSTATUSEX is missing from older windows headers, so define + a local replacement. */ +typedef struct +{ + DWORD dwLength; + DWORD dwMemoryLoad; + DWORDLONG ullTotalPhys; + DWORDLONG ullAvailPhys; + DWORDLONG ullTotalPageFile; + DWORDLONG ullAvailPageFile; + DWORDLONG ullTotalVirtual; + DWORDLONG ullAvailVirtual; + DWORDLONG ullAvailExtendedVirtual; +} lMEMORYSTATUSEX; +typedef WINBOOL (WINAPI *PFN_MS_EX) (lMEMORYSTATUSEX*); +#else #include <sys/resource.h> #include <sys/time.h> -#include <time.h> #include <unistd.h> #endif -namespace util { +#if defined(__MACH__) || defined(__FreeBSD__) || defined(__APPLE__) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif -#if !defined(_WIN32) && !defined(_WIN64) +namespace util { namespace { -// On Mac OS X, clock_gettime is not implemented. -// CLOCK_MONOTONIC is not defined either. -#ifdef __MACH__ -#define CLOCK_MONOTONIC 0 - -int clock_gettime(int clk_id, struct timespec *tp) { +#if defined(__MACH__) +typedef struct timeval Wall; +Wall GetWall() { struct timeval tv; gettimeofday(&tv, NULL); - tp->tv_sec = tv.tv_sec; - tp->tv_nsec = tv.tv_usec * 1000; - return 0; + return tv; } -#endif // __MACH__ - -float FloatSec(const struct timeval &tv) { - return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000.0); +#elif defined(_WIN32) || defined(_WIN64) +typedef time_t Wall; +Wall GetWall() { + return time(NULL); } -float FloatSec(const struct timespec &tv) { - return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_nsec) / 1000000000.0); +#else +typedef struct timespec Wall; +Wall GetWall() { + Wall ret; + clock_gettime(CLOCK_MONOTONIC, &ret); + return ret; } +#endif -const char *SkipSpaces(const char *at) { - for (; *at == ' ' || *at == '\t'; ++at) {} - return at; +// Some of these functions are only used on some platforms. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif +// These all assume first > second +double Subtract(time_t first, time_t second) { + return difftime(first, second); +} +double DoubleSec(time_t tv) { + return static_cast<double>(tv); +} +#if !defined(_WIN32) && !defined(_WIN64) +double Subtract(const struct timeval &first, const struct timeval &second) { + return static_cast<double>(first.tv_sec - second.tv_sec) + static_cast<double>(first.tv_usec - second.tv_usec) / 1000000.0; } +double Subtract(const struct timespec &first, const struct timespec &second) { + return static_cast<double>(first.tv_sec - second.tv_sec) + static_cast<double>(first.tv_nsec - second.tv_nsec) / 1000000000.0; +} +double DoubleSec(const struct timeval &tv) { + return static_cast<double>(tv.tv_sec) + (static_cast<double>(tv.tv_usec) / 1000000.0); +} +double DoubleSec(const struct timespec &tv) { + return static_cast<double>(tv.tv_sec) + (static_cast<double>(tv.tv_nsec) / 1000000000.0); +} +#endif +#ifdef __clang__ +#pragma clang diagnostic pop +#endif class RecordStart { public: RecordStart() { - clock_gettime(CLOCK_MONOTONIC, &started_); + started_ = GetWall(); } - const struct timespec &Started() const { + const Wall &Started() const { return started_; } private: - struct timespec started_; + Wall started_; }; const RecordStart kRecordStart; + +const char *SkipSpaces(const char *at) { + for (; *at == ' ' || *at == '\t'; ++at) {} + return at; +} } // namespace -#endif + +double WallTime() { + return Subtract(GetWall(), kRecordStart.Started()); +} void PrintUsage(std::ostream &out) { #if !defined(_WIN32) && !defined(_WIN64) @@ -83,32 +139,89 @@ void PrintUsage(std::ostream &out) { } struct rusage usage; - if (getrusage(RUSAGE_CHILDREN, &usage)) { + if (getrusage(RUSAGE_SELF, &usage)) { perror("getrusage"); return; } out << "RSSMax:" << usage.ru_maxrss << " kB" << '\t'; - out << "user:" << FloatSec(usage.ru_utime) << "\tsys:" << FloatSec(usage.ru_stime) << '\t'; - out << "CPU:" << (FloatSec(usage.ru_utime) + FloatSec(usage.ru_stime)); - - struct timespec current; - clock_gettime(CLOCK_MONOTONIC, ¤t); - out << "\treal:" << (FloatSec(current) - FloatSec(kRecordStart.Started())) << '\n'; + out << "user:" << DoubleSec(usage.ru_utime) << "\tsys:" << DoubleSec(usage.ru_stime) << '\t'; + out << "CPU:" << (DoubleSec(usage.ru_utime) + DoubleSec(usage.ru_stime)); + out << '\t'; #endif + + out << "real:" << WallTime() << '\n'; } +/* Adapted from physmem.c in gnulib 831b84c59ef413c57a36b67344467d66a8a2ba70 */ +/* Calculate the size of physical memory. + + Copyright (C) 2000-2001, 2003, 2005-2006, 2009-2013 Free Software + Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Paul Eggert. */ uint64_t GuessPhysicalMemory() { +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + { + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGESIZE); + if (pages != -1 && page_size != -1) + return static_cast<uint64_t>(pages) * static_cast<uint64_t>(page_size); + } +#endif +#ifdef HW_PHYSMEM + { /* This works on *bsd and darwin. */ + unsigned int physmem; + size_t len = sizeof physmem; + static int mib[2] = { CTL_HW, HW_PHYSMEM }; + + if (sysctl (mib, sizeof(mib) / sizeof(mib[0]), &physmem, &len, NULL, 0) == 0 + && len == sizeof (physmem)) + return static_cast<uint64_t>(physmem); + } +#endif + #if defined(_WIN32) || defined(_WIN64) - return 0; -#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) - long pages = sysconf(_SC_PHYS_PAGES); - if (pages == -1) return 0; - long page_size = sysconf(_SC_PAGESIZE); - if (page_size == -1) return 0; - return static_cast<uint64_t>(pages) * static_cast<uint64_t>(page_size); -#else - return 0; + { /* this works on windows */ + PFN_MS_EX pfnex; + HMODULE h = GetModuleHandle ("kernel32.dll"); + + if (!h) + return 0; + + /* Use GlobalMemoryStatusEx if available. */ + if ((pfnex = (PFN_MS_EX) GetProcAddress (h, "GlobalMemoryStatusEx"))) + { + lMEMORYSTATUSEX lms_ex; + lms_ex.dwLength = sizeof lms_ex; + if (!pfnex (&lms_ex)) + return 0; + return lms_ex.ullTotalPhys; + } + + /* Fall back to GlobalMemoryStatus which is always available. + but returns wrong results for physical memory > 4GB. */ + else + { + MEMORYSTATUS ms; + GlobalMemoryStatus (&ms); + return ms.dwTotalPhys; + } + } #endif + return 0; } namespace { @@ -135,7 +248,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) { if (after == "%") { uint64_t mem = GuessPhysicalMemory(); UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined."); - return static_cast<double>(value) * static_cast<double>(mem) / 100.0; + return static_cast<uint64_t>(static_cast<double>(value) * static_cast<double>(mem) / 100.0); } std::string units("bKMGTPEZY"); @@ -144,7 +257,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) { for (std::string::size_type i = 0; i < index; ++i) { value *= 1024; } - return value; + return static_cast<uint64_t>(value); } } // namespace diff --git a/klm/util/usage.hh b/klm/util/usage.hh index e19eda7b..da53b9e3 100644 --- a/klm/util/usage.hh +++ b/klm/util/usage.hh @@ -7,6 +7,9 @@ #include <stdint.h> namespace util { +// Time in seconds since process started. Zero on unsupported platforms. +double WallTime(); + void PrintUsage(std::ostream &to); // Determine how much physical memory there is. Return 0 on failure. |