diff options
| author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-01-28 00:18:37 -0500 | 
|---|---|---|
| committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-01-28 00:18:37 -0500 | 
| commit | 0ee2b44c5c0981358ade9ddab1d083bbe1de5daf (patch) | |
| tree | 2dc8d9b9efb24b3a3b5c2bcaf0b55df30743d151 /klm/util | |
| parent | 2ac0704a463d45f0bfe23184a1ea9950d60fd546 (diff) | |
| parent | 783c57b2d3312738ddcf992ac55ff750afe7cb47 (diff) | |
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'klm/util')
| -rw-r--r-- | klm/util/exception.cc | 8 | ||||
| -rw-r--r-- | klm/util/exception.hh | 10 | ||||
| -rw-r--r-- | klm/util/file.cc | 26 | ||||
| -rw-r--r-- | klm/util/file_piece.cc | 4 | ||||
| -rw-r--r-- | klm/util/file_piece.hh | 10 | ||||
| -rw-r--r-- | klm/util/file_piece_test.cc | 12 | ||||
| -rw-r--r-- | klm/util/joint_sort.hh | 26 | ||||
| -rw-r--r-- | klm/util/mmap.cc | 4 | ||||
| -rw-r--r-- | klm/util/multi_intersection.hh | 2 | ||||
| -rw-r--r-- | klm/util/murmur_hash.cc | 7 | ||||
| -rw-r--r-- | klm/util/murmur_hash.hh | 4 | ||||
| -rw-r--r-- | klm/util/pcqueue.hh | 58 | ||||
| -rw-r--r-- | klm/util/pcqueue_test.cc | 20 | ||||
| -rw-r--r-- | klm/util/pool.cc | 4 | ||||
| -rw-r--r-- | klm/util/probing_hash_table.hh | 84 | ||||
| -rw-r--r-- | klm/util/proxy_iterator.hh | 12 | ||||
| -rw-r--r-- | klm/util/read_compressed_test.cc | 17 | ||||
| -rw-r--r-- | klm/util/sized_iterator.hh | 14 | ||||
| -rw-r--r-- | klm/util/sized_iterator_test.cc | 16 | ||||
| -rw-r--r-- | klm/util/string_piece.hh | 12 | ||||
| -rw-r--r-- | klm/util/tokenize_piece.hh | 17 | ||||
| -rw-r--r-- | klm/util/usage.cc | 201 | ||||
| -rw-r--r-- | klm/util/usage.hh | 3 | 
23 files changed, 464 insertions, 107 deletions
| diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 557c3986..083bac20 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -51,6 +51,11 @@ void Exception::SetLocation(const char *file, unsigned int line, const char *fun  }  namespace { +// At least one of these functions will not be called. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif  // The XOPEN version.  const char *HandleStrerror(int ret, const char *buf) {    if (!ret) return buf; @@ -61,6 +66,9 @@ const char *HandleStrerror(int ret, const char *buf) {  const char *HandleStrerror(const char *ret, const char * /*buf*/) {    return ret;  } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif  } // namespace  ErrnoException::ErrnoException() throw() : errno_(errno) { diff --git a/klm/util/exception.hh b/klm/util/exception.hh index 74046cf9..0298272b 100644 --- a/klm/util/exception.hh +++ b/klm/util/exception.hh @@ -98,6 +98,9 @@ template <class Except, class Data> typename Except::template ExceptionTag<Excep  #define UTIL_THROW_IF(Condition, Exception, Modify) \    UTIL_THROW_IF_ARG(Condition, Exception, , Modify) +#define UTIL_THROW_IF2(Condition, Modify) \ +  UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) +  // Exception that records errno and adds it to the message.  class ErrnoException : public Exception {    public: @@ -111,6 +114,13 @@ class ErrnoException : public Exception {      int errno_;  }; +// file wasn't there, or couldn't be open for some reason +class FileOpenException : public Exception { +  public: +	FileOpenException() throw() {} +    ~FileOpenException() throw() {} +}; +  // Utilities for overflow checking.    class OverflowException : public Exception {    public: diff --git a/klm/util/file.cc b/klm/util/file.cc index bef04cb1..51eaf972 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -17,7 +17,11 @@  #include <fcntl.h>  #include <stdint.h> -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ +#include <windows.h> +#include <unistd.h> +#warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1.  Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix" +#elif defined(_WIN32) || defined(_WIN64)  #include <windows.h>  #include <io.h>  #include <algorithm> @@ -76,7 +80,13 @@ int CreateOrThrow(const char *name) {  }  uint64_t SizeFile(int fd) { -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ +  struct stat sb; +  // Does this handle 64-bit? +  int ret = fstat(fd, &sb); +  if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; +  return sb.st_size; +#elif defined(_WIN32) || defined(_WIN64)    __int64 ret = _filelengthi64(fd);    return (ret == -1) ? kBadSize : ret;  #else // Not windows. @@ -100,7 +110,10 @@ uint64_t SizeOrThrow(int fd) {  }  void ResizeOrThrow(int fd, uint64_t to) { -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ +    // Does this handle 64-bit?   +    int ret = ftruncate +#elif defined(_WIN32) || defined(_WIN64)      errno_t ret = _chsize_s  #elif defined(OS_ANDROID)      int ret = ftruncate64 @@ -115,7 +128,7 @@ namespace {  std::size_t GuardLarge(std::size_t size) {    // The following operating systems have broken read/write/pread/pwrite that    // only supports up to 2^31. -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) +#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__)    return std::min(static_cast<std::size_t>(static_cast<unsigned>(-1)), size);  #else    return size; @@ -251,7 +264,10 @@ typedef CheckOffT<sizeof(off_t)>::True IgnoredType;  // Can't we all just get along?  void InternalSeek(int fd, int64_t off, int whence) {    if ( -#if defined(_WIN32) || defined(_WIN64) +#if defined __MINGW32__ +    // Does this handle 64-bit? +    (off_t)-1 == lseek(fd, off, whence) +#elif defined(_WIN32) || defined(_WIN64)      (__int64)-1 == _lseeki64(fd, off, whence)  #elif defined(OS_ANDROID)      (off64_t)-1 == lseek64(fd, off, whence) diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index b5961bea..9c7e00c4 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -74,7 +74,9 @@ StringPiece FilePiece::ReadLine(char delim) {        }      }      if (at_end_) { -      if (position_ == position_end_) Shift(); +      if (position_ == position_end_) { +        Shift(); +      }        return Consume(position_end_);      }      skip = position_end_ - position_; diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index c07c6011..ed3dc5ad 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -12,6 +12,7 @@  #include <iosfwd>  #include <string> +#include <assert.h>  #include <stdint.h>  namespace util { @@ -66,8 +67,14 @@ class FilePiece {      // Skip spaces defined by isspace.      void SkipSpaces(const bool *delim = kSpaces) { +      assert(position_ <= position_end_);        for (; ; ++position_) { -        if (position_ == position_end_) Shift(); +        if (position_ == position_end_) { +          Shift(); +          // And break out at end of file. +          if (position_ == position_end_) return; +        } +        assert(position_ < position_end_);          if (!delim[static_cast<unsigned char>(*position_)]) return;        }      } @@ -86,6 +93,7 @@ class FilePiece {      template <class T> T ReadNumber();      StringPiece Consume(const char *to) { +      assert(to >= position_);        StringPiece ret(position_, to - position_);        position_ = to;        return ret; diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index 7336007d..4361877f 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,4 +1,4 @@ -// Tests might fail if you have creative characters in your path.  Sue me.   +// Tests might fail if you have creative characters in your path.  Sue me.  #include "util/file_piece.hh"  #include "util/file.hh" @@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) {  #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)  /* Apple isn't happy with the popen, fileno, dup.  And I don't want to - * reimplement popen.  This is an issue with the test.   + * reimplement popen.  This is an issue with the test.   */  /* read() implementation */  BOOST_AUTO_TEST_CASE(StreamReadLine) { @@ -67,7 +67,7 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) {    FILE *catter = popen(popen_args.c_str(), "r");    BOOST_REQUIRE(catter); -   +    FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);    std::string ref_line;    while (getline(ref, ref_line)) { @@ -107,8 +107,8 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) {  }  // gzip stream.  Apple doesn't like popen, fileno, dup.  This is an issue with -// the test.   -#ifndef __APPLE__ +// the test. +#if !defined __APPLE__ && !defined __MINGW32__  BOOST_AUTO_TEST_CASE(StreamZipReadLine) {    std::fstream ref(FileLocation().c_str(), std::ios::in); @@ -117,7 +117,7 @@ BOOST_AUTO_TEST_CASE(StreamZipReadLine) {    FILE * catter = popen(command.c_str(), "r");    BOOST_REQUIRE(catter); -   +    FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1);    std::string ref_line;    while (getline(ref, ref_line)) { diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh index 1b43ddcf..13a52b67 100644 --- a/klm/util/joint_sort.hh +++ b/klm/util/joint_sort.hh @@ -9,7 +9,6 @@  #include <algorithm>  #include <functional> -#include <iostream>  namespace util { @@ -35,9 +34,10 @@ template <class KeyIter, class ValueIter> class JointIter {        return *this;      } -    void swap(const JointIter &other) { -      std::swap(key_, other.key_); -      std::swap(value_, other.value_); +    friend void swap(JointIter &first, JointIter &second) { +      using std::swap; +      swap(first.key_, second.key_); +      swap(first.value_, second.value_);      }    private: @@ -83,9 +83,11 @@ template <class KeyIter, class ValueIter> class JointProxy {        return *(inner_.key_);      } -    void swap(JointProxy<KeyIter, ValueIter> &other) { -      std::swap(*inner_.key_, *other.inner_.key_); -      std::swap(*inner_.value_, *other.inner_.value_); +    friend void swap(JointProxy<KeyIter, ValueIter> first, JointProxy<KeyIter, ValueIter> second) { +      // Allow argument-dependent lookup. +      using std::swap; +      swap(*first.inner_.key_, *second.inner_.key_); +      swap(*first.inner_.value_, *second.inner_.value_);      }    private: @@ -138,14 +140,4 @@ template <class KeyIter, class ValueIter> void JointSort(const KeyIter &key_begi  } // namespace util -namespace std { -template <class KeyIter, class ValueIter> void swap(util::detail::JointIter<KeyIter, ValueIter> &left, util::detail::JointIter<KeyIter, ValueIter> &right) { -  left.swap(right); -} - -template <class KeyIter, class ValueIter> void swap(util::detail::JointProxy<KeyIter, ValueIter> &left, util::detail::JointProxy<KeyIter, ValueIter> &right) { -  left.swap(right); -} -} // namespace std -  #endif // UTIL_JOINT_SORT__ diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 6f79f26f..cee6a970 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -90,7 +90,9 @@ void scoped_memory::call_realloc(std::size_t size) {    if (!new_data) {      reset();    } else { -    reset(new_data, size, MALLOC_ALLOCATED); +    data_ = new_data; +    size_ = size; +    source_ = MALLOC_ALLOCATED;    }  } diff --git a/klm/util/multi_intersection.hh b/klm/util/multi_intersection.hh index 8334d39d..04678352 100644 --- a/klm/util/multi_intersection.hh +++ b/klm/util/multi_intersection.hh @@ -66,7 +66,7 @@ template <class Iterator, class Output, class Less> void AllIntersection(std::ve    std::sort(sets.begin(), sets.end(), detail::RangeLessBySize<boost::iterator_range<Iterator> >());    boost::optional<Value> ret; -  for (boost::optional<Value> ret; ret = detail::FirstIntersectionSorted(sets, less); sets.front().advance_begin(1)) { +  for (boost::optional<Value> ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) {      out(*ret);    }  } diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index 4f519312..189668c0 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -153,12 +153,19 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed )  // Trick to test for 64-bit architecture at compile time.    namespace { +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif  template <unsigned L> inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) {    return MurmurHash64A(key, len, seed);  }  template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) {    return MurmurHash64B(key, len, seed);  } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif  } // namespace  uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index ae7e88de..4891833e 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -5,8 +5,12 @@  namespace util { +// 64-bit machine version  uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +// 32-bit machine version (not the same function as above)  uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); +// Use the version for this arch.  Because the values differ across +// architectures, really only use it for in-memory structures.  uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);  } // namespace util diff --git a/klm/util/pcqueue.hh b/klm/util/pcqueue.hh index 3df8749b..07e4146f 100644 --- a/klm/util/pcqueue.hh +++ b/klm/util/pcqueue.hh @@ -1,6 +1,8 @@  #ifndef UTIL_PCQUEUE__  #define UTIL_PCQUEUE__ +#include "util/exception.hh" +  #include <boost/interprocess/sync/interprocess_semaphore.hpp>  #include <boost/scoped_array.hpp>  #include <boost/thread/mutex.hpp> @@ -8,20 +10,68 @@  #include <errno.h> +#ifdef __APPLE__ +#include <mach/semaphore.h> +#include <mach/task.h> +#include <mach/mach_traps.h> +#include <mach/mach.h> +#endif // __APPLE__ +  namespace util { -inline void WaitSemaphore (boost::interprocess::interprocess_semaphore &on) { +/* OS X Maverick and Boost interprocess were doing "Function not implemented." + * So this is my own wrapper around the mach kernel APIs. + */ +#ifdef __APPLE__ + +#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure") + +class Semaphore { +  public: +    explicit Semaphore(int value) : task_(mach_task_self()) { +      MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value)); +    } + +    ~Semaphore() { +      MACH_CALL(semaphore_destroy(task_, back_)); +    } + +    void wait() { +      MACH_CALL(semaphore_wait(back_)); +    } + +    void post() { +      MACH_CALL(semaphore_signal(back_)); +    } + +  private: +    semaphore_t back_; +    task_t task_; +}; + +inline void WaitSemaphore(Semaphore &semaphore) { +  semaphore.wait(); +} + +#else +typedef boost::interprocess::interprocess_semaphore Semaphore; + +inline void WaitSemaphore (Semaphore &on) {    while (1) {      try {        on.wait();        break;      }      catch (boost::interprocess::interprocess_exception &e) { -      if (e.get_native_error() != EINTR) throw; +      if (e.get_native_error() != EINTR) { +        throw; +      }      }    }  } +#endif // __APPLE__ +  /* Producer consumer queue safe for multiple producers and multiple consumers.   * T must be default constructable and have operator=.     * The value is copied twice for Consume(T &out) or three times for Consume(), @@ -82,9 +132,9 @@ template <class T> class PCQueue : boost::noncopyable {   private:    // Number of empty spaces in storage_. -  boost::interprocess::interprocess_semaphore empty_; +  Semaphore empty_;    // Number of occupied spaces in storage_. -  boost::interprocess::interprocess_semaphore used_; +  Semaphore used_;    boost::scoped_array<T> storage_; diff --git a/klm/util/pcqueue_test.cc b/klm/util/pcqueue_test.cc new file mode 100644 index 00000000..22ed2c6f --- /dev/null +++ b/klm/util/pcqueue_test.cc @@ -0,0 +1,20 @@ +#include "util/pcqueue.hh" + +#define BOOST_TEST_MODULE PCQueueTest +#include <boost/test/unit_test.hpp> + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(SingleThread) { +  PCQueue<int> queue(10); +  for (int i = 0; i < 10; ++i) { +    queue.Produce(i); +  } +  for (int i = 0; i < 10; ++i) { +    BOOST_CHECK_EQUAL(i, queue.Consume()); +  } +} + +} +} // namespace util diff --git a/klm/util/pool.cc b/klm/util/pool.cc index db72a8ec..429ba158 100644 --- a/klm/util/pool.cc +++ b/klm/util/pool.cc @@ -25,9 +25,7 @@ void Pool::FreeAll() {  }  void *Pool::More(std::size_t size) { -  // Double until we hit 2^21 (2 MB).  Then grow in 2 MB blocks.  -  std::size_t desired_size = static_cast<size_t>(32) << std::min(static_cast<std::size_t>(16), free_list_.size()); -  std::size_t amount = std::max(desired_size, size); +  std::size_t amount = std::max(static_cast<size_t>(32) << free_list_.size(), size);    uint8_t *ret = static_cast<uint8_t*>(MallocOrThrow(amount));    free_list_.push_back(ret);    current_ = ret + size; diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 51a2944d..38524806 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -2,6 +2,7 @@  #define UTIL_PROBING_HASH_TABLE__  #include "util/exception.hh" +#include "util/scoped.hh"  #include <algorithm>  #include <cstddef> @@ -25,6 +26,8 @@ struct IdentityHash {    template <class T> T operator()(T arg) const { return arg; }  }; +template <class EntryT, class HashT, class EqualT> class AutoProbing; +  /* Non-standard hash table   * Buckets must be set at the beginning and must be greater than maximum number   * of elements, else it throws ProbingSizeException. @@ -33,7 +36,6 @@ struct IdentityHash {   * Uses linear probing to find value.   * Only insert and lookup operations.     */ -  template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable {    public:      typedef EntryT Entry; @@ -43,7 +45,6 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry      typedef HashT Hash;      typedef EqualT Equal; -  public:      static uint64_t Size(uint64_t entries, float multiplier) {        uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries)));        return buckets * sizeof(Entry); @@ -69,6 +70,11 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry  #endif      {} +    void Relocate(void *new_base) { +      begin_ = reinterpret_cast<MutableIterator>(new_base); +      end_ = begin_ + buckets_; +    } +      template <class T> MutableIterator Insert(const T &t) {  #ifdef DEBUG        assert(initialized_); @@ -82,7 +88,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry  #ifdef DEBUG        assert(initialized_);  #endif -      for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { +      for (MutableIterator i = Ideal(t);;) {          Key got(i->GetKey());          if (equal_(got, t.GetKey())) { out = i; return true; }          if (equal_(got, invalid_)) { @@ -97,8 +103,6 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry      void FinishedInserting() {} -    void LoadedBinary() {} -      // Don't change anything related to GetKey,        template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {  #ifdef DEBUG @@ -224,6 +228,8 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry      }    private: +    friend class AutoProbing<Entry, Hash, Equal>; +      template <class T> MutableIterator Ideal(const T &t) {        return begin_ + (hash_(t.GetKey()) % buckets_);      } @@ -247,6 +253,74 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry  #endif  }; +// Resizable linear probing hash table.  This owns the memory.   +template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing { +  private: +    typedef ProbingHashTable<EntryT, HashT, EqualT> Backend; +  public: +    typedef EntryT Entry; +    typedef typename Entry::Key Key; +    typedef const Entry *ConstIterator; +    typedef Entry *MutableIterator; +    typedef HashT Hash; +    typedef EqualT Equal; + +    AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : +      allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) { +      threshold_ = initial_size * 1.2; +    } + +    // Assumes that the key is unique.  Multiple insertions won't cause a failure, just inconsistent lookup. +    template <class T> MutableIterator Insert(const T &t) { +      DoubleIfNeeded(); +      return backend_.UncheckedInsert(t); +    } + +    template <class T> bool FindOrInsert(const T &t, MutableIterator &out) { +      DoubleIfNeeded(); +      return backend_.FindOrInsert(t, out); +    } + +    template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { +      return backend_.UnsafeMutableFind(key, out); +    } + +    template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) { +      return backend_.UnsafeMutableMustFind(key); +    } + +    template <class Key> bool Find(const Key key, ConstIterator &out) const { +      return backend_.Find(key, out); +    } + +    template <class Key> ConstIterator MustFind(const Key key) const { +      return backend_.MustFind(key); +    } + +    std::size_t Size() const { +      return backend_.SizeNoSerialization(); +    } + +    void Clear() { +      backend_.Clear(); +    } + +  private: +    void DoubleIfNeeded() { +      if (Size() < threshold_) +        return; +      mem_.call_realloc(backend_.DoubleTo()); +      allocated_ = backend_.DoubleTo(); +      backend_.Double(mem_.get()); +      threshold_ *= 2; +    } + +    std::size_t allocated_; +    util::scoped_malloc mem_; +    Backend backend_; +    std::size_t threshold_; +}; +  } // namespace util  #endif // UTIL_PROBING_HASH_TABLE__ diff --git a/klm/util/proxy_iterator.hh b/klm/util/proxy_iterator.hh index 0ee1716f..a2810a47 100644 --- a/klm/util/proxy_iterator.hh +++ b/klm/util/proxy_iterator.hh @@ -38,8 +38,8 @@ template <class Proxy> class ProxyIterator {      typedef std::random_access_iterator_tag iterator_category;      typedef typename Proxy::value_type value_type;      typedef std::ptrdiff_t difference_type; -    typedef Proxy & reference; -    typedef Proxy * pointer; +    typedef Proxy reference; +    typedef ProxyIterator<Proxy> * pointer;      ProxyIterator() {} @@ -47,10 +47,10 @@ template <class Proxy> class ProxyIterator {      template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {}      explicit ProxyIterator(const Proxy &p) : p_(p) {} -    // p_'s swap does value swapping, but here we want iterator swapping +/*    // p_'s swap does value swapping, but here we want iterator swapping      friend inline void swap(ProxyIterator<Proxy> &first, ProxyIterator<Proxy> &second) {        swap(first.I(), second.I()); -    } +    }*/      // p_'s operator= does value copying, but here we want iterator copying.      S &operator=(const S &other) { @@ -77,8 +77,8 @@ template <class Proxy> class ProxyIterator {      std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } -    Proxy &operator*() { return p_; } -    const Proxy &operator*() const { return p_; } +    Proxy operator*() { return p_; } +    const Proxy operator*() const { return p_; }      Proxy *operator->() { return &p_; }      const Proxy *operator->() const { return &p_; }      Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc index 9cb4a4b9..50450a02 100644 --- a/klm/util/read_compressed_test.cc +++ b/klm/util/read_compressed_test.cc @@ -12,6 +12,23 @@  #include <stdlib.h> +#if defined __MINGW32__ +#include <time.h> +#include <fcntl.h> + +#if !defined mkstemp +// TODO insecure +int mkstemp(char * stemplate) +{ +    char *filename = mktemp(stemplate); +    if (filename == NULL) +        return -1; +    return open(filename, O_RDWR | O_CREAT, 0600); +} +#endif + +#endif // defined +  namespace util {  namespace { diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh index dce8f229..a72657b5 100644 --- a/klm/util/sized_iterator.hh +++ b/klm/util/sized_iterator.hh @@ -36,7 +36,7 @@ class SizedInnerIterator {      void *Data() { return ptr_; }      std::size_t EntrySize() const { return size_; } -    friend inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) { +    friend void swap(SizedInnerIterator &first, SizedInnerIterator &second) {        std::swap(first.ptr_, second.ptr_);        std::swap(first.size_, second.size_);      } @@ -69,17 +69,7 @@ class SizedProxy {      const void *Data() const { return inner_.Data(); }      void *Data() { return inner_.Data(); } -  /** -     // TODO: this (deep) swap was recently added. why? if any std heap sort etc -     // algs are using swap, that's going to be worse performance than using -     // =. i'm not sure why we *want* a deep swap. if C++11 compilers are -     // choosing between move constructor and swap, then we'd better implement a -     // (deep) move constructor. it may also be that this is moot since i made -     // ProxyIterator a reference and added a shallow ProxyIterator swap? (I -     // need Ken or someone competent to judge whether that's correct also. - -     // let me know at graehl@gmail.com -  */ -    friend void swap(SizedProxy &first, SizedProxy &second) { +    friend void swap(SizedProxy first, SizedProxy second) {        std::swap_ranges(            static_cast<char*>(first.inner_.Data()),            static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(), diff --git a/klm/util/sized_iterator_test.cc b/klm/util/sized_iterator_test.cc new file mode 100644 index 00000000..c36bcb2d --- /dev/null +++ b/klm/util/sized_iterator_test.cc @@ -0,0 +1,16 @@ +#include "util/sized_iterator.hh" + +#define BOOST_TEST_MODULE SizedIteratorTest +#include <boost/test/unit_test.hpp> + +namespace util { namespace { + +BOOST_AUTO_TEST_CASE(swap_works) { +  char str[2] = { 0, 1 }; +  SizedProxy first(str, 1), second(str + 1, 1); +  swap(first, second); +  BOOST_CHECK_EQUAL(1, str[0]); +  BOOST_CHECK_EQUAL(0, str[1]); +} + +}} // namespace anonymous util diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 9cf4c7f6..84431db1 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -74,6 +74,12 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {  #endif // old version of ICU  U_NAMESPACE_BEGIN + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { +  int longersize = longer.size(), prefixsize = prefix.size(); +  return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; +} +  #else  #include <algorithm> @@ -212,7 +218,7 @@ class StringPiece {    StringPiece substr(size_type pos, size_type n = npos) const;    static int wordmemcmp(const char* p, const char* p2, size_type N) { -    return memcmp(p, p2, N); +    return std::memcmp(p, p2, N);    }  }; @@ -227,6 +233,10 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {    return !(x == y);  } +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { +  return longer.starts_with(prefix); +} +  #endif // HAVE_ICU undefined  inline bool operator<(const StringPiece& x, const StringPiece& y) { diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index a588c3fc..24eae8fb 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -58,6 +58,23 @@ class AnyCharacter {      StringPiece chars_;  }; +class BoolCharacter { +  public: +    BoolCharacter() {} + +    explicit BoolCharacter(const bool *delimiter) { delimiter_ = delimiter; } + +    StringPiece Find(const StringPiece &in) const { +      for (const char *i = in.data(); i != in.data() + in.size(); ++i) { +        if (delimiter_[static_cast<unsigned char>(*i)]) return StringPiece(i, 1); +      } +      return StringPiece(in.data() + in.size(), 0); +    } + +  private: +    const bool *delimiter_; +}; +  class AnyCharacterLast {    public:      AnyCharacterLast() {} diff --git a/klm/util/usage.cc b/klm/util/usage.cc index 8db375e1..e68d7c7c 100644 --- a/klm/util/usage.cc +++ b/klm/util/usage.cc @@ -10,61 +10,117 @@  #include <string.h>  #include <ctype.h> -#if !defined(_WIN32) && !defined(_WIN64) +#include <time.h> +#if defined(_WIN32) || defined(_WIN64) +// This code lifted from physmem.c in gnulib.  See the copyright statement +// below.   +# define WIN32_LEAN_AND_MEAN +# include <windows.h> +/*  MEMORYSTATUSEX is missing from older windows headers, so define +    a local replacement.  */ +typedef struct +{ +  DWORD dwLength; +  DWORD dwMemoryLoad; +  DWORDLONG ullTotalPhys; +  DWORDLONG ullAvailPhys; +  DWORDLONG ullTotalPageFile; +  DWORDLONG ullAvailPageFile; +  DWORDLONG ullTotalVirtual; +  DWORDLONG ullAvailVirtual; +  DWORDLONG ullAvailExtendedVirtual; +} lMEMORYSTATUSEX; +typedef WINBOOL (WINAPI *PFN_MS_EX) (lMEMORYSTATUSEX*); +#else  #include <sys/resource.h>  #include <sys/time.h> -#include <time.h>  #include <unistd.h>  #endif -namespace util { +#if defined(__MACH__) || defined(__FreeBSD__) || defined(__APPLE__) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif -#if !defined(_WIN32) && !defined(_WIN64) +namespace util {  namespace { -// On Mac OS X, clock_gettime is not implemented. -// CLOCK_MONOTONIC is not defined either. -#ifdef __MACH__ -#define CLOCK_MONOTONIC 0 - -int clock_gettime(int clk_id, struct timespec *tp) { +#if defined(__MACH__) +typedef struct timeval Wall; +Wall GetWall() {    struct timeval tv;    gettimeofday(&tv, NULL); -  tp->tv_sec = tv.tv_sec; -  tp->tv_nsec = tv.tv_usec * 1000; -  return 0; +  return tv;  } -#endif // __MACH__ - -float FloatSec(const struct timeval &tv) { -  return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000.0); +#elif defined(_WIN32) || defined(_WIN64) +typedef time_t Wall; +Wall GetWall() { +  return time(NULL);  } -float FloatSec(const struct timespec &tv) { -  return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_nsec) / 1000000000.0); +#else +typedef struct timespec Wall; +Wall GetWall() { +  Wall ret; +  clock_gettime(CLOCK_MONOTONIC, &ret); +  return ret;  } +#endif -const char *SkipSpaces(const char *at) { -  for (; *at == ' ' || *at == '\t'; ++at) {} -  return at; +// Some of these functions are only used on some platforms. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif +// These all assume first > second +double Subtract(time_t first, time_t second) { +  return difftime(first, second); +} +double DoubleSec(time_t tv) { +  return static_cast<double>(tv); +} +#if !defined(_WIN32) && !defined(_WIN64) +double Subtract(const struct timeval &first, const struct timeval &second) { +  return static_cast<double>(first.tv_sec - second.tv_sec) + static_cast<double>(first.tv_usec - second.tv_usec) / 1000000.0;  } +double Subtract(const struct timespec &first, const struct timespec &second) { +  return static_cast<double>(first.tv_sec - second.tv_sec) + static_cast<double>(first.tv_nsec - second.tv_nsec) / 1000000000.0; +} +double DoubleSec(const struct timeval &tv) { +  return static_cast<double>(tv.tv_sec) + (static_cast<double>(tv.tv_usec) / 1000000.0); +} +double DoubleSec(const struct timespec &tv) { +  return static_cast<double>(tv.tv_sec) + (static_cast<double>(tv.tv_nsec) / 1000000000.0); +} +#endif +#ifdef __clang__ +#pragma clang diagnostic pop +#endif  class RecordStart {    public:      RecordStart() { -      clock_gettime(CLOCK_MONOTONIC, &started_); +      started_ = GetWall();      } -    const struct timespec &Started() const { +    const Wall &Started() const {        return started_;      }    private: -    struct timespec started_; +    Wall started_;  };  const RecordStart kRecordStart; + +const char *SkipSpaces(const char *at) { +  for (; *at == ' ' || *at == '\t'; ++at) {} +  return at; +}  } // namespace -#endif + +double WallTime() { +  return Subtract(GetWall(), kRecordStart.Started()); +}  void PrintUsage(std::ostream &out) {  #if !defined(_WIN32) && !defined(_WIN64) @@ -83,32 +139,89 @@ void PrintUsage(std::ostream &out) {    }    struct rusage usage; -  if (getrusage(RUSAGE_CHILDREN, &usage)) { +  if (getrusage(RUSAGE_SELF, &usage)) {      perror("getrusage");      return;    }    out << "RSSMax:" << usage.ru_maxrss << " kB" << '\t'; -  out << "user:" << FloatSec(usage.ru_utime) << "\tsys:" << FloatSec(usage.ru_stime) << '\t'; -  out << "CPU:" << (FloatSec(usage.ru_utime) + FloatSec(usage.ru_stime)); - -  struct timespec current; -  clock_gettime(CLOCK_MONOTONIC, ¤t); -  out << "\treal:" << (FloatSec(current) - FloatSec(kRecordStart.Started())) << '\n'; +  out << "user:" << DoubleSec(usage.ru_utime) << "\tsys:" << DoubleSec(usage.ru_stime) << '\t'; +  out << "CPU:" << (DoubleSec(usage.ru_utime) + DoubleSec(usage.ru_stime)); +  out << '\t';  #endif + +  out << "real:" << WallTime() << '\n';  } +/* Adapted from physmem.c in gnulib 831b84c59ef413c57a36b67344467d66a8a2ba70 */ +/* Calculate the size of physical memory. + +   Copyright (C) 2000-2001, 2003, 2005-2006, 2009-2013 Free Software +   Foundation, Inc. + +   This program is free software: you can redistribute it and/or modify +   it under the terms of the GNU Lesser General Public License as published by +   the Free Software Foundation; either version 2.1 of the License, or +   (at your option) any later version. + +   This program is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +   GNU Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public License +   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */ + +/* Written by Paul Eggert.  */  uint64_t GuessPhysicalMemory() { +#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) +  { +    long pages = sysconf(_SC_PHYS_PAGES); +    long page_size = sysconf(_SC_PAGESIZE); +    if (pages != -1 && page_size != -1) +      return static_cast<uint64_t>(pages) * static_cast<uint64_t>(page_size); +  } +#endif +#ifdef HW_PHYSMEM +  { /* This works on *bsd and darwin.  */ +    unsigned int physmem; +    size_t len = sizeof physmem; +    static int mib[2] = { CTL_HW, HW_PHYSMEM }; + +    if (sysctl (mib, sizeof(mib) / sizeof(mib[0]), &physmem, &len, NULL, 0) == 0 +        && len == sizeof (physmem)) +      return static_cast<uint64_t>(physmem); +  } +#endif +  #if defined(_WIN32) || defined(_WIN64) -  return 0; -#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) -  long pages = sysconf(_SC_PHYS_PAGES); -  if (pages == -1) return 0; -  long page_size = sysconf(_SC_PAGESIZE); -  if (page_size == -1) return 0; -  return static_cast<uint64_t>(pages) * static_cast<uint64_t>(page_size); -#else -  return 0; +  { /* this works on windows */ +    PFN_MS_EX pfnex; +    HMODULE h = GetModuleHandle ("kernel32.dll"); + +    if (!h) +      return 0; + +    /*  Use GlobalMemoryStatusEx if available.  */ +    if ((pfnex = (PFN_MS_EX) GetProcAddress (h, "GlobalMemoryStatusEx"))) +      { +        lMEMORYSTATUSEX lms_ex; +        lms_ex.dwLength = sizeof lms_ex; +        if (!pfnex (&lms_ex)) +          return 0; +        return lms_ex.ullTotalPhys; +      } + +    /*  Fall back to GlobalMemoryStatus which is always available. +        but returns wrong results for physical memory > 4GB.  */ +    else +      { +        MEMORYSTATUS ms; +        GlobalMemoryStatus (&ms); +        return ms.dwTotalPhys; +      } +  }  #endif +  return 0;  }  namespace { @@ -135,7 +248,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {    if (after == "%") {      uint64_t mem = GuessPhysicalMemory();      UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined."); -    return static_cast<double>(value) * static_cast<double>(mem) / 100.0; +    return static_cast<uint64_t>(static_cast<double>(value) * static_cast<double>(mem) / 100.0);    }    std::string units("bKMGTPEZY"); @@ -144,7 +257,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {    for (std::string::size_type i = 0; i < index; ++i) {      value *= 1024;    } -  return value; +  return static_cast<uint64_t>(value);  }  } // namespace diff --git a/klm/util/usage.hh b/klm/util/usage.hh index e19eda7b..da53b9e3 100644 --- a/klm/util/usage.hh +++ b/klm/util/usage.hh @@ -7,6 +7,9 @@  #include <stdint.h>  namespace util { +// Time in seconds since process started.  Zero on unsupported platforms. +double WallTime(); +  void PrintUsage(std::ostream &to);  // Determine how much physical memory there is.  Return 0 on failure. | 
