diff options
| author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-11-05 15:29:46 +0100 | 
|---|---|---|
| committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-11-05 15:29:46 +0100 | 
| commit | 6f29f345dc06c1a1033475eac1d1340781d1d603 (patch) | |
| tree | 6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /klm/util | |
| parent | b510da2e562c695c90d565eb295c749569c59be8 (diff) | |
| parent | c615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff) | |
merge upstream/master
Diffstat (limited to 'klm/util')
| -rw-r--r-- | klm/util/Jamfile | 10 | ||||
| -rw-r--r-- | klm/util/Makefile.am | 2 | ||||
| -rw-r--r-- | klm/util/ersatz_progress.cc | 10 | ||||
| -rw-r--r-- | klm/util/ersatz_progress.hh | 10 | ||||
| -rw-r--r-- | klm/util/exception.cc | 3 | ||||
| -rw-r--r-- | klm/util/exception.hh | 22 | ||||
| -rw-r--r-- | klm/util/file.cc | 44 | ||||
| -rw-r--r-- | klm/util/file.hh | 8 | ||||
| -rw-r--r-- | klm/util/file_piece.cc | 4 | ||||
| -rw-r--r-- | klm/util/mmap.cc | 16 | ||||
| -rw-r--r-- | klm/util/pool.cc | 35 | ||||
| -rw-r--r-- | klm/util/pool.hh | 45 | ||||
| -rw-r--r-- | klm/util/probing_hash_table.hh | 5 | ||||
| -rw-r--r-- | klm/util/string_piece.cc | 192 | ||||
| -rw-r--r-- | klm/util/string_piece.hh | 5 | ||||
| -rw-r--r-- | klm/util/tokenize_piece.hh | 12 | 
16 files changed, 378 insertions, 45 deletions
diff --git a/klm/util/Jamfile b/klm/util/Jamfile deleted file mode 100644 index 3ee2c2c2..00000000 --- a/klm/util/Jamfile +++ /dev/null @@ -1,10 +0,0 @@ -lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ../..//z : <include>.. : : <include>.. ; - -import testing ; - -unit-test bit_packing_test : bit_packing_test.cc kenutil ../..///boost_unit_test_framework ; -run file_piece_test.cc kenutil ../..///boost_unit_test_framework : : file_piece.cc ; -unit-test joint_sort_test : joint_sort_test.cc kenutil ../..///boost_unit_test_framework ; -unit-test probing_hash_table_test : probing_hash_table_test.cc kenutil ../..///boost_unit_test_framework ; -unit-test sorted_uniform_test : sorted_uniform_test.cc kenutil ../..///boost_unit_test_framework ; -unit-test tokenize_piece_test : tokenize_piece_test.cc kenutil ../..///boost_unit_test_framework ; diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 5ceccf2c..5306850f 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -26,6 +26,8 @@ libklm_util_a_SOURCES = \    file_piece.cc \    mmap.cc \    murmur_hash.cc \ +  pool.cc \ +  string_piece.cc \  	usage.cc  AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. 
diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc index 07b14e26..eb635ad8 100644 --- a/klm/util/ersatz_progress.cc +++ b/klm/util/ersatz_progress.cc @@ -9,16 +9,16 @@ namespace util {  namespace { const unsigned char kWidth = 100; } -ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::size_t>::max()), complete_(next_), out_(NULL) {} +ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), out_(NULL) {}  ErsatzProgress::~ErsatzProgress() {    if (out_) Finished();  } -ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std::string &message)  +ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)     : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {    if (!out_) { -    next_ = std::numeric_limits<std::size_t>::max(); +    next_ = std::numeric_limits<uint64_t>::max();      return;    }    if (!message.empty()) *out_ << message << '\n'; @@ -28,14 +28,14 @@ ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std  void ErsatzProgress::Milestone() {    if (!out_) { current_ = 0; return; }    if (!complete_) return; -  unsigned char stone = std::min(static_cast<std::size_t>(kWidth), (current_ * kWidth) / complete_); +  unsigned char stone = std::min(static_cast<uint64_t>(kWidth), (current_ * kWidth) / complete_);    for (; stones_written_ < stone; ++stones_written_) {      (*out_) << '*';    }    if (stone == kWidth) {      (*out_) << std::endl; -    next_ = std::numeric_limits<std::size_t>::max(); +    next_ = std::numeric_limits<uint64_t>::max();      out_ = NULL;    } else {      next_ = std::max(next_, (stone * complete_) / kWidth); diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh index f709dc51..9909736d 100644 --- a/klm/util/ersatz_progress.hh +++ b/klm/util/ersatz_progress.hh @@ -4,6 +4,8 @@  #include 
<iostream>  #include <string> +#include <stdint.h> +  // Ersatz version of boost::progress so core language model doesn't depend on  // boost.  Also adds option to print nothing.   @@ -14,7 +16,7 @@ class ErsatzProgress {      ErsatzProgress();      // Null means no output.  The null value is useful for passing along the ostream pointer from another caller.    -    explicit ErsatzProgress(std::size_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); +    explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");      ~ErsatzProgress(); @@ -23,12 +25,12 @@ class ErsatzProgress {        return *this;      } -    ErsatzProgress &operator+=(std::size_t amount) { +    ErsatzProgress &operator+=(uint64_t amount) {        if ((current_ += amount) >= next_) Milestone();        return *this;      } -    void Set(std::size_t to) { +    void Set(uint64_t to) {        if ((current_ = to) >= next_) Milestone();        Milestone();      } @@ -40,7 +42,7 @@ class ErsatzProgress {    private:      void Milestone(); -    std::size_t current_, next_, complete_; +    uint64_t current_, next_, complete_;      unsigned char stones_written_;      std::ostream *out_; diff --git a/klm/util/exception.cc b/klm/util/exception.cc index c4f8c04c..3806e6de 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -84,4 +84,7 @@ EndOfFileException::EndOfFileException() throw() {  }  EndOfFileException::~EndOfFileException() throw() {} +OverflowException::OverflowException() throw() {} +OverflowException::~OverflowException() throw() {} +  } // namespace util diff --git a/klm/util/exception.hh b/klm/util/exception.hh index 6d6a37cb..053a850b 100644 --- a/klm/util/exception.hh +++ b/klm/util/exception.hh @@ -2,9 +2,12 @@  #define UTIL_EXCEPTION__  #include <exception> +#include <limits>  #include <sstream>  #include <string> +#include <stdint.h> +  namespace util {  template <class Except, class Data> typename 
Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); @@ -111,6 +114,25 @@ class EndOfFileException : public Exception {      ~EndOfFileException() throw();  }; +class OverflowException : public Exception { +  public: +    OverflowException() throw(); +    ~OverflowException() throw(); +}; + +template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) { +  UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected.  This model is too big for 32-bit code."); +  return value; +} + +template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { +  return value; +} + +inline std::size_t CheckOverflow(uint64_t value) { +  return CheckOverflowInternal<sizeof(std::size_t)>(value); +} +  } // namespace util  #endif // UTIL_EXCEPTION__ diff --git a/klm/util/file.cc b/klm/util/file.cc index 6a3885a7..6bf879ac 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -6,6 +6,7 @@  #include <cstdio>  #include <iostream> +#include <assert.h>  #include <sys/types.h>  #include <sys/stat.h>  #include <fcntl.h> @@ -44,6 +45,16 @@ int OpenReadOrThrow(const char *name) {    return ret;  } +int CreateOrThrow(const char *name) { +  int ret; +#if defined(_WIN32) || defined(_WIN64) +  UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else +  UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif +  return ret; +} +  uint64_t SizeFile(int fd) {  #if defined(_WIN32) || defined(_WIN64)    __int64 ret = _filelengthi64(fd); @@ -101,6 +112,11 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {    }  } +void WriteOrThrow(FILE *to, const void *data, std::size_t size) { +  assert(size); +  if (1 != std::fwrite(data, size, 1, to)) 
UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); +} +  void FSyncOrThrow(int fd) {  // Apparently windows doesn't have fsync?    #if !defined(_WIN32) && !defined(_WIN64) @@ -109,8 +125,13 @@ void FSyncOrThrow(int fd) {  }  namespace { -void InternalSeek(int fd, off_t off, int whence) { +void InternalSeek(int fd, int64_t off, int whence) { +#if defined(_WIN32) || defined(_WIN64) +  UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed"); + +#else    UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); +#endif  }  } // namespace @@ -133,6 +154,12 @@ std::FILE *FDOpenOrThrow(scoped_fd &file) {    return ret;  } +std::FILE *FOpenOrThrow(const char *path, const char *mode) { +  std::FILE *ret; +  UTIL_THROW_IF(!(ret = fopen(path, mode)), util::ErrnoException, "Could not fopen " << path << " for " << mode); +  return ret; +} +  TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {    base_ += "XXXXXX";  } @@ -232,7 +259,9 @@ mkstemp_and_unlink(char *tmpl)      /* Modified for windows and to unlink */      //      fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); -    fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE); +    int flags = _O_RDWR | _O_CREAT | _O_EXCL | _O_BINARY; +    flags |= _O_TEMPORARY; +    fd = _open (tmpl, flags, _S_IREAD | _S_IWRITE);      if (fd >= 0)      {        errno = save_errno; @@ -250,17 +279,18 @@ mkstemp_and_unlink(char *tmpl)  int  mkstemp_and_unlink(char *tmpl) {    int ret = mkstemp(tmpl); -  if (ret == -1) return -1; -  UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); +  if (ret != -1) { +    UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); +  }    return ret;  }  #endif  int TempMaker::Make() const { -  std::string copy(base_); -  copy.push_back(0); +  std::string name(base_); +  name.push_back(0);    int 
ret; -  UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&copy[0])), util::ErrnoException, "Failed to make a temporary based on " << base_); +  UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);    return ret;  } diff --git a/klm/util/file.hh b/klm/util/file.hh index 5c57e2a9..185cb1f3 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -65,7 +65,10 @@ class scoped_FILE {      std::FILE *file_;  }; +// Open for read only.    int OpenReadOrThrow(const char *name); +// Create file if it doesn't exist, truncate if it does.  Opened for write.    +int CreateOrThrow(const char *name);  // Return value for SizeFile when it can't size properly.    const uint64_t kBadSize = (uint64_t)-1; @@ -77,6 +80,7 @@ void ReadOrThrow(int fd, void *to, std::size_t size);  std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);  void WriteOrThrow(int fd, const void *data_void, std::size_t size); +void WriteOrThrow(FILE *to, const void *data, std::size_t size);  void FSyncOrThrow(int fd); @@ -87,12 +91,14 @@ void SeekEnd(int fd);  std::FILE *FDOpenOrThrow(scoped_fd &file); +std::FILE *FOpenOrThrow(const char *path, const char *mode); +  class TempMaker {    public:      explicit TempMaker(const std::string &prefix); +    // These will already be unlinked for you.        
int Make() const; -      std::FILE *MakeFile() const;    private: diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index a205995a..280f438c 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -5,6 +5,8 @@  #include "util/mmap.hh"  #ifdef WIN32  #include <io.h> +#else +#include <unistd.h>  #endif // WIN32  #include <iostream> @@ -27,7 +29,7 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {  #ifdef HAVE_ZLIB  GZException::GZException(gzFile file) {    int num; -  *this << gzerror( file, &num) << " from zlib"; +  *this << gzerror(file, &num) << " from zlib";  }  #endif // HAVE_ZLIB diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 576fd4cc..bc9e3f81 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -19,8 +19,8 @@  #include <windows.h>  #include <io.h>  #else -#include <unistd.h>  #include <sys/mman.h> +#include <unistd.h>  #endif  namespace util { @@ -171,20 +171,6 @@ void *MapZeroedWrite(int fd, std::size_t size) {    return MapOrThrow(size, true, kFileFlags, false, fd, 0);  } -namespace { - -int CreateOrThrow(const char *name) { -  int ret; -#if defined(_WIN32) || defined(_WIN64) -  UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); -#else -  UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); -#endif -  return ret; -} - -} // namespace -  void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {    file.reset(CreateOrThrow(name));    try { diff --git a/klm/util/pool.cc b/klm/util/pool.cc new file mode 100644 index 00000000..2dffd06f --- /dev/null +++ b/klm/util/pool.cc @@ -0,0 +1,35 @@ +#include "util/pool.hh" + +#include <stdlib.h> + +namespace util { + +Pool::Pool() { +  current_ = NULL; +  current_end_ = NULL; +} + +Pool::~Pool() { +  FreeAll(); +} + +void Pool::FreeAll() { +  for (std::vector<void 
*>::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) { +    free(*i); +  } +  free_list_.clear(); +  current_ = NULL; +  current_end_ = NULL; +} + +void *Pool::More(std::size_t size) { +  std::size_t amount = std::max(static_cast<size_t>(32) << free_list_.size(), size); +  uint8_t *ret = static_cast<uint8_t*>(malloc(amount)); +  if (!ret) throw std::bad_alloc(); +  free_list_.push_back(ret); +  current_ = ret + size; +  current_end_ = ret + amount; +  return ret; +} + +} // namespace util diff --git a/klm/util/pool.hh b/klm/util/pool.hh new file mode 100644 index 00000000..72f8a0c8 --- /dev/null +++ b/klm/util/pool.hh @@ -0,0 +1,45 @@ +// Very simple pool.  It can only allocate memory.  And all of the memory it +// allocates must be freed at the same time.   + +#ifndef UTIL_POOL__ +#define UTIL_POOL__ + +#include <vector> + +#include <stdint.h> + +namespace util { + +class Pool { +  public: +    Pool(); + +    ~Pool(); + +    void *Allocate(std::size_t size) { +      void *ret = current_; +      current_ += size; +      if (current_ < current_end_) { +        return ret; +      } else { +        return More(size); +      } +    } + +    void FreeAll(); + +  private: +    void *More(std::size_t size); + +    std::vector<void *> free_list_; + +    uint8_t *current_, *current_end_; + +    // no copying +    Pool(const Pool &); +    Pool &operator=(const Pool &); +};  + +} // namespace util + +#endif // UTIL_POOL__ diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 3354b68e..4a8aff35 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -8,6 +8,7 @@  #include <functional>  #include <assert.h> +#include <stdint.h>  namespace util { @@ -42,8 +43,8 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry      typedef EqualT Equal;    public: -    static std::size_t Size(std::size_t entries, float multiplier) { -      std::size_t buckets = std::max(entries + 1, 
static_cast<std::size_t>(multiplier * static_cast<float>(entries))); +    static uint64_t Size(uint64_t entries, float multiplier) { +      uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries)));        return buckets * sizeof(Entry);      } diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc new file mode 100644 index 00000000..b422cefc --- /dev/null +++ b/klm/util/string_piece.cc @@ -0,0 +1,192 @@ +// Copyright 2004 The RE2 Authors.  All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in string_piece.hh. + +#include "util/string_piece.hh" + +#include <algorithm> + +#include <limits.h> + +#ifndef HAVE_ICU + +typedef StringPiece::size_type size_type; + +void StringPiece::CopyToString(std::string* target) const { +  target->assign(ptr_, length_); +} + +size_type StringPiece::find(const StringPiece& s, size_type pos) const { +  if (length_ < 0 || pos > static_cast<size_type>(length_)) +    return npos; + +  const char* result = std::search(ptr_ + pos, ptr_ + length_, +                                   s.ptr_, s.ptr_ + s.length_); +  const size_type xpos = result - ptr_; +  return xpos + s.length_ <= length_ ? xpos : npos; +} + +size_type StringPiece::find(char c, size_type pos) const { +  if (length_ <= 0 || pos >= static_cast<size_type>(length_)) { +    return npos; +  } +  const char* result = std::find(ptr_ + pos, ptr_ + length_, c); +  return result != ptr_ + length_ ? result - ptr_ : npos; +} + +size_type StringPiece::rfind(const StringPiece& s, size_type pos) const { +  if (length_ < s.length_) return npos; +  const size_t ulen = length_; +  if (s.length_ == 0) return std::min(ulen, pos); + +  const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_; +  const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); +  return result != last ? 
result - ptr_ : npos; +} + +size_type StringPiece::rfind(char c, size_type pos) const { +  if (length_ <= 0) return npos; +  for (int i = std::min(pos, static_cast<size_type>(length_ - 1)); +       i >= 0; --i) { +    if (ptr_[i] == c) { +      return i; +    } +  } +  return npos; +} + +// For each character in characters_wanted, sets the index corresponding +// to the ASCII code of that character to 1 in table.  This is used by +// the find_.*_of methods below to tell whether or not a character is in +// the lookup table in constant time. +// The argument `table' must be an array that is large enough to hold all +// the possible values of an unsigned char.  Thus it should be be declared +// as follows: +//   bool table[UCHAR_MAX + 1] +static inline void BuildLookupTable(const StringPiece& characters_wanted, +                                    bool* table) { +  const size_type length = characters_wanted.length(); +  const char* const data = characters_wanted.data(); +  for (size_type i = 0; i < length; ++i) { +    table[static_cast<unsigned char>(data[i])] = true; +  } +} + +size_type StringPiece::find_first_of(const StringPiece& s, +                                     size_type pos) const { +  if (length_ == 0 || s.length_ == 0) +    return npos; + +  // Avoid the cost of BuildLookupTable() for a single-character search. +  if (s.length_ == 1) +    return find_first_of(s.ptr_[0], pos); + +  bool lookup[UCHAR_MAX + 1] = { false }; +  BuildLookupTable(s, lookup); +  for (size_type i = pos; i < length_; ++i) { +    if (lookup[static_cast<unsigned char>(ptr_[i])]) { +      return i; +    } +  } +  return npos; +} + +size_type StringPiece::find_first_not_of(const StringPiece& s, +                                         size_type pos) const { +  if (length_ == 0) +    return npos; + +  if (s.length_ == 0) +    return 0; + +  // Avoid the cost of BuildLookupTable() for a single-character search. 
+  if (s.length_ == 1) +    return find_first_not_of(s.ptr_[0], pos); + +  bool lookup[UCHAR_MAX + 1] = { false }; +  BuildLookupTable(s, lookup); +  for (size_type i = pos; i < length_; ++i) { +    if (!lookup[static_cast<unsigned char>(ptr_[i])]) { +      return i; +    } +  } +  return npos; +} + +size_type StringPiece::find_first_not_of(char c, size_type pos) const { +  if (length_ == 0) +    return npos; + +  for (; pos < length_; ++pos) { +    if (ptr_[pos] != c) { +      return pos; +    } +  } +  return npos; +} + +size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const { +  if (length_ == 0 || s.length_ == 0) +    return npos; + +  // Avoid the cost of BuildLookupTable() for a single-character search. +  if (s.length_ == 1) +    return find_last_of(s.ptr_[0], pos); + +  bool lookup[UCHAR_MAX + 1] = { false }; +  BuildLookupTable(s, lookup); +  for (size_type i = std::min(pos, length_ - 1); ; --i) { +    if (lookup[static_cast<unsigned char>(ptr_[i])]) +      return i; +    if (i == 0) +      break; +  } +  return npos; +} + +size_type StringPiece::find_last_not_of(const StringPiece& s, +                                        size_type pos) const { +  if (length_ == 0) +    return npos; + +  size_type i = std::min(pos, length_ - 1); +  if (s.length_ == 0) +    return i; + +  // Avoid the cost of BuildLookupTable() for a single-character search. 
+  if (s.length_ == 1) +    return find_last_not_of(s.ptr_[0], pos); + +  bool lookup[UCHAR_MAX + 1] = { false }; +  BuildLookupTable(s, lookup); +  for (; ; --i) { +    if (!lookup[static_cast<unsigned char>(ptr_[i])]) +      return i; +    if (i == 0) +      break; +  } +  return npos; +} + +size_type StringPiece::find_last_not_of(char c, size_type pos) const { +  if (length_ == 0) +    return npos; + +  for (size_type i = std::min(pos, length_ - 1); ; --i) { +    if (ptr_[i] != c) +      return i; +    if (i == 0) +      break; +  } +  return npos; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { +  if (pos > length_) pos = length_; +  if (n > length_ - pos) n = length_ - pos; +  return StringPiece(ptr_ + pos, n); +} + +const size_type StringPiece::npos = size_type(-1); + +#endif // !HAVE_ICU diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 5de053aa..be6a643d 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -85,6 +85,11 @@ U_NAMESPACE_BEGIN  #include <string>  #include <string.h> +#ifdef WIN32 +#undef max +#undef min +#endif +  class StringPiece {   public:    typedef size_t size_type; diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index c7e1c863..4a7f5460 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -54,6 +54,18 @@ class AnyCharacter {      StringPiece chars_;  }; +class AnyCharacterLast { +  public: +    explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {} + +    StringPiece Find(const StringPiece &in) const { +      return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); +    } + +  private: +    StringPiece chars_; +}; +  template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {    public:      TokenIter() {}  | 
