summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
commit6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /klm/util
parentb510da2e562c695c90d565eb295c749569c59be8 (diff)
parentc615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
merge upstream/master
Diffstat (limited to 'klm/util')
-rw-r--r--klm/util/Jamfile10
-rw-r--r--klm/util/Makefile.am2
-rw-r--r--klm/util/ersatz_progress.cc10
-rw-r--r--klm/util/ersatz_progress.hh10
-rw-r--r--klm/util/exception.cc3
-rw-r--r--klm/util/exception.hh22
-rw-r--r--klm/util/file.cc44
-rw-r--r--klm/util/file.hh8
-rw-r--r--klm/util/file_piece.cc4
-rw-r--r--klm/util/mmap.cc16
-rw-r--r--klm/util/pool.cc35
-rw-r--r--klm/util/pool.hh45
-rw-r--r--klm/util/probing_hash_table.hh5
-rw-r--r--klm/util/string_piece.cc192
-rw-r--r--klm/util/string_piece.hh5
-rw-r--r--klm/util/tokenize_piece.hh12
16 files changed, 378 insertions, 45 deletions
diff --git a/klm/util/Jamfile b/klm/util/Jamfile
deleted file mode 100644
index 3ee2c2c2..00000000
--- a/klm/util/Jamfile
+++ /dev/null
@@ -1,10 +0,0 @@
-lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ../..//z : <include>.. : : <include>.. ;
-
-import testing ;
-
-unit-test bit_packing_test : bit_packing_test.cc kenutil ../..///boost_unit_test_framework ;
-run file_piece_test.cc kenutil ../..///boost_unit_test_framework : : file_piece.cc ;
-unit-test joint_sort_test : joint_sort_test.cc kenutil ../..///boost_unit_test_framework ;
-unit-test probing_hash_table_test : probing_hash_table_test.cc kenutil ../..///boost_unit_test_framework ;
-unit-test sorted_uniform_test : sorted_uniform_test.cc kenutil ../..///boost_unit_test_framework ;
-unit-test tokenize_piece_test : tokenize_piece_test.cc kenutil ../..///boost_unit_test_framework ;
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index 5ceccf2c..5306850f 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -26,6 +26,8 @@ libklm_util_a_SOURCES = \
file_piece.cc \
mmap.cc \
murmur_hash.cc \
+ pool.cc \
+ string_piece.cc \
usage.cc
AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I..
diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc
index 07b14e26..eb635ad8 100644
--- a/klm/util/ersatz_progress.cc
+++ b/klm/util/ersatz_progress.cc
@@ -9,16 +9,16 @@ namespace util {
namespace { const unsigned char kWidth = 100; }
-ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::size_t>::max()), complete_(next_), out_(NULL) {}
+ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), out_(NULL) {}
ErsatzProgress::~ErsatzProgress() {
if (out_) Finished();
}
-ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std::string &message)
+ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
: current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
if (!out_) {
- next_ = std::numeric_limits<std::size_t>::max();
+ next_ = std::numeric_limits<uint64_t>::max();
return;
}
if (!message.empty()) *out_ << message << '\n';
@@ -28,14 +28,14 @@ ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std
void ErsatzProgress::Milestone() {
if (!out_) { current_ = 0; return; }
if (!complete_) return;
- unsigned char stone = std::min(static_cast<std::size_t>(kWidth), (current_ * kWidth) / complete_);
+ unsigned char stone = std::min(static_cast<uint64_t>(kWidth), (current_ * kWidth) / complete_);
for (; stones_written_ < stone; ++stones_written_) {
(*out_) << '*';
}
if (stone == kWidth) {
(*out_) << std::endl;
- next_ = std::numeric_limits<std::size_t>::max();
+ next_ = std::numeric_limits<uint64_t>::max();
out_ = NULL;
} else {
next_ = std::max(next_, (stone * complete_) / kWidth);
diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh
index f709dc51..9909736d 100644
--- a/klm/util/ersatz_progress.hh
+++ b/klm/util/ersatz_progress.hh
@@ -4,6 +4,8 @@
#include <iostream>
#include <string>
+#include <stdint.h>
+
// Ersatz version of boost::progress so core language model doesn't depend on
// boost. Also adds option to print nothing.
@@ -14,7 +16,7 @@ class ErsatzProgress {
ErsatzProgress();
// Null means no output. The null value is useful for passing along the ostream pointer from another caller.
- explicit ErsatzProgress(std::size_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
+ explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
~ErsatzProgress();
@@ -23,12 +25,12 @@ class ErsatzProgress {
return *this;
}
- ErsatzProgress &operator+=(std::size_t amount) {
+ ErsatzProgress &operator+=(uint64_t amount) {
if ((current_ += amount) >= next_) Milestone();
return *this;
}
- void Set(std::size_t to) {
+ void Set(uint64_t to) {
if ((current_ = to) >= next_) Milestone();
Milestone();
}
@@ -40,7 +42,7 @@ class ErsatzProgress {
private:
void Milestone();
- std::size_t current_, next_, complete_;
+ uint64_t current_, next_, complete_;
unsigned char stones_written_;
std::ostream *out_;
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index c4f8c04c..3806e6de 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -84,4 +84,7 @@ EndOfFileException::EndOfFileException() throw() {
}
EndOfFileException::~EndOfFileException() throw() {}
+OverflowException::OverflowException() throw() {}
+OverflowException::~OverflowException() throw() {}
+
} // namespace util
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
index 6d6a37cb..053a850b 100644
--- a/klm/util/exception.hh
+++ b/klm/util/exception.hh
@@ -2,9 +2,12 @@
#define UTIL_EXCEPTION__
#include <exception>
+#include <limits>
#include <sstream>
#include <string>
+#include <stdint.h>
+
namespace util {
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
@@ -111,6 +114,25 @@ class EndOfFileException : public Exception {
~EndOfFileException() throw();
};
+class OverflowException : public Exception {
+ public:
+ OverflowException() throw();
+ ~OverflowException() throw();
+};
+
+template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) {
+ UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code.");
+ return value;
+}
+
+template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) {
+ return value;
+}
+
+inline std::size_t CheckOverflow(uint64_t value) {
+ return CheckOverflowInternal<sizeof(std::size_t)>(value);
+}
+
} // namespace util
#endif // UTIL_EXCEPTION__
diff --git a/klm/util/file.cc b/klm/util/file.cc
index 6a3885a7..6bf879ac 100644
--- a/klm/util/file.cc
+++ b/klm/util/file.cc
@@ -6,6 +6,7 @@
#include <cstdio>
#include <iostream>
+#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -44,6 +45,16 @@ int OpenReadOrThrow(const char *name) {
return ret;
}
+int CreateOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
+#else
+ UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
+#endif
+ return ret;
+}
+
uint64_t SizeFile(int fd) {
#if defined(_WIN32) || defined(_WIN64)
__int64 ret = _filelengthi64(fd);
@@ -101,6 +112,11 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
}
}
+void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
+ assert(size);
+ if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size);
+}
+
void FSyncOrThrow(int fd) {
// Apparently windows doesn't have fsync?
#if !defined(_WIN32) && !defined(_WIN64)
@@ -109,8 +125,13 @@ void FSyncOrThrow(int fd) {
}
namespace {
-void InternalSeek(int fd, off_t off, int whence) {
+void InternalSeek(int fd, int64_t off, int whence) {
+#if defined(_WIN32) || defined(_WIN64)
+ UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed");
+
+#else
UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
+#endif
}
} // namespace
@@ -133,6 +154,12 @@ std::FILE *FDOpenOrThrow(scoped_fd &file) {
return ret;
}
+std::FILE *FOpenOrThrow(const char *path, const char *mode) {
+ std::FILE *ret;
+ UTIL_THROW_IF(!(ret = fopen(path, mode)), util::ErrnoException, "Could not fopen " << path << " for " << mode);
+ return ret;
+}
+
TempMaker::TempMaker(const std::string &prefix) : base_(prefix) {
base_ += "XXXXXX";
}
@@ -232,7 +259,9 @@ mkstemp_and_unlink(char *tmpl)
/* Modified for windows and to unlink */
// fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE);
- fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE);
+ int flags = _O_RDWR | _O_CREAT | _O_EXCL | _O_BINARY;
+ flags |= _O_TEMPORARY;
+ fd = _open (tmpl, flags, _S_IREAD | _S_IWRITE);
if (fd >= 0)
{
errno = save_errno;
@@ -250,17 +279,18 @@ mkstemp_and_unlink(char *tmpl)
int
mkstemp_and_unlink(char *tmpl) {
int ret = mkstemp(tmpl);
- if (ret == -1) return -1;
- UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
+ if (ret != -1) {
+ UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl);
+ }
return ret;
}
#endif
int TempMaker::Make() const {
- std::string copy(base_);
- copy.push_back(0);
+ std::string name(base_);
+ name.push_back(0);
int ret;
- UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&copy[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
+ UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), util::ErrnoException, "Failed to make a temporary based on " << base_);
return ret;
}
diff --git a/klm/util/file.hh b/klm/util/file.hh
index 5c57e2a9..185cb1f3 100644
--- a/klm/util/file.hh
+++ b/klm/util/file.hh
@@ -65,7 +65,10 @@ class scoped_FILE {
std::FILE *file_;
};
+// Open for read only.
int OpenReadOrThrow(const char *name);
+// Create file if it doesn't exist, truncate if it does. Opened for write.
+int CreateOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1;
@@ -77,6 +80,7 @@ void ReadOrThrow(int fd, void *to, std::size_t size);
std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);
void WriteOrThrow(int fd, const void *data_void, std::size_t size);
+void WriteOrThrow(FILE *to, const void *data, std::size_t size);
void FSyncOrThrow(int fd);
@@ -87,12 +91,14 @@ void SeekEnd(int fd);
std::FILE *FDOpenOrThrow(scoped_fd &file);
+std::FILE *FOpenOrThrow(const char *path, const char *mode);
+
class TempMaker {
public:
explicit TempMaker(const std::string &prefix);
+ // These will already be unlinked for you.
int Make() const;
-
std::FILE *MakeFile() const;
private:
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index a205995a..280f438c 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -5,6 +5,8 @@
#include "util/mmap.hh"
#ifdef WIN32
#include <io.h>
+#else
+#include <unistd.h>
#endif // WIN32
#include <iostream>
@@ -27,7 +29,7 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
#ifdef HAVE_ZLIB
GZException::GZException(gzFile file) {
int num;
- *this << gzerror( file, &num) << " from zlib";
+ *this << gzerror(file, &num) << " from zlib";
}
#endif // HAVE_ZLIB
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 576fd4cc..bc9e3f81 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -19,8 +19,8 @@
#include <windows.h>
#include <io.h>
#else
-#include <unistd.h>
#include <sys/mman.h>
+#include <unistd.h>
#endif
namespace util {
@@ -171,20 +171,6 @@ void *MapZeroedWrite(int fd, std::size_t size) {
return MapOrThrow(size, true, kFileFlags, false, fd, 0);
}
-namespace {
-
-int CreateOrThrow(const char *name) {
- int ret;
-#if defined(_WIN32) || defined(_WIN64)
- UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
-#else
- UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
-#endif
- return ret;
-}
-
-} // namespace
-
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
file.reset(CreateOrThrow(name));
try {
diff --git a/klm/util/pool.cc b/klm/util/pool.cc
new file mode 100644
index 00000000..2dffd06f
--- /dev/null
+++ b/klm/util/pool.cc
@@ -0,0 +1,35 @@
+#include "util/pool.hh"
+
+#include <stdlib.h>
+
+namespace util {
+
+Pool::Pool() {
+ current_ = NULL;
+ current_end_ = NULL;
+}
+
+Pool::~Pool() {
+ FreeAll();
+}
+
+void Pool::FreeAll() {
+ for (std::vector<void *>::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) {
+ free(*i);
+ }
+ free_list_.clear();
+ current_ = NULL;
+ current_end_ = NULL;
+}
+
+void *Pool::More(std::size_t size) {
+ std::size_t amount = std::max(static_cast<size_t>(32) << free_list_.size(), size);
+ uint8_t *ret = static_cast<uint8_t*>(malloc(amount));
+ if (!ret) throw std::bad_alloc();
+ free_list_.push_back(ret);
+ current_ = ret + size;
+ current_end_ = ret + amount;
+ return ret;
+}
+
+} // namespace util
diff --git a/klm/util/pool.hh b/klm/util/pool.hh
new file mode 100644
index 00000000..72f8a0c8
--- /dev/null
+++ b/klm/util/pool.hh
@@ -0,0 +1,45 @@
+// Very simple pool. It can only allocate memory. And all of the memory it
+// allocates must be freed at the same time.
+
+#ifndef UTIL_POOL__
+#define UTIL_POOL__
+
+#include <vector>
+
+#include <stdint.h>
+
+namespace util {
+
+class Pool {
+ public:
+ Pool();
+
+ ~Pool();
+
+ void *Allocate(std::size_t size) {
+ void *ret = current_;
+ current_ += size;
+ if (current_ < current_end_) {
+ return ret;
+ } else {
+ return More(size);
+ }
+ }
+
+ void FreeAll();
+
+ private:
+ void *More(std::size_t size);
+
+ std::vector<void *> free_list_;
+
+ uint8_t *current_, *current_end_;
+
+ // no copying
+ Pool(const Pool &);
+ Pool &operator=(const Pool &);
+};
+
+} // namespace util
+
+#endif // UTIL_POOL__
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 3354b68e..4a8aff35 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -8,6 +8,7 @@
#include <functional>
#include <assert.h>
+#include <stdint.h>
namespace util {
@@ -42,8 +43,8 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
typedef EqualT Equal;
public:
- static std::size_t Size(std::size_t entries, float multiplier) {
- std::size_t buckets = std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries)));
+ static uint64_t Size(uint64_t entries, float multiplier) {
+ uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries)));
return buckets * sizeof(Entry);
}
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc
new file mode 100644
index 00000000..b422cefc
--- /dev/null
+++ b/klm/util/string_piece.cc
@@ -0,0 +1,192 @@
+// Copyright 2004 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in string_piece.hh.
+
+#include "util/string_piece.hh"
+
+#include <algorithm>
+
+#include <limits.h>
+
+#ifndef HAVE_ICU
+
+typedef StringPiece::size_type size_type;
+
+void StringPiece::CopyToString(std::string* target) const {
+ target->assign(ptr_, length_);
+}
+
+size_type StringPiece::find(const StringPiece& s, size_type pos) const {
+ if (length_ < 0 || pos > static_cast<size_type>(length_))
+ return npos;
+
+ const char* result = std::search(ptr_ + pos, ptr_ + length_,
+ s.ptr_, s.ptr_ + s.length_);
+ const size_type xpos = result - ptr_;
+ return xpos + s.length_ <= length_ ? xpos : npos;
+}
+
+size_type StringPiece::find(char c, size_type pos) const {
+ if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
+ return npos;
+ }
+ const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
+ return result != ptr_ + length_ ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
+ if (length_ < s.length_) return npos;
+ const size_t ulen = length_;
+ if (s.length_ == 0) return std::min(ulen, pos);
+
+ const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
+ const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
+ return result != last ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(char c, size_type pos) const {
+ if (length_ <= 0) return npos;
+ for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
+ i >= 0; --i) {
+ if (ptr_[i] == c) {
+ return i;
+ }
+ }
+ return npos;
+}
+
+// For each character in characters_wanted, sets the index corresponding
+// to the ASCII code of that character to 1 in table. This is used by
+// the find_.*_of methods below to tell whether or not a character is in
+// the lookup table in constant time.
+// The argument `table' must be an array that is large enough to hold all
+// the possible values of an unsigned char. Thus it should be be declared
+// as follows:
+// bool table[UCHAR_MAX + 1]
+static inline void BuildLookupTable(const StringPiece& characters_wanted,
+ bool* table) {
+ const size_type length = characters_wanted.length();
+ const char* const data = characters_wanted.data();
+ for (size_type i = 0; i < length; ++i) {
+ table[static_cast<unsigned char>(data[i])] = true;
+ }
+}
+
+size_type StringPiece::find_first_of(const StringPiece& s,
+ size_type pos) const {
+ if (length_ == 0 || s.length_ == 0)
+ return npos;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_first_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_type i = pos; i < length_; ++i) {
+ if (lookup[static_cast<unsigned char>(ptr_[i])]) {
+ return i;
+ }
+ }
+ return npos;
+}
+
+size_type StringPiece::find_first_not_of(const StringPiece& s,
+ size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ if (s.length_ == 0)
+ return 0;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_first_not_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_type i = pos; i < length_; ++i) {
+ if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
+ return i;
+ }
+ }
+ return npos;
+}
+
+size_type StringPiece::find_first_not_of(char c, size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ for (; pos < length_; ++pos) {
+ if (ptr_[pos] != c) {
+ return pos;
+ }
+ }
+ return npos;
+}
+
+size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
+ if (length_ == 0 || s.length_ == 0)
+ return npos;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_last_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_type i = std::min(pos, length_ - 1); ; --i) {
+ if (lookup[static_cast<unsigned char>(ptr_[i])])
+ return i;
+ if (i == 0)
+ break;
+ }
+ return npos;
+}
+
+size_type StringPiece::find_last_not_of(const StringPiece& s,
+ size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ size_type i = std::min(pos, length_ - 1);
+ if (s.length_ == 0)
+ return i;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_last_not_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (; ; --i) {
+ if (!lookup[static_cast<unsigned char>(ptr_[i])])
+ return i;
+ if (i == 0)
+ break;
+ }
+ return npos;
+}
+
+size_type StringPiece::find_last_not_of(char c, size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ for (size_type i = std::min(pos, length_ - 1); ; --i) {
+ if (ptr_[i] != c)
+ return i;
+ if (i == 0)
+ break;
+ }
+ return npos;
+}
+
+StringPiece StringPiece::substr(size_type pos, size_type n) const {
+ if (pos > length_) pos = length_;
+ if (n > length_ - pos) n = length_ - pos;
+ return StringPiece(ptr_ + pos, n);
+}
+
+const size_type StringPiece::npos = size_type(-1);
+
+#endif // !HAVE_ICU
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 5de053aa..be6a643d 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -85,6 +85,11 @@ U_NAMESPACE_BEGIN
#include <string>
#include <string.h>
+#ifdef WIN32
+#undef max
+#undef min
+#endif
+
class StringPiece {
public:
typedef size_t size_type;
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index c7e1c863..4a7f5460 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -54,6 +54,18 @@ class AnyCharacter {
StringPiece chars_;
};
+class AnyCharacterLast {
+ public:
+ explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}
+
+ StringPiece Find(const StringPiece &in) const {
+ return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
+ }
+
+ private:
+ StringPiece chars_;
+};
+
template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
public:
TokenIter() {}