diff options
author | Chris Dyer <redpony@gmail.com> | 2014-06-12 23:10:00 -0400 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2014-06-12 23:10:00 -0400 |
commit | 3955a9607cf0e9722eb8ae3f14d951081df9f820 (patch) | |
tree | e59145e6d456d1ba930aac300d72c35d76689a1e /klm/util | |
parent | 0e2f8d3d049f06afb08b4639c6a28aa5461cdc78 (diff) |
move string_piece to utils/ from inside klm
Diffstat (limited to 'klm/util')
-rw-r--r-- | klm/util/string_piece.cc | 193 | ||||
-rw-r--r-- | klm/util/string_piece.hh | 270 |
2 files changed, 2 insertions, 461 deletions
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc index ec394b96..973091c4 100644 --- a/klm/util/string_piece.cc +++ b/klm/util/string_piece.cc @@ -1,193 +1,2 @@ -// Copyright 2004 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in string_piece.hh. +// this has been moved to utils/ in cdec -#include "util/string_piece.hh" - -#include <algorithm> - -#include <limits.h> - -#ifndef HAVE_ICU - -typedef StringPiece::size_type size_type; - -void StringPiece::CopyToString(std::string* target) const { - target->assign(ptr_, length_); -} - -size_type StringPiece::find(const StringPiece& s, size_type pos) const { - // Not sure why length_ < 0 was here since it's std::size_t. - if (/*length_ < 0 || */pos > static_cast<size_type>(length_)) - return npos; - - const char* result = std::search(ptr_ + pos, ptr_ + length_, - s.ptr_, s.ptr_ + s.length_); - const size_type xpos = result - ptr_; - return xpos + s.length_ <= length_ ? xpos : npos; -} - -size_type StringPiece::find(char c, size_type pos) const { - if (length_ <= 0 || pos >= static_cast<size_type>(length_)) { - return npos; - } - const char* result = std::find(ptr_ + pos, ptr_ + length_, c); - return result != ptr_ + length_ ? result - ptr_ : npos; -} - -size_type StringPiece::rfind(const StringPiece& s, size_type pos) const { - if (length_ < s.length_) return npos; - const size_t ulen = length_; - if (s.length_ == 0) return std::min(ulen, pos); - - const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_; - const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); - return result != last ? result - ptr_ : npos; -} - -size_type StringPiece::rfind(char c, size_type pos) const { - if (length_ <= 0) return npos; - for (int i = std::min(pos, static_cast<size_type>(length_ - 1)); - i >= 0; --i) { - if (ptr_[i] == c) { - return i; - } - } - return npos; -} - -// For each character in characters_wanted, sets the index corresponding -// to the ASCII code of that character to 1 in table. This is used by -// the find_.*_of methods below to tell whether or not a character is in -// the lookup table in constant time. -// The argument `table' must be an array that is large enough to hold all -// the possible values of an unsigned char. Thus it should be be declared -// as follows: -// bool table[UCHAR_MAX + 1] -static inline void BuildLookupTable(const StringPiece& characters_wanted, - bool* table) { - const size_type length = characters_wanted.length(); - const char* const data = characters_wanted.data(); - for (size_type i = 0; i < length; ++i) { - table[static_cast<unsigned char>(data[i])] = true; - } -} - -size_type StringPiece::find_first_of(const StringPiece& s, - size_type pos) const { - if (length_ == 0 || s.length_ == 0) - return npos; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_first_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (size_type i = pos; i < length_; ++i) { - if (lookup[static_cast<unsigned char>(ptr_[i])]) { - return i; - } - } - return npos; -} - -size_type StringPiece::find_first_not_of(const StringPiece& s, - size_type pos) const { - if (length_ == 0) - return npos; - - if (s.length_ == 0) - return 0; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_first_not_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (size_type i = pos; i < length_; ++i) { - if (!lookup[static_cast<unsigned char>(ptr_[i])]) { - return i; - } - } - return npos; -} - -size_type StringPiece::find_first_not_of(char c, size_type pos) const { - if (length_ == 0) - return npos; - - for (; pos < length_; ++pos) { - if (ptr_[pos] != c) { - return pos; - } - } - return npos; -} - -size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const { - if (length_ == 0 || s.length_ == 0) - return npos; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_last_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (size_type i = std::min(pos, length_ - 1); ; --i) { - if (lookup[static_cast<unsigned char>(ptr_[i])]) - return i; - if (i == 0) - break; - } - return npos; -} - -size_type StringPiece::find_last_not_of(const StringPiece& s, - size_type pos) const { - if (length_ == 0) - return npos; - - size_type i = std::min(pos, length_ - 1); - if (s.length_ == 0) - return i; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_last_not_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (; ; --i) { - if (!lookup[static_cast<unsigned char>(ptr_[i])]) - return i; - if (i == 0) - break; - } - return npos; -} - -size_type StringPiece::find_last_not_of(char c, size_type pos) const { - if (length_ == 0) - return npos; - - for (size_type i = std::min(pos, length_ - 1); ; --i) { - if (ptr_[i] != c) - return i; - if (i == 0) - break; - } - return npos; -} - -StringPiece StringPiece::substr(size_type pos, size_type n) const { - if (pos > length_) pos = length_; - if (n > length_ - pos) n = length_ - pos; - return StringPiece(ptr_ + pos, n); -} - -const size_type StringPiece::npos = size_type(-1); - -#endif // !HAVE_ICU diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 84431db1..696ca084 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -1,270 +1,2 @@ -/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If - * you don't use ICU, then this will use the Google implementation from Chrome. - * This has been modified from the original version to let you choose. - */ +#include "utils/string_piece.hh" -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Copied from strings/stringpiece.h with modifications -// -// A string-like object that points to a sized piece of memory. -// -// Functions or methods may use const StringPiece& parameters to accept either -// a "const char*" or a "string" value that will be implicitly converted to -// a StringPiece. The implicit conversion means that it is often appropriate -// to include this .h file in other files rather than forward-declaring -// StringPiece as would be appropriate for most other Google classes. -// -// Systematic usage of StringPiece is encouraged as it will reduce unnecessary -// conversions from "const char*" to "string" and back again. -// - -#ifndef BASE_STRING_PIECE_H__ -#define BASE_STRING_PIECE_H__ - -#include "util/have.hh" - -#include <cstring> -#include <iosfwd> -#include <ostream> - -#ifdef HAVE_ICU -#include <unicode/stringpiece.h> -#include <unicode/uversion.h> - -// Old versions of ICU don't define operator== and operator!=. -#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) -#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. -inline bool operator==(const StringPiece& x, const StringPiece& y) { - if (x.size() != y.size()) - return false; - - return std::memcmp(x.data(), y.data(), x.size()) == 0; -} - -inline bool operator!=(const StringPiece& x, const StringPiece& y) { - return !(x == y); -} -#endif // old version of ICU - -U_NAMESPACE_BEGIN - -inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { - int longersize = longer.size(), prefixsize = prefix.size(); - return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; -} - -#else - -#include <algorithm> -#include <cstddef> -#include <string> -#include <string.h> - -#ifdef WIN32 -#undef max -#undef min -#endif - -class StringPiece { - public: - typedef size_t size_type; - - private: - const char* ptr_; - size_type length_; - - public: - // We provide non-explicit singleton constructors so users can pass - // in a "const char*" or a "string" wherever a "StringPiece" is - // expected. - StringPiece() : ptr_(NULL), length_(0) { } - StringPiece(const char* str) - : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } - StringPiece(const std::string& str) - : ptr_(str.data()), length_(str.size()) { } - StringPiece(const char* offset, size_type len) - : ptr_(offset), length_(len) { } - - // data() may return a pointer to a buffer with embedded NULs, and the - // returned buffer may or may not be null terminated. Therefore it is - // typically a mistake to pass data() to a routine that expects a NUL - // terminated string. - const char* data() const { return ptr_; } - size_type size() const { return length_; } - size_type length() const { return length_; } - bool empty() const { return length_ == 0; } - - void clear() { ptr_ = NULL; length_ = 0; } - void set(const char* data, size_type len) { ptr_ = data; length_ = len; } - void set(const char* str) { - ptr_ = str; - length_ = str ? strlen(str) : 0; - } - void set(const void* data, size_type len) { - ptr_ = reinterpret_cast<const char*>(data); - length_ = len; - } - - char operator[](size_type i) const { return ptr_[i]; } - - void remove_prefix(size_type n) { - ptr_ += n; - length_ -= n; - } - - void remove_suffix(size_type n) { - length_ -= n; - } - - int compare(const StringPiece& x) const { - int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); - if (r == 0) { - if (length_ < x.length_) r = -1; - else if (length_ > x.length_) r = +1; - } - return r; - } - - std::string as_string() const { - // std::string doesn't like to take a NULL pointer even with a 0 size. - return std::string(!empty() ? data() : "", size()); - } - - void CopyToString(std::string* target) const; - void AppendToString(std::string* target) const; - - // Does "this" start with "x" - bool starts_with(const StringPiece& x) const { - return ((length_ >= x.length_) && - (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); - } - - // Does "this" end with "x" - bool ends_with(const StringPiece& x) const { - return ((length_ >= x.length_) && - (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); - } - - // standard STL container boilerplate - typedef char value_type; - typedef const char* pointer; - typedef const char& reference; - typedef const char& const_reference; - typedef ptrdiff_t difference_type; - static const size_type npos; - typedef const char* const_iterator; - typedef const char* iterator; - typedef std::reverse_iterator<const_iterator> const_reverse_iterator; - typedef std::reverse_iterator<iterator> reverse_iterator; - iterator begin() const { return ptr_; } - iterator end() const { return ptr_ + length_; } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(ptr_ + length_); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(ptr_); - } - - size_type max_size() const { return length_; } - size_type capacity() const { return length_; } - - size_type copy(char* buf, size_type n, size_type pos = 0) const; - - size_type find(const StringPiece& s, size_type pos = 0) const; - size_type find(char c, size_type pos = 0) const; - size_type rfind(const StringPiece& s, size_type pos = npos) const; - size_type rfind(char c, size_type pos = npos) const; - - size_type find_first_of(const StringPiece& s, size_type pos = 0) const; - size_type find_first_of(char c, size_type pos = 0) const { - return find(c, pos); - } - size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; - size_type find_first_not_of(char c, size_type pos = 0) const; - size_type find_last_of(const StringPiece& s, size_type pos = npos) const; - size_type find_last_of(char c, size_type pos = npos) const { - return rfind(c, pos); - } - size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; - size_type find_last_not_of(char c, size_type pos = npos) const; - - StringPiece substr(size_type pos, size_type n = npos) const; - - static int wordmemcmp(const char* p, const char* p2, size_type N) { - return std::memcmp(p, p2, N); - } -}; - -inline bool operator==(const StringPiece& x, const StringPiece& y) { - if (x.size() != y.size()) - return false; - - return std::memcmp(x.data(), y.data(), x.size()) == 0; -} - -inline bool operator!=(const StringPiece& x, const StringPiece& y) { - return !(x == y); -} - -inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { - return longer.starts_with(prefix); -} - -#endif // HAVE_ICU undefined - -inline bool operator<(const StringPiece& x, const StringPiece& y) { - const int r = std::memcmp(x.data(), y.data(), - std::min(x.size(), y.size())); - return ((r < 0) || ((r == 0) && (x.size() < y.size()))); -} - -inline bool operator>(const StringPiece& x, const StringPiece& y) { - return y < x; -} - -inline bool operator<=(const StringPiece& x, const StringPiece& y) { - return !(x > y); -} - -inline bool operator>=(const StringPiece& x, const StringPiece& y) { - return !(x < y); -} - -// allow StringPiece to be logged (needed for unit testing). -inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { - return o.write(piece.data(), static_cast<std::streamsize>(piece.size())); -} - -#ifdef HAVE_ICU -U_NAMESPACE_END -using U_NAMESPACE_QUALIFIER StringPiece; -#endif - -#endif // BASE_STRING_PIECE_H__ |