From 72ef434a1e4b507d1252e39074f26226f236c83d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 12 Jun 2014 23:10:00 -0400 Subject: move string_piece to utils/ from inside klm --- klm/util/string_piece.cc | 193 +-------------------------------- klm/util/string_piece.hh | 270 +--------------------------------------------- utils/Makefile.am | 7 +- utils/string_piece.cc | 193 +++++++++++++++++++++++++++++++++ utils/string_piece.hh | 272 +++++++++++++++++++++++++++++++++++++++++++++++ utils/stringlib.h | 17 +++ utils/stringlib_test.cc | 4 + utils/sv_test.cc | 11 ++ 8 files changed, 505 insertions(+), 462 deletions(-) create mode 100644 utils/string_piece.cc create mode 100644 utils/string_piece.hh diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc index ec394b96..973091c4 100644 --- a/klm/util/string_piece.cc +++ b/klm/util/string_piece.cc @@ -1,193 +1,2 @@ -// Copyright 2004 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in string_piece.hh. +// this has been moved to utils/ in cdec -#include "util/string_piece.hh" - -#include - -#include - -#ifndef HAVE_ICU - -typedef StringPiece::size_type size_type; - -void StringPiece::CopyToString(std::string* target) const { - target->assign(ptr_, length_); -} - -size_type StringPiece::find(const StringPiece& s, size_type pos) const { - // Not sure why length_ < 0 was here since it's std::size_t. - if (/*length_ < 0 || */pos > static_cast(length_)) - return npos; - - const char* result = std::search(ptr_ + pos, ptr_ + length_, - s.ptr_, s.ptr_ + s.length_); - const size_type xpos = result - ptr_; - return xpos + s.length_ <= length_ ? xpos : npos; -} - -size_type StringPiece::find(char c, size_type pos) const { - if (length_ <= 0 || pos >= static_cast(length_)) { - return npos; - } - const char* result = std::find(ptr_ + pos, ptr_ + length_, c); - return result != ptr_ + length_ ? result - ptr_ : npos; -} - -size_type StringPiece::rfind(const StringPiece& s, size_type pos) const { - if (length_ < s.length_) return npos; - const size_t ulen = length_; - if (s.length_ == 0) return std::min(ulen, pos); - - const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_; - const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); - return result != last ? result - ptr_ : npos; -} - -size_type StringPiece::rfind(char c, size_type pos) const { - if (length_ <= 0) return npos; - for (int i = std::min(pos, static_cast(length_ - 1)); - i >= 0; --i) { - if (ptr_[i] == c) { - return i; - } - } - return npos; -} - -// For each character in characters_wanted, sets the index corresponding -// to the ASCII code of that character to 1 in table. This is used by -// the find_.*_of methods below to tell whether or not a character is in -// the lookup table in constant time. -// The argument `table' must be an array that is large enough to hold all -// the possible values of an unsigned char. Thus it should be be declared -// as follows: -// bool table[UCHAR_MAX + 1] -static inline void BuildLookupTable(const StringPiece& characters_wanted, - bool* table) { - const size_type length = characters_wanted.length(); - const char* const data = characters_wanted.data(); - for (size_type i = 0; i < length; ++i) { - table[static_cast(data[i])] = true; - } -} - -size_type StringPiece::find_first_of(const StringPiece& s, - size_type pos) const { - if (length_ == 0 || s.length_ == 0) - return npos; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_first_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (size_type i = pos; i < length_; ++i) { - if (lookup[static_cast(ptr_[i])]) { - return i; - } - } - return npos; -} - -size_type StringPiece::find_first_not_of(const StringPiece& s, - size_type pos) const { - if (length_ == 0) - return npos; - - if (s.length_ == 0) - return 0; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_first_not_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (size_type i = pos; i < length_; ++i) { - if (!lookup[static_cast(ptr_[i])]) { - return i; - } - } - return npos; -} - -size_type StringPiece::find_first_not_of(char c, size_type pos) const { - if (length_ == 0) - return npos; - - for (; pos < length_; ++pos) { - if (ptr_[pos] != c) { - return pos; - } - } - return npos; -} - -size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const { - if (length_ == 0 || s.length_ == 0) - return npos; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_last_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (size_type i = std::min(pos, length_ - 1); ; --i) { - if (lookup[static_cast(ptr_[i])]) - return i; - if (i == 0) - break; - } - return npos; -} - -size_type StringPiece::find_last_not_of(const StringPiece& s, - size_type pos) const { - if (length_ == 0) - return npos; - - size_type i = std::min(pos, length_ - 1); - if (s.length_ == 0) - return i; - - // Avoid the cost of BuildLookupTable() for a single-character search. - if (s.length_ == 1) - return find_last_not_of(s.ptr_[0], pos); - - bool lookup[UCHAR_MAX + 1] = { false }; - BuildLookupTable(s, lookup); - for (; ; --i) { - if (!lookup[static_cast(ptr_[i])]) - return i; - if (i == 0) - break; - } - return npos; -} - -size_type StringPiece::find_last_not_of(char c, size_type pos) const { - if (length_ == 0) - return npos; - - for (size_type i = std::min(pos, length_ - 1); ; --i) { - if (ptr_[i] != c) - return i; - if (i == 0) - break; - } - return npos; -} - -StringPiece StringPiece::substr(size_type pos, size_type n) const { - if (pos > length_) pos = length_; - if (n > length_ - pos) n = length_ - pos; - return StringPiece(ptr_ + pos, n); -} - -const size_type StringPiece::npos = size_type(-1); - -#endif // !HAVE_ICU diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 84431db1..696ca084 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -1,270 +1,2 @@ -/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If - * you don't use ICU, then this will use the Google implementation from Chrome. - * This has been modified from the original version to let you choose. - */ +#include "utils/string_piece.hh" -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Copied from strings/stringpiece.h with modifications -// -// A string-like object that points to a sized piece of memory. -// -// Functions or methods may use const StringPiece& parameters to accept either -// a "const char*" or a "string" value that will be implicitly converted to -// a StringPiece. The implicit conversion means that it is often appropriate -// to include this .h file in other files rather than forward-declaring -// StringPiece as would be appropriate for most other Google classes. -// -// Systematic usage of StringPiece is encouraged as it will reduce unnecessary -// conversions from "const char*" to "string" and back again. -// - -#ifndef BASE_STRING_PIECE_H__ -#define BASE_STRING_PIECE_H__ - -#include "util/have.hh" - -#include -#include -#include - -#ifdef HAVE_ICU -#include -#include - -// Old versions of ICU don't define operator== and operator!=. -#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) -#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. -inline bool operator==(const StringPiece& x, const StringPiece& y) { - if (x.size() != y.size()) - return false; - - return std::memcmp(x.data(), y.data(), x.size()) == 0; -} - -inline bool operator!=(const StringPiece& x, const StringPiece& y) { - return !(x == y); -} -#endif // old version of ICU - -U_NAMESPACE_BEGIN - -inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { - int longersize = longer.size(), prefixsize = prefix.size(); - return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; -} - -#else - -#include -#include -#include -#include - -#ifdef WIN32 -#undef max -#undef min -#endif - -class StringPiece { - public: - typedef size_t size_type; - - private: - const char* ptr_; - size_type length_; - - public: - // We provide non-explicit singleton constructors so users can pass - // in a "const char*" or a "string" wherever a "StringPiece" is - // expected. - StringPiece() : ptr_(NULL), length_(0) { } - StringPiece(const char* str) - : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } - StringPiece(const std::string& str) - : ptr_(str.data()), length_(str.size()) { } - StringPiece(const char* offset, size_type len) - : ptr_(offset), length_(len) { } - - // data() may return a pointer to a buffer with embedded NULs, and the - // returned buffer may or may not be null terminated. Therefore it is - // typically a mistake to pass data() to a routine that expects a NUL - // terminated string. - const char* data() const { return ptr_; } - size_type size() const { return length_; } - size_type length() const { return length_; } - bool empty() const { return length_ == 0; } - - void clear() { ptr_ = NULL; length_ = 0; } - void set(const char* data, size_type len) { ptr_ = data; length_ = len; } - void set(const char* str) { - ptr_ = str; - length_ = str ? strlen(str) : 0; - } - void set(const void* data, size_type len) { - ptr_ = reinterpret_cast(data); - length_ = len; - } - - char operator[](size_type i) const { return ptr_[i]; } - - void remove_prefix(size_type n) { - ptr_ += n; - length_ -= n; - } - - void remove_suffix(size_type n) { - length_ -= n; - } - - int compare(const StringPiece& x) const { - int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); - if (r == 0) { - if (length_ < x.length_) r = -1; - else if (length_ > x.length_) r = +1; - } - return r; - } - - std::string as_string() const { - // std::string doesn't like to take a NULL pointer even with a 0 size. - return std::string(!empty() ? data() : "", size()); - } - - void CopyToString(std::string* target) const; - void AppendToString(std::string* target) const; - - // Does "this" start with "x" - bool starts_with(const StringPiece& x) const { - return ((length_ >= x.length_) && - (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); - } - - // Does "this" end with "x" - bool ends_with(const StringPiece& x) const { - return ((length_ >= x.length_) && - (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); - } - - // standard STL container boilerplate - typedef char value_type; - typedef const char* pointer; - typedef const char& reference; - typedef const char& const_reference; - typedef ptrdiff_t difference_type; - static const size_type npos; - typedef const char* const_iterator; - typedef const char* iterator; - typedef std::reverse_iterator const_reverse_iterator; - typedef std::reverse_iterator reverse_iterator; - iterator begin() const { return ptr_; } - iterator end() const { return ptr_ + length_; } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(ptr_ + length_); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(ptr_); - } - - size_type max_size() const { return length_; } - size_type capacity() const { return length_; } - - size_type copy(char* buf, size_type n, size_type pos = 0) const; - - size_type find(const StringPiece& s, size_type pos = 0) const; - size_type find(char c, size_type pos = 0) const; - size_type rfind(const StringPiece& s, size_type pos = npos) const; - size_type rfind(char c, size_type pos = npos) const; - - size_type find_first_of(const StringPiece& s, size_type pos = 0) const; - size_type find_first_of(char c, size_type pos = 0) const { - return find(c, pos); - } - size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; - size_type find_first_not_of(char c, size_type pos = 0) const; - size_type find_last_of(const StringPiece& s, size_type pos = npos) const; - size_type find_last_of(char c, size_type pos = npos) const { - return rfind(c, pos); - } - size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; - size_type find_last_not_of(char c, size_type pos = npos) const; - - StringPiece substr(size_type pos, size_type n = npos) const; - - static int wordmemcmp(const char* p, const char* p2, size_type N) { - return std::memcmp(p, p2, N); - } -}; - -inline bool operator==(const StringPiece& x, const StringPiece& y) { - if (x.size() != y.size()) - return false; - - return std::memcmp(x.data(), y.data(), x.size()) == 0; -} - -inline bool operator!=(const StringPiece& x, const StringPiece& y) { - return !(x == y); -} - -inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { - return longer.starts_with(prefix); -} - -#endif // HAVE_ICU undefined - -inline bool operator<(const StringPiece& x, const StringPiece& y) { - const int r = std::memcmp(x.data(), y.data(), - std::min(x.size(), y.size())); - return ((r < 0) || ((r == 0) && (x.size() < y.size()))); -} - -inline bool operator>(const StringPiece& x, const StringPiece& y) { - return y < x; -} - -inline bool operator<=(const StringPiece& x, const StringPiece& y) { - return !(x > y); -} - -inline bool operator>=(const StringPiece& x, const StringPiece& y) { - return !(x < y); -} - -// allow StringPiece to be logged (needed for unit testing). -inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { - return o.write(piece.data(), static_cast(piece.size())); -} - -#ifdef HAVE_ICU -U_NAMESPACE_END -using U_NAMESPACE_QUALIFIER StringPiece; -#endif - -#endif // BASE_STRING_PIECE_H__ diff --git a/utils/Makefile.am b/utils/Makefile.am index 341fd80b..18495c3a 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -8,9 +8,10 @@ noinst_PROGRAMS = \ weights_test \ logval_test \ small_vector_test \ + stringlib_test \ sv_test -TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test +TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test stringlib_test noinst_LIBRARIES = libutils.a @@ -54,6 +55,7 @@ libutils_a_SOURCES = \ star.h \ static_utoa.h \ stringlib.h \ + string_piece.hh \ tdict.h \ timing_stats.h \ utoa.h \ @@ -76,6 +78,7 @@ libutils_a_SOURCES = \ gzstream.cc \ filelib.cc \ stringlib.cc \ + string_piece.cc \ sparse_vector.cc \ timing_stats.cc \ verbose.cc \ @@ -106,6 +109,8 @@ small_vector_test_SOURCES = small_vector_test.cc small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) sv_test_SOURCES = sv_test.cc sv_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) +stringlib_test_SOURCES = stringlib_test.cc +stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) ################################################################ # do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### diff --git a/utils/string_piece.cc b/utils/string_piece.cc new file mode 100644 index 00000000..0685c05f --- /dev/null +++ b/utils/string_piece.cc @@ -0,0 +1,193 @@ +// Copyright 2004 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in string_piece.hh. + +#include "string_piece.hh" + +#include + +#include + +#ifndef HAVE_ICU + +typedef StringPiece::size_type size_type; + +void StringPiece::CopyToString(std::string* target) const { + target->assign(ptr_, length_); +} + +size_type StringPiece::find(const StringPiece& s, size_type pos) const { + // Not sure why length_ < 0 was here since it's std::size_t. + if (/*length_ < 0 || */pos > static_cast(length_)) + return npos; + + const char* result = std::search(ptr_ + pos, ptr_ + length_, + s.ptr_, s.ptr_ + s.length_); + const size_type xpos = result - ptr_; + return xpos + s.length_ <= length_ ? xpos : npos; +} + +size_type StringPiece::find(char c, size_type pos) const { + if (length_ <= 0 || pos >= static_cast(length_)) { + return npos; + } + const char* result = std::find(ptr_ + pos, ptr_ + length_, c); + return result != ptr_ + length_ ? result - ptr_ : npos; +} + +size_type StringPiece::rfind(const StringPiece& s, size_type pos) const { + if (length_ < s.length_) return npos; + const size_t ulen = length_; + if (s.length_ == 0) return std::min(ulen, pos); + + const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_; + const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); + return result != last ? result - ptr_ : npos; +} + +size_type StringPiece::rfind(char c, size_type pos) const { + if (length_ <= 0) return npos; + for (int i = std::min(pos, static_cast(length_ - 1)); + i >= 0; --i) { + if (ptr_[i] == c) { + return i; + } + } + return npos; +} + +// For each character in characters_wanted, sets the index corresponding +// to the ASCII code of that character to 1 in table. This is used by +// the find_.*_of methods below to tell whether or not a character is in +// the lookup table in constant time. +// The argument `table' must be an array that is large enough to hold all +// the possible values of an unsigned char. Thus it should be be declared +// as follows: +// bool table[UCHAR_MAX + 1] +static inline void BuildLookupTable(const StringPiece& characters_wanted, + bool* table) { + const size_type length = characters_wanted.length(); + const char* const data = characters_wanted.data(); + for (size_type i = 0; i < length; ++i) { + table[static_cast(data[i])] = true; + } +} + +size_type StringPiece::find_first_of(const StringPiece& s, + size_type pos) const { + if (length_ == 0 || s.length_ == 0) + return npos; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) + return find_first_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (size_type i = pos; i < length_; ++i) { + if (lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +size_type StringPiece::find_first_not_of(const StringPiece& s, + size_type pos) const { + if (length_ == 0) + return npos; + + if (s.length_ == 0) + return 0; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) + return find_first_not_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (size_type i = pos; i < length_; ++i) { + if (!lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +size_type StringPiece::find_first_not_of(char c, size_type pos) const { + if (length_ == 0) + return npos; + + for (; pos < length_; ++pos) { + if (ptr_[pos] != c) { + return pos; + } + } + return npos; +} + +size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const { + if (length_ == 0 || s.length_ == 0) + return npos; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) + return find_last_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (size_type i = std::min(pos, length_ - 1); ; --i) { + if (lookup[static_cast(ptr_[i])]) + return i; + if (i == 0) + break; + } + return npos; +} + +size_type StringPiece::find_last_not_of(const StringPiece& s, + size_type pos) const { + if (length_ == 0) + return npos; + + size_type i = std::min(pos, length_ - 1); + if (s.length_ == 0) + return i; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) + return find_last_not_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (; ; --i) { + if (!lookup[static_cast(ptr_[i])]) + return i; + if (i == 0) + break; + } + return npos; +} + +size_type StringPiece::find_last_not_of(char c, size_type pos) const { + if (length_ == 0) + return npos; + + for (size_type i = std::min(pos, length_ - 1); ; --i) { + if (ptr_[i] != c) + return i; + if (i == 0) + break; + } + return npos; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { + if (pos > length_) pos = length_; + if (n > length_ - pos) n = length_ - pos; + return StringPiece(ptr_ + pos, n); +} + +const size_type StringPiece::npos = size_type(-1); + +#endif // !HAVE_ICU diff --git a/utils/string_piece.hh b/utils/string_piece.hh new file mode 100644 index 00000000..9f21336c --- /dev/null +++ b/utils/string_piece.hh @@ -0,0 +1,272 @@ +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If + * you don't use ICU, then this will use the Google implementation from Chrome. + * This has been modified from the original version to let you choose. + */ + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Copied from strings/stringpiece.h with modifications +// +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// + +#ifndef BASE_STRING_PIECE_H__ +#define BASE_STRING_PIECE_H__ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include + +#ifdef HAVE_ICU +#include +#include + +// Old versions of ICU don't define operator== and operator!=. +#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) +#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} +#endif // old version of ICU + +U_NAMESPACE_BEGIN + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + int longersize = longer.size(), prefixsize = prefix.size(); + return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; +} + +#else + +#include +#include +#include +#include + +#ifdef WIN32 +#undef max +#undef min +#endif + +class StringPiece { + public: + typedef size_t size_type; + + private: + const char* ptr_; + size_type length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(str.size()) { } + StringPiece(const char* offset, size_type len) + : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. + const char* data() const { return ptr_; } + size_type size() const { return length_; } + size_type length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, size_type len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + length_ = str ? strlen(str) : 0; + } + void set(const void* data, size_type len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](size_type i) const { return ptr_[i]; } + + void remove_prefix(size_type n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(size_type n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + // std::string doesn't like to take a NULL pointer even with a 0 size. + return std::string(!empty() ? data() : "", size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + + size_type max_size() const { return length_; } + size_type capacity() const { return length_; } + + size_type copy(char* buf, size_type n, size_type pos = 0) const; + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + size_type find_first_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_of(char c, size_type pos = 0) const { + return find(c, pos); + } + size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_not_of(char c, size_type pos = 0) const; + size_type find_last_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_of(char c, size_type pos = npos) const { + return rfind(c, pos); + } + size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_not_of(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; + + static int wordmemcmp(const char* p, const char* p2, size_type N) { + return std::memcmp(p, p2, N); + } +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + return longer.starts_with(prefix); +} + +#endif // HAVE_ICU undefined + +inline bool operator<(const StringPiece& x, const StringPiece& y) { + const int r = std::memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, const StringPiece& y) { + return !(x < y); +} + +// allow StringPiece to be logged (needed for unit testing). +inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { + return o.write(piece.data(), static_cast(piece.size())); +} + +#ifdef HAVE_ICU +U_NAMESPACE_END +using U_NAMESPACE_QUALIFIER StringPiece; +#endif + +#endif // BASE_STRING_PIECE_H__ diff --git a/utils/stringlib.h b/utils/stringlib.h index 2fdbfff8..aadec93c 100644 --- a/utils/stringlib.h +++ b/utils/stringlib.h @@ -16,6 +16,8 @@ #include #include +#include "string_piece.hh" + namespace { const char c_isspace[]=" \t\n\r\f\v"; // somewhat ridiculous, including characters nobody uses. const char common_isspace[]=" \t\n\r"; // even \n\r is borderline, but maybe you pass multiline DOS format text. @@ -221,6 +223,21 @@ void VisitTokens(char *p,char *const end,F f) { } } +inline std::vector TokenizeMultisep(const StringPiece& text, const StringPiece& separator) { + std::vector res; + size_t cur = 0; + while(cur < text.size()) { + const auto n = text.find(separator, cur); + if (n == StringPiece::npos) { + res.push_back(text.substr(cur)); + break; + } + res.push_back(text.substr(cur, n - cur)); + cur = n + separator.size(); + } + return res; +} + template void VisitTokens(char *p,F f) { VisitTokens(p,p+std::strlen(p),f); diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc index f66cdbeb..00b31b4b 100644 --- a/utils/stringlib_test.cc +++ b/utils/stringlib_test.cc @@ -14,4 +14,8 @@ int main(int argc, char *argv[]) { std::string const& w="verylongword"; VisitTokens(p,print()); VisitTokens(w,print()); + vector vsp = TokenizeMultisep("field 1 ||| field 2 is a bit longer ||| field 3", " ||| "); + for (auto& sp : vsp) { + cerr << "'" << sp << "'" << endl; + } } diff --git a/utils/sv_test.cc b/utils/sv_test.cc index c7ac9e54..67df8c57 100644 --- a/utils/sv_test.cc +++ b/utils/sv_test.cc @@ -5,6 +5,17 @@ using namespace std; +BOOST_AUTO_TEST_CASE(Dot) { + SparseVector x; + SparseVector y; + x.set_value(1,0.8); + y.set_value(1,5); + x.set_value(2,-2); + y.set_value(2,1); + x.set_value(3,80); + BOOST_CHECK_CLOSE(x.dot(y), 2.0, 1e-9); +} + BOOST_AUTO_TEST_CASE(Equality) { SparseVector x; SparseVector y; -- cgit v1.2.3 From 02184018ff661833dd7b10b463e5cb1ac2b6e98c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 13 Jun 2014 00:23:19 -0400 Subject: tree fragment accepts StringPiece instead of string --- decoder/tree_fragment.cc | 10 +++++----- decoder/tree_fragment.h | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/decoder/tree_fragment.cc b/decoder/tree_fragment.cc index 696c8601..aad0b2c4 100644 --- a/decoder/tree_fragment.cc +++ b/decoder/tree_fragment.cc @@ -8,7 +8,7 @@ using namespace std; namespace cdec { -TreeFragment::TreeFragment(const string& tree, bool allow_frontier_sites) { +TreeFragment::TreeFragment(const StringPiece& tree, bool allow_frontier_sites) { int bal = 0; const unsigned len = tree.size(); unsigned cur = 0; @@ -50,7 +50,7 @@ void TreeFragment::DebugRec(unsigned cur, ostream* out) const { // cp is the character index in the tree // np keeps track of the nodes (nonterminals) that have been built // symp keeps track of the terminal symbols that have been built -void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) { +void TreeFragment::ParseRec(const StringPiece& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) { if (tree[cp] != '(') { cerr << "Expected ( at " << cp << endl; abort(); @@ -79,12 +79,12 @@ void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned // TODO: add a terminal symbol to the current edge const bool is_terminal = tree[t_start] != '[' || (t_end - t_start < 3 || tree[t_end - 1] != ']'); if (is_terminal) { - const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start)); + const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start).as_string()); rhs.push_back(term); // cerr << "T='" << TD::Convert(term) << "'\n"; ++terminals; } else { // frontier site (NT but no recursion) - const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2)) | FRONTIER_BIT; + const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2).as_string()) | FRONTIER_BIT; rhs.push_back(nt); ++frontier_sites; // cerr << "FRONT-NT=[" << TD::Convert(nt & ALL_MASK) << "]\n"; @@ -97,7 +97,7 @@ void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned } // continuent has completed, cp is at ), build node const unsigned j = symp; // span from (i,j) // add an internal non-terminal symbol - const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start)) | RHS_BIT; + const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start).as_string()) | RHS_BIT; nodes[np] = TreeFragmentProduction(nt, rhs); //cerr << np << " production(" << i << "," << j << ")= " << TD::Convert(nt & ALL_MASK) << " -->"; //for (auto& x : rhs) { diff --git a/decoder/tree_fragment.h b/decoder/tree_fragment.h index 79722b5a..8bb7251a 100644 --- a/decoder/tree_fragment.h +++ b/decoder/tree_fragment.h @@ -8,6 +8,8 @@ #include #include +#include "string_piece.hh" + namespace cdec { class BreadthFirstIterator; @@ -52,7 +54,7 @@ class TreeFragment { public: TreeFragment() : frontier_sites(), terminals() {} // (S (NP a (X b) c d) (VP (V foo) (NP (NN bar)))) - explicit TreeFragment(const std::string& tree, bool allow_frontier_sites = false); + explicit TreeFragment(const StringPiece& tree, bool allow_frontier_sites = false); void DebugRec(unsigned cur, std::ostream* out) const; typedef DepthFirstIterator iterator; typedef ptrdiff_t difference_type; @@ -73,7 +75,7 @@ class TreeFragment { // cp is the character index in the tree // np keeps track of the nodes (nonterminals) that have been built // symp keeps track of the terminal symbols that have been built - void ParseRec(const std::string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp); + void ParseRec(const StringPiece& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp); public: unsigned root; unsigned char frontier_sites; -- cgit v1.2.3 From cca7e44e48cda331cd7642315a8ea0ff03b9d816 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 13 Jun 2014 00:25:00 -0400 Subject: stringpiece based parser for tree-to-string rules (foundation for t2t rules) --- decoder/tree2string_translator.cc | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/decoder/tree2string_translator.cc b/decoder/tree2string_translator.cc index b5b47d5d..d61b9aba 100644 --- a/decoder/tree2string_translator.cc +++ b/decoder/tree2string_translator.cc @@ -26,14 +26,21 @@ struct Tree2StringGrammarNode { // use a lexer probably static void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bool has_multiple_states) { string line; + int lc = 0; while(getline(*in, line)) { - size_t pos = line.find("|||"); - assert(pos != string::npos); - assert(pos > 3); - unsigned xc = 0; - while (line[pos - 1] == ' ') { --pos; xc++; } - cdec::TreeFragment rule_src(line.substr(0, pos), true); - // TODO transducer_state should (optionally?) be read from input + ++lc; + std::vector fields = TokenizeMultisep(line, " ||| "); + if (has_multiple_states && fields.size() != 4) { + cerr << "Expected 4 fields in rule file but line " << lc << " is:\n" << line << endl; + abort(); + } + if (!has_multiple_states && fields.size() != 3) { + cerr << "Expected 3 fields in rule file but line " << lc << " is:\n" << line << endl; + abort(); + } + + cdec::TreeFragment rule_src(fields[has_multiple_states ? 1 : 0], true); + // TODO transducer_state should be read from input const unsigned transducer_state = 0; Tree2StringGrammarNode* cur = &root->next[transducer_state]; ostringstream os; @@ -59,12 +66,13 @@ static void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bo else os << TD::Convert(x); } - pos += 3 + xc; - while(line[pos] == ' ') { ++pos; } - os << " ||| " << line.substr(pos); - TRulePtr rule(new TRule(os.str())); - // TODO the transducer_state you end up in after using this rule (for each NT) - // needs to be read and encoded somehow in the rule (for use XXX) + TRulePtr rule; + if (has_multiple_states) { + cerr << "Not implemented...\n"; abort(); // TODO read in states + } else { + os << " ||| " << fields[1] << " ||| " << fields[2]; + rule.reset(new TRule(os.str())); + } cur->rules.push_back(rule); //cerr << "RULE: " << rule->AsString() << "\n\n"; } -- cgit v1.2.3 From b4ce7c0b51d8615abf84c022ec3a981bee3277fe Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 13 Jun 2014 00:26:36 -0400 Subject: new trule datastructure that can store t2s with multiple states (i.e., t2t rules) --- decoder/trule.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/decoder/trule.h b/decoder/trule.h index cc370757..243b0da9 100644 --- a/decoder/trule.h +++ b/decoder/trule.h @@ -144,6 +144,9 @@ class TRule { SparseVector scores_; char arity_; + std::vector ext_states_; // in t2s or t2t translation, this is of length arity_ and + // indicates what state the transducer is in after having processed + // this transduction rule // these attributes are application-specific and should probably be refactored TRulePtr parent_rule_; // usually NULL, except when doing constrained decoding -- cgit v1.2.3