Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-06-15 04:26:08 +0200
committer: Patrick Simianer <p@simianer.de> 2014-06-15 04:26:08 +0200
commit: 7c19e97d2b1216daf95055291c8bbda9b5705dd6 (patch)
tree: 5f76f798c59ea9b67050cf9ee854c13e1fdc4539 /utils
parent: a39aa79b18347e22ef36ebc0da5a7eb220bcb23f (diff)
parent: 3e82edae3cb6a347a3094b82f961082421fe6cb9 (diff)
6 files changed, 503 insertions, 1 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 341fd80b..18495c3a 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -8,9 +8,10 @@ noinst_PROGRAMS = \
   weights_test \
   logval_test \
   small_vector_test \
+  stringlib_test \
   sv_test
 
-TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test
+TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test stringlib_test
 
 noinst_LIBRARIES = libutils.a
 
@@ -54,6 +55,7 @@ libutils_a_SOURCES = \
   star.h \
   static_utoa.h \
   stringlib.h \
+  string_piece.hh \
   tdict.h \
   timing_stats.h \
   utoa.h \
@@ -76,6 +78,7 @@ libutils_a_SOURCES = \
   gzstream.cc \
   filelib.cc \
   stringlib.cc \
+  string_piece.cc \
   sparse_vector.cc \
   timing_stats.cc \
   verbose.cc \
@@ -106,6 +109,8 @@ small_vector_test_SOURCES = small_vector_test.cc
 small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 sv_test_SOURCES = sv_test.cc
 sv_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
+stringlib_test_SOURCES = stringlib_test.cc
+stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 
 ################################################################
 # do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
diff --git a/utils/string_piece.cc b/utils/string_piece.cc
new file mode 100644
index 00000000..0685c05f
--- /dev/null
+++ b/utils/string_piece.cc
@@ -0,0 +1,193 @@
+// Copyright 2004 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in string_piece.hh.
+
+#include "string_piece.hh"
+
+#include <algorithm>
+
+#include <limits.h>
+
+#ifndef HAVE_ICU
+
+typedef StringPiece::size_type size_type;
+
+void StringPiece::CopyToString(std::string* target) const {
+  target->assign(ptr_, length_);
+}
+
+size_type StringPiece::find(const StringPiece& s, size_type pos) const {
+  // Not sure why length_ < 0 was here since it's std::size_t.
+  if (/*length_ < 0 || */pos > static_cast<size_type>(length_))
+    return npos;
+
+  const char* result = std::search(ptr_ + pos, ptr_ + length_,
+                                   s.ptr_, s.ptr_ + s.length_);
+  const size_type xpos = result - ptr_;
+  return xpos + s.length_ <= length_ ? xpos : npos;
+}
+
+size_type StringPiece::find(char c, size_type pos) const {
+  if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
+    return npos;
+  }
+  const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
+  return result != ptr_ + length_ ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
+  if (length_ < s.length_) return npos;
+  const size_t ulen = length_;
+  if (s.length_ == 0) return std::min(ulen, pos);
+
+  const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
+  const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
+  return result != last ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(char c, size_type pos) const {
+  if (length_ <= 0) return npos;
+  for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
+       i >= 0; --i) {
+    if (ptr_[i] == c) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+// For each character in characters_wanted, sets the index corresponding
+// to the ASCII code of that character to 1 in table.  This is used by
+// the find_.*_of methods below to tell whether or not a character is in
+// the lookup table in constant time.
+// The argument `table' must be an array that is large enough to hold all
+// the possible values of an unsigned char.  Thus it should be be declared
+// as follows:
+//   bool table[UCHAR_MAX + 1]
+static inline void BuildLookupTable(const StringPiece& characters_wanted,
+                                    bool* table) {
+  const size_type length = characters_wanted.length();
+  const char* const data = characters_wanted.data();
+  for (size_type i = 0; i < length; ++i) {
+    table[static_cast<unsigned char>(data[i])] = true;
+  }
+}
+
+size_type StringPiece::find_first_of(const StringPiece& s,
+                                     size_type pos) const {
+  if (length_ == 0 || s.length_ == 0)
+    return npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_first_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = pos; i < length_; ++i) {
+    if (lookup[static_cast<unsigned char>(ptr_[i])]) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_first_not_of(const StringPiece& s,
+                                         size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  if (s.length_ == 0)
+    return 0;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_first_not_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = pos; i < length_; ++i) {
+    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_first_not_of(char c, size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  for (; pos < length_; ++pos) {
+    if (ptr_[pos] != c) {
+      return pos;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
+  if (length_ == 0 || s.length_ == 0)
+    return npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_last_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = std::min(pos, length_ - 1); ; --i) {
+    if (lookup[static_cast<unsigned char>(ptr_[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_not_of(const StringPiece& s,
+                                        size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  size_type i = std::min(pos, length_ - 1);
+  if (s.length_ == 0)
+    return i;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_last_not_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (; ; --i) {
+    if (!lookup[static_cast<unsigned char>(ptr_[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_not_of(char c, size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  for (size_type i = std::min(pos, length_ - 1); ; --i) {
+    if (ptr_[i] != c)
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+StringPiece StringPiece::substr(size_type pos, size_type n) const {
+  if (pos > length_) pos = length_;
+  if (n > length_ - pos) n = length_ - pos;
+  return StringPiece(ptr_ + pos, n);
+}
+
+const size_type StringPiece::npos = size_type(-1);
+
+#endif // !HAVE_ICU
diff --git a/utils/string_piece.hh b/utils/string_piece.hh
new file mode 100644
index 00000000..9f21336c
--- /dev/null
+++ b/utils/string_piece.hh
@@ -0,0 +1,272 @@
+/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
+ * you don't use ICU, then this will use the Google implementation from Chrome.
+ * This has been modified from the original version to let you choose.
+ */
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copied from strings/stringpiece.h with modifications
+//
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece.  The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+
+#ifndef BASE_STRING_PIECE_H__
+#define BASE_STRING_PIECE_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <cstring>
+#include <iosfwd>
+#include <ostream>
+
+#ifdef HAVE_ICU
+#include <unicode/stringpiece.h>
+#include <unicode/uversion.h>
+
+// Old versions of ICU don't define operator== and operator!=.
+#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
+#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  if (x.size() != y.size())
+    return false;
+
+  return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  return !(x == y);
+}
+#endif // old version of ICU
+
+U_NAMESPACE_BEGIN
+
+inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
+  int longersize = longer.size(), prefixsize = prefix.size();
+  return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
+}
+
+#else
+
+#include <algorithm>
+#include <cstddef>
+#include <string>
+#include <string.h>
+
+#ifdef WIN32
+#undef max
+#undef min
+#endif
+
+class StringPiece {
+ public:
+  typedef size_t size_type;
+
+ private:
+  const char*   ptr_;
+  size_type     length_;
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+  StringPiece() : ptr_(NULL), length_(0) { }
+  StringPiece(const char* str)
+    : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
+  StringPiece(const std::string& str)
+    : ptr_(str.data()), length_(str.size()) { }
+  StringPiece(const char* offset, size_type len)
+    : ptr_(offset), length_(len) { }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  const char* data() const { return ptr_; }
+  size_type size() const { return length_; }
+  size_type length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  void clear() { ptr_ = NULL; length_ = 0; }
+  void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
+  void set(const char* str) {
+    ptr_ = str;
+    length_ = str ? strlen(str) : 0;
+  }
+  void set(const void* data, size_type len) {
+    ptr_ = reinterpret_cast<const char*>(data);
+    length_ = len;
+  }
+
+  char operator[](size_type i) const { return ptr_[i]; }
+
+  void remove_prefix(size_type n) {
+    ptr_ += n;
+    length_ -= n;
+  }
+
+  void remove_suffix(size_type n) {
+    length_ -= n;
+  }
+
+  int compare(const StringPiece& x) const {
+    int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
+    if (r == 0) {
+      if (length_ < x.length_) r = -1;
+      else if (length_ > x.length_) r = +1;
+    }
+    return r;
+  }
+
+  std::string as_string() const {
+    // std::string doesn't like to take a NULL pointer even with a 0 size.
+    return std::string(!empty() ? data() : "", size());
+  }
+
+  void CopyToString(std::string* target) const;
+  void AppendToString(std::string* target) const;
+
+  // Does "this" start with "x"
+  bool starts_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
+  }
+
+  // Does "this" end with "x"
+  bool ends_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
+  }
+
+  // standard STL container boilerplate
+  typedef char value_type;
+  typedef const char* pointer;
+  typedef const char& reference;
+  typedef const char& const_reference;
+  typedef ptrdiff_t difference_type;
+  static const size_type npos;
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  iterator begin() const { return ptr_; }
+  iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+
+  size_type max_size() const { return length_; }
+  size_type capacity() const { return length_; }
+
+  size_type copy(char* buf, size_type n, size_type pos = 0) const;
+
+  size_type find(const StringPiece& s, size_type pos = 0) const;
+  size_type find(char c, size_type pos = 0) const;
+  size_type rfind(const StringPiece& s, size_type pos = npos) const;
+  size_type rfind(char c, size_type pos = npos) const;
+
+  size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
+  size_type find_first_of(char c, size_type pos = 0) const {
+    return find(c, pos);
+  }
+  size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
+  size_type find_first_not_of(char c, size_type pos = 0) const;
+  size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
+  size_type find_last_of(char c, size_type pos = npos) const {
+    return rfind(c, pos);
+  }
+  size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
+  size_type find_last_not_of(char c, size_type pos = npos) const;
+
+  StringPiece substr(size_type pos, size_type n = npos) const;
+
+  static int wordmemcmp(const char* p, const char* p2, size_type N) {
+    return std::memcmp(p, p2, N);
+  }
+};
+
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  if (x.size() != y.size())
+    return false;
+
+  return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  return !(x == y);
+}
+
+inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
+  return longer.starts_with(prefix);
+}
+
+#endif // HAVE_ICU undefined
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+  const int r = std::memcmp(x.data(), y.data(),
+                                       std::min(x.size(), y.size()));
+  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+  return y < x;
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+  return !(x > y);
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+  return !(x < y);
+}
+
+// allow StringPiece to be logged (needed for unit testing).
+inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+  return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
+}
+
+#ifdef HAVE_ICU
+U_NAMESPACE_END
+using U_NAMESPACE_QUALIFIER StringPiece;
+#endif
+
+#endif  // BASE_STRING_PIECE_H__
diff --git a/utils/stringlib.h b/utils/stringlib.h
index 2fdbfff8..aadec93c 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -16,6 +16,8 @@
 #include <sstream>
 #include <algorithm>
 
+#include "string_piece.hh"
+
 namespace {
 const char c_isspace[]=" \t\n\r\f\v"; // somewhat ridiculous, including characters nobody uses.
 const char common_isspace[]=" \t\n\r"; // even \n\r is borderline, but maybe you pass multiline DOS format text.
@@ -221,6 +223,21 @@ void VisitTokens(char *p,char *const end,F f) {
   }
 }
 
+inline std::vector<StringPiece> TokenizeMultisep(const StringPiece& text, const StringPiece& separator) {
+  std::vector<StringPiece> res;
+  size_t cur = 0;
+  while(cur < text.size()) {
+    const auto n = text.find(separator, cur);
+    if (n == StringPiece::npos) {
+      res.push_back(text.substr(cur));
+      break;
+    }
+    res.push_back(text.substr(cur, n - cur));
+    cur = n + separator.size();
+  }
+  return res;
+}
+
 template <class F>
 void VisitTokens(char *p,F f) {
   VisitTokens(p,p+std::strlen(p),f);
diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc
index f66cdbeb..00b31b4b 100644
--- a/utils/stringlib_test.cc
+++ b/utils/stringlib_test.cc
@@ -14,4 +14,8 @@ int main(int argc, char *argv[]) {
   std::string const& w="verylongword";
   VisitTokens(p,print());
   VisitTokens(w,print());
+  vector<StringPiece> vsp = TokenizeMultisep("field 1 ||| field 2 is a bit longer ||| field 3", " ||| ");
+  for (auto& sp : vsp) {
+    cerr << "'" << sp << "'" << endl;
+  }
 }
diff --git a/utils/sv_test.cc b/utils/sv_test.cc
index c7ac9e54..67df8c57 100644
--- a/utils/sv_test.cc
+++ b/utils/sv_test.cc
@@ -5,6 +5,17 @@
 
 using namespace std;
 
+BOOST_AUTO_TEST_CASE(Dot) {
+  SparseVector<double> x;
+  SparseVector<double> y;
+  x.set_value(1,0.8);
+  y.set_value(1,5);
+  x.set_value(2,-2);
+  y.set_value(2,1);
+  x.set_value(3,80);
+  BOOST_CHECK_CLOSE(x.dot(y), 2.0, 1e-9);
+}
+
 BOOST_AUTO_TEST_CASE(Equality) {
   SparseVector<double> x;
   SparseVector<double> y;
author	Patrick Simianer <p@simianer.de>	2014-06-15 04:26:08 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-15 04:26:08 +0200
commit	7c19e97d2b1216daf95055291c8bbda9b5705dd6 (patch)
tree	5f76f798c59ea9b67050cf9ee854c13e1fdc4539 /utils
parent	a39aa79b18347e22ef36ebc0da5a7eb220bcb23f (diff)
parent	3e82edae3cb6a347a3094b82f961082421fe6cb9 (diff)