Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-06-15 04:26:08 +0200
committer: Patrick Simianer <p@simianer.de> 2014-06-15 04:26:08 +0200
commit: 7c19e97d2b1216daf95055291c8bbda9b5705dd6 (patch)
tree: 5f76f798c59ea9b67050cf9ee854c13e1fdc4539
parent: a39aa79b18347e22ef36ebc0da5a7eb220bcb23f (diff)
parent: 3e82edae3cb6a347a3094b82f961082421fe6cb9 (diff)
12 files changed, 538 insertions, 482 deletions
diff --git a/decoder/tree2string_translator.cc b/decoder/tree2string_translator.cc
index b5b47d5d..d61b9aba 100644
--- a/decoder/tree2string_translator.cc
+++ b/decoder/tree2string_translator.cc
@@ -26,14 +26,21 @@ struct Tree2StringGrammarNode {
 // use a lexer probably
 static void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bool has_multiple_states) {
   string line;
+  int lc = 0;
   while(getline(*in, line)) {
-    size_t pos = line.find("|||");
-    assert(pos != string::npos);
-    assert(pos > 3);
-    unsigned xc = 0;
-    while (line[pos - 1] == ' ') { --pos; xc++; }
-    cdec::TreeFragment rule_src(line.substr(0, pos), true);
-    // TODO transducer_state should (optionally?) be read from input
+    ++lc;
+    std::vector<StringPiece> fields = TokenizeMultisep(line, " ||| ");
+    if (has_multiple_states && fields.size() != 4) {
+      cerr << "Expected 4 fields in rule file but line " << lc << " is:\n" << line << endl;
+      abort();
+    }
+    if (!has_multiple_states && fields.size() != 3) {
+      cerr << "Expected 3 fields in rule file but line " << lc << " is:\n" << line << endl;
+      abort();
+    }
+    
+    cdec::TreeFragment rule_src(fields[has_multiple_states ? 1 : 0], true);
+    // TODO transducer_state should be read from input
     const unsigned transducer_state = 0;
     Tree2StringGrammarNode* cur = &root->next[transducer_state];
     ostringstream os;
@@ -59,12 +66,13 @@ static void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bo
       else
         os << TD::Convert(x);
     }
-    pos += 3 + xc;
-    while(line[pos] == ' ') { ++pos; }
-    os << " ||| " << line.substr(pos);
-    TRulePtr rule(new TRule(os.str()));
-    // TODO the transducer_state you end up in after using this rule (for each NT)
-    // needs to be read and encoded somehow in the rule (for use XXX)
+    TRulePtr rule;
+    if (has_multiple_states) {
+      cerr << "Not implemented...\n"; abort(); // TODO read in states
+    } else {
+      os << " ||| " << fields[1] << " ||| " << fields[2];
+      rule.reset(new TRule(os.str()));
+    }
     cur->rules.push_back(rule);
     //cerr << "RULE: " << rule->AsString() << "\n\n";
   }
diff --git a/decoder/tree_fragment.cc b/decoder/tree_fragment.cc
index 696c8601..aad0b2c4 100644
--- a/decoder/tree_fragment.cc
+++ b/decoder/tree_fragment.cc
@@ -8,7 +8,7 @@ using namespace std;
 
 namespace cdec {
 
-TreeFragment::TreeFragment(const string& tree, bool allow_frontier_sites) {
+TreeFragment::TreeFragment(const StringPiece& tree, bool allow_frontier_sites) {
   int bal = 0;
   const unsigned len = tree.size();
   unsigned cur = 0;
@@ -50,7 +50,7 @@ void TreeFragment::DebugRec(unsigned cur, ostream* out) const {
 // cp is the character index in the tree
 // np keeps track of the nodes (nonterminals) that have been built
 // symp keeps track of the terminal symbols that have been built
-void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) {
+void TreeFragment::ParseRec(const StringPiece& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) {
   if (tree[cp] != '(') {
     cerr << "Expected ( at " << cp << endl;
     abort();
@@ -79,12 +79,12 @@ void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned
       // TODO: add a terminal symbol to the current edge
       const bool is_terminal = tree[t_start] != '[' || (t_end - t_start < 3 || tree[t_end - 1] != ']');
       if (is_terminal) {
-        const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start));
+        const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start).as_string());
         rhs.push_back(term);
         // cerr << "T='" << TD::Convert(term) << "'\n";
         ++terminals;
       } else { // frontier site (NT but no recursion)
-        const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2)) | FRONTIER_BIT;
+        const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2).as_string()) | FRONTIER_BIT;
         rhs.push_back(nt);
         ++frontier_sites;
         // cerr << "FRONT-NT=[" << TD::Convert(nt & ALL_MASK) << "]\n";
@@ -97,7 +97,7 @@ void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned
   } // continuent has completed, cp is at ), build node
   const unsigned j = symp; // span from (i,j)
   // add an internal non-terminal symbol
-  const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start)) | RHS_BIT;
+  const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start).as_string()) | RHS_BIT;
   nodes[np] = TreeFragmentProduction(nt, rhs);
   //cerr << np << " production(" << i << "," << j << ")=  " << TD::Convert(nt & ALL_MASK) << " -->";
   //for (auto& x : rhs) {
diff --git a/decoder/tree_fragment.h b/decoder/tree_fragment.h
index 79722b5a..8bb7251a 100644
--- a/decoder/tree_fragment.h
+++ b/decoder/tree_fragment.h
@@ -8,6 +8,8 @@
 #include <cassert>
 #include <cstddef>
 
+#include "string_piece.hh"
+
 namespace cdec {
 
 class BreadthFirstIterator;
@@ -52,7 +54,7 @@ class TreeFragment {
  public:
   TreeFragment() : frontier_sites(), terminals() {}
   // (S (NP a (X b) c d) (VP (V foo) (NP (NN bar))))
-  explicit TreeFragment(const std::string& tree, bool allow_frontier_sites = false);
+  explicit TreeFragment(const StringPiece& tree, bool allow_frontier_sites = false);
   void DebugRec(unsigned cur, std::ostream* out) const;
   typedef DepthFirstIterator iterator;
   typedef ptrdiff_t difference_type;
@@ -73,7 +75,7 @@ class TreeFragment {
   // cp is the character index in the tree
   // np keeps track of the nodes (nonterminals) that have been built
   // symp keeps track of the terminal symbols that have been built
-  void ParseRec(const std::string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp);
+  void ParseRec(const StringPiece& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp);
  public:
   unsigned root;
   unsigned char frontier_sites;
diff --git a/decoder/trule.h b/decoder/trule.h
index cc370757..243b0da9 100644
--- a/decoder/trule.h
+++ b/decoder/trule.h
@@ -144,6 +144,9 @@ class TRule {
   SparseVector<double> scores_;
 
   char arity_;
+  std::vector<WordID> ext_states_; // in t2s or t2t translation, this is of length arity_ and
+                                   // indicates what state the transducer is in after having processed
+                                   // this transduction rule
 
   // these attributes are application-specific and should probably be refactored
   TRulePtr parent_rule_;  // usually NULL, except when doing constrained decoding
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc
index ec394b96..973091c4 100644
--- a/klm/util/string_piece.cc
+++ b/klm/util/string_piece.cc
@@ -1,193 +1,2 @@
-// Copyright 2004 The RE2 Authors.  All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in string_piece.hh.
+// this has been moved to utils/ in cdec
 
-#include "util/string_piece.hh"
-
-#include <algorithm>
-
-#include <limits.h>
-
-#ifndef HAVE_ICU
-
-typedef StringPiece::size_type size_type;
-
-void StringPiece::CopyToString(std::string* target) const {
-  target->assign(ptr_, length_);
-}
-
-size_type StringPiece::find(const StringPiece& s, size_type pos) const {
-  // Not sure why length_ < 0 was here since it's std::size_t.
-  if (/*length_ < 0 || */pos > static_cast<size_type>(length_))
-    return npos;
-
-  const char* result = std::search(ptr_ + pos, ptr_ + length_,
-                                   s.ptr_, s.ptr_ + s.length_);
-  const size_type xpos = result - ptr_;
-  return xpos + s.length_ <= length_ ? xpos : npos;
-}
-
-size_type StringPiece::find(char c, size_type pos) const {
-  if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
-    return npos;
-  }
-  const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
-  return result != ptr_ + length_ ? result - ptr_ : npos;
-}
-
-size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
-  if (length_ < s.length_) return npos;
-  const size_t ulen = length_;
-  if (s.length_ == 0) return std::min(ulen, pos);
-
-  const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
-  const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
-  return result != last ? result - ptr_ : npos;
-}
-
-size_type StringPiece::rfind(char c, size_type pos) const {
-  if (length_ <= 0) return npos;
-  for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
-       i >= 0; --i) {
-    if (ptr_[i] == c) {
-      return i;
-    }
-  }
-  return npos;
-}
-
-// For each character in characters_wanted, sets the index corresponding
-// to the ASCII code of that character to 1 in table.  This is used by
-// the find_.*_of methods below to tell whether or not a character is in
-// the lookup table in constant time.
-// The argument `table' must be an array that is large enough to hold all
-// the possible values of an unsigned char.  Thus it should be be declared
-// as follows:
-//   bool table[UCHAR_MAX + 1]
-static inline void BuildLookupTable(const StringPiece& characters_wanted,
-                                    bool* table) {
-  const size_type length = characters_wanted.length();
-  const char* const data = characters_wanted.data();
-  for (size_type i = 0; i < length; ++i) {
-    table[static_cast<unsigned char>(data[i])] = true;
-  }
-}
-
-size_type StringPiece::find_first_of(const StringPiece& s,
-                                     size_type pos) const {
-  if (length_ == 0 || s.length_ == 0)
-    return npos;
-
-  // Avoid the cost of BuildLookupTable() for a single-character search.
-  if (s.length_ == 1)
-    return find_first_of(s.ptr_[0], pos);
-
-  bool lookup[UCHAR_MAX + 1] = { false };
-  BuildLookupTable(s, lookup);
-  for (size_type i = pos; i < length_; ++i) {
-    if (lookup[static_cast<unsigned char>(ptr_[i])]) {
-      return i;
-    }
-  }
-  return npos;
-}
-
-size_type StringPiece::find_first_not_of(const StringPiece& s,
-                                         size_type pos) const {
-  if (length_ == 0)
-    return npos;
-
-  if (s.length_ == 0)
-    return 0;
-
-  // Avoid the cost of BuildLookupTable() for a single-character search.
-  if (s.length_ == 1)
-    return find_first_not_of(s.ptr_[0], pos);
-
-  bool lookup[UCHAR_MAX + 1] = { false };
-  BuildLookupTable(s, lookup);
-  for (size_type i = pos; i < length_; ++i) {
-    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
-      return i;
-    }
-  }
-  return npos;
-}
-
-size_type StringPiece::find_first_not_of(char c, size_type pos) const {
-  if (length_ == 0)
-    return npos;
-
-  for (; pos < length_; ++pos) {
-    if (ptr_[pos] != c) {
-      return pos;
-    }
-  }
-  return npos;
-}
-
-size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
-  if (length_ == 0 || s.length_ == 0)
-    return npos;
-
-  // Avoid the cost of BuildLookupTable() for a single-character search.
-  if (s.length_ == 1)
-    return find_last_of(s.ptr_[0], pos);
-
-  bool lookup[UCHAR_MAX + 1] = { false };
-  BuildLookupTable(s, lookup);
-  for (size_type i = std::min(pos, length_ - 1); ; --i) {
-    if (lookup[static_cast<unsigned char>(ptr_[i])])
-      return i;
-    if (i == 0)
-      break;
-  }
-  return npos;
-}
-
-size_type StringPiece::find_last_not_of(const StringPiece& s,
-                                        size_type pos) const {
-  if (length_ == 0)
-    return npos;
-
-  size_type i = std::min(pos, length_ - 1);
-  if (s.length_ == 0)
-    return i;
-
-  // Avoid the cost of BuildLookupTable() for a single-character search.
-  if (s.length_ == 1)
-    return find_last_not_of(s.ptr_[0], pos);
-
-  bool lookup[UCHAR_MAX + 1] = { false };
-  BuildLookupTable(s, lookup);
-  for (; ; --i) {
-    if (!lookup[static_cast<unsigned char>(ptr_[i])])
-      return i;
-    if (i == 0)
-      break;
-  }
-  return npos;
-}
-
-size_type StringPiece::find_last_not_of(char c, size_type pos) const {
-  if (length_ == 0)
-    return npos;
-
-  for (size_type i = std::min(pos, length_ - 1); ; --i) {
-    if (ptr_[i] != c)
-      return i;
-    if (i == 0)
-      break;
-  }
-  return npos;
-}
-
-StringPiece StringPiece::substr(size_type pos, size_type n) const {
-  if (pos > length_) pos = length_;
-  if (n > length_ - pos) n = length_ - pos;
-  return StringPiece(ptr_ + pos, n);
-}
-
-const size_type StringPiece::npos = size_type(-1);
-
-#endif // !HAVE_ICU
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 84431db1..696ca084 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -1,270 +1,2 @@
-/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
- * you don't use ICU, then this will use the Google implementation from Chrome.
- * This has been modified from the original version to let you choose.
- */
+#include "utils/string_piece.hh"
 
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//    * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//    * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//    * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// Copied from strings/stringpiece.h with modifications
-//
-// A string-like object that points to a sized piece of memory.
-//
-// Functions or methods may use const StringPiece& parameters to accept either
-// a "const char*" or a "string" value that will be implicitly converted to
-// a StringPiece.  The implicit conversion means that it is often appropriate
-// to include this .h file in other files rather than forward-declaring
-// StringPiece as would be appropriate for most other Google classes.
-//
-// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
-// conversions from "const char*" to "string" and back again.
-//
-
-#ifndef BASE_STRING_PIECE_H__
-#define BASE_STRING_PIECE_H__
-
-#include "util/have.hh"
-
-#include <cstring>
-#include <iosfwd>
-#include <ostream>
-
-#ifdef HAVE_ICU
-#include <unicode/stringpiece.h>
-#include <unicode/uversion.h>
-
-// Old versions of ICU don't define operator== and operator!=.
-#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
-#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.
-inline bool operator==(const StringPiece& x, const StringPiece& y) {
-  if (x.size() != y.size())
-    return false;
-
-  return std::memcmp(x.data(), y.data(), x.size()) == 0;
-}
-
-inline bool operator!=(const StringPiece& x, const StringPiece& y) {
-  return !(x == y);
-}
-#endif // old version of ICU
-
-U_NAMESPACE_BEGIN
-
-inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
-  int longersize = longer.size(), prefixsize = prefix.size();
-  return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
-}
-
-#else
-
-#include <algorithm>
-#include <cstddef>
-#include <string>
-#include <string.h>
-
-#ifdef WIN32
-#undef max
-#undef min
-#endif
-
-class StringPiece {
- public:
-  typedef size_t size_type;
-
- private:
-  const char*   ptr_;
-  size_type     length_;
-
- public:
-  // We provide non-explicit singleton constructors so users can pass
-  // in a "const char*" or a "string" wherever a "StringPiece" is
-  // expected.
-  StringPiece() : ptr_(NULL), length_(0) { }
-  StringPiece(const char* str)
-    : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
-  StringPiece(const std::string& str)
-    : ptr_(str.data()), length_(str.size()) { }
-  StringPiece(const char* offset, size_type len)
-    : ptr_(offset), length_(len) { }
-
-  // data() may return a pointer to a buffer with embedded NULs, and the
-  // returned buffer may or may not be null terminated.  Therefore it is
-  // typically a mistake to pass data() to a routine that expects a NUL
-  // terminated string.
-  const char* data() const { return ptr_; }
-  size_type size() const { return length_; }
-  size_type length() const { return length_; }
-  bool empty() const { return length_ == 0; }
-
-  void clear() { ptr_ = NULL; length_ = 0; }
-  void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
-  void set(const char* str) {
-    ptr_ = str;
-    length_ = str ? strlen(str) : 0;
-  }
-  void set(const void* data, size_type len) {
-    ptr_ = reinterpret_cast<const char*>(data);
-    length_ = len;
-  }
-
-  char operator[](size_type i) const { return ptr_[i]; }
-
-  void remove_prefix(size_type n) {
-    ptr_ += n;
-    length_ -= n;
-  }
-
-  void remove_suffix(size_type n) {
-    length_ -= n;
-  }
-
-  int compare(const StringPiece& x) const {
-    int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
-    if (r == 0) {
-      if (length_ < x.length_) r = -1;
-      else if (length_ > x.length_) r = +1;
-    }
-    return r;
-  }
-
-  std::string as_string() const {
-    // std::string doesn't like to take a NULL pointer even with a 0 size.
-    return std::string(!empty() ? data() : "", size());
-  }
-
-  void CopyToString(std::string* target) const;
-  void AppendToString(std::string* target) const;
-
-  // Does "this" start with "x"
-  bool starts_with(const StringPiece& x) const {
-    return ((length_ >= x.length_) &&
-            (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
-  }
-
-  // Does "this" end with "x"
-  bool ends_with(const StringPiece& x) const {
-    return ((length_ >= x.length_) &&
-            (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
-  }
-
-  // standard STL container boilerplate
-  typedef char value_type;
-  typedef const char* pointer;
-  typedef const char& reference;
-  typedef const char& const_reference;
-  typedef ptrdiff_t difference_type;
-  static const size_type npos;
-  typedef const char* const_iterator;
-  typedef const char* iterator;
-  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
-  typedef std::reverse_iterator<iterator> reverse_iterator;
-  iterator begin() const { return ptr_; }
-  iterator end() const { return ptr_ + length_; }
-  const_reverse_iterator rbegin() const {
-    return const_reverse_iterator(ptr_ + length_);
-  }
-  const_reverse_iterator rend() const {
-    return const_reverse_iterator(ptr_);
-  }
-
-  size_type max_size() const { return length_; }
-  size_type capacity() const { return length_; }
-
-  size_type copy(char* buf, size_type n, size_type pos = 0) const;
-
-  size_type find(const StringPiece& s, size_type pos = 0) const;
-  size_type find(char c, size_type pos = 0) const;
-  size_type rfind(const StringPiece& s, size_type pos = npos) const;
-  size_type rfind(char c, size_type pos = npos) const;
-
-  size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
-  size_type find_first_of(char c, size_type pos = 0) const {
-    return find(c, pos);
-  }
-  size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
-  size_type find_first_not_of(char c, size_type pos = 0) const;
-  size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
-  size_type find_last_of(char c, size_type pos = npos) const {
-    return rfind(c, pos);
-  }
-  size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
-  size_type find_last_not_of(char c, size_type pos = npos) const;
-
-  StringPiece substr(size_type pos, size_type n = npos) const;
-
-  static int wordmemcmp(const char* p, const char* p2, size_type N) {
-    return std::memcmp(p, p2, N);
-  }
-};
-
-inline bool operator==(const StringPiece& x, const StringPiece& y) {
-  if (x.size() != y.size())
-    return false;
-
-  return std::memcmp(x.data(), y.data(), x.size()) == 0;
-}
-
-inline bool operator!=(const StringPiece& x, const StringPiece& y) {
-  return !(x == y);
-}
-
-inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
-  return longer.starts_with(prefix);
-}
-
-#endif // HAVE_ICU undefined
-
-inline bool operator<(const StringPiece& x, const StringPiece& y) {
-  const int r = std::memcmp(x.data(), y.data(),
-                                       std::min(x.size(), y.size()));
-  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
-}
-
-inline bool operator>(const StringPiece& x, const StringPiece& y) {
-  return y < x;
-}
-
-inline bool operator<=(const StringPiece& x, const StringPiece& y) {
-  return !(x > y);
-}
-
-inline bool operator>=(const StringPiece& x, const StringPiece& y) {
-  return !(x < y);
-}
-
-// allow StringPiece to be logged (needed for unit testing).
-inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
-  return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
-}
-
-#ifdef HAVE_ICU
-U_NAMESPACE_END
-using U_NAMESPACE_QUALIFIER StringPiece;
-#endif
-
-#endif  // BASE_STRING_PIECE_H__
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 341fd80b..18495c3a 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -8,9 +8,10 @@ noinst_PROGRAMS = \
   weights_test \
   logval_test \
   small_vector_test \
+  stringlib_test \
   sv_test
 
-TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test
+TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test stringlib_test
 
 noinst_LIBRARIES = libutils.a
 
@@ -54,6 +55,7 @@ libutils_a_SOURCES = \
   star.h \
   static_utoa.h \
   stringlib.h \
+  string_piece.hh \
   tdict.h \
   timing_stats.h \
   utoa.h \
@@ -76,6 +78,7 @@ libutils_a_SOURCES = \
   gzstream.cc \
   filelib.cc \
   stringlib.cc \
+  string_piece.cc \
   sparse_vector.cc \
   timing_stats.cc \
   verbose.cc \
@@ -106,6 +109,8 @@ small_vector_test_SOURCES = small_vector_test.cc
 small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 sv_test_SOURCES = sv_test.cc
 sv_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
+stringlib_test_SOURCES = stringlib_test.cc
+stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
 
 ################################################################
 # do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
diff --git a/utils/string_piece.cc b/utils/string_piece.cc
new file mode 100644
index 00000000..0685c05f
--- /dev/null
+++ b/utils/string_piece.cc
@@ -0,0 +1,193 @@
+// Copyright 2004 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in string_piece.hh.
+
+#include "string_piece.hh"
+
+#include <algorithm>
+
+#include <limits.h>
+
+#ifndef HAVE_ICU
+
+typedef StringPiece::size_type size_type;
+
+void StringPiece::CopyToString(std::string* target) const {
+  target->assign(ptr_, length_);
+}
+
+size_type StringPiece::find(const StringPiece& s, size_type pos) const {
+  // Not sure why length_ < 0 was here since it's std::size_t.
+  if (/*length_ < 0 || */pos > static_cast<size_type>(length_))
+    return npos;
+
+  const char* result = std::search(ptr_ + pos, ptr_ + length_,
+                                   s.ptr_, s.ptr_ + s.length_);
+  const size_type xpos = result - ptr_;
+  return xpos + s.length_ <= length_ ? xpos : npos;
+}
+
+size_type StringPiece::find(char c, size_type pos) const {
+  if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
+    return npos;
+  }
+  const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
+  return result != ptr_ + length_ ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
+  if (length_ < s.length_) return npos;
+  const size_t ulen = length_;
+  if (s.length_ == 0) return std::min(ulen, pos);
+
+  const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
+  const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
+  return result != last ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(char c, size_type pos) const {
+  if (length_ <= 0) return npos;
+  for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
+       i >= 0; --i) {
+    if (ptr_[i] == c) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+// For each character in characters_wanted, sets the index corresponding
+// to the ASCII code of that character to 1 in table.  This is used by
+// the find_.*_of methods below to tell whether or not a character is in
+// the lookup table in constant time.
+// The argument `table' must be an array that is large enough to hold all
+// the possible values of an unsigned char.  Thus it should be be declared
+// as follows:
+//   bool table[UCHAR_MAX + 1]
+static inline void BuildLookupTable(const StringPiece& characters_wanted,
+                                    bool* table) {
+  const size_type length = characters_wanted.length();
+  const char* const data = characters_wanted.data();
+  for (size_type i = 0; i < length; ++i) {
+    table[static_cast<unsigned char>(data[i])] = true;
+  }
+}
+
+size_type StringPiece::find_first_of(const StringPiece& s,
+                                     size_type pos) const {
+  if (length_ == 0 || s.length_ == 0)
+    return npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_first_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = pos; i < length_; ++i) {
+    if (lookup[static_cast<unsigned char>(ptr_[i])]) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_first_not_of(const StringPiece& s,
+                                         size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  if (s.length_ == 0)
+    return 0;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_first_not_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = pos; i < length_; ++i) {
+    if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
+      return i;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_first_not_of(char c, size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  for (; pos < length_; ++pos) {
+    if (ptr_[pos] != c) {
+      return pos;
+    }
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
+  if (length_ == 0 || s.length_ == 0)
+    return npos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_last_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_type i = std::min(pos, length_ - 1); ; --i) {
+    if (lookup[static_cast<unsigned char>(ptr_[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_not_of(const StringPiece& s,
+                                        size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  size_type i = std::min(pos, length_ - 1);
+  if (s.length_ == 0)
+    return i;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.length_ == 1)
+    return find_last_not_of(s.ptr_[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (; ; --i) {
+    if (!lookup[static_cast<unsigned char>(ptr_[i])])
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+size_type StringPiece::find_last_not_of(char c, size_type pos) const {
+  if (length_ == 0)
+    return npos;
+
+  for (size_type i = std::min(pos, length_ - 1); ; --i) {
+    if (ptr_[i] != c)
+      return i;
+    if (i == 0)
+      break;
+  }
+  return npos;
+}
+
+StringPiece StringPiece::substr(size_type pos, size_type n) const {
+  if (pos > length_) pos = length_;
+  if (n > length_ - pos) n = length_ - pos;
+  return StringPiece(ptr_ + pos, n);
+}
+
+const size_type StringPiece::npos = size_type(-1);
+
+#endif // !HAVE_ICU
diff --git a/utils/string_piece.hh b/utils/string_piece.hh
new file mode 100644
index 00000000..9f21336c
--- /dev/null
+++ b/utils/string_piece.hh
@@ -0,0 +1,272 @@
+/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
+ * you don't use ICU, then this will use the Google implementation from Chrome.
+ * This has been modified from the original version to let you choose.
+ */
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copied from strings/stringpiece.h with modifications
+//
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece.  The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+
+#ifndef BASE_STRING_PIECE_H__
+#define BASE_STRING_PIECE_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <cstring>
+#include <iosfwd>
+#include <ostream>
+
+#ifdef HAVE_ICU
+#include <unicode/stringpiece.h>
+#include <unicode/uversion.h>
+
+// Old versions of ICU don't define operator== and operator!=.
+#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
+#warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  if (x.size() != y.size())
+    return false;
+
+  return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  return !(x == y);
+}
+#endif // old version of ICU
+
+U_NAMESPACE_BEGIN
+
+inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
+  int longersize = longer.size(), prefixsize = prefix.size();
+  return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
+}
+
+#else
+
+#include <algorithm>
+#include <cstddef>
+#include <string>
+#include <string.h>
+
+#ifdef WIN32
+#undef max
+#undef min
+#endif
+
+class StringPiece {
+ public:
+  typedef size_t size_type;
+
+ private:
+  const char*   ptr_;
+  size_type     length_;
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+  StringPiece() : ptr_(NULL), length_(0) { }
+  StringPiece(const char* str)
+    : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
+  StringPiece(const std::string& str)
+    : ptr_(str.data()), length_(str.size()) { }
+  StringPiece(const char* offset, size_type len)
+    : ptr_(offset), length_(len) { }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  const char* data() const { return ptr_; }
+  size_type size() const { return length_; }
+  size_type length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  void clear() { ptr_ = NULL; length_ = 0; }
+  void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
+  void set(const char* str) {
+    ptr_ = str;
+    length_ = str ? strlen(str) : 0;
+  }
+  void set(const void* data, size_type len) {
+    ptr_ = reinterpret_cast<const char*>(data);
+    length_ = len;
+  }
+
+  char operator[](size_type i) const { return ptr_[i]; }
+
+  void remove_prefix(size_type n) {
+    ptr_ += n;
+    length_ -= n;
+  }
+
+  void remove_suffix(size_type n) {
+    length_ -= n;
+  }
+
+  int compare(const StringPiece& x) const {
+    int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
+    if (r == 0) {
+      if (length_ < x.length_) r = -1;
+      else if (length_ > x.length_) r = +1;
+    }
+    return r;
+  }
+
+  std::string as_string() const {
+    // std::string doesn't like to take a NULL pointer even with a 0 size.
+    return std::string(!empty() ? data() : "", size());
+  }
+
+  void CopyToString(std::string* target) const;
+  void AppendToString(std::string* target) const;
+
+  // Does "this" start with "x"
+  bool starts_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
+  }
+
+  // Does "this" end with "x"
+  bool ends_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
+  }
+
+  // standard STL container boilerplate
+  typedef char value_type;
+  typedef const char* pointer;
+  typedef const char& reference;
+  typedef const char& const_reference;
+  typedef ptrdiff_t difference_type;
+  static const size_type npos;
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  iterator begin() const { return ptr_; }
+  iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+
+  size_type max_size() const { return length_; }
+  size_type capacity() const { return length_; }
+
+  size_type copy(char* buf, size_type n, size_type pos = 0) const;
+
+  size_type find(const StringPiece& s, size_type pos = 0) const;
+  size_type find(char c, size_type pos = 0) const;
+  size_type rfind(const StringPiece& s, size_type pos = npos) const;
+  size_type rfind(char c, size_type pos = npos) const;
+
+  size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
+  size_type find_first_of(char c, size_type pos = 0) const {
+    return find(c, pos);
+  }
+  size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
+  size_type find_first_not_of(char c, size_type pos = 0) const;
+  size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
+  size_type find_last_of(char c, size_type pos = npos) const {
+    return rfind(c, pos);
+  }
+  size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
+  size_type find_last_not_of(char c, size_type pos = npos) const;
+
+  StringPiece substr(size_type pos, size_type n = npos) const;
+
+  static int wordmemcmp(const char* p, const char* p2, size_type N) {
+    return std::memcmp(p, p2, N);
+  }
+};
+
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  if (x.size() != y.size())
+    return false;
+
+  return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  return !(x == y);
+}
+
+inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
+  return longer.starts_with(prefix);
+}
+
+#endif // HAVE_ICU undefined
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+  const int r = std::memcmp(x.data(), y.data(),
+                                       std::min(x.size(), y.size()));
+  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+  return y < x;
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+  return !(x > y);
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+  return !(x < y);
+}
+
+// allow StringPiece to be logged (needed for unit testing).
+inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+  return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
+}
+
+#ifdef HAVE_ICU
+U_NAMESPACE_END
+using U_NAMESPACE_QUALIFIER StringPiece;
+#endif
+
+#endif  // BASE_STRING_PIECE_H__
diff --git a/utils/stringlib.h b/utils/stringlib.h
index 2fdbfff8..aadec93c 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -16,6 +16,8 @@
 #include <sstream>
 #include <algorithm>
 
+#include "string_piece.hh"
+
 namespace {
 const char c_isspace[]=" \t\n\r\f\v"; // somewhat ridiculous, including characters nobody uses.
 const char common_isspace[]=" \t\n\r"; // even \n\r is borderline, but maybe you pass multiline DOS format text.
@@ -221,6 +223,21 @@ void VisitTokens(char *p,char *const end,F f) {
   }
 }
 
+inline std::vector<StringPiece> TokenizeMultisep(const StringPiece& text, const StringPiece& separator) {
+  std::vector<StringPiece> res;
+  size_t cur = 0;
+  while(cur < text.size()) {
+    const auto n = text.find(separator, cur);
+    if (n == StringPiece::npos) {
+      res.push_back(text.substr(cur));
+      break;
+    }
+    res.push_back(text.substr(cur, n - cur));
+    cur = n + separator.size();
+  }
+  return res;
+}
+
 template <class F>
 void VisitTokens(char *p,F f) {
   VisitTokens(p,p+std::strlen(p),f);
diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc
index f66cdbeb..00b31b4b 100644
--- a/utils/stringlib_test.cc
+++ b/utils/stringlib_test.cc
@@ -14,4 +14,8 @@ int main(int argc, char *argv[]) {
   std::string const& w="verylongword";
   VisitTokens(p,print());
   VisitTokens(w,print());
+  vector<StringPiece> vsp = TokenizeMultisep("field 1 ||| field 2 is a bit longer ||| field 3", " ||| ");
+  for (auto& sp : vsp) {
+    cerr << "'" << sp << "'" << endl;
+  }
 }
diff --git a/utils/sv_test.cc b/utils/sv_test.cc
index c7ac9e54..67df8c57 100644
--- a/utils/sv_test.cc
+++ b/utils/sv_test.cc
@@ -5,6 +5,17 @@
 
 using namespace std;
 
+BOOST_AUTO_TEST_CASE(Dot) {
+  SparseVector<double> x;
+  SparseVector<double> y;
+  x.set_value(1,0.8);
+  y.set_value(1,5);
+  x.set_value(2,-2);
+  y.set_value(2,1);
+  x.set_value(3,80);
+  BOOST_CHECK_CLOSE(x.dot(y), 2.0, 1e-9);
+}
+
 BOOST_AUTO_TEST_CASE(Equality) {
   SparseVector<double> x;
   SparseVector<double> y;
author	Patrick Simianer <p@simianer.de>	2014-06-15 04:26:08 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-15 04:26:08 +0200
commit	7c19e97d2b1216daf95055291c8bbda9b5705dd6 (patch)
tree	5f76f798c59ea9b67050cf9ee854c13e1fdc4539
parent	a39aa79b18347e22ef36ebc0da5a7eb220bcb23f (diff)
parent	3e82edae3cb6a347a3094b82f961082421fe6cb9 (diff)