summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--decoder/tree2string_translator.cc34
-rw-r--r--decoder/tree_fragment.cc10
-rw-r--r--decoder/tree_fragment.h6
-rw-r--r--decoder/trule.h3
-rw-r--r--klm/util/string_piece.cc193
-rw-r--r--klm/util/string_piece.hh270
-rw-r--r--utils/Makefile.am7
-rw-r--r--utils/string_piece.cc193
-rw-r--r--utils/string_piece.hh272
-rw-r--r--utils/stringlib.h17
-rw-r--r--utils/stringlib_test.cc4
-rw-r--r--utils/sv_test.cc11
12 files changed, 538 insertions, 482 deletions
diff --git a/decoder/tree2string_translator.cc b/decoder/tree2string_translator.cc
index b5b47d5d..d61b9aba 100644
--- a/decoder/tree2string_translator.cc
+++ b/decoder/tree2string_translator.cc
@@ -26,14 +26,21 @@ struct Tree2StringGrammarNode {
// use a lexer probably
static void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bool has_multiple_states) {
string line;
+ int lc = 0;
while(getline(*in, line)) {
- size_t pos = line.find("|||");
- assert(pos != string::npos);
- assert(pos > 3);
- unsigned xc = 0;
- while (line[pos - 1] == ' ') { --pos; xc++; }
- cdec::TreeFragment rule_src(line.substr(0, pos), true);
- // TODO transducer_state should (optionally?) be read from input
+ ++lc;
+ std::vector<StringPiece> fields = TokenizeMultisep(line, " ||| ");
+ if (has_multiple_states && fields.size() != 4) {
+ cerr << "Expected 4 fields in rule file but line " << lc << " is:\n" << line << endl;
+ abort();
+ }
+ if (!has_multiple_states && fields.size() != 3) {
+ cerr << "Expected 3 fields in rule file but line " << lc << " is:\n" << line << endl;
+ abort();
+ }
+
+ cdec::TreeFragment rule_src(fields[has_multiple_states ? 1 : 0], true);
+ // TODO transducer_state should be read from input
const unsigned transducer_state = 0;
Tree2StringGrammarNode* cur = &root->next[transducer_state];
ostringstream os;
@@ -59,12 +66,13 @@ static void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bo
else
os << TD::Convert(x);
}
- pos += 3 + xc;
- while(line[pos] == ' ') { ++pos; }
- os << " ||| " << line.substr(pos);
- TRulePtr rule(new TRule(os.str()));
- // TODO the transducer_state you end up in after using this rule (for each NT)
- // needs to be read and encoded somehow in the rule (for use XXX)
+ TRulePtr rule;
+ if (has_multiple_states) {
+ cerr << "Not implemented...\n"; abort(); // TODO read in states
+ } else {
+ os << " ||| " << fields[1] << " ||| " << fields[2];
+ rule.reset(new TRule(os.str()));
+ }
cur->rules.push_back(rule);
//cerr << "RULE: " << rule->AsString() << "\n\n";
}
diff --git a/decoder/tree_fragment.cc b/decoder/tree_fragment.cc
index 696c8601..aad0b2c4 100644
--- a/decoder/tree_fragment.cc
+++ b/decoder/tree_fragment.cc
@@ -8,7 +8,7 @@ using namespace std;
namespace cdec {
-TreeFragment::TreeFragment(const string& tree, bool allow_frontier_sites) {
+TreeFragment::TreeFragment(const StringPiece& tree, bool allow_frontier_sites) {
int bal = 0;
const unsigned len = tree.size();
unsigned cur = 0;
@@ -50,7 +50,7 @@ void TreeFragment::DebugRec(unsigned cur, ostream* out) const {
// cp is the character index in the tree
// np keeps track of the nodes (nonterminals) that have been built
// symp keeps track of the terminal symbols that have been built
-void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) {
+void TreeFragment::ParseRec(const StringPiece& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) {
if (tree[cp] != '(') {
cerr << "Expected ( at " << cp << endl;
abort();
@@ -79,12 +79,12 @@ void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned
// TODO: add a terminal symbol to the current edge
const bool is_terminal = tree[t_start] != '[' || (t_end - t_start < 3 || tree[t_end - 1] != ']');
if (is_terminal) {
- const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start));
+ const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start).as_string());
rhs.push_back(term);
// cerr << "T='" << TD::Convert(term) << "'\n";
++terminals;
} else { // frontier site (NT but no recursion)
- const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2)) | FRONTIER_BIT;
+ const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2).as_string()) | FRONTIER_BIT;
rhs.push_back(nt);
++frontier_sites;
// cerr << "FRONT-NT=[" << TD::Convert(nt & ALL_MASK) << "]\n";
@@ -97,7 +97,7 @@ void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned
} // continuent has completed, cp is at ), build node
const unsigned j = symp; // span from (i,j)
// add an internal non-terminal symbol
- const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start)) | RHS_BIT;
+ const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start).as_string()) | RHS_BIT;
nodes[np] = TreeFragmentProduction(nt, rhs);
//cerr << np << " production(" << i << "," << j << ")= " << TD::Convert(nt & ALL_MASK) << " -->";
//for (auto& x : rhs) {
diff --git a/decoder/tree_fragment.h b/decoder/tree_fragment.h
index 79722b5a..8bb7251a 100644
--- a/decoder/tree_fragment.h
+++ b/decoder/tree_fragment.h
@@ -8,6 +8,8 @@
#include <cassert>
#include <cstddef>
+#include "string_piece.hh"
+
namespace cdec {
class BreadthFirstIterator;
@@ -52,7 +54,7 @@ class TreeFragment {
public:
TreeFragment() : frontier_sites(), terminals() {}
// (S (NP a (X b) c d) (VP (V foo) (NP (NN bar))))
- explicit TreeFragment(const std::string& tree, bool allow_frontier_sites = false);
+ explicit TreeFragment(const StringPiece& tree, bool allow_frontier_sites = false);
void DebugRec(unsigned cur, std::ostream* out) const;
typedef DepthFirstIterator iterator;
typedef ptrdiff_t difference_type;
@@ -73,7 +75,7 @@ class TreeFragment {
// cp is the character index in the tree
// np keeps track of the nodes (nonterminals) that have been built
// symp keeps track of the terminal symbols that have been built
- void ParseRec(const std::string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp);
+ void ParseRec(const StringPiece& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp);
public:
unsigned root;
unsigned char frontier_sites;
diff --git a/decoder/trule.h b/decoder/trule.h
index cc370757..243b0da9 100644
--- a/decoder/trule.h
+++ b/decoder/trule.h
@@ -144,6 +144,9 @@ class TRule {
SparseVector<double> scores_;
char arity_;
+ std::vector<WordID> ext_states_; // in t2s or t2t translation, this is of length arity_ and
+ // indicates what state the transducer is in after having processed
+ // this transduction rule
// these attributes are application-specific and should probably be refactored
TRulePtr parent_rule_; // usually NULL, except when doing constrained decoding
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc
index ec394b96..973091c4 100644
--- a/klm/util/string_piece.cc
+++ b/klm/util/string_piece.cc
@@ -1,193 +1,2 @@
-// Copyright 2004 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in string_piece.hh.
+// this has been moved to utils/ in cdec
-#include "util/string_piece.hh"
-
-#include <algorithm>
-
-#include <limits.h>
-
-#ifndef HAVE_ICU
-
-typedef StringPiece::size_type size_type;
-
-void StringPiece::CopyToString(std::string* target) const {
- target->assign(ptr_, length_);
-}
-
-size_type StringPiece::find(const StringPiece& s, size_type pos) const {
- // Not sure why length_ < 0 was here since it's std::size_t.
- if (/*length_ < 0 || */pos > static_cast<size_type>(length_))
- return npos;
-
- const char* result = std::search(ptr_ + pos, ptr_ + length_,
- s.ptr_, s.ptr_ + s.length_);
- const size_type xpos = result - ptr_;
- return xpos + s.length_ <= length_ ? xpos : npos;
-}
-
-size_type StringPiece::find(char c, size_type pos) const {
- if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
- return npos;
- }
- const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
- return result != ptr_ + length_ ? result - ptr_ : npos;
-}
-
-size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
- if (length_ < s.length_) return npos;
- const size_t ulen = length_;
- if (s.length_ == 0) return std::min(ulen, pos);
-
- const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
- const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
- return result != last ? result - ptr_ : npos;
-}
-
-size_type StringPiece::rfind(char c, size_type pos) const {
- if (length_ <= 0) return npos;
- for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
- i >= 0; --i) {
- if (ptr_[i] == c) {
- return i;
- }
- }
- return npos;
-}
-
-// For each character in characters_wanted, sets the index corresponding
-// to the ASCII code of that character to 1 in table. This is used by
-// the find_.*_of methods below to tell whether or not a character is in
-// the lookup table in constant time.
-// The argument `table' must be an array that is large enough to hold all
-// the possible values of an unsigned char. Thus it should be be declared
-// as follows:
-// bool table[UCHAR_MAX + 1]
-static inline void BuildLookupTable(const StringPiece& characters_wanted,
- bool* table) {
- const size_type length = characters_wanted.length();
- const char* const data = characters_wanted.data();
- for (size_type i = 0; i < length; ++i) {
- table[static_cast<unsigned char>(data[i])] = true;
- }
-}
-
-size_type StringPiece::find_first_of(const StringPiece& s,
- size_type pos) const {
- if (length_ == 0 || s.length_ == 0)
- return npos;
-
- // Avoid the cost of BuildLookupTable() for a single-character search.
- if (s.length_ == 1)
- return find_first_of(s.ptr_[0], pos);
-
- bool lookup[UCHAR_MAX + 1] = { false };
- BuildLookupTable(s, lookup);
- for (size_type i = pos; i < length_; ++i) {
- if (lookup[static_cast<unsigned char>(ptr_[i])]) {
- return i;
- }
- }
- return npos;
-}
-
-size_type StringPiece::find_first_not_of(const StringPiece& s,
- size_type pos) const {
- if (length_ == 0)
- return npos;
-
- if (s.length_ == 0)
- return 0;
-
- // Avoid the cost of BuildLookupTable() for a single-character search.
- if (s.length_ == 1)
- return find_first_not_of(s.ptr_[0], pos);
-
- bool lookup[UCHAR_MAX + 1] = { false };
- BuildLookupTable(s, lookup);
- for (size_type i = pos; i < length_; ++i) {
- if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
- return i;
- }
- }
- return npos;
-}
-
-size_type StringPiece::find_first_not_of(char c, size_type pos) const {
- if (length_ == 0)
- return npos;
-
- for (; pos < length_; ++pos) {
- if (ptr_[pos] != c) {
- return pos;
- }
- }
- return npos;
-}
-
-size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
- if (length_ == 0 || s.length_ == 0)
- return npos;
-
- // Avoid the cost of BuildLookupTable() for a single-character search.
- if (s.length_ == 1)
- return find_last_of(s.ptr_[0], pos);
-
- bool lookup[UCHAR_MAX + 1] = { false };
- BuildLookupTable(s, lookup);
- for (size_type i = std::min(pos, length_ - 1); ; --i) {
- if (lookup[static_cast<unsigned char>(ptr_[i])])
- return i;
- if (i == 0)
- break;
- }
- return npos;
-}
-
-size_type StringPiece::find_last_not_of(const StringPiece& s,
- size_type pos) const {
- if (length_ == 0)
- return npos;
-
- size_type i = std::min(pos, length_ - 1);
- if (s.length_ == 0)
- return i;
-
- // Avoid the cost of BuildLookupTable() for a single-character search.
- if (s.length_ == 1)
- return find_last_not_of(s.ptr_[0], pos);
-
- bool lookup[UCHAR_MAX + 1] = { false };
- BuildLookupTable(s, lookup);
- for (; ; --i) {
- if (!lookup[static_cast<unsigned char>(ptr_[i])])
- return i;
- if (i == 0)
- break;
- }
- return npos;
-}
-
-size_type StringPiece::find_last_not_of(char c, size_type pos) const {
- if (length_ == 0)
- return npos;
-
- for (size_type i = std::min(pos, length_ - 1); ; --i) {
- if (ptr_[i] != c)
- return i;
- if (i == 0)
- break;
- }
- return npos;
-}
-
-StringPiece StringPiece::substr(size_type pos, size_type n) const {
- if (pos > length_) pos = length_;
- if (n > length_ - pos) n = length_ - pos;
- return StringPiece(ptr_ + pos, n);
-}
-
-const size_type StringPiece::npos = size_type(-1);
-
-#endif // !HAVE_ICU
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 84431db1..696ca084 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -1,270 +1,2 @@
-/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
- * you don't use ICU, then this will use the Google implementation from Chrome.
- * This has been modified from the original version to let you choose.
- */
+#include "utils/string_piece.hh"
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// Copied from strings/stringpiece.h with modifications
-//
-// A string-like object that points to a sized piece of memory.
-//
-// Functions or methods may use const StringPiece& parameters to accept either
-// a "const char*" or a "string" value that will be implicitly converted to
-// a StringPiece. The implicit conversion means that it is often appropriate
-// to include this .h file in other files rather than forward-declaring
-// StringPiece as would be appropriate for most other Google classes.
-//
-// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
-// conversions from "const char*" to "string" and back again.
-//
-
-#ifndef BASE_STRING_PIECE_H__
-#define BASE_STRING_PIECE_H__
-
-#include "util/have.hh"
-
-#include <cstring>
-#include <iosfwd>
-#include <ostream>
-
-#ifdef HAVE_ICU
-#include <unicode/stringpiece.h>
-#include <unicode/uversion.h>
-
-// Old versions of ICU don't define operator== and operator!=.
-#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
-#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
-inline bool operator==(const StringPiece& x, const StringPiece& y) {
- if (x.size() != y.size())
- return false;
-
- return std::memcmp(x.data(), y.data(), x.size()) == 0;
-}
-
-inline bool operator!=(const StringPiece& x, const StringPiece& y) {
- return !(x == y);
-}
-#endif // old version of ICU
-
-U_NAMESPACE_BEGIN
-
-inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
- int longersize = longer.size(), prefixsize = prefix.size();
- return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
-}
-
-#else
-
-#include <algorithm>
-#include <cstddef>
-#include <string>
-#include <string.h>
-
-#ifdef WIN32
-#undef max
-#undef min
-#endif
-
-class StringPiece {
- public:
- typedef size_t size_type;
-
- private:
- const char* ptr_;
- size_type length_;
-
- public:
- // We provide non-explicit singleton constructors so users can pass
- // in a "const char*" or a "string" wherever a "StringPiece" is
- // expected.
- StringPiece() : ptr_(NULL), length_(0) { }
- StringPiece(const char* str)
- : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
- StringPiece(const std::string& str)
- : ptr_(str.data()), length_(str.size()) { }
- StringPiece(const char* offset, size_type len)
- : ptr_(offset), length_(len) { }
-
- // data() may return a pointer to a buffer with embedded NULs, and the
- // returned buffer may or may not be null terminated. Therefore it is
- // typically a mistake to pass data() to a routine that expects a NUL
- // terminated string.
- const char* data() const { return ptr_; }
- size_type size() const { return length_; }
- size_type length() const { return length_; }
- bool empty() const { return length_ == 0; }
-
- void clear() { ptr_ = NULL; length_ = 0; }
- void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
- void set(const char* str) {
- ptr_ = str;
- length_ = str ? strlen(str) : 0;
- }
- void set(const void* data, size_type len) {
- ptr_ = reinterpret_cast<const char*>(data);
- length_ = len;
- }
-
- char operator[](size_type i) const { return ptr_[i]; }
-
- void remove_prefix(size_type n) {
- ptr_ += n;
- length_ -= n;
- }
-
- void remove_suffix(size_type n) {
- length_ -= n;
- }
-
- int compare(const StringPiece& x) const {
- int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
- if (r == 0) {
- if (length_ < x.length_) r = -1;
- else if (length_ > x.length_) r = +1;
- }
- return r;
- }
-
- std::string as_string() const {
- // std::string doesn't like to take a NULL pointer even with a 0 size.
- return std::string(!empty() ? data() : "", size());
- }
-
- void CopyToString(std::string* target) const;
- void AppendToString(std::string* target) const;
-
- // Does "this" start with "x"
- bool starts_with(const StringPiece& x) const {
- return ((length_ >= x.length_) &&
- (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
- }
-
- // Does "this" end with "x"
- bool ends_with(const StringPiece& x) const {
- return ((length_ >= x.length_) &&
- (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
- }
-
- // standard STL container boilerplate
- typedef char value_type;
- typedef const char* pointer;
- typedef const char& reference;
- typedef const char& const_reference;
- typedef ptrdiff_t difference_type;
- static const size_type npos;
- typedef const char* const_iterator;
- typedef const char* iterator;
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
- typedef std::reverse_iterator<iterator> reverse_iterator;
- iterator begin() const { return ptr_; }
- iterator end() const { return ptr_ + length_; }
- const_reverse_iterator rbegin() const {
- return const_reverse_iterator(ptr_ + length_);
- }
- const_reverse_iterator rend() const {
- return const_reverse_iterator(ptr_);
- }
-
- size_type max_size() const { return length_; }
- size_type capacity() const { return length_; }
-
- size_type copy(char* buf, size_type n, size_type pos = 0) const;
-
- size_type find(const StringPiece& s, size_type pos = 0) const;
- size_type find(char c, size_type pos = 0) const;
- size_type rfind(const StringPiece& s, size_type pos = npos) const;
- size_type rfind(char c, size_type pos = npos) const;
-
- size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
- size_type find_first_of(char c, size_type pos = 0) const {
- return find(c, pos);
- }
- size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
- size_type find_first_not_of(char c, size_type pos = 0) const;
- size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
- size_type find_last_of(char c, size_type pos = npos) const {
- return rfind(c, pos);
- }
- size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
- size_type find_last_not_of(char c, size_type pos = npos) const;
-
- StringPiece substr(size_type pos, size_type n = npos) const;
-
- static int wordmemcmp(const char* p, const char* p2, size_type N) {
- return std::memcmp(p, p2, N);
- }
-};
-
-inline bool operator==(const StringPiece& x, const StringPiece& y) {
- if (x.size() != y.size())
- return false;
-
- return std::memcmp(x.data(), y.data(), x.size()) == 0;
-}
-
-inline bool operator!=(const StringPiece& x, const StringPiece& y) {
- return !(x == y);
-}
-
-inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
- return longer.starts_with(prefix);
-}
-
-#endif // HAVE_ICU undefined
-
-inline bool operator<(const StringPiece& x, const StringPiece& y) {
- const int r = std::memcmp(x.data(), y.data(),
- std::min(x.size(), y.size()));
- return ((r < 0) || ((r == 0) && (x.size() < y.size())));
-}
-
-inline bool operator>(const StringPiece& x, const StringPiece& y) {
- return y < x;
-}
-
-inline bool operator<=(const StringPiece& x, const StringPiece& y) {
- return !(x > y);
-}
-
-inline bool operator>=(const StringPiece& x, const StringPiece& y) {
- return !(x < y);
-}
-
-// allow StringPiece to be logged (needed for unit testing).
-inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
- return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
-}
-
-#ifdef HAVE_ICU
-U_NAMESPACE_END
-using U_NAMESPACE_QUALIFIER StringPiece;
-#endif
-
-#endif // BASE_STRING_PIECE_H__
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 341fd80b..18495c3a 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -8,9 +8,10 @@ noinst_PROGRAMS = \
weights_test \
logval_test \
small_vector_test \
+ stringlib_test \
sv_test
-TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test
+TESTS = ts small_vector_test logval_test weights_test dict_test m_test sv_test stringlib_test
noinst_LIBRARIES = libutils.a
@@ -54,6 +55,7 @@ libutils_a_SOURCES = \
star.h \
static_utoa.h \
stringlib.h \
+ string_piece.hh \
tdict.h \
timing_stats.h \
utoa.h \
@@ -76,6 +78,7 @@ libutils_a_SOURCES = \
gzstream.cc \
filelib.cc \
stringlib.cc \
+ string_piece.cc \
sparse_vector.cc \
timing_stats.cc \
verbose.cc \
@@ -106,6 +109,8 @@ small_vector_test_SOURCES = small_vector_test.cc
small_vector_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
sv_test_SOURCES = sv_test.cc
sv_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
+stringlib_test_SOURCES = stringlib_test.cc
+stringlib_test_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
################################################################
# do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
diff --git a/utils/string_piece.cc b/utils/string_piece.cc
new file mode 100644
index 00000000..0685c05f
--- /dev/null
+++ b/utils/string_piece.cc
@@ -0,0 +1,193 @@
+// Copyright 2004 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in string_piece.hh.
+
+#include "string_piece.hh"
+
+#include <algorithm>
+
+#include <limits.h>
+
+#ifndef HAVE_ICU
+
+typedef StringPiece::size_type size_type;
+
+void StringPiece::CopyToString(std::string* target) const {
+ target->assign(ptr_, length_);
+}
+
+size_type StringPiece::find(const StringPiece& s, size_type pos) const {
+ // Not sure why length_ < 0 was here since it's std::size_t.
+ if (/*length_ < 0 || */pos > static_cast<size_type>(length_))
+ return npos;
+
+ const char* result = std::search(ptr_ + pos, ptr_ + length_,
+ s.ptr_, s.ptr_ + s.length_);
+ const size_type xpos = result - ptr_;
+ return xpos + s.length_ <= length_ ? xpos : npos;
+}
+
+size_type StringPiece::find(char c, size_type pos) const {
+ if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
+ return npos;
+ }
+ const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
+ return result != ptr_ + length_ ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(const StringPiece& s, size_type pos) const {
+ if (length_ < s.length_) return npos;
+ const size_t ulen = length_;
+ if (s.length_ == 0) return std::min(ulen, pos);
+
+ const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_;
+ const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
+ return result != last ? result - ptr_ : npos;
+}
+
+size_type StringPiece::rfind(char c, size_type pos) const {
+ if (length_ <= 0) return npos;
+ for (int i = std::min(pos, static_cast<size_type>(length_ - 1));
+ i >= 0; --i) {
+ if (ptr_[i] == c) {
+ return i;
+ }
+ }
+ return npos;
+}
+
+// For each character in characters_wanted, sets the index corresponding
+// to the ASCII code of that character to 1 in table. This is used by
+// the find_.*_of methods below to tell whether or not a character is in
+// the lookup table in constant time.
+// The argument `table' must be an array that is large enough to hold all
+// the possible values of an unsigned char. Thus it should be be declared
+// as follows:
+// bool table[UCHAR_MAX + 1]
+static inline void BuildLookupTable(const StringPiece& characters_wanted,
+ bool* table) {
+ const size_type length = characters_wanted.length();
+ const char* const data = characters_wanted.data();
+ for (size_type i = 0; i < length; ++i) {
+ table[static_cast<unsigned char>(data[i])] = true;
+ }
+}
+
+size_type StringPiece::find_first_of(const StringPiece& s,
+ size_type pos) const {
+ if (length_ == 0 || s.length_ == 0)
+ return npos;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_first_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_type i = pos; i < length_; ++i) {
+ if (lookup[static_cast<unsigned char>(ptr_[i])]) {
+ return i;
+ }
+ }
+ return npos;
+}
+
+size_type StringPiece::find_first_not_of(const StringPiece& s,
+ size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ if (s.length_ == 0)
+ return 0;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_first_not_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_type i = pos; i < length_; ++i) {
+ if (!lookup[static_cast<unsigned char>(ptr_[i])]) {
+ return i;
+ }
+ }
+ return npos;
+}
+
+size_type StringPiece::find_first_not_of(char c, size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ for (; pos < length_; ++pos) {
+ if (ptr_[pos] != c) {
+ return pos;
+ }
+ }
+ return npos;
+}
+
+size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const {
+ if (length_ == 0 || s.length_ == 0)
+ return npos;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_last_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (size_type i = std::min(pos, length_ - 1); ; --i) {
+ if (lookup[static_cast<unsigned char>(ptr_[i])])
+ return i;
+ if (i == 0)
+ break;
+ }
+ return npos;
+}
+
+size_type StringPiece::find_last_not_of(const StringPiece& s,
+ size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ size_type i = std::min(pos, length_ - 1);
+ if (s.length_ == 0)
+ return i;
+
+ // Avoid the cost of BuildLookupTable() for a single-character search.
+ if (s.length_ == 1)
+ return find_last_not_of(s.ptr_[0], pos);
+
+ bool lookup[UCHAR_MAX + 1] = { false };
+ BuildLookupTable(s, lookup);
+ for (; ; --i) {
+ if (!lookup[static_cast<unsigned char>(ptr_[i])])
+ return i;
+ if (i == 0)
+ break;
+ }
+ return npos;
+}
+
+size_type StringPiece::find_last_not_of(char c, size_type pos) const {
+ if (length_ == 0)
+ return npos;
+
+ for (size_type i = std::min(pos, length_ - 1); ; --i) {
+ if (ptr_[i] != c)
+ return i;
+ if (i == 0)
+ break;
+ }
+ return npos;
+}
+
+StringPiece StringPiece::substr(size_type pos, size_type n) const {
+ if (pos > length_) pos = length_;
+ if (n > length_ - pos) n = length_ - pos;
+ return StringPiece(ptr_ + pos, n);
+}
+
+const size_type StringPiece::npos = size_type(-1);
+
+#endif // !HAVE_ICU
diff --git a/utils/string_piece.hh b/utils/string_piece.hh
new file mode 100644
index 00000000..9f21336c
--- /dev/null
+++ b/utils/string_piece.hh
@@ -0,0 +1,272 @@
+/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
+ * you don't use ICU, then this will use the Google implementation from Chrome.
+ * This has been modified from the original version to let you choose.
+ */
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Copied from strings/stringpiece.h with modifications
+//
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece. The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+
+#ifndef BASE_STRING_PIECE_H__
+#define BASE_STRING_PIECE_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <cstring>
+#include <iosfwd>
+#include <ostream>
+
+#ifdef HAVE_ICU
+#include <unicode/stringpiece.h>
+#include <unicode/uversion.h>
+
+// Old versions of ICU don't define operator== and operator!=.
+#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
+#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+ if (x.size() != y.size())
+ return false;
+
+ return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+ return !(x == y);
+}
+#endif // old version of ICU
+
+U_NAMESPACE_BEGIN
+
+inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
+ int longersize = longer.size(), prefixsize = prefix.size();
+ return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0;
+}
+
+#else
+
+#include <algorithm>
+#include <cstddef>
+#include <string>
+#include <string.h>
+
+#ifdef WIN32
+#undef max
+#undef min
+#endif
+
+class StringPiece {
+ public:
+ typedef size_t size_type;
+
+ private:
+ const char* ptr_;
+ size_type length_;
+
+ public:
+ // We provide non-explicit singleton constructors so users can pass
+ // in a "const char*" or a "string" wherever a "StringPiece" is
+ // expected.
+ StringPiece() : ptr_(NULL), length_(0) { }
+ StringPiece(const char* str)
+ : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
+ StringPiece(const std::string& str)
+ : ptr_(str.data()), length_(str.size()) { }
+ StringPiece(const char* offset, size_type len)
+ : ptr_(offset), length_(len) { }
+
+ // data() may return a pointer to a buffer with embedded NULs, and the
+ // returned buffer may or may not be null terminated. Therefore it is
+ // typically a mistake to pass data() to a routine that expects a NUL
+ // terminated string.
+ const char* data() const { return ptr_; }
+ size_type size() const { return length_; }
+ size_type length() const { return length_; }
+ bool empty() const { return length_ == 0; }
+
+ void clear() { ptr_ = NULL; length_ = 0; }
+ void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
+ void set(const char* str) {
+ ptr_ = str;
+ length_ = str ? strlen(str) : 0;
+ }
+ void set(const void* data, size_type len) {
+ ptr_ = reinterpret_cast<const char*>(data);
+ length_ = len;
+ }
+
+ char operator[](size_type i) const { return ptr_[i]; }
+
+ void remove_prefix(size_type n) {
+ ptr_ += n;
+ length_ -= n;
+ }
+
+ void remove_suffix(size_type n) {
+ length_ -= n;
+ }
+
+ int compare(const StringPiece& x) const {
+ int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
+ if (r == 0) {
+ if (length_ < x.length_) r = -1;
+ else if (length_ > x.length_) r = +1;
+ }
+ return r;
+ }
+
+ std::string as_string() const {
+ // std::string doesn't like to take a NULL pointer even with a 0 size.
+ return std::string(!empty() ? data() : "", size());
+ }
+
+ void CopyToString(std::string* target) const;
+ void AppendToString(std::string* target) const;
+
+ // Does "this" start with "x"
+ bool starts_with(const StringPiece& x) const {
+ return ((length_ >= x.length_) &&
+ (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
+ }
+
+ // Does "this" end with "x"
+ bool ends_with(const StringPiece& x) const {
+ return ((length_ >= x.length_) &&
+ (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
+ }
+
+ // standard STL container boilerplate
+ typedef char value_type;
+ typedef const char* pointer;
+ typedef const char& reference;
+ typedef const char& const_reference;
+ typedef ptrdiff_t difference_type;
+ static const size_type npos;
+ typedef const char* const_iterator;
+ typedef const char* iterator;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+ iterator begin() const { return ptr_; }
+ iterator end() const { return ptr_ + length_; }
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(ptr_ + length_);
+ }
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(ptr_);
+ }
+
+ size_type max_size() const { return length_; }
+ size_type capacity() const { return length_; }
+
+ size_type copy(char* buf, size_type n, size_type pos = 0) const;
+
+ size_type find(const StringPiece& s, size_type pos = 0) const;
+ size_type find(char c, size_type pos = 0) const;
+ size_type rfind(const StringPiece& s, size_type pos = npos) const;
+ size_type rfind(char c, size_type pos = npos) const;
+
+ size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
+ size_type find_first_of(char c, size_type pos = 0) const {
+ return find(c, pos);
+ }
+ size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
+ size_type find_first_not_of(char c, size_type pos = 0) const;
+ size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
+ size_type find_last_of(char c, size_type pos = npos) const {
+ return rfind(c, pos);
+ }
+ size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
+ size_type find_last_not_of(char c, size_type pos = npos) const;
+
+ StringPiece substr(size_type pos, size_type n = npos) const;
+
+ static int wordmemcmp(const char* p, const char* p2, size_type N) {
+ return std::memcmp(p, p2, N);
+ }
+};
+
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+ if (x.size() != y.size())
+ return false;
+
+ return std::memcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+ return !(x == y);
+}
+
+inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) {
+ return longer.starts_with(prefix);
+}
+
+#endif // HAVE_ICU undefined
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+ const int r = std::memcmp(x.data(), y.data(),
+ std::min(x.size(), y.size()));
+ return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+ return y < x;
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+ return !(x > y);
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+ return !(x < y);
+}
+
+// allow StringPiece to be logged (needed for unit testing).
+inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+ return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
+}
+
+#ifdef HAVE_ICU
+U_NAMESPACE_END
+using U_NAMESPACE_QUALIFIER StringPiece;
+#endif
+
+#endif // BASE_STRING_PIECE_H__
diff --git a/utils/stringlib.h b/utils/stringlib.h
index 2fdbfff8..aadec93c 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -16,6 +16,8 @@
#include <sstream>
#include <algorithm>
+#include "string_piece.hh"
+
namespace {
const char c_isspace[]=" \t\n\r\f\v"; // somewhat ridiculous, including characters nobody uses.
const char common_isspace[]=" \t\n\r"; // even \n\r is borderline, but maybe you pass multiline DOS format text.
@@ -221,6 +223,21 @@ void VisitTokens(char *p,char *const end,F f) {
}
}
+inline std::vector<StringPiece> TokenizeMultisep(const StringPiece& text, const StringPiece& separator) {
+ std::vector<StringPiece> res;
+ size_t cur = 0;
+ while(cur < text.size()) {
+ const auto n = text.find(separator, cur);
+ if (n == StringPiece::npos) {
+ res.push_back(text.substr(cur));
+ break;
+ }
+ res.push_back(text.substr(cur, n - cur));
+ cur = n + separator.size();
+ }
+ return res;
+}
+
template <class F>
void VisitTokens(char *p,F f) {
VisitTokens(p,p+std::strlen(p),f);
diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc
index f66cdbeb..00b31b4b 100644
--- a/utils/stringlib_test.cc
+++ b/utils/stringlib_test.cc
@@ -14,4 +14,8 @@ int main(int argc, char *argv[]) {
std::string const& w="verylongword";
VisitTokens(p,print());
VisitTokens(w,print());
+ vector<StringPiece> vsp = TokenizeMultisep("field 1 ||| field 2 is a bit longer ||| field 3", " ||| ");
+ for (auto& sp : vsp) {
+ cerr << "'" << sp << "'" << endl;
+ }
}
diff --git a/utils/sv_test.cc b/utils/sv_test.cc
index c7ac9e54..67df8c57 100644
--- a/utils/sv_test.cc
+++ b/utils/sv_test.cc
@@ -5,6 +5,17 @@
using namespace std;
+BOOST_AUTO_TEST_CASE(Dot) {
+ SparseVector<double> x;
+ SparseVector<double> y;
+ x.set_value(1,0.8);
+ y.set_value(1,5);
+ x.set_value(2,-2);
+ y.set_value(2,1);
+ x.set_value(3,80);
+ BOOST_CHECK_CLOSE(x.dot(y), 2.0, 1e-9);
+}
+
BOOST_AUTO_TEST_CASE(Equality) {
SparseVector<double> x;
SparseVector<double> y;