From f111672dd611f78656fceb3df3729a290453ef56 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 21 Sep 2011 18:23:50 -0400 Subject: Updated kenlm. Includes left state support but not the cdec-side use of it. Updated binary format. --- klm/util/tokenize_piece.hh | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 klm/util/tokenize_piece.hh (limited to 'klm/util/tokenize_piece.hh') diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh new file mode 100644 index 00000000..ee1c7ab2 --- /dev/null +++ b/klm/util/tokenize_piece.hh @@ -0,0 +1,69 @@ +#ifndef UTIL_TOKENIZE_PIECE__ +#define UTIL_TOKENIZE_PIECE__ + +#include "util/string_piece.hh" + +#include + +/* Usage: + * + * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { + * std::cout << *i << "\n"; + * } + * + */ + +namespace util { + +// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece. +template class PieceIterator : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { + public: + // Default construct is end, which is also accessed by kEndPieceIterator; + PieceIterator() {} + + explicit PieceIterator(const StringPiece &str) + : after_(str) { + increment(); + } + + bool operator!() const { + return after_.data() == 0; + } + operator bool() const { + return after_.data() != 0; + } + + static PieceIterator end() { + return PieceIterator(); + } + + private: + friend class boost::iterator_core_access; + + void increment() { + const char *start = after_.data(); + for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {} + if (start == after_.data() + after_.size()) { + // End condition. + after_.clear(); + return; + } + const char *finish = start; + for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {} + current_ = StringPiece(start, finish - start); + after_ = StringPiece(finish, after_.data() + after_.size() - finish); + } + + bool equal(const PieceIterator &other) const { + return after_.data() == other.after_.data(); + } + + const StringPiece &dereference() const { return current_; } + + StringPiece current_; + StringPiece after_; +}; + +} // namespace util + +#endif // UTIL_TOKENIZE_PIECE__ -- cgit v1.2.3