diff options
author | Kenneth Heafield <github@kheafield.com> | 2011-10-24 18:17:24 +0100 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2011-10-24 18:17:24 +0100 |
commit | 3106cf8eca76df8b46d139b8f5ce5002200d660d (patch) | |
tree | 637930011a082de54bf21a97078cb67483ea248c /klm/util/tokenize_piece.hh | |
parent | b2171f53c6c597ac4326f63250269aa13df84718 (diff) |
KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative.
Diffstat (limited to 'klm/util/tokenize_piece.hh')
-rw-r--r-- | klm/util/tokenize_piece.hh | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index ee1c7ab2..413bda0b 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -5,6 +5,9 @@ #include <boost/iterator/iterator_facade.hpp> +#include <algorithm> +#include <iostream> + /* Usage: * * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { @@ -64,6 +67,78 @@ template <char d> class PieceIterator : public boost::iterator_facade<PieceItera StringPiece after_; }; +class MultiCharacter { + public: + explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size()); + } + + private: + StringPiece delimiter_; +}; + +class AnyCharacter { + public: + explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); + } + + private: + StringPiece chars_; +}; + +template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> { + public: + TokenIter() {} + + TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + increment(); + } + + bool operator!() const { + return current_.data() == 0; + } + operator bool() const { + return current_.data() != 0; + } + + static TokenIter<Find> end() { + return TokenIter<Find>(); + } + + private: + friend class boost::iterator_core_access; + + void increment() { + do { + StringPiece found(finder_.Find(after_)); + current_ = StringPiece(after_.data(), found.data() - after_.data()); + if (found.data() == after_.data() + after_.size()) { + after_ = StringPiece(NULL, 0); + } else { + after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); + } + } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. + } + + bool equal(const TokenIter<Find> &other) const { + return after_.data() == other.after_.data(); + } + + const StringPiece &dereference() const { + return current_; + } + + StringPiece current_; + StringPiece after_; + + Find finder_; +}; + } // namespace util #endif // UTIL_TOKENIZE_PIECE__ |