summaryrefslogtreecommitdiff
path: root/klm/util/tokenize_piece.hh
blob: a588c3fca9acb68acf874d9fc7abb03745087798 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#ifndef UTIL_TOKENIZE_PIECE__
#define UTIL_TOKENIZE_PIECE__

#include "util/exception.hh"
#include "util/string_piece.hh"

#include <boost/iterator/iterator_facade.hpp>

#include <algorithm>
#include <iostream>

namespace util {

// Thrown on dereference when out of tokens to parse
class OutOfTokens : public Exception {
  public:
    OutOfTokens() throw() {}
    ~OutOfTokens() throw() {}
};

class SingleCharacter {
  public:
    SingleCharacter() {}
    explicit SingleCharacter(char delim) : delim_(delim) {}

    StringPiece Find(const StringPiece &in) const {
      return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1);
    }

  private:
    char delim_;
};

class MultiCharacter {
  public:
    MultiCharacter() {}

    explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}

    StringPiece Find(const StringPiece &in) const {
      return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size());
    }

  private:
    StringPiece delimiter_;
};

class AnyCharacter {
  public:
    AnyCharacter() {}
    explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}

    StringPiece Find(const StringPiece &in) const {
      return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
    }

  private:
    StringPiece chars_;
};

class AnyCharacterLast {
  public:
    AnyCharacterLast() {}

    explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}

    StringPiece Find(const StringPiece &in) const {
      return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
    }

  private:
    StringPiece chars_;
};

template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
  public:
    TokenIter() {}

    template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) {
      increment();
    }

    bool operator!() const {
      return current_.data() == 0;
    }
    operator bool() const {
      return current_.data() != 0;
    }

    static TokenIter<Find, SkipEmpty> end() {
      return TokenIter<Find, SkipEmpty>();
    }

  private:
    friend class boost::iterator_core_access;

    void increment() {
      do {
        StringPiece found(finder_.Find(after_));
        current_ = StringPiece(after_.data(), found.data() - after_.data());
        if (found.data() == after_.data() + after_.size()) {
          after_ = StringPiece(NULL, 0);
        } else {
          after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size());
        }
      } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.  
    }

    bool equal(const TokenIter<Find, SkipEmpty> &other) const {
      return current_.data() == other.current_.data();
    }

    const StringPiece &dereference() const {
      UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens");
      return current_;
    }

    StringPiece current_;
    StringPiece after_;

    Find finder_;
};

} // namespace util

#endif // UTIL_TOKENIZE_PIECE__