utils/stringlib.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

#ifndef CDEC_STRINGLIB_H_
#define CDEC_STRINGLIB_H_

#ifdef STRINGLIB_DEBUG
#include <iostream>
#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0)
#else
#define SLIBDBG(x)
#endif

#include <map>
#include <vector>
#include <cctype>
#include <cstring>
#include <string>
#include <sstream>
#include <algorithm>

#include "string_piece.hh"

namespace {
const char c_isspace[]=" \t\n\r\f\v"; // somewhat ridiculous, including characters nobody uses.
const char common_isspace[]=" \t\n\r"; // even \n\r is borderline, but maybe you pass multiline DOS format text.
}

inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=common_isspace) {
  return s.find_first_not_of(ws,starting);
}

// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends
inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=common_isspace) {
  std::size_t n=s.find_last_not_of(ws,ending);
  if (n==std::string::npos) return n;
  else return n+1;
}


inline bool is_single_line(std::string const& line) {
  return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks
}

// is_single_line(strip_ws(line))
inline bool is_single_line_stripped(std::string const& line) {
  std::size_t b=skip_ws(line),e=trailing_ws(line);
  std::size_t n=line.find('\n',b);
  return n==std::string::npos || n>=e;
}

struct toupperc {
  inline char operator()(char c) const {
    return std::toupper(c);
  }
};

inline std::string toupper(std::string s) {
  std::transform(s.begin(),s.end(),s.begin(),toupperc());
  return s;
}

template <class Istr, class Isubstr> inline
bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub)
{
  while (bsub != esub) {
    if (bstr == estr)
      return false;
    if (*bsub++ != *bstr++)
      return false;
  }
  return true;
}

template <class Istr, class Prefix> inline
bool match_begin(Istr bstr,Istr estr,Prefix prefix)
{
  return match_begin(bstr,estr,prefix.begin(),prefix.end());
}

template <class Str, class Prefix> inline
bool match_begin(Str const& str,Prefix const& prefix)
{
  return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end());
}


// read line in the form of either:
//   source
//   source ||| target
// source will be returned as a string, target must be a sentence or
// a lattice (in PLF format) and will be returned as a Lattice object
void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
class Lattice;
void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);

inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
  std::string res = str;
  res.erase(str.find_last_not_of(dropChars)+1);
  return res.erase(0, res.find_first_not_of(dropChars));
}

inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) {
  std::string s = str;
  unsigned last = 0;
  res->clear();
  for (unsigned i=0; i < s.size(); ++i)
    if (s[i] == delimiter) {
      s[i]=0;
      if (last != i) {
        res->push_back(&s[last]);
      }
      last = i + 1;
    }
  if (last != s.size())
    res->push_back(&s[last]);
}

inline unsigned NTokens(const std::string& str, char delimiter)
{
  std::vector<std::string> r;
  Tokenize(str,delimiter,&r);
  return r.size();
}

inline std::string LowercaseString(const std::string& in) {
  std::string res(in.size(),' ');
  for (unsigned i = 0; i < in.size(); ++i)
    res[i] = tolower(in[i]);
  return res;
}

inline std::string UppercaseString(const std::string& in) {
  std::string res(in.size(),' ');
  for (unsigned i = 0; i < in.size(); ++i)
    res[i] = toupper(in[i]);
  return res;
}

inline int CountSubstrings(const std::string& str, const std::string& sub) {
  size_t p = 0;
  int res = 0;
  while (p < str.size()) {
    p = str.find(sub, p);
    if (p == std::string::npos) break;
    ++res;
    p += sub.size();
  }
  return res;
}

inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) {
  out->clear();
  unsigned i = 0;
  unsigned start = 0;
  std::string cur;
  while(i < in.size()) {
    if (in[i] == ' ' || in[i] == '\t') {
      if (i - start > 0)
        out->push_back(in.substr(start, i - start));
      start = i + 1;
    }
    ++i;
  }
  if (i > start)
    out->push_back(in.substr(start, i - start));
  return out->size();
}

inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
{
  std::vector<std::string> r;
  SplitOnWhitespace(in,&r);
  return r;
}


struct mutable_c_str {
  // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading)
  char *p;
  mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) {
    std::memcpy(p,s.data(),s.size());
    p[s.size()]=0;
  }
  ~mutable_c_str() { ::operator delete(p); }
private:
  mutable_c_str(mutable_c_str const&);
};

// ' ' '\t' tokens hardcoded
//NOTE: you should have stripped endline chars out first.
inline bool IsWordSep(char c) {
  return c==' '||c=='\t';
}


template <class F>
// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens
void VisitTokens(char *p,char *const end,F f) {
  SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p);
  if (p==end) return;
  char *last; // 0 terminated already.  this is ok to mutilate because s is a copy of the string passed in.  well, barring copy on write i guess.
  while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
  last=p; // first non-ws char
  for(;;) {
    SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p);
    // last==p, pointing at first non-ws char not yet translated into f(word) call
    for(;;) {// p to end of word
      ++p;
      if (p==end) {
        f(last);
        SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p);
        return;
      }
      if (IsWordSep(*p)) break;
    }
    *p=0;
    f(last);
    SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p);
    for(;;) { // again skip extra whitespace
      ++p;
      if (p==end) return;
      if (!IsWordSep(*p)) break;
    }
    last=p;
  }
}

inline std::vector<StringPiece> TokenizeMultisep(const StringPiece& text, const StringPiece& separator) {
  std::vector<StringPiece> res;
  size_t cur = 0;
  while(cur < text.size()) {
    const auto n = text.find(separator, cur);
    if (n == StringPiece::npos) {
      res.push_back(text.substr(cur));
      break;
    }
    res.push_back(text.substr(cur, n - cur));
    cur = n + separator.size();
  }
  return res;
}

template <class F>
void VisitTokens(char *p,F f) {
  VisitTokens(p,p+std::strlen(p),f);
}


template <class F>
void VisitTokens(std::string const& s,F f) {
  if (0) {
  std::vector<std::string> ss=SplitOnWhitespace(s);
  for (unsigned i=0;i<ss.size();++i)
    f(ss[i]);
  return;
  }
  //FIXME:
  if (s.empty()) return;
  mutable_c_str mp(s);
  SLIBDBG("mp="<<mp.p);
  VisitTokens(mp.p,mp.p+s.size(),f);
}

template <class F>
void VisitTokens(std::string const& s,F f, unsigned start) {
  if (0) {
  std::vector<std::string> ss=SplitOnWhitespace(s);
  for (unsigned i=0;i<ss.size();++i)
    f(ss[i]);
  return;
  }
  //FIXME:
  if (s.empty()) return;
  mutable_c_str mp(s);
  SLIBDBG("mp="<<mp.p);
  VisitTokens(mp.p+start,mp.p+s.size(),f);
}

inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
  cmd->clear();
  param->clear();
  std::vector<std::string> x;
  SplitOnWhitespace(in, &x);
  if (x.size() == 0) return;
  *cmd = x[0];
  for (unsigned i = 1; i < x.size(); ++i) {
    if (i > 1) { *param += " "; }
    *param += x[i];
  }
}

void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
std::string SGMLOpenSegTag(const std::map<std::string, std::string>& attr);

// given the first character of a UTF8 block, find out how wide it is
// see http://en.wikipedia.org/wiki/UTF-8 for more info
inline unsigned int UTF8Len(unsigned char x) {
  if (x < 0x80) return 1;
  else if ((x >> 5) == 0x06) return 2;
  else if ((x >> 4) == 0x0e) return 3;
  else if ((x >> 3) == 0x1e) return 4;
  else if ((x >> 2) == 0x3e) return 5;
  else if ((x >> 1) == 0x7e) return 6;
  else return 0;
}

inline unsigned int UTF8StringLen(const std::string& x) {
  unsigned pos = 0;
  int len = 0;
  while(pos < x.size()) {
    ++len;
    pos += UTF8Len(x[pos]);
  }
  return len;
}

std::string md5(const std::string& in);

#endif