diff options
Diffstat (limited to 'utils/stringlib.h')
-rw-r--r-- | utils/stringlib.h | 267 |
1 files changed, 267 insertions, 0 deletions
diff --git a/utils/stringlib.h b/utils/stringlib.h new file mode 100644 index 00000000..84e95d44 --- /dev/null +++ b/utils/stringlib.h @@ -0,0 +1,267 @@ +#ifndef CDEC_STRINGLIB_H_ +#define CDEC_STRINGLIB_H_ + +//usage: string s=MAKESTRE(1<<" "<<c); +#define MAKESTR(expr) ((dynamic_cast<ostringstream &>(ostringstream()<<std::dec<<expr)).str()) +// std::dec (or seekp, or another manip) is needed to convert to std::ostream reference. + +#ifdef STRINGLIB_DEBUG +#include <iostream> +#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0) +#else +#define SLIBDBG(x) +#endif + +#include <map> +#include <vector> +#include <cctype> +#include <cstring> +#include <string> +#include <sstream> +#include <algorithm> + +inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { + return s.find_first_not_of(ws,starting); +} + +// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends +inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { + std::size_t n=s.find_last_not_of(ws,ending); + if (n==std::string::npos) return n; + else return n+1; +} + +//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. won't access any memory because begin==end) +inline std::string strip_ws(std::string const& s) { + return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); +} + + +inline bool is_single_line(std::string const& line) { + return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks +} + +// is_single_line(strip_ws(line)) +inline bool is_single_line_stripped(std::string const& line) { + std::size_t b=skip_ws(line),e=trailing_ws(line); + std::size_t n=line.find('\n',b); + return n==std::string::npos || n>=e; +} + +struct toupperc { + inline char operator()(char c) const { + return std::toupper(c); + } +}; + +inline std::string toupper(std::string s) { + std::transform(s.begin(),s.end(),s.begin(),toupperc()); + return s; +} + +template <class Istr, class Isubstr> inline +bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub) +{ + while (bsub != esub) { + if (bstr == estr) + return false; + if (*bsub++ != *bstr++) + return false; + } + return true; +} + +template <class Istr, class Prefix> inline +bool match_begin(Istr bstr,Istr estr,Prefix prefix) +{ + return match_begin(bstr,estr,prefix.begin(),prefix.end()); +} + +template <class Str, class Prefix> inline +bool match_begin(Str const& str,Prefix const& prefix) +{ + return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end()); +} + + +// read line in the form of either: +// source +// source ||| target +// source will be returned as a string, target must be a sentence or +// a lattice (in PLF format) and will be returned as a Lattice object +void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); +struct Lattice; +void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); + +inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") { + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) { + std::string s = str; + int last = 0; + res->clear(); + for (int i=0; i < s.size(); ++i) + if (s[i] == delimiter) { + s[i]=0; + if (last != i) { + res->push_back(&s[last]); + } + last = i + 1; + } + if (last != s.size()) + res->push_back(&s[last]); +} + +inline unsigned NTokens(const std::string& str, char delimiter) +{ + std::vector<std::string> r; + Tokenize(str,delimiter,&r); + return r.size(); +} + +inline std::string LowercaseString(const std::string& in) { + std::string res(in.size(),' '); + for (int i = 0; i < in.size(); ++i) + res[i] = tolower(in[i]); + return res; +} + +inline int CountSubstrings(const std::string& str, const std::string& sub) { + size_t p = 0; + int res = 0; + while (p < str.size()) { + p = str.find(sub, p); + if (p == std::string::npos) break; + ++res; + p += sub.size(); + } + return res; +} + +inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) { + out->clear(); + int i = 0; + int start = 0; + std::string cur; + while(i < in.size()) { + if (in[i] == ' ' || in[i] == '\t') { + if (i - start > 0) + out->push_back(in.substr(start, i - start)); + start = i + 1; + } + ++i; + } + if (i > start) + out->push_back(in.substr(start, i - start)); + return out->size(); +} + +inline std::vector<std::string> SplitOnWhitespace(std::string const& in) +{ + std::vector<std::string> r; + SplitOnWhitespace(in,&r); + return r; +} + + +struct mutable_c_str { + // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading) + char *p; + mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) { + std::memcpy(p,s.data(),s.size()); + p[s.size()]=0; + } + ~mutable_c_str() { ::operator delete(p); } +private: + mutable_c_str(mutable_c_str const&); +}; + +// ' ' '\t' tokens hardcoded +//NOTE: you should have stripped endline chars out first. +inline bool IsWordSep(char c) { + return c==' '||c=='\t'; +} + + +template <class F> +// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens +void VisitTokens(char *p,char *const end,F f) { + SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p); + if (p==end) return; + char *last; // 0 terminated already. this is ok to mutilate because s is a copy of the string passed in. well, barring copy on write i guess. + while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace + last=p; // first non-ws char + for(;;) { + SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p); + // last==p, pointing at first non-ws char not yet translated into f(word) call + for(;;) {// p to end of word + ++p; + if (p==end) { + f(last); + SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p); + return; + } + if (IsWordSep(*p)) break; + } + *p=0; + f(last); + SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p); + for(;;) { // again skip extra whitespace + ++p; + if (p==end) return; + if (!IsWordSep(*p)) break; + } + last=p; + } +} + +template <class F> +void VisitTokens(char *p,F f) { + VisitTokens(p,p+std::strlen(p),f); +} + + +template <class F> +void VisitTokens(std::string const& s,F f) { + if (0) { + std::vector<std::string> ss=SplitOnWhitespace(s); + for (int i=0;i<ss.size();++i) + f(ss[i]); + return; + } + //FIXME: + if (s.empty()) return; + mutable_c_str mp(s); + SLIBDBG("mp="<<mp.p); + VisitTokens(mp.p,mp.p+s.size(),f); +} + +inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) { + cmd->clear(); + param->clear(); + std::vector<std::string> x; + SplitOnWhitespace(in, &x); + if (x.size() == 0) return; + *cmd = x[0]; + for (int i = 1; i < x.size(); ++i) { + if (i > 1) { *param += " "; } + *param += x[i]; + } +} + +void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out); + +// given the first character of a UTF8 block, find out how wide it is +// see http://en.wikipedia.org/wiki/UTF-8 for more info +inline unsigned int UTF8Len(unsigned char x) { + if (x < 0x80) return 1; + else if ((x >> 5) == 0x06) return 2; + else if ((x >> 4) == 0x0e) return 3; + else if ((x >> 3) == 0x1e) return 4; + else return 0; +} + +#endif |