From 80686d4e567bae579ea39e009826a2de92cd4ace Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 11 Aug 2010 02:37:10 +0000 Subject: major refactor, break bad circular deps git-svn-id: https://ws10smt.googlecode.com/svn/trunk@509 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/stringlib.h | 267 ---------------------------------------------------- 1 file changed, 267 deletions(-) delete mode 100644 decoder/stringlib.h (limited to 'decoder/stringlib.h') diff --git a/decoder/stringlib.h b/decoder/stringlib.h deleted file mode 100644 index 84e95d44..00000000 --- a/decoder/stringlib.h +++ /dev/null @@ -1,267 +0,0 @@ -#ifndef CDEC_STRINGLIB_H_ -#define CDEC_STRINGLIB_H_ - -//usage: string s=MAKESTRE(1<<" "<(ostringstream()< -#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "< -#include -#include -#include -#include -#include -#include - -// returns index of the first char at or after starting that is not in ws, or std::string::npos if there is none (empty or all-ws remainder) -inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { - return s.find_first_not_of(ws,starting); -} - -// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends; returns std::string::npos (not 0) when no non-ws char is found at or before ending, so callers must check for npos before doing iterator arithmetic with the result -inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { - std::size_t n=s.find_last_not_of(ws,ending); - if (n==std::string::npos) return n; - else return n+1; -} - -//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. 
// ...won't access any memory because begin==end).
// strip_ws no longer depends on that: all-whitespace input is handled before any offset is used.

// Returns a copy of s without leading and trailing whitespace (" \t\n\r", the
// same default set skip_ws and trailing_ws use).
// An empty or all-whitespace s yields "": find_first_not_of returns npos in
// that case, and forming s.begin()+npos would be undefined behaviour, so we
// return before any offset arithmetic happens.
inline std::string strip_ws(std::string const& s) {
  char const* const ws=" \t\n\r";
  std::size_t const first=s.find_first_not_of(ws);
  if (first==std::string::npos) return std::string();
  std::size_t const last=s.find_last_not_of(ws); // must exist because first does
  return s.substr(first,last-first+1);
}


// True iff line contains no '\n' at all.  A terminal newline makes this false;
// use is_single_line_stripped to tolerate terminal newlines/blanks.
inline bool is_single_line(std::string const& line) {
  return line.find('\n')==std::string::npos;
}

// Equivalent to is_single_line(strip_ws(line)) without building the stripped
// copy: true iff no '\n' lies between the first and last non-whitespace chars.
// Empty and all-whitespace lines count as single lines.
inline bool is_single_line_stripped(std::string const& line) {
  char const* const ws=" \t\n\r";
  std::size_t const first=line.find_first_not_of(ws);
  if (first==std::string::npos) return true; // nothing but whitespace
  std::size_t const end=line.find_last_not_of(ws)+1; // one past last non-ws char
  std::size_t const nl=line.find('\n',first);
  return nl==std::string::npos || nl>=end;
}

// Function object upper-casing one char (for use with std::transform).
struct toupperc {
  inline char operator()(char c) const {
    // std::toupper is only defined for EOF and values representable as
    // unsigned char; a plain char with the high bit set may be negative.
    return static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
  }
};

// Returns an upper-cased copy of s (per-byte, current C locale).
inline std::string toupper(std::string s) {
  std::transform(s.begin(),s.end(),s.begin(),toupperc());
  return s;
}

// True iff the range [bsub,esub) is a prefix of the range [bstr,estr).
// An empty [bsub,esub) is a prefix of everything.
template <class Istr, class Isubstr> inline
bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub)
{
  for (; bsub != esub; ++bsub, ++bstr) {
    if (bstr == estr)
      return false; // str ran out before the prefix did
    if (*bsub != *bstr)
      return false;
  }
  return true;
}

// True iff prefix (any container with begin()/end()) is a prefix of [bstr,estr).
template <class Istr, class Prefix> inline
bool match_begin(Istr bstr,Istr estr,Prefix const& prefix)
{
  return match_begin(bstr,estr,prefix.begin(),prefix.end());
}

// True iff prefix is a prefix of str (both containers with begin()/end()).
template <class Str, class Prefix> inline
bool match_begin(Str const& str,Prefix const& prefix)
{
  return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end());
}


// read line in the form of either:
//   source
//   source ||| target
// source will be returned as a string, target must be a sentence or
// a lattice (in PLF format) and will be returned as a Lattice object
void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
struct Lattice;
void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);

// Returns str with every leading and trailing char that occurs in dropChars
// removed.  Returns "" when str is empty or consists only of dropChars.
inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
  std::size_t const first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) return std::string();
  std::size_t const last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}

// Splits str on delimiter and stores the non-empty pieces in *res (previous
// contents are discarded; consecutive delimiters produce no empty tokens).
// Tokens are copied with their exact length, so embedded NUL bytes survive.
inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) {
  // Build into a local vector and swap at the end so that str may safely be
  // an element of *res.
  std::vector<std::string> tokens;
  std::size_t begin = 0;
  for (std::size_t i = 0; i < str.size(); ++i) {
    if (str[i] != delimiter) continue;
    if (i != begin)
      tokens.push_back(str.substr(begin, i - begin));
    begin = i + 1;
  }
  if (begin != str.size())
    tokens.push_back(str.substr(begin));
  res->swap(tokens);
}

// Number of non-empty delimiter-separated tokens in str (see Tokenize).
inline unsigned NTokens(const std::string& str, char delimiter)
{
  std::vector<std::string> r;
  Tokenize(str,delimiter,&r);
  return static_cast<unsigned>(r.size());
}

// Returns a lower-cased copy of in (per-byte, current C locale).
inline std::string LowercaseString(const std::string& in) {
  std::string res(in.size(),' ');
  for (std::size_t i = 0; i < in.size(); ++i) {
    // cast first: tolower is undefined for negative values other than EOF
    res[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(in[i])));
  }
  return res;
}

// Counts non-overlapping occurrences of sub in str, scanning left to right.
// An empty sub yields 0 (searching for it would never advance).
inline int CountSubstrings(const std::string& str, const std::string& sub) {
  if (sub.empty()) return 0;
  int res = 0;
  for (std::size_t p = str.find(sub); p != std::string::npos;
       p = str.find(sub, p + sub.size()))
    ++res;
  return res;
}

// Splits in on runs of ' ' and '\t' (newlines are NOT separators) and stores
// the non-empty words in *out, discarding its previous contents.
// Returns the number of words stored.
inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) {
  out->clear();
  std::size_t start = 0;
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (in[i] == ' ' || in[i] == '\t') {
      if (i > start)
        out->push_back(in.substr(start, i - start));
      start = i + 1;
    }
  }
  if (in.size() > start)
    out->push_back(in.substr(start));
  return static_cast<int>(out->size());
}

// Convenience overload returning the words by value.
inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
{
  std::vector<std::string> r;
  SplitOnWhitespace(in,&r);
  return r;
}


// Owns a private, writable, 0-terminated copy of a string's bytes.
struct mutable_c_str {
  // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading)
  char *p; // s.size()+1 bytes, released in the destructor
  mutable_c_str(std::string const& s) : p(static_cast<char *>(::operator new(s.size()+1))) {
    std::memcpy(p,s.data(),s.size());
    p[s.size()]=0;
  }
  ~mutable_c_str() { ::operator delete(p); }
private:
  // non-copyable: a copied or assigned p would be freed twice
  mutable_c_str(mutable_c_str const&);
  mutable_c_str& operator=(mutable_c_str const&);
};

// ' ' '\t' tokens hardcoded
//NOTE: you should have stripped endline chars out first.
-// true iff c is a space or a tab; newline and carriage return are not treated as separators -inline bool IsWordSep(char c) { - return c==' '||c=='\t'; -} - - -// NOTE(review): the template parameter list and most of the VisitTokens body (and whatever followed it up to "* out);") appear to be missing from this copy - restore them from the original header before relying on this text -template -// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens -void VisitTokens(char *p,char *const end,F f) { - SLIBDBG("VisitTokens. p="<* out); - -// given the first character of a UTF8 block, find out how wide it is -// see http://en.wikipedia.org/wiki/UTF-8 for more info -// returns 1 for ASCII (x < 0x80), 2/3/4 for lead bytes 110xxxxx/1110xxxx/11110xxx, and 0 for anything else (continuation bytes 10xxxxxx and bytes of the form 11111xxx) -inline unsigned int UTF8Len(unsigned char x) { - if (x < 0x80) return 1; - else if ((x >> 5) == 0x06) return 2; - else if ((x >> 4) == 0x0e) return 3; - else if ((x >> 3) == 0x1e) return 4; - else return 0; -} - -#endif -- cgit v1.2.3