summaryrefslogtreecommitdiff
path: root/utils/stringlib.h
diff options
context:
space:
mode:
Diffstat (limited to 'utils/stringlib.h')
-rw-r--r--utils/stringlib.h267
1 files changed, 267 insertions, 0 deletions
diff --git a/utils/stringlib.h b/utils/stringlib.h
new file mode 100644
index 00000000..84e95d44
--- /dev/null
+++ b/utils/stringlib.h
@@ -0,0 +1,267 @@
+#ifndef CDEC_STRINGLIB_H_
+#define CDEC_STRINGLIB_H_
+
+// Usage: std::string s=MAKESTR(1<<" "<<c);
+// Streams expr into a temporary std::ostringstream and yields the resulting std::string.
+// expr is deliberately not parenthesized: it is a chain of << operands.
+// Names are std::-qualified so the macro does not depend on a using-directive at the call site.
+// <sstream> must have been included before the macro is expanded.
+#define MAKESTR(expr) ((dynamic_cast<std::ostringstream &>(std::ostringstream()<<std::dec<<expr)).str())
+// std::dec (or seekp, or another manip) is needed to convert to std::ostream reference.
+
+// Debug tracing for this header. With STRINGLIB_DEBUG defined, SLIBDBG(x) streams x to std::cerr
+// after the prefix "DBG(stringlib): " and ends the line; otherwise SLIBDBG(x) expands to nothing,
+// so x is not evaluated.
+#ifdef STRINGLIB_DEBUG
+#include <iostream>
+#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0)
+#else
+#define SLIBDBG(x)
+#endif
+
+#include <map>
+#include <vector>
+#include <cctype>
+#include <cstring>
+#include <string>
+#include <sstream>
+#include <algorithm>
+
+// Index of the first char of s, at or after `starting`, that is not one of the chars in ws.
+// Returns std::string::npos when there is no such char.
+inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") {
+  std::size_t const first_non_ws=s.find_first_not_of(ws,starting);
+  return first_non_ws;
+}
+
+// Returns one past the last non-ws char found at or before `ending`, or std::string::npos when
+// there is none. For a string containing at least one non-ws char,
+// string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends.
+inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") {
+  std::size_t const last_non_ws=s.find_last_not_of(ws,ending);
+  return last_non_ws==std::string::npos ? last_non_ws : last_non_ws+1;
+}
+
+// Returns a copy of s with leading and trailing whitespace (" \t\n\r") removed.
+// An empty or all-whitespace s yields an empty string. That case is handled explicitly: both
+// boundary positions would be npos there, and s.begin()+npos is not a valid iterator.
+inline std::string strip_ws(std::string const& s) {
+  char const* const ws=" \t\n\r"; // same set as the skip_ws/trailing_ws defaults
+  std::size_t const b=s.find_first_not_of(ws);
+  if (b==std::string::npos) return std::string();
+  std::size_t const e=s.find_last_not_of(ws); // always found, because b was
+  return s.substr(b,e-b+1);
+}
+
+
+// True iff line contains no '\n' at all. A terminal newline also counts; see
+// is_single_line_stripped for the variant that tolerates terminal newlines/blanks.
+inline bool is_single_line(std::string const& line) {
+  return line.find('\n')==std::string::npos;
+}
+
+// Same answer as is_single_line(strip_ws(line)), computed without building the stripped copy:
+// a '\n' only disqualifies the line when it lies before the end of the last non-ws char.
+inline bool is_single_line_stripped(std::string const& line) {
+  std::size_t const first=skip_ws(line);
+  std::size_t const past_last=trailing_ws(line);
+  std::size_t const newline=line.find('\n',first);
+  if (newline==std::string::npos) return true;
+  return newline>=past_last;
+}
+
+// Function object that upper-cases a single char with std::toupper.
+struct toupperc {
+  inline char operator()(char c) const {
+    // std::toupper requires a value representable as unsigned char (or EOF); passing a
+    // negative plain char is undefined behavior, so convert first.
+    return static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
+  }
+};
+
+// Returns a copy of s with every char mapped through toupperc.
+inline std::string toupper(std::string s) {
+  std::transform(s.begin(),s.end(),s.begin(),toupperc());
+  return s;
+}
+
+// True iff the range [bsub,esub) is a prefix of the range [bstr,estr), comparing elements
+// with !=. An empty [bsub,esub) is a prefix of anything.
+template <class Istr, class Isubstr> inline
+bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub)
+{
+  for (;bsub != esub;++bsub,++bstr) {
+    if (bstr == estr) // string ran out before the prefix did
+      return false;
+    if (*bsub != *bstr)
+      return false;
+  }
+  return true;
+}
+
+// As above, with the prefix given as a container (anything with begin()/end()).
+// The prefix is taken by const reference so it is not copied on every call.
+template <class Istr, class Prefix> inline
+bool match_begin(Istr bstr,Istr estr,Prefix const& prefix)
+{
+  return match_begin(bstr,estr,prefix.begin(),prefix.end());
+}
+
+// As above, with both the string and the prefix given as containers.
+template <class Str, class Prefix> inline
+bool match_begin(Str const& str,Prefix const& prefix)
+{
+  return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end());
+}
+
+
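+// Read a line in the form of either:
+// source
+// source ||| target
+// source will be returned as a string (*input). target is returned through *ref:
+// as a string by ParseTranslatorInput, or, for ParseTranslatorInputLattice, it must be a
+// sentence or a lattice (in PLF format) and will be returned as a Lattice object.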
+void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
+struct Lattice;
+void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);
+
+// Returns a copy of str with any leading and trailing chars that appear in dropChars removed.
+inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
+  std::string trimmed(str);
+  // When every char is droppable, find_last_not_of gives npos and npos+1 wraps to 0,
+  // so the whole string is erased.
+  const std::string::size_type keep = str.find_last_not_of(dropChars) + 1;
+  trimmed.erase(keep);
+  const std::string::size_type lead = trimmed.find_first_not_of(dropChars);
+  trimmed.erase(0, lead);
+  return trimmed;
+}
+
+// Splits str on `delimiter` and stores the non-empty pieces, in order, in *res
+// (previous contents of *res are discarded). Runs of delimiters and leading/trailing
+// delimiters produce no empty tokens.
+// Tokens are copied by length, so a token containing '\0' is kept whole.
+inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) {
+  res->clear();
+  std::string::size_type begin = 0;
+  while (begin < str.size()) {
+    std::string::size_type end = str.find(delimiter, begin);
+    if (end == std::string::npos) end = str.size(); // last token runs to the end
+    if (end != begin)
+      res->push_back(str.substr(begin, end - begin));
+    begin = end + 1;
+  }
+}
+
+// Number of non-empty pieces str splits into on `delimiter`, i.e. the number of tokens
+// Tokenize would produce. Counted in one pass, without building the tokens.
+inline unsigned NTokens(const std::string& str, char delimiter)
+{
+  unsigned n = 0;
+  bool in_token = false;
+  for (std::string::size_type i = 0; i < str.size(); ++i) {
+    if (str[i] == delimiter) {
+      in_token = false;
+    } else if (!in_token) { // first char of a new token
+      in_token = true;
+      ++n;
+    }
+  }
+  return n;
+}
+
+// Returns a copy of `in` with every char mapped through std::tolower.
+inline std::string LowercaseString(const std::string& in) {
+  std::string res(in);
+  for (std::string::size_type i = 0; i < res.size(); ++i) {
+    // std::tolower requires a value representable as unsigned char (or EOF); a negative
+    // plain char would be undefined behavior.
+    res[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(res[i])));
+  }
+  return res;
+}
+
+// Counts non-overlapping occurrences of sub in str, scanning left to right
+// (so "aaa" contains "aa" once). An empty sub is counted as 0 occurrences; without this
+// guard the search position would never advance and the loop would not terminate.
+inline int CountSubstrings(const std::string& str, const std::string& sub) {
+  if (sub.empty()) return 0;
+  int res = 0;
+  for (std::string::size_type p = str.find(sub); p != std::string::npos;
+       p = str.find(sub, p + sub.size()))
+    ++res;
+  return res;
+}
+
+// Splits `in` on ' ' and '\t' (newlines are NOT separators) and stores the non-empty words,
+// in order, in *out (previous contents are discarded). Returns the number of words.
+inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) {
+  out->clear();
+  std::string::size_type start = 0; // start of the word currently being scanned
+  for (std::string::size_type i = 0; i < in.size(); ++i) {
+    if (in[i] == ' ' || in[i] == '\t') {
+      if (i > start)
+        out->push_back(in.substr(start, i - start));
+      start = i + 1;
+    }
+  }
+  if (in.size() > start) // final word not followed by a separator
+    out->push_back(in.substr(start));
+  return static_cast<int>(out->size());
+}
+
+// Convenience overload: returns the words of `in` by value.
+inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
+{
+  std::vector<std::string> r;
+  SplitOnWhitespace(in,&r);
+  return r;
+}
+
+
+// Owns a private, writable, 0-terminated copy of a string's chars.
+// Used because making a copy of a string might not copy its storage, so modifying a c_str()
+// could screw up the original.
+struct mutable_c_str {
+  char *p; // s.size()+1 bytes; p[s.size()] is 0
+  mutable_c_str(std::string const& s) : p(static_cast<char *>(::operator new(s.size()+1))) {
+    std::memcpy(p,s.data(),s.size());
+    p[s.size()]=0;
+  }
+  ~mutable_c_str() { ::operator delete(p); }
+private:
+  // Non-copyable: a copied or assigned object would delete the same buffer twice.
+  // Declared but intentionally not defined.
+  mutable_c_str(mutable_c_str const&);
+  mutable_c_str& operator=(mutable_c_str const&);
+};
+
+// Word separators are hardcoded: ' ' and '\t' only.
+//NOTE: endline chars are not separators; strip them from the input first.
+inline bool IsWordSep(char c) {
+  switch (c) {
+    case ' ':
+    case '\t':
+      return true;
+    default:
+      return false;
+  }
+}
+
+
+// Splits the buffer [p,end) into words separated by IsWordSep chars and calls f(word) once per
+// word, in order; word is a 0-terminated char* pointing into the buffer itself.
+// The buffer is modified: the separator that ends each word is overwritten with 0.
+// Empty input and input consisting only of separators produce no calls to f.
+template <class F>
+// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens
+void VisitTokens(char *p,char *const end,F f) {
+ SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p);
+ if (p==end) return;
+ char *last; // start of the current word; the buffer is overwritten in place, so callers must pass storage that is ok to mutilate.
+ while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
+ last=p; // first non-ws char
+ for(;;) {
+ SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p);
+ // last==p, pointing at first non-ws char not yet translated into f(word) call
+ for(;;) {// p to end of word
+ ++p;
+ if (p==end) {
+ // final word: it is terminated by the 0 the caller guarantees at *end
+ f(last);
+ SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p);
+ return;
+ }
+ if (IsWordSep(*p)) break;
+ }
+ *p=0; // terminate the word in place, overwriting its trailing separator
+ f(last);
+ SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p);
+ for(;;) { // again skip extra whitespace
+ ++p;
+ if (p==end) return; // only separators were left
+ if (!IsWordSep(*p)) break;
+ }
+ last=p; // start of the next word
+ }
+}
+
+// Tokenizes the 0-terminated C string p in place: the end of the buffer is located with
+// std::strlen and the work is forwarded to the (p,end,f) overload.
+template <class F>
+void VisitTokens(char *p,F f) {
+  char *const end=p+std::strlen(p);
+  VisitTokens(p,end,f);
+}
+
+
+// Calls f(word) for each IsWordSep-separated word of s, in order. word is a 0-terminated
+// char* into a private copy of s (a mutable_c_str), so s itself is never modified; the copy
+// is freed when this function returns, so f must not keep the pointer.
+// f only has to be callable with a char*.
+template <class F>
+void VisitTokens(std::string const& s,F f) {
+  if (s.empty()) return;
+  mutable_c_str mp(s);
+  SLIBDBG("mp="<<mp.p);
+  VisitTokens(mp.p,mp.p+s.size(),f);
+}
+
+// Splits `in` with SplitOnWhitespace: the first word goes to *cmd and the remaining words,
+// joined by single spaces, go to *param. Both outputs are cleared first, so an input with
+// no words leaves both empty.
+inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
+  cmd->clear();
+  param->clear();
+  std::vector<std::string> words;
+  SplitOnWhitespace(in, &words);
+  if (words.empty()) return;
+  std::vector<std::string>::const_iterator w = words.begin();
+  *cmd = *w;
+  ++w;
+  std::vector<std::string>::const_iterator const first_param = w;
+  for (; w != words.end(); ++w) {
+    if (w != first_param) { *param += " "; }
+    *param += *w;
+  }
+}
+
+void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
+
+// Given the first byte of a UTF-8 sequence, returns how many bytes the sequence occupies (1-4).
+// Returns 0 when x matches none of the lead-byte patterns below (e.g. a continuation byte).
+// see http://en.wikipedia.org/wiki/UTF-8 for more info
+inline unsigned int UTF8Len(unsigned char x) {
+  if ((x & 0x80) == 0x00) return 1; // 0xxxxxxx
+  if ((x & 0xE0) == 0xC0) return 2; // 110xxxxx
+  if ((x & 0xF0) == 0xE0) return 3; // 1110xxxx
+  if ((x & 0xF8) == 0xF0) return 4; // 11110xxx
+  return 0;
+}
+
+#endif