diff options
author | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
commit | 851e389dffdd6996ea32d70defb8906de80b9edc (patch) | |
tree | 8c68ee77205badc056b8ab5b332e67e3e98017df /src/stringlib.h | |
parent | dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff) |
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'src/stringlib.h')
-rw-r--r-- | src/stringlib.h | 101 |
1 files changed, 0 insertions, 101 deletions
diff --git a/src/stringlib.h b/src/stringlib.h deleted file mode 100644 index 76efee8f..00000000 --- a/src/stringlib.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef _STRINGLIB_H_ - -#include <map> -#include <vector> -#include <cctype> -#include <string> - -// read line in the form of either: -// source -// source ||| target -// source will be returned as a string, target must be a sentence or -// a lattice (in PLF format) and will be returned as a Lattice object -void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); -struct Lattice; -void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); - -inline const std::string Trim(const std::string& str, const std::string& dropChars = " \t") { - std::string res = str; - res.erase(str.find_last_not_of(dropChars)+1); - return res.erase(0, res.find_first_not_of(dropChars)); -} - -inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) { - std::string s = str; - int last = 0; - res->clear(); - for (int i=0; i < s.size(); ++i) - if (s[i] == delimiter) { - s[i]=0; - if (last != i) { - res->push_back(&s[last]); - } - last = i + 1; - } - if (last != s.size()) - res->push_back(&s[last]); -} - -inline std::string LowercaseString(const std::string& in) { - std::string res(in.size(),' '); - for (int i = 0; i < in.size(); ++i) - res[i] = tolower(in[i]); - return res; -} - -inline int CountSubstrings(const std::string& str, const std::string& sub) { - size_t p = 0; - int res = 0; - while (p < str.size()) { - p = str.find(sub, p); - if (p == std::string::npos) break; - ++res; - p += sub.size(); - } - return res; -} - -inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) { - out->clear(); - int i = 0; - int start = 0; - std::string cur; - while(i < in.size()) { - if (in[i] == ' ' || in[i] == '\t') { - if (i - start > 0) - out->push_back(in.substr(start, i - start)); - start = i + 1; - } - ++i; - } - if (i > start) - out->push_back(in.substr(start, i - start)); - return out->size(); -} - -inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) { - cmd->clear(); - param->clear(); - std::vector<std::string> x; - SplitOnWhitespace(in, &x); - if (x.size() == 0) return; - *cmd = x[0]; - for (int i = 1; i < x.size(); ++i) { - if (i > 1) { *param += " "; } - *param += x[i]; - } -} - -void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out); - -// given the first character of a UTF8 block, find out how wide it is -// see http://en.wikipedia.org/wiki/UTF-8 for more info -inline unsigned int UTF8Len(unsigned char x) { - if (x < 0x80) return 1; - else if ((x >> 5) == 0x06) return 2; - else if ((x >> 4) == 0x0e) return 3; - else if ((x >> 3) == 0x1e) return 4; - else return 0; -} - -#endif |