diff options
author | Chris Dyer <redpony@gmail.com> | 2009-12-06 22:25:25 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2009-12-06 22:25:25 -0500 |
commit | 2a18010e255810cc2b5bcbe688f3db8eabda23ca (patch) | |
tree | e310286257e5445072303dcca03acb85a865c26a /src/stringlib.h | |
parent | 59ea352f3dcf3bf58969f404615fed4ff6b931f7 (diff) |
add compound splitting logic and features (Dyer 2008, NAACL)
Diffstat (limited to 'src/stringlib.h')
-rw-r--r-- | src/stringlib.h | 10 |
1 files changed, 10 insertions, 0 deletions
diff --git a/src/stringlib.h b/src/stringlib.h index d26952c7..76efee8f 100644 --- a/src/stringlib.h +++ b/src/stringlib.h @@ -88,4 +88,14 @@ inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::s void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out); +// given the first character of a UTF8 block, find out how wide it is +// see http://en.wikipedia.org/wiki/UTF-8 for more info +inline unsigned int UTF8Len(unsigned char x) { + if (x < 0x80) return 1; + else if ((x >> 5) == 0x06) return 2; + else if ((x >> 4) == 0x0e) return 3; + else if ((x >> 3) == 0x1e) return 4; + else return 0; +} + #endif |