diff options
| author | Chris Dyer <redpony@gmail.com> | 2009-12-06 22:25:25 -0500 | 
|---|---|---|
| committer | Chris Dyer <redpony@gmail.com> | 2009-12-06 22:25:25 -0500 | 
| commit | 2a18010e255810cc2b5bcbe688f3db8eabda23ca (patch) | |
| tree | e310286257e5445072303dcca03acb85a865c26a /src/stringlib.h | |
| parent | 59ea352f3dcf3bf58969f404615fed4ff6b931f7 (diff) | |
add compound splitting logic and features (Dyer 2008, NAACL)
Diffstat (limited to 'src/stringlib.h')
| -rw-r--r-- | src/stringlib.h | 10 | 
1 files changed, 10 insertions, 0 deletions
```diff
diff --git a/src/stringlib.h b/src/stringlib.h
index d26952c7..76efee8f 100644
--- a/src/stringlib.h
+++ b/src/stringlib.h
@@ -88,4 +88,14 @@ inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::s
 
 void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
 
+// given the first character of a UTF8 block, find out how wide it is
+// see http://en.wikipedia.org/wiki/UTF-8 for more info
+inline unsigned int UTF8Len(unsigned char x) {
+  if (x < 0x80) return 1;
+  else if ((x >> 5) == 0x06) return 2;
+  else if ((x >> 4) == 0x0e) return 3;
+  else if ((x >> 3) == 0x1e) return 4;
+  else return 0;
+}
+
 #endif
```
