// Provenance: git patch (exported by cgit v1.2.3)
//   Commit:  2a18010e255810cc2b5bcbe688f3db8eabda23ca
//   From:    Chris Dyer
//   Date:    Sun, 6 Dec 2009 22:25:25 -0500
//   Subject: add compound splitting logic and features (Dyer 2008, NAACL)
//   File:    src/stringlib.h (index d26952c7..76efee8f 100644)
//   Stat:    1 file changed, 10 insertions(+)
//   Hunk:    @@ -88,4 +88,14 @@ inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::s
//   Placement: the definition below is inserted after the declaration
//       void ProcessAndStripSGML(std::string* line, std::map* out);
//   and before the header's closing #endif.

// Given the first (lead) byte of a UTF-8 encoded character, report how many
// bytes the complete encoded sequence occupies.
// See http://en.wikipedia.org/wiki/UTF-8 for more info.
//
//   bit pattern of x    result
//   0xxxxxxx            1
//   110xxxxx            2
//   1110xxxx            3
//   11110xxx            4
//   anything else       0
//
// A result of 0 means x cannot start a sequence under the patterns above
// (for example a 10xxxxxx continuation byte, or 11111xxx). Only the prefix
// bits of x are inspected; no other validation is performed, so callers that
// advance through a buffer by the returned width must handle 0 themselves.
inline unsigned int UTF8Len(unsigned char x) {
  if (x < 0x80) return 1;          // 0xxxxxxx: single-byte (ASCII range)
  if ((x >> 5) == 0x06) return 2;  // top three bits are 110
  if ((x >> 4) == 0x0e) return 3;  // top four bits are 1110
  if ((x >> 3) == 0x1e) return 4;  // top five bits are 11110
  return 0;                        // not a recognised lead byte
}