summaryrefslogtreecommitdiff
path: root/src/stringlib.h
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2009-12-06 22:25:25 -0500
committerChris Dyer <redpony@gmail.com>2009-12-06 22:25:25 -0500
commit2a18010e255810cc2b5bcbe688f3db8eabda23ca (patch)
treee310286257e5445072303dcca03acb85a865c26a /src/stringlib.h
parent59ea352f3dcf3bf58969f404615fed4ff6b931f7 (diff)
add compound splitting logic and features (Dyer 2008, NAACL)
Diffstat (limited to 'src/stringlib.h')
-rw-r--r--src/stringlib.h10
1 files changed, 10 insertions, 0 deletions
diff --git a/src/stringlib.h b/src/stringlib.h
index d26952c7..76efee8f 100644
--- a/src/stringlib.h
+++ b/src/stringlib.h
@@ -88,4 +88,14 @@ inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::s
void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
// Returns the total number of bytes in a UTF-8 encoded sequence, judged
// solely from the bit pattern of its lead byte x:
//   0xxxxxxx -> 1    110xxxxx -> 2    1110xxxx -> 3    11110xxx -> 4
// Every other byte (10xxxxxx, or 11111xxx) yields 0.
// See http://en.wikipedia.org/wiki/UTF-8 for the encoding layout.
inline unsigned int UTF8Len(unsigned char x) {
  if ((x & 0x80) == 0x00) { return 1; }  // high bit clear: single byte
  if ((x & 0xe0) == 0xc0) { return 2; }  // top three bits are 110
  if ((x & 0xf0) == 0xe0) { return 3; }  // top four bits are 1110
  if ((x & 0xf8) == 0xf0) { return 4; }  // top five bits are 11110
  return 0;                              // no recognised lead-byte pattern
}
+
#endif