summaryrefslogtreecommitdiff
path: root/decoder/stringlib.h
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-20 22:44:40 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-20 22:44:40 +0000
commit0720de0bee526e8e9b311bb91d0a3a1efa8c1438 (patch)
tree10f4f50fbaff6c8ecec0c68a72a31c3aa0d3d2a9 /decoder/stringlib.h
parentac98d8465d9b7a6faf2b51bcd18260375842f6c8 (diff)
beautiful (?) low-level tokenization to fix valgrind problem with previous TD string->Sentence
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@345 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/stringlib.h')
-rw-r--r--decoder/stringlib.h54
1 files changed, 54 insertions, 0 deletions
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index 6bb8cff0..a21ffd59 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -4,6 +4,7 @@
#include <map>
#include <vector>
#include <cctype>
+#include <cstring>
#include <string>
// read line in the form of either:
@@ -88,6 +89,59 @@ inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
return r;
}
+
// Owns a private, writable, 0-terminated copy of a std::string's characters.
//
// Why not just copy the std::string and write through it: a copy of a string
// might not copy its storage (copy-on-write implementations), so modifying the
// copy's characters in place could corrupt the original. This struct always
// allocates fresh storage and memcpy's the characters into it.
//
// The buffer holds s.size()+1 bytes; the extra byte is set to 0. Characters
// are copied with memcpy, so embedded 0 bytes in s are preserved as-is.
struct mutable_c_str {
  char *p;  // start of the owned buffer; released in the destructor

  // Allocates s.size()+1 bytes and copies s into them, followed by a 0 byte.
  mutable_c_str(std::string const& s)
      : p(static_cast<char *>(::operator new(s.size() + 1))) {
    std::memcpy(p, s.data(), s.size());
    p[s.size()] = 0;
  }

  ~mutable_c_str() { ::operator delete(p); }

 private:
  // Non-copyable (declared, intentionally never defined). Both operations
  // must be suppressed: a compiler-generated copy or assignment would leave
  // two objects owning the same pointer, and the second destructor would
  // delete it again.
  mutable_c_str(mutable_c_str const&);
  mutable_c_str& operator=(mutable_c_str const&);
};
+
// Word-separator predicate: only ' ' and '\t' count (hardcoded).
// NOTE: end-of-line characters are NOT separators here; strip them from the
// input before tokenizing.
inline bool IsWordSep(char c) {
  return c == ' ' || c == '\t';
}


// Splits the buffer [p,end) into maximal runs of non-separator characters
// (see IsWordSep) and calls f(token) once per run, in order, where token is a
// char* to the 0-terminated run inside the buffer.
//
// Preconditions:
//   - [p,end] is valid, writable storage and *end == 0. The final token is
//     not re-terminated; it relies on the 0 already stored at end.
// Effects:
//   - The buffer is modified in place: the separator that follows each token
//     (other than a token running up to end) is overwritten with 0.
//   - Leading, trailing and repeated separators produce no empty tokens; an
//     empty or all-separator buffer produces no calls to f.
template <class F>
void VisitTokens(char *p, char *const end, F f) {
  for (;;) {
    // Skip the separators in front of the next token (if any).
    while (p != end && IsWordSep(*p)) ++p;
    if (p == end) return;

    // p is on the first character of a token; advance to one past its end.
    // The scan must stop ON a separator (or at end), never on a word char.
    char *const token = p;
    while (p != end && !IsWordSep(*p)) ++p;

    if (p == end) {  // token runs to the end of the buffer: *end is its 0
      f(token);
      return;
    }
    *p = 0;  // turn the separator into this token's terminator
    f(token);
    ++p;  // continue after the terminator we just wrote
  }
}
+
+template <class F>
+void VisitTokens(std::string const& s,F f) {
+ if (s.empty()) return;
+ mutable_c_str mp(s);
+ VisitTokens(mp.p,mp.p+s.size(),f);
+}
+
inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
cmd->clear();
param->clear();