summaryrefslogtreecommitdiff
path: root/decoder/stringlib.h
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-20 23:57:46 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-20 23:57:46 +0000
commit8baa12d8f4973471105a590a7bd42b2ec2e3592a (patch)
treebfa7d2de32b285848652202cce23ed00f6308e90 /decoder/stringlib.h
parent234e92201c32eac97d472d2b2454809028538df4 (diff)
fabulous string tokenization tested
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@350 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/stringlib.h')
-rw-r--r--decoder/stringlib.h30
1 files changed, 26 insertions, 4 deletions
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index a7c6c3c4..a0e03624 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -1,6 +1,13 @@
#ifndef CDEC_STRINGLIB_H_
#define CDEC_STRINGLIB_H_
+#ifdef STRINGLIB_DEBUG
+#include <iostream>
+#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0)
+#else
+#define SLIBDBG(x)
+#endif
+
#include <map>
#include <vector>
#include <cctype>
@@ -112,20 +119,26 @@ inline bool IsWordSep(char c) {
template <class F>
// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens
void VisitTokens(char *p,char *const end,F f) {
+ SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p);
if (p==end) return;
char *last; // 0 terminated already. this is ok to mutilate because s is a copy of the string passed in. well, barring copy on write i guess.
while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
last=p; // first non-ws char
for(;;) {
- ++p;
- // now last is a non-ws char, and p is one past it.
+ SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p);
+ // last==p, pointing at first non-ws char not yet translated into f(word) call
for(;;) {// p to end of word
- if (p==end) { f(last); return; }
- if (!IsWordSep(*p)) break;
++p;
+ if (p==end) {
+ f(last);
+ SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p);
+ return;
+ }
+ if (IsWordSep(*p)) break;
}
*p=0;
f(last);
+ SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p);
for(;;) { // again skip extra whitespace
++p;
if (p==end) return;
@@ -136,14 +149,23 @@ void VisitTokens(char *p,char *const end,F f) {
}
template <class F>
+void VisitTokens(char *p,F f) {
+ VisitTokens(p,p+std::strlen(p),f);
+}
+
+
+template <class F>
void VisitTokens(std::string const& s,F f) {
+ if (0) {
std::vector<std::string> ss=SplitOnWhitespace(s);
for (int i=0;i<ss.size();++i)
f(ss[i]);
return;
+ }
//FIXME:
if (s.empty()) return;
mutable_c_str mp(s);
+ SLIBDBG("mp="<<mp.p);
VisitTokens(mp.p,mp.p+s.size(),f);
}