diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-20 23:57:46 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-20 23:57:46 +0000 |
commit | f6cd72d39cdc7a034b36ad73d613f6f268826e48 (patch) | |
tree | 6ca964655dc8e2989087d1b32c26f60f7ea6aece | |
parent | 232cf4e95235bc29bf04e4ea2bf1483d1baf2b7d (diff) |
fabulous string tokenization tested
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@350 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | decoder/sentences.h | 5 | ||||
-rw-r--r-- | decoder/stringlib.h | 30 | ||||
-rwxr-xr-x | decoder/stringlib_test.cc | 17 | ||||
-rwxr-xr-x | decoder/value_array.h | 2 |
4 files changed, 49 insertions, 5 deletions
diff --git a/decoder/sentences.h b/decoder/sentences.h index 622a6f43..6ab216bf 100755 --- a/decoder/sentences.h +++ b/decoder/sentences.h @@ -15,9 +15,12 @@ inline std::ostream & operator<<(std::ostream &out,Sentence const& s) { inline void StringToSentence(std::string const& str,Sentence &s) { using namespace std; - vector<string> ss=SplitOnWhitespace(str); s.clear(); + TD::ConvertSentence(str,&s); +/* vector<string> ss=SplitOnWhitespace(str); transform(ss.begin(),ss.end(),back_inserter(s),ToTD()); +*/ + } inline Sentence StringToSentence(std::string const& str) { diff --git a/decoder/stringlib.h b/decoder/stringlib.h index a7c6c3c4..a0e03624 100644 --- a/decoder/stringlib.h +++ b/decoder/stringlib.h @@ -1,6 +1,13 @@ #ifndef CDEC_STRINGLIB_H_ #define CDEC_STRINGLIB_H_ +#ifdef STRINGLIB_DEBUG +#include <iostream> +#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0) +#else +#define SLIBDBG(x) +#endif + #include <map> #include <vector> #include <cctype> @@ -112,20 +119,26 @@ inline bool IsWordSep(char c) { template <class F> // *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens void VisitTokens(char *p,char *const end,F f) { + SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p); if (p==end) return; char *last; // 0 terminated already. this is ok to mutilate because s is a copy of the string passed in. well, barring copy on write i guess. while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace last=p; // first non-ws char for(;;) { - ++p; - // now last is a non-ws char, and p is one past it. + SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p); + // last==p, pointing at first non-ws char not yet translated into f(word) call for(;;) {// p to end of word - if (p==end) { f(last); return; } - if (!IsWordSep(*p)) break; ++p; + if (p==end) { + f(last); + SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p); + return; + } + if (IsWordSep(*p)) break; } *p=0; f(last); + SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p); for(;;) { // again skip extra whitespace ++p; if (p==end) return; @@ -136,14 +149,23 @@ void VisitTokens(char *p,char *const end,F f) { } template <class F> +void VisitTokens(char *p,F f) { + VisitTokens(p,p+std::strlen(p),f); +} + + +template <class F> void VisitTokens(std::string const& s,F f) { + if (0) { std::vector<std::string> ss=SplitOnWhitespace(s); for (int i=0;i<ss.size();++i) f(ss[i]); return; + } //FIXME: if (s.empty()) return; mutable_c_str mp(s); + SLIBDBG("mp="<<mp.p); VisitTokens(mp.p,mp.p+s.size(),f); } diff --git a/decoder/stringlib_test.cc b/decoder/stringlib_test.cc new file mode 100755 index 00000000..ea39f30f --- /dev/null +++ b/decoder/stringlib_test.cc @@ -0,0 +1,17 @@ +#define STRINGLIB_DEBUG +#include "stringlib.h" + +using namespace std; +struct print { + template <class S> + void operator()(S const& s) const { + cout<<s<<endl; + } +}; + +int main(int argc, char *argv[]) { + char *p=" 1 are u 2 serious?"; + std::string const& w="verylongword"; + VisitTokens(p,print()); + VisitTokens(w,print()); +} diff --git a/decoder/value_array.h b/decoder/value_array.h index 7401938a..042247a1 100755 --- a/decoder/value_array.h +++ b/decoder/value_array.h @@ -17,6 +17,8 @@ template <class T, class A = std::allocator<T> > class ValueArray : A // private inheritance so stateless allocator adds no size. { public: + const int SV_MAX=sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1; + //space optimization: SV_MAX T will fit inside what would otherwise be a pointer to heap data. todo in the far future if bored. typedef T value_type; typedef T& reference; typedef T const& const_reference; |