summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-20 23:57:46 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-20 23:57:46 +0000
commitf6cd72d39cdc7a034b36ad73d613f6f268826e48 (patch)
tree6ca964655dc8e2989087d1b32c26f60f7ea6aece
parent232cf4e95235bc29bf04e4ea2bf1483d1baf2b7d (diff)
fabulous string tokenization tested
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@350 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-xdecoder/sentences.h5
-rw-r--r--decoder/stringlib.h30
-rwxr-xr-xdecoder/stringlib_test.cc17
-rwxr-xr-xdecoder/value_array.h2
4 files changed, 49 insertions, 5 deletions
diff --git a/decoder/sentences.h b/decoder/sentences.h
index 622a6f43..6ab216bf 100755
--- a/decoder/sentences.h
+++ b/decoder/sentences.h
@@ -15,9 +15,12 @@ inline std::ostream & operator<<(std::ostream &out,Sentence const& s) {
inline void StringToSentence(std::string const& str,Sentence &s) {
using namespace std;
- vector<string> ss=SplitOnWhitespace(str);
s.clear();
+ TD::ConvertSentence(str,&s); // fills s from str; replaces the SplitOnWhitespace + ToTD transform, now commented out below
+/* vector<string> ss=SplitOnWhitespace(str);
transform(ss.begin(),ss.end(),back_inserter(s),ToTD());
+*/
+
}
inline Sentence StringToSentence(std::string const& str) {
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index a7c6c3c4..a0e03624 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -1,6 +1,13 @@
#ifndef CDEC_STRINGLIB_H_
#define CDEC_STRINGLIB_H_
+#ifdef STRINGLIB_DEBUG // define before including this header to turn on the SLIBDBG trace output
+#include <iostream>
+#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0) // x is a <<-chain written to std::cerr; do/while(0) makes the expansion a single statement
+#else
+#define SLIBDBG(x) // expands to nothing, so the argument is never evaluated
+#endif
+
#include <map>
#include <vector>
#include <cctype>
@@ -112,20 +119,26 @@ inline bool IsWordSep(char c) {
template <class F>
// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens
void VisitTokens(char *p,char *const end,F f) {
+ SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p);
if (p==end) return;
char *last; // 0 terminated already. this is ok to mutilate because s is a copy of the string passed in. well, barring copy on write i guess.
while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
last=p; // first non-ws char
for(;;) {
- ++p;
- // now last is a non-ws char, and p is one past it.
+ SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p);
+ // last==p, pointing at first non-ws char not yet translated into f(word) call
for(;;) {// p to end of word
- if (p==end) { f(last); return; }
- if (!IsWordSep(*p)) break;
++p;
+ if (p==end) {
+ f(last);
+ SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p);
+ return;
+ }
+ if (IsWordSep(*p)) break;
}
*p=0;
f(last);
+ SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p);
for(;;) { // again skip extra whitespace
++p;
if (p==end) return;
@@ -136,14 +149,23 @@ void VisitTokens(char *p,char *const end,F f) {
}
template <class F>
+void VisitTokens(char *p,F f) { // convenience overload: tokenizes the 0-terminated C string at p in place (the buffer is written to)
+ VisitTokens(p,p+std::strlen(p),f); // end points at the terminating 0, as the (p,end) overload requires
+}
+
+
+template <class F> // std::string overload: forwards mp.p and s.size() to the in-place (p,end) tokenizer
void VisitTokens(std::string const& s,F f) {
+ if (0) { // disabled: the previous SplitOnWhitespace-based implementation, still compiled but never executed
std::vector<std::string> ss=SplitOnWhitespace(s);
for (int i=0;i<ss.size();++i)
f(ss[i]);
return;
+ }
//FIXME:
if (s.empty()) return;
mutable_c_str mp(s);
+ SLIBDBG("mp="<<mp.p); // trace of the buffer about to be tokenized; no-op unless STRINGLIB_DEBUG is defined
VisitTokens(mp.p,mp.p+s.size(),f);
}
diff --git a/decoder/stringlib_test.cc b/decoder/stringlib_test.cc
new file mode 100755
index 00000000..ea39f30f
--- /dev/null
+++ b/decoder/stringlib_test.cc
@@ -0,0 +1,17 @@
+#define STRINGLIB_DEBUG
+#include "stringlib.h"
+
+using namespace std;
+struct print { // visitor handed to VisitTokens: writes each token it receives on its own line
+ template <class S>
+ void operator()(S const& s) const { // templated so it accepts whatever token type the caller passes
+ cout<<s<<endl; // cout/endl rely on <iostream>, which stringlib.h includes only because STRINGLIB_DEBUG is defined above
+ }
+};
+
+int main(int argc, char *argv[]) { // smoke test: tokenizes one C string and one std::string, printing every token
+ char p[]=" 1 are u 2 serious?"; // writable array, not a pointer to a string literal: VisitTokens(char*,F) overwrites separators with 0 in place
+ std::string const& w="verylongword"; // a single token with no separators
+ VisitTokens(p,print()); // the array decays to char*, so the in-place C-string overload is chosen
+ VisitTokens(w,print());
+}
diff --git a/decoder/value_array.h b/decoder/value_array.h
index 7401938a..042247a1 100755
--- a/decoder/value_array.h
+++ b/decoder/value_array.h
@@ -17,6 +17,8 @@ template <class T, class A = std::allocator<T> >
class ValueArray : A // private inheritance so stateless allocator adds no size.
{
public:
+ const int SV_MAX=sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1;
+ //space optimization: SV_MAX T will fit inside what would otherwise be a pointer to heap data. todo in the far future if bored.
typedef T value_type;
typedef T& reference;
typedef T const& const_reference;