From 0720de0bee526e8e9b311bb91d0a3a1efa8c1438 Mon Sep 17 00:00:00 2001
From: graehl
Date: Tue, 20 Jul 2010 22:44:40 +0000
Subject: beautiful (?) low-level tokenization to fix valgrind problem with previous TD string->Sentence

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@345 ec762483-ff6d-05da-a07a-a48fb63a330f
---
Review notes (commentary only; not part of the original commit message):
 * VisitTokens(char*, char* const, F): the end-of-word scan used to leave its
   loop on the first NON-separator character, so tokens were truncated
   ("ab cd" visited "a" and "c"). It now leaves the loop on the first
   separator, so "ab cd" visits "ab" and "cd".
 * Line breaks and all text inside angle brackets were lost when this patch
   was extracted, and are restored here. Template arguments follow from how
   the names are used. The header names on stringlib.h context lines, the tail
   of the new TD::GetString body, and the position of one blank context line
   in the last tdict.cc hunk are best-effort reconstructions chosen to match
   the hunk line counts. TODO: confirm against the repository before applying.

 decoder/stringlib.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 decoder/tdict.cc    | 44 +++++++++++++++++++++++++------------------
 decoder/tdict.h     |  3 ++-
 3 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index 6bb8cff0..a21ffd59 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -4,6 +4,7 @@
 #include <map>
 #include <vector>
 #include <cctype>
+#include <cstring>
 #include <string>
 
 // read line in the form of either:
@@ -88,6 +89,59 @@ inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
   return r;
 }
 
+
+struct mutable_c_str {
+  // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading)
+  char *p;
+  mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) {
+    std::memcpy(p,s.data(),s.size());
+    p[s.size()]=0;
+  }
+  ~mutable_c_str() { ::operator delete(p); }
+private:
+  mutable_c_str(mutable_c_str const&);
+};
+
+// ' ' '\t' tokens hardcoded
+//NOTE: you should have stripped endline chars out first.
+inline bool IsWordSep(char c) {
+  return c==' '||c=='\t';
+}
+
+
+template <class F>
+// calls f(token) for each ' '/'\t'-separated token in [p,end). *end must be 0 (i.e. [p,end] is valid storage); the separator after each token is overwritten with 0 so every token is a c string.
+void VisitTokens(char *p,char *const end,F f) {
+  if (p==end) return;
+  char *last; // 0 terminated already.  this is ok to mutilate because s is a copy of the string passed in.  well, barring copy on write i guess.
+  while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
+  last=p; // first non-ws char
+  for(;;) {
+    ++p;
+    // now last is a non-ws char, and p is one past it.
+    for(;;) { // advance p to the end of the word: stop at the first separator, or at end
+      if (p==end) { f(last); return; }
+      if (IsWordSep(*p)) break;
+      ++p;
+    }
+    *p=0;
+    f(last);
+    for(;;) { // again skip extra whitespace
+      ++p;
+      if (p==end) return;
+      if (!IsWordSep(*p)) break;
+    }
+    last=p;
+  }
+}
+
+template <class F>
+void VisitTokens(std::string const& s,F f) {
+  if (s.empty()) return;
+  mutable_c_str mp(s);
+  VisitTokens(mp.p,mp.p+s.size(),f);
+}
+
 inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
   cmd->clear();
   param->clear();
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 93f7b0eb..d7fc7eb7 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -1,7 +1,9 @@
+#include <sstream>
 #include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
 #include "Vocab.h"
+#include "stringlib.h"
 
 using namespace std;
 
@@ -16,6 +18,10 @@ WordID TD::Convert(const std::string& s) {
   return dict_.addWord((VocabString)s.c_str());
 }
 
+WordID TD::Convert(char const* s) {
+  return dict_.addWord((VocabString)s);
+}
+
 const char* TD::Convert(const WordID& w) {
   return dict_.getWord((VocabIndex)w);
 }
@@ -31,25 +37,27 @@ void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>
 }
 
 std::string TD::GetString(const std::vector<WordID>& str) {
-  string res;
-  for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i)
-    res += (i == str.begin() ? empty : space) + TD::Convert(*i);
-  return res;
+  ostringstream o;
+  for (int i=0;i<str.size();++i) {
+    if (i) o << ' ';
+    o << Convert(str[i]);
+  }
+  return o.str();
 }
 
-void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
-  string s = sent;
-  int last = 0;
-  ids->clear();
-  for (int i=0; i < s.size(); ++i)
-    if (s[i] == 32 || s[i] == '\t') {
-      s[i]=0;
-      if (last != i) {
-        ids->push_back(Convert(&s[last]));
-      }
-      last = i + 1;
-    }
-  if (last != s.size())
-    ids->push_back(Convert(&s[last]));
+namespace {
+struct add_wordids {
+  typedef std::vector<WordID> Ws;
+  Ws *ids;
+  explicit add_wordids(Ws *i) : ids(i) {  }
+  void operator()(char const* s) {
+    ids->push_back(TD::Convert(s));
+  }
+};
+
 }
+void TD::ConvertSentence(std::string const& s, std::vector<WordID>* ids) {
+  ids->clear();
+  VisitTokens(s,add_wordids(ids));
+}
 
diff --git a/decoder/tdict.h b/decoder/tdict.h
index af1612ba..6b90becb 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -9,7 +9,7 @@ class Vocab;
 
 struct TD {
   static Vocab dict_;
-  static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids);
+  static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
   static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
   static std::string GetString(const std::vector<WordID>& str);
   static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) {
@@ -25,6 +25,7 @@ struct TD {
   }
   static unsigned int NumWords();
   static WordID Convert(const std::string& s);
+  static WordID Convert(char const* s);
   static const char* Convert(const WordID& w);
 };
 
-- 
cgit v1.2.3