beautiful (?) low-level tokenization to fix valgrind problem with previous TD string->Sentence

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@345 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-20 22:44:40 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-20 22:44:40 +0000
commit: 0720de0bee526e8e9b311bb91d0a3a1efa8c1438 (patch)
tree: 10f4f50fbaff6c8ecec0c68a72a31c3aa0d3d2a9 /decoder
parent: ac98d8465d9b7a6faf2b51bcd18260375842f6c8 (diff)
3 files changed, 82 insertions, 19 deletions
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index 6bb8cff0..a21ffd59 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -4,6 +4,7 @@
 #include <map>
 #include <vector>
 #include <cctype>
+#include <cstring>
 #include <string>
 
 // read line in the form of either:
@@ -88,6 +89,59 @@ inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
   return r;
 }
 
+
+struct mutable_c_str {
+  // Owns a private, writable, 0-terminated copy of a string's characters.
+  // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading)
+  char *p; // s.size() chars copied from s, followed by a terminating 0
+  mutable_c_str(std::string const& s) : p(static_cast<char *>(::operator new(s.size()+1))) {
+    std::memcpy(p,s.data(),s.size());
+    p[s.size()]=0;
+  }
+  ~mutable_c_str() { ::operator delete(p); }
+private:
+  // declared but never defined: a copy or an assignment would make two objects delete the same buffer
+  mutable_c_str(mutable_c_str const&);
+  mutable_c_str& operator=(mutable_c_str const&);
+};
+
+// ' ' '\t' tokens hardcoded
+//NOTE: you should have stripped endline chars out first.
+inline bool IsWordSep(char c) {
+  return c==' '||c=='\t';
+}
+
+template <class F>
+// Calls f(tok) for each maximal run of non-separator chars in [p,end), in order; tok is a 0-terminated pointer into that storage.
+// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens)
+void VisitTokens(char *p,char *const end,F f) {
+  if (p==end) return;
+  char *last; // start of the token being scanned
+  while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
+  last=p; // first non-ws char
+  for(;;) {
+    ++p;
+    // now last is a non-ws char, and p is one past it.
+    for(;;) {// p to end of word
+      if (p==end) { f(last); return; } // last token: already terminated by the 0 at *end
+      if (IsWordSep(*p)) break; // stop ON the separator (breaking on a non-separator clobbered the word's 2nd char)
+      ++p;
+    }
+    *p=0; // overwrite the separator so last is a c string
+    f(last);
+    for(;;) { // again skip extra whitespace
+      ++p;
+      if (p==end) return;
+      if (!IsWordSep(*p)) break;
+    }
+    last=p;
+  }
+}
+
+template <class F>
+void VisitTokens(std::string const& s,F f) {
+  if (!s.empty()) { // nothing to visit in an empty string
+    mutable_c_str scratch(s); // tokenize a private writable copy, leaving s intact
+    VisitTokens(scratch.p,scratch.p+s.size(),f);
+  }
+}
+
 inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
   cmd->clear();
   param->clear();
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 93f7b0eb..d7fc7eb7 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -1,7 +1,9 @@
+#include <sstream>
 #include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
 #include "Vocab.h"
+#include "stringlib.h"
 
 using namespace std;
 
@@ -16,6 +18,10 @@ WordID TD::Convert(const std::string& s) {
   return dict_.addWord((VocabString)s.c_str());
 }
 
+// Returns whatever dict_.addWord gives back for the 0-terminated word s.
+WordID TD::Convert(char const* s) {
+  VocabString word=(VocabString)s;
+  return dict_.addWord(word);
+}
+
 const char* TD::Convert(const WordID& w) {
   return dict_.getWord((VocabIndex)w);
 }
@@ -31,25 +37,27 @@ void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>
 }
 
 std::string TD::GetString(const std::vector<WordID>& str) {
-  string res;
-  for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i)
-    res += (i == str.begin() ? empty : space) + TD::Convert(*i);
-  return res;
+  ostringstream o; // collects TD::Convert(id) for each id in str, separated by single spaces
+  for (vector<WordID>::size_type i=0;i<str.size();++i) { // index type matches size(); an int index was a signed/unsigned mix
+    if (i) o << ' ';
+    o << TD::Convert(str[i]);
+  }
+  return o.str();
 }
 
-void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
-  string s = sent;
-  int last = 0;
-  ids->clear();
-  for (int i=0; i < s.size(); ++i)
-    if (s[i] == 32 || s[i] == '\t') {
-      s[i]=0;
-      if (last != i) {
-        ids->push_back(Convert(&s[last]));
-      }
-      last = i + 1;
-    }
-  if (last != s.size())
-    ids->push_back(Convert(&s[last]));
+namespace {
+// Callable that appends TD::Convert(s) to *ids each time it is invoked with a c string s.
+struct add_wordids {
+  typedef std::vector<WordID> Ws;
+  Ws *ids; // destination vector; this struct never frees it
+  explicit add_wordids(Ws *i) : ids(i) {  }
+  void operator()(char const* s) {
+    WordID const w=TD::Convert(s);
+    ids->push_back(w);
+  }
+};
+
 }
 
+// Clears *ids, then appends TD::Convert(tok) for every token VisitTokens reports for s.
+void TD::ConvertSentence(std::string const& s, std::vector<WordID>* ids) {
+  ids->clear();
+  add_wordids append_id(ids);
+  VisitTokens(s,append_id);
+}
diff --git a/decoder/tdict.h b/decoder/tdict.h
index af1612ba..6b90becb 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -9,7 +9,7 @@ class Vocab;
 
 struct TD {
   static Vocab dict_;
-  static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids);
+  static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
   static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
   static std::string GetString(const std::vector<WordID>& str);
   static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) {
@@ -25,6 +25,7 @@ struct TD {
   }
   static unsigned int NumWords();
   static WordID Convert(const std::string& s);
+  static WordID Convert(char const* s);
   static const char* Convert(const WordID& w);
 };
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-20 22:44:40 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-20 22:44:40 +0000
commit	0720de0bee526e8e9b311bb91d0a3a1efa8c1438 (patch)
tree	10f4f50fbaff6c8ecec0c68a72a31c3aa0d3d2a9 /decoder
parent	ac98d8465d9b7a6faf2b51bcd18260375842f6c8 (diff)