diff options
| -rw-r--r-- | decoder/stringlib.h | 54 | ||||
| -rw-r--r-- | decoder/tdict.cc | 44 | ||||
| -rw-r--r-- | decoder/tdict.h | 3 | 
3 files changed, 82 insertions, 19 deletions
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index 6bb8cff0..a21ffd59 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -4,6 +4,7 @@
 #include <map>
 #include <vector>
 #include <cctype>
+#include <cstring>
 #include <string>
 
 // read line in the form of either:
@@ -88,6 +89,59 @@ inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
   return r;
 }
 
+
+struct mutable_c_str {
+  // Owns a private, writable, 0-terminated copy of s. A plain std::string copy is not enough: it might share storage with the original (copy-on-write), so writing through its c_str() could corrupt the original.
+  char *p;
+  mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) {
+    std::memcpy(p,s.data(),s.size());
+    p[s.size()]=0;
+  }
+  ~mutable_c_str() { ::operator delete(p); }
+private:
+  mutable_c_str(mutable_c_str const&);
+};
+
+// Token separators are hardcoded: ' ' and '\t'.
+//NOTE: you should have stripped endline chars out first.
+inline bool IsWordSep(char c) {
+  return c==' '||c=='\t';
+}
+
+
+template <class F>
+// Calls f(token) once per separator-delimited token in [p,end), each as a 0-terminated char*. [p,end) is modified in place (the separator after each token is overwritten with 0); *end must already be 0 so the last token is terminated.
+void VisitTokens(char *p,char *const end,F f) {
+  if (p==end) return;
+  char *last; // start of the token currently being scanned
+  while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
+  last=p; // first non-ws char
+  for(;;) {
+    ++p;
+    // now last is a non-ws char, and p is one past it.
+    for(;;) {// advance p to the separator that ends the word (or to end)
+      if (p==end) { f(last); return; }
+      if (IsWordSep(*p)) break;
+      ++p;
+    }
+    *p=0;
+    f(last);
+    for(;;) { // again skip extra whitespace
+      ++p;
+      if (p==end) return;
+      if (!IsWordSep(*p)) break;
+    }
+    last=p;
+  }
+}
+
+template <class F>
+void VisitTokens(std::string const& s,F f) {
+  if (s.empty()) return;
+  mutable_c_str mp(s);
+  VisitTokens(mp.p,mp.p+s.size(),f);
+}
+
 inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
   cmd->clear();
   param->clear();
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 93f7b0eb..d7fc7eb7 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -1,7 +1,9 @@
+#include <sstream>
 #include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
 #include "Vocab.h"
+#include "stringlib.h"
 
 using namespace std;
 
@@ -16,6 +18,10 @@ WordID TD::Convert(const std::string& s) {
   return dict_.addWord((VocabString)s.c_str());
 }
 
+WordID TD::Convert(char const* s) {
+  return dict_.addWord((VocabString)s);
+}
+
 const char* TD::Convert(const WordID& w) {
   return dict_.getWord((VocabIndex)w);
 }
@@ -31,25 +37,27 @@ void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>
 }
 
 std::string TD::GetString(const std::vector<WordID>& str) {
-  string res;
-  for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i)
-    res += (i == str.begin() ? empty : space) + TD::Convert(*i);
-  return res;
+  ostringstream o;
+  for (int i=0;i<str.size();++i) {
+    if (i) o << ' ';
+    o << TD::Convert(str[i]);
+  }
+  return o.str();
 }
 
-void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) {
-  string s = sent;
-  int last = 0;
-  ids->clear();
-  for (int i=0; i < s.size(); ++i)
-    if (s[i] == 32 || s[i] == '\t') {
-      s[i]=0;
-      if (last != i) {
-        ids->push_back(Convert(&s[last]));
-      }
-      last = i + 1;
-    }
-  if (last != s.size())
-    ids->push_back(Convert(&s[last]));
+namespace {
+struct add_wordids {
+  typedef std::vector<WordID> Ws;
+  Ws *ids;
+  explicit add_wordids(Ws *i) : ids(i) {  }
+  void operator()(char const* s) {
+    ids->push_back(TD::Convert(s));
+  }
+};
+
 }
 
+void TD::ConvertSentence(std::string const& s, std::vector<WordID>* ids) {
+  ids->clear();
+  VisitTokens(s,add_wordids(ids));
+}
diff --git a/decoder/tdict.h b/decoder/tdict.h
index af1612ba..6b90becb 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -9,7 +9,7 @@ class Vocab;
 
 struct TD {
   static Vocab dict_;
-  static void ConvertSentence(const std::string& sent, std::vector<WordID>* ids);
+  static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
   static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
   static std::string GetString(const std::vector<WordID>& str);
   static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) {
@@ -25,6 +25,7 @@ struct TD {
   }
   static unsigned int NumWords();
   static WordID Convert(const std::string& s);
+  static WordID Convert(char const* s);
   static const char* Convert(const WordID& w);
 };
 
