diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-20 22:44:40 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-20 22:44:40 +0000 |
commit | 77b1850d21d0a6a77009e5112badaa3698151a1b (patch) | |
tree | 07bc54c7935219761ef4ea14496e658d70d83748 /decoder/tdict.cc | |
parent | 3da5d905ba7ff7cbb139377909b8f5cd103457ed (diff) |
Beautiful (?) low-level tokenization to fix a valgrind problem with the previous TD string->Sentence conversion (TD::ConvertSentence).
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@345 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/tdict.cc')
-rw-r--r-- | decoder/tdict.cc | 44 |
1 files changed, 26 insertions, 18 deletions
diff --git a/decoder/tdict.cc b/decoder/tdict.cc index 93f7b0eb..d7fc7eb7 100644 --- a/decoder/tdict.cc +++ b/decoder/tdict.cc @@ -1,7 +1,9 @@ +#include <sstream> #include "Ngram.h" #include "dict.h" #include "tdict.h" #include "Vocab.h" +#include "stringlib.h" using namespace std; @@ -16,6 +18,10 @@ WordID TD::Convert(const std::string& s) { return dict_.addWord((VocabString)s.c_str()); } +WordID TD::Convert(char const* s) { + return dict_.addWord((VocabString)s); +} + const char* TD::Convert(const WordID& w) { return dict_.getWord((VocabIndex)w); } @@ -31,25 +37,27 @@ void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID> } std::string TD::GetString(const std::vector<WordID>& str) { - string res; - for (vector<WordID>::const_iterator i = str.begin(); i != str.end(); ++i) - res += (i == str.begin() ? empty : space) + TD::Convert(*i); - return res; + ostringstream o; + for (int i=0;i<str.size();++i) { + if (i) o << ' '; + o << TD::Convert(str[i]); + } + return o.str(); } -void TD::ConvertSentence(const std::string& sent, std::vector<WordID>* ids) { - string s = sent; - int last = 0; - ids->clear(); - for (int i=0; i < s.size(); ++i) - if (s[i] == 32 || s[i] == '\t') { - s[i]=0; - if (last != i) { - ids->push_back(Convert(&s[last])); - } - last = i + 1; - } - if (last != s.size()) - ids->push_back(Convert(&s[last])); +namespace { +struct add_wordids { + typedef std::vector<WordID> Ws; + Ws *ids; + explicit add_wordids(Ws *i) : ids(i) { } + void operator()(char const* s) { + ids->push_back(TD::Convert(s)); + } +}; + } +void TD::ConvertSentence(std::string const& s, std::vector<WordID>* ids) { + ids->clear(); + VisitTokens(s,add_wordids(ids)); +} |