diff options
Diffstat (limited to 'decoder/tdict.cc')
-rw-r--r-- | decoder/tdict.cc | 154 |
1 files changed, 0 insertions, 154 deletions
diff --git a/decoder/tdict.cc b/decoder/tdict.cc deleted file mode 100644 index 1f68feae..00000000 --- a/decoder/tdict.cc +++ /dev/null @@ -1,154 +0,0 @@ -#define TD_ALLOW_UNDEFINED_WORDIDS 0 - -// if 1, word ids that are >= end() will give a numeric token name (single per-thread shared buffer), which of course won't be Convert-able back to the id, because it's not added to the dict. This is a convenience for logging fake token indices. Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent. - -#include <stdlib.h> -#include <cstring> -#include <sstream> -#include "Ngram.h" -#include "dict.h" -#include "tdict.h" -#include "Vocab.h" -#include "stringlib.h" -#include "threadlocal.h" - -using namespace std; - -Vocab TD::dict_(0,TD::max_wordid); -WordID TD::ss=dict_.ssIndex(); -WordID TD::se=dict_.seIndex(); -WordID TD::unk=dict_.unkIndex(); -char const*const TD::ss_str=Vocab_SentStart; -char const*const TD::se_str=Vocab_SentEnd; -char const*const TD::unk_str=Vocab_Unknown; - -// pre+(i-base)+">" for i in [base,e) -inline void pad(std::string const& pre,int base,int e) { - assert(base<=e); - ostringstream o; - for (int i=base;i<e;++i) { - o.str(pre); - o<<(i-base)<<'>'; - WordID id=TD::Convert(o.str()); - assert(id==i); // this fails. why? - } -} - - -namespace { -struct TD_init { - TD_init() { - /* - // disabled for now since it's breaking trunk - assert(TD::Convert(TD::ss_str)==TD::ss); - assert(TD::Convert(TD::se_str)==TD::se); - assert(TD::Convert(TD::unk_str)==TD::unk); - assert(TD::none==Vocab_None); - pad("<FILLER",TD::end(),TD::reserved_begin); - assert(TD::end()==TD::reserved_begin); - int reserved_end=TD::begin(); - pad("<RESERVED",TD::end(),reserved_end); - assert(TD::end()==reserved_end); - */ - } -}; - -TD_init td_init; -} - -unsigned int TD::NumWords() { - return dict_.numWords(); -} -WordID TD::end() { - return dict_.highIndex(); -} - -WordID TD::Convert(const std::string& s) { - return dict_.addWord((VocabString)s.c_str()); -} - -WordID TD::Convert(char const* s) { - return dict_.addWord((VocabString)s); -} - - -#if TD_ALLOW_UNDEFINED_WORDIDS -# include "static_utoa.h" -char undef_prefix[]="UNDEF_"; -static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]); -THREADLOCAL char undef_buf[]="UNDEF_________________"; -inline char const* undef_token(WordID w) -{ - append_utoa(undef_buf+undefpre_n,w); - return undef_buf; -} -#endif - -const char* TD::Convert(WordID w) { -#if TD_ALLOW_UNDEFINED_WORDIDS - if (w>=dict_.highIndex()) return undef_token(w); -#endif - return dict_.getWord((VocabIndex)w); -} - - -void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) { - ids->clear(); - for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i) - ids->push_back(TD::Convert(*i)); -} - -std::string TD::GetString(const std::vector<WordID>& str) { - ostringstream o; - for (int i=0;i<str.size();++i) { - if (i) o << ' '; - o << TD::Convert(str[i]); - } - return o.str(); -} - -std::string TD::GetString(WordID const* i,WordID const* e) { - ostringstream o; - bool sp=false; - for (;i<e;++i,sp=true) { - if (sp) - o << ' '; - o << TD::Convert(*i); - } - return o.str(); -} - -int TD::AppendString(const WordID& w, int pos, int bufsize, char* buffer) -{ - const char* word = TD::Convert(w); - const char* const end_buf = buffer + bufsize; - char* dest = buffer + pos; - while(dest < end_buf && *word) { - *dest = *word; - ++dest; - ++word; - } - return (dest - buffer); -} - - -namespace { -struct add_wordids { - typedef std::vector<WordID> Ws; - Ws *ids; - explicit add_wordids(Ws *i) : ids(i) { } - add_wordids(const add_wordids& o) : ids(o.ids) { } - void operator()(char const* s) { - ids->push_back(TD::Convert(s)); - } - void operator()(std::string const& s) { - ids->push_back(TD::Convert(s)); - } -}; - -} - -void TD::ConvertSentence(std::string const& s, std::vector<WordID>* ids) { - ids->clear(); - VisitTokens(s,add_wordids(ids)); -} |