diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/sparse_vector.h | 5 | ||||
-rw-r--r-- | utils/tdict.cc | 74 | ||||
-rw-r--r-- | utils/tdict.h | 23 |
3 files changed, 12 insertions, 90 deletions
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index f76fc14c..1bcb9502 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -201,6 +201,11 @@ public: return found==values_.end() || !found->second; } + void remove_zeros() { + typename MapType::iterator it = values_.begin(); + for (; it != values_.end(); ++it) + if (!it->second) values_.erase(it); + } T get(int index) const { typename MapType::const_iterator found = values_.find(index); diff --git a/utils/tdict.cc b/utils/tdict.cc index 1f68feae..23a298f8 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -5,93 +5,27 @@ #include <stdlib.h> #include <cstring> #include <sstream> -#include "Ngram.h" #include "dict.h" #include "tdict.h" -#include "Vocab.h" #include "stringlib.h" #include "threadlocal.h" using namespace std; -Vocab TD::dict_(0,TD::max_wordid); -WordID TD::ss=dict_.ssIndex(); -WordID TD::se=dict_.seIndex(); -WordID TD::unk=dict_.unkIndex(); -char const*const TD::ss_str=Vocab_SentStart; -char const*const TD::se_str=Vocab_SentEnd; -char const*const TD::unk_str=Vocab_Unknown; - -// pre+(i-base)+">" for i in [base,e) -inline void pad(std::string const& pre,int base,int e) { - assert(base<=e); - ostringstream o; - for (int i=base;i<e;++i) { - o.str(pre); - o<<(i-base)<<'>'; - WordID id=TD::Convert(o.str()); - assert(id==i); // this fails. why? - } -} - - -namespace { -struct TD_init { - TD_init() { - /* - // disabled for now since it's breaking trunk - assert(TD::Convert(TD::ss_str)==TD::ss); - assert(TD::Convert(TD::se_str)==TD::se); - assert(TD::Convert(TD::unk_str)==TD::unk); - assert(TD::none==Vocab_None); - pad("<FILLER",TD::end(),TD::reserved_begin); - assert(TD::end()==TD::reserved_begin); - int reserved_end=TD::begin(); - pad("<RESERVED",TD::end(),reserved_end); - assert(TD::end()==reserved_end); - */ - } -}; - -TD_init td_init; -} - -unsigned int TD::NumWords() { - return dict_.numWords(); -} -WordID TD::end() { - return dict_.highIndex(); -} +Dict TD::dict_; WordID TD::Convert(const std::string& s) { - return dict_.addWord((VocabString)s.c_str()); + return dict_.Convert(s); } WordID TD::Convert(char const* s) { - return dict_.addWord((VocabString)s); + return dict_.Convert(string(s)); } - -#if TD_ALLOW_UNDEFINED_WORDIDS -# include "static_utoa.h" -char undef_prefix[]="UNDEF_"; -static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]); -THREADLOCAL char undef_buf[]="UNDEF_________________"; -inline char const* undef_token(WordID w) -{ - append_utoa(undef_buf+undefpre_n,w); - return undef_buf; -} -#endif - const char* TD::Convert(WordID w) { -#if TD_ALLOW_UNDEFINED_WORDIDS - if (w>=dict_.highIndex()) return undef_token(w); -#endif - return dict_.getWord((VocabIndex)w); + return dict_.Convert(w).c_str(); } - void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) { ids->clear(); for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i) diff --git a/utils/tdict.h b/utils/tdict.h index dd7f0237..393146fa 100644 --- a/utils/tdict.h +++ b/utils/tdict.h @@ -6,29 +6,10 @@ #include "wordid.h" #include <assert.h> -class Vocab; +class Dict; struct TD { - /* // disabled for now - static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "<FILLERi>" - static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>' - static inline WordID reserved(int i) { - assert(i>=0 && i<n_reserved); - return (WordID)(reserved_begin+i); - } - static inline WordID begin() { - return reserved(n_reserved); - } - */ - static const WordID max_wordid=0x7fffffff; - static const WordID null=max_wordid-1; - static const WordID none=(WordID)-1; // Vocab_None - this will collide with mixed node/variable id / word space, though. max_wordid will be distinct (still positive) - static char const* const ss_str; //="<s>"; - static char const* const se_str; //="</s>"; - static char const* const unk_str; //="<unk>"; - static WordID ss,se,unk; // x=Convert(x_str) static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far - static Vocab dict_; static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids); static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids); static std::string GetString(const std::vector<WordID>& str); @@ -38,6 +19,8 @@ struct TD { static WordID Convert(const std::string& s); static WordID Convert(char const* s); static const char* Convert(WordID w); + private: + static Dict dict_; }; struct ToTD { |