From 19e0a382269042605c347b48e5ac92c5012f1ccc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 10 Mar 2011 01:58:30 -0500 Subject: remove dependency on SRILM --- utils/sparse_vector.h | 5 ++++ utils/tdict.cc | 74 +++------------------------------------------------ utils/tdict.h | 23 +++------------- 3 files changed, 12 insertions(+), 90 deletions(-) (limited to 'utils') diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h index f76fc14c..1bcb9502 100644 --- a/utils/sparse_vector.h +++ b/utils/sparse_vector.h @@ -201,6 +201,11 @@ public: return found==values_.end() || !found->second; } + void remove_zeros() { + typename MapType::iterator it = values_.begin(); + for (; it != values_.end(); ++it) + if (!it->second) values_.erase(it); + } T get(int index) const { typename MapType::const_iterator found = values_.find(index); diff --git a/utils/tdict.cc b/utils/tdict.cc index 1f68feae..23a298f8 100644 --- a/utils/tdict.cc +++ b/utils/tdict.cc @@ -5,93 +5,27 @@ #include #include #include -#include "Ngram.h" #include "dict.h" #include "tdict.h" -#include "Vocab.h" #include "stringlib.h" #include "threadlocal.h" using namespace std; -Vocab TD::dict_(0,TD::max_wordid); -WordID TD::ss=dict_.ssIndex(); -WordID TD::se=dict_.seIndex(); -WordID TD::unk=dict_.unkIndex(); -char const*const TD::ss_str=Vocab_SentStart; -char const*const TD::se_str=Vocab_SentEnd; -char const*const TD::unk_str=Vocab_Unknown; - -// pre+(i-base)+">" for i in [base,e) -inline void pad(std::string const& pre,int base,int e) { - assert(base<=e); - ostringstream o; - for (int i=base;i'; - WordID id=TD::Convert(o.str()); - assert(id==i); // this fails. why? - } -} - - -namespace { -struct TD_init { - TD_init() { - /* - // disabled for now since it's breaking trunk - assert(TD::Convert(TD::ss_str)==TD::ss); - assert(TD::Convert(TD::se_str)==TD::se); - assert(TD::Convert(TD::unk_str)==TD::unk); - assert(TD::none==Vocab_None); - pad("=dict_.highIndex()) return undef_token(w); -#endif - return dict_.getWord((VocabIndex)w); + return dict_.Convert(w).c_str(); } - void TD::GetWordIDs(const std::vector& strings, std::vector* ids) { ids->clear(); for (vector::const_iterator i = strings.begin(); i != strings.end(); ++i) diff --git a/utils/tdict.h b/utils/tdict.h index dd7f0237..393146fa 100644 --- a/utils/tdict.h +++ b/utils/tdict.h @@ -6,29 +6,10 @@ #include "wordid.h" #include -class Vocab; +class Dict; struct TD { - /* // disabled for now - static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "" - static const int n_reserved=10; // 0...n_reserved-1 get token '' - static inline WordID reserved(int i) { - assert(i>=0 && i"; - static char const* const se_str; //=""; - static char const* const unk_str; //=""; - static WordID ss,se,unk; // x=Convert(x_str) static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far - static Vocab dict_; static void ConvertSentence(std::string const& sent, std::vector* ids); static void GetWordIDs(const std::vector& strings, std::vector* ids); static std::string GetString(const std::vector& str); @@ -38,6 +19,8 @@ struct TD { static WordID Convert(const std::string& s); static WordID Convert(char const* s); static const char* Convert(WordID w); + private: + static Dict dict_; }; struct ToTD { -- cgit v1.2.3