summaryrefslogtreecommitdiff
path: root/utils
diff options
context:
space:
mode:
author Jonathan Clark <jon.h.clark@gmail.com> 2011-03-10 06:54:59 -0500
committer Jonathan Clark <jon.h.clark@gmail.com> 2011-03-10 06:54:59 -0500
commit 7491d7f72bd50cbc08a28f64cc3939a310d65801 (patch)
tree fe18c5bc6ca1e0bf18f0ca473833162ecd3a908b /utils
parent 8c8ff6c6915ebc5ce30156e3f05bf8d1966ec0a1 (diff)
parent 4f9933d668d247ea5831c3f2af0b996a94da28f7 (diff)
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'utils')
-rw-r--r-- utils/sparse_vector.h 5
-rw-r--r-- utils/tdict.cc 74
-rw-r--r-- utils/tdict.h 23
3 files changed, 12 insertions, 90 deletions
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index f76fc14c..1bcb9502 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -201,6 +201,11 @@ public:
return found==values_.end() || !found->second;
}
+ // Erases every entry of values_ whose stored value tests false under operator!
+ // (i.e. the zero entries), leaving all other entries untouched.
+ // The iterator is stepped with post-increment inside the erase call: erasing
+ // invalidates the iterator to the removed element, so incrementing it afterwards
+ // (as a plain for-loop ++it would) is undefined behavior.
+ // Assumes MapType is a map-like container whose erase(iterator) leaves iterators
+ // to other elements valid (true for std::map) - TODO confirm against the typedef.
+ void remove_zeros() {
+ typename MapType::iterator it = values_.begin();
+ while (it != values_.end()) {
+ if (!it->second) values_.erase(it++);
+ else ++it;
+ }
+ }
T get(int index) const {
typename MapType::const_iterator found = values_.find(index);
diff --git a/utils/tdict.cc b/utils/tdict.cc
index 1f68feae..23a298f8 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -5,93 +5,27 @@
#include <stdlib.h>
#include <cstring>
#include <sstream>
-#include "Ngram.h"
#include "dict.h"
#include "tdict.h"
-#include "Vocab.h"
#include "stringlib.h"
#include "threadlocal.h"
using namespace std;
-Vocab TD::dict_(0,TD::max_wordid);
-WordID TD::ss=dict_.ssIndex();
-WordID TD::se=dict_.seIndex();
-WordID TD::unk=dict_.unkIndex();
-char const*const TD::ss_str=Vocab_SentStart;
-char const*const TD::se_str=Vocab_SentEnd;
-char const*const TD::unk_str=Vocab_Unknown;
-
-// pre+(i-base)+">" for i in [base,e)
-inline void pad(std::string const& pre,int base,int e) {
- assert(base<=e);
- ostringstream o;
- for (int i=base;i<e;++i) {
- o.str(pre);
- o<<(i-base)<<'>';
- WordID id=TD::Convert(o.str());
- assert(id==i); // this fails. why?
- }
-}
-
-
-namespace {
-struct TD_init {
- TD_init() {
- /*
- // disabled for now since it's breaking trunk
- assert(TD::Convert(TD::ss_str)==TD::ss);
- assert(TD::Convert(TD::se_str)==TD::se);
- assert(TD::Convert(TD::unk_str)==TD::unk);
- assert(TD::none==Vocab_None);
- pad("<FILLER",TD::end(),TD::reserved_begin);
- assert(TD::end()==TD::reserved_begin);
- int reserved_end=TD::begin();
- pad("<RESERVED",TD::end(),reserved_end);
- assert(TD::end()==reserved_end);
- */
- }
-};
-
-TD_init td_init;
-}
-
-unsigned int TD::NumWords() {
- return dict_.numWords();
-}
-WordID TD::end() {
- return dict_.highIndex();
-}
+Dict TD::dict_;
// Returns the WordID for the string s by delegating to the static dict_.
// NOTE(review): whether an unseen string is added to the dictionary is decided by
// Dict::Convert, which is not visible here - confirm in dict.h.
WordID TD::Convert(const std::string& s) {
- return dict_.addWord((VocabString)s.c_str());
+ return dict_.Convert(s);
}
// C-string overload: wraps s in a temporary std::string and forwards it to dict_.Convert.
// s must be a non-null, NUL-terminated string, since a std::string is constructed from it.
WordID TD::Convert(char const* s) {
- return dict_.addWord((VocabString)s);
+ return dict_.Convert(string(s));
}
-
-#if TD_ALLOW_UNDEFINED_WORDIDS
-# include "static_utoa.h"
-char undef_prefix[]="UNDEF_";
-static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]);
-THREADLOCAL char undef_buf[]="UNDEF_________________";
-inline char const* undef_token(WordID w)
-{
- append_utoa(undef_buf+undefpre_n,w);
- return undef_buf;
-}
-#endif
-
// Maps the WordID w back to its text: calls dict_.Convert(w) and returns c_str() of the result.
// NOTE(review): the returned pointer is only safe if Dict::Convert(WordID) returns a reference
// to a string that outlives this call; if it returns by value the pointer dangles as soon as
// this function returns - confirm against dict.h.
const char* TD::Convert(WordID w) {
-#if TD_ALLOW_UNDEFINED_WORDIDS
- if (w>=dict_.highIndex()) return undef_token(w);
-#endif
- return dict_.getWord((VocabIndex)w);
+ return dict_.Convert(w).c_str();
}
-
void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
ids->clear();
for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i)
diff --git a/utils/tdict.h b/utils/tdict.h
index dd7f0237..393146fa 100644
--- a/utils/tdict.h
+++ b/utils/tdict.h
@@ -6,29 +6,10 @@
#include "wordid.h"
#include <assert.h>
-class Vocab;
+class Dict;
struct TD {
- /* // disabled for now
- static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "<FILLERi>"
- static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>'
- static inline WordID reserved(int i) {
- assert(i>=0 && i<n_reserved);
- return (WordID)(reserved_begin+i);
- }
- static inline WordID begin() {
- return reserved(n_reserved);
- }
- */
- static const WordID max_wordid=0x7fffffff;
- static const WordID null=max_wordid-1;
- static const WordID none=(WordID)-1; // Vocab_None - this will collide with mixed node/variable id / word space, though. max_wordid will be distinct (still positive)
- static char const* const ss_str; //="<s>";
- static char const* const se_str; //="</s>";
- static char const* const unk_str; //="<unk>";
- static WordID ss,se,unk; // x=Convert(x_str)
static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far
- static Vocab dict_;
static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
static std::string GetString(const std::vector<WordID>& str);
@@ -38,6 +19,8 @@ struct TD {
static WordID Convert(const std::string& s);
static WordID Convert(char const* s);
static const char* Convert(WordID w);
+ private:
+ static Dict dict_;
};
struct ToTD {