Merge branch 'master' of git://github.com/redpony/cdec

author: Jonathan Clark <jon.h.clark@gmail.com> 2011-03-10 06:54:59 -0500
committer: Jonathan Clark <jon.h.clark@gmail.com> 2011-03-10 06:54:59 -0500
commit: 187a7282671bdcee26ade95abcca5282e925c362 (patch)
tree: d62eb5b7653b7dbc949d0de3b6b15642fe8912bf /utils
parent: ee0efe4d807aef64117776a476eeaeca98287315 (diff)
parent: 19e0a382269042605c347b48e5ac92c5012f1ccc (diff)
3 files changed, 12 insertions, 90 deletions
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index f76fc14c..1bcb9502 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -201,6 +201,11 @@ public:
     return found==values_.end() || !found->second;
   }
 
+  void remove_zeros() {  // erase every stored entry whose value tests false (!value)
+    typename MapType::iterator it = values_.begin();
+    while (it != values_.end())  // no increment here: erase() invalidates the erased iterator
+      if (!it->second) values_.erase(it++); else ++it;  // step past the entry before erasing it
+  }
 
   T get(int index) const {
     typename MapType::const_iterator found = values_.find(index);
diff --git a/utils/tdict.cc b/utils/tdict.cc
index 1f68feae..23a298f8 100644
--- a/utils/tdict.cc
+++ b/utils/tdict.cc
@@ -5,93 +5,27 @@
 #include <stdlib.h>
 #include <cstring>
 #include <sstream>
-#include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
-#include "Vocab.h"
 #include "stringlib.h"
 #include "threadlocal.h"
 
 using namespace std;
 
-Vocab TD::dict_(0,TD::max_wordid);
-WordID TD::ss=dict_.ssIndex();
-WordID TD::se=dict_.seIndex();
-WordID TD::unk=dict_.unkIndex();
-char const*const TD::ss_str=Vocab_SentStart;
-char const*const TD::se_str=Vocab_SentEnd;
-char const*const TD::unk_str=Vocab_Unknown;
-
-// pre+(i-base)+">" for i in [base,e)
-inline void pad(std::string const& pre,int base,int e) {
-  assert(base<=e);
-  ostringstream o;
-  for (int i=base;i<e;++i) {
-    o.str(pre);
-    o<<(i-base)<<'>';
-    WordID id=TD::Convert(o.str());
-    assert(id==i); // this fails.  why?
-  }
-}
-
-
-namespace {
-struct TD_init {
-  TD_init() {
-    /*
-      // disabled for now since it's breaking trunk
-    assert(TD::Convert(TD::ss_str)==TD::ss);
-    assert(TD::Convert(TD::se_str)==TD::se);
-    assert(TD::Convert(TD::unk_str)==TD::unk);
-    assert(TD::none==Vocab_None);
-    pad("<FILLER",TD::end(),TD::reserved_begin);
-    assert(TD::end()==TD::reserved_begin);
-    int reserved_end=TD::begin();
-    pad("<RESERVED",TD::end(),reserved_end);
-    assert(TD::end()==reserved_end);
-    */
-  }
-};
-
-TD_init td_init;
-}
-
-unsigned int TD::NumWords() {
-  return dict_.numWords();
-}
-WordID TD::end() {
-  return dict_.highIndex();
-}
+Dict TD::dict_;
 
 WordID TD::Convert(const std::string& s) {
-  return dict_.addWord((VocabString)s.c_str());
+  return dict_.Convert(s);
 }
 
 WordID TD::Convert(char const* s) {
-  return dict_.addWord((VocabString)s);
+  return dict_.Convert(string(s));
 }
 
-
-#if TD_ALLOW_UNDEFINED_WORDIDS
-# include "static_utoa.h"
-char undef_prefix[]="UNDEF_";
-static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]);
-THREADLOCAL char undef_buf[]="UNDEF_________________";
-inline char const* undef_token(WordID w)
-{
-  append_utoa(undef_buf+undefpre_n,w);
-  return undef_buf;
-}
-#endif
-
 const char* TD::Convert(WordID w) {
-#if TD_ALLOW_UNDEFINED_WORDIDS
-  if (w>=dict_.highIndex()) return undef_token(w);
-#endif
-  return dict_.getWord((VocabIndex)w);
+  return dict_.Convert(w).c_str();
 }
 
-
 void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
   ids->clear();
   for (vector<string>::const_iterator i = strings.begin(); i != strings.end(); ++i)
diff --git a/utils/tdict.h b/utils/tdict.h
index dd7f0237..393146fa 100644
--- a/utils/tdict.h
+++ b/utils/tdict.h
@@ -6,29 +6,10 @@
 #include "wordid.h"
 #include <assert.h>
 
-class Vocab;
+class Dict;
 
 struct TD {
-  /* // disabled for now
-  static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause.  tokens until this get "<FILLERi>"
-  static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>'
-  static inline WordID reserved(int i) {
-    assert(i>=0 && i<n_reserved);
-    return (WordID)(reserved_begin+i);
-  }
-  static inline WordID begin() {
-    return reserved(n_reserved);
-  }
-  */
-  static const WordID max_wordid=0x7fffffff;
-  static const WordID null=max_wordid-1;
-  static const WordID none=(WordID)-1; // Vocab_None - this will collide with mixed node/variable id / word space, though.  max_wordid will be distinct (still positive)
-  static char const* const ss_str;  //="<s>";
-  static char const* const se_str;  //="</s>";
-  static char const* const unk_str; //="<unk>";
-  static WordID ss,se,unk; // x=Convert(x_str)
   static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far
-  static Vocab dict_;
   static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
   static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
   static std::string GetString(const std::vector<WordID>& str);
@@ -38,6 +19,8 @@ struct TD {
   static WordID Convert(const std::string& s);
   static WordID Convert(char const* s);
   static const char* Convert(WordID w);
+ private:
+  static Dict dict_;
 };
 
 struct ToTD {
author	Jonathan Clark <jon.h.clark@gmail.com>	2011-03-10 06:54:59 -0500
committer	Jonathan Clark <jon.h.clark@gmail.com>	2011-03-10 06:54:59 -0500
commit	187a7282671bdcee26ade95abcca5282e925c362 (patch)
tree	d62eb5b7653b7dbc949d0de3b6b15642fe8912bf /utils
parent	ee0efe4d807aef64117776a476eeaeca98287315 (diff)
parent	19e0a382269042605c347b48e5ac92c5012f1ccc (diff)