fake tdict names for non-ids, push viterbi cost to root in hg, store as feature. type erased fsa feature via virtual interface. made lexical_cast assume C locale for speed.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@465 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-02 07:57:23 +0000
committer: graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-02 07:57:23 +0000
commit: f9859ad4116733e145d7b8eb31c3cc9318ff7564 (patch)
tree: 92f6942fc7fd7066eb400bce6d2cbd2fee46c801 /decoder/tdict.cc
parent: 6da285dfa7b0a1929dcec882d7e48a585e878d18 (diff)
1 files changed, 24 insertions, 2 deletions
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 7b56d259..f0588cfc 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -1,13 +1,19 @@
+#define TD_ALLOW_UNDEFINED_WORDIDS 1
+
+// If 1, word ids that are >= end() are given a numeric token name (written into a single per-thread shared buffer). Such a name cannot be Convert-ed back to the id, because it is never added to the dict. This is a convenience for logging fake token indices. Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent.
+
+#include <stdlib.h>
+#include <cstring>
 #include <sstream>
 #include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
 #include "Vocab.h"
 #include "stringlib.h"
+#include "threadlocal.h"
 
 using namespace std;
 
-//FIXME: valgrind errors (static init order?)
 Vocab TD::dict_(0,TD::max_wordid);
 WordID TD::ss=dict_.ssIndex();
 WordID TD::se=dict_.seIndex();
@@ -65,7 +71,23 @@ WordID TD::Convert(char const* s) {
   return dict_.addWord((VocabString)s);
 }
 
-const char* TD::Convert(const WordID& w) {
+
+#if TD_ALLOW_UNDEFINED_WORDIDS
+# include "static_utoa.h"
+char undef_prefix[]="UNDEF_"; // prefix text; undef_buf's initial contents begin with the same characters
+static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]); // element count of undef_prefix; sizeof counts the terminating NUL, so this is 7, not 6. NOTE(review): the number is therefore written one char past "UNDEF_", leaving an extra '_' before it - confirm this is intended
+THREADLOCAL char undef_buf[]="UNDEF_________________"; // scratch buffer reused for every fake token name; THREADLOCAL is presumably the per-thread storage macro from threadlocal.h - confirm
+inline char const* undef_token(WordID w) // builds a placeholder token name for a word id that has no dict entry
+{
+  append_utoa(undef_buf+undefpre_n,w); // presumably writes w as text (helper from static_utoa.h) starting at offset undefpre_n - TODO confirm it NUL-terminates and that the longest value fits in undef_buf
+  return undef_buf; // pointer into the shared buffer: a later call reusing the buffer overwrites this result, so callers must copy it if they need to keep it
+}
+#endif
+
+const char* TD::Convert(WordID w) { // maps word id w to its token string; w is now taken by value rather than by const reference
+#if TD_ALLOW_UNDEFINED_WORDIDS
+  if (w>=dict_.highIndex()) return undef_token(w); // ids at or beyond dict_.highIndex() are not looked up; they get the placeholder name built by undef_token instead
+#endif
   return dict_.getWord((VocabIndex)w);
 }
author	graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-02 07:57:23 +0000
committer	graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-02 07:57:23 +0000
commit	f9859ad4116733e145d7b8eb31c3cc9318ff7564 (patch)
tree	92f6942fc7fd7066eb400bce6d2cbd2fee46c801 /decoder/tdict.cc
parent	6da285dfa7b0a1929dcec882d7e48a585e878d18 (diff)