author | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-02 07:57:23 +0000 |
---|---|---|
committer | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-02 07:57:23 +0000 |
commit | f9859ad4116733e145d7b8eb31c3cc9318ff7564 (patch) | |
tree | 92f6942fc7fd7066eb400bce6d2cbd2fee46c801 /decoder/tdict.cc | |
parent | 6da285dfa7b0a1929dcec882d7e48a585e878d18 (diff) | |
fake tdict names for non-ids; push viterbi cost to root in hg and store it as a feature; type-erased fsa feature via a virtual interface; made lexical_cast assume the C locale for speed.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@465 ec762483-ff6d-05da-a07a-a48fb63a330f
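Of the changes listed, only the fake tdict names touch this file; the lexical_cast change is not visible in this diff. Assuming it refers to Boost's documented BOOST_LEXICAL_CAST_ASSUME_C_LOCALE switch (an assumption, since the relevant change lives elsewhere in the tree), a minimal sketch of how that macro is used:

```cpp
// Hedged sketch: defining BOOST_LEXICAL_CAST_ASSUME_C_LOCALE before including
// the header tells Boost.LexicalCast to skip locale-aware stream formatting,
// avoiding a std::locale lookup on every conversion. Whether cdec sets this
// per-file or in the build system is not shown in this diff.
#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
#include <boost/lexical_cast.hpp>
#include <iostream>
#include <string>

int main() {
  double x = boost::lexical_cast<double>("3.5");
  std::cout << boost::lexical_cast<std::string>(x) << "\n";  // prints 3.5
  return 0;
}
```

The speedup only matters because number-to-string conversion can sit on the per-token hot path in a decoder.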
Diffstat (limited to 'decoder/tdict.cc')
-rw-r--r-- | decoder/tdict.cc | 26 |
1 file changed, 24 insertions(+), 2 deletions(-)
```diff
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 7b56d259..f0588cfc 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -1,13 +1,19 @@
+#define TD_ALLOW_UNDEFINED_WORDIDS 1
+
+// if 1, word ids that are >= end() will give a numeric token name (single per-thread shared buffer), which of course won't be Convert-able back to the id, because it's not added to the dict. This is a convenience for logging fake token indices. Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent.
+
+#include <stdlib.h>
+#include <cstring>
 #include <sstream>
 #include "Ngram.h"
 #include "dict.h"
 #include "tdict.h"
 #include "Vocab.h"
 #include "stringlib.h"
+#include "threadlocal.h"

 using namespace std;

-//FIXME: valgrind errors (static init order?)
 Vocab TD::dict_(0,TD::max_wordid);
 WordID TD::ss=dict_.ssIndex();
 WordID TD::se=dict_.seIndex();
@@ -65,7 +71,23 @@ WordID TD::Convert(char const* s) {
   return dict_.addWord((VocabString)s);
 }

-const char* TD::Convert(const WordID& w) {
+
+#if TD_ALLOW_UNDEFINED_WORDIDS
+# include "static_utoa.h"
+char undef_prefix[]="UNDEF_";
+static const int undefpre_n=sizeof(undef_prefix)/sizeof(undef_prefix[0]);
+THREADLOCAL char undef_buf[]="UNDEF_________________";
+inline char const* undef_token(WordID w)
+{
+  append_utoa(undef_buf+undefpre_n,w);
+  return undef_buf;
+}
+#endif
+
+const char* TD::Convert(WordID w) {
+#if TD_ALLOW_UNDEFINED_WORDIDS
+  if (w>=dict_.highIndex()) return undef_token(w);
+#endif
   return dict_.getWord((VocabIndex)w);
 }
```
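The added comment warns that the fake token name lives in a single per-thread shared buffer, so each call to TD::Convert on an out-of-range id overwrites the previous result. A minimal self-contained sketch of that pattern, using standard C++11 thread_local and snprintf in place of cdec's THREADLOCAL macro and the append_utoa helper from static_utoa.h (names and buffer size below are illustrative, not cdec's API):

```cpp
#include <cstdio>
#include <iostream>

typedef unsigned WordID;  // stand-in for cdec's WordID

// Illustrative analogue of undef_token(): format "UNDEF_<id>" into a
// per-thread buffer and hand back a pointer into that buffer.
inline const char* undef_token_sketch(WordID w) {
  static thread_local char buf[32];               // one buffer per thread
  std::snprintf(buf, sizeof(buf), "UNDEF_%u", w);
  return buf;
}

int main() {
  const char* a = undef_token_sketch(123456);
  std::cout << a << "\n";                         // UNDEF_123456
  const char* b = undef_token_sketch(7);
  std::cout << a << " " << b << "\n";             // both print UNDEF_7: shared buffer
  return 0;
}
```

Because the buffer is reused, anything that needs the first name after a second call has to copy it first; that is the trade-off the committed comment asks callers to keep in mind, along with not letting real dictionary entries grow into the fake-id range.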