diff options
Diffstat (limited to 'decoder/tdict.h')
| -rw-r--r-- | decoder/tdict.h | 29 | 
1 files changed, 18 insertions, 11 deletions
diff --git a/decoder/tdict.h b/decoder/tdict.h
index 6b90becb..26e94edf 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -4,25 +4,32 @@
 #include <string>
 #include <vector>
 #include "wordid.h"
+#include <assert.h>
 
 class Vocab;
 
 struct TD {
+  static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause.  tokens until this get "<FILLERi>"
+  static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>'
+  static inline WordID reserved(int i) { // id of the i-th reserved token; i must lie in [0,n_reserved)
+    assert(i>=0 && i<n_reserved);
+    return (WordID)(reserved_begin+i);
+  }
+  static const WordID max_wordid=0x7fffffff;
+  static const WordID none=(WordID)-1; // Vocab_None
+  static char const* const ss_str;  //="<s>";
+  static char const* const se_str;  //="</s>";
+  static char const* const unk_str; //="<unk>";
+  static WordID ss,se,unk; // x=Convert(x_str)
+  static inline WordID begin() { // first id past the reserved range, i.e. reserved_begin+n_reserved
+    return (WordID)(reserved_begin+n_reserved); // computed directly: reserved(n_reserved) would trip its i<n_reserved assert
+  }
+  static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far
   static Vocab dict_;
   static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
   static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
   static std::string GetString(const std::vector<WordID>& str);
-  static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) {
-    const char* word = TD::Convert(w);
-    const char* const end_buf = buffer + bufsize;
-    char* dest = buffer + pos;
-    while(dest < end_buf && *word) {
-      *dest = *word;
-      ++dest;
-      ++word;
-    }
-    return (dest - buffer);
-  }
+  static int AppendString(const WordID& w, int pos, int bufsize, char* buffer);
   static unsigned int NumWords();
   static WordID Convert(const std::string& s);
   static WordID Convert(char const* s);
