diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-21 20:52:35 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-21 20:52:35 +0000 |
commit | c946ad175601eda5a8cb3e6cd0e7c973d3656012 (patch) | |
tree | 2766abaeb876e0cb6a9bad4308a11349a072c084 | |
parent | cb094b00983dabc0393d1fab40b3450266c7c8a9 (diff) |
tdict TD:: ss se unk and reserved(i)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@362 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | decoder/ff_fsa.h | 15 | ||||
-rw-r--r-- | decoder/ff_lm.cc | 4 | ||||
-rw-r--r-- | decoder/tdict.cc | 59 | ||||
-rw-r--r-- | decoder/tdict.h | 29 |
4 files changed, 88 insertions, 19 deletions
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h index 0b60ff81..ed159853 100755 --- a/decoder/ff_fsa.h +++ b/decoder/ff_fsa.h @@ -6,6 +6,7 @@ #include "ff.h" #include "sparse_vector.h" #include "value_array.h" +#include "tdict.h" typedef ValueArray<uint8_t> Bytes; @@ -32,8 +33,18 @@ struct FsaFeatureFunction { // regular bottom up scorer from Fsa feature template <class Impl> -struct FeatureFunctionFromFsa : public FeatureFunction,Impl { - FeatureFunctionFromFsa( +struct FeatureFunctionFromFsa : public FeatureFunction { + Impl& d() { return static_cast<Impl&>(*this); } + Impl const& d() { return static_cast<Impl const&>(*this); } + + FeatureFunctionFromFsa() { } + Init() { + name=d().name; + SetStateSize(sizeof(WordID)*2*MarkovOrder + } // can't do this in constructor because we come before d() in order + + virtual Features Features() const { return d().Features(); } + }; diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index 2f0277c8..15e3f20e 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -187,7 +187,7 @@ class LanguageModelImpl { kSTART = TD::Convert("<s>"); kSTOP = TD::Convert("</s>"); kUNKNOWN = TD::Convert("<unk>"); - kNONE = -1; + kNONE = TD::none; kSTAR = TD::Convert("<{STAR}>"); } @@ -289,7 +289,7 @@ class LanguageModelImpl { //TODO: use stateless_cost instead of ProbNoRemnant, check left words only. for items w/ fewer words than ctx len, how are they represented? kNONE padded? - //TODO: make sure that Vocab_None is set to kNONE in srilm (-1), or that SRILM otherwise interprets -1 as a terminator and not a word + //Vocab_None is (unsigned)-1 in srilm, same as kNONE. in srilm (-1), or that SRILM otherwise interprets -1 as a terminator and not a word double EstimateProb(const void* state) { if (unigram) return 0.; int len = StateSize(state); diff --git a/decoder/tdict.cc b/decoder/tdict.cc index 43bc4cbd..04b82c51 100644 --- a/decoder/tdict.cc +++ b/decoder/tdict.cc @@ -8,11 +8,51 @@ using namespace std; //FIXME: valgrind errors (static init order?) -Vocab TD::dict_; +Vocab TD::dict_(0,TD::max_wordid); +WordID TD::ss=dict_.ssIndex(); +WordID TD::se=dict_.seIndex(); +WordID TD::unk=dict_.unkIndex(); +char const*const TD::ss_str=Vocab_SentStart; +char const*const TD::se_str=Vocab_SentEnd; +char const*const TD::unk_str=Vocab_Unknown; + +// pre+(i-base)+">" for i in [base,e) +inline void pad(std::string const& pre,int base,int e) { + assert(base<=e); + ostringstream o; + for (int i=base;i<e;++i) { + o.str(pre); + o<<(i-base)<<'>'; + WordID id=TD::Convert(o.str()); + assert(id==i); + } +} + + +namespace { +struct TD_init { + TD_init() { + assert(TD::Convert(TD::ss_str)==TD::ss); + assert(TD::Convert(TD::se_str)==TD::se); + assert(TD::Convert(TD::unk_str)==TD::unk); + assert(TD::none==Vocab_None); + pad("<FILLER",TD::end(),TD::reserved_begin); + assert(TD::end()==TD::reserved_begin); + int reserved_end=TD::begin(); + pad("<RESERVED",TD::end(),reserved_end); + assert(TD::end()==reserved_end); + } +}; + +TD_init td_init; +} unsigned int TD::NumWords() { return dict_.numWords(); } +WordID TD::end() { + return dict_.highIndex(); +} WordID TD::Convert(const std::string& s) { return dict_.addWord((VocabString)s.c_str()); @@ -26,9 +66,6 @@ const char* TD::Convert(const WordID& w) { return dict_.getWord((VocabIndex)w); } -static const string empty; -static const string space = " "; - void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) { ids->clear(); @@ -45,6 +82,20 @@ std::string TD::GetString(const std::vector<WordID>& str) { return o.str(); } +int TD::AppendString(const WordID& w, int pos, int bufsize, char* buffer) +{ + const char* word = TD::Convert(w); + const char* const end_buf = buffer + bufsize; + char* dest = buffer + pos; + while(dest < end_buf && *word) { + *dest = *word; + ++dest; + ++word; + } + return (dest - buffer); +} + + namespace { struct add_wordids { typedef std::vector<WordID> Ws; diff --git a/decoder/tdict.h b/decoder/tdict.h index 6b90becb..26e94edf 100644 --- a/decoder/tdict.h +++ b/decoder/tdict.h @@ -4,25 +4,32 @@ #include <string> #include <vector> #include "wordid.h" +#include <assert.h> class Vocab; struct TD { + static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "<FILLERi>" + static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>' + static inline WordID reserved(int i) { + assert(i>=0 && i<n_reserved); + return (WordID)(reserved_begin+i); + } + static const WordID max_wordid=0x7fffffff; + static const WordID none=(WordID)-1; // Vocab_None + static char const* const ss_str; //="<s>"; + static char const* const se_str; //="</s>"; + static char const* const unk_str; //="<unk>"; + static WordID ss,se,unk; // x=Convert(x_str) + static inline WordID begin() { + return reserved(n_reserved); + } + static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far static Vocab dict_; static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids); static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids); static std::string GetString(const std::vector<WordID>& str); - static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) { - const char* word = TD::Convert(w); - const char* const end_buf = buffer + bufsize; - char* dest = buffer + pos; - while(dest < end_buf && *word) { - *dest = *word; - ++dest; - ++word; - } - return (dest - buffer); - } + static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); static unsigned int NumWords(); static WordID Convert(const std::string& s); static WordID Convert(char const* s); |