tdict TD:: ss se unk and reserved(i)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@362 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-21 20:52:35 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-21 20:52:35 +0000
commit: c946ad175601eda5a8cb3e6cd0e7c973d3656012 (patch)
tree: 2766abaeb876e0cb6a9bad4308a11349a072c084 /decoder
parent: cb094b00983dabc0393d1fab40b3450266c7c8a9 (diff)
4 files changed, 88 insertions, 19 deletions
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index 0b60ff81..ed159853 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -6,6 +6,7 @@
 #include "ff.h"
 #include "sparse_vector.h"
 #include "value_array.h"
+#include "tdict.h"
 
 typedef ValueArray<uint8_t> Bytes;
 
@@ -32,8 +33,18 @@ struct FsaFeatureFunction {
 
 // regular bottom up scorer from Fsa feature
 template <class Impl>
-struct FeatureFunctionFromFsa : public FeatureFunction,Impl {
-  FeatureFunctionFromFsa(
+struct FeatureFunctionFromFsa : public FeatureFunction {
+  Impl& d() { return static_cast<Impl&>(*this); } // view *this as the Impl that derives from us
+  Impl const& d() const { return static_cast<Impl const&>(*this); } // const overload; without `const` it would clash with the overload above
+
+  FeatureFunctionFromFsa() {  }
+  void Init() { // copies name from d() and sets the state size; call after construction
+    name=d().name;
+    SetStateSize(sizeof(WordID)*2*MarkovOrder); // NOTE(review): MarkovOrder is not declared in the visible code - confirm its source
+  } // can't do this in constructor because we come before d() in order
+
+  virtual Features Features() const { return d().Features(); }
+
 };
 
 
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 2f0277c8..15e3f20e 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -187,7 +187,7 @@ class LanguageModelImpl {
     kSTART = TD::Convert("<s>");
     kSTOP = TD::Convert("</s>");
     kUNKNOWN = TD::Convert("<unk>");
-    kNONE = -1;
+    kNONE = TD::none;
     kSTAR = TD::Convert("<{STAR}>");
   }
 
@@ -289,7 +289,7 @@ class LanguageModelImpl {
 
   //TODO: use stateless_cost instead of ProbNoRemnant, check left words only.  for items w/ fewer words than ctx len, how are they represented?  kNONE padded?
 
-  //TODO: make sure that Vocab_None is set to kNONE in srilm (-1), or that SRILM otherwise interprets -1 as a terminator and not a word
+  //Vocab_None is (unsigned)-1 in srilm, same as kNONE, so SRILM should interpret kNONE as a terminator and not a word
   double EstimateProb(const void* state) {
     if (unigram) return 0.;
     int len = StateSize(state);
diff --git a/decoder/tdict.cc b/decoder/tdict.cc
index 43bc4cbd..04b82c51 100644
--- a/decoder/tdict.cc
+++ b/decoder/tdict.cc
@@ -8,11 +8,51 @@
 using namespace std;
 
 //FIXME: valgrind errors (static init order?)
-Vocab TD::dict_;
+Vocab TD::dict_(0,TD::max_wordid); // defined first: the initializers below read from dict_ and run in definition order within this file
+WordID TD::ss=dict_.ssIndex(); // sentence-start id; TD_init asserts it equals Convert(ss_str)
+WordID TD::se=dict_.seIndex(); // sentence-end id; TD_init asserts it equals Convert(se_str)
+WordID TD::unk=dict_.unkIndex(); // unknown-word id; TD_init asserts it equals Convert(unk_str)
+char const*const TD::ss_str=Vocab_SentStart; // tdict.h notes this as "<s>"
+char const*const TD::se_str=Vocab_SentEnd; // tdict.h notes this as "</s>"
+char const*const TD::unk_str=Vocab_Unknown; // tdict.h notes this as "<unk>"
+
+// adds tokens pre+(i-base)+">" for i in [base,e); asserts that each new token is assigned id i
+inline void pad(std::string const& pre,int base,int e) {
+  assert(base<=e);
+  for (int i=base;i<e;++i) {
+    // fresh stream per token: after o.str(pre) the write position is the start of the buffer, so << would overwrite pre instead of appending to it
+    ostringstream o;
+    o<<pre<<(i-base)<<'>';
+    WordID id=TD::Convert(o.str());
+    assert(id==i);
+  }
+}
+
+
+namespace {
+struct TD_init { // its constructor runs when the file-local td_init object below is initialized
+  TD_init() {
+    assert(TD::Convert(TD::ss_str)==TD::ss); // special tokens must map to the ids read from dict_; these Convert calls vanish under NDEBUG
+    assert(TD::Convert(TD::se_str)==TD::se);
+    assert(TD::Convert(TD::unk_str)==TD::unk);
+    assert(TD::none==Vocab_None);
+    pad("<FILLER",TD::end(),TD::reserved_begin); // add filler tokens for ids in [end(),reserved_begin)
+    assert(TD::end()==TD::reserved_begin);
+    int reserved_end=TD::begin();
+    pad("<RESERVED",TD::end(),reserved_end); // add reserved tokens for ids in [end(),begin())
+    assert(TD::end()==reserved_end);
+  }
+};
+
+TD_init td_init; // sole instance; exists only to run the constructor above
+}
 
 unsigned int TD::NumWords() {
   return dict_.numWords();
 }
+WordID TD::end() {
+  return dict_.highIndex(); // NOTE(review): tdict.h documents end() as the next id to be assigned - confirm highIndex() is one past the last used id and not the last used id itself
+}
 
 WordID TD::Convert(const std::string& s) {
   return dict_.addWord((VocabString)s.c_str());
@@ -26,9 +66,6 @@ const char* TD::Convert(const WordID& w) {
   return dict_.getWord((VocabIndex)w);
 }
 
-static const string empty;
-static const string space = " ";
-
 
 void TD::GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids) {
   ids->clear();
@@ -45,6 +82,20 @@ std::string TD::GetString(const std::vector<WordID>& str) {
   return o.str();
 }
 
+// Copies the text of word w into buffer, starting at offset pos.
+// Never writes at or past buffer+bufsize and does not append a terminating NUL.
+// Returns the offset just past the last character written (pos if nothing was written).
+int TD::AppendString(const WordID& w, int pos, int bufsize, char* buffer)
+{
+  char* out = buffer + pos;
+  char* const limit = buffer + bufsize;
+  for (const char* src = TD::Convert(w); out < limit && *src; ++src, ++out) {
+    *out = *src;
+  }
+  return (out - buffer);
+}
+
+
 namespace {
 struct add_wordids {
   typedef std::vector<WordID> Ws;
diff --git a/decoder/tdict.h b/decoder/tdict.h
index 6b90becb..26e94edf 100644
--- a/decoder/tdict.h
+++ b/decoder/tdict.h
@@ -4,25 +4,32 @@
 #include <string>
 #include <vector>
 #include "wordid.h"
+#include <assert.h>
 
 class Vocab;
 
 struct TD {
+  static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause.  tokens until this get "<FILLERi>"
+  static const int n_reserved=10; // 0...n_reserved-1 get token '<RESERVEDi>'
+  static inline WordID reserved(int i) { // id of the i-th reserved token, counted from reserved_begin
+    assert(i>=0 && i<n_reserved); // only indices 0..n_reserved-1 are valid
+    return (WordID)(reserved_begin+i);
+  }
+  static const WordID max_wordid=0x7fffffff;
+  static const WordID none=(WordID)-1; // Vocab_None
+  static char const* const ss_str;  //="<s>";
+  static char const* const se_str;  //="</s>";
+  static char const* const unk_str; //="<unk>";
+  static WordID ss,se,unk; // x=Convert(x_str)
+  static inline WordID begin() { // first id after the reserved range, i.e. reserved_begin+n_reserved
+    return (WordID)(reserved_begin+n_reserved); // not reserved(n_reserved): that always trips reserved()'s i<n_reserved assert
+  }
+  static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far
   static Vocab dict_;
   static void ConvertSentence(std::string const& sent, std::vector<WordID>* ids);
   static void GetWordIDs(const std::vector<std::string>& strings, std::vector<WordID>* ids);
   static std::string GetString(const std::vector<WordID>& str);
-  static int AppendString(const WordID& w, int pos, int bufsize, char* buffer) {
-    const char* word = TD::Convert(w);
-    const char* const end_buf = buffer + bufsize;
-    char* dest = buffer + pos;
-    while(dest < end_buf && *word) {
-      *dest = *word;
-      ++dest;
-      ++word;
-    }
-    return (dest - buffer);
-  }
+  static int AppendString(const WordID& w, int pos, int bufsize, char* buffer);
   static unsigned int NumWords();
   static WordID Convert(const std::string& s);
   static WordID Convert(char const* s);
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-21 20:52:35 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-21 20:52:35 +0000
commit	c946ad175601eda5a8cb3e6cd0e7c973d3656012 (patch)
tree	2766abaeb876e0cb6a9bad4308a11349a072c084 /decoder
parent	cb094b00983dabc0393d1fab40b3450266c7c8a9 (diff)