fsa: stateless works, debug sample bigram {Longer,Shorter}ThanPrev

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@375 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 04:21:51 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-23 04:21:51 +0000
commit: 04cce54639520ca6a8175194a463d0f5297b01b5 (patch)
tree: ea059be5ccc3baf5344f4aad5b67b377bbcad6a0 /decoder
parent: 8a841e06196b3c1149d7548968da36b09183f19b (diff)
4 files changed, 169 insertions, 24 deletions
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index ecb244d8..78c67fb3 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -8,13 +8,15 @@
 #include "ff_factory.h"
 #include "ff_ruleshape.h"
 #include "ff_bleu.h"
-#include "ff_from_fsa.h"
+#include "ff_sample_fsa.h"
 
 boost::shared_ptr<FFRegistry> global_ff_registry;
 
 void register_feature_functions() {
   global_ff_registry->Register(new FFFactory<LanguageModel>);
   global_ff_registry->Register(new FFFactory<WordPenaltyFromFsa>); // same as WordPenalty, but implemented using ff_fsa
+  global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LongerThanPrev> >);
+  global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<ShorterThanPrev> >);
   //TODO: use for all features the new Register which requires usage(...)
 #ifdef HAVE_RANDLM
   global_ff_registry->Register("RandLM", new FFFactory<LanguageModelRandLM>);
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
index cb7255d8..6f2e27f0 100755
--- a/decoder/ff_from_fsa.h
+++ b/decoder/ff_from_fsa.h
@@ -187,7 +187,6 @@ private:
 
 };
 
-typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
 
 
 #endif
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index 3096f049..3a9478e2 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -30,6 +30,8 @@ protected:
   Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm.
   int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course)
   void set_state_bytes(int sb=0) {
+    if (start.size()!=sb) start.resize(sb); // keep the start-state buffer at exactly sb (bytes, going by the name)
+    if (h_start.size()!=sb) h_start.resize(sb); // same for the heuristic start-state buffer
     state_bytes_=sb;
   }
 
@@ -37,8 +39,21 @@ protected:
   void init_fid(std::string const& name) { // call this, though, if you have a single feature
     fid_=FD::Convert(name);
   }
+  // Helpers for filling a raw state buffer; state must have room for everything copied.
+  // The char overloads count bytes; the typed overloads count whole T objects.
+  static inline void to_state(void *state,char const* begin,char const* end) {
+    std::memcpy(state,begin,end-begin);
+  }
+  static inline void to_state(void *state,char const* begin,int n) {
+    std::memcpy(state,begin,n);
+  }
+  // n is a number of T objects, so n*sizeof(T) bytes are copied (e.g. n=1 copies one whole int).
+  template <class T>
+  static inline void to_state(void *state,T const* begin,int n) { std::memcpy(state,begin,n*sizeof(T)); }
+  // copies the objects in [begin,end) byte for byte.
+  template <class T>
+  static inline void to_state(void *state,T const* begin,T const* end) { std::memcpy(state,begin,(end-begin)*sizeof(T)); }
 public:
-
   // return m: all strings x with the same final m+1 letters must end in this state
   /* markov chain of order m: P(xn|xn-1...x1)=P(xn|xn-1...xn-m) */
   int markov_order() const { return 0; } // override if you use state.  order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though)
@@ -64,12 +79,49 @@ public:
   //TODO: decide if we want to require you to support dest same as src, since that's how we use it most often in ff_from_fsa bottom-up wrapper (in l->r scoring, however, distinct copies will be the rule), and it probably wouldn't be too hard for most people to support.  however, it's good to hide the complexity here, once (see overly clever FsaScan loop that swaps src/dest addresses repeatedly to scan a sequence by effectively swapping)
 
   // NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members
-  void Scan(SentenceMetadata const& smeta,WordID x,void const* state,void *next_state,FeatureVector *features) const {
+  void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const { // base version does nothing: no features added, next_state not written
   }
 
   // don't set state-bytes etc. in ctor because it may depend on parsing param string
-  FsaFeatureFunctionBase() : start(0),h_start(0),state_bytes_(0) {  }
+  FsaFeatureFunctionBase(int statesz=0) : start(statesz),h_start(statesz),state_bytes_(statesz) {  } // statesz: initial state size, handed to start, h_start and state_bytes_ alike
+
+};
+
+// Base for FSA features whose whole state is one POD value of type St.  The ctor passes
+// sizeof(St) to FsaFeatureFunctionBase, which sets the state size and allocates start and h_start.
+template <class St>
+struct FsaTypedBase : public FsaFeatureFunctionBase {
+protected:
+  typedef St State;
+  // view raw state memory as a State; the caller guarantees it really holds one
+  static inline State & state(void *state) {
+    return *static_cast<State*>(state);
+  }
+  static inline State const& state(void const* state) {
+    return *static_cast<State const*>(state);
+  }
+  // set both start states.  no resize needed: the ctor already sized start/h_start to sizeof(State)
+  void set_starts(State const& s,State const& heuristic_s) {
+    state(start.begin())=s;
+    state(h_start.begin())=heuristic_s;
+  }
+  // set only the heuristic start state, leaving start as it is
+  void set_h_start(State const& s) {
+    state(h_start.begin())=s;
+  }
+public:
+  int markov_order() const { return 1; } // default for typed (stateful) features; shadow it if yours differs
+  FsaTypedBase() : FsaFeatureFunctionBase(sizeof(State)) {}
+};
 
+// CRTP mixin: forwards the untyped void* Scan to Impl's typed Scan(smeta,w,State const&,State &,features).
+// usage (if you're lazy): struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev>
+template <class Impl>
+struct FsaTypedScan  {
+  void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
+    Impl const* impl=static_cast<Impl const*>(this); // downcast to the deriving class
+    impl->Scan(smeta,w,impl->state(st),impl->state(next_state),features); // NOTE(review): state() is protected in FsaTypedBase - confirm this compiles once instantiated
+  }
 };
 
 
@@ -169,26 +221,7 @@ void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, W
 
 //TODO: combine 2 FsaFeatures typelist style (can recurse for more)
 
-// example: feature val = -1 * # of target words
-struct WordPenaltyFsa : public FsaFeatureFunctionBase {
-  WordPenaltyFsa(std::string const& param) {
-    init_fid(usage(false,false));
-    return;
-    //below are all defaults:
-    set_state_bytes(0);
-    start.clear();
-    h_start.clear();
-  }
-  static const float val_per_target_word=-1;
-  // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates.  state and next_state may be same memory.
-  void Scan(SentenceMetadata const& smeta,WordID x,void const* state,void *next_state,FeatureVector *features) const {
-    features->add_value(fid_,val_per_target_word);
-  }
-  static std::string usage(bool param,bool verbose) {
-    return FeatureFunction::usage_helper("WordPenaltyFsa","","-1 per target word",param,verbose);
-  }
 
-};
 
 
 #endif
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
new file mode 100755
index 00000000..13a25387
--- /dev/null
+++ b/decoder/ff_sample_fsa.h
@@ -0,0 +1,111 @@
+#ifndef FF_SAMPLE_FSA_H
+#define FF_SAMPLE_FSA_H
+
+#include "ff_from_fsa.h"
+
+// example: feature val = -1 * # of target words.  stateless: it keeps the base defaults
+// (state size 0, markov_order 0), so Scan never touches state or next_state.
+struct WordPenaltyFsa : public FsaFeatureFunctionBase {
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper(
+      "WordPenaltyFsa","","-1 per target word"
+      ,param,verbose);
+  }
+
+  // param is unused.  state size and start/h_start keep their base-ctor defaults, so nothing else to set.
+  WordPenaltyFsa(std::string const& param) {
+    init_fid(usage(false,false)); // feature id is looked up from the string usage(false,false) returns
+  }
+
+  // integral on purpose: an in-class initializer on a static const float is only a compiler extension
+  static const int val_per_target_word=-1;
+
+  // move from state to next_state after seeing word w, while emitting features->add_value(fid,val) possibly with duplicates.  state and next_state may be same memory.
+  void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const {
+    features->add_value(fid_,val_per_target_word);
+  }
+};
+
+typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa; // the wrapped form that cdec_ff.cc registers with the feature registry
+
+
+// sample stateful feature: the state is a single int, the length of the previous word
+struct LongerThanPrev : public FsaFeatureFunctionBase {
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper(
+      "LongerThanPrev",
+      "",
+      "stupid example stateful (bigram) feature: -1 per target word that's longer than the previous word (always fires for first word of sentence)",
+      param,verbose);
+  }
+
+  // typed access to the raw state bytes (the previous word's length)
+  static inline int &wordlen(void *state) {
+    return *static_cast<int*>(state);
+  }
+  static inline int wordlen(void const* state) {
+    return *static_cast<int const*>(state);
+  }
+  // length of the word's string form
+  static inline int wordlen(WordID w) {
+    return std::strlen(TD::Convert(w));
+  }
+  int markov_order() const { return 1; }
+  LongerThanPrev(std::string const& param) {
+    init_fid(usage(false,false));
+    set_state_bytes(sizeof(int)); // also resizes start and h_start, so they can be written below
+    // -1 is below every real length, so the first word of a sentence always fires
+    wordlen(start.begin())=-1;
+    wordlen(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
+  }
+
+  // integral on purpose: an in-class initializer on a static const float is only a compiler extension
+  static const int val_per_target_word=-1;
+  // state and next_state may be the same memory, so the old length is read before the new one is stored
+  void Scan(SentenceMetadata const& smeta,WordID w,void const* state,void *next_state,FeatureVector *features) const {
+    int const prevlen=wordlen(state);
+    int const len=wordlen(w);
+    wordlen(next_state)=len;
+    if (len>prevlen)
+      features->add_value(fid_,val_per_target_word);
+  }
+
+};
+
+// similar example feature; FsaTypedBase<int> supplies the typed state accessors, markov_order 1 and state size = sizeof(State)
+struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev> {
+  typedef int State; // defines # of bytes in state and return type of state(void *)
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper(
+      "ShorterThanPrev",
+      "",
+      "stupid example stateful (bigram) feature: -1 per target word that's shorter than the previous word (always fires for end of sentence)",
+      param,verbose);
+  }
+
+  static inline int wordlen(WordID w) { // length of the word's string form
+    return std::strlen(TD::Convert(w));
+  }
+  ShorterThanPrev(std::string const& param) {
+    init_fid(usage(false,false));
+    end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing
+    set_starts(-1,4); // start -1: the first word can never be shorter.  heuristic 4: anything <4 chars is usually shorter than previous
+  }
+
+  static const int val_per_target_word=-1; // integral on purpose: an in-class initializer on a static const float is only a compiler extension
+  // typed Scan; len is an out-param (evil, anti-google) that receives the new state
+  void Scan(SentenceMetadata const& smeta,WordID w,int prevlen,int &len,FeatureVector *features) const {
+    len=wordlen(w);
+    if (len<prevlen)
+      features->add_value(fid_,val_per_target_word);
+  }
+
+  // same body as FsaTypedScan<ShorterThanPrev>::Scan, spelled out because the typed Scan above hides the inherited overloads
+  void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
+    Scan(smeta,w,state(st),state(next_state),features);
+  }
+
+};
+
+
+#endif
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 04:21:51 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-23 04:21:51 +0000
commit	04cce54639520ca6a8175194a463d0f5297b01b5 (patch)
tree	ea059be5ccc3baf5344f4aad5b67b377bbcad6a0 /decoder
parent	8a841e06196b3c1149d7548968da36b09183f19b (diff)