path: root/decoder/ff_fsa.h
author    graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>    2010-07-26 04:53:15 +0000
committer graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>    2010-07-26 04:53:15 +0000
commit    5e786c1b9e465e88763b44db10f3b42f41c2a990 (patch)
tree      8a36eb9dde79c03cb9d0b4aeba7fc9b566dff366 /decoder/ff_fsa.h
parent    5dc4b5c2d796b339dd6b9f7616c5765ee28728f0 (diff)
LanguageModelFsa works. TODO: sri context shortening?
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@414 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/ff_fsa.h')
-rwxr-xr-x  decoder/ff_fsa.h  48
1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index e21cbf6f..4575b648 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -4,14 +4,15 @@
/*
features whose score is just some PFSA over target string. however, PFSA can use edge and smeta info (e.g. spans on edge) - not usually useful.
+//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
+
state is some fixed width byte array. could actually be a void *, WordID sequence, whatever.
TODO: fsa feature aggregator that presents itself as a single fsa; benefit: when wrapped in ff_from_fsa, only one set of left words is stored. downside: compared to separate ff, the inside portion of lower-order models is incorporated later. however, the full heuristic is already available and exact for those words. so don't sweat it.
- TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters. this is on top of the nice heuristic for the unscored words, of course. in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict. probably not worht the time.
+ TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters. this is on top of the nice heuristic for the unscored words, of course. in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict. probably not worth the effort.
*/
-//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
//TODO: decide whether to use init_features / add_value vs. summing elsewhere + set_value once (or, inefficient for from_fsa: sum distinct feature_vectors. but L->R, if we only scan 1 word at a time, that's fine)
@@ -48,11 +49,28 @@
typedef ValueArray<uint8_t> Bytes;
-// it's not necessary to inherit from this, but you probably should to save yourself some boilerplate. defaults to no-state
+/*
+usage:
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+SameFirstLetter(string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+ int markov_order() const { return 1; }
+ Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+ char cw=TD::Convert(w)[0];
+ char co=*(char const*)old_state;
+ *(char *)new_state = cw;
+ return cw==co?1:0;
+ }
+ void print_state(std::ostream &o,void const* st) const {
+ o<<*(char const*)st;
+ }
+ static std::string usage(bool param,bool verbose) {
+ return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+ }
+};
+
+// then, to decode, see ff_from_fsa.h
+ */
-// usage:
-// struct FsaFeat : public FsaTypedBase<int,FsaFeat>
-// i.e. Impl is a CRTP
template <class Impl>
struct FsaFeatureFunctionBase {
Impl const& d() const { return static_cast<Impl const&>(*this); }
@@ -66,6 +84,10 @@ protected:
if (h_start.size()!=sb) h_start.resize(sb);
state_bytes_=sb;
}
+ void set_end_phrase(WordID single) {
+ end_phrase_=singleton_sentence(single);
+ }
+
int fid_; // you can have more than 1 feature of course.
  void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on params)
fid_=FD::Convert(d().name());
@@ -85,6 +107,7 @@ protected:
inline void static to_state(void *state,T const* begin,T const* end) {
to_state(state,(char const*)begin,(char const*)end);
}
+
inline static char hexdigit(int i) {
int j=i-10;
return j>=0?'a'+j:'0'+i;
@@ -95,6 +118,10 @@ protected:
}
public:
+ void state_cpy(void *to,void const*from) const {
+ std::memcpy(to,from,state_bytes_);
+ }
+
// can override to different return type, e.g. just return feats:
Featval describe_features(FeatureVector const& feats) const {
return feats.get(fid_);
@@ -155,7 +182,14 @@ public:
// NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members
inline void Scan(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,void const* state,void *next_state,FeatureVector *features) const {
- features->maybe_add(fid_,d().Scan1(w,state,next_state));
+ maybe_add_feat(features,d().Scan1(w,state,next_state));
+ }
+
+ inline void maybe_add_feat(FeatureVector *features,Featval v) const {
+ features->maybe_add(fid_,v);
+ }
+ inline void add_feat(FeatureVector *features,Featval v) const {
+ features->add_value(fid_,v);
}
// don't set state-bytes etc. in ctor because it may depend on parsing param string
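
For readers outside the cdec tree, here is a minimal, self-contained sketch of the CRTP Scan1 pattern that the new usage comment (the SameFirstLetter example above) describes: a fixed-width byte state is threaded through per-word scans, and the derived class supplies Scan1. Everything below (ToyFsaBase, Lookup, ScanAll) is a stand-in invented for illustration; it is not cdec's real FsaFeatureFunctionBase, TD, or FD API.

```cpp
// Self-contained toy analogue of the Scan1/state pattern in ff_fsa.h.
// Types here are stand-ins, NOT cdec's real dictionary/feature machinery.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

typedef double Featval;
typedef int WordID;

// Toy word dictionary standing in for cdec's TD::Convert.
static std::vector<std::string> dict;
static WordID Lookup(std::string const& s) {
  for (size_t i = 0; i < dict.size(); ++i)
    if (dict[i] == s) return (WordID)i;
  dict.push_back(s);
  return (WordID)(dict.size() - 1);
}

template <class Impl>
struct ToyFsaBase {
  Impl const& d() const { return static_cast<Impl const&>(*this); }
  // Scan a sentence left to right, threading byte state through Scan1;
  // after each word the freshly written state becomes the old state.
  Featval ScanAll(std::vector<WordID> const& sent, void* st0, void* st1) const {
    Featval sum = 0;
    for (size_t i = 0; i < sent.size(); ++i) {
      sum += d().Scan1(sent[i], st0, st1);
      std::swap(st0, st1);
    }
    return sum;
  }
};

// Analogue of the commit's SameFirstLetter: 1 byte of state holding the
// first letter of the previous word; fire 1 when two consecutive words
// start with the same letter.
struct SameFirstLetter : public ToyFsaBase<SameFirstLetter> {
  Featval Scan1(WordID w, void const* old_state, void* new_state) const {
    char cw = dict[w][0];
    char co = *(char const*)old_state;
    *(char*)new_state = cw;
    return cw == co ? 1 : 0;
  }
};

int main() {
  std::vector<WordID> sent;
  sent.push_back(Lookup("a"));
  sent.push_back(Lookup("apple"));
  sent.push_back(Lookup("and"));
  sent.push_back(Lookup("bee"));
  char s0 = 'a', s1 = 0; // start state 'a', as in the commit's example
  SameFirstLetter f;
  std::cout << f.ScanAll(sent, &s0, &s1) << "\n"; // prints 3
}
```

In the real header, the accumulated value would go through maybe_add_feat (which skips storing a zero) or add_feat (which always accumulates into the feature vector); the toy just sums directly.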