From f4b4aade473f9463dda6fac4baf9c0502d004deb Mon Sep 17 00:00:00 2001
From: graehl
Date: Mon, 26 Jul 2010 04:53:15 +0000
Subject: LanguageModelFsa works. TODO: sri context shortening?

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@414 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 decoder/cdec_ff.cc      |   3 +-
 decoder/ff_fsa.h        |  48 ++++++++++++++++++----
 decoder/ff_lm.cc        | 103 ++++++++++++++++++++++++++++++++++++++++--------
 decoder/ff_lm_fsa.h     |  15 ++++++-
 decoder/ff_sample_fsa.h |  17 ++++++++
 5 files changed, 160 insertions(+), 26 deletions(-)

(limited to 'decoder')

diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 78c67fb3..037cd92e 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -9,11 +9,12 @@
 #include "ff_ruleshape.h"
 #include "ff_bleu.h"
 #include "ff_sample_fsa.h"
-
+#include "ff_lm_fsa.h"
 
 boost::shared_ptr<FFRegistry> global_ff_registry;
 
 void register_feature_functions() {
   global_ff_registry->Register(new FFFactory<LanguageModel>);
+  global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LanguageModelFsa> >); // same as LM but using fsa wrapper
   global_ff_registry->Register(new FFFactory<WordPenaltyFromFsa>); // same as WordPenalty, but implemented using ff_fsa
   global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<SameFirstLetter> >);
   global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LongerThanPrev> >);
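The registration above is the whole integration surface: each feature class is wrapped in an FFFactory and filed in the global registry, keyed (as far as one can tell from usage_helper) by the name its usage() reports, which is how the FSA-wrapped LanguageModelFsa becomes selectable from a decoder config just like the classic LanguageModel. A minimal standalone sketch of that factory/registry pattern follows; Feature, FactoryBase, ToyFactory, ToyLM, and the "3gram.lm.gz" param are invented stand-ins, not cdec's real classes.

    // Toy analogue of cdec's FFFactory/registry pattern (illustrative only).
    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    struct Feature {                     // stand-in for FeatureFunction
      virtual ~Feature() {}
      virtual std::string name() const = 0;
    };

    struct FactoryBase {                 // stand-in for FFFactoryBase
      virtual ~FactoryBase() {}
      virtual std::unique_ptr<Feature> create(std::string const& param) const = 0;
    };

    template <class FF>
    struct ToyFactory : FactoryBase {    // stand-in for FFFactory<FF>
      std::unique_ptr<Feature> create(std::string const& param) const {
        return std::unique_ptr<Feature>(new FF(param));
      }
    };

    struct ToyLM : Feature {
      explicit ToyLM(std::string const&) {}
      std::string name() const { return "LanguageModel"; }
    };

    int main() {
      std::map<std::string, std::unique_ptr<FactoryBase> > registry;
      registry["LanguageModel"].reset(new ToyFactory<ToyLM>); // cf. Register(new FFFactory<LanguageModel>)
      std::unique_ptr<Feature> ff = registry["LanguageModel"]->create("3gram.lm.gz");
      std::cout << ff->name() << " created\n";
    }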
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index e21cbf6f..4575b648 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -4,14 +4,15 @@
 /* features whose score is just some PFSA over target string.  however, PFSA can use edge and smeta info (e.g. spans on edge) - not usually useful.
 
+//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
+
    state is some fixed width byte array.  could actually be a void *, WordID sequence, whatever.
 
    TODO: fsa feature aggregator that presents itself as a single fsa; benefit: when wrapped in ff_from_fsa, only one set of left words is stored.  downside: compared to separate ff, the inside portion of lower-order models is incorporated later.  however, the full heuristic is already available and exact for those words.  so don't sweat it.
 
-   TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters.  this is on top of the nice heuristic for the unscored words, of course.  in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict.  probably not worht the time.
+   TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters.  this is on top of the nice heuristic for the unscored words, of course.  in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict.  probably not worth the effort.
 */
 
-//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
 //TODO: decide whether to use init_features / add_value vs. summing elsewhere + set_value once (or inefficient for from_fsa: sum distinct feature_vectors.  but L->R if we only scan 1 word at a time, that's fine
 
@@ -48,11 +49,28 @@
 typedef ValueArray<char> Bytes;
 
-// it's not necessary to inherit from this, but you probably should to save yourself some boilerplate.  defaults to no-state
+/*
+usage:
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+  SameFirstLetter(string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+  int markov_order() const { return 1; }
+  Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+    char cw=TD::Convert(w)[0];
+    char co=*(char const*)old_state;
+    *(char *)new_state = cw;
+    return cw==co?1:0;
+  }
+  void print_state(std::ostream &o,void const* st) const {
+    o<<*(char const*)st;
+  }
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+  }
+};
+
+// then, to decode, see ff_from_fsa.h
+*/
-// usage:
-// struct FsaFeat : public FsaTypedBase
-// i.e. Impl is a CRTP
 template <class Impl>
 struct FsaFeatureFunctionBase {
   Impl const& d() const { return static_cast<Impl const&>(*this); }
@@ -66,6 +84,10 @@ protected:
     if (h_start.size()!=sb) h_start.resize(sb);
     state_bytes_=sb;
   }
+  void set_end_phrase(WordID single) {
+    end_phrase_=singleton_sentence(single);
+  }
+
   int fid_; // you can have more than 1 feature of course.
   void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param
     fid_=FD::Convert(d().name());
@@ -85,6 +107,7 @@ protected:
   inline void static to_state(void *state,T const* begin,T const* end) {
     to_state(state,(char const*)begin,(char const*)end);
   }
+
   inline static char hexdigit(int i) {
     int j=i-10;
     return j>=0?'a'+j:'0'+i;
@@ -95,6 +118,10 @@ protected:
   }
 public:
+  void state_cpy(void *to,void const*from) const {
+    std::memcpy(to,from,state_bytes_);
+  }
+
   // can override to different return type, e.g. just return feats:
   Featval describe_features(FeatureVector const& feats) const {
     return feats.get(fid_);
@@ -155,7 +182,14 @@ public:
 
   // NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members
   inline void Scan(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,void const* state,void *next_state,FeatureVector *features) const {
-    features->maybe_add(fid_,d().Scan1(w,state,next_state));
+    maybe_add_feat(features,d().Scan1(w,state,next_state));
+  }
+
+  inline void maybe_add_feat(FeatureVector *features,Featval v) const {
+    features->maybe_add(fid_,v);
+  }
+  inline void add_feat(FeatureVector *features,Featval v) const {
+    features->add_value(fid_,v);
   }
 
   // don't set state-bytes etc. in ctor because it may depend on parsing param string
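ff_fsa.h is the contract every FSA feature implements: fixed-width byte state plus a Scan1(w, old_state, new_state) that returns a feature value. A self-contained toy of that left-to-right scan loop, assuming nothing from cdec (FirstLetterFeat and toy_scan are invented names for illustration), shows how state gets threaded forward the way state_cpy does:

    #include <cstring>
    #include <iostream>
    #include <string>
    #include <vector>

    struct FirstLetterFeat {
      static const int state_bytes = 1;  // 1 byte of state: previous word's first letter
      double Scan1(std::string const& w, void const* old_state, void* new_state) const {
        char cw = w[0];
        char co = *(char const*)old_state;
        *(char*)new_state = cw;
        return cw == co ? 1 : 0;         // bonus when consecutive first letters match
      }
    };

    template <class FF>
    double toy_scan(FF const& ff, std::vector<std::string> const& words, char start) {
      char st[FF::state_bytes], nst[FF::state_bytes];
      st[0] = start;
      double total = 0;
      for (size_t i = 0; i < words.size(); ++i) {
        total += ff.Scan1(words[i], st, nst);
        std::memcpy(st, nst, FF::state_bytes); // like state_cpy in the base class
      }
      return total;
    }

    int main() {
      FirstLetterFeat ff;
      std::vector<std::string> words;
      words.push_back("big"); words.push_back("bad"); words.push_back("wolf");
      std::cout << toy_scan(ff, words, 'a') << "\n"; // 1: only "big"->"bad" matches
    }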
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 6579fbee..a5f43867 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -20,6 +20,7 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 #endif
 
 #include "ff_lm.h"
+#include "ff_lm_fsa.h"
 
 #include <sstream>
 #include <unistd.h>
@@ -44,8 +45,12 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 
 using namespace std;
 
+string LanguageModelFsa::usage(bool param,bool verbose) {
+  return FeatureFunction::usage_helper("LanguageModelFsa",usage_short,usage_verbose,param,verbose);
+}
+
 string LanguageModel::usage(bool param,bool verbose) {
-  return usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
+  return FeatureFunction::usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
 }
 
@@ -126,7 +131,7 @@ struct LMClient {
     cerr << "Connected to LM on " << host << " on port " << port << endl;
   }
 
-  float wordProb(int word, int* context) {
+  float wordProb(int word, WordID const* context) {
     NgramCache::Cache* cur = &NgramCache::cache_;
     int i = 0;
     while (context[i] > 0) {
@@ -183,10 +188,10 @@ class LanguageModelImpl {
     order_=order;
     state_size_ = OrderToStateSize(order)-1;
     unigram=(order<=1);
-    floor_=-100;
-    kSTART = TD::Convert("<s>");
-    kSTOP = TD::Convert("</s>");
-    kUNKNOWN = TD::Convert("<unk>");
+    floor_ = -100;
+    kSTART = TD::ss;
+    kSTOP = TD::se;
+    kUNKNOWN = TD::unk;
     kNONE = TD::none;
     kSTAR = TD::Convert("<{STAR}>");
   }
@@ -226,7 +231,7 @@ class LanguageModelImpl {
     *(static_cast<char*>(state) + state_size_) = size;
   }
 
-  virtual double WordProb(int word, int* context) {
+  virtual double WordProb(WordID word, WordID const* context) {
     return ngram_.wordProb(word, (VocabIndex*)context);
   }
 
@@ -425,8 +430,8 @@ public:
   vector<WordID> buffer_;
   int order_;
   int state_size_;
-  double floor_;
 public:
+  double floor_;
   WordID kSTART;
   WordID kSTOP;
   WordID kUNKNOWN;
@@ -440,7 +445,7 @@ struct ClientLMI : public LanguageModelImpl
   ClientLMI(int order,string const& server) : LanguageModelImpl(order), client_(server)
   {}
 
-  virtual double WordProb(int word, int* context) {
+  virtual double WordProb(int word, WordID const* context) {
     return client_.wordProb(word, context);
   }
 
@@ -452,7 +457,7 @@ struct ReuseLMI : public LanguageModelImpl
 {
   ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) {}
 
-  double WordProb(int word, int* context) {
+  double WordProb(int word, WordID const* context) {
     return ng->wordProb(word, (VocabIndex*)context);
   }
 protected:
@@ -520,8 +525,7 @@ usage:
   return false;
 }
 
-
-LanguageModel::LanguageModel(const string& param) {
+LanguageModelImpl *make_lm_impl(string const& param, int *order_out, int *fid_out) {
   int order,load_order;
   string featurename,filename;
   if (!parse_lmspec(param,order,featurename,filename,load_order))
@@ -530,12 +534,80 @@ LanguageModel::LanguageModel(const string& param) {
   if (load_order)
     cerr<<" loading LM as order "<<load_order;
   cerr<<endl;
-  fid_=FD::Convert(featurename);
-  pimpl_ = make_lm_impl(order,filename,load_order);
+  *order_out=order;
+  *fid_out=FD::Convert(featurename);
+  return make_lm_impl(order,filename,load_order);
 }
+
+LanguageModel::LanguageModel(const string& param) {
+  int order;
+  pimpl_ = make_lm_impl(param,&order,&fid_);
+}
+
+void LanguageModelFsa::set_ngram_order(int i) {
+  assert(i>0);
+  ngram_order_=i;
+  ctxlen_=i-1;
+  set_state_bytes(ctxlen_*sizeof(WordID));
+  set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - but for compat. with non-fsa version, leave it
+  WordID *ss=(WordID*)start.begin();
+  WordID *hs=(WordID*)h_start.begin();
+  if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+    ss[0]=TD::ss; // start-sentence context (length 1)
+    hs[0]=TD::none; // empty context
+    for (int i=1;i<ctxlen_;++i)
+      ss[i]=hs[i]=TD::none; // rest is empty
+  }
+}
+
+LanguageModelFsa::LanguageModelFsa(string const& param) {
+  int lmorder;
+  pimpl_ = make_lm_impl(param,&lmorder,&fid_);
+  floor_=pimpl_->floor_;
+  set_ngram_order(lmorder);
+}
+
+//TODO: use sri equivalent states (expose in lm impl?)
+void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const {
+  //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name)
+  Featval p;
+  if (ctxlen_) {
+    WordID ctx[ngram_order_];
+    state_cpy(ctx,old_st);
+    ctx[ctxlen_]=TD::none; // make this part of state?  wastes space but saves copies.
+    p=pimpl_->WordProb(w,ctx);
+// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
+    WordID *nst=(WordID *)new_st;
+    nst[0]=w; // new most recent word
+    to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+  } else {
+    p=pimpl_->WordProb(w,&empty_context);
+  }
+  add_feat(features,(p<floor_)?floor_:p);
+}
+
+void LanguageModelFsa::print_state(std::ostream &o,void *st) const {
+  WordID const* wst=(WordID const*)st;
+  o<<'[';
+  for (int i=ctxlen_;i>0;) {
+    --i;
+    WordID w=wst[i];
+    if (w==TD::none) continue;
+    if (i) o<<' ';
+    o << TD::Convert(w);
+  }
+  o<<']';
+}
+
 Features LanguageModel::features() const {
   return single_feature(fid_);
 }
@@ -548,13 +620,12 @@ string LanguageModel::DebugStateToString(const void* state) const{
   return pimpl_->DebugStateToString(state);
 }
 
-void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */,
                                           const Hypergraph::Edge& edge,
                                           const vector<const void*>& ant_states,
                                           SparseVector<double>* features,
                                           SparseVector<double>* estimated_features,
                                           void* state) const {
-  (void) smeta;
   features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state));
   estimated_features->set_value(fid_, pimpl_->EstimateProb(state));
 }
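The subtle part of the new Scan is the context layout: SRI-style contexts are stored most-recent-word-first, so scoring w against state [a,b] and building the successor state is a rotate right, and the returned log10 probability is clamped at floor_. Here is a standalone sketch of just that bookkeeping; lookup() is a fake stand-in for pimpl_->WordProb, and the fixed-size buffer replaces the patch's C99 variable-length array:

    #include <algorithm>
    #include <iostream>

    typedef int WordID;
    static const WordID kNONE = -1;

    double lookup(WordID, WordID const*) { return -2.5; } // fake log10 prob

    double scan_word(WordID w, WordID const* old_state, WordID* new_state,
                     int ctxlen, double floor) {
      WordID ctx[16];                    // assumes ctxlen+1 <= 16 for this sketch
      std::copy(old_state, old_state + ctxlen, ctx);
      ctx[ctxlen] = kNONE;               // terminate the context for the LM lookup
      double p = lookup(w, ctx);
      new_state[0] = w;                  // new most recent word...
      std::copy(ctx, ctx + ctxlen - 1, new_state + 1); // ...older words rotate right
      return p < floor ? floor : p;      // clamp, e.g. for unknown words
    }

    int main() {
      WordID st[2] = {7, 3}, nst[2];     // trigram LM: last 2 words, newest first
      std::cout << scan_word(9, st, nst, 2, -100) << "\n"; // nst becomes {9, 7}
    }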
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 01b3764e..6a4e8201 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -6,10 +6,21 @@
 #include "ff_lm.h"
 #include "ff_from_fsa.h"
 
-class LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
+struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
+  // overrides; implementations in ff_lm.cc
   static std::string usage(bool,bool);
   LanguageModelFsa(std::string const& param);
-  // implementations in ff_lm.cc
+  int markov_order() const { return ctxlen_; }
+  void Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const;
+  void print_state(std::ostream &,void *) const;
+
+  // impl details:
+  void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows.  otherwise, it's the same as a "-o i" argument to constructor
+  double floor_; // log10prob minimum used (e.g. unk words)
+private:
+  int ngram_order_;
+  int ctxlen_; // 1 less than above
+  LanguageModelImpl *pimpl_;
 };
 
 typedef FeatureFunctionFromFsa<LanguageModelFsa> LanguageModelFromFsa;
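One consequence of the header worth making explicit: markov_order() is ctxlen_, which set_ngram_order pins at ngram_order_ - 1, so an order-n model carries n-1 WordIDs of state and a unigram model is stateless. A back-of-envelope check of that arithmetic (standalone, not cdec code):

    #include <cassert>
    #include <iostream>

    typedef int WordID; // cdec's WordID is an int-sized token id

    int state_bytes_for_order(int ngram_order) {
      assert(ngram_order > 0);
      int ctxlen = ngram_order - 1;        // ctxlen_ in LanguageModelFsa
      return ctxlen * (int)sizeof(WordID); // set_state_bytes(ctxlen_*sizeof(WordID))
    }

    int main() {
      std::cout << state_bytes_for_order(1) << "\n"; // 0: unigram, stateless
      std::cout << state_bytes_for_order(3) << "\n"; // 8: trigram, 2 words of context
    }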
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 24f12560..6e6ad30e 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -27,6 +27,23 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
 
 typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
 
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+  SameFirstLetter(std::string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+  int markov_order() const { return 1; }
+  Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+    char cw=TD::Convert(w)[0];
+    char co=*(char const*)old_state;
+    *(char *)new_state = cw;
+    return cw==co?1:0;
+  }
+  void print_state(std::ostream &o,void const* st) const {
+    o<<*(char const*)st;
+  }
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+  }
+};
+
 // appears to be buggy right now: give it a bonus weight (+)
 struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
-- 
cgit v1.2.3
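As a postscript, the end phrase passed to the FsaFeatureFunctionBase constructor above ("END") is what gives SameFirstLetter a final-state cost: after the sentence is scanned, the wrapper scans the end phrase once more. A simplified standalone rerun of that logic (scan1 is a hand-copied reduction, not the class itself):

    #include <iostream>
    #include <string>
    #include <vector>

    double scan1(std::string const& w, char& state) { // reduced SameFirstLetter::Scan1
      char cw = w[0], co = state;
      state = cw;
      return cw == co ? 1 : 0;
    }

    int main() {
      std::vector<std::string> sent;
      sent.push_back("every"); sent.push_back("evil"); sent.push_back("elf");
      char state = 'a';                  // start[0]='a' in the constructor
      double total = 0;
      for (size_t i = 0; i < sent.size(); ++i) total += scan1(sent[i], state);
      total += scan1("END", state);      // end phrase: one extra scan for final cost
      std::cout << total << "\n";        // 2: "evil" and "elf" match; "END" doesn't ('E' != 'e')
    }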