From f4b4aade473f9463dda6fac4baf9c0502d004deb Mon Sep 17 00:00:00 2001
From: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Mon, 26 Jul 2010 04:53:15 +0000
Subject: LanguageModelFsa works. TODO: sri context shortening?

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@414 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 decoder/cdec_ff.cc      |   3 +-
 decoder/ff_fsa.h        |  48 ++++++++++++++++++----
 decoder/ff_lm.cc        | 103 ++++++++++++++++++++++++++++++++++++++++--------
 decoder/ff_lm_fsa.h     |  15 ++++++-
 decoder/ff_sample_fsa.h |  17 ++++++++
 5 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 78c67fb3..037cd92e 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -9,11 +9,12 @@
 #include "ff_ruleshape.h"
 #include "ff_bleu.h"
 #include "ff_sample_fsa.h"
-
+#include "ff_lm_fsa.h"
 boost::shared_ptr<FFRegistry> global_ff_registry;
 
 void register_feature_functions() {
   global_ff_registry->Register(new FFFactory<LanguageModel>);
+  global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LanguageModelFsa> >); // same as LM but using fsa wrapper
   global_ff_registry->Register(new FFFactory<WordPenaltyFromFsa>); // same as WordPenalty, but implemented using ff_fsa
   global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<SameFirstLetter> >);
   global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LongerThanPrev> >);
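A note on the registrations above: an FSA scanner only becomes visible to the decoder once it is wrapped in FeatureFunctionFromFsa and handed to the factory. A hypothetical sketch of registering a new scanner the same way (MyFsa and ff_my_fsa.h are invented for illustration; FFFactory, FeatureFunctionFromFsa, and global_ff_registry are the names this patch already uses):

    // hypothetical registration sketch, mirroring the lines added above
    #include "ff_factory.h"    // FFFactory / registry machinery, as in cdec_ff.cc
    #include "ff_from_fsa.h"   // FeatureFunctionFromFsa wrapper
    #include "ff_my_fsa.h"     // assumed header defining MyFsa, an FsaFeatureFunctionBase<MyFsa>

    void register_my_fsa() {
      // the wrapper adapts the left-to-right scanner into an ordinary
      // hypergraph FeatureFunction, so the usual factory can construct it
      global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<MyFsa> >);
    }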
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index e21cbf6f..4575b648 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -4,14 +4,15 @@
 /* features whose score is just some PFSA over target string.  however, PFSA can use edge and smeta info (e.g. spans on edge) - not usually useful.
 
+//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
+
   state is some fixed width byte array.  could actually be a void *, WordID sequence, whatever.
 
   TODO: fsa feature aggregator that presents itself as a single fsa; benefit: when wrapped in ff_from_fsa, only one set of left words is stored.  downside: compared to separate ff, the inside portion of lower-order models is incorporated later.  however, the full heuristic is already available and exact for those words.  so don't sweat it.
 
-  TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters.  this is on top of the nice heuristic for the unscored words, of course.  in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict.  probably not worht the time.
+  TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters.  this is on top of the nice heuristic for the unscored words, of course.  in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict.  probably not worth the effort.
 */
-//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
 //TODO: decide whether to use init_features / add_value vs. summing elsewhere + set_value once (or inefficient for from_fsa: sum distinct feature_vectors.  but L->R if we only scan 1 word at a time, that's fine
@@ -48,11 +49,28 @@
 
 typedef ValueArray<char> Bytes;
 
-// it's not necessary to inherit from this, but you probably should to save yourself some boilerplate.  defaults to no-state
+/*
+usage:
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+  SameFirstLetter(string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+  int markov_order() const { return 1; }
+  Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+    char cw=TD::Convert(w)[0];
+    char co=*(char const*)old_state;
+    *(char *)new_state = cw;
+    return cw==co?1:0;
+  }
+  void print_state(std::ostream &o,void const* st) const {
+    o<<*(char const*)st;
+  }
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+  }
+};
+
+// then, to decode, see ff_from_fsa.h
+*/
 
-// usage:
-// struct FsaFeat : public FsaTypedBase
-// i.e. Impl is a CRTP
 template <class Impl>
 struct FsaFeatureFunctionBase {
   Impl const& d() const { return static_cast<Impl const&>(*this); }
@@ -66,6 +84,10 @@ protected:
     if (h_start.size()!=sb) h_start.resize(sb);
     state_bytes_=sb;
   }
+  void set_end_phrase(WordID single) {
+    end_phrase_=singleton_sentence(single);
+  }
+
   int fid_; // you can have more than 1 feature of course.
   void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param
     fid_=FD::Convert(d().name());
@@ -85,6 +107,7 @@ protected:
   template <class T>
   inline void static to_state(void *state,T const* begin,T const* end) {
     to_state(state,(char const*)begin,(char const*)end);
   }
+
   inline static char hexdigit(int i) {
     int j=i-10;
     return j>=0?'a'+j:'0'+i;
@@ -95,6 +118,10 @@
   }
 public:
+  void state_cpy(void *to,void const*from) const {
+    std::memcpy(to,from,state_bytes_);
+  }
+
   // can override to different return type, e.g. just return feats:
   Featval describe_features(FeatureVector const& feats) const {
     return feats.get(fid_);
@@ -155,7 +182,14 @@ public:
 
   // NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members
   inline void Scan(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,void const* state,void *next_state,FeatureVector *features) const {
-    features->maybe_add(fid_,d().Scan1(w,state,next_state));
+    maybe_add_feat(features,d().Scan1(w,state,next_state));
+  }
+
+  inline void maybe_add_feat(FeatureVector *features,Featval v) const {
+    features->maybe_add(fid_,v);
+  }
+  inline void add_feat(FeatureVector *features,Featval v) const {
+    features->add_value(fid_,v);
   }
 
   // don't set state-bytes etc. in ctor because it may depend on parsing param string
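The ff_fsa.h change above pins down the scanning contract: a feature owns state_bytes_ of opaque state, and Scan1 consumes one target word, reads the old state, writes the successor state into caller-provided storage, and returns a feature value. A self-contained toy showing how a wrapper like ff_from_fsa would drive that contract left to right with two state buffers (standard C++ only; RepeatFsa and score_sequence are illustrations, not cdec code):

    #include <cstring>
    #include <iostream>
    #include <vector>

    typedef int WordID;
    typedef double Featval;

    // toy scorer: fires when a word repeats; keeps sizeof(WordID) bytes of state
    struct RepeatFsa {
      static const int state_bytes = sizeof(WordID);
      Featval Scan1(WordID w, void const* old_state, void* new_state) const {
        WordID prev;
        std::memcpy(&prev, old_state, sizeof(WordID));
        std::memcpy(new_state, &w, sizeof(WordID));  // successor state = w
        return w == prev ? 1 : 0;
      }
    };

    // drive Scan1 left to right, double-buffering the fixed-width state
    Featval score_sequence(RepeatFsa const& f, std::vector<WordID> const& words) {
      char st[2][RepeatFsa::state_bytes];
      std::memset(st, 0, sizeof st);  // start state: "no word seen yet"
      Featval total = 0;
      for (size_t i = 0; i < words.size(); ++i)
        total += f.Scan1(words[i], st[i % 2], st[(i + 1) % 2]);
      return total;
    }

    int main() {
      std::vector<WordID> w;
      w.push_back(3); w.push_back(3); w.push_back(5);
      std::cout << score_sequence(RepeatFsa(), w) << "\n";  // prints 1: word 3 repeats once
      return 0;
    }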
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 6579fbee..a5f43867 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -20,6 +20,7 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 #endif
 
 #include "ff_lm.h"
+#include "ff_lm_fsa.h"
 
 #include <sstream>
 #include <unistd.h>
@@ -44,8 +45,12 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
 
 using namespace std;
 
+string LanguageModelFsa::usage(bool param,bool verbose) {
+  return FeatureFunction::usage_helper("LanguageModelFsa",usage_short,usage_verbose,param,verbose);
+}
+
 string LanguageModel::usage(bool param,bool verbose) {
-  return usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
+  return FeatureFunction::usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
 }
 
@@ -126,7 +131,7 @@ struct LMClient {
     cerr << "Connected to LM on " << host << " on port " << port << endl;
   }
 
-  float wordProb(int word, int* context) {
+  float wordProb(int word, WordID const* context) {
     NgramCache::Cache* cur = &NgramCache::cache_;
     int i = 0;
     while (context[i] > 0) {
@@ -183,10 +188,10 @@ class LanguageModelImpl {
     order_=order;
     state_size_ = OrderToStateSize(order)-1;
     unigram=(order<=1);
-    floor_=-100;
-    kSTART = TD::Convert("<s>");
-    kSTOP = TD::Convert("</s>");
-    kUNKNOWN = TD::Convert("<unk>");
+    floor_ = -100;
+    kSTART = TD::ss;
+    kSTOP = TD::se;
+    kUNKNOWN = TD::unk;
     kNONE = TD::none;
     kSTAR = TD::Convert("<{STAR}>");
   }
@@ -226,7 +231,7 @@ class LanguageModelImpl {
     *(static_cast<char*>(state) + state_size_) = size;
   }
 
-  virtual double WordProb(int word, int* context) {
+  virtual double WordProb(WordID word, WordID const* context) {
     return ngram_.wordProb(word, (VocabIndex*)context);
   }
 
@@ -425,8 +430,8 @@ public:
   vector<WordID> buffer_;
   int order_;
   int state_size_;
-  double floor_;
 public:
+  double floor_;
   WordID kSTART;
   WordID kSTOP;
   WordID kUNKNOWN;
@@ -440,7 +445,7 @@ struct ClientLMI : public LanguageModelImpl
 {
   ClientLMI(int order,string const& server) : LanguageModelImpl(order), client_(server) {}
 
-  virtual double WordProb(int word, int* context) {
+  virtual double WordProb(int word, WordID const* context) {
     return client_.wordProb(word, context);
   }
 
@@ -452,7 +457,7 @@ struct ReuseLMI : public LanguageModelImpl
 {
   ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) {}
-  double WordProb(int word, int* context) {
+  double WordProb(int word, WordID const* context) {
     return ng->wordProb(word, (VocabIndex*)context);
   }
 protected:
@@ -520,8 +525,7 @@ usage:
   return false;
 }
 
-
-LanguageModel::LanguageModel(const string& param) {
+LanguageModelImpl *make_lm_impl(string const& param, int *order_out, int *fid_out) {
   int order,load_order;
   string featurename,filename;
   if (!parse_lmspec(param,order,featurename,filename,load_order))
@@ -530,12 +534,80 @@ LanguageModel::LanguageModel(const string& param) {
   if (load_order)
     cerr<<" loading LM as order "<<load_order<<endl;
-  fid_=FD::Convert(featurename);
-  pimpl_ = new LanguageModelImpl(order, filename, load_order);
+  *order_out=order;
+  *fid_out=FD::Convert(featurename);
+  return new LanguageModelImpl(order, filename, load_order);
+}
+
+LanguageModel::LanguageModel(const string& param) {
+  int order;
+  pimpl_ = make_lm_impl(param,&order,&fid_);
+  order_ = order;
+}
+
+void LanguageModelFsa::set_ngram_order(int i) {
+  assert(i>0);
+  ngram_order_=i;
+  ctxlen_=i-1;
+  set_state_bytes(ctxlen_*sizeof(WordID));
+  set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - but for compat. with non-fsa version, leave it
+  WordID *ss=(WordID*)start.begin();
+  WordID *hs=(WordID*)h_start.begin();
+  if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+    ss[0]=TD::ss; // start-sentence context (length 1)
+    hs[0]=TD::none; // empty context
+    for (int i=1;i<ctxlen_;++i)
+      ss[i]=hs[i]=TD::none;
+  }
+}
+
+LanguageModelFsa::LanguageModelFsa(string const& param) {
+  int lmorder;
+  pimpl_ = make_lm_impl(param,&lmorder,&fid_);
+  floor_=pimpl_->floor_;
+  set_ngram_order(lmorder);
+}
+
+//TODO: use sri equivalent states (expose in lm impl?)
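The subtle point in LanguageModelFsa::Scan above is the state layout: the ctxlen_ = order-1 WordIDs sit most-recent-first, matching SRI's reversed context convention, so advancing on word w writes w into slot 0 and shifts the old words right, dropping the oldest. A standalone sketch of just that rotation (advance_state is an illustrative stand-in, not the patch's code):

    #include <cstring>
    #include <iostream>

    typedef int WordID;

    // ctxlen slots, most recent word first (the SRI context convention)
    void advance_state(WordID const* old_st, WordID w, WordID* new_st, int ctxlen) {
      new_st[0] = w;  // new most recent word
      // shift the old words right by one; the oldest falls off the end
      std::memcpy(new_st + 1, old_st, (ctxlen - 1) * sizeof(WordID));
    }

    int main() {
      const int ctxlen = 2;          // trigram LM: order 3, state holds 2 words
      WordID st[ctxlen] = { 7, 4 };  // "... 4 7" seen so far, 7 most recent
      WordID nst[ctxlen];
      advance_state(st, 9, nst, ctxlen);             // scan word 9
      std::cout << nst[0] << ' ' << nst[1] << "\n";  // prints "9 7": word 4 dropped
      return 0;
    }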
+void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const {
+  //variable length array is in C99, msvc++; if yours doesn't support it, #ifdef it or use a stackalloc call (forget the name)
+  Featval p;
+  if (ctxlen_) {
+    WordID ctx[ngram_order_];
+    state_cpy(ctx,old_st);
+    ctx[ctxlen_]=TD::none; // make this part of state?  wastes space but saves copies.
+    p=pimpl_->WordProb(w,ctx);
+    // states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.)
+    WordID *nst=(WordID *)new_st;
+    nst[0]=w; // new most recent word
+    to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+  } else {
+    p=pimpl_->WordProb(w,&empty_context);
+  }
+  add_feat(features,(p<floor_)?floor_:p);
+}
+
+void LanguageModelFsa::print_state(std::ostream &o,void *st) const {
+  WordID *wst=(WordID*)st;
+  o<<'[';
+  for (int i=ctxlen_;i>0;) {
+    --i;
+    WordID w=wst[i];
+    if (w==TD::none) continue;
+    if (i) o<<' ';
+    o << TD::Convert(w);
+  }
+  o<<']';
+}
+
 Features LanguageModel::features() const {
   return single_feature(fid_);
 }
@@ -548,13 +620,12 @@ string LanguageModel::DebugStateToString(const void* state) const{
   return pimpl_->DebugStateToString(state);
 }
 
-void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */,
                                           const Hypergraph::Edge& edge,
                                           const vector<const void*>& ant_states,
                                           SparseVector<double>* features,
                                           SparseVector<double>* estimated_features,
                                           void* state) const {
-  (void) smeta;
   features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state));
   estimated_features->set_value(fid_, pimpl_->EstimateProb(state));
 }
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 01b3764e..6a4e8201 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -6,10 +6,21 @@
 #include "ff_lm.h"
 #include "ff_from_fsa.h"
 
-class LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
+struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
+  // overrides; implementations in ff_lm.cc
   static std::string usage(bool,bool);
   LanguageModelFsa(std::string const& param);
-  // implementations in ff_lm.cc
+  int markov_order() const { return ctxlen_; }
+  void Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const;
+  void print_state(std::ostream &,void *) const;
+
+  // impl details:
+  void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows.  otherwise, it's the same as a "-o i" argument to the constructor
+  double floor_; // log10prob minimum used (e.g. for unk words)
+private:
+  int ngram_order_;
+  int ctxlen_; // 1 less than above
+  LanguageModelImpl *pimpl_;
 };
 
 typedef FeatureFunctionFromFsa<LanguageModelFsa> LanguageModelFromFsa;
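A note on floor_ in the header above: some LMs report an effectively minus-infinite log10 probability for unknown words, and Scan clamps with (p<floor_)?floor_:p so a single OOV cannot sink an entire derivation. A toy demonstration of the clamp (values assumed, not from the patch):

    #include <cmath>
    #include <iostream>

    int main() {
      const double floor_ = -100;          // log10prob minimum, as in the header above
      double p = std::log10(0.0);          // -inf: what an LM may report for an OOV
      double clamped = (p < floor_) ? floor_ : p;  // the same clamp Scan applies
      std::cout << clamped << "\n";        // prints -100
      return 0;
    }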
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 24f12560..6e6ad30e 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -27,6 +27,23 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
 
 typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
 
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+  SameFirstLetter(std::string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+  int markov_order() const { return 1; }
+  Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+    char cw=TD::Convert(w)[0];
+    char co=*(char const*)old_state;
+    *(char *)new_state = cw;
+    return cw==co?1:0;
+  }
+  void print_state(std::ostream &o,void const* st) const {
+    o<<*(char const*)st;
+  }
+  static std::string usage(bool param,bool verbose) {
+    return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+  }
+};
+
 // appears to be buggy right now: give it a bonus weight (+)
 struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
-- 
cgit v1.2.3
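To make the SameFirstLetter sample concrete, the same scoring rule can be replayed by hand outside cdec: start state 'a', +1 for each adjacent pair of words sharing a first letter, with the final END symbol scanned like any other word. A standalone sketch (not cdec code):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      const char* toks[] = { "big", "bad", "boys", "run", "END" };
      std::vector<std::string> words(toks, toks + 5);
      char state = 'a';             // matches start[0]='a' in the constructor
      int score = 0;
      for (size_t i = 0; i < words.size(); ++i) {
        char cw = words[i][0];
        if (cw == state) ++score;   // same test as Scan1
        state = cw;                 // new state = current first letter
      }
      std::cout << score << "\n";   // big/bad and bad/boys match: prints 2
      return 0;
    }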