author     graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-26 04:53:15 +0000
committer  graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-26 04:53:15 +0000
commit     f4b4aade473f9463dda6fac4baf9c0502d004deb (patch)
tree       7b6641f2733b4d64a9f1e273c0f6f2b8fd757d5f
parent     b2ad842245f1645e4e9f3c60a80a07e13151a560 (diff)
LanguageModelFsa works. TODO: sri context shortening?
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@414 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--  decoder/cdec_ff.cc          3
-rwxr-xr-x  decoder/ff_fsa.h           48
-rw-r--r--  decoder/ff_lm.cc          103
-rwxr-xr-x  decoder/ff_lm_fsa.h        15
-rwxr-xr-x  decoder/ff_sample_fsa.h    17
5 files changed, 160 insertions, 26 deletions
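The commit message leaves "sri context shortening" as a TODO. The idea behind it: an n-gram LM backs off constantly, so the oldest words in a state often cannot influence any future probability, and truncating the stored context to the part the LM actually distinguishes lets more hypotheses recombine. Below is a minimal, self-contained sketch of that idea, not part of this commit; ToyLM, UsefulContextLength, and ShortenContext are hypothetical stand-ins for whatever the real LM exposes (SRILM has a context-identification call, roughly Ngram::contextID, that plays this role).

// hedged sketch of LM context shortening; toy stand-in for SRI's backoff structure
#include <iostream>
#include <set>
#include <vector>

typedef int WordID;
static const WordID kNONE = -1;

struct ToyLM {
  // contexts (most recent word first) the model actually has parameters for
  std::set<std::vector<WordID> > known_contexts;

  // longest usable prefix of ctx[0..len): toy version of SRI's context lookup
  unsigned UsefulContextLength(WordID const* ctx, unsigned len) const {
    for (unsigned k = len; k > 0; --k)
      if (known_contexts.count(std::vector<WordID>(ctx, ctx + k)))
        return k;
    return 0;
  }
};

// shorten a fixed-width, most-recent-first context state in place, padding the
// unused tail with kNONE so shortened states hash and compare equal
void ShortenContext(ToyLM const& lm, WordID* ctx, unsigned len) {
  unsigned keep = lm.UsefulContextLength(ctx, len);
  for (unsigned i = keep; i < len; ++i)
    ctx[i] = kNONE;
}

int main() {
  ToyLM lm;
  lm.known_contexts.insert(std::vector<WordID>(1, 7)); // model only knows context {7}
  WordID ctx[3] = {7, 5, 2};                           // 7 is the most recent word
  ShortenContext(lm, ctx, 3);
  for (int i = 0; i < 3; ++i) std::cout << ctx[i] << ' '; // prints: 7 -1 -1
  std::cout << '\n';
}

Running something like ShortenContext on each new state before it is stored would make shortened states compare equal, which is exactly what the fixed-width Bytes state in ff_fsa.h needs for recombination.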
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 78c67fb3..037cd92e 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -9,11 +9,12 @@
#include "ff_ruleshape.h"
#include "ff_bleu.h"
#include "ff_sample_fsa.h"
-
+#include "ff_lm_fsa.h"
boost::shared_ptr<FFRegistry> global_ff_registry;
void register_feature_functions() {
global_ff_registry->Register(new FFFactory<LanguageModel>);
+ global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LanguageModelFsa> >); // same as LM but using fsa wrapper
global_ff_registry->Register(new FFFactory<WordPenaltyFromFsa>); // same as WordPenalty, but implemented using ff_fsa
global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<LongerThanPrev> >);
global_ff_registry->Register(new FFFactory<FeatureFunctionFromFsa<ShorterThanPrev> >);
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index e21cbf6f..4575b648 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -4,14 +4,15 @@
/*
features whose score is just some PFSA over target string. however, PFSA can use edge and smeta info (e.g. spans on edge) - not usually useful.
+//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
+
state is some fixed width byte array. could actually be a void *, WordID sequence, whatever.
TODO: fsa feature aggregator that presents itself as a single fsa; benefit: when wrapped in ff_from_fsa, only one set of left words is stored. downside: compared to separate ff, the inside portion of lower-order models is incorporated later. however, the full heuristic is already available and exact for those words. so don't sweat it.
- TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters. this is on top of the nice heuristic for the unscored words, of course. in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict. probably not worht the time.
+ TODO: state (+ possibly span-specific) custom heuristic, e.g. in "longer than previous word" model, you can expect a higher outside if your state is a word of 2 letters. this is on top of the nice heuristic for the unscored words, of course. in ngrams, the avg prob will be about the same, but if the words possible for a source span are summarized, maybe it's possible to predict. probably not worth the effort.
*/
-//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
//TODO: decide whether to use init_features / add_value vs. summing elsewhere + set_value once (or inefficient for from_fsa: sum distinct feature_vectors. but L->R if we only scan 1 word at a time, that's fine
@@ -48,11 +49,28 @@
typedef ValueArray<uint8_t> Bytes;
-// it's not necessary to inherit from this, but you probably should to save yourself some boilerplate. defaults to no-state
+/*
+usage:
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+SameFirstLetter(string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+ int markov_order() const { return 1; }
+ Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+ char cw=TD::Convert(w)[0];
+ char co=*(char const*)old_state;
+ *(char *)new_state = cw;
+ return cw==co?1:0;
+ }
+ void print_state(std::ostream &o,void const* st) const {
+ o<<*(char const*)st;
+ }
+ static std::string usage(bool param,bool verbose) {
+ return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+ }
+};
+
+// then, to decode, see ff_from_fsa.h
+ */
-// usage:
-// struct FsaFeat : public FsaTypedBase<int,FsaFeat>
-// i.e. Impl is a CRTP
template <class Impl>
struct FsaFeatureFunctionBase {
Impl const& d() const { return static_cast<Impl const&>(*this); }
@@ -66,6 +84,10 @@ protected:
if (h_start.size()!=sb) h_start.resize(sb);
state_bytes_=sb;
}
+ void set_end_phrase(WordID single) {
+ end_phrase_=singleton_sentence(single);
+ }
+
int fid_; // you can have more than 1 feature of course.
void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param
fid_=FD::Convert(d().name());
@@ -85,6 +107,7 @@ protected:
inline void static to_state(void *state,T const* begin,T const* end) {
to_state(state,(char const*)begin,(char const*)end);
}
+
inline static char hexdigit(int i) {
int j=i-10;
return j>=0?'a'+j:'0'+i;
@@ -95,6 +118,10 @@ protected:
}
public:
+ void state_cpy(void *to,void const*from) const {
+ std::memcpy(to,from,state_bytes_);
+ }
+
// can override to different return type, e.g. just return feats:
Featval describe_features(FeatureVector const& feats) const {
return feats.get(fid_);
@@ -155,7 +182,14 @@ public:
// NOTE: if you want to e.g. track statistics, cache, whatever, cast const away or use mutable members
inline void Scan(SentenceMetadata const& smeta,const Hypergraph::Edge& edge,WordID w,void const* state,void *next_state,FeatureVector *features) const {
- features->maybe_add(fid_,d().Scan1(w,state,next_state));
+ maybe_add_feat(features,d().Scan1(w,state,next_state));
+ }
+
+ inline void maybe_add_feat(FeatureVector *features,Featval v) const {
+ features->maybe_add(fid_,v);
+ }
+ inline void add_feat(FeatureVector *features,Featval v) const {
+ features->add_value(fid_,v);
}
// don't set state-bytes etc. in ctor because it may depend on parsing param string
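For orientation on the ff_fsa.h hunks above: FsaFeatureFunctionBase<Impl> is a CRTP base, so the default Scan() hands each word to the derived class's Scan1() with no virtual dispatch, while a feature like LanguageModelFsa can instead override Scan() wholesale and call add_feat()/maybe_add_feat() itself. A minimal, self-contained sketch of that dispatch pattern follows; Base and SameAsPrev are illustrative names only, not the real classes.

// hedged sketch of the CRTP Scan -> Scan1 forwarding used by FsaFeatureFunctionBase
#include <iostream>

typedef int WordID;
typedef double Featval;

template <class Impl>
struct Base {
  Impl const& d() const { return static_cast<Impl const&>(*this); }
  // default per-word hook: forward to the derived class's Scan1 and accumulate
  // its value; the call is resolved at compile time, no virtual dispatch
  void Scan(WordID w, void const* st, void* nst, Featval* accum) const {
    *accum += d().Scan1(w, st, nst);
  }
};

// fires whenever the scanned word equals the previous one (one word of state)
struct SameAsPrev : public Base<SameAsPrev> {
  Featval Scan1(WordID w, void const* old_state, void* new_state) const {
    WordID prev = *(WordID const*)old_state;
    *(WordID*)new_state = w;
    return w == prev ? 1 : 0;
  }
};

int main() {
  SameAsPrev f;
  WordID s0 = -1, s1, s2;
  Featval total = 0;
  f.Scan(3, &s0, &s1, &total);   // previous word was -1: no match
  f.Scan(3, &s1, &s2, &total);   // previous word is 3: match, +1
  std::cout << total << "\n";    // prints 1
}

The real base also threads SentenceMetadata, the edge, and a FeatureVector through Scan(); the sketch keeps only the state-and-value part.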
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 6579fbee..a5f43867 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -20,6 +20,7 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
#endif
#include "ff_lm.h"
+#include "ff_lm_fsa.h"
#include <sstream>
#include <unistd.h>
@@ -44,8 +45,12 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
using namespace std;
+string LanguageModelFsa::usage(bool param,bool verbose) {
+ return FeatureFunction::usage_helper("LanguageModelFsa",usage_short,usage_verbose,param,verbose);
+}
+
string LanguageModel::usage(bool param,bool verbose) {
- return usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
+ return FeatureFunction::usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
}
@@ -126,7 +131,7 @@ struct LMClient {
cerr << "Connected to LM on " << host << " on port " << port << endl;
}
- float wordProb(int word, int* context) {
+ float wordProb(int word, WordID const* context) {
NgramCache::Cache* cur = &NgramCache::cache_;
int i = 0;
while (context[i] > 0) {
@@ -183,10 +188,10 @@ class LanguageModelImpl {
order_=order;
state_size_ = OrderToStateSize(order)-1;
unigram=(order<=1);
- floor_=-100;
- kSTART = TD::Convert("<s>");
- kSTOP = TD::Convert("</s>");
- kUNKNOWN = TD::Convert("<unk>");
+ floor_ = -100;
+ kSTART = TD::ss;
+ kSTOP = TD::se;
+ kUNKNOWN = TD::unk;
kNONE = TD::none;
kSTAR = TD::Convert("<{STAR}>");
}
@@ -226,7 +231,7 @@ class LanguageModelImpl {
*(static_cast<char*>(state) + state_size_) = size;
}
- virtual double WordProb(int word, int* context) {
+ virtual double WordProb(WordID word, WordID const* context) {
return ngram_.wordProb(word, (VocabIndex*)context);
}
@@ -425,8 +430,8 @@ public:
vector<WordID> buffer_;
int order_;
int state_size_;
- double floor_;
public:
+ double floor_;
WordID kSTART;
WordID kSTOP;
WordID kUNKNOWN;
@@ -440,7 +445,7 @@ struct ClientLMI : public LanguageModelImpl
ClientLMI(int order,string const& server) : LanguageModelImpl(order), client_(server)
{}
- virtual double WordProb(int word, int* context) {
+ virtual double WordProb(int word, WordID const* context) {
return client_.wordProb(word, context);
}
@@ -452,7 +457,7 @@ struct ReuseLMI : public LanguageModelImpl
{
ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng)
{}
- double WordProb(int word, int* context) {
+ double WordProb(int word, WordID const* context) {
return ng->wordProb(word, (VocabIndex*)context);
}
protected:
@@ -520,8 +525,7 @@ usage:
return false;
}
-
-LanguageModel::LanguageModel(const string& param) {
+LanguageModelImpl *make_lm_impl(string const& param, int *order_out, int *fid_out) {
int order,load_order;
string featurename,filename;
if (!parse_lmspec(param,order,featurename,filename,load_order))
@@ -530,12 +534,80 @@ LanguageModel::LanguageModel(const string& param) {
if (load_order)
cerr<<" loading LM as order "<<load_order;
cerr<<endl;
- fid_=FD::Convert(featurename);
- pimpl_ = make_lm_impl(order,filename,load_order);
+ *order_out=order;
+ *fid_out=FD::Convert(featurename);
+ return make_lm_impl(order,filename,load_order);
+}
+
+
+LanguageModel::LanguageModel(const string& param) {
+ int order;
+ pimpl_ = make_lm_impl(param,&order,&fid_);
//TODO: see if it's actually possible to set order_ later to mutate an already used FF for e.g. multipass. comment in ff.h says only to change state size in constructor. clone instead? differently -n named ones from same lm filename are already possible, so no urgency.
SetStateSize(LanguageModelImpl::OrderToStateSize(order));
}
+//TODO: decide whether to waste a word of space so states are always none-terminated for SRILM. otherwise we have to copy
+void LanguageModelFsa::set_ngram_order(int i) {
+ assert(i>0);
+ ngram_order_=i;
+ ctxlen_=i-1;
+ set_state_bytes(ctxlen_*sizeof(WordID));
+ set_end_phrase(TD::se); //TODO: pretty boring in unigram case, just adds constant prob - but for compat. with non-fsa version, leave it
+ WordID *ss=(WordID*)start.begin();
+ WordID *hs=(WordID*)h_start.begin();
+ if (ctxlen_) { // avoid segfault in case of unigram lm (0 state)
+ ss[0]=TD::ss; // start-sentence context (length 1)
+ hs[0]=TD::none; // empty context
+ for (int i=1;i<ctxlen_;++i) {
+ ss[i]=hs[i]=TD::none; // need this so storage is initialized for hashing.
+ //TODO: reevaluate whether state space comes cleared by allocator or not.
+ }
+ }
+}
+namespace {
+WordID empty_context=TD::none;
+}
+
+LanguageModelFsa::LanguageModelFsa(string const& param) {
+ int lmorder;
+ pimpl_ = make_lm_impl(param,&lmorder,&fid_);
+ floor_=pimpl_->floor_;
+ set_ngram_order(lmorder);
+}
+
+//TODO: use sri equivalent states (expose in lm impl?)
+void LanguageModelFsa::Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const {
+ //variable length array is in C99, msvc++, if it doesn't support it, #ifdef it or use a stackalloc call (forget the name)
+ Featval p;
+ if (ctxlen_) {
+ WordID ctx[ngram_order_];
+ state_cpy(ctx,old_st);
+ ctx[ctxlen_]=TD::none; // make this part of state? wastes space but saves copies.
+ p=pimpl_->WordProb(w,ctx);
+// states are sri contexts so are in reverse order (most recent word is first, then 1-back comes next, etc.).
+ WordID *nst=(WordID *)new_st;
+ nst[0]=w; // new most recent word
+ to_state(nst+1,ctx,ctxlen_-1); // rotate old words right
+ } else {
+ p=pimpl_->WordProb(w,&empty_context);
+ }
+ add_feat(features,(p<floor_)?floor_:p);
+}
+
+void LanguageModelFsa::print_state(ostream &o,void *st) const {
+ WordID *wst=(WordID *)st;
+ o<<'[';
+ for (int i=ctxlen_;i>0;) {
+ --i;
+ WordID w=wst[i];
+ if (w==TD::none) continue;
+ if (i) o<<' ';
+ o << TD::Convert(w);
+ }
+ o<<']';
+}
+
Features LanguageModel::features() const {
return single_feature(fid_);
}
@@ -548,13 +620,12 @@ string LanguageModel::DebugStateToString(const void* state) const{
return pimpl_->DebugStateToString(state);
}
-void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+void LanguageModel::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */,
const Hypergraph::Edge& edge,
const vector<const void*>& ant_states,
SparseVector<double>* features,
SparseVector<double>* estimated_features,
void* state) const {
- (void) smeta;
features->set_value(fid_, pimpl_->LookupWords(*edge.rule_, ant_states, state));
estimated_features->set_value(fid_, pimpl_->EstimateProb(state));
}
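The ff_lm.cc hunk above notes that the FSA LM state is an SRI context stored most-recent-word-first. Concretely, scanning a word w turns the last (order-1) words into a new state where w takes slot 0 and everything else shifts right by one, dropping the oldest word. A self-contained toy of just that update follows; ScanUpdate is a hypothetical name, and the real Scan does the copy through state_cpy/to_state into the decoder's fixed-width state buffers.

// hedged sketch of the most-recent-first state shift done in LanguageModelFsa::Scan
#include <cstring>
#include <iostream>

typedef int WordID;

// new_state[0] becomes w; old words shift right by one; the oldest word falls off
void ScanUpdate(WordID w, WordID const* old_state, WordID* new_state, int ctxlen) {
  new_state[0] = w;
  std::memcpy(new_state + 1, old_state, (ctxlen - 1) * sizeof(WordID));
}

int main() {
  const int ctxlen = 3;                    // e.g. a 4-gram LM keeps 3 words of state
  WordID old_state[ctxlen] = {7, 5, 2};    // 7 was scanned most recently
  WordID new_state[ctxlen];
  ScanUpdate(9, old_state, new_state, ctxlen);
  for (int i = 0; i < ctxlen; ++i)
    std::cout << new_state[i] << ' ';      // prints: 9 7 5
  std::cout << '\n';
}

The probability query itself uses the old context padded with TD::none as a terminator, which is why Scan builds a local ctx buffer of ngram_order_ words even though only ctxlen_ of them are state.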
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 01b3764e..6a4e8201 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -6,10 +6,21 @@
#include "ff_lm.h"
#include "ff_from_fsa.h"
-class LanguageModelFsa : public FsaFeatureFunctionBase {
+struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
+ // overrides; implementations in ff_lm.cc
static std::string usage(bool,bool);
LanguageModelFsa(std::string const& param);
- // implementations in ff_lm.cc
+ int markov_order() const { return ctxlen_; }
+ void Scan(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,void const* old_st,void *new_st,FeatureVector *features) const;
+ void print_state(std::ostream &,void *) const;
+
+ // impl details:
+ void set_ngram_order(int i); // if you build ff_from_fsa first, then increase this, you will get memory overflows. otherwise, it's the same as a "-o i" argument to constructor
+ double floor_; // log10prob minimum used (e.g. unk words)
+private:
+ int ngram_order_;
+ int ctxlen_; // 1 less than above
+ LanguageModelImpl *pimpl_;
};
typedef FeatureFunctionFromFsa<LanguageModelFsa> LanguageModelFromFsa;
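The set_ngram_order comment above warns against enlarging the FSA state after ff_from_fsa has wrapped it. The likely reason, stated here as an assumption rather than something this diff shows: the wrapper sizes its per-hypothesis state buffers from state_bytes() once, so a later increase makes state_cpy write past them. A toy illustration of that failure mode; ToyFsa and ToyWrapper are hypothetical, not the real FeatureFunctionFromFsa.

// hedged sketch: a buffer sized at wrap time overflows if the FSA state grows later
#include <cstring>
#include <vector>

typedef int WordID;

struct ToyFsa {
  std::vector<char> start;                 // start state, state_bytes() wide
  void set_state_bytes(int sb) { start.resize(sb); }
  int state_bytes() const { return (int)start.size(); }
  void state_cpy(void* to, void const* from) const {
    std::memcpy(to, from, state_bytes());
  }
};

struct ToyWrapper {
  ToyFsa const& fsa;
  std::vector<char> buf;                   // sized from state_bytes() at construction
  ToyWrapper(ToyFsa const& f) : fsa(f), buf(f.state_bytes()) {}
  void CopyStart() { fsa.state_cpy(&buf[0], &fsa.start[0]); } // overflows if fsa grew later
};

int main() {
  ToyFsa fsa;
  fsa.set_state_bytes(2 * sizeof(WordID)); // e.g. trigram: 2 words of context
  ToyWrapper w(fsa);
  fsa.set_state_bytes(4 * sizeof(WordID)); // enlarging now is the bug the comment warns about
  // w.CopyStart();                        // would write 16 bytes into an 8-byte buffer
  return 0;
}

This is why setting the order up front (the "-o i" constructor argument mentioned in the comment) is the safe route.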
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 24f12560..6e6ad30e 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -27,6 +27,23 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
+struct SameFirstLetter : public FsaFeatureFunctionBase<SameFirstLetter> {
+ SameFirstLetter(std::string const& param) : FsaFeatureFunctionBase<SameFirstLetter>(1,singleton_sentence("END")) { start[0]='a';h_start[0]=0; } // 1 byte of state, scan final (single) symbol "END" to get final state cost
+ int markov_order() const { return 1; }
+ Featval Scan1(WordID w,void const* old_state,void *new_state) const {
+ char cw=TD::Convert(w)[0];
+ char co=*(char const*)old_state;
+ *(char *)new_state = cw;
+ return cw==co?1:0;
+ }
+ void print_state(std::ostream &o,void const* st) const {
+ o<<*(char const*)st;
+ }
+ static std::string usage(bool param,bool verbose) {
+ return FeatureFunction::usage_helper("SameFirstLetter","[no args]","1 each time 2 consecutive words start with the same letter",param,verbose);
+ }
+};
+
// appears to be buggy right now: give it a bonus weight (+)
struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {