summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--decoder/apply_models.cc2
-rw-r--r--decoder/ff.cc2
-rw-r--r--decoder/ff.h1
-rw-r--r--decoder/ff_factory.cc16
-rwxr-xr-xdecoder/ff_from_fsa.h38
-rwxr-xr-xdecoder/ff_fsa.h95
-rwxr-xr-xdecoder/ff_sample_fsa.h21
-rw-r--r--decoder/sparse_vector.h2
-rw-r--r--decoder/stringlib.h25
9 files changed, 163 insertions, 39 deletions
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 2b518d62..11d43e93 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -1,3 +1,5 @@
+////TODO: keep model state in forest?
+
//TODO: (for many nonterminals, or multi-rescoring pass) either global
//best-first, or group by (NT,span) - use prev forest outside as a (admissible,
//if models are a subset and weights are same) heuristic
diff --git a/decoder/ff.cc b/decoder/ff.cc
index b6a541e3..28d6f732 100644
--- a/decoder/ff.cc
+++ b/decoder/ff.cc
@@ -1,3 +1,5 @@
+//TODO: actually score rule_feature()==true features once only, hash keyed on rule or modify TRule directly? need to keep clear in forest which features come from models vs. rules; then rescoring could drop all the old models features at once
+
//TODO: 0 size state != rule-local feature, i.e. still may depend on source span loc/context. identify truly rule-local features so if we want they can be added to grammar rules (minor speedup)
#include <boost/lexical_cast.hpp>
diff --git a/decoder/ff.h b/decoder/ff.h
index 0bfc8582..a0b39c26 100644
--- a/decoder/ff.h
+++ b/decoder/ff.h
@@ -17,6 +17,7 @@ typedef std::vector<WordID> Features; // set of features ids
class FeatureFunction {
public:
std::string name; // set by FF factory using usage()
+ bool debug; // also set by FF factory checking param for immediate initial "debug"
FeatureFunction() : state_size_() {}
explicit FeatureFunction(int state_size) : state_size_(state_size) {}
virtual ~FeatureFunction();
diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc
index fe733ca5..a6d834e0 100644
--- a/decoder/ff_factory.cc
+++ b/decoder/ff_factory.cc
@@ -1,6 +1,7 @@
#include "ff_factory.h"
#include "ff.h"
+#include "stringlib.h"
using boost::shared_ptr;
using namespace std;
@@ -21,14 +22,27 @@ string FFRegistry::usage(string const& ffname,bool params,bool verbose) const {
: it->second->usage(params,verbose);
}
+namespace {
+// Leading keyword of a feature's param string that turns on per-feature debug
+// output. Held by value: binding a const reference to a temporary std::string
+// only works through lifetime extension and is an obscure way to spell a constant.
+std::string const debug_pre="debug";
+}
+
shared_ptr<FeatureFunction> FFRegistry::Create(const string& ffname, const string& param) const { // builds the feature registered under ffname; returns an empty pointer (after a message on cerr) when the name is unknown
map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname);
shared_ptr<FeatureFunction> res;
if (it == reg_.end()) {
cerr << "I don't know how to create feature " << ffname << endl;
} else {
- res = it->second->Create(param);
+ int pl=debug_pre.size(); // length of the debug keyword
+ bool space=false; // becomes true when the keyword is followed by a ' ' that must be stripped too
+ std::string p=param; // working copy; the keyword is erased from it below
+ bool debug=match_begin(p,debug_pre)&&(p.size()==pl||(space=p[pl]==' ')); // param is exactly the keyword, or the keyword followed by a space (so e.g. "debugfoo" does not enable debug)
+ if (debug) {
+ p.erase(0,debug_pre.size()+space); // drop the keyword and, if present, the one separating space
+ cerr<<"debug enabled for "<<ffname<< " - rest of param='"<<p<<"'\n";
+ }
+ res = it->second->Create(p); // the factory sees the param with the debug prefix removed
res->name=ffname; // NOTE(review): assumes the factory never returns a null pointer — confirm
+ res->debug=debug;
}
return res;
}
diff --git a/decoder/ff_from_fsa.h b/decoder/ff_from_fsa.h
index f84bda31..42fa1e80 100755
--- a/decoder/ff_from_fsa.h
+++ b/decoder/ff_from_fsa.h
@@ -3,6 +3,13 @@
#include "ff_fsa.h"
+#define FSA_FF_DEBUG
+#ifdef FSA_FF_DEBUG
+# define FSAFFDBG(x) do { if (debug) { std::cerr << x; } } while(0)
+#else
+# define FSAFFDBG(x)
+#endif
+
/* regular bottom up scorer from Fsa feature
uses guarantee about markov order=N to score ASAP
encoding of state: if less than N-1 (ctxlen) words
@@ -39,6 +46,7 @@ public:
FeatureVector* estimated_features,
void* out_state) const
{
+ FSAFFDBG("(FromFsa) "<<name);
ff.init_features(features); // estimated_features is fresh
if (!ssz) {
TRule const& rule=*edge.rule_;
@@ -47,9 +55,11 @@ public:
if (e[j] < 1) { // variable
} else {
WordID ew=e[j];
+ FSAFFDBG(' '<<TD::Convert(ew));
ff.Scan(smeta,ew,0,0,features);
}
}
+ FSAFFDBG('\n');
return;
}
@@ -63,6 +73,7 @@ public:
for (int j = 0; j < e.size(); ++j) { // items in target side of rule
if (e[j] < 1) { // variable
SP a = ant_contexts[-e[j]];
+ FSAFFDBG(' '<<describe_state(a));
WP al=(WP)a;
WP ale=left_end(a);
// scan(al,le) these - the same as below else. macro for now; pull into closure object later?
@@ -87,6 +98,7 @@ public:
fsa.reset(fsa_state(a));
} else { // single word
WordID ew=e[j];
+ FSAFFDBG(' '<<TD::Convert(ew));
// some redundancy: non-vectorized version of above handling of left words of child item
if (left_out<left_full) {
*left_out++=ew;
@@ -105,13 +117,31 @@ public:
clear_fsa_state(out_state); // 0 bytes so we compare / hash correctly. don't know state yet
while(left_out<left_full) *left_out++=TD::none; // mark as partial left word seq
} else // or else store final right-state. heuristic was already assigned
- fstatecpy(fsa_state(out_state),fsa.cs);
+ fstatecpy(out_state,fsa.cs);
+ FSAFFDBG(" = " << describe_state(out_state)<<" "<<(*features)[ff.fid()]<<" h="<<(*estimated_features)[ff.fid()]<<'\n');
+ }
+
+ // Writes a bracketed rendering of an antecedent state to o: the words from
+ // the start of ant up to left_end(ant) and, only when left_end(ant) equals
+ // left_end_full(ant), " : " followed by whatever ff.print_state emits for
+ // the bytes starting at that position.
+ void print_state(std::ostream &o,void const*ant) const {
+ WP words_begin=(WP)ant;
+ WP words_end=left_end(ant);
+ WP words_full=left_end_full(ant);
+ o<<'[';
+ o<<Sentence(words_begin,words_end);
+ if (words_end==words_full) {
+ o<<" : ";
+ ff.print_state(o,words_full);
+ }
+ o<<']';
+ }
+
+ // String form of what print_state writes for ant (used by the FSAFFDBG messages).
+ std::string describe_state(void const*ant) const {
+ std::ostringstream rendered;
+ print_state(rendered,ant);
+ return rendered.str();
}
virtual void FinalTraversalFeatures(const SentenceMetadata& smeta,
const void* residual_state,
FeatureVector* final_features) const
{
+ ff.init_features(final_features); // estimated_features is fresh
Sentence const& ends=ff.end_phrase();
if (!ssz) {
AccumFeatures(ff,smeta,begin(ends),end(ends),final_features,0);
@@ -132,6 +162,7 @@ public:
// whole = left-words + end-phrase
AccumFeatures(ff,smeta,w,end(whole),final_features,ss);
}
+ FSAFFDBG("Final "<<name<<" = "<<*final_features<<'\n');
}
bool rule_feature() const {
@@ -190,8 +221,8 @@ private:
std::memset(fsa_state(ant),0,ssz);
}
- inline void fstatecpy(void *dest,void const* src) const {
- std::memcpy(dest,src,ssz);
+ inline void fstatecpy(void *ant,void const* src) const { // copies ssz bytes from src into the fsa-state portion of ant
+ std::memcpy(fsa_state(ant),src,ssz); // destination is fsa_state(ant), so callers pass the whole state pointer rather than fsa_state(...) of it
}
};
@@ -201,6 +232,7 @@ private:
# include "ff_sample_fsa.h"
int main() { // self-test entry point: prints diagnostics to cerr, then runs WordPenaltyFromFsa::test()
std::cerr<<"Testing left_end...\n";
+ std::cerr<<"sizeof(FeatureVector)="<<sizeof(FeatureVector)<<"\nsizeof(FeatureVectorList)="<<sizeof(FeatureVectorList)<<"\n"; // report the object sizes of the two feature-vector types
WordPenaltyFromFsa::test();
return 0;
}
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index 4e40f51b..8ca1951f 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -3,14 +3,17 @@
//SEE ALSO: ff_fsa_dynamic.h, ff_from_fsa.h
-#define FSA_DEBUG
+//#define FSA_DEBUG
+
#ifdef FSA_DEBUG
# include <iostream>
-# define FSADBG(x) do { std::cerr << x; } while(0)
+# define FSADBG(x) do { if (d().debug()) { std::cerr << x; } } while(0)
#else
# define FSADBG(x)
#endif
+#include <boost/lexical_cast.hpp>
+#include <sstream>
#include <stdint.h> //C99
#include <string>
#include "ff.h"
@@ -30,20 +33,28 @@ typedef ValueArray<uint8_t> Bytes;
*/
// it's not necessary to inherit from this, but you probably should to save yourself some boilerplate. defaults to no-state
+
+// usage:
+// struct FsaFeat : public FsaFeatureFunctionBase<FsaFeat>
+// i.e. Impl is the derived class itself (CRTP)
+template <class Impl>
struct FsaFeatureFunctionBase {
+ Impl const& d() const { return static_cast<Impl const&>(*this); }
+ Impl & d() { return static_cast<Impl &>(*this); }
protected:
+ int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course)
Bytes start,h_start; // start state and estimated-features (heuristic) start state. set these. default empty.
Sentence end_phrase_; // words appended for final traversal (final state cost is assessed using Scan) e.g. "</s>" for lm.
- int state_bytes_; // don't forget to set this. default 0 (it may depend on params of course)
void set_state_bytes(int sb=0) {
if (start.size()!=sb) start.resize(sb);
if (h_start.size()!=sb) h_start.resize(sb);
state_bytes_=sb;
}
int fid_; // you can have more than 1 feature of course.
- void init_fid(std::string const& name) { // call this, though, if you have a single feature
- fid_=FD::Convert(name);
+ void Init() { // CALL THIS MANUALLY (because feature name(s) may depend on param)
+ fid_=FD::Convert(d().name()); // single-feature default: the feature id is looked up from the derived class's name()
}
+
inline void static to_state(void *state,char const* begin,char const* end) {
std::memcpy(state,begin,end-begin);
}
@@ -58,7 +69,34 @@ protected:
inline void static to_state(void *state,T const* begin,T const* end) {
to_state(state,(char const*)begin,(char const*)end);
}
+ // Maps a nibble value 0..15 to its lowercase hexadecimal digit.
+ // (The earlier '0'+i form produced ':', ';', '<', ... for 10..15.)
+ inline static char hexdigit(int i) {
+ return "0123456789abcdef"[i&0x0f];
+ }
+ // Writes the low 8 bits of c to o as exactly two hex digits, high nibble first.
+ // Both nibbles are masked: a caller passing a plain char gets it sign-extended
+ // on conversion to unsigned (char(-1) becomes 0xffffffff), and an unmasked
+ // c>>4 would then be far outside the digit range.
+ inline static void print_hex_byte(std::ostream &o,unsigned c) {
+ o<<hexdigit((c>>4)&0x0f);
+ o<<hexdigit(c&0x0f);
+ }
+
public:
+ bool debug() const { return true; } // base default: always on; the FSADBG macro consults d().debug()
+ int fid() const { return fid_; } // return the one most important feature (for debugging)
+ std::string name() const { // feature name: whatever Impl::usage returns with both flags false
+ return Impl::usage(false,false);
+ }
+
+ // Dumps the state_bytes_ bytes at state to o via print_hex_byte, one call per byte.
+ // Bytes are read as unsigned char so that values >= 0x80 are not sign-extended
+ // when converted to print_hex_byte's unsigned parameter.
+ void print_state(std::ostream &o,void const*state) const {
+ unsigned char const* i=(unsigned char const*)state;
+ unsigned char const* e=i+state_bytes_;
+ for (;i!=e;++i)
+ print_hex_byte(o,*i);
+ }
+
+ // String form of what the derived class's print_state writes for state.
+ std::string describe_state(void const* state) const {
+ std::ostringstream rendered;
+ d().print_state(rendered,state);
+ return rendered.str();
+ }
+
//edges may have old features on them. override if you have more than 1 fid. we need to call this explicitly because edges may have old feature values already, and I chose to use add_value (+=) to simplify scanning a phrase, rather than set_value (=) for fsa ffs. could revisit this and use set_value and therefore sum
void init_features(FeatureVector *fv) const {
fv->set_value(fid_,0);
@@ -93,7 +131,7 @@ public:
}
// don't set state-bytes etc. in ctor because it may depend on parsing param string
- FsaFeatureFunctionBase(int statesz=0) : start(statesz),h_start(statesz),state_bytes_(statesz) { }
+ FsaFeatureFunctionBase(int statesz=0,Sentence const& end_sentence_phrase=Sentence()) : state_bytes_(statesz),start(statesz),h_start(statesz),end_phrase_(end_sentence_phrase) {} // initializers follow the member declaration order (state_bytes_, start, h_start, end_phrase_)
};
@@ -160,9 +198,15 @@ void AccumFeatures(FF const& ff,SentenceMetadata const& smeta,WordID const* i, W
}
// if State is pod. sets state size and allocs start, h_start
-template <class St>
-struct FsaTypedBase : public FsaFeatureFunctionBase {
+// usage:
+// struct ShorterThanPrev : public FsaTypedBase<int,ShorterThanPrev>
+// i.e. Impl is a CRTP
+template <class St,class Impl>
+struct FsaTypedBase : public FsaFeatureFunctionBase<Impl> {
+ Impl const& d() const { return static_cast<Impl const&>(*this); }
+ Impl & d() { return static_cast<Impl &>(*this); }
protected:
+ typedef FsaFeatureFunctionBase<Impl> Base;
typedef St State;
static inline State & state(void *state) {
return *(State*)state;
@@ -172,32 +216,33 @@ protected:
}
void set_starts(State const& s,State const& heuristic_s) { // stores s and heuristic_s into the start and h_start byte buffers
if (0) { // already in ctor
- start.resize(sizeof(State));
- h_start.resize(sizeof(State));
+ Base::start.resize(sizeof(State));
+ Base::h_start.resize(sizeof(State));
}
- state(start.begin())=s;
- state(h_start.begin())=heuristic_s;
+ state(Base::start.begin())=s; // Base:: is required now: members of a dependent base class are not found by unqualified lookup
+ state(Base::h_start.begin())=heuristic_s;
}
- void set_h_start(State const& s) {
+ FsaTypedBase(St const& start_st=St() // start state; defaults to a value-initialized St
+ ,St const& h_start_st=St() // heuristic start state
+ ,Sentence const& end_sentence_phrase=Sentence()) // forwarded to Base as its end phrase
+ : Base(sizeof(State),end_sentence_phrase) { // state size is always sizeof(State)
+ set_starts(start_st,h_start_st);
}
public:
+ void print_state(std::ostream &o,void const*st) const { // typed replacement for the base class's print_state: streams the State value itself
+ o<<state(st);
+ }
int markov_order() const { return 1; }
- FsaTypedBase() : FsaFeatureFunctionBase(sizeof(State)) {
+ void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const { // untyped entry point: views the raw state pointers as State and forwards to Impl::ScanTyped
+ Impl const& im=d();
+ FSADBG("Scan "<<FD::Convert(im.fid_)<<" = "<<(*features)[im.fid_]<<" "<<im.state(st)<<" ->"<<TD::Convert(w)<<" "); // expands to nothing unless FSA_DEBUG is defined
+ im.ScanTyped(smeta,w,im.state(st),im.state(next_state),features);
+ FSADBG(im.state(next_state)<<" = "<<(*features)[im.fid_]<<std::endl);
}
+
};
-// usage (if you're lazy):
-// struct ShorterThanPrev : public FsaTypedBase<int>,FsaTypedScan<ShorterThanPrev>
-template <class St,class Impl>
-struct FsaTypedScan : public FsaTypedBase<St> {
- void Scan(SentenceMetadata const& smeta,WordID w,void const* st,void *next_state,FeatureVector *features) const {
- Impl const* impl=static_cast<Impl const*>(this);
- FSADBG("Scan "<<(*features)[impl->fid_]<<" = "<<impl->state(st)<<" ->"<<TD::Convert(w)<<" ");
- impl->ScanTyped(smeta,w,impl->state(st),impl->state(next_state),features);
- FSADBG(impl->state(next_state)<<" = "<<(*features)[impl->fid_]<<std::endl);
- }
-};
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 947ad21c..8befc0bb 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -4,7 +4,7 @@
#include "ff_from_fsa.h"
// example: feature val = -1 * # of target words
-struct WordPenaltyFsa : public FsaFeatureFunctionBase {
+struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
static std::string usage(bool param,bool verbose) {
return FeatureFunction::usage_helper(
"WordPenaltyFsa","","-1 per target word"
@@ -12,7 +12,7 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase {
}
WordPenaltyFsa(std::string const& param) {
- init_fid(usage(false,false));
+ Init();
return;
//below are all defaults:
set_state_bytes(0);
@@ -30,7 +30,7 @@ typedef FeatureFunctionFromFsa<WordPenaltyFsa> WordPenaltyFromFsa;
//
-struct LongerThanPrev : public FsaFeatureFunctionBase {
+struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
static std::string usage(bool param,bool verbose) {
return FeatureFunction::usage_helper(
"LongerThanPrev",
@@ -50,7 +50,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase {
}
int markov_order() const { return 1; }
LongerThanPrev(std::string const& param) {
- init_fid(usage(false,false));
+ Init();
set_state_bytes(sizeof(int));
// start.resize(state_bytes()); // this is done by set_state_bytes already.
// h_start.resize(state_bytes());
@@ -73,7 +73,8 @@ struct LongerThanPrev : public FsaFeatureFunctionBase {
};
// similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State)
-struct ShorterThanPrev : FsaTypedScan<int,ShorterThanPrev> {
+struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
+ typedef FsaTypedBase<int,ShorterThanPrev> Base;
static std::string usage(bool param,bool verbose) {
return FeatureFunction::usage_helper(
"ShorterThanPrev",
@@ -85,10 +86,12 @@ struct ShorterThanPrev : FsaTypedScan<int,ShorterThanPrev> {
static inline int wordlen(WordID w) {
return std::strlen(TD::Convert(w));
}
- ShorterThanPrev(std::string const& param) {
- init_fid(usage(false,false));
-// end_phrase_.push_back(TD::Convert("")); // this triggers end of sentence firing
- set_starts(-1,4); // estimate: anything <4 chars is usually shorter than previous
+ ShorterThanPrev(std::string const& param)
+ : Base(-1,4,Sentence(1,TD::Convert(""))) // NOTE(review): the empty literal may be an HTML-extraction artifact (the end_phrase_ comment in ff_fsa.h gives "</s>" as the example) — confirm against the repository
+ // start, h_start, end_phrase
+ // estimate: anything <4 chars is usually shorter than previous
+ {
+ Init(); // sets the feature id from this class's name()
}
static const float val_per_target_word=-1;
diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h
index 0f3724f0..285e84a7 100644
--- a/decoder/sparse_vector.h
+++ b/decoder/sparse_vector.h
@@ -420,7 +420,7 @@ private:
List p;
};
-
+typedef SparseVectorList<double> FeatureVectorList;
typedef SparseVector<double> FeatureVector;
typedef SparseVector<double> WeightVector;
typedef std::vector<double> DenseWeightVector;
diff --git a/decoder/stringlib.h b/decoder/stringlib.h
index a0e03624..9efe3f36 100644
--- a/decoder/stringlib.h
+++ b/decoder/stringlib.h
@@ -14,6 +14,31 @@
#include <cstring>
#include <string>
+// Returns true iff the range [bsub,esub) is a prefix of [bstr,estr).
+// An empty [bsub,esub) matches every string, including an empty one.
+template <class Istr, class Isubstr> inline
+bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub)
+{
+ while (bsub != esub) {
+ if (bstr == estr) // the string ran out before the prefix did
+ return false;
+ if (*bsub++ != *bstr++)
+ return false;
+ }
+ return true;
+}
+
+// Same test with the prefix given as a container (anything with begin()/end()).
+// Taken by const reference: a by-value parameter copied the prefix on every call.
+template <class Istr, class Prefix> inline
+bool match_begin(Istr bstr,Istr estr,Prefix const& prefix)
+{
+ return match_begin(bstr,estr,prefix.begin(),prefix.end());
+}
+
+// Same test with both the string and the prefix given as containers.
+template <class Str, class Prefix> inline
+bool match_begin(Str const& str,Prefix const& prefix)
+{
+ return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end());
+}
+
+
// read line in the form of either:
// source
// source ||| target