From ffd4acfe5b84f413c66af6ec5a76bdbc0d3aa9e8 Mon Sep 17 00:00:00 2001 From: "graehl@gmail.com" Date: Sun, 8 Aug 2010 08:49:13 +0000 Subject: cfg git-svn-id: https://ws10smt.googlecode.com/svn/trunk@493 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/Makefile.am | 1 + decoder/apply_fsa_models.cc | 24 +++++++++------ decoder/cfg.cc | 42 +++++++++++++++++++++++++++ decoder/cfg.h | 71 +++++++++++++++++++++++++++++++++++++++++++++ decoder/hg.cc | 15 ++++++++-- decoder/hg.h | 18 +++++++----- decoder/int_or_pointer.h | 70 ++++++++++++++++++++++++++++++++++++++++++++ graehl/NOTES.earley | 13 +++++++++ 8 files changed, 235 insertions(+), 19 deletions(-) create mode 100755 decoder/cfg.cc create mode 100755 decoder/cfg.h create mode 100755 decoder/int_or_pointer.h diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 88d6d17a..68a7d765 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -44,6 +44,7 @@ rule_lexer.cc: rule_lexer.l noinst_LIBRARIES = libcdec.a libcdec_a_SOURCES = \ + cfg.cc \ apply_fsa_models.cc \ rule_lexer.cc \ fst_translator.cc \ diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc index 01de62d3..416b9323 100755 --- a/decoder/apply_fsa_models.cc +++ b/decoder/apply_fsa_models.cc @@ -7,6 +7,7 @@ #include "apply_models.h" #include #include +#include "cfg.h" using namespace std; @@ -15,33 +16,38 @@ struct ApplyFsa { const SentenceMetadata& smeta, const FsaFeatureFunction& fsa, DenseWeightVector const& weights, - ApplyFsaBy const& cfg, + ApplyFsaBy const& by, Hypergraph* oh) - :ih(ih),smeta(smeta),fsa(fsa),weights(weights),cfg(cfg),oh(oh) + :ih(ih),smeta(smeta),fsa(fsa),weights(weights),by(by),oh(oh) { // sparse_to_dense(weight_vector,&weights); Init(); } void Init() { - ApplyBottomUp(); - //TODO: implement l->r + if (by.IsBottomUp()) + ApplyBottomUp(); + else + ApplyEarley(); } void ApplyBottomUp() { - assert(cfg.IsBottomUp()); + assert(by.IsBottomUp()); FeatureFunctionFromFsa buff(&fsa); buff.Init(); // mandatory to call this (normally factory would do it) vector ffs(1,&buff); ModelSet models(weights, ffs); - IntersectionConfiguration i(cfg.BottomUpAlgorithm(),cfg.pop_limit); + IntersectionConfiguration i(by.BottomUpAlgorithm(),by.pop_limit); ApplyModelSet(ih,smeta,models,i,oh); } + void ApplyEarley() { + CFG cfg(ih,true,false,true); + } private: const Hypergraph& ih; const SentenceMetadata& smeta; const FsaFeatureFunction& fsa; // WeightVector weight_vector; DenseWeightVector weights; - ApplyFsaBy cfg; + ApplyFsaBy by; Hypergraph* oh; }; @@ -50,10 +56,10 @@ void ApplyFsaModels(const Hypergraph& ih, const SentenceMetadata& smeta, const FsaFeatureFunction& fsa, DenseWeightVector const& weight_vector, - ApplyFsaBy const& cfg, + ApplyFsaBy const& by, Hypergraph* oh) { - ApplyFsa a(ih,smeta,fsa,weight_vector,cfg,oh); + ApplyFsa a(ih,smeta,fsa,weight_vector,by,oh); } diff --git a/decoder/cfg.cc b/decoder/cfg.cc new file mode 100755 index 00000000..0f20ba0f --- /dev/null +++ b/decoder/cfg.cc @@ -0,0 +1,42 @@ +#include "cfg.h" +#include "hg.h" + +using namespace std; + +void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool push_weights) { + hg_=&hg; + Hypergraph::NodeProbs np; + goal_inside=hg.ComputeNodeViterbi(&np); + pushed_inside=push_weights ? goal_inside : prob_t(1); + int nn=hg.nodes_.size(),ne=hg.edges_.size(); + nts.resize(nn); + rules.resize(ne); + for (int i=0;i const& rule_rhs=target_side?er.e():er.f(); + RHS &rhs=cfgr.rhs; + int nr=rule_rhs.size(); + rhs.resize(nr); + prob_t &crp=cfgr.p; + crp=e.edge_prob_; + cfgr.lhs=e.head_node_; +#if CFG_DEBUG + cfgr.rule=e.rule_; +#endif + if (copy_features) cfgr.f=e.feature_values_; + if (push_weights) crp /=np[e.head_node_]; + for (int j=0;j0) + rhs[j]=w; + else { + int n=e.tail_nodes_[-w]; + if (push_weights) crp*=np[n]; + rhs[j]=n; + } + } + } +} diff --git a/decoder/cfg.h b/decoder/cfg.h new file mode 100755 index 00000000..9e1b2837 --- /dev/null +++ b/decoder/cfg.h @@ -0,0 +1,71 @@ +#ifndef CFG_H +#define CFG_H + +#ifndef CFG_DEBUG +# define CFG_DEBUG 1 +#endif + +/* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg. this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector). but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph. so hg pointer may be null? multiple types of CFG? always copy the feature vector? especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg + + question: how much does making a copy (essentially) of hg simplify things? is the space used worth it? is the node in/out edges index really that much of a waste? is the use of indices that annoying? + + the only thing that excites me right now about an explicit cfg is that access to the target rhs can be less painful, and binarization *on the target side* is easier to define + + using indices to refer to NTs saves space (32 bit index vs 64 bit pointer) and allows more efficient ancillary maps for e.g. chart info (if we used pointers to actual node structures, it would be tempting to add various void * or other slots for use by mapped-during-computation ephemera) + */ + +#include +#include +#include +#include "feature_vector.h" +#include "small_vector.h" +#include "wordid.h" +#include "tdict.h" +#include "trule.h" +#include "prob.h" +//#include "int_or_pointer.h" +#include "small_vector.h" + +class Hypergraph; +class CFG; + + + +struct CFG { + typedef int RuleHandle; + typedef int NTHandle; + typedef SmallVector RHS; // same as in trule rhs: >0 means token, <=0 means -node index (not variable index) + typedef std::vector Ruleids; + + struct Rule { + int lhs; // index into nts + RHS rhs; + prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top) + FeatureVector f; // may be empty, unless copy_features on Init +#if CFG_DEBUG + TRulePtr rule; // normally no use for this (waste of space) +#endif + }; + + struct NT { + Ruleids ruleids; // index into CFG rules with this lhs + }; + + CFG() : hg_() { } + + // provided hg will have weights pushed up to root + CFG(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true) { + Init(hg,target_side,copy_features,push_weights); + } + void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true); +protected: + Hypergraph const* hg_; // shouldn't be used for anything, esp. after binarization + prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside + // rules/nts will have same index as hg edges/nodes + typedef std::vector Rules; + Rules rules; + typedef std::vector NTs; + NTs nts; +}; + +#endif diff --git a/decoder/hg.cc b/decoder/hg.cc index f0238fe3..47924d99 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -147,10 +147,21 @@ struct vpusher : public vector { }; } -void Hypergraph::PushViterbiWeightsToGoal(int fid) { +prob_t Hypergraph::ComputeNodeViterbi(NodeProbs *np) const +{ + return Inside(*this, + reinterpret_cast *>(np), + ViterbiWeightFunction()).v_; +} + + +// save pushed weight ot some fid if we want. 0 = don't care +prob_t Hypergraph::PushViterbiWeightsToGoal(int fid) { vpusher vi(fid); - Inside(*this,&vi,ViterbiWeightFunction()); + NodeProbs np; + prob_t r=ComputeNodeViterbi(&np); visit_edges(vi); + return r; } diff --git a/decoder/hg.h b/decoder/hg.h index 08199eb4..f426d32f 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -42,8 +42,8 @@ public: Hypergraph() : is_linear_chain_(false) {} // SmallVector is a fast, small vector implementation for sizes <= 2 - typedef SmallVectorInt TailNodeVector; - typedef std::vector EdgesVector; + typedef SmallVectorInt TailNodeVector; // indices in nodes_ + typedef std::vector EdgesVector; // indices in edges_ // TODO get rid of cat_? // TODO keep cat_ and add span and/or state? :) @@ -51,8 +51,8 @@ public: Node() : id_(), cat_(), promise(1) {} int id_; // equal to this object's position in the nodes_ vector WordID cat_; // non-terminal category if <0, 0 if not set - EdgesVector in_edges_; // contents refer to positions in edges_ - EdgesVector out_edges_; // contents refer to positions in edges_ + EdgesVector in_edges_; // an in edge is an edge with this node as its head. (in edges come from the bottom up to us) indices in edges_ + EdgesVector out_edges_; // an out edge is an edge with this node as its tail. (out edges leave us up toward the top/goal). indices in edges_ double promise; // set in global pruning; in [0,infty) so that mean is 1. use: e.g. scale cube poplimit. //TODO: appears to be useless, compile without this? on the other hand, pretty cheap. void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting cat_=o.cat_; @@ -77,7 +77,7 @@ public: int head_node_; // refers to a position in nodes_ TailNodeVector tail_nodes_; // contents refer to positions in nodes_ TRulePtr rule_; - SparseVector feature_values_; + FeatureVector feature_values_; prob_t edge_prob_; // dot product of weights and feat_values int id_; // equal to this object's position in the edges_ vector @@ -325,6 +325,7 @@ public: } typedef std::vector EdgeProbs; + typedef std::vector NodeProbs; typedef std::vector EdgeMask; typedef std::vector NodeMask; @@ -376,8 +377,10 @@ public: void PushWeightsToGoal(double scale = 1.0); // contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid. note that p_viterbi becomes 1; k*p_viterbi becomes k. also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight) - // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after) - void PushViterbiWeightsToGoal(int fid=0); + // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after). returns inside(goal) + prob_t PushViterbiWeightsToGoal(int fid=0); + prob_t ComputeNodeViterbi(NodeProbs *np) const; + // void SortInEdgesByEdgeWeights(); // local sort only - pretty useless @@ -392,7 +395,6 @@ public: /// drop edge i if edge_margin[i] < prune_below, unless preserve_mask[i] void MarginPrune(EdgeProbs const& edge_margin,prob_t prune_below,EdgeMask const* preserve_mask=0,bool safe_inside=false,bool verbose=false); // promise[i]=((max_marginal[i]/viterbi)^power).todouble. if normalize, ensure that avg promise is 1. - typedef EdgeProbs NodeProbs; void SetPromise(NodeProbs const& inside,NodeProbs const& outside, double power=1, bool normalize=true); //TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another. at least make that an option. i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs. this would also make me more comfortable about allocating Node.promise diff --git a/decoder/int_or_pointer.h b/decoder/int_or_pointer.h new file mode 100755 index 00000000..4b6a9e4a --- /dev/null +++ b/decoder/int_or_pointer.h @@ -0,0 +1,70 @@ +#ifndef INT_OR_POINTER_H +#define INT_OR_POINTER_H + +// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd. + +// check lsb for expected tag? +#ifndef IOP_CHECK_LSB +# define IOP_CHECK_LSB 1 +#endif +#if IOP_CHECK_LSB +# define iop_assert(x) assert(x) +#else +# define iop_assert(x) +#endif + +#include +#include + +template +struct IntOrPointer { + typedef Pointed pointed_type; + typedef Int integer_type; + typedef Pointed *value_type; + typedef IntOrPointer self_type; + IntOrPointer(int j) { *this=j; } + IntOrPointer(size_t j) { *this=j; } + IntOrPointer(value_type v) { *this=v; } + bool is_integer() const { return i&1; } + bool is_pointer() const { return !(i&1); } + value_type & pointer() { return p; } + const value_type & pointer() const { iop_assert(is_pointer()); return p; } + integer_type integer() const { iop_assert(is_integer()); return i >> 1; } + void set_integer(Int j) { i=2*j+1; } + void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); } + void operator=(unsigned j) { i = 2*(integer_type)j+1; } + void operator=(int j) { i = 2*(integer_type)j+1; } + template + void operator=(C j) { i = 2*(integer_type)j+1; } + void operator=(value_type v) { p=v; } + IntOrPointer() {} + IntOrPointer(const self_type &s) : p(s.p) {} + void operator=(const self_type &s) { p=s.p; } + template + bool operator ==(C* v) const { return p==v; } + template + bool operator ==(const C* v) const { return p==v; } + template + bool operator ==(C j) const { return integer() == j; } + bool operator ==(self_type s) const { return p==s.p; } + bool operator !=(self_type s) const { return p!=s.p; } + template void print(O&o) const + { + if (is_integer()) + o << integer(); + else { + o << "0x" << std::hex << (size_t)pointer() << std::dec; + } + } + friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) { + s.print(o); return o; + } +protected: + union { + value_type p; // must be even (guaranteed unless you're pointing at packed chars) + integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type + }; +}; + + +#endif diff --git a/graehl/NOTES.earley b/graehl/NOTES.earley index 73727a54..9b8bf1fc 100755 --- a/graehl/NOTES.earley +++ b/graehl/NOTES.earley @@ -1,3 +1,16 @@ +1. fsts (modify target string produced) are quite feasible. augment fsa ff to not just emit features, but also new target words. composition or intersection is no problem (can always bunch into a single FSA/FST lazily by wrapping) + +2. sparse fsas (many transitions have -inf score) aren't efficiently supported presently (they are in early_composer where the fsa is a phrase table); the fsa ff interface doesn't even provide a way to query the non-0 transitions (you have to emit a -inf feature). if sparse fsas were expected often and we wanted exact search, then a similar index of the tcfg as in earley_composer would make sense. however, with prob. beam search, we prune out bad-scoring stuff anyway + +3. binarization of rhs isn't usually considered necessary in earley, but i liked the idea of optimal binarization making the most sharing possible. however, this means what would have just been a scan is now a scan+complete. + +4. prefix (forward) + inside cost. this is phrased in a way so that prefix includes inside. but there's no reason not to think of it in exclusive terms (outside,inside) where prefix=outside*inside when using viterbi. on the other hand, we usually use the outside*inside as the beam score. and furthermore, it looks like when summing over all derivations, there would be some difficulty calculating, as the total inside wouldn't be known at first. + +(a,i) r => (+=a*r,r) would become (o,i) r => (+=[(o*i*r)/r],r) = (+=o*i,r) +(_,b'') (a,b) => (+=a*b'',+=b*b'') would become (_,b'') (o,b) => (?????) + +==== + the target CFG (tcfg) is finite - absolutely no cycles. conceptually we're intersecting it with a wfsa (weights are feature vectors), which is a lot like parsing a lattice, in that states are (source state, dest state) pairs and you're covering some string(s) that go from source->dest in the wfsa. -- cgit v1.2.3