author     graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-08-08 08:49:13 +0000
committer  graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-08-08 08:49:13 +0000
commit     ffd4acfe5b84f413c66af6ec5a76bdbc0d3aa9e8 (patch)
tree       e5e6f9063e200a7bee863fa34557007d5271a22f
parent     b588990fe985be78c05a9497d7f95b440d454cd3 (diff)
cfg
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@493 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--  decoder/Makefile.am          |  1
-rwxr-xr-x  decoder/apply_fsa_models.cc  | 24
-rwxr-xr-x  decoder/cfg.cc               | 42
-rwxr-xr-x  decoder/cfg.h                | 71
-rw-r--r--  decoder/hg.cc                | 15
-rw-r--r--  decoder/hg.h                 | 18
-rwxr-xr-x  decoder/int_or_pointer.h     | 70
-rwxr-xr-x  graehl/NOTES.earley          | 13
8 files changed, 235 insertions, 19 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 88d6d17a..68a7d765 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -44,6 +44,7 @@ rule_lexer.cc: rule_lexer.l
noinst_LIBRARIES = libcdec.a
libcdec_a_SOURCES = \
+ cfg.cc \
apply_fsa_models.cc \
rule_lexer.cc \
fst_translator.cc \
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc
index 01de62d3..416b9323 100755
--- a/decoder/apply_fsa_models.cc
+++ b/decoder/apply_fsa_models.cc
@@ -7,6 +7,7 @@
#include "apply_models.h"
#include <stdexcept>
#include <cassert>
+#include "cfg.h"
using namespace std;
@@ -15,33 +16,38 @@ struct ApplyFsa {
const SentenceMetadata& smeta,
const FsaFeatureFunction& fsa,
DenseWeightVector const& weights,
- ApplyFsaBy const& cfg,
+ ApplyFsaBy const& by,
Hypergraph* oh)
- :ih(ih),smeta(smeta),fsa(fsa),weights(weights),cfg(cfg),oh(oh)
+ :ih(ih),smeta(smeta),fsa(fsa),weights(weights),by(by),oh(oh)
{
// sparse_to_dense(weight_vector,&weights);
Init();
}
void Init() {
- ApplyBottomUp();
- //TODO: implement l->r
+ if (by.IsBottomUp())
+ ApplyBottomUp();
+ else
+ ApplyEarley();
}
void ApplyBottomUp() {
- assert(cfg.IsBottomUp());
+ assert(by.IsBottomUp());
FeatureFunctionFromFsa<FsaFeatureFunctionFwd> buff(&fsa);
buff.Init(); // mandatory to call this (normally factory would do it)
vector<const FeatureFunction*> ffs(1,&buff);
ModelSet models(weights, ffs);
- IntersectionConfiguration i(cfg.BottomUpAlgorithm(),cfg.pop_limit);
+ IntersectionConfiguration i(by.BottomUpAlgorithm(),by.pop_limit);
ApplyModelSet(ih,smeta,models,i,oh);
}
+ void ApplyEarley() {
+ CFG cfg(ih,true,false,true);
+ }
private:
const Hypergraph& ih;
const SentenceMetadata& smeta;
const FsaFeatureFunction& fsa;
// WeightVector weight_vector;
DenseWeightVector weights;
- ApplyFsaBy cfg;
+ ApplyFsaBy by;
Hypergraph* oh;
};
@@ -50,10 +56,10 @@ void ApplyFsaModels(const Hypergraph& ih,
const SentenceMetadata& smeta,
const FsaFeatureFunction& fsa,
DenseWeightVector const& weight_vector,
- ApplyFsaBy const& cfg,
+ ApplyFsaBy const& by,
Hypergraph* oh)
{
- ApplyFsa a(ih,smeta,fsa,weight_vector,cfg,oh);
+ ApplyFsa a(ih,smeta,fsa,weight_vector,by,oh);
}
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
new file mode 100755
index 00000000..0f20ba0f
--- /dev/null
+++ b/decoder/cfg.cc
@@ -0,0 +1,42 @@
+#include "cfg.h"
+#include "hg.h"
+
+using namespace std;
+
+void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool push_weights) {
+ hg_=&hg;
+ Hypergraph::NodeProbs np;
+ goal_inside=hg.ComputeNodeViterbi(&np);
+ pushed_inside=push_weights ? goal_inside : prob_t(1);
+ int nn=hg.nodes_.size(),ne=hg.edges_.size();
+ nts.resize(nn);
+ rules.resize(ne);
+ for (int i=0;i<nn;++i)
+ nts[i].ruleids=hg.nodes_[i].in_edges_;
+ for (int i=0;i<ne;++i) {
+ Rule &cfgr=rules[i];
+ Hypergraph::Edge const& e=hg.edges_[i];
+ TRule const& er=*e.rule_; vector<WordID> const& rule_rhs=target_side?er.e():er.f();
+ RHS &rhs=cfgr.rhs;
+ int nr=rule_rhs.size();
+ rhs.resize(nr);
+ prob_t &crp=cfgr.p;
+ crp=e.edge_prob_;
+ cfgr.lhs=e.head_node_;
+#if CFG_DEBUG
+ cfgr.rule=e.rule_;
+#endif
+ if (copy_features) cfgr.f=e.feature_values_;
+ if (push_weights) crp /=np[e.head_node_];
+ for (int j=0;j<nr;++j) {
+ WordID w=rule_rhs[j];
+ if (w>0)
+ rhs[j]=w;
+ else {
+ int n=e.tail_nodes_[-w];
+ if (push_weights) crp*=np[n];
+ rhs[j]=n;
+ }
+ }
+ }
+}
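
The weight pushing in CFG::Init above divides each rule's probability by the viterbi inside score of its head node and multiplies in its tails' scores. A minimal self-contained sketch of that arithmetic, with plain doubles standing in for prob_t and all names illustrative rather than taken from the code:

  #include <cassert>
  #include <vector>

  // mirrors the crp updates in CFG::Init: crp /= np[head]; crp *= np[tail] for each nonterminal tail
  double pushed_rule_prob(const std::vector<double>& np, double edge_prob,
                          int head, const std::vector<int>& tails) {
    double crp = edge_prob / np[head];
    for (size_t j = 0; j < tails.size(); ++j)
      crp *= np[tails[j]];
    return crp; // <= 1; edges on a viterbi (best) derivation get exactly 1
  }

  int main() {
    // toy forest: node 0 is a leaf with best inside 0.5; node 1 heads one edge with prob 0.4,
    // so its best inside is 0.4 * 0.5 = 0.2
    std::vector<double> np;
    np.push_back(0.5);
    np.push_back(0.2);
    std::vector<int> tails(1, 0);
    double crp = pushed_rule_prob(np, 0.4, 1, tails);
    assert(crp > 0.999 && crp < 1.001); // the best edge's pushed probability is 1
    return 0;
  }

Because the per-node factors telescope along a derivation, the product of pushed rule probabilities over the viterbi derivation is exactly 1, which is the invariant the hg.h comment on PushViterbiWeightsToGoal (later in this patch) describes.
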
diff --git a/decoder/cfg.h b/decoder/cfg.h
new file mode 100755
index 00000000..9e1b2837
--- /dev/null
+++ b/decoder/cfg.h
@@ -0,0 +1,71 @@
+#ifndef CFG_H
+#define CFG_H
+
+#ifndef CFG_DEBUG
+# define CFG_DEBUG 1
+#endif
+
+/* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg. this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector). but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph. so hg pointer may be null? multiple types of CFG? always copy the feature vector? especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg
+
+ question: how much does making a copy (essentially) of hg simplify things? is the space used worth it? is the node in/out edges index really that much of a waste? is the use of indices that annoying?
+
+ the only thing that excites me right now about an explicit cfg is that access to the target rhs can be less painful, and binarization *on the target side* is easier to define
+
+ using indices to refer to NTs saves space (32 bit index vs 64 bit pointer) and allows more efficient ancillary maps for e.g. chart info (if we used pointers to actual node structures, it would be tempting to add various void * or other slots for use by mapped-during-computation ephemera)
+ */
+
+#include <sstream>
+#include <string>
+#include <vector>
+#include "feature_vector.h"
+#include "small_vector.h"
+#include "wordid.h"
+#include "tdict.h"
+#include "trule.h"
+#include "prob.h"
+//#include "int_or_pointer.h"
+#include "small_vector.h"
+
+class Hypergraph;
+class CFG;
+
+
+
+struct CFG {
+ typedef int RuleHandle;
+ typedef int NTHandle;
+ typedef SmallVector<WordID> RHS; // same as in trule rhs: >0 means token, <=0 means -node index (not variable index)
+ typedef std::vector<RuleHandle> Ruleids;
+
+ struct Rule {
+ int lhs; // index into nts
+ RHS rhs;
+ prob_t p; // h unused for now (there's nothing admissible, and p already uses the 1st-pass inside scores pushed toward the top)
+ FeatureVector f; // may be empty, unless copy_features on Init
+#if CFG_DEBUG
+ TRulePtr rule; // normally no use for this (waste of space)
+#endif
+ };
+
+ struct NT {
+ Ruleids ruleids; // index into CFG rules with this lhs
+ };
+
+ CFG() : hg_() { }
+
+ // provided hg will have weights pushed up to root
+ CFG(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true) {
+ Init(hg,target_side,copy_features,push_weights);
+ }
+ void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true);
+protected:
+ Hypergraph const* hg_; // shouldn't be used for anything, esp. after binarization
+ prob_t goal_inside,pushed_inside; // when we push viterbi weights to goal, we store the removed probability in pushed_inside
+ // rules/nts will have same index as hg edges/nodes
+ typedef std::vector<Rule> Rules;
+ Rules rules;
+ typedef std::vector<NT> NTs;
+ NTs nts;
+};
+
+#endif
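
Since rules and nts are protected, the class as committed is only constructible and then consumed by code extending it (the planned earley intersection). A minimal usage sketch under that assumption, mirroring the ApplyEarley() stub earlier in this patch:

  #include "cfg.h"
  #include "hg.h"

  // assumes a Hypergraph that already has edge probabilities set (e.g. the decoder's forest)
  void build_target_cfg(Hypergraph const& hg) {
    // target-side projection, no per-rule feature copies, viterbi weights pushed toward the goal
    CFG cfg(hg, /*target_side=*/true, /*copy_features=*/false, /*push_weights=*/true);
    // rule i corresponds to hg.edges_[i] and nonterminal i to hg.nodes_[i], per the comment above
  }
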
diff --git a/decoder/hg.cc b/decoder/hg.cc
index f0238fe3..47924d99 100644
--- a/decoder/hg.cc
+++ b/decoder/hg.cc
@@ -147,10 +147,21 @@ struct vpusher : public vector<TropicalValue> {
};
}
-void Hypergraph::PushViterbiWeightsToGoal(int fid) {
+prob_t Hypergraph::ComputeNodeViterbi(NodeProbs *np) const
+{
+ return Inside(*this,
+ reinterpret_cast<std::vector<TropicalValue> *>(np),
+ ViterbiWeightFunction()).v_;
+}
+
+
+// save pushed weight to some fid if we want. 0 = don't care
+prob_t Hypergraph::PushViterbiWeightsToGoal(int fid) {
vpusher vi(fid);
- Inside(*this,&vi,ViterbiWeightFunction());
+ NodeProbs np;
+ prob_t r=ComputeNodeViterbi(&np);
visit_edges(vi);
+ return r;
}
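
A short usage sketch of the new ComputeNodeViterbi accessor (variable names here are illustrative; the function itself is the one added above):

  #include "hg.h"

  prob_t goal_viterbi(Hypergraph const& hg) {
    Hypergraph::NodeProbs np;                  // one prob_t per node, indexed like hg.nodes_
    prob_t best = hg.ComputeNodeViterbi(&np);  // viterbi (best-derivation) inside score of the goal
    // np[i] is the best score of any subderivation rooted at node i; CFG::Init earlier in this
    // patch consumes exactly these values when push_weights is true and keeps 'best' as goal_inside
    return best;
  }
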
diff --git a/decoder/hg.h b/decoder/hg.h
index 08199eb4..f426d32f 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -42,8 +42,8 @@ public:
Hypergraph() : is_linear_chain_(false) {}
// SmallVector is a fast, small vector<int> implementation for sizes <= 2
- typedef SmallVectorInt TailNodeVector;
- typedef std::vector<int> EdgesVector;
+ typedef SmallVectorInt TailNodeVector; // indices in nodes_
+ typedef std::vector<int> EdgesVector; // indices in edges_
// TODO get rid of cat_?
// TODO keep cat_ and add span and/or state? :)
@@ -51,8 +51,8 @@ public:
Node() : id_(), cat_(), promise(1) {}
int id_; // equal to this object's position in the nodes_ vector
WordID cat_; // non-terminal category if <0, 0 if not set
- EdgesVector in_edges_; // contents refer to positions in edges_
- EdgesVector out_edges_; // contents refer to positions in edges_
+ EdgesVector in_edges_; // an in edge is an edge with this node as its head (in edges come up to us from the bottom); indices into edges_
+ EdgesVector out_edges_; // an out edge is an edge with this node in its tail (out edges leave us, up toward the top/goal); indices into edges_
double promise; // set in global pruning; in [0,infty) so that mean is 1. use: e.g. scale cube poplimit. //TODO: appears to be useless, compile without this? on the other hand, pretty cheap.
void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting
cat_=o.cat_;
@@ -77,7 +77,7 @@ public:
int head_node_; // refers to a position in nodes_
TailNodeVector tail_nodes_; // contents refer to positions in nodes_
TRulePtr rule_;
- SparseVector<double> feature_values_;
+ FeatureVector feature_values_;
prob_t edge_prob_; // dot product of weights and feat_values
int id_; // equal to this object's position in the edges_ vector
@@ -325,6 +325,7 @@ public:
}
typedef std::vector<prob_t> EdgeProbs;
+ typedef std::vector<prob_t> NodeProbs;
typedef std::vector<bool> EdgeMask;
typedef std::vector<bool> NodeMask;
@@ -376,8 +377,10 @@ public:
void PushWeightsToGoal(double scale = 1.0);
// contrary to PushWeightsToGoal, use viterbi semiring; store log(p) to fid. note that p_viterbi becomes 1; k*p_viterbi becomes k. also modifies edge_prob_ (note that the fid stored log(p) will stick around even if you reweight)
- // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after)
- void PushViterbiWeightsToGoal(int fid=0);
+ // afterwards, product of edge_prob_ for a derivation will equal 1 for the viterbi (p_v before, 1 after), and in general (k*p_v before, k after). returns inside(goal)
+ prob_t PushViterbiWeightsToGoal(int fid=0);
+ prob_t ComputeNodeViterbi(NodeProbs *np) const;
+
// void SortInEdgesByEdgeWeights(); // local sort only - pretty useless
@@ -392,7 +395,6 @@ public:
/// drop edge i if edge_margin[i] < prune_below, unless preserve_mask[i]
void MarginPrune(EdgeProbs const& edge_margin,prob_t prune_below,EdgeMask const* preserve_mask=0,bool safe_inside=false,bool verbose=false);
// promise[i]=((max_marginal[i]/viterbi)^power).todouble. if normalize, ensure that avg promise is 1.
- typedef EdgeProbs NodeProbs;
void SetPromise(NodeProbs const& inside,NodeProbs const& outside, double power=1, bool normalize=true);
//TODO: in my opinion, looking at the ratio of logprobs (features \dot weights) rather than the absolute difference generalizes more nicely across sentence lengths and weight vectors that are constant multiples of one another. at least make that an option. i worked around this a little in cdec by making "beam alpha per source word" but that's not helping with different tuning runs. this would also make me more comfortable about allocating Node.promise
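
To make the PushViterbiWeightsToGoal contract above concrete, a hedged before/after sketch (it assumes the ViterbiESentence helper from viterbi.h; treat that call as an assumption rather than part of this patch):

  #include <iostream>
  #include <vector>
  #include "hg.h"
  #include "viterbi.h"  // assumed: prob_t ViterbiESentence(const Hypergraph&, std::vector<WordID>*)

  void push_and_report(Hypergraph& hg) {
    std::vector<WordID> words;
    prob_t before  = ViterbiESentence(hg, &words);   // p_viterbi of the best derivation
    prob_t removed = hg.PushViterbiWeightsToGoal();  // returns inside(goal) == p_viterbi
    prob_t after   = ViterbiESentence(hg, &words);   // ~1 now: every edge_prob_ was rescaled
    // before ~= removed * after: the mass taken out of the edges is exactly the returned
    // inside(goal), so multiply it back in if the original model score is needed later
    std::cerr << "pushed out " << removed << "; best derivation now scores " << after
              << " (was " << before << ")\n";
  }
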
diff --git a/decoder/int_or_pointer.h b/decoder/int_or_pointer.h
new file mode 100755
index 00000000..4b6a9e4a
--- /dev/null
+++ b/decoder/int_or_pointer.h
@@ -0,0 +1,70 @@
+#ifndef INT_OR_POINTER_H
+#define INT_OR_POINTER_H
+
+// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd.
+
+// check lsb for expected tag?
+#ifndef IOP_CHECK_LSB
+# define IOP_CHECK_LSB 1
+#endif
+#if IOP_CHECK_LSB
+# define iop_assert(x) assert(x)
+#else
+# define iop_assert(x)
+#endif
+
+#include <assert.h>
+#include <iostream>
+
+template <class Pointed=void,class Int=size_t>
+struct IntOrPointer {
+ typedef Pointed pointed_type;
+ typedef Int integer_type;
+ typedef Pointed *value_type;
+ typedef IntOrPointer<Pointed,Int> self_type;
+ IntOrPointer(int j) { *this=j; }
+ IntOrPointer(size_t j) { *this=j; }
+ IntOrPointer(value_type v) { *this=v; }
+ bool is_integer() const { return i&1; }
+ bool is_pointer() const { return !(i&1); }
+ value_type & pointer() { return p; }
+ const value_type & pointer() const { iop_assert(is_pointer()); return p; }
+ integer_type integer() const { iop_assert(is_integer()); return i >> 1; }
+ void set_integer(Int j) { i=2*j+1; }
+ void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); }
+ void operator=(unsigned j) { i = 2*(integer_type)j+1; }
+ void operator=(int j) { i = 2*(integer_type)j+1; }
+ template <class C>
+ void operator=(C j) { i = 2*(integer_type)j+1; }
+ void operator=(value_type v) { p=v; }
+ IntOrPointer() {}
+ IntOrPointer(const self_type &s) : p(s.p) {}
+ void operator=(const self_type &s) { p=s.p; }
+ template <class C>
+ bool operator ==(C* v) const { return p==v; }
+ template <class C>
+ bool operator ==(const C* v) const { return p==v; }
+ template <class C>
+ bool operator ==(C j) const { return integer() == j; }
+ bool operator ==(self_type s) const { return p==s.p; }
+ bool operator !=(self_type s) const { return p!=s.p; }
+ template <class O> void print(O&o) const
+ {
+ if (is_integer())
+ o << integer();
+ else {
+ o << "0x" << std::hex << (size_t)pointer() << std::dec;
+ }
+ }
+ friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) {
+ s.print(o); return o;
+ }
+protected:
+ union {
+ value_type p; // must be even (guaranteed unless you're pointing at packed chars)
+ integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type
+ };
+};
+
+
+#endif
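
A small self-contained exercise of the tagged union above (Thing is made up for illustration; any pointee type whose alignment is greater than one keeps the low pointer bit zero, which is the "pointers are never odd" assumption):

  #include <cassert>
  #include "int_or_pointer.h"

  struct Thing { int payload; };  // illustrative pointee

  int main() {
    IntOrPointer<Thing> v;
    v = 7;                        // stored internally as 2*7+1, so the low bit marks "integer"
    assert(v.is_integer() && v.integer() == 7);
    Thing t;
    v = &t;                       // stored as the raw (even) pointer, low bit 0
    assert(v.is_pointer() && v.pointer() == &t);
    return 0;
  }
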
diff --git a/graehl/NOTES.earley b/graehl/NOTES.earley
index 73727a54..9b8bf1fc 100755
--- a/graehl/NOTES.earley
+++ b/graehl/NOTES.earley
@@ -1,3 +1,16 @@
+1. fsts (modify target string produced) are quite feasible. augment fsa ff to not just emit features, but also new target words. composition or intersection is no problem (can always bunch into a single FSA/FST lazily by wrapping)
+
+2. sparse fsas (many transitions have -inf score) aren't efficiently supported at present (they are in earley_composer, where the fsa is a phrase table); the fsa ff interface doesn't even provide a way to query the non-0 transitions (you have to emit a -inf feature). if sparse fsas were expected often and we wanted exact search, then a similar index of the tcfg as in earley_composer would make sense. however, with prob. beam search, we prune out bad-scoring stuff anyway
+
+3. binarization of rhs isn't usually considered necessary in earley, but i liked the idea of optimal binarization making the most sharing possible. however, this means what would have just been a scan is now a scan+complete.
+
+4. prefix (forward) + inside cost. this is phrased so that the prefix cost includes the inside cost. but there's no reason not to think of it in exclusive terms (outside,inside), where prefix=outside*inside when using viterbi. on the other hand, we usually use outside*inside as the beam score. furthermore, it looks like when summing over all derivations there would be some difficulty calculating this, since the total inside wouldn't be known at first.
+
+(a,i) r => (+=a*r,r) would become (o,i) r => (+=[(o*i*r)/r],r) = (+=o*i,r)
+(_,b'') (a,b) => (+=a*b'',+=b*b'') would become (_,b'') (o,b) => (?????)
+
+====
+
the target CFG (tcfg) is finite - absolutely no cycles. conceptually we're intersecting it with a wfsa (weights are feature vectors), which is a lot like parsing a lattice, in that states are (source state, dest state) pairs and you're covering some string(s) that go from source->dest in the wfsa.
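
For concreteness, a minimal sketch of the item shape this paragraph implies for intersecting the tcfg with the wfsa; everything below is illustrative (none of these names exist in the code yet), using CFG::RuleHandle from cfg.h:

  #include "cfg.h"
  #include "prob.h"

  // an earley item over the tcfg: a dotted CFG rule plus the pair of wfsa states it spans
  struct Item {
    CFG::RuleHandle rule;   // which tcfg rule
    int dot;                // how much of its rhs has been covered so far
    int fsa_from, fsa_to;   // wfsa (source state, dest state) pair, as described above
    prob_t w;               // viterbi score, prefix-including-inside style per note 4
  };
  // scan: if rhs[dot] is a terminal and the wfsa has an arc fsa_to -> q over it, advance dot,
  //       set fsa_to = q, and multiply the arc weight (a feature-weighted score) into w.
  // complete: a finished item [B -> gamma ., q, r] advances any [A -> alpha . B beta, p, q]
  //       to [A -> alpha B . beta, p, r], multiplying in the finished item's inside score.
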