summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xdecoder/apply_fsa_models.cc158
-rwxr-xr-xdecoder/cfg.cc21
-rwxr-xr-xutils/agenda.h95
-rwxr-xr-xutils/best.h13
-rwxr-xr-xutils/hash.h6
-rwxr-xr-xutils/lvalue_pmap.h25
-rwxr-xr-xutils/null_traits.h26
-rwxr-xr-xutils/value_array.h30
8 files changed, 329 insertions, 45 deletions
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc
index 7cd5fc6d..f9c94ec3 100755
--- a/decoder/apply_fsa_models.cc
+++ b/decoder/apply_fsa_models.cc
@@ -13,6 +13,8 @@
#include "utoa.h"
#include "hash.h"
#include "value_array.h"
+#include "d_ary_heap.h"
+#include "agenda.h"
#define DFSA(x) x
#define DPFSA(x) x
@@ -72,12 +74,15 @@ struct get_second {
struct PrefixTrieNode;
struct PrefixTrieEdge {
+// PrefixTrieEdge() { }
+// explicit PrefixTrieEdge(prob_t p) : p(p),dest(0) { }
prob_t p;// viterbi additional prob, i.e. product over path incl. p_final = total rule prob
//DPFSA()
// we can probably just store deltas, but for debugging remember the full p
// prob_t delta; //
PrefixTrieNode *dest;
- WordID w; // for lhs, this will be positive NTHandle instead
+ bool is_final() const { return dest==0; }
+ WordID w; // for lhs, this will be nonneg NTHandle instead. // not set if is_final() // actually, set to lhs nt index
// for sorting most probable first in adj; actually >(p)
inline bool operator <(PrefixTrieEdge const& o) const {
@@ -88,33 +93,55 @@ struct PrefixTrieEdge {
struct PrefixTrieNode {
prob_t p; // viterbi (max prob) of rule this node leads to - when building. telescope later onto edges for best-first.
#if TRIE_START_LHS
- bool final; // may also have successors, of course
- prob_t p_final; // additional prob beyond what we already paid. while building, this is the total prob
+// bool final; // may also have successors, of course. we don't really need to track this; a null dest edge in the adj list lets us encounter the fact in best first order.
+// prob_t p_final; // additional prob beyond what we already paid. while building, this is the total prob
+// instead of storing final, we'll say that an edge with a NULL dest is a final edge. this way it gets sorted into the list of adj.
+
// instead of completed map, we have trie start w/ lhs.
- NTHandle lhs; // instead of storing this in Item.
+ NTHandle lhs; // nonneg. - instead of storing this in Item.
#else
typedef FSA_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS
Completed completed;
#endif
- explicit PrefixTrieNode(prob_t p=1) : p(p),final(false) { }
+ enum { ROOT=-1 };
+ explicit PrefixTrieNode(NTHandle lhs=ROOT,prob_t p=1) : p(p),lhs(lhs) {
+ //final=false;
+ }
+ bool is_root() const { return lhs==ROOT; } // means adj are the nonneg lhs indices, and we have the index edge_for still available
// outgoing edges will be ordered highest p to worst p
typedef FSA_MAP(WordID,PrefixTrieEdge) PrefixTrieEdgeFor;
public:
PrefixTrieEdgeFor edge_for; //TODO: move builder elsewhere? then need 2nd hash or edge include pointer to builder. just clear this later
+ bool have_adj() const {
+ return adj.size()>=edge_for.size();
+ }
+ bool no_adj() const {
+ return adj.empty();
+ }
+
void index_adj() {
index_adj(edge_for);
}
-
template <class M>
void index_adj(M &m) {
+ assert(have_adj());
m.clear();
for (int i=0;i<adj.size();++i) {
PrefixTrieEdge const& e=adj[i];
m[e.w]=e;
}
}
+ template <class PV>
+ void index_root(PV &v) {
+ v.resize(adj.size());
+ for (int i=0,e=adj.size();i!=e;++i) {
+      PrefixTrieEdge const& edge=adj[i];
+      // assert(edge.p.is_1()); // actually, after done_building, edge will have telescoped dest->p/p.
+      v[edge.w]=edge.dest;
+ }
+ }
// call only once.
void done_building_r() {
@@ -124,18 +151,18 @@ public:
}
// for done_building; compute incremental (telescoped) edge p
- PrefixTrieEdge const& operator()(PrefixTrieEdgeFor::value_type &pair) const {
- PrefixTrieEdge &e=pair.second;
+ PrefixTrieEdge const& operator()(PrefixTrieEdgeFor::value_type const& pair) const {
+ PrefixTrieEdge &e=const_cast<PrefixTrieEdge&>(pair.second);
e.p=(e.dest->p)/p;
return e;
}
// call only once.
void done_building() {
- adj.reinit_map(edge_for.begin(),edge_for.end(),*this);
- if (final)
- p_final/=p;
+ adj.reinit_map(edge_for,*this);
+// if (final) p_final/=p;
std::sort(adj.begin(),adj.end());
+ //TODO: store adjacent differences on edges (compared to
}
typedef ValueArray<PrefixTrieEdge> Adj;
@@ -143,8 +170,6 @@ public:
Adj adj;
typedef WordID W;
- typedef NTHandle N; // not negative
- typedef W const* RI;
// let's compute p_min so that every rule reachable from the created node has p at least this low.
PrefixTrieNode *improve_edge(PrefixTrieEdge const& e,prob_t rulep) {
@@ -153,28 +178,46 @@ public:
return d;
}
- PrefixTrieNode *build(W w,prob_t rulep) {
+ inline PrefixTrieNode *build(W w,prob_t rulep) {
+ return build(lhs,w,rulep);
+ }
+ inline PrefixTrieNode *build_lhs(NTHandle w,prob_t rulep) {
+ return build(w,w,rulep);
+ }
+
+ PrefixTrieNode *build(NTHandle lhs_,W w,prob_t rulep) {
PrefixTrieEdgeFor::iterator i=edge_for.find(w);
if (i!=edge_for.end())
return improve_edge(i->second,rulep);
PrefixTrieEdge &e=edge_for[w];
- return e.dest=new PrefixTrieNode(rulep);
+ return e.dest=new PrefixTrieNode(lhs_,rulep);
}
- void set_final(prob_t pf) {
- final=true;p_final=pf;
+ void set_final(NTHandle lhs_,prob_t pf) {
+ assert(no_adj());
+// final=true; // don't really need to track this.
+ PrefixTrieEdge &e=edge_for[-1];
+ e.p=pf;
+ e.dest=0;
+ e.w=lhs_;
+ if (pf>p)
+ p=pf;
}
-#ifdef HAVE_TAIL_RECURSE
- // add string i...end
- void build(RI i,RI end, prob_t rulep) {
- if (i==end) {
- set_final(rulep);
- } else
- // tail recursion:
- build(*i)->build(i+1,end,rulep);
+private:
+ void destroy_children() {
+ assert(adj.size()>=edge_for.size());
+ for (int i=0,e=adj.size();i<e;++i) {
+ PrefixTrieNode *c=adj[i].dest;
+ if (c) { // final state has no end
+ delete c;
+ }
+ }
+ }
+public:
+ ~PrefixTrieNode() {
+ destroy_children();
}
-#endif
};
#if TRIE_START_LHS
@@ -200,34 +243,77 @@ struct PrefixTrie {
}
void operator()(int ri) const {
Rule const& r=rules()[ri];
+ NTHandle lhs=r.lhs;
prob_t p=r.p;
- PrefixTrieNode *n=const_cast<PrefixTrieNode&>(root).build(r.lhs,p);
+ PrefixTrieNode *n=const_cast<PrefixTrieNode&>(root).build_lhs(lhs,p);
for (RHS::const_iterator i=r.rhs.begin(),e=r.rhs.end();;++i) {
if (i==e) {
- n->set_final(p);
+ n->set_final(lhs,p);
break;
}
n=n->build(*i,p);
}
-#ifdef HAVE_TAIL_RECURSE
- root.build(r.lhs,r.p)->build(r.rhs,r.p);
-#endif
+// root.build(lhs,r.p)->build(r.rhs,r.p);
}
+
+
};
-// these should go in a global best-first queue
+typedef std::size_t ItemHash;
+
struct Item {
- prob_t forward;
+ explicit Item(PrefixTrieNode *dot,int next=0) : dot(dot),next(next) { }
+ PrefixTrieNode *dot; // dot is a function of the stuff already recognized, and gives a set of suffixes y to complete to finish a rhs for lhs() -> dot y. for a lhs A -> . *, this will point to lh2[A]
+ int next; // index of dot->adj to complete (if dest==0), or predict (if NT), or scan (if word)
+ NTHandle lhs() const { return dot->lhs; }
+ inline ItemHash hash() const {
+ return GOLDEN_MEAN_FRACTION*next^((ItemHash)dot>>4); // / sizeof(PrefixTrieNode), approx., i.e. lower order bits of ptr are nonrandom
+ }
+};
+
+inline ItemHash hash_value(Item const& x) {
+ return x.hash();
+}
+
+Item null_item((PrefixTrieNode*)0);
+
+// these should go in a global best-first queue
+struct ItemP {
+ ItemP() : forward(init_0()),inner(init_0()) { }
+ prob_t forward; // includes inner prob.
// NOTE: sum = viterbi (max)
/* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all
constrained paths of length that end in state X[k]->x.y*/
prob_t inner;
/* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all
paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */
- PrefixTrieNode *dot; // dot is a function of the stuff already recognized, and gives a set of suffixes y to complete to finish a rhs for lhs() -> dot y
- NTHandle lhs() const { return dot->lhs; }
};
+struct Chart {
+ //Agenda<Item> a;
+ //typedef HASH_MAP(Item,ItemP,boost::hash<Item>) Items;
+ //typedef Items::iterator FindItem;
+ //typedef std::pair<FindItem,bool> InsertItem;
+// Items items;
+ CFG &cfg; // TODO: remove this from Chart
+ NTHandle goal_nt;
+ PrefixTrie trie;
+ typedef std::vector<PrefixTrieNode *> LhsToTrie; // will have to check lhs2[lhs].p for best cost of some rule with that lhs, then use edge deltas after? they're just caching a very cheap computation, really
+ LhsToTrie lhs2; // no reason to use a map or hash table; every NT in the CFG will have some rule rhses. lhs_to_trie[i]=root.edge_for[i], i.e. we still have a root trie node conceptually, we just access through this since it's faster.
+
+ void enqueue(Item const& item,ItemP const& p) {
+// FindItem f=items.find(item);
+// if (f==items.end()) ;
+
+ }
+
+ Chart(CFG &cfg) :cfg(cfg),trie(cfg) {
+ goal_nt=cfg.goal_nt;
+ trie.root.index_root(lhs2);
+ }
+};
+
+
}//anon ns
@@ -279,7 +365,7 @@ void ApplyFsa::ApplyBottomUp()
void ApplyFsa::ApplyEarley()
{
hgcfg.GiveCFG(cfg);
- PrefixTrie rt(cfg);
+ Chart chart(cfg);
// don't need to uniq - option to do that already exists in cfg_options
//TODO:
}
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index b2219193..651978d2 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -8,6 +8,7 @@
#include "fast_lexical_cast.hpp"
//#include "indices_after.h"
#include "show.h"
+#include "null_traits.h"
#define DUNIQ(x) x
#define DBIN(x)
@@ -31,6 +32,7 @@ using namespace std;
typedef CFG::Rule Rule;
typedef CFG::NTOrder NTOrder;
typedef CFG::RHS RHS;
+typedef CFG::BinRhs BinRhs;
/////index ruleids:
void CFG::UnindexRules() {
@@ -166,11 +168,11 @@ void CFG::SortLocalBestFirst(NTHandle ni) {
/////binarization:
namespace {
-CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
+BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
// index i >= N.size()? then it's in M[i-N.size()]
//WordID first,WordID second,
-string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
+string BinStr(BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
{
int nn=N.size();
ostringstream o;
@@ -203,7 +205,7 @@ string BinStr(RHS const& r,CFG::NTs const& N,CFG::NTs const& M)
}
-WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
+WordID BinName(BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
{
return TD::Convert(BinStr(b,N,M));
}
@@ -213,22 +215,27 @@ WordID BinName(RHS const& b,CFG::NTs const& N,CFG::NTs const& M)
return TD::Convert(BinStr(b,N,M));
}
+/*
template <class Rhs>
struct null_for;
-typedef CFG::BinRhs BinRhs;
template <>
struct null_for<BinRhs> {
static BinRhs null;
};
-BinRhs null_for<BinRhs>::null(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
template <>
struct null_for<RHS> {
static RHS null;
};
-RHS null_for<RHS>::null(1,std::numeric_limits<int>::min());
+*/
+
+template <>
+BinRhs null_traits<BinRhs>::null(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
+
+template <>
+RHS null_traits<RHS>::null(1,std::numeric_limits<int>::min());
template <class Rhs>
struct add_virtual_rules {
@@ -243,7 +250,7 @@ struct add_virtual_rules {
R2L rhs2lhs; // an rhs maps to this -virtntid, or original id if length 1
bool name_nts;
add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(-nts.size()),newruleid(rules.size()),name_nts(name_nts) {
- HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null);
+ HASH_MAP_EMPTY(rhs2lhs,null_traits<Rhs>::null);
}
NTHandle get_virt(Rhs const& r) {
NTHandle nt=get_default(rhs2lhs,r,newnt);
diff --git a/utils/agenda.h b/utils/agenda.h
new file mode 100755
index 00000000..8198f077
--- /dev/null
+++ b/utils/agenda.h
@@ -0,0 +1,95 @@
+#ifndef AGENDA_H
+#define AGENDA_H
+
+/*
+ a priority queue where you expect to queue the same item at different
+ priorities several times before finally popping it. higher priority = better.
+ so in best first you'd be using negative cost or e^-cost (probabilities, in
+ other words).
+
+ this means you have a way to look up a key and see its location in the queue,
+ so its priority can be adjusted (or, simpler implementation: so when you pop,
+ you see if you've already popped before at a lower cost, and skip the
+ subsequent pops).
+
+ it's assumed that you'll never queue an item @ a better priority after it has
+ already been popped. that is, the agenda will track already completed items.
+ maybe in the future i will let you recompute a cheaper way to reach things
+ after first-pop also, it's assumed that we're always improving prios of
+ existing items, never making them worse (even though technically this is
+ possible and sensible if it hasn't been popped yet).
+
+ simple binary max heap for now. there are better practical options w/
+ superior cache locaility. movements in the heap need to update a record for
+ that key of where the key went. i do this by creating canonical key pointers
+ out of boost object pools (if the key were lightweight e.g. an int, then it
+ would make sense to use the hash lookup too
+
+ since i'm doing key hashing to start with, i also allow you to attach some
+ arbitrary data (value) payload beyond key+priority.
+
+ hash map from key to done (has been popped) -> set where doneness is marked in key item?
+
+ a slightly different way to make an adjustable heap would be to use
+ tree-structured parent/children links intrusively (or mapped by key) in the
+ key, rather than indices in a compact binary-tree heap
+
+ */
+
+#include "best.h"
+#include "hash.h"
+#include "d_ary_heap.h"
+#include "lvalue_pmap.h"
+#include <vector>
+#include <functional>
+
+/*
+template <class P>
+struct priority_traits {
+ typedef typename P::priority_type priority_type;
+};
+
+// P p has priority_traits<P>::type &p->agenda_priority() and unsigned &p->agenda_location(), and bool & p->agenda_done()
+// this is really 4 functors in 1 (or lvalue property maps); you can supply your own as the Prio type in Agenda<...> below ; state is allowed.
+template <class P>
+struct AdjustablePriority {
+ typedef AdjustablePriority<P> Self;
+ typedef typename priority_traits<P>::priority_type Priority;
+ Priority & priority(P const &x) {
+ return x->agenda_priority();
+ }
+ unsigned & location(P const &x) { // this gets updated by push, pop, and adjust
+ return x->agenda_location();
+ }
+  bool is_done(P const& x) const {
+ return x->agenda_done();
+ }
+ void set_done(P const& x) const {
+ x->agenda_done()=true;
+ }
+};
+*/
+
+typedef best_t agenda_best_t;
+
+PMAP_MEMBER_INDIRECT(LocationMap,unsigned,location)
+PMAP_MEMBER_INDIRECT(PriorityMap,best_t,priority)
+
+// LocMap and PrioMap are boost property maps put(locmap,key,size_t), Better(get(priomap,k1),get(priomap,k2)) means k1 should be above k2 (be popped first). Locmap and PrioMap may have state; the rest are assumed stateless functors
+template <class Item,class Hash=boost::hash<Item>,class Equal=std::equal_to<Item>,class Better=std::less<Item> >
+struct Agenda {
+ /* this is less generic than it could be, because I want to use a single hash mapping to intern to canonical mutable object pointers, where the property maps are just lvalue accessors */
+/*
+ typedef Item *CanonItem;
+ static const std::size_t heap_arity=4; // might be fastest possible (depends on key size probably - cache locality is bad w/ arity=2)
+ typedef std::vector<CanonItem> HeapStorage;
+ typedef boost::detail::d_ary_heap_indirect<Key,heap_arity,LocMap,PrioMap,Better,HeapStorage> Heap;
+ HASH_SET<Key,Hash,Equal> queued;
+ typedef LocationMap<ItemP>
+ LocMap locmap;
+ PrioMap priomap;
+ Agenda(LocMap const& lm=LocMap(),PrioMap const& pm=PrioMap()) : locmap(lm), priomap(pm) { }
+*/
+};
+
+#endif
diff --git a/utils/best.h b/utils/best.h
new file mode 100755
index 00000000..8ff896bb
--- /dev/null
+++ b/utils/best.h
@@ -0,0 +1,13 @@
+#ifndef UTILS__BEST_H
+#define UTILS__BEST_H
+
+#include "max_plus.h"
+
+typedef MaxPlus<double> best_t;
+
+inline bool operator <(best_t const& a,best_t const& b) {
+ return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first.
+}
+
+
+#endif
diff --git a/utils/hash.h b/utils/hash.h
index 7e38bb2c..e9584dd5 100755
--- a/utils/hash.h
+++ b/utils/hash.h
@@ -9,17 +9,25 @@
#ifdef HAVE_SPARSEHASH
# include <google/dense_hash_map>
+# include <google/dense_hash_set>
# define HASH_MAP google::dense_hash_map
+# define HASH_SET google::dense_hash_set
# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0)
# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0)
#else
# include <tr1/unordered_map>
+# include <tr1/unordered_set>
# define HASH_MAP std::tr1::unordered_map
+# define HASH_SET std::tr1::unordered_set
# define HASH_MAP_RESERVED(h,empty,deleted)
# define HASH_MAP_EMPTY(h,empty)
#endif
#define BOOST_HASHED_MAP(k,v) HASH_MAP<k,v,boost::hash<k> >
+namespace {
+const unsigned GOLDEN_MEAN_FRACTION=2654435769U;
+}
+
// assumes C is POD
template <class C>
struct murmur_hash
diff --git a/utils/lvalue_pmap.h b/utils/lvalue_pmap.h
new file mode 100755
index 00000000..03ddf5e7
--- /dev/null
+++ b/utils/lvalue_pmap.h
@@ -0,0 +1,25 @@
+#ifndef LVALUE_PMAP_H
+#define LVALUE_PMAP_H
+
+#include <boost/property_map/property_map.hpp>
+
+// i checked: boost provides get and put given []
+
+// lvalue property map pmapname<P> that is: P p; valtype &v=p->name;
+#define PMAP_MEMBER_INDIRECT(pmapname,valtype,name) template <class P> struct pmapname { \
+ typedef P key_type; \
+ typedef valtype value_type; \
+ typedef value_type & reference; \
+ typedef boost::lvalue_property_map_tag category; \
+ reference operator[](key_type p) const { return p->name; } \
+};
+
+#define PMAP_MEMBER_INDIRECT_2(pmapname,name) template <class P,class R> struct pmapname { \
+ typedef P key_type; \
+ typedef R value_type; \
+ typedef value_type & reference; \
+ typedef boost::lvalue_property_map_tag category; \
+ reference operator[](key_type p) const { return p->name; } \
+};
+
+#endif
diff --git a/utils/null_traits.h b/utils/null_traits.h
new file mode 100755
index 00000000..fac857d9
--- /dev/null
+++ b/utils/null_traits.h
@@ -0,0 +1,26 @@
+#ifndef NULL_TRAITS_H
+#define NULL_TRAITS_H
+
+template <class V>
+struct null_traits {
+ static V null; //TODO: maybe take out default null and make ppl explicitly define? they may be surprised that they need to when they include a header lib that uses null_traits
+};
+// global bool is_null(V const& v)
+
+// definitely override this, and possibly set_null and is_null. that's the point.
+template <class V>
+V null_traits<V>::null;
+//TODO: are we getting single init of the static null object?
+
+template <class V>
+void set_null(V &v) {
+ v=null_traits<V>::null;
+}
+
+template <class V>
+bool is_null(V const& v) {
+  return v==null_traits<V>::null;
+}
+
+
+#endif
diff --git a/utils/value_array.h b/utils/value_array.h
index 82e66e8d..a10f754f 100755
--- a/utils/value_array.h
+++ b/utils/value_array.h
@@ -103,6 +103,12 @@ protected:
}
}
+ template <class C,class F>
+ inline void init_map(C const& c,F const& f) {
+ alloc(c.size());
+ copy_construct_map(c.begin(),c.end(),array,f);
+ }
+ // warning: std::distance is likely slow on maps (anything other than random access containers. so container version using size will be better
template <class I,class F>
inline void init_range_map(I itr, I end,F const& f) {
alloc(std::distance(itr,end));
@@ -123,11 +129,25 @@ public:
{
init(s,t);
}
- void resize(size_type s, const_reference t = T()) {
+ void reinit(size_type s, const_reference t = T()) {
clear();
init(s,t);
}
+ //copy any existing data like std::vector. not A::construct exception safe. try blah blah?
+ void resize(size_type s, const_reference t = T()) {
+ pointer na=A::allocate(s);
+ size_type nc=s<sz ? s : sz;
+ size_type i=0;
+ for (;i<nc;++i)
+ A::construct(na+i,array[i]);
+ for (;i<s;++i)
+ A::construct(na+i,t);
+ clear();
+ array=na;
+ sz=s;
+ }
+
template <class I>
void reinit(I itr, I end) {
clear();
@@ -135,10 +155,16 @@ public:
}
template <class I,class F>
- void reinit_map(I itr, I end,F const& map) {
+ void reinit_map(I itr,I end,F const& map) {
clear();
init_range_map(itr,end,map);
}
+ // warning: std::distance is likely slow on maps,lists (anything other than random access containers. so container version below using size() will be better
+ template <class C,class F>
+ void reinit_map(C const& c,F const& map) {
+ clear();
+ init_map(c,map);
+ }
template <class I>
ValueArray(I itr, I end)