From ead8845217c5e6e48f3680ead6f859ec8e110eb2 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 13 Aug 2010 08:20:47 +0000 Subject: (NEEDS TESTING) cfg index rules->nts, sort by prob, remove duplicates keeping highest prob, topo sort (and after binarize topo sort). beginning to apply_fsa_models (PrefixTrieNode) git-svn-id: https://ws10smt.googlecode.com/svn/trunk@539 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/apply_fsa_models.cc | 40 +++++++++++ decoder/cfg.cc | 160 ++++++++++++++++++++++++++++++++++++++++---- decoder/cfg.h | 149 ++++++++++++++++++++++++++++++++++++++--- decoder/cfg_binarize.h | 6 ++ decoder/exp_semiring.h | 10 +-- decoder/hg.h | 2 +- decoder/inside_outside.h | 6 +- decoder/viterbi.h | 2 +- 8 files changed, 343 insertions(+), 32 deletions(-) (limited to 'decoder') diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc index 2854b28b..c9bda68b 100755 --- a/decoder/apply_fsa_models.cc +++ b/decoder/apply_fsa_models.cc @@ -13,6 +13,46 @@ using namespace std; +//impl details (not exported). flat namespace for my ease. + +typedef CFG::BinRhs BinRhs; +typedef CFG::NTs NTs; +typedef CFG::NT NT; +typedef CFG::NTHandle NTHandle; +typedef CFG::Rules Rules; +typedef CFG::Rule Rule; +typedef CFG::RuleHandle RuleHandle; + +namespace { + +// if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently. if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion. this is effectively left-branching binarization, of course. +template +struct prefix_map_type { + typedef std::map type; +}; +//template typedef +#define PREFIX_MAP(k,v) prefix_map_type::type +typedef NTHandle LHS; +struct PrefixTrieNode { + prob_t backward; // (viterbi) backward prob (for cost pushing) + typedef PREFIX_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS + Completed completed; + typedef PREFIX_MAP(WordID,PrefixTrieNode *) Adj; + Adj adj; + //TODO: +}; + +struct PrefixTrie { + CFG const* cfgp; + CFG const& cfg() const { return *cfgp; } + PrefixTrie(CFG const& cfg) : cfgp(&cfg) { +//TODO: + } +}; + +}//anon ns + + DEFINE_NAMED_ENUM(FSA_BY) struct ApplyFsa { diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 8fc5b10f..a279c74a 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -6,24 +6,156 @@ #include "batched_append.h" #include #include "fast_lexical_cast.hpp" +//#include "indices_after.h" using namespace std; +typedef CFG::Rule Rule; +typedef CFG::NTOrder NTOrder; +typedef CFG::RHS RHS; + +/////index ruleids: +void CFG::UnindexRules() { + for (NTs::iterator n=nts.begin(),nn=nts.end();n!=nn;++n) + n->ruleids.clear(); +} + +void CFG::ReindexRules() { + UnindexRules(); + for (int i=0,e=rules.size();i Seen; // 0 = unseen, 1 = seen+finished, 2 = open (for cycle detection; seen but not finished) +enum { UNSEEN=0,SEEN,OPEN }; + + +// bottom -> top topo order (rev head->tails topo) +template +struct CFGTopo { +// meaningless efficiency alternative: close over all the args except ni - so they're passed as a single pointer. also makes visiting tail_nts simpler. + CFG const& cfg; + OutOrder outorder; + std::ostream *cerrp; + CFGTopo(CFG const& cfg,OutOrder const& outorder,std::ostream *cerrp=&std::cerr) + : cfg(cfg),outorder(outorder),cerrp(cerrp) // closure over args + , seen(cfg.nts.size()) { } + + Seen seen; + void operator()(CFG::NTHandle ni) { + char &seenthis=seen[ni]; + if (seenthis==UNSEEN) { + seenthis=OPEN; + + CFG::NT const& nt=cfg.nts[ni]; + for (CFG::Ruleids::const_iterator i=nt.ruleids.begin(),e=nt.ruleids.end();i!=e;++i) { + Rule const& r=cfg.rules[*i]; + r.visit_rhs_nts(*this); // recurse. + } + + *outorder++=ni; // dfs finishing time order = reverse topo. + seenthis=SEEN; + } else if (cerrp && seenthis==OPEN) { + std::ostream &cerr=*cerrp; + cerr<<"WARNING: CFG Topo order attempt failed: NT "; + cfg.print_nt_name(cerr,ni); + cerr<<" already reached from goal(top) "; + cfg.print_nt_name(cerr,cfg.goal_nt); + cerr<<". Continuing to reorder, but it's not fully topological.\n"; + } + } + +}; + +template +void DoCFGTopo(CFG const& cfg,CFG::NTHandle goal,O const& o,std::ostream *w=0) { + CFGTopo ct(cfg,o,w); + ct(goal); +} + +}//ns + +// you would need to do this only if you didn't build from hg, or you Binarize without bin_topo option. note: this doesn't sort the list of rules; it's assumed that if you care about the topo order you'll iterate over nodes. +void CFG::OrderNTsTopo(NTOrder *o_,std::ostream *cycle_complain) { + NTOrder &o=*o_; + o.resize(nts.size()); + DoCFGTopo(*this,goal_nt,o.begin(),cycle_complain); +} + + +/////sort/uniq: namespace { -CFG::BinRhs nullrhs(std::numeric_limits::min(),std::numeric_limits::min()); +RHS null_rhs(1,INT_MIN); + +//sort +struct ruleid_best_first { + CFG::Rules const* rulesp; + bool operator()(int a,int b) const { // true if a >(prob for ruleid) b + return (*rulesp)[b].p < (*rulesp)[a].p; + } +}; + +//uniq +struct prob_pos { + prob_pos() {} + prob_pos(prob_t prob,int pos) : prob(prob),pos(pos) {} + prob_t prob; + int pos; + bool operator <(prob_pos const& o) const { return prob BestRHS; // faster to use trie? maybe. + BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position). + HASH_MAP_EMPTY(bestp,null_rhs); + Ruleids &adj=nts[ni].ruleids; + Ruleids oldadj=adj; + int oi=0; + for (int i=0,e=oldadj.size();i!=e;++i) { // this beautiful complexity is to ensure that adj' is a subsequence of adj (without duplicates) + int ri=oldadj[i]; + Rule const& r=rules[ri]; + prob_pos pi(r.p,oi); + prob_pos &oldpi=get_default(bestp,r.rhs,pi); + if (oldpi.pos==oi) {// newly inserted + adj[oi++]=ri; + } else if (oldpi.prob::min(),std::numeric_limits::min()); // index i >= N.size()? then it's in M[i-N.size()] WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) { int nn=N.size(); ostringstream o; -#define BinNameOWORD(w) \ - do { \ - int n=w; if (n>0) o << TD::Convert(n); \ - else { \ - int i=-n; \ - if (i0) o << TD::Convert(n); \ + else { \ + int i=-n; \ + if (i > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left. - HASH_MAP_EMPTY(bin2lhs,nullrhs); + HASH_MAP_EMPTY(bin2lhs,null_bin_rhs); int rhsmin=b.bin_unary?0:1; - // iterate using indices and not iterators because we'll be adding to both nodes and edge list. we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones. + // iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones. NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation Rules new_rules; //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) @@ -77,6 +207,8 @@ void CFG::Binarize(CFGBinarize const& b) { } batched_append_swap(nts,new_nts); batched_append_swap(rules,new_rules); + if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order + OrderNTsTopo(); } namespace { diff --git a/decoder/cfg.h b/decoder/cfg.h index e1f818e8..b214d8dc 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -5,6 +5,11 @@ #ifndef CFG_DEBUG # define CFG_DEBUG 0 #endif +#if CFG_DEBUG +# define IF_CFG_DEBUG(x) x +#else +# define IF_CFG_DEBUG(x) +#endif /* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg. this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector). but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph. so hg pointer may be null? multiple types of CFG? always copy the feature vector? especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg @@ -28,6 +33,7 @@ #include "small_vector.h" #include "nt_span.h" #include +#include "indices_after.h" class Hypergraph; class CFGFormat; // #include "cfg_format.h" @@ -47,7 +53,10 @@ struct CFG { struct Rule { // for binarizing - no costs/probs - Rule() { } + Rule() : lhs(-1) { } + bool is_null() const { return lhs<0; } + void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_DEBUG(rule.reset();) } + Rule(int lhs,BinRhs const& binrhs) : lhs(lhs),rhs(2),p(1) { rhs[0]=binrhs.first; rhs[1]=binrhs.second; @@ -57,18 +66,59 @@ struct CFG { RHS rhs; prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top) FeatureVector f; // may be empty, unless copy_features on Init -#if CFG_DEBUG - TRulePtr rule; // normally no use for this (waste of space) -#endif + IF_CFG_DEBUG(TRulePtr rule;) void Swap(Rule &o) { using namespace std; swap(lhs,o.lhs); swap(rhs,o.rhs); swap(p,o.p); swap(f,o.f); -#if CFG_DEBUG - swap(rule,o.rule); -#endif + IF_CFG_DEBUG(swap(rule,o.rule);) + } + template + void visit_rhs_nts(V &v) const { + for (RHS::const_iterator i=rhs.begin(),e=rhs.end();i!=e;++i) { + WordID w=*i; + if (w<=0) + v(-w); + } + } + template + void visit_rhs_nts(V const& v) const { + for (RHS::const_iterator i=rhs.begin(),e=rhs.end();i!=e;++i) { + WordID w=*i; + if (w<=0) + v(-w); + } + } + + template + void visit_rhs(V &v) const { + for (RHS::const_iterator i=rhs.begin(),e=rhs.end();i!=e;++i) { + WordID w=*i; + if (w<=0) + v.visit_nt(-w); + else + v.visit_t(w); + } + } + + // returns 0 or 1 (# of non null rules in this rule). + template + bool reorder_from(O &order,NTHandle removed=-1) { + for (RHS::iterator i=rhs.begin(),e=rhs.end();i!=e;++i) { + WordID &w=*i; + if (w<=0) { + int oldnt=-w; + NTHandle newnt=(NTHandle)order[oldnt]; // e.g. unsigned to int (-1) conversion should be ok + if (newnt==removed) { + set_null(); + return false; + } + w=-newnt; + } + } + return true; } }; @@ -91,6 +141,17 @@ struct CFG { bool Uninitialized() const { return uninit; } void Clear(); bool Empty() const { return nts.empty(); } + void UnindexRules(); // save some space? + void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules) + void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids. + inline void UniqRules() { + for (int i=0,e=nts.size();i!=e;++i) UniqRules(i); + } + + void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest. when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this. stable in case of ties + inline void SortLocalBestFirst() { + for (int i=0,e=nts.size();i!=e;++i) SortLocalBestFirst(i); + } void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true); void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h void PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& format) const; @@ -104,7 +165,79 @@ struct CFG { swap(nts,o.nts); swap(goal_nt,o.goal_nt); } - void Binarize(CFGBinarize const& binarize_options); + + typedef std::vector NTOrder; // a list of nts, in definition-before-use order. + + //perhaps: give up on templated Order and move the below to .cc (NTOrder should be fine) + + // post: iterating nts 0,1... the same as order[0],order[1],... ; return number of non-null rules (these aren't actually deleted) + // pre: order is (without duplicates) a range of NTHandle + template + int ReorderNTs(Order const& order) { + using namespace std; + int nn=nts.size(); +#if 0 + NTs newnts(order.size()); // because the (sub)permutation order may have e.g. 1<->4 + int ni=0; + for (typename Order::const_iterator i=order.begin(),e=order.end();i!=e;++i) { + assert(*i + int RemapRules(NTHandleRemap const& remap_nti,NTHandle removed=-1) { + int n_non_null=0; + for (int i=0,e=rules.size();i + void VisitRuleIds(V &v) { + for (int i=0,e=nts.size();i + void VisitRuleIds(V const& v) { + for (int i=0,e=nts.size();i + void VisitRulesUnindexed(V const &v) { + for (int i=0,e=rules.size();irhs. + // you would need to do this only if you didn't build from hg, or you Binarize without bin_topo option. + // note: this doesn't sort the list of rules; it's assumed that if you care about the topo order you'll iterate over nodes. + // cycle_complain means to warn in case of back edges. it's not necessary to prevent inf. loops. you get some order that's not topo if there are loops. starts from goal_nt, of course. + + void OrderNTsTopo(std::ostream *cycle_complain=0) { + NTOrder o; + OrderNTsTopo(&o,cycle_complain); + ReorderNTs(o); + } + + void Binarize(CFGBinarize const& binarize_options); // see cfg_binarize.h for docs typedef std::vector NTs; NTs nts; diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index f76619d2..c5303622 100755 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h @@ -18,6 +18,8 @@ struct CFGBinarize { bool bin_l2r; bool bin_unary; bool bin_name_nts; + bool bin_uniq; + bool bin_topo; template // template to support both printable_opts and boost nonprintable void AddOptions(Opts *opts) { opts->add_options() @@ -25,6 +27,8 @@ struct CFGBinarize { ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.") ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)") ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'") + ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob") + ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order). otherwise the virtual NTs will all appear after the regular NTs") ; } void Validate() { @@ -40,6 +44,8 @@ struct CFGBinarize { return bin_l2r || bin_at>0; } void set_defaults() { + bin_topo=false; + bin_uniq=true; bin_at=0; bin_unary=false; bin_name_nts=true; diff --git a/decoder/exp_semiring.h b/decoder/exp_semiring.h index f91beee4..111eaaf1 100644 --- a/decoder/exp_semiring.h +++ b/decoder/exp_semiring.h @@ -14,7 +14,7 @@ // PType scalar, RType vector // BAD examples: // PType vector, RType scalar -template +template struct PRPair { PRPair() : p(), r() {} // Inside algorithm requires that T(0) and T(1) @@ -35,26 +35,26 @@ struct PRPair { RType r; }; -template +template std::ostream& operator<<(std::ostream& o, const PRPair& x) { return o << '<' << x.p << ", " << x.r << '>'; } -template +template const PRPair operator+(const PRPair& a, const PRPair& b) { PRPair result = a; result += b; return result; } -template +template const PRPair operator*(const PRPair& a, const PRPair& b) { PRPair result = a; result *= b; return result; } -template +template struct PRWeightFunction { explicit PRWeightFunction(const PWeightFunction& pwf = PWeightFunction(), const RWeightFunction& rwf = RWeightFunction()) : diff --git a/decoder/hg.h b/decoder/hg.h index e9510997..03221c74 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -392,7 +392,7 @@ public: // multiple the weights vector by the edge feature vector // (inner product) to set the edge probabilities - template + template void Reweight(const V& weights) { for (int i = 0; i < edges_.size(); ++i) { Edge& e = edges_[i]; diff --git a/decoder/inside_outside.h b/decoder/inside_outside.h index e3bd4592..73d4ec6a 100644 --- a/decoder/inside_outside.h +++ b/decoder/inside_outside.h @@ -27,7 +27,7 @@ struct Boolean { // score for each node // NOTE: WeightType() must construct the semiring's additive identity // WeightType(1) must construct the semiring's multiplicative identity -template +template WeightType Inside(const Hypergraph& hg, std::vector* result = NULL, const WeightFunction& weight = WeightFunction()) { @@ -58,7 +58,7 @@ WeightType Inside(const Hypergraph& hg, return inside_score.back(); } -template +template void Outside(const Hypergraph& hg, std::vector& inside_score, std::vector* result, @@ -179,7 +179,7 @@ struct InsideOutsides { // NOTE: XType * KType must be valid (and yield XType) // NOTE: This may do things slightly differently than you are used to, please // read the description in Li and Eisner (2009) carefully! -template +template KType InsideOutside(const Hypergraph& hg, XType* result_x, const KWeightFunction& kwf = KWeightFunction(), diff --git a/decoder/viterbi.h b/decoder/viterbi.h index ea2bc6c2..e78cd157 100644 --- a/decoder/viterbi.h +++ b/decoder/viterbi.h @@ -16,7 +16,7 @@ std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest" // WeightFunction must implement: // typedef prob_t Weight; // Weight operator()(Hypergraph::Edge const& e) const; -template +template typename WeightFunction::Weight Viterbi(const Hypergraph& hg, typename Traversal::Result* result, const Traversal& traverse, -- cgit v1.2.3