diff options
Diffstat (limited to 'decoder/cfg.cc')
-rw-r--r-- | decoder/cfg.cc | 656 |
1 files changed, 0 insertions, 656 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc deleted file mode 100644 index d6ee651a..00000000 --- a/decoder/cfg.cc +++ /dev/null @@ -1,656 +0,0 @@ -#include "cfg.h" -#include "hg.h" -#include "cfg_format.h" -#include "cfg_binarize.h" -#include "hash.h" -#include "batched_append.h" -#include <limits> -#include "fast_lexical_cast.hpp" -//#include "indices_after.h" -#include "show.h" -#include "null_traits.h" - -#define DUNIQ(x) x -#define DBIN(x) -#define DSP(x) x -//SP:binarize by splitting. -#define DCFG(x) IF_CFG_DEBUG(x) - -#undef CFG_FOR_RULES -#define CFG_FOR_RULES(i,expr) \ - for (CFG::NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { \ - NT const& nt=*n; \ - for (CFG::Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { \ - RuleHandle i=*ir; \ - expr; \ - } \ - } - - -using namespace std; - -typedef CFG::Rule Rule; -typedef CFG::NTOrder NTOrder; -typedef CFG::RHS RHS; -typedef CFG::BinRhs BinRhs; - -/////index ruleids: -void CFG::UnindexRules() { - for (NTs::iterator n=nts.begin(),nn=nts.end();n!=nn;++n) - n->ruleids.clear(); -} - -void CFG::ReindexRules() { - UnindexRules(); - for (int i=0,e=rules.size();i<e;++i) - if (!rules[i].is_null()) - nts[rules[i].lhs].ruleids.push_back(i); -} - -//////topo order: -namespace { -typedef std::vector<char> Seen; // 0 = unseen, 1 = seen+finished, 2 = open (for cycle detection; seen but not finished) -enum { UNSEEN=0,SEEN,OPEN }; - -// bottom -> top topo order (rev head->tails topo) -template <class OutOrder> -struct CFGTopo { -// meaningless efficiency alternative: close over all the args except ni - so they're passed as a single pointer. also makes visiting tail_nts simpler. - CFG const& cfg; - OutOrder outorder; - std::ostream *cerrp; - CFGTopo(CFG const& cfg,OutOrder const& outorder,std::ostream *cerrp=&std::cerr) - : cfg(cfg),outorder(outorder),cerrp(cerrp) // closure over args - , seen(cfg.nts.size()) { } - - Seen seen; - void operator()(CFG::NTHandle ni) { - char &seenthis=seen[ni]; - if (seenthis==UNSEEN) { - seenthis=OPEN; - - CFG::NT const& nt=cfg.nts[ni]; - for (CFG::Ruleids::const_iterator i=nt.ruleids.begin(),e=nt.ruleids.end();i!=e;++i) { - Rule const& r=cfg.rules[*i]; - r.visit_rhs_nts(*this); // recurse. - } - - *outorder++=ni; // dfs finishing time order = reverse topo. - seenthis=SEEN; - } else if (cerrp && seenthis==OPEN) { - std::ostream &cerr=*cerrp; - cerr<<"WARNING: CFG Topo order attempt failed: NT "; - cfg.print_nt_name(cerr,ni); - cerr<<" already reached from goal(top) "; - cfg.print_nt_name(cerr,cfg.goal_nt); - cerr<<". Continuing to reorder, but it's not fully topological.\n"; - } - } - -}; - -template <class O> -void DoCFGTopo(CFG const& cfg,CFG::NTHandle goal,O const& o,std::ostream *w=0) { - CFGTopo<O> ct(cfg,o,w); - ct(goal); -} - -}//ns - -// you would need to do this only if you didn't build from hg, or you Binarize without bin_topo option. note: this doesn't sort the list of rules; it's assumed that if you care about the topo order you'll iterate over nodes. -void CFG::OrderNTsTopo(NTOrder *o_,std::ostream *cycle_complain) { - NTOrder &o=*o_; - o.resize(nts.size()); - DoCFGTopo(*this,goal_nt,o.begin(),cycle_complain); -} - - -/////sort/uniq: -namespace { -RHS null_rhs(1,INT_MIN); - -//sort -struct ruleid_best_first { - CFG::Rules const* rulesp; - bool operator()(int a,int b) const { // true if a >(prob for ruleid) b - return (*rulesp)[b].p < (*rulesp)[a].p; - } -}; - -//uniq -struct prob_pos { - prob_pos() {} - prob_pos(prob_t prob,int pos) : prob(prob),pos(pos) {} - prob_t prob; - int pos; - bool operator <(prob_pos const& o) const { return prob<o.prob; } -}; -}//ns - -int CFG::UniqRules(NTHandle ni) { - typedef HASH_MAP<RHS,prob_pos,boost::hash<RHS> > BestRHS; // faster to use trie? maybe. - BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position). - HASH_MAP_EMPTY(bestp,null_rhs); - Ruleids &adj=nts[ni].ruleids; - Ruleids oldadj=adj; - int newpos=0; - for (int i=0,e=oldadj.size();i!=e;++i) { // this beautiful complexity is to ensure that adj' is a subsequence of adj (without duplicates) - int ri=oldadj[i]; - Rule const& r=rules[ri]; - prob_pos pi(r.p,newpos); - prob_pos &oldpi=get_default(bestp,r.rhs,pi); - if (oldpi.pos==newpos) {// newly inserted - adj[newpos++]=ri; - } else { - SHOWP(DUNIQ,"Uniq duplicate: ") SHOW4(DUNIQ,oldpi.prob,pi.prob,oldpi.pos,newpos); - SHOW(DUNIQ,ShowRule(ri)); - SHOW(DUNIQ,ShowRule(adj[oldpi.pos])); - if (oldpi.prob<pi.prob) { // we improve prev. best (overwrite it @old pos) - oldpi.prob=pi.prob; - adj[oldpi.pos]=ri; // replace worse rule w/ better - } - } - - } - // post: newpos = number of new adj - adj.resize(newpos); - return newpos; -} - -void CFG::SortLocalBestFirst(NTHandle ni) { - ruleid_best_first r; - r.rulesp=&rules; - Ruleids &adj=nts[ni].ruleids; - std::stable_sort(adj.begin(),adj.end(),r); -} - - -/////binarization: -namespace { - -BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min()); - -// index i >= N.size()? then it's in M[i-N.size()] -//WordID first,WordID second, -string BinStr(BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) -{ - int nn=N.size(); - ostringstream o; -#undef BinNameOWORD -#define BinNameOWORD(w) \ - do { \ - int n=w; if (n>0) o << TD::Convert(n); \ - else { \ - int i=-n; \ - if (i<nn) o<<N[i].from<<i; else o<<M[i-nn].from; \ - } \ - } while(0) - - BinNameOWORD(b.first); - o<<'+'; - BinNameOWORD(b.second); - return o.str(); -} - -string BinStr(RHS const& r,CFG::NTs const& N,CFG::NTs const& M) -{ - int nn=N.size(); - ostringstream o; - for (int i=0,e=r.size();i!=e;++i) { - if (i) - o<<'+'; - BinNameOWORD(r[i]); - } - return o.str(); -} - - -WordID BinName(BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) -{ - return TD::Convert(BinStr(b,N,M)); -} - -WordID BinName(RHS const& b,CFG::NTs const& N,CFG::NTs const& M) -{ - return TD::Convert(BinStr(b,N,M)); -} - -/* -template <class Rhs> -struct null_for; - - -template <> -struct null_for<BinRhs> { - static BinRhs null; -}; - -template <> -struct null_for<RHS> { - static RHS null; -}; - -template <> -BinRhs null_traits<BinRhs>::xnull(std::numeric_limits<int>::min(),std::numeric_limits<int>::min()); - -template <> -RHS null_traits<RHS>::xnull(1,std::numeric_limits<int>::min()); -*/ - -template <class Rhs> -struct add_virtual_rules { - typedef CFG::RuleHandle RuleHandle; - typedef CFG::NTHandle NTHandle; - CFG::NTs &nts,new_nts; - CFG::Rules &rules, new_rules; -// above will be appended at the end, so we don't have to worry about iterator invalidation - WordID newnt; //negative of NTHandle, or positive => unary lexical item (not to binarize). fit for rhs of a rule - RuleHandle newruleid; - typedef HASH_MAP<Rhs,WordID,boost::hash<Rhs> > R2L; - R2L rhs2lhs; // an rhs maps to this -virtntid, or original id if length 1 - bool name_nts; - add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(-nts.size()),newruleid(rules.size()),name_nts(name_nts) { - HASH_MAP_EMPTY(rhs2lhs,null_traits<Rhs>::xnull); - } - NTHandle get_virt(Rhs const& r) { - NTHandle nt=get_default(rhs2lhs,r,newnt); - SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(r,nts,new_nts)<<"=>") SHOW(DBIN,nt); - if (newnt==nt) { - create(r); - } - return nt; - } - inline void set_nt_name(Rhs const& r) { - if (name_nts) - new_nts.back().from.nt=BinName(r,nts,new_nts); - } - inline void create_nt(Rhs const& rhs) { - new_nts.push_back(CFG::NT(newruleid++)); - set_nt_name(rhs); - } - inline void create_rule(Rhs const& rhs) { - new_rules.push_back(CFG::Rule(-newnt,rhs)); - --newnt; - } - inline void create_adding(Rhs const& rhs) { - NTHandle nt=get_default(rhs2lhs,rhs,newnt); - assert(nt==newnt); - create(rhs); - } - inline void create(Rhs const& rhs) { - SHOWP(DSP,"Create ") SHOW3(DSP,newnt,newruleid,BinStr(rhs,nts,new_nts)) - create_nt(rhs); - create_rule(rhs); - assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); - } - - ~add_virtual_rules() { - append_rules(); - } - void append_rules() { - // marginally more efficient - batched_append_swap(nts,new_nts); - batched_append_swap(rules,new_rules); - } - inline bool have(Rhs const& rhs,NTHandle &h) const { - if (rhs.size()==1) { // stop creating virtual unary rules. - h=rhs[0]; - return true; - } - typename R2L::const_iterator i=rhs2lhs.find(rhs); - if (i==rhs2lhs.end()) - return false; - h=i->second; - return true; - } - //HACK: prevent this for instantiating for BinRhs. we need to use rule index because we'll be adding rules before we can update. - // returns 1 per replaced NT (0,1, or 2) - inline std::string Str(Rhs const& rhs) const { - return BinStr(rhs,nts,new_nts); - } - - template <class RHSi> - int split_rhs(RHSi &rhs,bool only_free=false,bool only_reusing_1=false) { - typedef WordID const* WP; - //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key). - int n=rhs.size(); - if (n<=2) return 0; - int longest1=1; // all this other stuff is not uninitialized when used, based on checking this and other things (it's complicated, learn to prove theorems, gcc) - int mid=n/2; - int best_k; - enum {HAVE_L=-1,HAVE_NONE=0,HAVE_R=1}; - int have1=HAVE_NONE; // will mean we already have some >1 length prefix or suffix as a virt. (it's free). if we have both we use it immediately and return. - NTHandle ntr,ntl; - NTHandle bestntr,bestntl; - WP b=&rhs.front(),e=b+n; - WP wk=b; - SHOWM3(DSP,"Split",Str(rhs),only_free,only_reusing_1); - int rlen=n; - for (int k=1;k<n-1;++k) { - //TODO: handle length 1 l and r parts without explicitly building Rhs? - ++wk; assert(k==wk-b); - --rlen; assert(rlen==n-k); - Rhs l(b,wk); - if (have(l,ntl)) { - if (k>1) { SHOWM3(DSP,"Have l",k,n,Str(l)) } - Rhs r(wk,e); - if (have(r,ntr)) { - SHOWM3(DSP,"Have r too",k,n,Str(r)) - rhs.resize(2); - rhs[0]=ntl; - rhs[1]=ntr; - return 2; - } else if (k>longest1) { - longest1=k; - have1=HAVE_L; - bestntl=ntl; - best_k=k; - } - } else if (rlen>longest1) { // > or >= favors l or r branching, maybe. who cares. - Rhs r(wk,e); - if (have(r,ntr)) { - longest1=rlen; - if (rlen>1) { SHOWM3(DSP,"Have r (only) ",k,n,Str(r)) } - have1=HAVE_R; - bestntr=ntr; - best_k=k; - } - } - //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint? one will be useless to check for beating the longest single match so far. check that second - - } - // now we know how we're going to split the rule; what follows is just doing the actual splitting: - - if (only_free) { - if (have1==HAVE_NONE) - return 0; - if (have1==HAVE_L) { - rhs.erase(rhs.begin()+1,rhs.begin()+best_k); //erase [1..best_k) - rhs[0]=bestntl; - } else { - assert(have1==HAVE_R); - rhs.erase(rhs.begin()+best_k+1,rhs.end()); // erase (best_k..) - rhs[best_k]=bestntr; - } - return 1; - } - /* now we have to add some new virtual rules. - some awkward constraints: - - 1. can't resize rhs until you save copy of l or r split portion - - 2. can't create new rule until you finished modifying rhs (this is why we store newnt then create). due to vector push_back invalidation. perhaps we could bypass this by reserving sufficient space first before a splitting pass (# rules and nts created is <= 2 * # of rules being passed over) - - */ - if (have1==HAVE_NONE) { // default: split down middle. - DSP(assert(longest1==1)); - WP m=b+mid; - if (n%2==0) { - WP i=b; - WP j=m; - for (;i!=m;++i,++j) - if (*i!=*j) goto notleqr; - // [...mid]==[mid...]! - RHS l(b,m); // 1. // this is equal to RHS(m,e). - rhs.resize(2); - rhs[0]=rhs[1]=newnt; //2. - create_adding(l); - return 1; // only had to create 1 total when splitting down middle when l==r - } - notleqr: - if (only_reusing_1) return 0; - best_k=mid; // rounds down - if (mid==1) { - RHS r(m,e); //1. - rhs.resize(2); - rhs[1]=newnt; //2. - create_adding(r); - return 1; - } else { - Rhs l(b,m); - Rhs r(m,e); // 1. - rhs.resize(2); - rhs[0]=newnt; - rhs[1]=newnt-1; // 2. - create_adding(l); - create_adding(r); - return 2; - } - } - WP best_wk=b+best_k; - //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)!. - //wow, that decision (not to use index into new_nts instead of pointer to rhs), while adding new nts to it really added some pain. - if (have1==HAVE_L) { - Rhs r(best_wk,e); //1. - rhs.resize(2); - rhs[0]=bestntl; - DSP(assert(best_wk<e-1)); // because we would have returned having both if rhs was singleton - rhs[1]=newnt; //2. - create_adding(r); - } else { - DSP(assert(have1==HAVE_R)); - DSP(assert(best_wk>b+1)); // because we would have returned having both if lhs was singleton - Rhs l(b,best_wk); //1. - rhs.resize(2); - rhs[0]=newnt; //2. - rhs[1]=bestntr; - create_adding(l); - } - return 1; - } -}; - -}//ns - -void CFG::BinarizeSplit(CFGBinarize const& b) { - add_virtual_rules<RHS> v(*this,b.bin_name_nts); - CFG_FOR_RULES(i,v.split_rhs(rules[i].rhs,false,false)); - Rules &newr=v.new_rules; -#undef CFG_FOR_VIRT -#define CFG_FOR_VIRT(r,expr) \ - for (int i=0,e=newr.size();i<e;++i) { \ - Rule &r=newr[i];expr; } // NOTE: must use indices since we'll be adding rules as we iterate. - - int n_changed_total=0; - int n_changed=0; // quiets a warning -#define CFG_SPLIT_PASS(N,free,just1) \ - for (int pass=0;pass<b.N;++pass) { \ - n_changed=0; \ - CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \ - if (!n_changed) { \ - break; \ - } n_changed_total+=n_changed; } - - CFG_SPLIT_PASS(split_passes,false,false) - if (n_changed==0) return; - CFG_SPLIT_PASS(split_share1_passes,false,true) - CFG_SPLIT_PASS(split_free_passes,true,false) - -} - -void CFG::Binarize(CFGBinarize const& b) { - if (!b.Binarizing()) return; - cerr << "Binarizing "<<b<<endl; - if (b.bin_thresh>0) - BinarizeThresh(b); - if (b.bin_split) - BinarizeSplit(b); - if (b.bin_l2r) - BinarizeL2R(false,b.bin_name_nts); - if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order? - OrderNTsTopo(); - -} - -namespace { -} - -void CFG::BinarizeThresh(CFGBinarize const& b) { - throw runtime_error("TODO: some fancy linked list thing - see NOTES.partial.binarize"); -} - - -void CFG::BinarizeL2R(bool bin_unary,bool name) { - add_virtual_rules<BinRhs> v(*this,name); -cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary") <<endl; - HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left. - HASH_MAP_EMPTY(bin2lhs,null_bin_rhs); - // iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones. - int rhsmin=bin_unary?0:1; - //NTs new_nts; - //Rules new_rules; - //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) -// int newnt=nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0) -// int newruleid=rules.size(); - BinRhs bin; - CFG_FOR_RULES(ruleid, -/* for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { - NT const& nt=*n; - for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { - RuleHandle ruleid=*ir;*/ -// SHOW2(DBIN,ruleid,ShowRule(ruleid)); - Rule & rule=rules[ruleid]; - RHS &rhs=rule.rhs; // we're going to binarize this while adding newly created rules to new_... - if (rhs.empty()) continue; - int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT - if (rhsmin<=r) { // means r>=0 also - bin.second=rhs[r+1]; - int bin_to; // the replacement for bin -// assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size()); - // also true at start/end of loop: - for (;;) { // pairs from right to left (normally we leave the last pair alone) - bin.first=rhs[r]; - bin_to=v.get_virt(bin); -/* bin_to=get_default(bin2lhs,bin,v.newnt); -// SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to); - if (v.newnt==bin_to) { // it's new! - new_nts.push_back(NT(newruleid++)); - //now newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs - new_rules.push_back(Rule(newnt,bin)); - ++newnt; - if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts); - } -*/ - bin.second=bin_to; - --r; - if (r<rhsmin) { - rhs[rhsmin]=bin_to; - rhs.resize(rhsmin+1); - break; - } - } - }) - /* - } - } - */ -#if 0 - // marginally more efficient - batched_append_swap(nts,new_nts); - batched_append_swap(rules,new_rules); -//#else - batched_append(nts,new_nts); - batched_append(rules,new_rules); -#endif -} - -namespace { -inline int nt_index(int nvar,Hypergraph::TailNodeVector const& t,bool target_side,int w) { - assert(w<0 || (target_side&&w==0)); - return t[target_side?-w:nvar]; -} -} - -void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool push_weights) { - uninit=false; - hg_=&hg; - Hypergraph::NodeProbs np; - goal_inside=hg.ComputeNodeViterbi(&np); - pushed_inside=push_weights ? goal_inside : prob_t(1); - int nn=hg.nodes_.size(),ne=hg.edges_.size(); - nts.resize(nn); - goal_nt=nn-1; - rules.resize(ne); - for (int i=0;i<nn;++i) { - nts[i].ruleids=hg.nodes_[i].in_edges_; - hg.SetNodeOrigin(i,nts[i].from); - } - for (int i=0;i<ne;++i) { - Rule &cfgr=rules[i]; - Hypergraph::Edge const& e=hg.edges_[i]; - prob_t &crp=cfgr.p; - crp=e.edge_prob_; - cfgr.lhs=e.head_node_; - IF_CFG_TRULE(cfgr.rule=e.rule_;) - if (copy_features) cfgr.f=e.feature_values_; - if (push_weights) crp /=np[e.head_node_]; - TRule const& er=*e.rule_; - vector<WordID> const& rule_rhs=target_side?er.e():er.f(); - int nr=rule_rhs.size(); - RHS &rhs_out=cfgr.rhs; - rhs_out.resize(nr); - Hypergraph::TailNodeVector const& tails=e.tail_nodes_; - int nvar=0; - //split out into separate target_side, source_side loops? - for (int j=0;j<nr;++j) { - WordID w=rule_rhs[j]; - if (w>0) - rhs_out[j]=w; - else { - int n=nt_index(nvar,tails,target_side,w); - ++nvar; - if (push_weights) crp*=np[n]; - rhs_out[j]=-n; - } - } - assert(nvar==er.Arity()); - assert(nvar==tails.size()); - } -} - -void CFG::Clear() { - rules.clear(); - nts.clear(); - goal_nt=-1; - hg_=0; -} - -namespace { -CFGFormat form; -} - -void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const { - Rule const& r=rules[rulei]; - f.print_lhs(o,*this,r.lhs); - f.print_rhs(o,*this,r.rhs.begin(),r.rhs.end()); - f.print_features(o,r.p,r.f); - IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;) -} -void CFG::PrintRule(std::ostream &o,RuleHandle rulei) const { - PrintRule(o,rulei,form); -} -string CFG::ShowRule(RuleHandle i) const { - ostringstream o;PrintRule(o,i);return o.str(); -} - -void CFG::Print(std::ostream &o,CFGFormat const& f) const { - assert(!uninit); - if (!f.goal_nt_name.empty()) { - o << '['<<f.goal_nt_name <<']'; - WordID rhs=-goal_nt; - f.print_rhs(o,*this,&rhs,&rhs+1); - if (pushed_inside!=prob_t::One()) - f.print_features(o,pushed_inside); - o<<'\n'; - } - CFG_FOR_RULES(i,PrintRule(o,i,f);o<<'\n';) -} - -void CFG::Print(std::ostream &o) const { - Print(o,form); -} - -std::ostream &operator<<(std::ostream &o,CFG const &x) { - x.Print(o); - return o; -} |