diff options
Diffstat (limited to 'decoder/cfg.cc')
-rwxr-xr-x | decoder/cfg.cc | 70 |
1 files changed, 45 insertions, 25 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc index c0598f16..be07c2c5 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -8,6 +8,12 @@ #include "fast_lexical_cast.hpp" //#include "indices_after.h" +#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x) +#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s) +#define CFGSHOW(x) CFGSHOWC(x,"\n") +#define CFGSHOWS(x) CFGSHOWC(x," ") +#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y) + using namespace std; typedef CFG::Rule Rule; @@ -108,7 +114,7 @@ struct prob_pos { }; }//ns -void CFG::UniqRules(NTHandle ni) { +int CFG::UniqRules(NTHandle ni) { typedef HASH_MAP<RHS,prob_pos,boost::hash<RHS> > BestRHS; // faster to use trie? maybe. BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position). HASH_MAP_EMPTY(bestp,null_rhs); @@ -129,6 +135,7 @@ void CFG::UniqRules(NTHandle ni) { } // post: oi = number of new adj adj.resize(oi); + return oi; } void CFG::SortLocalBestFirst(NTHandle ni) { @@ -145,10 +152,12 @@ namespace { CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min()); // index i >= N.size()? then it's in M[i-N.size()] -WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +//WordID first,WordID second, +string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) { int nn=N.size(); ostringstream o; +#undef BinNameOWORD #define BinNameOWORD(w) \ do { \ int n=w; if (n>0) o << TD::Convert(n); \ @@ -161,8 +170,12 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) BinNameOWORD(b.first); o<<'+'; BinNameOWORD(b.second); -#undef BinNameOWORD - return TD::Convert(o.str()); + return o.str(); +} + +WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +{ + return TD::Convert(BinStr(b,N,M)); } }//ns @@ -177,33 +190,44 @@ void CFG::Binarize(CFGBinarize const& b) { cerr << "Binarizing "<<b<<endl; HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left. HASH_MAP_EMPTY(bin2lhs,null_bin_rhs); - int rhsmin=b.bin_unary?0:1; // iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones. + int rhsmin=b.bin_unary?0:1; NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation Rules new_rules; //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) - int newnt=-nts.size(); + int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0) int newruleid=rules.size(); BinRhs bin; for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { NT const& nt=*n; for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { + CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n"); RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_... if (rhs.empty()) continue; - bin.second=rhs.back(); - for (int r=rhs.size()-2;r>=rhsmin;--r) { // pairs from right to left (normally we leave the last pair alone) - bin.first=rhs[r]; - if (newnt==(bin.second=(get_default(bin2lhs,bin,newnt)))) { - new_nts.push_back(NT(newruleid)); - new_rules.push_back(Rule(-newnt,bin)); - ++newruleid; - if (b.bin_name_nts) - new_nts.back().from.nt=BinName(bin,nts,new_nts); - --newnt; + int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT + if (rhsmin<=r) { // means r>=0 also + bin.second=rhs[r+1]; + int bin_to; // the replacement for bin + assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); + // also true at start/end of loop: + for (;;) { // pairs from right to left (normally we leave the last pair alone) + + bin.first=rhs[r]; + bin_to=get_default(bin2lhs,bin,newnt); + CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to); + if (newnt==bin_to) { // it's new! + new_nts.push_back(NT(newruleid++)); + //now -newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs + new_rules.push_back(Rule(-newnt,bin)); + --newnt; + if (b.bin_name_nts) + new_nts.back().from.nt=BinName(bin,nts,new_nts); + } + bin.second=bin_to; + --r; + if (r<rhsmin) break; } - } - if (rhsmin<rhs.size()) { - rhs[rhsmin]=bin.second; + rhs[rhsmin]=bin_to; rhs.resize(rhsmin+1); } } @@ -246,9 +270,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus prob_t &crp=cfgr.p; crp=e.edge_prob_; cfgr.lhs=e.head_node_; -#if CFG_DEBUG - cfgr.rule=e.rule_; -#endif + IF_CFG_TRULE(cfgr.rule=e.rule_;) if (copy_features) cfgr.f=e.feature_values_; if (push_weights) crp /=np[e.head_node_]; TRule const& er=*e.rule_; @@ -287,9 +309,7 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const { f.print_lhs(o,*this,r.lhs); f.print_rhs(o,*this,r.rhs.begin(),r.rhs.end()); f.print_features(o,r.p,r.f); -#if CFG_DEBUG - if (r.rule) o<<f.partsep<<*r.rule; -#endif + IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;) } void CFG::Print(std::ostream &o,CFGFormat const& f) const { |