summaryrefslogtreecommitdiff
path: root/decoder/cfg.cc
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/cfg.cc')
-rwxr-xr-xdecoder/cfg.cc297
1 files changed, 257 insertions, 40 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 3076c75e..0fbb6a03 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -11,8 +11,20 @@
#define DUNIQ(x) x
#define DBIN(x)
+#define DSP(x) x
+//SP:binarize by splitting.
#define DCFG(x) IF_CFG_DEBUG(x)
+#undef CFG_FOR_RULES
+#define CFG_FOR_RULES(i,expr) \
+ for (CFG::NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { \
+ NT const& nt=*n; \
+ for (CFG::Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { \
+ RuleHandle i=*ir; \
+ expr; \
+ } \
+ }
+
using namespace std;
@@ -178,75 +190,286 @@ string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
return o.str();
}
+string BinStr(RHS const& r,CFG::NTs const& N,CFG::NTs const& M)
+{
+ int nn=N.size();
+ ostringstream o;
+ for (int i=0,e=r.size();i!=e;++i) {
+ if (i)
+ o<<'+';
+ BinNameOWORD(r[i]);
+ }
+ return o.str();
+}
+
+
WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
{
return TD::Convert(BinStr(b,N,M));
}
+WordID BinName(RHS const& b,CFG::NTs const& N,CFG::NTs const& M)
+{
+ return TD::Convert(BinStr(b,N,M));
+}
+
+template <class Rhs>
+struct add_virtual_rules {
+ typedef CFG::RuleHandle RuleHandle;
+ typedef CFG::NTHandle NTHandle;
+ CFG::NTs &nts,new_nts;
+ CFG::Rules &rules, new_rules;
+// above will be appended at the end, so we don't have to worry about iterator invalidation
+ NTHandle newnt; //not negative. TODO: i think we use it most often as negative. either way, be careful.
+ RuleHandle newruleid;
+ HASH_MAP<Rhs,NTHandle,boost::hash<Rhs> > rhs2lhs;
+ bool name_nts;
+ add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) {
+ HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null);
+ }
+ NTHandle get_virt(Rhs const& r) {
+ NTHandle nt=get_default(rhs2lhs,r,newnt);
+ if (newnt==nt) {
+ create_nt(r);
+ create_rule(r);
+ }
+ return nt;
+ }
+ NTHandle get_nt(Rhs const& r) {
+ NTHandle nt=get_default(rhs2lhs,r,newnt);
+ if (newnt==nt) {
+ create(r);
+ }
+ return nt;
+ }
+ inline void set_nt_name(Rhs const& r) {
+ if (name_nts)
+ new_nts.back().from.nt=BinName(r,nts,new_nts);
+ }
+ inline void create_nt(Rhs const& rhs) {
+ new_nts.push_back(CFG::NT(newruleid++));
+ set_nt_name(rhs);
+ }
+ inline void create_rule(Rhs const& rhs) {
+ new_rules.push_back(CFG::Rule(newnt++,rhs));
+ }
+ inline void create(Rhs const& rhs) {
+ create_nt(rhs);
+ create_rule(rhs);
+ assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
+ }
+
+ ~add_virtual_rules() {
+ append_rules();
+ }
+ void append_rules() {
+ // marginally more efficient
+ batched_append_swap(nts,new_nts);
+ batched_append_swap(rules,new_rules);
+ }
+ inline bool have(Rhs const& rhs,NTHandle &h) const {
+ return rhs2lhs.find(rhs)!=rhs2lhs.end();
+ }
+ //HACK: prevent this for instantiating for BinRhs. we need to use rule index because we'll be adding rules before we can update.
+ // returns 1 per replaced NT (0,1, or 2)
+ template <class RHSi>
+ int split_rhs(RHSi &rhs,bool only_free=false,bool only_reusing_1=false) {
+ //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key).
+ int n=rhs.size();
+ if (n<=2) return 0;
+ int longest1=0;
+ int mid=n/2;
+ int best_k=mid;
+ bool haver,havel;
+ NTHandle ntr,ntl;
+ NTHandle bestntr,bestntl;
+ WordID *b=&rhs.front(),*e=b+n;
+ for (int k=1;k<n-1;++k) {
+ WordID *wk=&rhs[k];
+ Rhs l(b,wk);
+ int rlen=n-k;
+ if (have(l,ntl)) {
+ Rhs r(wk,e);
+ if (have(r,ntr)) {
+ rhs.resize(2);
+ rhs[0]=ntl;
+ rhs[1]=ntr;
+ return 2;
+ } else if (k>longest1) {
+ longest1=k;
+ haver=false;
+ havel=true;
+ bestntl=ntl;
+ best_k=k;
+ }
+ } else if (rlen>=longest1) {
+ Rhs r(wk,e);
+ if (have(r,ntr)) {
+ longest1=rlen;
+ havel=false;
+ haver=true;
+ bestntr=ntr;
+ best_k=k;
+ }
+ }
+ }
+ if (only_free) {
+ if (havel) {
+ rhs.erase(rhs.begin()+1,rhs.begin()+best_k);
+ rhs[0]=-ntl;
+ } else if (haver) {
+ rhs.erase(rhs.begin()+best_k+1,rhs.end());
+ rhs[best_k]=-ntr;
+ } else
+ return 0;
+ return 1;
+ }
+ if (only_reusing_1 && longest1==0) return 0;
+ //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint? one will be useless to check for beating the longest single match so far. check that second.
+ WordID *best_wk=b+best_k;
+ Rhs l(b,best_wk);
+ Rhs r(best_wk,e);
+ //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)!
+ rhs.resize(2);
+ int nnt=newnt;
+ rhs[0]=-(havel?bestntl:nnt++);
+ rhs[1]=-(haver?bestntr:nnt);
+ // now that we've set rhs, we can actually safely add rules
+ if (!havel)
+ create(l);
+ if (!haver)
+ create(r);
+ return 2;
+ }
+};
+
+
+template <class Rhs>
+struct null_for;
+
+typedef CFG::BinRhs BinRhs;
+
+template <>
+struct null_for<BinRhs> {
+ static BinRhs null;
+};
+BinRhs null_for<BinRhs>::null(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
+
+template <>
+struct null_for<RHS> {
+ static RHS null;
+};
+RHS null_for<RHS>::null(1,std::numeric_limits<int>::min());
+
}//ns
+void CFG::BinarizeSplit(CFGBinarize const& b) {
+ add_virtual_rules<RHS> v(*this,b.bin_name_nts);
+ CFG_FOR_RULES(i,v.split_rhs(rules[i].rhs,false,false));
+#undef CFG_FOR_VIRT
+#define CFG_FOR_VIRT(r,expr) \
+ for (Rules::iterator ri=v.new_rules.begin(),e=v.new_rules.end();ri!=e;++ri) { \
+ Rule &r=*ri;expr; }
+
+ int n_changed_total=0;
+
+#define CFG_SPLIT_PASS(N,free,just1) \
+ for (int i=0;i<b.N;++i) { \
+ int n_changed=0; \
+ CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \
+ if (!n_changed) { \
+ break; \
+ } n_changed_total+=n_changed; }
+
+ CFG_SPLIT_PASS(split_passes,false,false)
+ CFG_SPLIT_PASS(split_share1_passes,false,true)
+ CFG_SPLIT_PASS(split_free_passes,true,false)
+
+
+ /*
+ for (int i=0;i<b.split_passes;++i)
+ CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,false));
+ for (int i=0;i<b.split_share1_passes;++i)
+ CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,true));
+ for (int i=0;i<b.split_free_passes;++i)
+ CFG_FOR_VIRT(r,v.split_rhs(r.rhs,true));
+ */
+}
+
void CFG::Binarize(CFGBinarize const& b) {
if (!b.Binarizing()) return;
- if (!b.bin_l2r) {
- assert(b.bin_l2r);
- return;
- }
- //TODO: l2r only so far:
cerr << "Binarizing "<<b<<endl;
+ if (b.bin_split)
+ BinarizeSplit(b);
+ if (b.bin_l2r)
+ BinarizeL2R(false,b.bin_name_nts);
+ if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order?
+ OrderNTsTopo();
+
+}
+
+void CFG::BinarizeL2R(bool bin_unary,bool name) {
+ add_virtual_rules<BinRhs> v(*this,name);
+cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary") <<endl;
HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.
HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);
// iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones.
- int rhsmin=b.bin_unary?0:1;
- NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation
- Rules new_rules;
+ int rhsmin=bin_unary?0:1;
+ //NTs new_nts;
+ //Rules new_rules;
//TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target)
- int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
- int newruleid=rules.size();
+// int newnt=nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
+// int newruleid=rules.size();
BinRhs bin;
- for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
+ CFG_FOR_RULES(ruleid,
+/* for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
NT const& nt=*n;
for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {
- RuleHandle ruleid=*ir;
- SHOW2(DBIN,ruleid,ShowRule(ruleid))
- RHS &rhs=rules[ruleid].rhs; // we're going to binarize this while adding newly created rules to new_...
+ RuleHandle ruleid=*ir;*/
+// SHOW2(DBIN,ruleid,ShowRule(ruleid));
+ Rule & rule=rules[ruleid];
+ RHS &rhs=rule.rhs; // we're going to binarize this while adding newly created rules to new_...
if (rhs.empty()) continue;
int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT
if (rhsmin<=r) { // means r>=0 also
bin.second=rhs[r+1];
int bin_to; // the replacement for bin
- assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size());
+// assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
// also true at start/end of loop:
for (;;) { // pairs from right to left (normally we leave the last pair alone)
-
bin.first=rhs[r];
- bin_to=get_default(bin2lhs,bin,newnt);
- SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
- if (newnt==bin_to) { // it's new!
+ bin_to=v.get_virt(bin);
+/* bin_to=get_default(bin2lhs,bin,v.newnt);
+// SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
+ if (v.newnt==bin_to) { // it's new!
new_nts.push_back(NT(newruleid++));
- //now -newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs
- new_rules.push_back(Rule(-newnt,bin));
- --newnt;
- if (b.bin_name_nts)
- new_nts.back().from.nt=BinName(bin,nts,new_nts);
+ //now newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs
+ new_rules.push_back(Rule(newnt,bin));
+ ++newnt;
+ if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts);
}
- bin.second=bin_to;
+*/
+ bin.second=-bin_to;
--r;
- if (r<rhsmin) break;
+ if (r<rhsmin) {
+ rhs[rhsmin]=-bin_to;
+ rhs.resize(rhsmin+1);
+ break;
+ }
}
- rhs[rhsmin]=bin_to;
- rhs.resize(rhsmin+1);
- }
+ })
+ /*
}
}
-#if 1
+ */
+#if 0
+ // marginally more efficient
batched_append_swap(nts,new_nts);
batched_append_swap(rules,new_rules);
-#else
+//#else
batched_append(nts,new_nts);
batched_append(rules,new_rules);
#endif
- if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order
- OrderNTsTopo();
}
namespace {
@@ -338,13 +561,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {
f.print_features(o,pushed_inside);
o<<'\n';
}
- for (int i=0;i<nts.size();++i) {
- Ruleids const& ntr=nts[i].ruleids;
- for (Ruleids::const_iterator j=ntr.begin(),jj=ntr.end();j!=jj;++j) {
- PrintRule(o,*j,f);
- o<<'\n';
- }
- }
+ CFG_FOR_RULES(i,PrintRule(o,i,f);o<<'\n';)
}
void CFG::Print(std::ostream &o) const {