1 files changed, 257 insertions, 40 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 3076c75e..0fbb6a03 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -11,8 +11,20 @@
 
 #define DUNIQ(x) x
 #define DBIN(x)
+#define DSP(x) x
+//SP:binarize by splitting.
 #define DCFG(x) IF_CFG_DEBUG(x)
 
+#undef CFG_FOR_RULES
+#define CFG_FOR_RULES(i,expr)                                    \
+  for (CFG::NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { \
+    NT const& nt=*n; \
+    for (CFG::Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {  \
+      RuleHandle i=*ir; \
+      expr; \
+    } \
+  }
+
 
 using namespace std;
 
@@ -178,75 +190,286 @@ string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
   return o.str();
 }
 
+string BinStr(RHS const& r,CFG::NTs const& N,CFG::NTs const& M)
+{
+  int nn=N.size();
+  ostringstream o;
+  for (int i=0,e=r.size();i!=e;++i) {
+    if (i)
+      o<<'+';
+    BinNameOWORD(r[i]);
+  }
+  return o.str();
+}
+
+
 WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
 {
   return TD::Convert(BinStr(b,N,M));
 }
 
+WordID BinName(RHS const& b,CFG::NTs const& N,CFG::NTs const& M)
+{
+  return TD::Convert(BinStr(b,N,M));
+}
+
+template <class Rhs>
+struct add_virtual_rules {
+  typedef CFG::RuleHandle RuleHandle;
+  typedef CFG::NTHandle NTHandle;
+  CFG::NTs &nts,new_nts;
+  CFG::Rules &rules, new_rules;
+// above will be appended at the end, so we don't have to worry about iterator invalidation
+  NTHandle newnt; //not negative.  TODO: i think we use it most often as negative.  either way, be careful.
+  RuleHandle newruleid;
+  HASH_MAP<Rhs,NTHandle,boost::hash<Rhs> > rhs2lhs;
+  bool name_nts;
+  add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) {
+    HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null);
+  }
+  NTHandle get_virt(Rhs const& r) {
+    NTHandle nt=get_default(rhs2lhs,r,newnt);
+    if (newnt==nt) {
+      create_nt(r);
+      create_rule(r);
+    }
+    return nt;
+  }
+  NTHandle get_nt(Rhs const& r) {
+    NTHandle nt=get_default(rhs2lhs,r,newnt);
+    if (newnt==nt) {
+      create(r);
+    }
+    return nt;
+  }
+  inline void set_nt_name(Rhs const& r) {
+    if (name_nts)
+      new_nts.back().from.nt=BinName(r,nts,new_nts);
+  }
+  inline void create_nt(Rhs const& rhs) {
+    new_nts.push_back(CFG::NT(newruleid++));
+    set_nt_name(rhs);
+  }
+  inline void create_rule(Rhs const& rhs) {
+    new_rules.push_back(CFG::Rule(newnt++,rhs));
+  }
+  inline void create(Rhs const& rhs) {
+    create_nt(rhs);
+    create_rule(rhs);
+    assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
+  }
+
+  ~add_virtual_rules() {
+    append_rules();
+  }
+  void append_rules() {
+    // marginally more efficient
+    batched_append_swap(nts,new_nts);
+    batched_append_swap(rules,new_rules);
+  }
+  inline bool have(Rhs const& rhs,NTHandle &h) const {
+    return rhs2lhs.find(rhs)!=rhs2lhs.end();
+  }
+  //HACK: prevent this for instantiating for BinRhs.  we need to use rule index because we'll be adding rules before we can update.
+  // returns 1 per replaced NT (0,1, or 2)
+  template <class RHSi>
+  int split_rhs(RHSi &rhs,bool only_free=false,bool only_reusing_1=false) {
+    //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key).
+    int n=rhs.size();
+    if (n<=2) return 0;
+    int longest1=0;
+    int mid=n/2;
+    int best_k=mid;
+    bool haver,havel;
+    NTHandle ntr,ntl;
+    NTHandle bestntr,bestntl;
+    WordID *b=&rhs.front(),*e=b+n;
+    for (int k=1;k<n-1;++k) {
+      WordID *wk=&rhs[k];
+      Rhs l(b,wk);
+      int rlen=n-k;
+      if (have(l,ntl)) {
+        Rhs r(wk,e);
+        if (have(r,ntr)) {
+          rhs.resize(2);
+          rhs[0]=ntl;
+          rhs[1]=ntr;
+          return 2;
+        } else if (k>longest1) {
+          longest1=k;
+          haver=false;
+          havel=true;
+          bestntl=ntl;
+          best_k=k;
+        }
+      } else if (rlen>=longest1) {
+        Rhs r(wk,e);
+        if (have(r,ntr)) {
+          longest1=rlen;
+          havel=false;
+          haver=true;
+          bestntr=ntr;
+          best_k=k;
+        }
+      }
+    }
+    if (only_free) {
+      if (havel) {
+        rhs.erase(rhs.begin()+1,rhs.begin()+best_k);
+        rhs[0]=-ntl;
+      } else if (haver) {
+        rhs.erase(rhs.begin()+best_k+1,rhs.end());
+        rhs[best_k]=-ntr;
+      } else
+        return 0;
+      return 1;
+    }
+    if (only_reusing_1 && longest1==0) return 0;
+    //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint?  one will be useless to check for beating the longest single match so far.  check that second.
+    WordID *best_wk=b+best_k;
+    Rhs l(b,best_wk);
+    Rhs r(best_wk,e);
+    //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)!
+    rhs.resize(2);
+    int nnt=newnt;
+    rhs[0]=-(havel?bestntl:nnt++);
+    rhs[1]=-(haver?bestntr:nnt);
+    // now that we've set rhs, we can actually safely add rules
+    if (!havel)
+      create(l);
+    if (!haver)
+      create(r);
+    return 2;
+  }
+};
+
+
+template <class Rhs>
+struct null_for;
+
+typedef CFG::BinRhs BinRhs;
+
+template <>
+struct null_for<BinRhs> {
+  static BinRhs null;
+};
+BinRhs null_for<BinRhs>::null(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
+
+template <>
+struct null_for<RHS> {
+  static RHS null;
+};
+RHS null_for<RHS>::null(1,std::numeric_limits<int>::min());
+
 }//ns
 
+void CFG::BinarizeSplit(CFGBinarize const& b) {
+  add_virtual_rules<RHS> v(*this,b.bin_name_nts);
+  CFG_FOR_RULES(i,v.split_rhs(rules[i].rhs,false,false));
+#undef CFG_FOR_VIRT
+#define CFG_FOR_VIRT(r,expr)                                                 \
+  for (Rules::iterator ri=v.new_rules.begin(),e=v.new_rules.end();ri!=e;++ri) { \
+    Rule &r=*ri;expr;  }
+
+  int n_changed_total=0;
+
+#define CFG_SPLIT_PASS(N,free,just1) \
+  for (int i=0;i<b.N;++i) { \
+    int n_changed=0; \
+    CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \
+    if (!n_changed) { \
+      break; \
+    } n_changed_total+=n_changed; }
+
+  CFG_SPLIT_PASS(split_passes,false,false)
+  CFG_SPLIT_PASS(split_share1_passes,false,true)
+  CFG_SPLIT_PASS(split_free_passes,true,false)
+
+
+    /*
+  for (int i=0;i<b.split_passes;++i)
+    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,false));
+  for (int i=0;i<b.split_share1_passes;++i)
+    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,true));
+  for (int i=0;i<b.split_free_passes;++i)
+    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,true));
+    */
+}
+
 void CFG::Binarize(CFGBinarize const& b) {
   if (!b.Binarizing()) return;
-  if (!b.bin_l2r) {
-    assert(b.bin_l2r);
-    return;
-  }
-  //TODO: l2r only so far:
   cerr << "Binarizing "<<b<<endl;
+  if (b.bin_split)
+    BinarizeSplit(b);
+  if (b.bin_l2r)
+    BinarizeL2R(false,b.bin_name_nts);
+  if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order?
+    OrderNTsTopo();
+
+}
+
+void CFG::BinarizeL2R(bool bin_unary,bool name) {
+  add_virtual_rules<BinRhs> v(*this,name);
+cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary") <<endl;
   HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.
   HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);
   // iterate using indices and not iterators because we'll be adding to both nts and rules list?  we could instead pessimistically reserve space for both, but this is simpler.  also: store original end of nts since we won't need to reprocess newly added ones.
-  int rhsmin=b.bin_unary?0:1;
-  NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation
-  Rules new_rules;
+  int rhsmin=bin_unary?0:1;
+  //NTs new_nts;
+  //Rules new_rules;
   //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target)
-  int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
-  int newruleid=rules.size();
+//  int newnt=nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
+//  int newruleid=rules.size();
   BinRhs bin;
-  for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
+  CFG_FOR_RULES(ruleid,
+/*  for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
     NT const& nt=*n;
     for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {
-      RuleHandle ruleid=*ir;
-      SHOW2(DBIN,ruleid,ShowRule(ruleid))
-      RHS &rhs=rules[ruleid].rhs; // we're going to binarize this while adding newly created rules to new_...
+    RuleHandle ruleid=*ir;*/
+//      SHOW2(DBIN,ruleid,ShowRule(ruleid));
+                Rule & rule=rules[ruleid];
+      RHS &rhs=rule.rhs; // we're going to binarize this while adding newly created rules to new_...
       if (rhs.empty()) continue;
       int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT
       if (rhsmin<=r) { // means r>=0 also
         bin.second=rhs[r+1];
         int bin_to; // the replacement for bin
-        assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size());
+//        assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
         // also true at start/end of loop:
         for (;;) { // pairs from right to left (normally we leave the last pair alone)
-
           bin.first=rhs[r];
-          bin_to=get_default(bin2lhs,bin,newnt);
-          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
-          if (newnt==bin_to) { // it's new!
+          bin_to=v.get_virt(bin);
+/*          bin_to=get_default(bin2lhs,bin,v.newnt);
+//          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
+          if (v.newnt==bin_to) { // it's new!
             new_nts.push_back(NT(newruleid++));
-            //now -newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs
-            new_rules.push_back(Rule(-newnt,bin));
-            --newnt;
-            if (b.bin_name_nts)
-              new_nts.back().from.nt=BinName(bin,nts,new_nts);
+            //now newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs
+            new_rules.push_back(Rule(newnt,bin));
+            ++newnt;
+            if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts);
           }
-          bin.second=bin_to;
+*/
+          bin.second=-bin_to;
           --r;
-          if (r<rhsmin) break;
+          if (r<rhsmin) {
+            rhs[rhsmin]=-bin_to;
+            rhs.resize(rhsmin+1);
+            break;
+          }
         }
-        rhs[rhsmin]=bin_to;
-        rhs.resize(rhsmin+1);
-      }
+      })
+                /*
     }
   }
-#if 1
+                */
+#if 0
+  // marginally more efficient
   batched_append_swap(nts,new_nts);
   batched_append_swap(rules,new_rules);
-#else
+//#else
   batched_append(nts,new_nts);
   batched_append(rules,new_rules);
 #endif
-  if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order
-    OrderNTsTopo();
 }
 
 namespace {
@@ -338,13 +561,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {
       f.print_features(o,pushed_inside);
     o<<'\n';
   }
-  for (int i=0;i<nts.size();++i) {
-    Ruleids const& ntr=nts[i].ruleids;
-    for (Ruleids::const_iterator j=ntr.begin(),jj=ntr.end();j!=jj;++j) {
-      PrintRule(o,*j,f);
-      o<<'\n';
-    }
-  }
+  CFG_FOR_RULES(i,PrintRule(o,i,f);o<<'\n';)
 }
 
 void CFG::Print(std::ostream &o) const {