greedy binarization - needs testing, may have broke l2r

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@560 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-16 09:11:03 +0000
committer: graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-16 09:11:03 +0000
commit: 6cc769d102bfcf87822ceeb499cf45ff1e79e5f6 (patch)
tree: 5830cab16e60895832100206f962bfbc533fd860 /decoder
parent: 63531c9c25d1995e483c0a037518b5caa58fbb2c (diff)
4 files changed, 286 insertions, 51 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 3076c75e..0fbb6a03 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -11,8 +11,20 @@
 
 #define DUNIQ(x) x
 #define DBIN(x)
+#define DSP(x) x
+//SP:binarize by splitting.
 #define DCFG(x) IF_CFG_DEBUG(x)
 
+#undef CFG_FOR_RULES
+#define CFG_FOR_RULES(i,expr)                                    \
+  for (CFG::NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { \
+    NT const& nt=*n; \
+    for (CFG::Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {  \
+      RuleHandle i=*ir; \
+      expr; \
+    } \
+  }
+
 
 using namespace std;
 
@@ -178,75 +190,286 @@ string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
   return o.str();
 }
 
+string BinStr(RHS const& r,CFG::NTs const& N,CFG::NTs const& M)
+{
+  int nn=N.size();
+  ostringstream o;
+  for (int i=0,e=r.size();i!=e;++i) {
+    if (i)
+      o<<'+';
+    BinNameOWORD(r[i]);
+  }
+  return o.str();
+}
+
+
 WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
 {
   return TD::Convert(BinStr(b,N,M));
 }
 
+WordID BinName(RHS const& b,CFG::NTs const& N,CFG::NTs const& M)
+{
+  return TD::Convert(BinStr(b,N,M));
+}
+
+template <class Rhs>
+struct add_virtual_rules {
+  typedef CFG::RuleHandle RuleHandle;
+  typedef CFG::NTHandle NTHandle;
+  CFG::NTs &nts,new_nts;
+  CFG::Rules &rules, new_rules;
+// above will be appended at the end, so we don't have to worry about iterator invalidation
+  NTHandle newnt; //not negative.  TODO: i think we use it most often as negative.  either way, be careful.
+  RuleHandle newruleid;
+  HASH_MAP<Rhs,NTHandle,boost::hash<Rhs> > rhs2lhs;
+  bool name_nts;
+  add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) {
+    HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null);
+  }
+  NTHandle get_virt(Rhs const& r) {
+    NTHandle nt=get_default(rhs2lhs,r,newnt);
+    if (newnt==nt) {
+      create_nt(r);
+      create_rule(r);
+    }
+    return nt;
+  }
+  NTHandle get_nt(Rhs const& r) {
+    NTHandle nt=get_default(rhs2lhs,r,newnt);
+    if (newnt==nt) {
+      create(r);
+    }
+    return nt;
+  }
+  inline void set_nt_name(Rhs const& r) {
+    if (name_nts)
+      new_nts.back().from.nt=BinName(r,nts,new_nts);
+  }
+  inline void create_nt(Rhs const& rhs) {
+    new_nts.push_back(CFG::NT(newruleid++));
+    set_nt_name(rhs);
+  }
+  inline void create_rule(Rhs const& rhs) {
+    new_rules.push_back(CFG::Rule(newnt++,rhs));
+  }
+  inline void create(Rhs const& rhs) {
+    create_nt(rhs);
+    create_rule(rhs);
+    assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
+  }
+
+  ~add_virtual_rules() {
+    append_rules();
+  }
+  void append_rules() {
+    // marginally more efficient
+    batched_append_swap(nts,new_nts);
+    batched_append_swap(rules,new_rules);
+  }
+  inline bool have(Rhs const& rhs,NTHandle &h) const {
+    return rhs2lhs.find(rhs)!=rhs2lhs.end();
+  }
+  //HACK: prevent this for instantiating for BinRhs.  we need to use rule index because we'll be adding rules before we can update.
+  // returns 1 per replaced NT (0,1, or 2)
+  template <class RHSi>
+  int split_rhs(RHSi &rhs,bool only_free=false,bool only_reusing_1=false) {
+    //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key).
+    int n=rhs.size();
+    if (n<=2) return 0;
+    int longest1=0;
+    int mid=n/2;
+    int best_k=mid;
+    bool haver,havel;
+    NTHandle ntr,ntl;
+    NTHandle bestntr,bestntl;
+    WordID *b=&rhs.front(),*e=b+n;
+    for (int k=1;k<n-1;++k) {
+      WordID *wk=&rhs[k];
+      Rhs l(b,wk);
+      int rlen=n-k;
+      if (have(l,ntl)) {
+        Rhs r(wk,e);
+        if (have(r,ntr)) {
+          rhs.resize(2);
+          rhs[0]=ntl;
+          rhs[1]=ntr;
+          return 2;
+        } else if (k>longest1) {
+          longest1=k;
+          haver=false;
+          havel=true;
+          bestntl=ntl;
+          best_k=k;
+        }
+      } else if (rlen>=longest1) {
+        Rhs r(wk,e);
+        if (have(r,ntr)) {
+          longest1=rlen;
+          havel=false;
+          haver=true;
+          bestntr=ntr;
+          best_k=k;
+        }
+      }
+    }
+    if (only_free) {
+      if (havel) {
+        rhs.erase(rhs.begin()+1,rhs.begin()+best_k);
+        rhs[0]=-ntl;
+      } else if (haver) {
+        rhs.erase(rhs.begin()+best_k+1,rhs.end());
+        rhs[best_k]=-ntr;
+      } else
+        return 0;
+      return 1;
+    }
+    if (only_reusing_1 && longest1==0) return 0;
+    //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint?  one will be useless to check for beating the longest single match so far.  check that second.
+    WordID *best_wk=b+best_k;
+    Rhs l(b,best_wk);
+    Rhs r(best_wk,e);
+    //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)!
+    rhs.resize(2);
+    int nnt=newnt;
+    rhs[0]=-(havel?bestntl:nnt++);
+    rhs[1]=-(haver?bestntr:nnt);
+    // now that we've set rhs, we can actually safely add rules
+    if (!havel)
+      create(l);
+    if (!haver)
+      create(r);
+    return 2;
+  }
+};
+
+
+template <class Rhs>
+struct null_for;
+
+typedef CFG::BinRhs BinRhs;
+
+template <>
+struct null_for<BinRhs> {
+  static BinRhs null;
+};
+BinRhs null_for<BinRhs>::null(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
+
+template <>
+struct null_for<RHS> {
+  static RHS null;
+};
+RHS null_for<RHS>::null(1,std::numeric_limits<int>::min());
+
 }//ns
 
+void CFG::BinarizeSplit(CFGBinarize const& b) {
+  add_virtual_rules<RHS> v(*this,b.bin_name_nts);
+  CFG_FOR_RULES(i,v.split_rhs(rules[i].rhs,false,false));
+#undef CFG_FOR_VIRT
+#define CFG_FOR_VIRT(r,expr)                                                 \
+  for (Rules::iterator ri=v.new_rules.begin(),e=v.new_rules.end();ri!=e;++ri) { \
+    Rule &r=*ri;expr;  }
+
+  int n_changed_total=0;
+
+#define CFG_SPLIT_PASS(N,free,just1) \
+  for (int i=0;i<b.N;++i) { \
+    int n_changed=0; \
+    CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \
+    if (!n_changed) { \
+      break; \
+    } n_changed_total+=n_changed; }
+
+  CFG_SPLIT_PASS(split_passes,false,false)
+  CFG_SPLIT_PASS(split_share1_passes,false,true)
+  CFG_SPLIT_PASS(split_free_passes,true,false)
+
+
+    /*
+  for (int i=0;i<b.split_passes;++i)
+    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,false));
+  for (int i=0;i<b.split_share1_passes;++i)
+    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,true));
+  for (int i=0;i<b.split_free_passes;++i)
+    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,true));
+    */
+}
+
 void CFG::Binarize(CFGBinarize const& b) {
   if (!b.Binarizing()) return;
-  if (!b.bin_l2r) {
-    assert(b.bin_l2r);
-    return;
-  }
-  //TODO: l2r only so far:
   cerr << "Binarizing "<<b<<endl;
+  if (b.bin_split)
+    BinarizeSplit(b);
+  if (b.bin_l2r)
+    BinarizeL2R(false,b.bin_name_nts);
+  if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order?
+    OrderNTsTopo();
+
+}
+
+void CFG::BinarizeL2R(bool bin_unary,bool name) {
+  add_virtual_rules<BinRhs> v(*this,name);
+cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary") <<endl;
   HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.
   HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);
   // iterate using indices and not iterators because we'll be adding to both nts and rules list?  we could instead pessimistically reserve space for both, but this is simpler.  also: store original end of nts since we won't need to reprocess newly added ones.
-  int rhsmin=b.bin_unary?0:1;
-  NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation
-  Rules new_rules;
+  int rhsmin=bin_unary?0:1;
+  //NTs new_nts;
+  //Rules new_rules;
   //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target)
-  int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
-  int newruleid=rules.size();
+//  int newnt=nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
+//  int newruleid=rules.size();
   BinRhs bin;
-  for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
+  CFG_FOR_RULES(ruleid,
+/*  for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
     NT const& nt=*n;
     for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {
-      RuleHandle ruleid=*ir;
-      SHOW2(DBIN,ruleid,ShowRule(ruleid))
-      RHS &rhs=rules[ruleid].rhs; // we're going to binarize this while adding newly created rules to new_...
+    RuleHandle ruleid=*ir;*/
+//      SHOW2(DBIN,ruleid,ShowRule(ruleid));
+                Rule & rule=rules[ruleid];
+      RHS &rhs=rule.rhs; // we're going to binarize this while adding newly created rules to new_...
       if (rhs.empty()) continue;
       int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT
       if (rhsmin<=r) { // means r>=0 also
         bin.second=rhs[r+1];
         int bin_to; // the replacement for bin
-        assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size());
+//        assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
         // also true at start/end of loop:
         for (;;) { // pairs from right to left (normally we leave the last pair alone)
-
           bin.first=rhs[r];
-          bin_to=get_default(bin2lhs,bin,newnt);
-          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
-          if (newnt==bin_to) { // it's new!
+          bin_to=v.get_virt(bin);
+/*          bin_to=get_default(bin2lhs,bin,v.newnt);
+//          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
+          if (v.newnt==bin_to) { // it's new!
             new_nts.push_back(NT(newruleid++));
-            //now -newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs
-            new_rules.push_back(Rule(-newnt,bin));
-            --newnt;
-            if (b.bin_name_nts)
-              new_nts.back().from.nt=BinName(bin,nts,new_nts);
+            //now newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs
+            new_rules.push_back(Rule(newnt,bin));
+            ++newnt;
+            if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts);
           }
-          bin.second=bin_to;
+*/
+          bin.second=-bin_to;
           --r;
-          if (r<rhsmin) break;
+          if (r<rhsmin) {
+            rhs[rhsmin]=-bin_to;
+            rhs.resize(rhsmin+1);
+            break;
+          }
         }
-        rhs[rhsmin]=bin_to;
-        rhs.resize(rhsmin+1);
-      }
+      })
+                /*
     }
   }
-#if 1
+                */
+#if 0
+  // marginally more efficient
   batched_append_swap(nts,new_nts);
   batched_append_swap(rules,new_rules);
-#else
+//#else
   batched_append(nts,new_nts);
   batched_append(rules,new_rules);
 #endif
-  if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order
-    OrderNTsTopo();
 }
 
 namespace {
@@ -338,13 +561,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {
       f.print_features(o,pushed_inside);
     o<<'\n';
   }
-  for (int i=0;i<nts.size();++i) {
-    Ruleids const& ntr=nts[i].ruleids;
-    for (Ruleids::const_iterator j=ntr.begin(),jj=ntr.end();j!=jj;++j) {
-      PrintRule(o,*j,f);
-      o<<'\n';
-    }
-  }
+  CFG_FOR_RULES(i,PrintRule(o,i,f);o<<'\n';)
 }
 
 void CFG::Print(std::ostream &o) const {
diff --git a/decoder/cfg.h b/decoder/cfg.h
index 90a5dd5a..5a418234 100755
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
@@ -92,6 +92,8 @@ struct CFG {
       rhs[0]=binrhs.first;
       rhs[1]=binrhs.second;
     }
+    Rule(int lhs,RHS const& rhs) : lhs(lhs),rhs(rhs),p(1) {
+    }
 
     int lhs; // index into nts
     RHS rhs;
@@ -302,7 +304,9 @@ struct CFG {
     ReorderNTs(o);
   }
 
+  void BinarizeL2R(bool bin_unary=false,bool name_nts=false);
   void Binarize(CFGBinarize const& binarize_options); // see cfg_binarize.h for docs
+  void BinarizeSplit(CFGBinarize const& binarize_options); // there may be many options affecting split.
 
   typedef std::vector<NT> NTs;
   NTs nts;
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index 82c4dd1a..3aba5e9f 100755
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
@@ -14,39 +14,50 @@
  */
 
 struct CFGBinarize {
-  int bin_at;
+  int bin_thresh;
   bool bin_l2r;
-  bool bin_unary;
+  int bin_unary;
   bool bin_name_nts;
   bool bin_topo;
+  bool bin_split;
+  int split_passes,split_share1_passes,split_free_passes;
   template <class Opts> // template to support both printable_opts and boost nonprintable
   void AddOptions(Opts *opts) {
     opts->add_options()
-      ("cfg_binarize_at", defaulted_value(&bin_at),"(if >0) binarize CFG rhs segments which appear at least this many times")
-      ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")
+      ("cfg_binarize_threshold", defaulted_value(&bin_thresh),"(if >0) repeatedly binarize CFG rhs bigrams which appear at least this many times, most frequent first.  resulting rules may be 1,2, or >2-ary.  this happens before the other types of binarization.")
+//      ("cfg_binarize_unary_threshold", defaulted_value(&bin_unary),"if >0, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least this many times.  this happens last.")
+      ("cfg_binarize_greedy_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created")
+      ("cfg_split_full_passes", defaulted_value(&split_passes),"pass through the virtual rules only (up to) this many times (all real rules will have been split if not already binary)")
+      ("cfg_split_share1_passes", defaulted_value(&split_share1_passes),"after the full passes, for up to this many times split when at least 1 of the items has been seen before")
+      ("cfg_split_free_passes", defaulted_value(&split_free_passes),"only split off from virtual nts pre/post nts that already exist - could check for interior phrases but after a few splits everything should be tiny already.")
       ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")
       ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")
       ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order).  otherwise the virtual NTs will all appear after the regular NTs")
     ;
   }
   void Validate() {
-    if (bin_l2r)
-      bin_at=0;
-    if (bin_at>0&&!bin_l2r) {
+    if (bin_thresh>0&&!bin_l2r) {
       std::cerr<<"\nWARNING: greedy binarization not yet supported; using l2r (right branching) instead.\n";
       bin_l2r=true;
     }
+    if (false && bin_l2r && bin_split) { // actually, split may be slightly incomplete due to finite number of passes.
+      std::cerr<<"\nWARNING: l2r and split are both complete binarization and redundant.  Using split.\n";
+      bin_l2r=false;
+    }
+
   }
 
   bool Binarizing() const {
-    return bin_l2r || bin_at>0;
+    return bin_split || bin_l2r || bin_thresh>0;
   }
   void set_defaults() {
+    bin_split=false;
     bin_topo=false;
-    bin_at=0;
-    bin_unary=false;
+    bin_thresh=0;
+    bin_unary=0;
     bin_name_nts=true;
     bin_l2r=false;
+    split_passes=10;split_share1_passes=0;split_free_passes=10;
   }
   CFGBinarize() { set_defaults(); }
   void print(std::ostream &o) const {
@@ -56,10 +67,12 @@ struct CFGBinarize {
     else {
       if (bin_unary)
         o << "unary-sharing ";
+      if (bin_thresh)
+        o<<"greedy bigram count>="<<bin_thresh<<" ";
       if (bin_l2r)
         o << "left->right";
       else
-        o << "greedy count>="<<bin_at;
+        o << "DeNero greedy split";
       if (bin_name_nts)
         o << " named-NTs";
       if (bin_topo)
diff --git a/decoder/hg.h b/decoder/hg.h
index 03221c74..074213ac 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -72,6 +72,7 @@ public:
   // TODO get rid of edge_prob_? (can be computed on the fly as the dot
   // product of the weight vector and the feature values)
   struct Edge {
+//    int poplimit; //TODO: cube pruning per edge limit?  per node didn't work well at all.
     Edge() : i_(-1), j_(-1), prev_i_(-1), prev_j_(-1) {}
     Edge(int id,Edge const& copy_pod_from) : id_(id) { copy_pod(copy_pod_from); } // call copy_features yourself later.
     Edge(int id,Edge const& copy_from,TailNodeVector const& tail) // fully inits - probably more expensive when push_back(Edge(...)) than setting after
author	graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-16 09:11:03 +0000
committer	graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-16 09:11:03 +0000
commit	6cc769d102bfcf87822ceeb499cf45ff1e79e5f6 (patch)
tree	5830cab16e60895832100206f962bfbc533fd860 /decoder
parent	63531c9c25d1995e483c0a037518b5caa58fbb2c (diff)