diff options
Diffstat (limited to 'decoder/cfg.cc')
| -rwxr-xr-x | decoder/cfg.cc | 297 | 
1 files changed, 257 insertions, 40 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 3076c75e..0fbb6a03 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -11,8 +11,20 @@  #define DUNIQ(x) x  #define DBIN(x) +#define DSP(x) x +//SP:binarize by splitting.  #define DCFG(x) IF_CFG_DEBUG(x) +#undef CFG_FOR_RULES +#define CFG_FOR_RULES(i,expr)                                    \ +  for (CFG::NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { \ +    NT const& nt=*n; \ +    for (CFG::Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {  \ +      RuleHandle i=*ir; \ +      expr; \ +    } \ +  } +  using namespace std; @@ -178,75 +190,286 @@ string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)    return o.str();  } +string BinStr(RHS const& r,CFG::NTs const& N,CFG::NTs const& M) +{ +  int nn=N.size(); +  ostringstream o; +  for (int i=0,e=r.size();i!=e;++i) { +    if (i) +      o<<'+'; +    BinNameOWORD(r[i]); +  } +  return o.str(); +} + +  WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)  {    return TD::Convert(BinStr(b,N,M));  } +WordID BinName(RHS const& b,CFG::NTs const& N,CFG::NTs const& M) +{ +  return TD::Convert(BinStr(b,N,M)); +} + +template <class Rhs> +struct add_virtual_rules { +  typedef CFG::RuleHandle RuleHandle; +  typedef CFG::NTHandle NTHandle; +  CFG::NTs &nts,new_nts; +  CFG::Rules &rules, new_rules; +// above will be appended at the end, so we don't have to worry about iterator invalidation +  NTHandle newnt; //not negative.  TODO: i think we use it most often as negative.  either way, be careful. +  RuleHandle newruleid; +  HASH_MAP<Rhs,NTHandle,boost::hash<Rhs> > rhs2lhs; +  bool name_nts; +  add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) { +    HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null); +  } +  NTHandle get_virt(Rhs const& r) { +    NTHandle nt=get_default(rhs2lhs,r,newnt); +    if (newnt==nt) { +      create_nt(r); +      create_rule(r); +    } +    return nt; +  } +  NTHandle get_nt(Rhs const& r) { +    NTHandle nt=get_default(rhs2lhs,r,newnt); +    if (newnt==nt) { +      create(r); +    } +    return nt; +  } +  inline void set_nt_name(Rhs const& r) { +    if (name_nts) +      new_nts.back().from.nt=BinName(r,nts,new_nts); +  } +  inline void create_nt(Rhs const& rhs) { +    new_nts.push_back(CFG::NT(newruleid++)); +    set_nt_name(rhs); +  } +  inline void create_rule(Rhs const& rhs) { +    new_rules.push_back(CFG::Rule(newnt++,rhs)); +  } +  inline void create(Rhs const& rhs) { +    create_nt(rhs); +    create_rule(rhs); +    assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size()); +  } + +  ~add_virtual_rules() { +    append_rules(); +  } +  void append_rules() { +    // marginally more efficient +    batched_append_swap(nts,new_nts); +    batched_append_swap(rules,new_rules); +  } +  inline bool have(Rhs const& rhs,NTHandle &h) const { +    return rhs2lhs.find(rhs)!=rhs2lhs.end(); +  } +  //HACK: prevent this for instantiating for BinRhs.  we need to use rule index because we'll be adding rules before we can update. +  // returns 1 per replaced NT (0,1, or 2) +  template <class RHSi> +  int split_rhs(RHSi &rhs,bool only_free=false,bool only_reusing_1=false) { +    //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key). +    int n=rhs.size(); +    if (n<=2) return 0; +    int longest1=0; +    int mid=n/2; +    int best_k=mid; +    bool haver,havel; +    NTHandle ntr,ntl; +    NTHandle bestntr,bestntl; +    WordID *b=&rhs.front(),*e=b+n; +    for (int k=1;k<n-1;++k) { +      WordID *wk=&rhs[k]; +      Rhs l(b,wk); +      int rlen=n-k; +      if (have(l,ntl)) { +        Rhs r(wk,e); +        if (have(r,ntr)) { +          rhs.resize(2); +          rhs[0]=ntl; +          rhs[1]=ntr; +          return 2; +        } else if (k>longest1) { +          longest1=k; +          haver=false; +          havel=true; +          bestntl=ntl; +          best_k=k; +        } +      } else if (rlen>=longest1) { +        Rhs r(wk,e); +        if (have(r,ntr)) { +          longest1=rlen; +          havel=false; +          haver=true; +          bestntr=ntr; +          best_k=k; +        } +      } +    } +    if (only_free) { +      if (havel) { +        rhs.erase(rhs.begin()+1,rhs.begin()+best_k); +        rhs[0]=-ntl; +      } else if (haver) { +        rhs.erase(rhs.begin()+best_k+1,rhs.end()); +        rhs[best_k]=-ntr; +      } else +        return 0; +      return 1; +    } +    if (only_reusing_1 && longest1==0) return 0; +    //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint?  one will be useless to check for beating the longest single match so far.  check that second. +    WordID *best_wk=b+best_k; +    Rhs l(b,best_wk); +    Rhs r(best_wk,e); +    //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)! +    rhs.resize(2); +    int nnt=newnt; +    rhs[0]=-(havel?bestntl:nnt++); +    rhs[1]=-(haver?bestntr:nnt); +    // now that we've set rhs, we can actually safely add rules +    if (!havel) +      create(l); +    if (!haver) +      create(r); +    return 2; +  } +}; + + +template <class Rhs> +struct null_for; + +typedef CFG::BinRhs BinRhs; + +template <> +struct null_for<BinRhs> { +  static BinRhs null; +}; +BinRhs null_for<BinRhs>::null(std::numeric_limits<int>::min(),std::numeric_limits<int>::min()); + +template <> +struct null_for<RHS> { +  static RHS null; +}; +RHS null_for<RHS>::null(1,std::numeric_limits<int>::min()); +  }//ns +void CFG::BinarizeSplit(CFGBinarize const& b) { +  add_virtual_rules<RHS> v(*this,b.bin_name_nts); +  CFG_FOR_RULES(i,v.split_rhs(rules[i].rhs,false,false)); +#undef CFG_FOR_VIRT +#define CFG_FOR_VIRT(r,expr)                                                 \ +  for (Rules::iterator ri=v.new_rules.begin(),e=v.new_rules.end();ri!=e;++ri) { \ +    Rule &r=*ri;expr;  } + +  int n_changed_total=0; + +#define CFG_SPLIT_PASS(N,free,just1) \ +  for (int i=0;i<b.N;++i) { \ +    int n_changed=0; \ +    CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \ +    if (!n_changed) { \ +      break; \ +    } n_changed_total+=n_changed; } + +  CFG_SPLIT_PASS(split_passes,false,false) +  CFG_SPLIT_PASS(split_share1_passes,false,true) +  CFG_SPLIT_PASS(split_free_passes,true,false) + + +    /* +  for (int i=0;i<b.split_passes;++i) +    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,false)); +  for (int i=0;i<b.split_share1_passes;++i) +    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,true)); +  for (int i=0;i<b.split_free_passes;++i) +    CFG_FOR_VIRT(r,v.split_rhs(r.rhs,true)); +    */ +} +  void CFG::Binarize(CFGBinarize const& b) {    if (!b.Binarizing()) return; -  if (!b.bin_l2r) { -    assert(b.bin_l2r); -    return; -  } -  //TODO: l2r only so far:    cerr << "Binarizing "<<b<<endl; +  if (b.bin_split) +    BinarizeSplit(b); +  if (b.bin_l2r) +    BinarizeL2R(false,b.bin_name_nts); +  if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order? +    OrderNTsTopo(); + +} + +void CFG::BinarizeL2R(bool bin_unary,bool name) { +  add_virtual_rules<BinRhs> v(*this,name); +cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary") <<endl;    HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.    HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);    // iterate using indices and not iterators because we'll be adding to both nts and rules list?  we could instead pessimistically reserve space for both, but this is simpler.  also: store original end of nts since we won't need to reprocess newly added ones. -  int rhsmin=b.bin_unary?0:1; -  NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation -  Rules new_rules; +  int rhsmin=bin_unary?0:1; +  //NTs new_nts; +  //Rules new_rules;    //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) -  int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0) -  int newruleid=rules.size(); +//  int newnt=nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0) +//  int newruleid=rules.size();    BinRhs bin; -  for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { +  CFG_FOR_RULES(ruleid, +/*  for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {      NT const& nt=*n;      for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { -      RuleHandle ruleid=*ir; -      SHOW2(DBIN,ruleid,ShowRule(ruleid)) -      RHS &rhs=rules[ruleid].rhs; // we're going to binarize this while adding newly created rules to new_... +    RuleHandle ruleid=*ir;*/ +//      SHOW2(DBIN,ruleid,ShowRule(ruleid)); +                Rule & rule=rules[ruleid]; +      RHS &rhs=rule.rhs; // we're going to binarize this while adding newly created rules to new_...        if (rhs.empty()) continue;        int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT        if (rhsmin<=r) { // means r>=0 also          bin.second=rhs[r+1];          int bin_to; // the replacement for bin -        assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); +//        assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());          // also true at start/end of loop:          for (;;) { // pairs from right to left (normally we leave the last pair alone) -            bin.first=rhs[r]; -          bin_to=get_default(bin2lhs,bin,newnt); -          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to); -          if (newnt==bin_to) { // it's new! +          bin_to=v.get_virt(bin); +/*          bin_to=get_default(bin2lhs,bin,v.newnt); +//          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to); +          if (v.newnt==bin_to) { // it's new!              new_nts.push_back(NT(newruleid++)); -            //now -newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs -            new_rules.push_back(Rule(-newnt,bin)); -            --newnt; -            if (b.bin_name_nts) -              new_nts.back().from.nt=BinName(bin,nts,new_nts); +            //now newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs +            new_rules.push_back(Rule(newnt,bin)); +            ++newnt; +            if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts);            } -          bin.second=bin_to; +*/ +          bin.second=-bin_to;            --r; -          if (r<rhsmin) break; +          if (r<rhsmin) { +            rhs[rhsmin]=-bin_to; +            rhs.resize(rhsmin+1); +            break; +          }          } -        rhs[rhsmin]=bin_to; -        rhs.resize(rhsmin+1); -      } +      }) +                /*      }    } -#if 1 +                */ +#if 0 +  // marginally more efficient    batched_append_swap(nts,new_nts);    batched_append_swap(rules,new_rules); -#else +//#else    batched_append(nts,new_nts);    batched_append(rules,new_rules);  #endif -  if (b.bin_topo) //TODO: more efficient (at least for l2r) maintenance of order -    OrderNTsTopo();  }  namespace { @@ -338,13 +561,7 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {        f.print_features(o,pushed_inside);      o<<'\n';    } -  for (int i=0;i<nts.size();++i) { -    Ruleids const& ntr=nts[i].ruleids; -    for (Ruleids::const_iterator j=ntr.begin(),jj=ntr.end();j!=jj;++j) { -      PrintRule(o,*j,f); -      o<<'\n'; -    } -  } +  CFG_FOR_RULES(i,PrintRule(o,i,f);o<<'\n';)  }  void CFG::Print(std::ostream &o) const {  | 
