diff options
| author | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-17 08:08:37 +0000 | 
|---|---|---|
| committer | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-17 08:08:37 +0000 | 
| commit | e24dd873186d90f0cd80abaa5bc5623baffe340d (patch) | |
| tree | 03368659df6958fb833e9d54a4492ccbfe53cb97 | |
| parent | feb4caa70c519053332e94790225ff69a124d490 (diff) | |
--cfg-binarize-split=1 fixes
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@568 ec762483-ff6d-05da-a07a-a48fb63a330f
| -rwxr-xr-x | decoder/cfg.cc | 143 | ||||
| -rwxr-xr-x | utils/show.h | 7 | 
2 files changed, 113 insertions, 37 deletions
| diff --git a/decoder/cfg.cc b/decoder/cfg.cc index a0c00e55..f51da9bf 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -10,7 +10,7 @@  #include "show.h"  #define DUNIQ(x) x -#define DBIN(x) +#define DBIN(x) x  #define DSP(x) x  //SP:binarize by splitting.  #define DCFG(x) IF_CFG_DEBUG(x) @@ -262,9 +262,16 @@ struct add_virtual_rules {      set_nt_name(rhs);    }    inline void create_rule(Rhs const& rhs) { -    new_rules.push_back(CFG::Rule(newnt--,rhs)); +    new_rules.push_back(CFG::Rule(-newnt,rhs)); +    --newnt; +  } +  inline void create_adding(Rhs const& rhs) { +    NTHandle nt=get_default(rhs2lhs,rhs,newnt); +    assert(nt==newnt); +    create(rhs);    }    inline void create(Rhs const& rhs) { +    SHOWP(DSP,"Create ") SHOW3(DSP,newnt,newruleid,BinStr(rhs,nts,new_nts))      create_nt(rhs);      create_rule(rhs);      assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); @@ -291,76 +298,138 @@ struct add_virtual_rules {    }    //HACK: prevent this for instantiating for BinRhs.  we need to use rule index because we'll be adding rules before we can update.    // returns 1 per replaced NT (0,1, or 2) +  inline std::string Str(Rhs const& rhs) const { +    return BinStr(rhs,nts,new_nts); +  } +    template <class RHSi>    int split_rhs(RHSi &rhs,bool only_free=false,bool only_reusing_1=false) { +    typedef WordID const* WP;      //TODO: don't actually build substrings of rhs; define key type that stores ref to rhs in new_nts by index (because it may grow), and also allows an int* [b,e) range to serve as key (i.e. don't insert the latter type of key).      int n=rhs.size();      if (n<=2) return 0; -    int longest1=0; +    int longest1=1;      int mid=n/2; -    int best_k=mid; -    bool haver,havel; +    int best_k; +    enum {HAVE_L=-1,HAVE_NONE=0,HAVE_R=1}; +    int have1=HAVE_NONE; // will mean we already have some >1 length prefix or suffix as a virt. (it's free).  if we have both we use it immediately and return.      NTHandle ntr,ntl;      NTHandle bestntr,bestntl; -    WordID *b=&rhs.front(),*e=b+n; +    WP b=&rhs.front(),e=b+n; +    WP wk=b; +    SHOWM3(DSP,"Split",Str(rhs),only_free,only_reusing_1); +    int rlen=n;      for (int k=1;k<n-1;++k) {        //TODO: handle length 1 l and r parts without explicitly building Rhs? -      WordID *wk=&rhs[k]; +      ++wk; assert(k==wk-b); +      --rlen; assert(rlen==n-k);        Rhs l(b,wk); -      int rlen=n-k;        if (have(l,ntl)) { +        if (k>1) { SHOWM3(DSP,"Have l",k,n,Str(l)) }          Rhs r(wk,e);          if (have(r,ntr)) { +          SHOWM3(DSP,"Have r too",k,n,Str(r))            rhs.resize(2);            rhs[0]=ntl;            rhs[1]=ntr;            return 2;          } else if (k>longest1) {            longest1=k; -          haver=false; -          havel=true; +          have1=HAVE_L;            bestntl=ntl;            best_k=k;          } -      } else if (rlen>=longest1) { +      } else if (rlen>longest1) { // > or >= favors l or r branching, maybe.  who cares.          Rhs r(wk,e);          if (have(r,ntr)) {            longest1=rlen; -          havel=false; -          haver=true; +          if (rlen>1) { SHOWM3(DSP,"Have r (only) ",k,n,Str(r)) } +          have1=HAVE_R;            bestntr=ntr;            best_k=k;          }        } +      //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint?  one will be useless to check for beating the longest single match so far.  check that second +      } -    bool no1=longest1<=1; // a single token sub-rhs part is not really interesting, but it would be reported as havel or haver. +    // now we know how we're going to split the rule; what follows is just doing the actual splitting: +      if (only_free) { -      if (no1) return 0; -      if (havel) { -        rhs.erase(rhs.begin()+1,rhs.begin()+best_k); -        rhs[0]=ntl; -      } else if (haver) { -        rhs.erase(rhs.begin()+best_k+1,rhs.end()); -        rhs[best_k]=ntr; +      if (have1==HAVE_NONE) +        return 0; +      if (have1==HAVE_L) { +        rhs.erase(rhs.begin()+1,rhs.begin()+best_k); //erase [1..best_k) +        rhs[0]=bestntl; +      } else { +        assert(have1==HAVE_R); +        rhs.erase(rhs.begin()+best_k+1,rhs.end()); // erase (best_k..) +        rhs[best_k]=bestntr;        }        return 1;      } -    if (only_reusing_1 && no1) return 0; -    //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint?  one will be useless to check for beating the longest single match so far.  check that second. -    WordID *best_wk=b+best_k; -    Rhs l(b,best_wk); -    Rhs r(best_wk,e); -    //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)! -    rhs.resize(2); -    int nnt=newnt; -    rhs[0]=havel?bestntl:nnt--; -    rhs[1]=haver?bestntr:nnt; -    // now that we've set rhs, we can actually safely add rules -    if (!havel) -      create(l); -    if (!haver) -      create(r); -    return 2; +    /* now we have to add some new virtual rules. +       some awkward constraints: + +       1. can't resize rhs until you save copy of l or r split portion + +       2. can't create new rule until you finished modifying rhs (this is why we store newnt then create).  due to vector push_back invalidation.  perhaps we could bypass this by reserving sufficient space first before a splitting pass (# rules and nts created is <= 2 * # of rules being passed over) + +    */ +    if (have1==HAVE_NONE) { // default: split down middle. +      DSP(assert(longest1==1)); +      WP m=b+mid; +      if (n%2==0) { +        WP i=b; +        WP j=m; +        for (;i!=m;++i,++j) +          if (*i!=*j) goto notleqr; +        // [...mid]==[mid...]! +        RHS l(b,m); // 1. // this is equal to RHS(m,e). +        rhs.resize(2); +        rhs[0]=rhs[1]=newnt; //2. +        create_adding(l); +        return 1; // only had to create 1 total when splitting down middle when l==r +      } +    notleqr: +      if (only_reusing_1) return 0; +      best_k=mid; // rounds down +      if (mid==1) { +        RHS r(m,e); //1. +        rhs.resize(2); +        rhs[1]=newnt; //2. +        create_adding(r); +        return 1; +      } else { +        Rhs l(b,m); +        Rhs r(m,e); // 1. +        rhs.resize(2); +        rhs[0]=newnt; +        rhs[1]=newnt-1; // 2. +        create_adding(l); +        create_adding(r); +        return 2; +      } +    } +    WP best_wk=b+best_k; +    //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)!. +    //wow, that decision (not to use index into new_nts instead of pointer to rhs), while adding new nts to it really added some pain. +    if (have1==HAVE_L) { +      Rhs r(best_wk,e); //1. +      rhs.resize(2); +      rhs[0]=bestntl; +      DSP(assert(best_wk<e-1)); // because we would have returned having both if rhs was singleton +      rhs[1]=newnt; //2. +      create_adding(r); +    } else { +      DSP(assert(have1==HAVE_R)); +      DSP(assert(best_wk>b+1)); // because we would have returned having both if lhs was singleton +      Rhs l(b,best_wk); //1. +      rhs.resize(2); +      rhs[0]=newnt; //2. +      rhs[1]=bestntr; +      create_adding(l); +    } +    return 1;    }  }; diff --git a/utils/show.h b/utils/show.h index e8adff99..a687868c 100755 --- a/utils/show.h +++ b/utils/show.h @@ -38,4 +38,11 @@ careful: none of this is wrapped in a block.  so you can't use one of these macr  #define SHOW5(IF,x,y0,y1,y2,y3) SHOW1(IF,x) SHOW4(IF,y0,y1,y2,y3)  #define SHOW6(IF,x,y0,y1,y2,y3,y4) SHOW1(IF,x) SHOW5(IF,y0,y1,y2,y3,y4) +#define SHOWM(IF,m,x) SHOWP(IF,m<<": ") SHOW(IF,x) +#define SHOWM2(IF,m,x0,x1) SHOWP(IF,m<<": ") SHOW2(IF,x0,x1) +#define SHOWM3(IF,m,x0,x1,x2) SHOWP(IF,m<<": ") SHOW3(IF,x0,x1,x2) +#define SHOWM4(IF,m,x0,x1,x2,x3) SHOWP(IF,m<<": ") SHOW4(IF,x0,x1,x2,x3) +#define SHOWM5(IF,m,x0,x1,x2,x3,x4) SHOWP(IF,m<<": ") SHOW5(IF,x,x0,x1,x2,x3,x4) +#define SHOWM6(IF,m,x0,x1,x2,x3,x4,x5) SHOWP(IF,m<<": ") SHOW6(IF,x0,x1,x2,x3,x4,x5) +  #endif | 
