diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-17 02:35:19 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-17 02:35:19 +0000 |
commit | ed3b4f784cba2c4a77dcb46b7d588d0161263716 (patch) | |
tree | 1553540d7b8cf7bdba6830b0130230372fa1a9ee | |
parent | 35615b91d6e6bce7e3e70f33bba73bafaff5b884 (diff) |
split bin fix - no length 1 rhs virtual NTs
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@565 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | decoder/cfg.cc | 64 | ||||
-rwxr-xr-x | decoder/cfg_binarize.h | 2 |
2 files changed, 31 insertions, 35 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 4d5bf801..a0c00e55 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -237,23 +237,17 @@ struct add_virtual_rules { CFG::NTs &nts,new_nts; CFG::Rules &rules, new_rules; // above will be appended at the end, so we don't have to worry about iterator invalidation - NTHandle newnt; //not negative. TODO: i think we use it most often as negative. either way, be careful. + WordID newnt; //negative of NTHandle, or positive => unary lexical item (not to binarize). fit for rhs of a rule RuleHandle newruleid; - HASH_MAP<Rhs,NTHandle,boost::hash<Rhs> > rhs2lhs; + typedef HASH_MAP<Rhs,WordID,boost::hash<Rhs> > R2L; + R2L rhs2lhs; // an rhs maps to this -virtntid, or original id if length 1 bool name_nts; - add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) { + add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(-nts.size()),newruleid(rules.size()),name_nts(name_nts) { HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null); } NTHandle get_virt(Rhs const& r) { NTHandle nt=get_default(rhs2lhs,r,newnt); - if (newnt==nt) { - create_nt(r); - create_rule(r); - } - return nt; - } - NTHandle get_nt(Rhs const& r) { - NTHandle nt=get_default(rhs2lhs,r,newnt); + SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(r,nts,new_nts)<<"=>") SHOW(DBIN,nt); if (newnt==nt) { create(r); } @@ -268,12 +262,12 @@ struct add_virtual_rules { set_nt_name(rhs); } inline void create_rule(Rhs const& rhs) { - new_rules.push_back(CFG::Rule(newnt++,rhs)); + new_rules.push_back(CFG::Rule(newnt--,rhs)); } inline void create(Rhs const& rhs) { create_nt(rhs); create_rule(rhs); - assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size()); + assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); } ~add_virtual_rules() { @@ -285,7 +279,15 @@ struct add_virtual_rules { batched_append_swap(rules,new_rules); } inline bool have(Rhs const& rhs,NTHandle &h) const { - return rhs2lhs.find(rhs)!=rhs2lhs.end(); + if (rhs.size()==1) { // stop creating virtual unary rules. + h=rhs[0]; + return true; + } + typename R2L::const_iterator i=rhs2lhs.find(rhs); + if (i==rhs2lhs.end()) + return false; + h=i->second; + return true; } //HACK: prevent this for instantiating for BinRhs. we need to use rule index because we'll be adding rules before we can update. // returns 1 per replaced NT (0,1, or 2) @@ -302,6 +304,7 @@ struct add_virtual_rules { NTHandle bestntr,bestntl; WordID *b=&rhs.front(),*e=b+n; for (int k=1;k<n-1;++k) { + //TODO: handle length 1 l and r parts without explicitly building Rhs? WordID *wk=&rhs[k]; Rhs l(b,wk); int rlen=n-k; @@ -330,18 +333,19 @@ struct add_virtual_rules { } } } + bool no1=longest1<=1; // a single token sub-rhs part is not really interesting, but it would be reported as havel or haver. if (only_free) { + if (no1) return 0; if (havel) { rhs.erase(rhs.begin()+1,rhs.begin()+best_k); - rhs[0]=-ntl; + rhs[0]=ntl; } else if (haver) { rhs.erase(rhs.begin()+best_k+1,rhs.end()); - rhs[best_k]=-ntr; - } else - return 0; + rhs[best_k]=ntr; + } return 1; } - if (only_reusing_1 && longest1==0) return 0; + if (only_reusing_1 && no1) return 0; //TODO: swap order of consideration (l first or r?) depending on pre/post midpoint? one will be useless to check for beating the longest single match so far. check that second. WordID *best_wk=b+best_k; Rhs l(b,best_wk); @@ -349,8 +353,8 @@ struct add_virtual_rules { //we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)! rhs.resize(2); int nnt=newnt; - rhs[0]=-(havel?bestntl:nnt++); - rhs[1]=-(haver?bestntr:nnt); + rhs[0]=havel?bestntl:nnt--; + rhs[1]=haver?bestntr:nnt; // now that we've set rhs, we can actually safely add rules if (!havel) create(l); @@ -372,28 +376,20 @@ void CFG::BinarizeSplit(CFGBinarize const& b) { Rule &r=newr[i];expr; } // NOTE: must use indices since we'll be adding rules as we iterate. int n_changed_total=0; - + int n_changed; #define CFG_SPLIT_PASS(N,free,just1) \ for (int pass=0;pass<b.N;++pass) { \ - int n_changed=0; \ + n_changed=0; \ CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \ if (!n_changed) { \ break; \ } n_changed_total+=n_changed; } CFG_SPLIT_PASS(split_passes,false,false) + if (n_changed==0) return; CFG_SPLIT_PASS(split_share1_passes,false,true) CFG_SPLIT_PASS(split_free_passes,true,false) - - /* - for (int i=0;i<b.split_passes;++i) - CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,false)); - for (int i=0;i<b.split_share1_passes;++i) - CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,true)); - for (int i=0;i<b.split_free_passes;++i) - CFG_FOR_VIRT(r,v.split_rhs(r.rhs,true)); - */ } void CFG::Binarize(CFGBinarize const& b) { @@ -449,10 +445,10 @@ cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary" if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts); } */ - bin.second=-bin_to; + bin.second=bin_to; --r; if (r<rhsmin) { - rhs[rhsmin]=-bin_to; + rhs[rhsmin]=bin_to; rhs.resize(rhsmin+1); break; } diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index 3aba5e9f..41eba11b 100755 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h @@ -26,7 +26,7 @@ struct CFGBinarize { opts->add_options() ("cfg_binarize_threshold", defaulted_value(&bin_thresh),"(if >0) repeatedly binarize CFG rhs bigrams which appear at least this many times, most frequent first. resulting rules may be 1,2, or >2-ary. this happens before the other types of binarization.") // ("cfg_binarize_unary_threshold", defaulted_value(&bin_unary),"if >0, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least this many times. this happens last.") - ("cfg_binarize_greedy_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created") + ("cfg_binarize_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created") ("cfg_split_full_passes", defaulted_value(&split_passes),"pass through the virtual rules only (up to) this many times (all real rules will have been split if not already binary)") ("cfg_split_share1_passes", defaulted_value(&split_share1_passes),"after the full passes, for up to this many times split when at least 1 of the items has been seen before") ("cfg_split_free_passes", defaulted_value(&split_free_passes),"only split off from virtual nts pre/post nts that already exist - could check for interior phrases but after a few splits everything should be tiny already.") |