From ed3b4f784cba2c4a77dcb46b7d588d0161263716 Mon Sep 17 00:00:00 2001 From: graehl Date: Tue, 17 Aug 2010 02:35:19 +0000 Subject: split bin fix - no length 1 rhs virtual NTs git-svn-id: https://ws10smt.googlecode.com/svn/trunk@565 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cfg.cc | 64 +++++++++++++++++++++++--------------------------- decoder/cfg_binarize.h | 2 +- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/decoder/cfg.cc b/decoder/cfg.cc index 4d5bf801..a0c00e55 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -237,23 +237,17 @@ struct add_virtual_rules { CFG::NTs &nts,new_nts; CFG::Rules &rules, new_rules; // above will be appended at the end, so we don't have to worry about iterator invalidation - NTHandle newnt; //not negative. TODO: i think we use it most often as negative. either way, be careful. + WordID newnt; //negative of NTHandle, or positive => unary lexical item (not to binarize). fit for rhs of a rule RuleHandle newruleid; - HASH_MAP > rhs2lhs; + typedef HASH_MAP > R2L; + R2L rhs2lhs; // an rhs maps to this -virtntid, or original id if length 1 bool name_nts; - add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) { + add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(-nts.size()),newruleid(rules.size()),name_nts(name_nts) { HASH_MAP_EMPTY(rhs2lhs,null_for::null); } NTHandle get_virt(Rhs const& r) { NTHandle nt=get_default(rhs2lhs,r,newnt); - if (newnt==nt) { - create_nt(r); - create_rule(r); - } - return nt; - } - NTHandle get_nt(Rhs const& r) { - NTHandle nt=get_default(rhs2lhs,r,newnt); + SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<") SHOW(DBIN,nt); if (newnt==nt) { create(r); } @@ -268,12 +262,12 @@ struct add_virtual_rules { set_nt_name(rhs); } inline void create_rule(Rhs const& rhs) { - new_rules.push_back(CFG::Rule(newnt++,rhs)); + new_rules.push_back(CFG::Rule(newnt--,rhs)); } inline void create(Rhs const& rhs) { create_nt(rhs); create_rule(rhs); - assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size()); + assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); } ~add_virtual_rules() { @@ -285,7 +279,15 @@ struct add_virtual_rules { batched_append_swap(rules,new_rules); } inline bool have(Rhs const& rhs,NTHandle &h) const { - return rhs2lhs.find(rhs)!=rhs2lhs.end(); + if (rhs.size()==1) { // stop creating virtual unary rules. + h=rhs[0]; + return true; + } + typename R2L::const_iterator i=rhs2lhs.find(rhs); + if (i==rhs2lhs.end()) + return false; + h=i->second; + return true; } //HACK: prevent this for instantiating for BinRhs. we need to use rule index because we'll be adding rules before we can update. // returns 1 per replaced NT (0,1, or 2) @@ -302,6 +304,7 @@ struct add_virtual_rules { NTHandle bestntr,bestntl; WordID *b=&rhs.front(),*e=b+n; for (int k=1;kright " << (bin_unary?"real to unary":"stop at binary" if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts); } */ - bin.second=-bin_to; + bin.second=bin_to; --r; if (radd_options() ("cfg_binarize_threshold", defaulted_value(&bin_thresh),"(if >0) repeatedly binarize CFG rhs bigrams which appear at least this many times, most frequent first. resulting rules may be 1,2, or >2-ary. this happens before the other types of binarization.") // ("cfg_binarize_unary_threshold", defaulted_value(&bin_unary),"if >0, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least this many times. this happens last.") - ("cfg_binarize_greedy_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created") + ("cfg_binarize_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created") ("cfg_split_full_passes", defaulted_value(&split_passes),"pass through the virtual rules only (up to) this many times (all real rules will have been split if not already binary)") ("cfg_split_share1_passes", defaulted_value(&split_share1_passes),"after the full passes, for up to this many times split when at least 1 of the items has been seen before") ("cfg_split_free_passes", defaulted_value(&split_free_passes),"only split off from virtual nts pre/post nts that already exist - could check for interior phrases but after a few splits everything should be tiny already.") -- cgit v1.2.3