summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-17 02:35:19 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-17 02:35:19 +0000
commited3b4f784cba2c4a77dcb46b7d588d0161263716 (patch)
tree1553540d7b8cf7bdba6830b0130230372fa1a9ee
parent35615b91d6e6bce7e3e70f33bba73bafaff5b884 (diff)
split bin fix - no length 1 rhs virtual NTs
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@565 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-xdecoder/cfg.cc64
-rwxr-xr-xdecoder/cfg_binarize.h2
2 files changed, 31 insertions, 35 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index 4d5bf801..a0c00e55 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -237,23 +237,17 @@ struct add_virtual_rules {
CFG::NTs &nts,new_nts;
CFG::Rules &rules, new_rules;
// above will be appended at the end, so we don't have to worry about iterator invalidation
- NTHandle newnt; //not negative. TODO: i think we use it most often as negative. either way, be careful.
+ WordID newnt; //negative of NTHandle, or positive => unary lexical item (not to binarize). fit for rhs of a rule
RuleHandle newruleid;
- HASH_MAP<Rhs,NTHandle,boost::hash<Rhs> > rhs2lhs;
+ typedef HASH_MAP<Rhs,WordID,boost::hash<Rhs> > R2L;
+ R2L rhs2lhs; // an rhs maps to this -virtntid, or original id if length 1
bool name_nts;
- add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(nts.size()),newruleid(rules.size()),name_nts(name_nts) {
+ add_virtual_rules(CFG &cfg,bool name_nts=false) : nts(cfg.nts),rules(cfg.rules),newnt(-nts.size()),newruleid(rules.size()),name_nts(name_nts) {
HASH_MAP_EMPTY(rhs2lhs,null_for<Rhs>::null);
}
NTHandle get_virt(Rhs const& r) {
NTHandle nt=get_default(rhs2lhs,r,newnt);
- if (newnt==nt) {
- create_nt(r);
- create_rule(r);
- }
- return nt;
- }
- NTHandle get_nt(Rhs const& r) {
- NTHandle nt=get_default(rhs2lhs,r,newnt);
+ SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(r,nts,new_nts)<<"=>") SHOW(DBIN,nt);
if (newnt==nt) {
create(r);
}
@@ -268,12 +262,12 @@ struct add_virtual_rules {
set_nt_name(rhs);
}
inline void create_rule(Rhs const& rhs) {
- new_rules.push_back(CFG::Rule(newnt++,rhs));
+ new_rules.push_back(CFG::Rule(newnt--,rhs));
}
inline void create(Rhs const& rhs) {
create_nt(rhs);
create_rule(rhs);
- assert(newruleid==rules.size()+new_rules.size());assert(newnt==nts.size()+new_nts.size());
+ assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size());
}
~add_virtual_rules() {
@@ -285,7 +279,15 @@ struct add_virtual_rules {
batched_append_swap(rules,new_rules);
}
inline bool have(Rhs const& rhs,NTHandle &h) const {
- return rhs2lhs.find(rhs)!=rhs2lhs.end();
+ if (rhs.size()==1) { // stop creating virtual unary rules.
+ h=rhs[0];
+ return true;
+ }
+ typename R2L::const_iterator i=rhs2lhs.find(rhs);
+ if (i==rhs2lhs.end())
+ return false;
+ h=i->second;
+ return true;
}
//HACK: prevent this for instantiating for BinRhs. we need to use rule index because we'll be adding rules before we can update.
// returns 1 per replaced NT (0,1, or 2)
@@ -302,6 +304,7 @@ struct add_virtual_rules {
NTHandle bestntr,bestntl;
WordID *b=&rhs.front(),*e=b+n;
for (int k=1;k<n-1;++k) {
+ //TODO: handle length 1 l and r parts without explicitly building Rhs?
WordID *wk=&rhs[k];
Rhs l(b,wk);
int rlen=n-k;
@@ -330,18 +333,19 @@ struct add_virtual_rules {
}
}
}
+ bool no1=longest1<=1; // a single token sub-rhs part is not really interesting, but it would be reported as havel or haver.
if (only_free) {
+ if (no1) return 0;
if (havel) {
rhs.erase(rhs.begin()+1,rhs.begin()+best_k);
- rhs[0]=-ntl;
+ rhs[0]=ntl;
} else if (haver) {
rhs.erase(rhs.begin()+best_k+1,rhs.end());
- rhs[best_k]=-ntr;
- } else
- return 0;
+ rhs[best_k]=ntr;
+ }
return 1;
}
- if (only_reusing_1 && longest1==0) return 0;
+ if (only_reusing_1 && no1) return 0;
//TODO: swap order of consideration (l first or r?) depending on pre/post midpoint? one will be useless to check for beating the longest single match so far. check that second.
WordID *best_wk=b+best_k;
Rhs l(b,best_wk);
@@ -349,8 +353,8 @@ struct add_virtual_rules {
//we build these first because adding rules may invalidate the underlying pointers (we end up binarizing already split virt rules)!
rhs.resize(2);
int nnt=newnt;
- rhs[0]=-(havel?bestntl:nnt++);
- rhs[1]=-(haver?bestntr:nnt);
+ rhs[0]=havel?bestntl:nnt--;
+ rhs[1]=haver?bestntr:nnt;
// now that we've set rhs, we can actually safely add rules
if (!havel)
create(l);
@@ -372,28 +376,20 @@ void CFG::BinarizeSplit(CFGBinarize const& b) {
Rule &r=newr[i];expr; } // NOTE: must use indices since we'll be adding rules as we iterate.
int n_changed_total=0;
-
+ int n_changed;
#define CFG_SPLIT_PASS(N,free,just1) \
for (int pass=0;pass<b.N;++pass) { \
- int n_changed=0; \
+ n_changed=0; \
CFG_FOR_VIRT(r,n_changed+=v.split_rhs(r.rhs,free,just1)); \
if (!n_changed) { \
break; \
} n_changed_total+=n_changed; }
CFG_SPLIT_PASS(split_passes,false,false)
+ if (n_changed==0) return;
CFG_SPLIT_PASS(split_share1_passes,false,true)
CFG_SPLIT_PASS(split_free_passes,true,false)
-
- /*
- for (int i=0;i<b.split_passes;++i)
- CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,false));
- for (int i=0;i<b.split_share1_passes;++i)
- CFG_FOR_VIRT(r,v.split_rhs(r.rhs,false,true));
- for (int i=0;i<b.split_free_passes;++i)
- CFG_FOR_VIRT(r,v.split_rhs(r.rhs,true));
- */
}
void CFG::Binarize(CFGBinarize const& b) {
@@ -449,10 +445,10 @@ cerr << "Binarizing left->right " << (bin_unary?"real to unary":"stop at binary"
if (name) new_nts.back().from.nt=BinName(bin,nts,new_nts);
}
*/
- bin.second=-bin_to;
+ bin.second=bin_to;
--r;
if (r<rhsmin) {
- rhs[rhsmin]=-bin_to;
+ rhs[rhsmin]=bin_to;
rhs.resize(rhsmin+1);
break;
}
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index 3aba5e9f..41eba11b 100755
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
@@ -26,7 +26,7 @@ struct CFGBinarize {
opts->add_options()
("cfg_binarize_threshold", defaulted_value(&bin_thresh),"(if >0) repeatedly binarize CFG rhs bigrams which appear at least this many times, most frequent first. resulting rules may be 1,2, or >2-ary. this happens before the other types of binarization.")
// ("cfg_binarize_unary_threshold", defaulted_value(&bin_unary),"if >0, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least this many times. this happens last.")
- ("cfg_binarize_greedy_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created")
+ ("cfg_binarize_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created")
("cfg_split_full_passes", defaulted_value(&split_passes),"pass through the virtual rules only (up to) this many times (all real rules will have been split if not already binary)")
("cfg_split_share1_passes", defaulted_value(&split_share1_passes),"after the full passes, for up to this many times split when at least 1 of the items has been seen before")
("cfg_split_free_passes", defaulted_value(&split_free_passes),"only split off from virtual nts pre/post nts that already exist - could check for interior phrases but after a few splits everything should be tiny already.")