summary | refs | log | tree | commit | diff
path: root/decoder/cfg_binarize.h
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/cfg_binarize.h')
-rwxr-xr-x decoder/cfg_binarize.h 35
1 files changed, 24 insertions, 11 deletions
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index 82c4dd1a..3aba5e9f 100755
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
@@ -14,39 +14,50 @@
*/
struct CFGBinarize {
- int bin_at;
+ int bin_thresh;
bool bin_l2r;
- bool bin_unary;
+ int bin_unary;
bool bin_name_nts;
bool bin_topo;
+ bool bin_split;
+ int split_passes,split_share1_passes,split_free_passes;
template <class Opts> // template to support both printable_opts and boost nonprintable
void AddOptions(Opts *opts) {
opts->add_options()
- ("cfg_binarize_at", defaulted_value(&bin_at),"(if >0) binarize CFG rhs segments which appear at least this many times")
- ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")
+ ("cfg_binarize_threshold", defaulted_value(&bin_thresh),"(if >0) repeatedly binarize CFG rhs bigrams which appear at least this many times, most frequent first. resulting rules may be 1,2, or >2-ary. this happens before the other types of binarization.")
+// ("cfg_binarize_unary_threshold", defaulted_value(&bin_unary),"if >0, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least this many times. this happens last.")
+ ("cfg_binarize_greedy_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created")
+ ("cfg_split_full_passes", defaulted_value(&split_passes),"pass through the virtual rules only (up to) this many times (all real rules will have been split if not already binary)")
+ ("cfg_split_share1_passes", defaulted_value(&split_share1_passes),"after the full passes, for up to this many times split when at least 1 of the items has been seen before")
+ ("cfg_split_free_passes", defaulted_value(&split_free_passes),"only split off from virtual nts pre/post nts that already exist - could check for interior phrases but after a few splits everything should be tiny already.")
("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")
("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")
("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order). otherwise the virtual NTs will all appear after the regular NTs")
;
}
void Validate() {
- if (bin_l2r)
- bin_at=0;
- if (bin_at>0&&!bin_l2r) {
+ if (bin_thresh>0&&!bin_l2r) {
std::cerr<<"\nWARNING: greedy binarization not yet supported; using l2r (right branching) instead.\n";
bin_l2r=true;
}
+ if (false && bin_l2r && bin_split) { // actually, split may be slightly incomplete due to finite number of passes.
+ std::cerr<<"\nWARNING: l2r and split are both complete binarization and redundant. Using split.\n";
+ bin_l2r=false;
+ }
+
}
bool Binarizing() const {
- return bin_l2r || bin_at>0;
+ return bin_split || bin_l2r || bin_thresh>0;
}
void set_defaults() {
+ bin_split=false;
bin_topo=false;
- bin_at=0;
- bin_unary=false;
+ bin_thresh=0;
+ bin_unary=0;
bin_name_nts=true;
bin_l2r=false;
+ split_passes=10;split_share1_passes=0;split_free_passes=10;
}
CFGBinarize() { set_defaults(); }
void print(std::ostream &o) const {
@@ -56,10 +67,12 @@ struct CFGBinarize {
else {
if (bin_unary)
o << "unary-sharing ";
+ if (bin_thresh)
+ o<<"greedy bigram count>="<<bin_thresh<<" ";
if (bin_l2r)
o << "left->right";
else
- o << "greedy count>="<<bin_at;
+ o << "DeNero greedy split";
if (bin_name_nts)
o << " named-NTs";
if (bin_topo)