diff options
| author | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-16 09:11:03 +0000 | 
|---|---|---|
| committer | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-16 09:11:03 +0000 | 
| commit | 708e59d0e908b24bac36ec36956ad013268253b0 (patch) | |
| tree | b568b7afc5b9012aed8595821c3ef4daa4dbffde /decoder/cfg_binarize.h | |
| parent | d523a48ff2a7097ec5c33054af82f9395774d2d2 (diff) | |
greedy binarization - needs testing, may have broken l2r
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@560 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/cfg_binarize.h')
| -rwxr-xr-x | decoder/cfg_binarize.h | 35 | 
1 files changed, 24 insertions, 11 deletions
| diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index 82c4dd1a..3aba5e9f 100755 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h @@ -14,39 +14,50 @@   */  struct CFGBinarize { -  int bin_at; +  int bin_thresh;    bool bin_l2r; -  bool bin_unary; +  int bin_unary;    bool bin_name_nts;    bool bin_topo; +  bool bin_split; +  int split_passes,split_share1_passes,split_free_passes;    template <class Opts> // template to support both printable_opts and boost nonprintable    void AddOptions(Opts *opts) {      opts->add_options() -      ("cfg_binarize_at", defaulted_value(&bin_at),"(if >0) binarize CFG rhs segments which appear at least this many times") -      ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.") +      ("cfg_binarize_threshold", defaulted_value(&bin_thresh),"(if >0) repeatedly binarize CFG rhs bigrams which appear at least this many times, most frequent first.  resulting rules may be 1,2, or >2-ary.  this happens before the other types of binarization.") +//      ("cfg_binarize_unary_threshold", defaulted_value(&bin_unary),"if >0, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least this many times.  
this happens last.") +      ("cfg_binarize_greedy_split", defaulted_value(&bin_split),"(DeNero et al) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created") +      ("cfg_split_full_passes", defaulted_value(&split_passes),"pass through the virtual rules only (up to) this many times (all real rules will have been split if not already binary)") +      ("cfg_split_share1_passes", defaulted_value(&split_share1_passes),"after the full passes, for up to this many times split when at least 1 of the items has been seen before") +      ("cfg_split_free_passes", defaulted_value(&split_free_passes),"only split off from virtual nts pre/post nts that already exist - could check for interior phrases but after a few splits everything should be tiny already.")        ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")        ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")        ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order).  otherwise the virtual NTs will all appear after the regular NTs")      ;    }    void Validate() { -    if (bin_l2r) -      bin_at=0; -    if (bin_at>0&&!bin_l2r) { +    if (bin_thresh>0&&!bin_l2r) {        std::cerr<<"\nWARNING: greedy binarization not yet supported; using l2r (right branching) instead.\n";        bin_l2r=true;      } +    if (false && bin_l2r && bin_split) { // actually, split may be slightly incomplete due to finite number of passes. +      std::cerr<<"\nWARNING: l2r and split are both complete binarization and redundant.  
Using split.\n"; +      bin_l2r=false; +    } +    }    bool Binarizing() const { -    return bin_l2r || bin_at>0; +    return bin_split || bin_l2r || bin_thresh>0;    }    void set_defaults() { +    bin_split=false;      bin_topo=false; -    bin_at=0; -    bin_unary=false; +    bin_thresh=0; +    bin_unary=0;      bin_name_nts=true;      bin_l2r=false; +    split_passes=10;split_share1_passes=0;split_free_passes=10;    }    CFGBinarize() { set_defaults(); }    void print(std::ostream &o) const { @@ -56,10 +67,12 @@ struct CFGBinarize {      else {        if (bin_unary)          o << "unary-sharing "; +      if (bin_thresh) +        o<<"greedy bigram count>="<<bin_thresh<<" ";        if (bin_l2r)          o << "left->right";        else -        o << "greedy count>="<<bin_at; +        o << "DeNero greedy split";        if (bin_name_nts)          o << " named-NTs";        if (bin_topo) | 
