summaryrefslogtreecommitdiff
path: root/decoder/cfg_binarize.h
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-10 23:49:33 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-10 23:49:33 +0000
commit667675465486e1f9729931c0b38b5a1124a1c000 (patch)
tree731cbe983050220b995c643421beb04b4e271560 /decoder/cfg_binarize.h
parent131c2280809e890a817688b708f03a231025fd77 (diff)
CFG binarize opts
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@503 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/cfg_binarize.h')
-rwxr-xr-xdecoder/cfg_binarize.h72
1 files changed, 72 insertions, 0 deletions
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
new file mode 100755
index 00000000..f76619d2
--- /dev/null
+++ b/decoder/cfg_binarize.h
@@ -0,0 +1,72 @@
+#ifndef CFG_BINARIZE_H
+#define CFG_BINARIZE_H
+
+#include <iostream>
+
+/*
+ binarization: decimate rhs of original rules until their rhs have been reduced to length 2 (or 1 if bin_unary). also decimate rhs of newly binarized rules until length 2. newly created rules are all binary (never unary/nullary).
+
+ bin_name_nts: nts[i].from will be initialized, including adding new names to TD
+
+ bin_l2r: right-branching (a (b c)) means suffixes are shared. if requested, the only other option that matters is bin_unary
+
+ otherwise, greedy binarization: the pairs that are most frequent in the rules are binarized, one at a time. this should be done efficiently: each pair has a count of and list of its left and right adjacent pair+count (or maybe a non-count collapsed list of adjacent instances). this can be efficiently updated when a pair is chosen for replacement by a new virtual NT.
+ */
+
+struct CFGBinarize {
+ int bin_at;
+ bool bin_l2r;
+ bool bin_unary;
+ bool bin_name_nts;
+ template <class Opts> // template to support both printable_opts and boost nonprintable
+ void AddOptions(Opts *opts) {
+ opts->add_options()
+ ("cfg_binarize_at", defaulted_value(&bin_at),"(if >0) binarize CFG rhs segments which appear at least this many times")
+ ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")
+ ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")
+ ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")
+ ;
+ }
+ void Validate() {
+ if (bin_l2r)
+ bin_at=0;
+ if (bin_at>0&&!bin_l2r) {
+ std::cerr<<"\nWARNING: greedy binarization not yet supported; using l2r (right branching) instead.\n";
+ bin_l2r=true;
+ }
+ }
+
+ bool Binarizing() const {
+ return bin_l2r || bin_at>0;
+ }
+ void set_defaults() {
+ bin_at=0;
+ bin_unary=false;
+ bin_name_nts=true;
+ bin_l2r=false;
+ }
+ CFGBinarize() { set_defaults(); }
+ void print(std::ostream &o) const {
+ o<<'(';
+ if (!Binarizing())
+ o << "Unbinarized";
+ else {
+ if (bin_unary)
+ o << "unary-sharing ";
+ if (bin_l2r)
+ o << "left->right";
+ else
+ o << "greedy count>="<<bin_at;
+ if (bin_name_nts)
+ o << " named-NTs";
+ }
+ o<<')';
+ }
+ friend inline std::ostream &operator<<(std::ostream &o,CFGBinarize const& me) {
+ me.print(o); return o;
+ }
+
+};
+
+
+#endif