From e32e9fdd48ef6466fbb257d92e250816f5b69114 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 7 Apr 2014 00:54:52 -0400 Subject: clean up dead TRule code --- decoder/trule.cc | 202 +++++-------------------------------------------------- 1 file changed, 17 insertions(+), 185 deletions(-) (limited to 'decoder/trule.cc') diff --git a/decoder/trule.cc b/decoder/trule.cc index c22baae3..1bd5425f 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -17,73 +17,16 @@ bool TRule::IsGoal() const { return GetLHS() == kGOAL; } -static WordID ConvertTrgString(const string& w) { - const unsigned len = w.size(); - WordID id = 0; - // [X,0] or [0] - // for target rules, we ignore the category, just keep the index - if (len > 2 && w[0]=='[' && w[len-1]==']' && w[len-2] > '0' && w[len-2] <= '9' && - (len == 3 || (len > 4 && w[len-3] == ','))) { - id = w[len-2] - '0'; - id = 1 - id; - } else { - id = TD::Convert(w); - } - return id; -} - -static WordID ConvertSrcString(const string& w, bool mono = false) { - const unsigned len = w.size(); - // [X,0] - // for source rules, we keep the category and ignore the index (source rules are - // always numbered 1, 2, 3... - if (mono) { - if (len > 2 && w[0]=='[' && w[len-1]==']') { - if (len > 4 && w[len-3] == ',') { - cerr << "[ERROR] Monolingual rules mut not have non-terminal indices:\n " - << w << endl; - exit(1); - } - // TODO check that source indices go 1,2,3,etc. - return TD::Convert(w.substr(1, len-2)) * -1; - } else { - return TD::Convert(w); - } - } else { - if (len > 4 && w[0]=='[' && w[len-1]==']' && w[len-3] == ',' && w[len-2] > '0' && w[len-2] <= '9') { - return TD::Convert(w.substr(1, len-4)) * -1; - } else { - return TD::Convert(w); - } - } -} - -static WordID ConvertLHS(const string& w) { - if (w[0] == '[') { - const unsigned len = w.size(); - if (len < 3) { cerr << "Format error: " << w << endl; exit(1); } - return TD::Convert(w.substr(1, len-2)) * -1; - } else { - return TD::Convert(w) * -1; - } -} - TRule* TRule::CreateRuleSynchronous(const string& rule) { TRule* res = new TRule; - if (res->ReadFromString(rule, true, false)) return res; + if (res->ReadFromString(rule)) return res; cerr << "[ERROR] Failed to creating rule from: " << rule << endl; delete res; return NULL; } TRule* TRule::CreateRulePhrasetable(const string& rule) { - // TODO make this faster - // TODO add configuration for default NT type - if (rule[0] == '[') { - cerr << "Phrasetable rules shouldn't have a LHS / non-terminals:\n " << rule << endl; - return NULL; - } - TRule* res = new TRule("[X] ||| " + rule, true, false); + TRule* res = new TRule("[X] ||| " + rule); if (res->Arity() != 0) { cerr << "Phrasetable rules should have arity 0:\n " << rule << endl; delete res; @@ -93,138 +36,27 @@ TRule* TRule::CreateRulePhrasetable(const string& rule) { } TRule* TRule::CreateRuleMonolingual(const string& rule) { - return new TRule(rule, false, true); + return new TRule(rule, true); } namespace { -// callback for lexer +// callback for single rule lexer int n_assigned=0; -void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) { - (void) ctf_level; - (void) coarse_rule; - TRule *assignto=(TRule *)extra; - *assignto=*new_rule; - ++n_assigned; -} - -} - -bool TRule::ReadFromString(const string& line, bool strict, bool mono) { - if (!is_single_line_stripped(line)) - cerr<<"\nWARNING: building rule from multi-line string "<1) - cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<(extra) = *new_rule; + ++n_assigned; } - if (format >= 2 || (mono && format == 1)) { - while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } - while(is>>w && w!="|||") { f_.push_back(ConvertSrcString(w, mono)); } - if (!mono) { - while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } - } - int fv = 0; - if (is) { - string ss; - getline(is, ss); - //cerr << "L: " << ss << endl; - unsigned start = 0; - unsigned len = ss.size(); - const size_t ppos = ss.find(" |||"); - if (ppos != string::npos) { len = ppos; } - while (start < len) { - while(start < len && (ss[start] == ' ' || ss[start] == ';')) - ++start; - if (start == len) break; - unsigned end = start + 1; - while(end < len && (ss[end] != '=' && ss[end] != ' ' && ss[end] != ';')) - ++end; - if (end == len || ss[end] == ' ' || ss[end] == ';') { - //cerr << "PROC: '" << ss.substr(start, end - start) << "'\n"; - // non-named features - if (end != len) { ss[end] = 0; } - string fname = "PhraseModel_X"; - if (fv > 9) { cerr << "Too many phrasetable scores - used named format\n"; abort(); } - fname[12]='0' + fv; - ++fv; - // if the feature set is frozen, this may return zero, indicating an - // undefined feature - const int fid = FD::Convert(fname); - if (fid) - scores_.set_value(fid, atof(&ss[start])); - //cerr << "F: " << fname << " VAL=" << scores_.value(FD::Convert(fname)) << endl; - } else { - const int fid = FD::Convert(ss.substr(start, end - start)); - start = end + 1; - end = start + 1; - while(end < len && (ss[end] != ' ' && ss[end] != ';')) - ++end; - if (end < len) { ss[end] = 0; } - assert(start < len); - if (fid) - scores_.set_value(fid, atof(&ss[start])); - //cerr << "F: " << FD::Convert(fid) << " VAL=" << scores_.value(fid) << endl; - } - start = end + 1; - } - } - } else if (format == 1) { - while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); } - while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); } - f_ = e_; - int x = ConvertLHS("[X]"); - for (unsigned i = 0; i < f_.size(); ++i) - if (f_[i] <= 0) { f_[i] = x; } - } else { - cerr << "F: " << format << endl; - cerr << "[ERROR] Don't know how to read:\n" << line << endl; - } - if (mono) { - e_ = f_; - int ci = 0; - for (unsigned i = 0; i < e_.size(); ++i) - if (e_[i] < 0) - e_[i] = ci--; - } - ComputeArity(); - return SanityCheck(); } -bool TRule::SanityCheck() const { - vector used(f_.size(), 0); - int ac = 0; - for (unsigned i = 0; i < e_.size(); ++i) { - int ind = e_[i]; - if (ind > 0) continue; - ind = -ind; - if ((++used[ind]) != 1) { - cerr << "[ERROR] e-side variable index " << (ind+1) << " used more than once!\n"; - return false; - } - ac++; - } - if (ac != Arity()) { - cerr << "[ERROR] e-side arity mismatches f-side\n"; - return false; - } - return true; +bool TRule::ReadFromString(const string& line, bool mono) { + n_assigned = 0; + //cerr << "LINE: " << line << " -- mono=" << mono << endl; + RuleLexer::ReadRule(line + '\n', assign_trule, mono, this); + if (n_assigned > 1) + cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "< Date: Thu, 17 Apr 2014 20:55:34 -0400 Subject: fix rescoring --- decoder/trule.cc | 6 ++++++ tests/system_tests/cfg_rescore/README | 4 ++++ tests/system_tests/cfg_rescore/cdec.ini | 2 ++ tests/system_tests/cfg_rescore/gold.statistics | 3 +++ tests/system_tests/cfg_rescore/gold.stdout | 4 ++++ tests/system_tests/cfg_rescore/input.cfg | 9 +++++++++ tests/system_tests/cfg_rescore/input.txt | 1 + tests/system_tests/cfg_rescore/weights | 3 +++ 8 files changed, 32 insertions(+) create mode 100644 tests/system_tests/cfg_rescore/README create mode 100644 tests/system_tests/cfg_rescore/cdec.ini create mode 100644 tests/system_tests/cfg_rescore/gold.statistics create mode 100644 tests/system_tests/cfg_rescore/gold.stdout create mode 100644 tests/system_tests/cfg_rescore/input.cfg create mode 100644 tests/system_tests/cfg_rescore/input.txt create mode 100644 tests/system_tests/cfg_rescore/weights (limited to 'decoder/trule.cc') diff --git a/decoder/trule.cc b/decoder/trule.cc index 1bd5425f..bee211d5 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -56,6 +56,12 @@ bool TRule::ReadFromString(const string& line, bool mono) { RuleLexer::ReadRule(line + '\n', assign_trule, mono, this); if (n_assigned > 1) cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "< input.txt diff --git a/tests/system_tests/cfg_rescore/cdec.ini b/tests/system_tests/cfg_rescore/cdec.ini new file mode 100644 index 00000000..1a913f2d --- /dev/null +++ b/tests/system_tests/cfg_rescore/cdec.ini @@ -0,0 +1,2 @@ +formalism=rescore +k_best=100 diff --git a/tests/system_tests/cfg_rescore/gold.statistics b/tests/system_tests/cfg_rescore/gold.statistics new file mode 100644 index 00000000..7b05e2d8 --- /dev/null +++ b/tests/system_tests/cfg_rescore/gold.statistics @@ -0,0 +1,3 @@ +-lm_nodes 8 +-lm_edges 10 +-lm_paths 4 diff --git a/tests/system_tests/cfg_rescore/gold.stdout b/tests/system_tests/cfg_rescore/gold.stdout new file mode 100644 index 00000000..ccf99263 --- /dev/null +++ b/tests/system_tests/cfg_rescore/gold.stdout @@ -0,0 +1,4 @@ +0 ||| the broccoli was eaten by John ||| Passive=1 Definite=1 ||| 2 +0 ||| John ate the broccoli ||| Active=1 Definite=1 ||| 1.1 +0 ||| broccoli was eaten by John ||| Passive=1 ||| 1 +0 ||| John ate broccoli ||| Active=1 ||| 0.1 diff --git a/tests/system_tests/cfg_rescore/input.cfg b/tests/system_tests/cfg_rescore/input.cfg new file mode 100644 index 00000000..0073cb7b --- /dev/null +++ b/tests/system_tests/cfg_rescore/input.cfg @@ -0,0 +1,9 @@ +[S] ||| [S1] +[S1] ||| [NP1] [VP] ||| Active=1 +[VP] ||| [V] [NP2] +[V] ||| ate +[VPSV] ||| was eaten +[S1] ||| [NP2] [VPSV] by [NP1] ||| Passive=1 +[NP1] ||| John +[NP2] ||| broccoli +[NP2] ||| the broccoli ||| Definite=1 diff --git a/tests/system_tests/cfg_rescore/input.txt b/tests/system_tests/cfg_rescore/input.txt new file mode 100644 index 00000000..71fc26bc --- /dev/null +++ b/tests/system_tests/cfg_rescore/input.txt @@ -0,0 +1 @@ +{"rules":[1,"[S] ||| [S1] ||| [1]",2,"[S1] ||| [NP1] [VP] ||| [1] [2] ||| Active=1",3,"[VP] ||| [V] [NP2] ||| [1] [2]",4,"[V] ||| ate ||| ate",5,"[VPSV] ||| was eaten ||| was eaten",6,"[S1] ||| [NP2] [VPSV] by [NP1] ||| [1] [2] by [3] ||| Passive=1",7,"[NP1] ||| John ||| John",8,"[NP2] ||| broccoli ||| broccoli",9,"[NP2] ||| the broccoli ||| the broccoli ||| Definite=1"],"features":["PhraseModel_0","PhraseModel_1","PhraseModel_2","PhraseModel_3","PhraseModel_4","PhraseModel_5","PhraseModel_6","PhraseModel_7","PhraseModel_8","PhraseModel_9","PhraseModel_10","PhraseModel_11","PhraseModel_12","PhraseModel_13","PhraseModel_14","PhraseModel_15","PhraseModel_16","PhraseModel_17","PhraseModel_18","PhraseModel_19","PhraseModel_20","PhraseModel_21","PhraseModel_22","PhraseModel_23","PhraseModel_24","PhraseModel_25","PhraseModel_26","PhraseModel_27","PhraseModel_28","PhraseModel_29","PhraseModel_30","PhraseModel_31","PhraseModel_32","PhraseModel_33","PhraseModel_34","PhraseModel_35","PhraseModel_36","PhraseModel_37","PhraseModel_38","PhraseModel_39","PhraseModel_40","PhraseModel_41","PhraseModel_42","PhraseModel_43","PhraseModel_44","PhraseModel_45","PhraseModel_46","PhraseModel_47","PhraseModel_48","PhraseModel_49","PhraseModel_50","PhraseModel_51","PhraseModel_52","PhraseModel_53","PhraseModel_54","PhraseModel_55","PhraseModel_56","PhraseModel_57","PhraseModel_58","PhraseModel_59","PhraseModel_60","PhraseModel_61","PhraseModel_62","PhraseModel_63","PhraseModel_64","PhraseModel_65","PhraseModel_66","PhraseModel_67","PhraseModel_68","PhraseModel_69","PhraseModel_70","PhraseModel_71","PhraseModel_72","PhraseModel_73","PhraseModel_74","PhraseModel_75","PhraseModel_76","PhraseModel_77","PhraseModel_78","PhraseModel_79","PhraseModel_80","PhraseModel_81","PhraseModel_82","PhraseModel_83","PhraseModel_84","PhraseModel_85","PhraseModel_86","PhraseModel_87","PhraseModel_88","PhraseModel_89","PhraseModel_90","PhraseModel_91","PhraseModel_92","PhraseModel_93","PhraseModel_94","PhraseModel_95","PhraseModel_96","PhraseModel_97","PhraseModel_98","PhraseModel_99","Active","Passive","Definite"],"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":7}],"node":{"in_edges":[0],"cat":"NP1","node_hash":"0000000000000007"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":4}],"node":{"in_edges":[1],"cat":"V","node_hash":"0000000000000004"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":8},{"tail":[],"spans":[-1,-1,-1,-1],"feats":[102,1],"rule":9}],"node":{"in_edges":[2,3],"cat":"NP2","node_hash":"0000000000000009"},"edges":[{"tail":[1,2],"spans":[-1,-1,-1,-1],"feats":[],"rule":3}],"node":{"in_edges":[4],"cat":"VP","node_hash":"0000000000000003"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":5}],"node":{"in_edges":[5],"cat":"VPSV","node_hash":"0000000000000005"},"edges":[{"tail":[0,3],"spans":[-1,-1,-1,-1],"feats":[100,1],"rule":2},{"tail":[2,4,0],"spans":[-1,-1,-1,-1],"feats":[101,1],"rule":6}],"node":{"in_edges":[6,7],"cat":"S1","node_hash":"0000000000000006"},"edges":[{"tail":[5],"spans":[-1,-1,-1,-1],"feats":[],"rule":1}],"node":{"in_edges":[8],"cat":"S","node_hash":"0000000000000001"}} diff --git a/tests/system_tests/cfg_rescore/weights b/tests/system_tests/cfg_rescore/weights new file mode 100644 index 00000000..bd3bb1af --- /dev/null +++ b/tests/system_tests/cfg_rescore/weights @@ -0,0 +1,3 @@ +Active 0.1 +Passive 1 +Definite 1 -- cgit v1.2.3