summary | refs | log | tree | commit | diff
path: root/decoder/trule.cc
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
committer Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
commit 244971287003d079e46193b8a209c28955f90134 (patch)
tree 8beaae6b12b913acb213fc7f2415fd63886192f9 /decoder/trule.cc
parent 5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (diff)
parent b66e838ed52decc0be1eb5817b2a77c3840db2c5 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'decoder/trule.cc')
-rw-r--r-- decoder/trule.cc 204
1 files changed, 21 insertions, 183 deletions
diff --git a/decoder/trule.cc b/decoder/trule.cc
index c22baae3..bee211d5 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -17,73 +17,16 @@ bool TRule::IsGoal() const {
return GetLHS() == kGOAL;
}
-static WordID ConvertTrgString(const string& w) {
- const unsigned len = w.size();
- WordID id = 0;
- // [X,0] or [0]
- // for target rules, we ignore the category, just keep the index
- if (len > 2 && w[0]=='[' && w[len-1]==']' && w[len-2] > '0' && w[len-2] <= '9' &&
- (len == 3 || (len > 4 && w[len-3] == ','))) {
- id = w[len-2] - '0';
- id = 1 - id;
- } else {
- id = TD::Convert(w);
- }
- return id;
-}
-
-static WordID ConvertSrcString(const string& w, bool mono = false) {
- const unsigned len = w.size();
- // [X,0]
- // for source rules, we keep the category and ignore the index (source rules are
- // always numbered 1, 2, 3...
- if (mono) {
- if (len > 2 && w[0]=='[' && w[len-1]==']') {
- if (len > 4 && w[len-3] == ',') {
- cerr << "[ERROR] Monolingual rules mut not have non-terminal indices:\n "
- << w << endl;
- exit(1);
- }
- // TODO check that source indices go 1,2,3,etc.
- return TD::Convert(w.substr(1, len-2)) * -1;
- } else {
- return TD::Convert(w);
- }
- } else {
- if (len > 4 && w[0]=='[' && w[len-1]==']' && w[len-3] == ',' && w[len-2] > '0' && w[len-2] <= '9') {
- return TD::Convert(w.substr(1, len-4)) * -1;
- } else {
- return TD::Convert(w);
- }
- }
-}
-
-static WordID ConvertLHS(const string& w) {
- if (w[0] == '[') {
- const unsigned len = w.size();
- if (len < 3) { cerr << "Format error: " << w << endl; exit(1); }
- return TD::Convert(w.substr(1, len-2)) * -1;
- } else {
- return TD::Convert(w) * -1;
- }
-}
-
TRule* TRule::CreateRuleSynchronous(const string& rule) {
TRule* res = new TRule;
- if (res->ReadFromString(rule, true, false)) return res;
+ if (res->ReadFromString(rule)) return res;
cerr << "[ERROR] Failed to creating rule from: " << rule << endl;
delete res;
return NULL;
}
TRule* TRule::CreateRulePhrasetable(const string& rule) {
- // TODO make this faster
- // TODO add configuration for default NT type
- if (rule[0] == '[') {
- cerr << "Phrasetable rules shouldn't have a LHS / non-terminals:\n " << rule << endl;
- return NULL;
- }
- TRule* res = new TRule("[X] ||| " + rule, true, false);
+ TRule* res = new TRule("[X] ||| " + rule);
if (res->Arity() != 0) {
cerr << "Phrasetable rules should have arity 0:\n " << rule << endl;
delete res;
@@ -93,138 +36,33 @@ TRule* TRule::CreateRulePhrasetable(const string& rule) {
}
TRule* TRule::CreateRuleMonolingual(const string& rule) {
- return new TRule(rule, false, true);
+ return new TRule(rule, true);
}
namespace {
-// callback for lexer
+// callback for single rule lexer
int n_assigned=0;
-void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) {
- (void) ctf_level;
- (void) coarse_rule;
- TRule *assignto=(TRule *)extra;
- *assignto=*new_rule;
- ++n_assigned;
-}
-
-}
-
-bool TRule::ReadFromString(const string& line, bool strict, bool mono) {
- if (!is_single_line_stripped(line))
- cerr<<"\nWARNING: building rule from multi-line string "<<line<<".\n";
- // backed off of this: it's failing to parse TRulePtr glue(new TRule("[" + goal_nt + "] ||| [" + goal_nt + ",1] ["+ default_nt + ",2] ||| [1] [2] ||| Glue=1")); thinks [1] is the features!
- if (false && !(mono||strict)) {
- // use lexer
- istringstream il(line);
- n_assigned=0;
- RuleLexer::ReadRules(&il,assign_trule,"STRING",this);
- if (n_assigned>1)
- cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<<line<<".\n";
- return n_assigned;
+ void assign_trule(const TRulePtr& new_rule, const unsigned int ctf_level, const TRulePtr& coarse_rule, void* extra) {
+ (void) ctf_level;
+ (void) coarse_rule;
+ *static_cast<TRule*>(extra) = *new_rule;
+ ++n_assigned;
}
+}
- e_.clear();
- f_.clear();
- scores_.clear();
-
- string w;
- istringstream is(line);
- int format = CountSubstrings(line, "|||");
- if (strict && format < 2) {
- cerr << "Bad rule format in strict mode:\n" << line << endl;
- return false;
- }
- if (format >= 2 || (mono && format == 1)) {
- while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); }
- while(is>>w && w!="|||") { f_.push_back(ConvertSrcString(w, mono)); }
- if (!mono) {
- while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); }
- }
- int fv = 0;
- if (is) {
- string ss;
- getline(is, ss);
- //cerr << "L: " << ss << endl;
- unsigned start = 0;
- unsigned len = ss.size();
- const size_t ppos = ss.find(" |||");
- if (ppos != string::npos) { len = ppos; }
- while (start < len) {
- while(start < len && (ss[start] == ' ' || ss[start] == ';'))
- ++start;
- if (start == len) break;
- unsigned end = start + 1;
- while(end < len && (ss[end] != '=' && ss[end] != ' ' && ss[end] != ';'))
- ++end;
- if (end == len || ss[end] == ' ' || ss[end] == ';') {
- //cerr << "PROC: '" << ss.substr(start, end - start) << "'\n";
- // non-named features
- if (end != len) { ss[end] = 0; }
- string fname = "PhraseModel_X";
- if (fv > 9) { cerr << "Too many phrasetable scores - used named format\n"; abort(); }
- fname[12]='0' + fv;
- ++fv;
- // if the feature set is frozen, this may return zero, indicating an
- // undefined feature
- const int fid = FD::Convert(fname);
- if (fid)
- scores_.set_value(fid, atof(&ss[start]));
- //cerr << "F: " << fname << " VAL=" << scores_.value(FD::Convert(fname)) << endl;
- } else {
- const int fid = FD::Convert(ss.substr(start, end - start));
- start = end + 1;
- end = start + 1;
- while(end < len && (ss[end] != ' ' && ss[end] != ';'))
- ++end;
- if (end < len) { ss[end] = 0; }
- assert(start < len);
- if (fid)
- scores_.set_value(fid, atof(&ss[start]));
- //cerr << "F: " << FD::Convert(fid) << " VAL=" << scores_.value(fid) << endl;
- }
- start = end + 1;
- }
- }
- } else if (format == 1) {
- while(is>>w && w!="|||") { lhs_ = ConvertLHS(w); }
- while(is>>w && w!="|||") { e_.push_back(ConvertTrgString(w)); }
- f_ = e_;
- int x = ConvertLHS("[X]");
- for (unsigned i = 0; i < f_.size(); ++i)
- if (f_[i] <= 0) { f_[i] = x; }
- } else {
- cerr << "F: " << format << endl;
- cerr << "[ERROR] Don't know how to read:\n" << line << endl;
- }
+bool TRule::ReadFromString(const string& line, bool mono) {
+ n_assigned = 0;
+ //cerr << "LINE: " << line << " -- mono=" << mono << endl;
+ RuleLexer::ReadRule(line + '\n', assign_trule, mono, this);
+ if (n_assigned > 1)
+ cerr<<"\nWARNING: more than one rule parsed from multi-line string; kept last: "<<line<<".\n";
if (mono) {
e_ = f_;
- int ci = 0;
- for (unsigned i = 0; i < e_.size(); ++i)
- if (e_[i] < 0)
- e_[i] = ci--;
- }
- ComputeArity();
- return SanityCheck();
-}
-
-bool TRule::SanityCheck() const {
- vector<int> used(f_.size(), 0);
- int ac = 0;
- for (unsigned i = 0; i < e_.size(); ++i) {
- int ind = e_[i];
- if (ind > 0) continue;
- ind = -ind;
- if ((++used[ind]) != 1) {
- cerr << "[ERROR] e-side variable index " << (ind+1) << " used more than once!\n";
- return false;
- }
- ac++;
- }
- if (ac != Arity()) {
- cerr << "[ERROR] e-side arity mismatches f-side\n";
- return false;
+ int ntc = 0;
+ for (auto& i : e_)
+ if (i < 0) i = -ntc++;
}
- return true;
+ return n_assigned;
}
void TRule::ComputeArity() {
@@ -245,7 +83,7 @@ string TRule::AsString(bool verbose) const {
if (w < 0) {
int wi = w * -1;
++idx;
- os << " [" << TD::Convert(wi) << ',' << idx << ']';
+ os << " [" << TD::Convert(wi) << ']';
} else {
os << ' ' << TD::Convert(w);
}