diff options
author | Chris Dyer <redpony@gmail.com> | 2014-03-12 02:31:37 -0400 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2014-03-12 02:31:37 -0400 |
commit | 23fe12f6d5083c66d755b797b8d0d5587f95de8e (patch) | |
tree | a0447c8d05e18f50a111c909f3ac40b5d30124e6 | |
parent | 3cebc7496f328359f8f698085c02306c55b5a371 (diff) |
tree_fragment stuff
-rw-r--r-- | decoder/tree_fragment.cc | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/decoder/tree_fragment.cc b/decoder/tree_fragment.cc new file mode 100644 index 00000000..d5c30f58 --- /dev/null +++ b/decoder/tree_fragment.cc @@ -0,0 +1,115 @@ +#include "tree_fragment.h" + +#include <cassert> + +using namespace std; + +namespace cdec { + +TreeFragment::TreeFragment(const string& tree, bool allow_frontier_sites) { + int bal = 0; + const unsigned len = tree.size(); + unsigned cur = 0; + unsigned open = 0, close = 0; + for (auto& c : tree) { + ++cur; + if (c == '(') { ++open; ++bal; } + else if (c == ')') { + ++close; --bal; + if (bal < 1 && cur != len) { + cerr << "Badly formed tree detected at column " << cur << " in:\n" << tree << endl; + abort(); + } + } + } + nodes.resize(open); + unsigned cp = 0, symp = 0, np = 0; + ParseRec(tree, allow_frontier_sites, cp, symp, np, &cp, &symp, &np); + root = nodes.back().lhs; + //cerr << "ROOT: " << TD::Convert(root & ALL_MASK) << endl; + //DebugRec(open - 1, &cerr); cerr << "\n"; +} + +void TreeFragment::DebugRec(unsigned cur, ostream* out) const { + *out << '(' << TD::Convert(nodes[cur].lhs & ALL_MASK); + for (auto& x : nodes[cur].rhs) { + *out << ' '; + if (IsFrontier(x)) { + *out << '[' << TD::Convert(x & ALL_MASK) << ']'; + } else if (IsInternalNT(x)) { + DebugRec(x & ALL_MASK, out); + } else { // must be terminal + *out << TD::Convert(x); + } + } + *out << ')'; +} + +// cp is the character index in the tree +// np keeps track of the nodes (nonterminals) that have been built +// symp keeps track of the terminal symbols that have been built +void TreeFragment::ParseRec(const string& tree, bool afs, unsigned cp, unsigned symp, unsigned np, unsigned* pcp, unsigned* psymp, unsigned* pnp) { + if (tree[cp] != '(') { + cerr << "Expected ( at " << cp << endl; + abort(); + } + const unsigned i = symp; + vector<unsigned> rhs; // w | 0 = terminal, w | NT_BIT, index | FRONTIER_BIT + ++cp; + while(tree[cp] == ' ') { ++cp; } + const unsigned nt_start = cp; + while(tree[cp] != ' ' && tree[cp] != '(' && tree[cp] != ')') { ++cp; } + const unsigned nt_end = cp; + while(tree[cp] == ' ') { ++cp; } + while (tree[cp] != ')') { + if (tree[cp] == '(') { + // recursively call parser to deal with constituent + ParseRec(tree, afs, cp, symp, np, &cp, &symp, &np); + unsigned ind = np - 1; + rhs.push_back(ind | NT_BIT); + } else { // deal with terminal / nonterminal substitution + ++symp; + assert(tree[cp] != ' '); + const unsigned t_start = cp; + while(tree[cp] != ' ' && tree[cp] != ')' && tree[cp] != '(') { ++cp; } + const unsigned t_end = cp; + while(tree[cp] == ' ') { ++cp; } + // TODO: add a terminal symbol to the current edge + const bool is_terminal = tree[t_start] != '[' || (t_end - t_start < 3 || tree[t_end - 1] != ']'); + if (is_terminal) { + const unsigned term = TD::Convert(tree.substr(t_start, t_end - t_start)); + rhs.push_back(term); + // cerr << "T='" << TD::Convert(term) << "'\n"; + ++terminals; + } else { // frontier site (NT but no recursion) + const unsigned nt = TD::Convert(tree.substr(t_start + 1, t_end - t_start - 2)) | FRONTIER_BIT; + rhs.push_back(nt); + ++frontier_sites; + // cerr << "FRONT-NT=[" << TD::Convert(nt & ALL_MASK) << "]\n"; + if (!afs) { + cerr << "Frontier sites not allowed in input: " << tree << endl; + abort(); + } + } + } + } // continuent has completed, cp is at ), build node + const unsigned j = symp; // span from (i,j) + // add an internal non-terminal symbol + const unsigned nt = TD::Convert(tree.substr(nt_start, nt_end - nt_start)) | NT_BIT; + nodes[np] = TreeFragmentProduction(nt, rhs); + //cerr << np << " production(" << i << "," << j << ")= " << TD::Convert(nt & ALL_MASK) << " -->"; + //for (auto& x : rhs) { + // cerr << ' '; + // if (IsFrontier(x)) cerr << '*'; + // if (IsInternalNT(x)) cerr << TD::Convert(nodes[x & ALL_MASK].lhs & ALL_MASK); else + // cerr << TD::Convert(x & ALL_MASK); + //} + //cerr << "\n "; DebugRec(np,&cerr); cerr << endl; + ++cp; + while(tree[cp] == ' ' && cp < tree.size()) { ++cp; } + *pcp = cp; + *pnp = np + 1; + *psymp = symp; +} + +} |