From c0ece722bd7775af14c7f2ec18dcd1fd138607cb Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 10 Oct 2014 14:20:59 +0100 Subject: algorithms: shift reduce dependency parsing --- algorithms/dependency_structures.cc | 382 ++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 algorithms/dependency_structures.cc (limited to 'algorithms/dependency_structures.cc') diff --git a/algorithms/dependency_structures.cc b/algorithms/dependency_structures.cc new file mode 100644 index 0000000..cd84d2a --- /dev/null +++ b/algorithms/dependency_structures.cc @@ -0,0 +1,382 @@ +#include +#include +#include +#include +#include + +using namespace std; + + +typedef char Word; +struct Node; +typedef vector Structure; + +struct Node { + Word word; + size_t id; + + Structure left; + Structure right; +}; + +struct StackItem { + char state; + + Structure d; +}; + +/* + * @brief Find node with given id below Node n. + * @detail That's variant of BFS + * + */ +Node* +find_in_node(Node* n, size_t const id) +{ + vector queue; + queue.push_back(n); + while (!queue.empty()) { + Node* cur = queue.back(); + queue.pop_back(); + if (id == cur->id) { + return cur; + } else if (id < cur->id) { + for (size_t i = 0; i < cur->left.size(); i++) + queue.push_back(&(cur->left[i])); + } else if (id > cur->id) { + for (size_t i = 0; i < cur->right.size(); i++) + queue.push_back(&(cur->right[i])); + } + } + + return NULL; +} + +/* + * @brief Pretty prints a structure. + * + */ +ostream& +print_structure(ostream& os, Structure& s, size_t level=0, char side='-') +{ + ostringstream indent; + for (size_t i = 0; i <= level; i++) + indent << " "; + + if (side == 'r' || side == '-') { + for (int i = s.size()-1; i >= 0; i--) { // sorted by id + os << indent.str() << s[i].id << "," << s[i].word << " " << side << endl; + print_structure(os, s[i].left, level+1, 'l'); + print_structure(os, s[i].right, level+1, 'r'); + } + } else { + for (size_t i = 0; i < s.size(); i++) { // sorted by id + os << indent.str() << s[i].id << "," << s[i].word << " " << side << endl; + print_structure(os, s[i].left, level+1, 'l'); + print_structure(os, s[i].right, level+1, 'r'); + } + } + + return os; +} + +/* + * @brief Fills dependency structure given words and + * DependencyInfo. + * + */ +void +make_structure(Structure& s, vector& words, vector links) +{ + // create nodes without hierarchy + vector all, queue, sorted; + size_t id = 0; + for (vector::iterator it = words.begin(); + it != words.end(); it++) { + Word word = *it; + Node n; + n.id = id; + n.word = word; + all.push_back(n); + if (links[id] == -1) + queue.push_back(n); + id++; + } + + // topologically sort before construction + while (!queue.empty()) { + Node n = queue.back(); + queue.pop_back(); + sorted.push_back(n); + size_t i = 0; + for (size_t j = 0; j < links.size(); j++) { + if (links[j] == n.id) + queue.push_back(all[i]); + i++; + } + } + + // build structure/s + for (vector::iterator n = sorted.begin(); + n != sorted.end(); n++) { + if (links[n->id] == -1) { // top-level + s.push_back(*n); + } else { + Node* a; + for (size_t k = 0; k < s.size(); k++) { + a = find_in_node(&(s[k]), links[n->id]); + if (a) { + if (n->id < a->id) { // left + a->left.push_back(*n); + } else { // right + a->right.push_back(*n); + } + break; + } + } + } + } +} + +void +score_and_prune(Node* n, bool final=false) +{ + if (n->left.size() == 0 + && n->right.size() == 0) return; + + for (int i = n->left.size()-1; i >= 0; i--) + cout << n->left[i].word; + cout << "{" << n->word; + if (final) cout << "!"; + cout << "}"; + for (int i = n->right.size()-1; i >= 0; i--) + cout << n->right[i].word; + cout << endl; + + for (int i = 0; i < n->left.size(); i++) + score_and_prune(&(n->left[i])); + for (int i = 0; i < n->right.size(); i++) + score_and_prune(&(n->right[i])); + + n->left.clear(); + n->right.clear(); +} + +/* + * where = left or right + * + */ +void +reduce(Node* head, Structure& a, char where) +{ + for (Structure::iterator it = a.begin(); + it != a.end(); it++) { + if (where == 'l') + head->left.push_back(*it); + else // where == 'r' + head->right.push_back(*it); + } +} + +void +test1() +{ + cerr << "TEST 1" << endl; + vector words; + words.push_back('e'); + words.push_back('f'); + words.push_back('g'); + words.push_back('x'); + words.push_back('h'); + words.push_back('i'); + vector links; + links.push_back(4); + links.push_back(0); + links.push_back(0); + links.push_back(2); + links.push_back(-1); + links.push_back(-1); + Structure s; + make_structure(s, words, links); + print_structure(cout, s); + cerr << endl; +} + +void +test2() +{ + cerr << "TEST 2" << endl; + vector words; + words.push_back('e'); + words.push_back('f'); + words.push_back('x'); + words.push_back('g'); + words.push_back('h'); + words.push_back('i'); + vector links; + links.push_back(4); + links.push_back(0); + links.push_back(3); + links.push_back(0); + links.push_back(-1); + links.push_back(-1); + Structure s; + make_structure(s, words, links); + print_structure(cout, s); + cerr << endl; +} + +void +test3() +{ + cerr << "TEST 3" << endl; + + vector w1; + w1.push_back('a'); + vector l1; + l1.push_back(-1); + Structure s1; + make_structure(s1, w1, l1); + print_structure(cout, s1); + cout << endl; + + vector w2; + w2.push_back('b'); + w2.push_back('c'); + vector l2; + l2.push_back(-1); + l2.push_back(-1); + Structure s2; + make_structure(s2, w2, l2); + print_structure(cout, s2); + cout << endl; + + vector w3; + w3.push_back('d'); + vector l3; + l3.push_back(-1); + Structure s3; + make_structure(s3, w3, l3); + print_structure(cout, s3); + cout << endl; + + vector w4; + w4.push_back('e'); + w4.push_back('f'); + w4.push_back('g'); + vector l4; + l4.push_back(-1); + l4.push_back(0); + l4.push_back(0); + Structure s4; + make_structure(s4, w4, l4); + print_structure(cout, s4); + cout << endl; + + vector w5; + w5.push_back('h'); + vector l5; + l5.push_back(-1); + Structure s5; + make_structure(s5, w5, l5); + print_structure(cout, s5); + cout << endl; +} + +void +test4() +{ + cerr << "TEST 3" << endl; + vector words; + words.push_back('a'); + words.push_back('b'); + words.push_back('c'); + words.push_back('d'); + words.push_back('e'); + words.push_back('f'); + words.push_back('g'); + words.push_back('h'); + words.push_back('i'); + vector links; + links.push_back(7); + links.push_back(3); + links.push_back(3); + links.push_back(7); + links.push_back(7); + links.push_back(4); + links.push_back(4); + links.push_back(-1); + links.push_back(-1); + Structure s; + make_structure(s, words, links); + print_structure(cout, s); + score_and_prune(&(s.back()), true); + cerr << endl; +} + +void +test5() +{ + cerr << "TEST 5" << endl; + + vector w2; + w2.push_back('b'); + w2.push_back('c'); + vector l2; + l2.push_back(-1); + l2.push_back(-1); + Structure s2; + make_structure(s2, w2, l2); + print_structure(cout, s2); + cout << endl; + + vector w3; + w3.push_back('d'); + vector l3; + l3.push_back(-1); + Structure s3; + make_structure(s3, w3, l3); + print_structure(cout, s3); + cout << endl; + + vector w4; + w4.push_back('e'); + w4.push_back('f'); + w4.push_back('g'); + vector l4; + l4.push_back(-1); + l4.push_back(0); + l4.push_back(0); + Structure s4; + make_structure(s4, w4, l4); + print_structure(cout, s4); + cout << endl; + + reduce(&(s3.back()), s2, 'l'); + print_structure(cout, s3); + cerr << endl; + + reduce(&(s3.back()), s4, 'r'); + print_structure(cout, s3); + cerr << endl; + + score_and_prune(&(s3.back().right[0])); + cerr << endl; + print_structure(cout, s3); + cerr << endl; + score_and_prune(&(s3.back())); + cerr << endl; +} + +int +main(void) +{ + test1(); + test2(); + test3(); + test4(); + test5(); + + return 0; +} + -- cgit v1.2.3