summaryrefslogtreecommitdiff
path: root/algorithms/dependency_structures.cc
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-10-10 14:20:59 +0100
committerPatrick Simianer <p@simianer.de>2014-10-10 14:20:59 +0100
commitc0ece722bd7775af14c7f2ec18dcd1fd138607cb (patch)
treefab93a5d48d89c9e3c7473a1ca5aafe7b3bcaeed /algorithms/dependency_structures.cc
parentde54ca7c31487a33fa8148159e9b3dea96a9dd4c (diff)
algorithms: shift reduce dependency parsing
Diffstat (limited to 'algorithms/dependency_structures.cc')
-rw-r--r--algorithms/dependency_structures.cc382
1 files changed, 382 insertions, 0 deletions
diff --git a/algorithms/dependency_structures.cc b/algorithms/dependency_structures.cc
new file mode 100644
index 0000000..cd84d2a
--- /dev/null
+++ b/algorithms/dependency_structures.cc
@@ -0,0 +1,382 @@
+#include <iostream>
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <sstream>
+
+using namespace std;
+
+
+typedef char Word;
+struct Node;
+typedef vector<Node> Structure;
+
+struct Node {
+ Word word;
+ size_t id;
+
+ Structure left;
+ Structure right;
+};
+
+struct StackItem {
+ char state;
+
+ Structure d;
+};
+
+/*
+ * @brief Find node with given id below Node n.
+ * @detail That's variant of BFS
+ *
+ */
+Node*
+find_in_node(Node* n, size_t const id)
+{
+ vector<Node*> queue;
+ queue.push_back(n);
+ while (!queue.empty()) {
+ Node* cur = queue.back();
+ queue.pop_back();
+ if (id == cur->id) {
+ return cur;
+ } else if (id < cur->id) {
+ for (size_t i = 0; i < cur->left.size(); i++)
+ queue.push_back(&(cur->left[i]));
+ } else if (id > cur->id) {
+ for (size_t i = 0; i < cur->right.size(); i++)
+ queue.push_back(&(cur->right[i]));
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * @brief Pretty prints a structure.
+ *
+ */
+ostream&
+print_structure(ostream& os, Structure& s, size_t level=0, char side='-')
+{
+ ostringstream indent;
+ for (size_t i = 0; i <= level; i++)
+ indent << " ";
+
+ if (side == 'r' || side == '-') {
+ for (int i = s.size()-1; i >= 0; i--) { // sorted by id
+ os << indent.str() << s[i].id << "," << s[i].word << " " << side << endl;
+ print_structure(os, s[i].left, level+1, 'l');
+ print_structure(os, s[i].right, level+1, 'r');
+ }
+ } else {
+ for (size_t i = 0; i < s.size(); i++) { // sorted by id
+ os << indent.str() << s[i].id << "," << s[i].word << " " << side << endl;
+ print_structure(os, s[i].left, level+1, 'l');
+ print_structure(os, s[i].right, level+1, 'r');
+ }
+ }
+
+ return os;
+}
+
+/*
+ * @brief Fills dependency structure given words and
+ * DependencyInfo.
+ *
+ */
+void
+make_structure(Structure& s, vector<Word>& words, vector<int> links)
+{
+ // create nodes without hierarchy
+ vector<Node> all, queue, sorted;
+ size_t id = 0;
+ for (vector<Word>::iterator it = words.begin();
+ it != words.end(); it++) {
+ Word word = *it;
+ Node n;
+ n.id = id;
+ n.word = word;
+ all.push_back(n);
+ if (links[id] == -1)
+ queue.push_back(n);
+ id++;
+ }
+
+ // topologically sort before construction
+ while (!queue.empty()) {
+ Node n = queue.back();
+ queue.pop_back();
+ sorted.push_back(n);
+ size_t i = 0;
+ for (size_t j = 0; j < links.size(); j++) {
+ if (links[j] == n.id)
+ queue.push_back(all[i]);
+ i++;
+ }
+ }
+
+ // build structure/s
+ for (vector<Node>::iterator n = sorted.begin();
+ n != sorted.end(); n++) {
+ if (links[n->id] == -1) { // top-level
+ s.push_back(*n);
+ } else {
+ Node* a;
+ for (size_t k = 0; k < s.size(); k++) {
+ a = find_in_node(&(s[k]), links[n->id]);
+ if (a) {
+ if (n->id < a->id) { // left
+ a->left.push_back(*n);
+ } else { // right
+ a->right.push_back(*n);
+ }
+ break;
+ }
+ }
+ }
+ }
+}
+
+void
+score_and_prune(Node* n, bool final=false)
+{
+ if (n->left.size() == 0
+ && n->right.size() == 0) return;
+
+ for (int i = n->left.size()-1; i >= 0; i--)
+ cout << n->left[i].word;
+ cout << "{" << n->word;
+ if (final) cout << "!";
+ cout << "}";
+ for (int i = n->right.size()-1; i >= 0; i--)
+ cout << n->right[i].word;
+ cout << endl;
+
+ for (int i = 0; i < n->left.size(); i++)
+ score_and_prune(&(n->left[i]));
+ for (int i = 0; i < n->right.size(); i++)
+ score_and_prune(&(n->right[i]));
+
+ n->left.clear();
+ n->right.clear();
+}
+
+/*
+ * where = left or right
+ *
+ */
+void
+reduce(Node* head, Structure& a, char where)
+{
+ for (Structure::iterator it = a.begin();
+ it != a.end(); it++) {
+ if (where == 'l')
+ head->left.push_back(*it);
+ else // where == 'r'
+ head->right.push_back(*it);
+ }
+}
+
+void
+test1()
+{
+ cerr << "TEST 1" << endl;
+ vector<Word> words;
+ words.push_back('e');
+ words.push_back('f');
+ words.push_back('g');
+ words.push_back('x');
+ words.push_back('h');
+ words.push_back('i');
+ vector<int> links;
+ links.push_back(4);
+ links.push_back(0);
+ links.push_back(0);
+ links.push_back(2);
+ links.push_back(-1);
+ links.push_back(-1);
+ Structure s;
+ make_structure(s, words, links);
+ print_structure(cout, s);
+ cerr << endl;
+}
+
+void
+test2()
+{
+ cerr << "TEST 2" << endl;
+ vector<Word> words;
+ words.push_back('e');
+ words.push_back('f');
+ words.push_back('x');
+ words.push_back('g');
+ words.push_back('h');
+ words.push_back('i');
+ vector<int> links;
+ links.push_back(4);
+ links.push_back(0);
+ links.push_back(3);
+ links.push_back(0);
+ links.push_back(-1);
+ links.push_back(-1);
+ Structure s;
+ make_structure(s, words, links);
+ print_structure(cout, s);
+ cerr << endl;
+}
+
+void
+test3()
+{
+ cerr << "TEST 3" << endl;
+
+ vector<Word> w1;
+ w1.push_back('a');
+ vector<int> l1;
+ l1.push_back(-1);
+ Structure s1;
+ make_structure(s1, w1, l1);
+ print_structure(cout, s1);
+ cout << endl;
+
+ vector<Word> w2;
+ w2.push_back('b');
+ w2.push_back('c');
+ vector<int> l2;
+ l2.push_back(-1);
+ l2.push_back(-1);
+ Structure s2;
+ make_structure(s2, w2, l2);
+ print_structure(cout, s2);
+ cout << endl;
+
+ vector<Word> w3;
+ w3.push_back('d');
+ vector<int> l3;
+ l3.push_back(-1);
+ Structure s3;
+ make_structure(s3, w3, l3);
+ print_structure(cout, s3);
+ cout << endl;
+
+ vector<Word> w4;
+ w4.push_back('e');
+ w4.push_back('f');
+ w4.push_back('g');
+ vector<int> l4;
+ l4.push_back(-1);
+ l4.push_back(0);
+ l4.push_back(0);
+ Structure s4;
+ make_structure(s4, w4, l4);
+ print_structure(cout, s4);
+ cout << endl;
+
+ vector<Word> w5;
+ w5.push_back('h');
+ vector<int> l5;
+ l5.push_back(-1);
+ Structure s5;
+ make_structure(s5, w5, l5);
+ print_structure(cout, s5);
+ cout << endl;
+}
+
+void
+test4()
+{
+ cerr << "TEST 3" << endl;
+ vector<Word> words;
+ words.push_back('a');
+ words.push_back('b');
+ words.push_back('c');
+ words.push_back('d');
+ words.push_back('e');
+ words.push_back('f');
+ words.push_back('g');
+ words.push_back('h');
+ words.push_back('i');
+ vector<int> links;
+ links.push_back(7);
+ links.push_back(3);
+ links.push_back(3);
+ links.push_back(7);
+ links.push_back(7);
+ links.push_back(4);
+ links.push_back(4);
+ links.push_back(-1);
+ links.push_back(-1);
+ Structure s;
+ make_structure(s, words, links);
+ print_structure(cout, s);
+ score_and_prune(&(s.back()), true);
+ cerr << endl;
+}
+
+void
+test5()
+{
+ cerr << "TEST 5" << endl;
+
+ vector<Word> w2;
+ w2.push_back('b');
+ w2.push_back('c');
+ vector<int> l2;
+ l2.push_back(-1);
+ l2.push_back(-1);
+ Structure s2;
+ make_structure(s2, w2, l2);
+ print_structure(cout, s2);
+ cout << endl;
+
+ vector<Word> w3;
+ w3.push_back('d');
+ vector<int> l3;
+ l3.push_back(-1);
+ Structure s3;
+ make_structure(s3, w3, l3);
+ print_structure(cout, s3);
+ cout << endl;
+
+ vector<Word> w4;
+ w4.push_back('e');
+ w4.push_back('f');
+ w4.push_back('g');
+ vector<int> l4;
+ l4.push_back(-1);
+ l4.push_back(0);
+ l4.push_back(0);
+ Structure s4;
+ make_structure(s4, w4, l4);
+ print_structure(cout, s4);
+ cout << endl;
+
+ reduce(&(s3.back()), s2, 'l');
+ print_structure(cout, s3);
+ cerr << endl;
+
+ reduce(&(s3.back()), s4, 'r');
+ print_structure(cout, s3);
+ cerr << endl;
+
+ score_and_prune(&(s3.back().right[0]));
+ cerr << endl;
+ print_structure(cout, s3);
+ cerr << endl;
+ score_and_prune(&(s3.back()));
+ cerr << endl;
+}
+
+int
+main(void)
+{
+ test1();
+ test2();
+ test3();
+ test4();
+ test5();
+
+ return 0;
+}
+