From 567f2bd17c05d31cd8a9b9d351f9030cddf53cb7 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 9 Jul 2014 15:26:00 +0200 Subject: c++ implementation --- fast/Makefile | 12 ++++ fast/README.md | 8 +++ fast/fast_weaver | Bin 0 -> 88433 bytes fast/grammar.cc | 16 ++++++ fast/grammar.hh | 33 +++++++++++ fast/grammar.o | Bin 0 -> 2872 bytes fast/hypergraph.cc | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fast/hypergraph.hh | 76 +++++++++++++++++++++++++ fast/hypergraph.o | Bin 0 -> 72616 bytes fast/main.cc | 138 +++++++++++++++++++++++++++++++++++++++++++++ fast/semiring.hh | 36 ++++++++++++ 11 files changed, 480 insertions(+) create mode 100644 fast/Makefile create mode 100644 fast/README.md create mode 100755 fast/fast_weaver create mode 100644 fast/grammar.cc create mode 100644 fast/grammar.hh create mode 100644 fast/grammar.o create mode 100644 fast/hypergraph.cc create mode 100644 fast/hypergraph.hh create mode 100644 fast/hypergraph.o create mode 100644 fast/main.cc create mode 100644 fast/semiring.hh diff --git a/fast/Makefile b/fast/Makefile new file mode 100644 index 0000000..4aea1ac --- /dev/null +++ b/fast/Makefile @@ -0,0 +1,12 @@ +all: hypergraph.o main.cc + clang -std=c++11 -lstdc++ main.cc hypergraph.o -o fast_weaver + +hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh + clang -std=c++11 -lstdc++ -c hypergraph.cc grammar.o + +grammar.o: grammar.cc grammar.hh + clang -std=c++11 -lstdc++ -c grammar.cc + +clean: + rm fast_weaver hypergraph.o grammar.o + diff --git a/fast/README.md b/fast/README.md new file mode 100644 index 0000000..112a7ae --- /dev/null +++ b/fast/README.md @@ -0,0 +1,8 @@ +TODO + * grammar + * parser + * other semirings + * sparse vector + * hg: json input (jsoncpp?) + * language model: kenlm + diff --git a/fast/fast_weaver b/fast/fast_weaver new file mode 100755 index 0000000..7d349b3 Binary files /dev/null and b/fast/fast_weaver differ diff --git a/fast/grammar.cc b/fast/grammar.cc new file mode 100644 index 0000000..946b062 --- /dev/null +++ b/fast/grammar.cc @@ -0,0 +1,16 @@ +#include "grammar.hh" + +namespace Grammar { + + +string +NT::s() +{ + ostringstream os; + os << "NT<" << this->symbol << "," << this->index << ">"; + return os.str(); +} + + +} // namespace + diff --git a/fast/grammar.hh b/fast/grammar.hh new file mode 100644 index 0000000..5625b85 --- /dev/null +++ b/fast/grammar.hh @@ -0,0 +1,33 @@ +#ifndef GRAMMAR_HH +#define GRAMMAR_HH + +#include +#include + +using namespace std; + +namespace Grammar { + + +class NT { + public: + string symbol; + unsigned int index; + + string s(); +}; + +class T { + public: + string word; +}; + +class Rule { + public: +}; + + +} // namespace + +#endif + diff --git a/fast/grammar.o b/fast/grammar.o new file mode 100644 index 0000000..aae25db Binary files /dev/null and b/fast/grammar.o differ diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc new file mode 100644 index 0000000..08b4d05 --- /dev/null +++ b/fast/hypergraph.cc @@ -0,0 +1,161 @@ +#include "hypergraph.hh" + +namespace Hg { + + +/* + * Node + * methods + * + */ +string +Node::s() +{ + ostringstream os; + os << \ + "Nodeid << \ + ", symbol=" << this->symbol << \ + ", span=(" << this->left << "," << this->right << ")" \ + ", outgoing:" << this->outgoing.size() << \ + ", incoming:" << this->incoming.size() << ">"; + return os.str(); +} + +/* + * Hyperedge + * methods + * + */ +unsigned int +Hyperedge::arity() +{ + return this->arity_; +} + +bool +Hyperedge::is_marked() +{ + return (this->arity_ == this->mark); +} + +string +Hyperedge::s() +{ + ostringstream os; + vector tails_ids; + ostringstream _; + for_each(this->tails.begin(), this->tails.end(), [&_](Node* n) { _ << n->id; _ << ","; }); + os << \ + "Hyperedgehead->id << \ + ", rule:'" << "TODO" << \ + "', tails=" << _.str() << \ + ", arity=" << this->arity() << \ + ", score=" << this->score << \ + " , f=" << "TODO" << \ + ", mark=" << this->mark << ">"; + return os.str(); +} + + +/* + * Hypergraph + * methods + * + */ +unsigned int +Hypergraph::arity() +{ + // TODO + return 0; +} + +void +Hypergraph::reset() +{ +} + +string +Hypergraph::s() +{ + // TODO + return ""; +} + +string +Hypergraph::json_s() +{ + // TODO + return ""; +} + +/* + * functions + * + */ +vector +topological_sort(vector& nodes) +{ + vector sorted; + vector tmp; + for_each(nodes.begin(), nodes.end(), [&tmp](Node* n) -> void { if (n->incoming.size()==0) tmp.push_back(n); }); + while (!tmp.empty()) { + Node* n = tmp.front(); + cout << n->id << endl; + tmp.erase(tmp.begin()); + sorted.push_back(n); + for (auto it = n->outgoing.begin(); it != n->outgoing.end(); it++) { // edges + if ((*it)->is_marked()) { + continue; + } + (*it)->mark++; + bool add = true; + for (auto jt = (*it)->head->incoming.begin(); jt != (*it)->head->incoming.end(); jt++) { + if (!(*jt)->is_marked()) { + add = false; + break; + } + } + if (add==true) { + tmp.push_back( (*it)->head ); + cout << " " << (*it)->head->id<< endl; + } + } + } + return sorted; +} + +void +init(vector& nodes, ViterbiSemiring& semiring, Node* root) +{ + for (auto it = nodes.begin(); it != nodes.end(); ++it) + (*it)->score = semiring.null; + root->score = semiring.one; +} + +void +viterbi(vector& nodes, map nodes_by_id, Node* root) +{ + vector sorted = topological_sort(nodes); + ViterbiSemiring semiring; + + init(sorted, semiring, root); + + for (auto n_it = sorted.begin(); n_it != sorted.end(); ++n_it) { + for (auto e_it = (*n_it)->incoming.begin(); e_it != (*n_it)->incoming.end(); ++e_it) { + cout << (*e_it)->s() << endl; + double s = semiring.one; + for (auto m_it = (*e_it)->tails.begin(); m_it != (*e_it)->tails.end(); m_it++) { + s = semiring.multiply(s, (*m_it)->score); + } + (*n_it)->score = semiring.add((*n_it)->score, semiring.multiply(s, (*e_it)->score)); + } + } + + for (auto it = sorted.begin(); it != sorted.end(); ++it) { + cout << (*it)->id << " " << (*it)->score << endl; + } +} + + +} // namespace + diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh new file mode 100644 index 0000000..6e53045 --- /dev/null +++ b/fast/hypergraph.hh @@ -0,0 +1,76 @@ +#ifndef HYPERGRAPH_HH +#define HYPERGRAPH_HH + +#include "grammar.hh" +#include "semiring.hh" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef double score_t; +typedef double weight_t; + +namespace Hg { + + +class Node; + +class Hyperedge { + public: + Node* head; + vector tails; + score_t score; + vector f; + unsigned int mark; + //Grammar::Rule rule; FIXME + unsigned int arity_; + + unsigned int arity(); + bool is_marked(); + string s(); + +}; + + +class Node { + public: + unsigned int id; + string symbol; + unsigned int left; + unsigned int right; + score_t score; + vector outgoing; + vector incoming; + + string s(); +}; + + + +class Hypergraph { + public: + vector nodes; + vector edges; + unsigned int arity_; + map nodes_by_id; + + unsigned int arity(); + void reset(); + string s(); + string json_s(); +}; + +vector topological_sort(vector& nodes); +void viterbi(vector& nodes, map nodes_by_id, Node* root); + + +} // namespace + +#endif + diff --git a/fast/hypergraph.o b/fast/hypergraph.o new file mode 100644 index 0000000..8fab348 Binary files /dev/null and b/fast/hypergraph.o differ diff --git a/fast/main.cc b/fast/main.cc new file mode 100644 index 0000000..c951cd8 --- /dev/null +++ b/fast/main.cc @@ -0,0 +1,138 @@ +#include "hypergraph.hh" + + +int +main(void) +{ +/* +{ +"weights":{"logp":2.0,"use_house":0.0,"use_shell":1.0}, +"nodes": +[ +{ "id":0, "cat":"root", "span":[0,0] }, +{ "id":1, "cat":"NP", "span":[1,2] }, +{ "id":2, "cat":"V", "span":[2,3] }, +{ "id":3, "cat":"JJ", "span":[4,5] }, +{ "id":4, "cat":"NN", "span":[4,6] }, +{ "id":5, "cat":"NP", "span":[3,6] }, +{ "id":6, "cat":"VP", "span":[2,6] }, +{ "id":7, "cat":"S", "span":[1,6] } +], +"edges": +[ +{ "head":1, "rule":"[NP] ||| ich ||| i", "tails":[0], "f":{"logp":-0.5,"use_i":1.0} }, +{ "head":2, "rule":"[V] ||| sah ||| saw", "tails":[0], "f":{"logp":-0.25,"use_saw":1.0} }, +{ "head":3, "rule":"[JJ] ||| kleines ||| small", "tails":[0], "f":{"logp":0.0,"use_small":1.0} }, +{ "head":3, "rule":"[JJ] ||| kleines ||| little", "tails":[0], "f":{"logp":0.0,"use_little":1.0} }, +{ "head":4, "rule":"[NN] ||| kleines haus ||| small house", "tails":[0], "f":{"logp":0.0,"use_house":1.0} }, +{ "head":4, "rule":"[NN] ||| kleines haus ||| little house", "tails":[0], "f":{"logp":0.0,"use_house":1.0} }, +{ "head":4, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] shell", "tails":[3], "f":{"logp":0.0,"use_shell":1.0} }, +{ "head":4, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] house", "tails":[3], "f":{"logp":0.0,"use_house":1.0} }, +{ "head":5, "rule":"[NP] ||| ein [NN,1] ||| a [NN,1]", "tails":[4], "f":{"logp":0.0,"use_a":1.0} }, +{ "head":6, "rule":"[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2]", "tails":[2, 5], "f":{"logp":0.0} }, +{ "head":7, "rule":"[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2]", "tails":[1, 6], "f":{"logp":0.0} } +] +} +*/ + + // nodes + Hg::Node a; a.id = 0; a.symbol = "root"; a.left = false; a.right = false; + Hg::Node b; b.id = 1; b.symbol = "NP"; b.left = 0; b.right = 1; + Hg::Node c; c.id = 2; c.symbol = "V"; c.left = 1; c.right = 2; + Hg::Node d; d.id = 3; d.symbol = "JJ"; d.left = 3; d.right = 4; + Hg::Node e; e.id = 4; e.symbol = "NN"; e.left = 3; e.right = 5; + Hg::Node f; f.id = 5; f.symbol = "NP"; f.left = 2; f.right = 5; + Hg::Node g; g.id = 6; g.symbol = "NP"; g.left = 1; g.right = 5; + Hg::Node h; h.id = 7; h.symbol = "S"; g.left = 0; g.right = 5; + + vector nodes; + nodes.push_back(&a); + nodes.push_back(&b); + nodes.push_back(&c); + nodes.push_back(&d); + nodes.push_back(&e); + nodes.push_back(&f); + nodes.push_back(&g); + nodes.push_back(&h); + + // nodes_by_id + map nodes_by_id; + nodes_by_id[0] = &a; + nodes_by_id[1] = &b; + nodes_by_id[2] = &c; + nodes_by_id[3] = &d; + nodes_by_id[4] = &e; + nodes_by_id[5] = &f; + nodes_by_id[6] = &g; + nodes_by_id[7] = &h; + + // edges + Hg::Hyperedge q; q.head = nodes_by_id[1]; q.tails.push_back(nodes_by_id[0]); q.score = 0.367879441171; + nodes_by_id[1]->incoming.push_back(&q); + nodes_by_id[0]->outgoing.push_back(&q); + q.arity_ = 1; + q.mark = 0; + + Hg::Hyperedge p; p.head = nodes_by_id[2]; p.tails.push_back(nodes_by_id[0]); p.score = 0.606530659713; + nodes_by_id[2]->incoming.push_back(&p); + nodes_by_id[0]->outgoing.push_back(&p); + p.arity_ = 1; + p.mark = 0; + + Hg::Hyperedge r; r.head = nodes_by_id[3]; r.tails.push_back(nodes_by_id[0]); r.score = 1.0; + nodes_by_id[3]->incoming.push_back(&r); + nodes_by_id[0]->outgoing.push_back(&r); + r.arity_ = 1; + r.mark = 0; + Hg::Hyperedge s; s.head = nodes_by_id[3]; s.tails.push_back(nodes_by_id[0]); s.score = 1.0; + nodes_by_id[3]->incoming.push_back(&s); + nodes_by_id[0]->outgoing.push_back(&s); + s.arity_ = 1; + s.mark = 0; + + Hg::Hyperedge t; t.head = nodes_by_id[4]; t.tails.push_back(nodes_by_id[0]); t.score = 1.0; + nodes_by_id[4]->incoming.push_back(&t); + nodes_by_id[0]->outgoing.push_back(&t); + t.arity_ = 1; + t.mark = 0; + Hg::Hyperedge u; u.head = nodes_by_id[4]; u.tails.push_back(nodes_by_id[0]); u.score = 1.0; + nodes_by_id[4]->incoming.push_back(&u); + nodes_by_id[0]->outgoing.push_back(&u); + u.arity_ = 1; + u.mark = 0; + + Hg::Hyperedge v; v.head = nodes_by_id[4]; v.tails.push_back(nodes_by_id[3]); v.score = 1.0; + nodes_by_id[4]->incoming.push_back(&v); + nodes_by_id[3]->outgoing.push_back(&v); + v.arity_ = 1; + v.mark = 0; + Hg::Hyperedge w; w.head = nodes_by_id[4]; w.tails.push_back(nodes_by_id[3]); w.score = 2.71828182846; + nodes_by_id[4]->incoming.push_back(&w); + nodes_by_id[3]->outgoing.push_back(&w); + w.arity_ = 1; + w.mark = 0; + + Hg::Hyperedge x; x.head = nodes_by_id[5]; x.tails.push_back(nodes_by_id[4]); x.score = 1.0; + nodes_by_id[5]->incoming.push_back(&x); + nodes_by_id[4]->outgoing.push_back(&x); + x.arity_ = 1; + x.mark = 0; + + Hg::Hyperedge y; y.head = nodes_by_id[6]; y.tails.push_back(nodes_by_id[2]); y.tails.push_back(nodes_by_id[5]); y.score = 1.0; + nodes_by_id[6]->incoming.push_back(&y); + nodes_by_id[2]->outgoing.push_back(&y); + nodes_by_id[5]->outgoing.push_back(&y); + y.arity_ = 2; + y.mark = 0; + + Hg::Hyperedge z; z.head = nodes_by_id[7]; z.tails.push_back(nodes_by_id[1]); z.tails.push_back(nodes_by_id[6]); z.score = 1.0; + nodes_by_id[7]->incoming.push_back(&z); + nodes_by_id[1]->outgoing.push_back(&z); + nodes_by_id[6]->outgoing.push_back(&z); + z.arity_ = 2; + z.mark = 0; + + // test + Hg::viterbi(nodes, nodes_by_id, nodes_by_id[0]); +} + diff --git a/fast/semiring.hh b/fast/semiring.hh new file mode 100644 index 0000000..1e40f48 --- /dev/null +++ b/fast/semiring.hh @@ -0,0 +1,36 @@ +#ifndef SEMIRING_HH +#define SEMIRING_HH + + +template +class ViterbiSemiring { + public: + T one = 1.0; + T null = 0.0; + + T add(T x, T y); + T multiply(T x, T y); + T convert(T x); +}; + +template T +ViterbiSemiring::add(T x, T y) +{ + return max(x, y); +} + +template T +ViterbiSemiring::multiply(T x, T y) +{ + return x * y; +} + +template T +ViterbiSemiring::convert(T x) +{ + return (T)x; +} + + +#endif + -- cgit v1.2.3