From 4b7b2693e829166ccec8707b59fb2bc26179551b Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 22 Jul 2014 00:34:01 +0200 Subject: simple sparse vector type --- .gitignore | 2 + fast/Makefile | 16 +++-- fast/README.md | 32 ++++++++-- fast/grammar.cc | 155 +++++++++++++++++++++++++++++++++++++++++++++ fast/grammar.hh | 58 ++++++++++++++++- fast/grammar.o | Bin 2928 -> 285176 bytes fast/hypergraph.cc | 91 +++++++++++++------------- fast/hypergraph.hh | 11 ++-- fast/sparse_vector.hh | 116 +++++++++++++++++++++++++++++++++ fast/test/Makefile | 13 ++++ fast/test_grammar.cc | 15 +++++ fast/test_sparse_vector.cc | 32 ++++++++++ 12 files changed, 474 insertions(+), 67 deletions(-) create mode 100644 fast/sparse_vector.hh create mode 100644 fast/test/Makefile create mode 100644 fast/test_grammar.cc create mode 100644 fast/test_sparse_vector.cc diff --git a/.gitignore b/.gitignore index 00b0e1a..94218e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *.o fast/example/ fast/fast_weaver +fast/test_grammar +fast/test_sparse_vector util/make_pak util/read_pak diff --git a/fast/Makefile b/fast/Makefile index 16bc48c..b2b697f 100644 --- a/fast/Makefile +++ b/fast/Makefile @@ -1,16 +1,22 @@ COMPILER=clang -CFLAGS=-O3 +CFLAGS=-std=c++11 -O3 all: hypergraph.o main.cc - $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack hypergraph.o main.cc -o fast_weaver + $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh - $(COMPILER) $(CFLAGS) -g -std=c++11 -c hypergraph.cc + $(COMPILER) $(CFLAGS) -g -c hypergraph.cc grammar.o: grammar.cc grammar.hh - $(COMPILER) $(CFLAGS) -g -std=c++11 -c grammar.cc + $(COMPILER) $(CFLAGS) -g -c grammar.cc + +test_grammar: test_grammar.cc grammar.o + $(COMPILER) $(CFLAGS) -lstdc++ -lm grammar.o test_grammar.cc -o test_grammar + +test_sparse_vector: test_sparse_vector.cc sparse_vector.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm test_sparse_vector.cc -o test_sparse_vector clean: - rm -f fast_weaver hypergraph.o grammar.o + rm -f fast_weaver hypergraph.o grammar.o test_grammar test_sparse_vector diff --git a/fast/README.md b/fast/README.md index 5bcc962..541f93f 100644 --- a/fast/README.md +++ b/fast/README.md @@ -1,14 +1,32 @@ TODO - * grammar + * sparse vector (unordered_map) -> where to store? * parser + * Rule -> ChartItem -> Node ? + * viterbi path/string + * k-best * other semirings - * sparse vector (unordered_map) - * hg serialization? json/bson/msgpack/protocol buffers (no!) - * hg: json input (jsoncpp?) - * language model: kenlm + * include language model + * compress/hash words/feature strings? + + +Dependencies: + * MessagePack for object serialization [1] + * kenlm language model [2] + + +This is Linux only. -depends on msgpack [1] -http://jscheiny.github.io/Streams/ [1] http://msgpack.org +[2] http://kheafield.com/code/kenlm/ +http://math.nist.gov/spblas/ +http://lapackpp.sourceforge.net/ +http://www.cvmlib.com/ +http://sourceforge.net/projects/lpp/ +http://math-atlas.sourceforge.net/ +http://www.netlib.org/lapack/ +http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c +http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack +http://www.osl.iu.edu/research/mtl/download.php3 +http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library diff --git a/fast/grammar.cc b/fast/grammar.cc index 9f26bd7..a8e2747 100644 --- a/fast/grammar.cc +++ b/fast/grammar.cc @@ -3,5 +3,160 @@ namespace G { +NT::NT(string& s) +{ + s.erase(0, 1); + s.pop_back(); + stringstream ss(s); + string buf; + size_t c = 0; + index = 0; + while (ss.good() && getline(ss, buf, ',')) { + if (c == 0) { + symbol = buf; + } else { + index = stoi(buf); + } + c++; + } +} + +T::T(string& s) +{ + word = s; +} + +Item::Item(string& s) +{ + if (s.front() == '[' && s.back() == ']') { + type = NON_TERMINAL; + nt = new NT(s); + } else { + type = TERMINAL; + t = new T(s); + } +} + +Rule::Rule(string& s) +{ + stringstream ss(s); + size_t c = 0; + string buf; + while (ss >> buf) { + if (buf == "|||") { c++; continue; } + if (c == 0) { // LHS + lhs = new NT(buf); + } else if (c == 1) { // RHS + rhs.push_back(new Item(buf)); + if (rhs.back()->type == NON_TERMINAL) arity++; + } else if (c == 2) { // TARGET + target.push_back(new Item(buf)); + } else if (c == 3) { // F TODO + } else if (c == 4) { // A TODO + } else { // ERROR FIXME + } + if (c == 4) break; + } + arity = 0; +} + +Grammar::Grammar(string fn) +{ + ifstream ifs(fn); + string line; + while (getline(ifs, line)) { + G::Rule* r = new G::Rule(line); + rules.push_back(r); + if (r->arity == 0) + flat.push_back(r); + else if (r->rhs.front()->type == NON_TERMINAL) + start_nt.push_back(r); + else + start_t.push_back(r); + } +} + +string +Item::repr() const +{ + ostringstream os; + if (type == TERMINAL) + os << t->repr(); + else + os << nt->repr(); + return os.str(); +} + +ostream& +operator<<(ostream& os, const Item& i) +{ + return os << i.repr(); +} + +string +NT::repr() const +{ + ostringstream os; + os << "NT<" << symbol << "," << index << ">"; + return os.str(); +} + +ostream& +operator<<(ostream& os, const NT& nt) +{ + return os << nt.repr(); +} + +string +T::repr() const +{ + ostringstream os; + os << "T<" << word << ">"; + return os.str(); +} + +ostream& +operator<<(ostream& os, const T& t) +{ + return os << t.repr(); +} + +string +Rule::repr() const +{ + ostringstream os; + os << "Rulerepr() << \ + ", rhs:{"; + for (auto it = rhs.begin(); it != rhs.end(); it++) { + os << (**it).repr(); + if (next(it) != rhs.end()) os << " "; + } + os << "}, target:{"; + for (auto it = target.begin(); it != target.end(); it++) { + os << (**it).repr(); + if (next(it) != target.end()) os << " "; + } + os << "}" \ + ", f:" << "TODO" << \ + ", arity=" << arity << \ + ", map:" << "TODO" << \ + ">"; + return os.str(); +} + +ostream& +operator<<(ostream& os, const Rule& r) +{ + return os << r.repr(); +} + +ostream& +operator<<(ostream& os, const Grammar& g) +{ + for (auto it = g.rules.begin(); it != g.rules.end(); it++) + os << (**it).repr() << endl; + return os; +} + } // namespace diff --git a/fast/grammar.hh b/fast/grammar.hh index d17a331..3c7f208 100644 --- a/fast/grammar.hh +++ b/fast/grammar.hh @@ -1,6 +1,13 @@ #pragma once +#include #include +#include +#include +#include +#include + +#include "dummyvector.h" using namespace std; @@ -10,13 +17,60 @@ namespace G { struct NT { string symbol; unsigned int index; + + NT() {}; + NT(string& s); + string repr() const; + friend ostream& operator<<(ostream& os, const NT& t); }; -class T { +struct T { string word; + + T(string& s); + string repr() const; + friend ostream& operator<<(ostream& os, const NT& nt); +}; + +enum item_type { + NON_TERMINAL, + TERMINAL +}; + +struct Item { + item_type type; + NT* nt; + T* t; + + Item(string& s); + string repr() const; + friend ostream& operator<<(ostream& os, const Item& i); +}; + +struct Rule { + NT* lhs; + vector rhs; + vector target; + //map map; + size_t arity; + DummyVector f; + + Rule() {}; + Rule(string& s); + string repr() const; + friend ostream& operator<<(ostream& os, const Rule& r); }; -class Rule { +struct Grammar { + vector rules; + vector flat; + vector start_nt; + vector start_t; + + Grammar(string fn); + void add_glue(); + void add_pass_through(); + friend ostream& operator<<(ostream& os, const Grammar& g); }; } // namespace diff --git a/fast/grammar.o b/fast/grammar.o index 705aed9..984065c 100644 Binary files a/fast/grammar.o and b/fast/grammar.o differ diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc index e6ec495..9101c92 100644 --- a/fast/hypergraph.cc +++ b/fast/hypergraph.cc @@ -3,36 +3,12 @@ namespace Hg { -std::ostream& -operator<<(std::ostream& os, const Node& n) -{ - os << \ - "Node"; - return os; -} - -std::ostream& -operator<<(std::ostream& os, const Edge& e) +template void +init(list& nodes, list::iterator root, Semiring& semiring) { - ostringstream _; - for (auto it = e.tails.begin(); it != e.tails.end(); it++) { - _ << (**it).id; if (*it != e.tails.back()) _ << ","; - } - os << \ - "Edgeid << \ - ", tails=[" << _.str() << "]" \ - ", score=" << e.score << \ - ", rule:'" << "TODO" << "'" << \ - ", f=" << "TODO" << \ - ", arity=" << e.arity << \ - ", mark=" << e.mark << ">"; - return os; + for (auto it = nodes.begin(); it != nodes.end(); it++) + (**it).score = semiring.null; + (**root).score = semiring.one; } void @@ -44,14 +20,6 @@ reset(list nodes, vector edges) (**it).mark = 0; } -template void -init(list& nodes, list::iterator root, Semiring& semiring) -{ - for (auto it = nodes.begin(); it != nodes.end(); it++) - (**it).score = semiring.null; - (**root).score = semiring.one; -} - void topological_sort(list& nodes, list::iterator root) { @@ -162,22 +130,21 @@ manual(Hypergraph& hg) { // nodes Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0; + hg.nodes.push_back(a); hg.nodes_by_id[a->id] = a; Node* b = new Node; b->id = 1; b->symbol = "NP"; b->left = 0; b->right = 1; b->mark = 0; + hg.nodes.push_back(b); hg.nodes_by_id[b->id] = b; Node* c = new Node; c->id = 2; c->symbol = "V"; c->left = 1; c->right = 2; c->mark = 0; + hg.nodes.push_back(c); hg.nodes_by_id[c->id] = c; Node* d = new Node; d->id = 3; d->symbol = "JJ"; d->left = 3; d->right = 4; d->mark = 0; + hg.nodes.push_back(d); hg.nodes_by_id[d->id] = d; Node* e = new Node; e->id = 4; e->symbol = "NN"; e->left = 3; e->right = 5; e->mark = 0; + hg.nodes.push_back(e); hg.nodes_by_id[e->id] = e; Node* f = new Node; f->id = 5; f->symbol = "NP"; f->left = 2; f->right = 5; f->mark = 0; + hg.nodes.push_back(f); hg.nodes_by_id[f->id] = f; Node* g = new Node; g->id = 6; g->symbol = "NP"; g->left = 1; g->right = 5; g->mark = 0; + hg.nodes.push_back(g); hg.nodes_by_id[g->id] = g; Node* h = new Node; h->id = 7; h->symbol = "S"; h->left = 0; h->right = 6; h->mark = 0; - - hg.add_node(a); - hg.add_node(b); - hg.add_node(c); - hg.add_node(d); - hg.add_node(e); - hg.add_node(f); - hg.add_node(g); - hg.add_node(h); + hg.nodes.push_back(h); hg.nodes_by_id[h->id] = h; // edges Edge* q = new Edge; q->head = hg.nodes_by_id[1]; q->tails.push_back(hg.nodes_by_id[0]); q->score = 0.367879441171; @@ -251,5 +218,37 @@ manual(Hypergraph& hg) } // namespace +ostream& +operator<<(ostream& os, const Node& n) +{ + os << \ + "Node"; + return os; +} + +ostream& +operator<<(ostream& os, const Edge& e) +{ + ostringstream _; + for (auto it = e.tails.begin(); it != e.tails.end(); it++) { + _ << (**it).id; if (*it != e.tails.back()) _ << ","; + } + os << \ + "Edgeid << \ + ", tails=[" << _.str() << "]" \ + ", score=" << e.score << \ + ", rule:'" << "TODO" << "'" << \ + ", f=" << "TODO" << \ + ", arity=" << e.arity << \ + ", mark=" << e.mark << ">"; + return os; +} + } // namespace diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh index ea940ad..86b9069 100644 --- a/fast/hypergraph.hh +++ b/fast/hypergraph.hh @@ -31,18 +31,17 @@ struct Edge { Node* head; vector tails; score_t score; - string rule; //FIXME - DummyVector f; //FIXME + string rule; // FIXME unsigned int arity = 0; unsigned int mark = 0; inline bool is_marked() { return mark >= arity; } - friend std::ostream& operator<<(std::ostream& os, const Edge& s); + friend ostream& operator<<(ostream& os, const Edge& s); size_t head_id_; vector tails_ids_; // node ids - MSGPACK_DEFINE(head_id_, tails_ids_, score, f, arity); + MSGPACK_DEFINE(head_id_, tails_ids_, rule, score, arity); }; struct Node { @@ -56,7 +55,7 @@ struct Node { unsigned int mark; inline bool is_marked() { return mark >= incoming.size(); }; - friend std::ostream& operator<<(std::ostream& os, const Node& n); + friend ostream& operator<<(ostream& os, const Node& n); MSGPACK_DEFINE(id, symbol, left, right, score); }; @@ -66,8 +65,6 @@ struct Hypergraph { vector edges; unordered_map nodes_by_id; unsigned int arity; - - void add_node(Node* n) { nodes.push_back(n); nodes_by_id[n->id] = n; } }; void diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh new file mode 100644 index 0000000..8fdc1b9 --- /dev/null +++ b/fast/sparse_vector.hh @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include + +#include "hypergraph.hh" // FIXME + +using namespace std; + + +namespace Sv { + +template +struct SparseVector { + unordered_map m_; + V zero = 0.0; + + void + insert(K k, V v) { m_[k] = v; }; + + weight_t + dot(SparseVector& other) + { + }; + + V& + operator[](const K& k) + { + return at(k); + }; + + const V& + at(const K& k) const + { + if (m_.find(k) == m_.end()) + return zero; + else + return m_.at(k); + } + + SparseVector + operator+(const SparseVector& other) const + { + SparseVector v; + v.m_.insert(m_.begin(), m_.end()); + v.m_.insert(other.m_.begin(), other.m_.end()); + for (auto it = v.m_.begin(); it != v.m_.end(); it++) + v.m_[it->first] = this->at(it->first) + other.at(it->first); + return v; + }; + + SparseVector& + operator+=(const SparseVector& other) + { + for (auto it = other.m_.begin(); it != other.m_.end(); it++) + m_[it->first] += it->second; + return *this; + }; + + SparseVector + operator-(const SparseVector& other) const + { + SparseVector v; + v.m_.insert(m_.begin(), m_.end()); + v.m_.insert(other.m_.begin(), other.m_.end()); + for (auto it = v.m_.begin(); it != v.m_.end(); it++) + v.m_[it->first] = this->at(it->first) - other.at(it->first); + return v; + }; + + SparseVector& + operator-=(const SparseVector& other) + { + for (auto it = other.m_.begin(); it != other.m_.end(); it++) + m_[it->first] -= it->second; + return *this; + }; + + SparseVector + operator*(V f) const + { + SparseVector v; + for (auto it = m_.begin(); it != m_.end(); it++) + v.m_[it->first] = this->at(it->first) * f; + return v; + }; + + SparseVector& + operator*=(V f) + { + for (auto it = m_.begin(); it != m_.end(); it++) + m_[it->first] *= f; + return *this; + }; + + string + repr() const + { + ostringstream os; + os << "SparseVector<{"; + for (auto it = m_.begin(); it != m_.end(); it ++) { + os << "'" << it->first << "'=" << it->second; + if (next(it) != m_.end()) + os << ", "; + } + os << "}>"; + return os.str(); + }; + + friend ostream& + operator<<(ostream& os, const SparseVector& v) { return os << v.repr(); } +}; + +} // namespace + diff --git a/fast/test/Makefile b/fast/test/Makefile new file mode 100644 index 0000000..3e01dc5 --- /dev/null +++ b/fast/test/Makefile @@ -0,0 +1,13 @@ +COMPILER=clang +CFLAGS=-std=c++11 -O3 + + +test_grammar: test_grammar.cc ../grammar.o + $(COMPILER) $(CFLAGS) ../grammar.o test_grammar.cc + +test_sparse_vector: test_sparse_vector.cc + $(COMPILER) $(CFLAGS) -lm test_sparse_vector.cc + +clean: + rm -f test_grammar test_sparse_vector + diff --git a/fast/test_grammar.cc b/fast/test_grammar.cc new file mode 100644 index 0000000..9c5b74e --- /dev/null +++ b/fast/test_grammar.cc @@ -0,0 +1,15 @@ +#include + +#include "grammar.hh" + +using namespace std; + + +int +main(int argc, char** argv) +{ + G::Grammar g(argv[1]); + cout << g << endl; + return 0; +} + diff --git a/fast/test_sparse_vector.cc b/fast/test_sparse_vector.cc new file mode 100644 index 0000000..f486486 --- /dev/null +++ b/fast/test_sparse_vector.cc @@ -0,0 +1,32 @@ +#include "sparse_vector.hh" + + +int +main(void) +{ + Sv::SparseVector a; + a.insert("1", 1); + a.insert("2", 2); + cout << "a:" << a << endl; + + Sv::SparseVector b; + b.insert("2", 2); + cout << "b:" << b << endl; + + Sv::SparseVector c = a + b; + cout << "a+b:" << c << endl; + + a += b; + cout << "a+=b:" << a << endl; + + a -= b; + cout << "a-=b:" << a << endl; + + cout << "a*2:" << a*2 << endl; + + a *= 2; + cout << "a*=2:" << a << endl; + + return 0; +} + -- cgit v1.2.3