diff options
Diffstat (limited to 'fast')
-rw-r--r-- | fast/Makefile | 15 | ||||
-rw-r--r-- | fast/README.md | 38 | ||||
-rw-r--r-- | fast/grammar.hh | 316 | ||||
-rw-r--r-- | fast/hypergraph.cc | 363 | ||||
-rw-r--r-- | fast/hypergraph.hh | 103 | ||||
-rw-r--r-- | fast/main.cc | 27 | ||||
-rw-r--r-- | fast/parse.hh | 108 | ||||
-rw-r--r-- | fast/semiring.hh | 35 | ||||
-rw-r--r-- | fast/sparse_vector.hh | 186 | ||||
-rw-r--r-- | fast/test/Makefile | 19 | ||||
-rwxr-xr-x | fast/test/test_grammar | bin | 60943 -> 0 bytes | |||
-rw-r--r-- | fast/test/test_grammar.cc | 20 | ||||
-rwxr-xr-x | fast/test/test_sparse_vector | bin | 44288 -> 0 bytes | |||
-rw-r--r-- | fast/test/test_sparse_vector.cc | 37 | ||||
-rw-r--r-- | fast/util.hh | 47 | ||||
-rw-r--r-- | fast/weaver.hh | 10 |
16 files changed, 0 insertions, 1324 deletions
diff --git a/fast/Makefile b/fast/Makefile deleted file mode 100644 index 1a7f5b9..0000000 --- a/fast/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -COMPILER=g++ -CFLAGS=-std=c++11 -O3 -TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread - - -all: hypergraph.o main.cc - $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver - -hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh - $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc - -clean: - rm -f fast_weaver - rm -f hypergraph.o parse.o - diff --git a/fast/README.md b/fast/README.md deleted file mode 100644 index f92245b..0000000 --- a/fast/README.md +++ /dev/null @@ -1,38 +0,0 @@ -TODO - * sparse vector (unordered_map) -> where to store? - * parser - * Rule -> ChartItem -> Node ? - * k-best - * other semirings - * include language model - * compress/hash words/feature strings? - * cast? Rule -> Edge, ChartItem -> Node - * feature factory, observer - -Dependencies: - * MessagePack for object serialization [1] - * kenlm language model [2] - -This is Linux only. - - -[1] http://msgpack.org -[2] http://kheafield.com/code/kenlm/ - - -stuff to have a look at: -http://math.nist.gov/spblas/ -http://lapackpp.sourceforge.net/ -http://www.cvmlib.com/ -http://sourceforge.net/projects/lpp/ -http://math-atlas.sourceforge.net/ -http://www.netlib.org/lapack/ -http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c -http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack -http://www.osl.iu.edu/research/mtl/download.php3 -http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library -https://software.intel.com/en-us/tbb_4.2_doc -http://goog-perftools.sourceforge.net/doc/tcmalloc.html -http://www.sgi.com/tech/stl/Rope.html -http://www.cs.unc.edu/Research/compgeom/gzstream/ - diff --git a/fast/grammar.hh b/fast/grammar.hh deleted file mode 100644 index 4906c46..0000000 --- a/fast/grammar.hh +++ /dev/null @@ -1,316 +0,0 @@ -#pragma once - -#include <iostream> -#include <fstream> -#include <map> -#include <sstream> -#include <string> -#include <vector> - -#include <msgpack.hpp> - -#include "sparse_vector.hh" -#include "util.hh" - -using namespace std; - - -namespace G { - -enum item_type { - UNDEFINED, - TERMINAL, - NON_TERMINAL -}; - -struct Item { - virtual size_t index() const { return 0; } - virtual symbol_t symbol() const { return ""; } - virtual item_type type() const { return UNDEFINED; } - - virtual ostream& repr(ostream& os) const { return os << "Item<>"; } - virtual ostream& escaped(ostream& os) const { return os << ""; } - - friend ostream& - operator<<(ostream& os, const Item& i) - { - return i.repr(os); - }; -}; - -struct NT : public Item { - symbol_t symbol_; - size_t index_; - - NT() {} - - NT(string const& s) - { - index_ = 0; // default - string t(s); - t.erase(0, 1); t.pop_back(); // remove '[' and ']' - istringstream ss(t); - if (ss >> index_) { // [i] - symbol_ = ""; - index_ = stoi(s); - return; - } else { - ss.clear(); - string buf; - size_t j = 0; - while (ss.good() && getline(ss, buf, ',')) { - if (j == 0) { - symbol_ = buf; - } else { - index_ = stoi(buf); - } - j++; - } - } - } - - virtual size_t index() const { return index_; } - virtual symbol_t symbol() const { return symbol_; } - virtual item_type type() const { return NON_TERMINAL; } - - virtual ostream& - repr(ostream& os) const - { - return os << "NT<" << symbol_ << "," << index_ << ">"; - } - - virtual ostream& - escaped(ostream& os) const - { - os << "[" << symbol_; - if (index_ > 0) - os << "," << index_; - os << "]"; - - return os; - } -}; - -struct T : public Item { - symbol_t symbol_; - - T(string const& s) - { - symbol_ = s; - } - - virtual symbol_t symbol() const { return symbol_; } - virtual item_type type() const { return TERMINAL; } - - virtual ostream& - repr(ostream& os) const - { - return os << "T<" << symbol_ << ">"; - } - - virtual ostream& - escaped(ostream& os) const - { - os << util::json_escape(symbol_); - } -}; - -struct Vocabulary -{ - unordered_map<symbol_t, size_t> m_; - vector<Item*> items_; - - bool - is_non_terminal(string const& s) - { - return s.front() == '[' && s.back() == ']'; - } - - Item* - get(symbol_t const& s) - { - if (is_non_terminal(s)) - return new NT(s); - if (m_.find(s) != m_.end()) - return items_[m_[s]]; - return add(s); - } - - Item* - add(symbol_t const& s) - { - size_t next_index_ = items_.size(); - T* item = new T(s); - items_.push_back(item); - m_[s] = next_index_; - - return item; - } -}; - -struct Rule { - NT* lhs; - vector<Item*> rhs; - vector<Item*> target; - size_t arity; -Sv::SparseVector<string, score_t>* f; - map<size_t, size_t> order; - string as_str_; - - Rule() {} - - Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); } - - static void - from_s(Rule* r, string const& s, Vocabulary& vocab) - { - istringstream ss(s); - string buf; - size_t j = 0, i = 1; - r->arity = 0; - vector<NT*> rhs_non_terminals; - r->f = new Sv::SparseVector<string, score_t>(); - while (ss >> buf) { - if (buf == "|||") { j++; continue; } - if (j == 0) { // left-hand side - r->lhs = new NT(buf); - } else if (j == 1) { // right-hand side - Item* item = vocab.get(buf); - r->rhs.push_back(item); - if (item->type() == NON_TERMINAL) { - r->arity++; - rhs_non_terminals.push_back(reinterpret_cast<NT*>(item)); - } - } else if (j == 2) { // target - Item* item = vocab.get(buf); - if (item->type() == NON_TERMINAL) { - r->order.insert(make_pair(i, item->index())); - i++; - if (item->symbol() == "") { // only [1], [2] ... on target - reinterpret_cast<NT*>(item)->symbol_ = \ - rhs_non_terminals[item->index()-1]->symbol(); - } - } - r->target.push_back(item); - } else if (j == 3) { // feature vector - Sv::SparseVector<string, score_t>::from_s(r->f, buf); - // FIXME: this is slow!!! ^^^ - } else if (j == 4) { // alignment - } else { - // error - } - if (j == 4) break; - } - } - - ostream& - repr(ostream& os) const - { - os << "Rule<lhs="; - lhs->repr(os); - os << ", rhs:{"; - for (auto it = rhs.begin(); it != rhs.end(); it++) { - (**it).repr(os); - if (next(it) != rhs.end()) os << " "; - } - os << "}, target:{"; - for (auto it = target.begin(); it != target.end(); it++) { - (**it).repr(os); - if (next(it) != target.end()) os << " "; - } - os << "}, f:"; - f->repr(os); - os << ", arity=" << arity << \ - ", order:{"; - for (auto it = order.begin(); it != order.end(); it++) { - os << it->first << "->" << it->second; - if (next(it) != order.end()) os << ", "; - } - os << "}>"; - - return os; - } - - ostream& - escaped(ostream& os) const - { - lhs->escaped(os); - os << " ||| "; - for (auto it = rhs.begin(); it != rhs.end(); it++) { - (**it).escaped(os); - if (next(it) != rhs.end()) os << " "; - } - os << " ||| "; - for (auto it = target.begin(); it != target.end(); it++) { - (**it).escaped(os); - if (next(it) != target.end()) os << " "; - } - os << " ||| "; - f->escaped(os); - os << " ||| " << \ - "TODO"; - - return os; - }; - - friend ostream& - operator<<(ostream& os, Rule const& r) - { - return r.repr(os); - }; - - // -- - void - prep_for_serialization_() - { - ostringstream os; - escaped(os); - as_str_ = os.str(); - }; - MSGPACK_DEFINE(as_str_); - // ^^^ FIXME -}; - -struct Grammar { - vector<Rule*> rules; - vector<Rule*> flat; - vector<Rule*> start_non_terminal; - vector<Rule*> start_terminal; - - Grammar() {} - - Grammar(string const& fn, Vocabulary& vocab) - { - ifstream ifs(fn); - string line; - while (getline(ifs, line)) { - G::Rule* r = new G::Rule(line, vocab); - rules.push_back(r); - if (r->arity == 0) - flat.push_back(r); - else if (r->rhs.front()->type() == NON_TERMINAL) - start_non_terminal.push_back(r); - else - start_terminal.push_back(r); - } - } - - void add_glue(); - // ^^^ TODO - void add_pass_through(const string& input); - // ^^^ TODO - - friend ostream& - operator<<(ostream& os, Grammar const& g) - { - for (const auto it: g.rules) { - it->repr(os); - os << endl; - } - - return os; - } -}; - -} // namespace G - diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc deleted file mode 100644 index 2b33ff4..0000000 --- a/fast/hypergraph.cc +++ /dev/null @@ -1,363 +0,0 @@ -#include "hypergraph.hh" - - -namespace Hg { - -template<typename Semiring> void -init(const list<Node*>& nodes, const list<Node*>::iterator root, const Semiring& semiring) -{ - for (const auto it: nodes) - it->score = semiring.null; - (**root).score = semiring.one; -} - -void -reset(const list<Node*> nodes, const vector<Edge*> edges) -{ - for (const auto it: nodes) - it->mark = 0; - for (auto it: edges) - it->mark = 0; -} - -void -topological_sort(list<Node*>& nodes, const list<Node*>::iterator root) -{ - auto p = root; - auto to = nodes.begin(); - while (to != nodes.end()) { - if ((**p).is_marked()) { - for (const auto e: (**p).outgoing) { // explore edges - e->mark++; - if (e->is_marked()) { - e->head->mark++; - } - } - } - if ((**p).is_marked()) { - nodes.splice(to, nodes, p); - to++; - p = to; - } else { - ++p; - } - } -} - -void -viterbi(Hypergraph& hg) -{ - list<Node*>::iterator root = \ - find_if(hg.nodes.begin(), hg.nodes.end(), \ - [](Node* n) { return n->incoming.size() == 0; }); - - Hg::topological_sort(hg.nodes, root); - Semiring::Viterbi<score_t> semiring; - Hg::init(hg.nodes, root, semiring); - - for (const auto n: hg.nodes) { - for (const auto e: n->incoming) { - score_t s = semiring.one; - for (const auto m: e->tails) { - s = semiring.multiply(s, m->score); - } - n->score = semiring.add(n->score, semiring.multiply(s, e->score)); - } - } -} - -void -viterbi_path(Hypergraph& hg, Path& p) -{ - list<Node*>::iterator root = \ - find_if(hg.nodes.begin(), hg.nodes.end(), \ - [](Node* n) { return n->incoming.size() == 0; }); - //list<Node*>::iterator root = hg.nodes.begin(); - - Hg::topological_sort(hg.nodes, root); - // ^^^ FIXME do I need to do this when reading from file? - Semiring::Viterbi<score_t> semiring; - Hg::init(hg.nodes, root, semiring); - - for (auto n: hg.nodes) { - Edge* best_edge; - bool best = false; - for (auto e: n->incoming) { - score_t s = semiring.one; - for (auto m: e->tails) { - s = semiring.multiply(s, m->score); - } - if (n->score < semiring.multiply(s, e->score)) { // find max - best_edge = e; - best = true; - } - n->score = semiring.add(n->score, semiring.multiply(s, e->score)); - } - if (best) - p.push_back(best_edge); - } -} - - -void -derive(const Path& p, const Node* cur, vector<string>& carry) -{ - Edge* next; - for (auto it: p) { - if (it->head->symbol == cur->symbol && - it->head->left == cur->left && - it->head->right == cur->right) { - next = it; - } - } // FIXME this is probably not so good - - unsigned j = 0; - for (auto it: next->rule->target) { - if (it->type() == G::NON_TERMINAL) { - derive(p, next->tails[next->rule->order[j]], carry); - j++; - } else { - carry.push_back(it->symbol()); - } - } -} - -namespace io { - -void -read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn) -{ - ifstream ifs(fn); - size_t i = 0, r, n, e; - msgpack::unpacker pac; - while(true) { - pac.reserve_buffer(32*1024); - size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity()); - pac.buffer_consumed(bytes); - msgpack::unpacked result; - while(pac.next(&result)) { - msgpack::object o = result.get(); - if (i == 0) { - o.convert(&r); - } else if (i == 1) { - o.convert(&n); - } else if (i == 2) { - o.convert(&e); - } else if (i > 2 && i <= r+2) { - string s; - o.convert(&s); - G::Rule* rule = new G::Rule; - G::Rule::from_s(rule, s, vocab); - rules.push_back(rule); - } else if (i > r+2 && i <= r+n+2) { - Node* n = new Node; - o.convert(n); - hg.nodes.push_back(n); - hg.nodes_by_id[n->id] = n; - } else if (i > n+2 && i <= r+n+e+2) { - Edge* e = new Edge; - e->arity = 0; - o.convert(e); - e->head = hg.nodes_by_id[e->head_id_]; - hg.edges.push_back(e); - hg.nodes_by_id[e->head_id_]->incoming.push_back(e); - e->arity = 0; - for (auto it = e->tails_ids_.begin(); it != e->tails_ids_.end(); it++) { - hg.nodes_by_id[*it]->outgoing.push_back(e); - e->tails.push_back(hg.nodes_by_id[*it]); - e->arity++; - } - e->rule = rules[e->rule_id_]; - } else { - // ERROR - } - i++; - } - if (!bytes) break; - } -} - -void -write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME -{ - FILE* file = fopen(fn.c_str(), "wb"); - msgpack::fbuffer fbuf(file); - msgpack::pack(fbuf, rules.size()); - msgpack::pack(fbuf, hg.nodes.size()); - msgpack::pack(fbuf, hg.edges.size()); - for (auto it = rules.cbegin(); it != rules.cend(); it++) - msgpack::pack(fbuf, **it); - for (auto it = hg.nodes.cbegin(); it != hg.nodes.cend(); it++) - msgpack::pack(fbuf, **it); - for (auto it = hg.edges.cbegin(); it != hg.edges.cend(); it++) - msgpack::pack(fbuf, **it); - fclose(file); -} - -void -manual(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab) -{ - // nodes - Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0; - hg.nodes.push_back(a); hg.nodes_by_id[a->id] = a; - Node* b = new Node; b->id = 1; b->symbol = "NP"; b->left = 0; b->right = 1; b->mark = 0; - hg.nodes.push_back(b); hg.nodes_by_id[b->id] = b; - Node* c = new Node; c->id = 2; c->symbol = "V"; c->left = 1; c->right = 2; c->mark = 0; - hg.nodes.push_back(c); hg.nodes_by_id[c->id] = c; - Node* d = new Node; d->id = 3; d->symbol = "JJ"; d->left = 3; d->right = 4; d->mark = 0; - hg.nodes.push_back(d); hg.nodes_by_id[d->id] = d; - Node* e = new Node; e->id = 4; e->symbol = "NN"; e->left = 3; e->right = 5; e->mark = 0; - hg.nodes.push_back(e); hg.nodes_by_id[e->id] = e; - Node* f = new Node; f->id = 5; f->symbol = "NP"; f->left = 2; f->right = 5; f->mark = 0; - hg.nodes.push_back(f); hg.nodes_by_id[f->id] = f; - Node* g = new Node; g->id = 6; g->symbol = "NP"; g->left = 1; g->right = 5; g->mark = 0; - hg.nodes.push_back(g); hg.nodes_by_id[g->id] = g; - Node* h = new Node; h->id = 7; h->symbol = "S"; h->left = 0; h->right = 6; h->mark = 0; - hg.nodes.push_back(h); hg.nodes_by_id[h->id] = h; - - // rules - vector<string> rule_strs; - rule_strs.push_back("[NP] ||| ich ||| i ||| ||| "); - rule_strs.push_back("[V] ||| sah ||| saw ||| ||| "); - rule_strs.push_back("[JJ] ||| kleines ||| small ||| ||| "); - rule_strs.push_back("[JJ] ||| kleines ||| little ||| ||| "); - rule_strs.push_back("[NN] ||| kleines haus ||| small house ||| ||| "); - rule_strs.push_back("[NN] ||| kleines haus ||| little house ||| ||| "); - rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| ||| "); - rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| ||| "); - rule_strs.push_back("[NP] ||| ein [NN,1] ||| a [NN,1] ||| ||| "); - rule_strs.push_back("[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| ||| "); - rule_strs.push_back("[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| ||| "); - - for (auto it: rule_strs) { - rules.push_back(new G::Rule(it, vocab)); - rules.back()->f = new Sv::SparseVector<string, score_t>(); - } - - // edges - Edge* q = new Edge; q->head = hg.nodes_by_id[1]; q->tails.push_back(hg.nodes_by_id[0]); q->score = 0.367879441171; - q->arity = 1; q->mark = 0; - hg.edges.push_back(q); - hg.nodes_by_id[1]->incoming.push_back(q); - hg.nodes_by_id[0]->outgoing.push_back(q); - q->rule = rules[0]; - - Edge* p = new Edge; p->head = hg.nodes_by_id[2]; p->tails.push_back(hg.nodes_by_id[0]); p->score = 0.606530659713; - p->arity = 1; p->mark = 0; - hg.edges.push_back(p); - hg.nodes_by_id[2]->incoming.push_back(p); - hg.nodes_by_id[0]->outgoing.push_back(p); - p->rule = rules[1]; - - Edge* r = new Edge; r->head = hg.nodes_by_id[3]; r->tails.push_back(hg.nodes_by_id[0]); r->score = 1.0; - r->arity = 1; r->mark = 0; - hg.edges.push_back(r); - hg.nodes_by_id[3]->incoming.push_back(r); - hg.nodes_by_id[0]->outgoing.push_back(r); - r->rule = rules[2]; - - Edge* s = new Edge; s->head = hg.nodes_by_id[3]; s->tails.push_back(hg.nodes_by_id[0]); s->score = 1.0; - s->arity = 1; s->mark = 0; - hg.edges.push_back(s); - hg.nodes_by_id[3]->incoming.push_back(s); - hg.nodes_by_id[0]->outgoing.push_back(s); - s->rule = rules[3]; - - Edge* t = new Edge; t->head = hg.nodes_by_id[4]; t->tails.push_back(hg.nodes_by_id[0]); t->score = 1.0; - t->arity = 1; t->mark = 0; - hg.edges.push_back(t); - hg.nodes_by_id[4]->incoming.push_back(t); - hg.nodes_by_id[0]->outgoing.push_back(t); - t->rule = rules[4]; - - Edge* u = new Edge; u->head = hg.nodes_by_id[4]; u->tails.push_back(hg.nodes_by_id[0]); u->score = 1.0; - u->arity = 1; u->mark = 0; - hg.edges.push_back(u); - hg.nodes_by_id[4]->incoming.push_back(u); - hg.nodes_by_id[0]->outgoing.push_back(u); - u->rule = rules[5]; - - Edge* v = new Edge; v->head = hg.nodes_by_id[4]; v->tails.push_back(hg.nodes_by_id[3]); v->score = 1.0; - v->arity = 1; v->mark = 0; - hg.edges.push_back(v); - hg.nodes_by_id[4]->incoming.push_back(v); - hg.nodes_by_id[3]->outgoing.push_back(v); - v->rule = rules[6]; - - Edge* w = new Edge; w->head = hg.nodes_by_id[4]; w->tails.push_back(hg.nodes_by_id[3]); w->score = 2.71828182846; - w->arity = 1; w->mark = 0; - hg.edges.push_back(w); - hg.nodes_by_id[4]->incoming.push_back(w); - hg.nodes_by_id[3]->outgoing.push_back(w); - w->rule = rules[7]; - - Edge* x = new Edge; x->head = hg.nodes_by_id[5]; x->tails.push_back(hg.nodes_by_id[4]); x->score = 1.0; - x->arity = 1; x->mark = 0; - hg.edges.push_back(x); - hg.nodes_by_id[5]->incoming.push_back(x); - hg.nodes_by_id[4]->outgoing.push_back(x); - x->rule = rules[8]; - - Edge* y = new Edge; y->head = hg.nodes_by_id[6]; y->tails.push_back(hg.nodes_by_id[2]); y->tails.push_back(hg.nodes_by_id[5]); y->score = 1.0; - y->arity = 2; y->mark = 0; - hg.edges.push_back(y); - hg.nodes_by_id[6]->incoming.push_back(y); - hg.nodes_by_id[2]->outgoing.push_back(y); - hg.nodes_by_id[5]->outgoing.push_back(y); - y->rule = rules[9]; - - Edge* z = new Edge; z->head = hg.nodes_by_id[7]; z->tails.push_back(hg.nodes_by_id[1]); z->tails.push_back(hg.nodes_by_id[6]); z->score = 1.0; - z->arity = 2; z->mark = 0; - hg.edges.push_back(z); - hg.nodes_by_id[7]->incoming.push_back(z); - hg.nodes_by_id[1]->outgoing.push_back(z); - hg.nodes_by_id[6]->outgoing.push_back(z); - z->rule = rules[10]; -} - -} // namespace Hg::io - -/* - * Hg::Node - * - */ -ostream& -operator<<(ostream& os, const Node& n) -{ - os << \ - "Node<id=" << n.id << \ - ", symbol='" << n.symbol << "'" << \ - ", span=(" << n.left << "," << n.right << ")" \ - ", score=" << n.score << \ - ", incoming:" << n.incoming.size() << \ - ", outgoing:" << n.outgoing.size() << \ - ", mark=" << n.mark << ">"; - return os; -} - -/* - * Hg::Edge - * - */ -ostream& -operator<<(ostream& os, const Edge& e) -{ - ostringstream _; - for (auto it: e.tails) { - _ << it->id; - if (it != e.tails.back()) _ << ","; - } - os << \ - "Edge<head=" << e.head->id << \ - ", tails=[" << _.str() << "]" \ - ", score=" << e.score << \ - ", rule:'"; - e.rule->escaped(os); - os << "', f=" << "TODO" << \ - ", arity=" << e.arity << \ - ", mark=" << e.mark << ">"; - return os; -} - -} // namespace Hg - diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh deleted file mode 100644 index 1c48a88..0000000 --- a/fast/hypergraph.hh +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -#include <algorithm> -#include <fstream> -#include <functional> -#include <iostream> -#include <iterator> -#include <list> -#include <msgpack.hpp> -#include <msgpack/fbuffer.hpp> -#include <sstream> -#include <string> -#include <unordered_map> -#include <vector> - -#include "grammar.hh" -#include "semiring.hh" -#include "sparse_vector.hh" -#include "weaver.hh" - -using namespace std; - - -namespace Hg { - -struct Node; - -struct Edge { - Node* head; - vector<Node*> tails; - score_t score; - G::Rule* rule; - unsigned int arity = 0; - unsigned int mark = 0; - - inline bool is_marked() { return mark >= arity; } - friend ostream& operator<<(ostream& os, const Edge& e); - - size_t head_id_; - vector<size_t> tails_ids_; // node ids - size_t rule_id_; - - MSGPACK_DEFINE(head_id_, tails_ids_, rule_id_, score, arity); -}; - -struct Node { - size_t id; - string symbol; - short left; - short right; - score_t score; - vector<Edge*> incoming; - vector<Edge*> outgoing; - unsigned int mark; - - inline bool is_marked() { return mark >= incoming.size(); }; - friend ostream& operator<<(ostream& os, const Node& n); - - MSGPACK_DEFINE(id, symbol, left, right, score); -}; - -struct Hypergraph { - list<Node*> nodes; - vector<Edge*> edges; - unordered_map<size_t, Node*> nodes_by_id; - unsigned int arity; -}; - -template<typename Semiring> void -init(const list<Node*>& nodes, const list<Node*>::iterator root, const Semiring& semiring); - -void -reset(const list<Node*> nodes, const vector<Edge*> edges); - -void -topological_sort(list<Node*>& nodes, const list<Node*>::iterator root); - -void -viterbi(Hypergraph& hg); - -typedef vector<Edge*> Path; - -void -viterbi_path(Hypergraph& hg, Path& p); - -void -derive(const Path& p, const Node* cur, vector<string>& carry); - -namespace io { - -void -read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn); // FIXME - -void -write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn); // FIXME - -void -manual(Hypergraph& hg, vector<G::Rule*>& rules); - -} // namespace - -} // namespace - diff --git a/fast/main.cc b/fast/main.cc deleted file mode 100644 index 825ad59..0000000 --- a/fast/main.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "hypergraph.hh" -#include <ctime> - - -int -main(int argc, char** argv) -{ - Hg::Hypergraph hg; - G::Vocabulary y; - G::Grammar g; - Hg::io::read(hg, g.rules, y, argv[1]); - //Hg::io::manual(hg, g.rules); - clock_t begin = clock(); - Hg::Path p; - Hg::viterbi_path(hg, p); - vector<string> s; - Hg::derive(p, p.back()->head, s); - for (auto it: s) - cout << it << " "; - cout << endl; - clock_t end = clock(); - double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; - cout << elapsed_secs << " s" << endl; - - return 0; -} - diff --git a/fast/parse.hh b/fast/parse.hh deleted file mode 100644 index 33ea9ce..0000000 --- a/fast/parse.hh +++ /dev/null @@ -1,108 +0,0 @@ -#pragma once - -#include <vector> -#include <utility> -#include <sstream> -#include <unordered_map> - -#include "grammar.hh" -#include "util.hh" -#include "weaver.hh" - - -using namespace std; - -typedef pair<size_t,size_t> Span; -namespace std { - template <> - struct hash<Span> - { - size_t - operator()(Span const& k) const - { - return ((hash<size_t>()(k.first) - ^ (hash<size_t>()(k.second) << 1)) >> 1); - } - }; -} - -namespace Parse { - -void visit(vector<Span>& p, - size_t i, size_t l, size_t r, size_t x=0) -{ - for (size_t span = i; span <= r-x; span++) { - for (size_t k = l; k <= r-span; k++) { - p.push_back(Span(k,k+span)); - } - } -} - -struct ChartItem -{ - Span span; - G::Rule const* rule; - vector<ChartItem*> tails; - size_t dot; - - ChartItem(G::Rule* r) - { - rule = r; - dot = 0; - } - - ChartItem(ChartItem const& o) - { - span.first = o.span.first; - span.second = o.span.second; - rule = o.rule; - for (auto it: o.tails) - tails.push_back(it); - dot = o.dot; - } -}; - -struct Chart -{ - size_t n_; - unordered_map<Span, vector<ChartItem*> > m_; - unordered_map<string,bool> b_; - - vector<ChartItem*>& at(Span s) - { - return m_[s]; - } - - string h(ChartItem* item, Span s) - { - ostringstream ss; - item->rule->lhs->symbol(); - ss << s.first; - ss << s.second; - - return ss.str(); - } - - void add(ChartItem* item, Span s) - { - m_[s].push_back(item); - b_[h(item, s)] = true; - } - - Chart(size_t n) - { - } -}; - - -void -init(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) -{ - for (auto rule: g.rules) { - cout << *rule << endl; - } -} - - -} // - diff --git a/fast/semiring.hh b/fast/semiring.hh deleted file mode 100644 index 3f4ac08..0000000 --- a/fast/semiring.hh +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - - -namespace Semiring { - -template<typename T> -struct Viterbi { - T one = 1.0; - T null = 0.0; - - T add(T x, T y); - T multiply(T x, T y); - T convert(T x); -}; - -template<typename T> T -Viterbi<T>::add(T x, T y) -{ - return max(x, y); -} - -template<typename T> T -Viterbi<T>::multiply(T x, T y) -{ - return x * y; -} - -template<typename T> T -Viterbi<T>::convert(T x) -{ - return (T)x; -} - -} // namespace - diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh deleted file mode 100644 index 9d815ff..0000000 --- a/fast/sparse_vector.hh +++ /dev/null @@ -1,186 +0,0 @@ -#pragma once - -#include <iostream> -#include <sstream> -#include <string> -#include <unordered_map> -#include <vector> - -#include "util.hh" -#include "weaver.hh" - -using namespace std; - - -namespace Sv { - -template<typename K, typename V> -struct SparseVector { - unordered_map<K,V> m_; - V zero = 0.f; - - SparseVector() {}; - - SparseVector(string& s) - { - from_s(this, s); - }; - - void - insert(K k, V v) { m_[k] = v; }; - - V - dot(SparseVector& other) - { - V r; - unordered_map<K,V>* o = &m_; - auto b = m_.cbegin(); - auto e = m_.cend(); - if (other.size() < size()) { - b = other.m_.cbegin(); - e = other.m_.cend(); - o = &other.m_; - } - for (auto it = b; it != e; it++) - r += it->second * o->at(it->first); - - return r; - }; - - size_t - size() - { - return m_.size(); - } - - V& - operator[](const K& k) - { - return at(k); - }; - - const V& - at(const K& k) const - { - if (m_.find(k) == m_.end()) - return zero; - else - return m_.at(k); - } - - SparseVector - operator+(const SparseVector& other) const - { - SparseVector<K,V> v; - v.m_.insert(m_.cbegin(), m_.cend()); - v.m_.insert(other.m_.cbegin(), other.m_.cend()); - for (const auto it: v.m_) - v.m_[it.first] = this->at(it.first) + other.at(it.first); - - return v; - }; - - SparseVector& - operator+=(const SparseVector& other) - { - for (const auto it: other.m_) - m_[it.first] += it.second; - - return *this; - }; - - SparseVector - operator-(const SparseVector& other) const - { - SparseVector<K,V> v; - v.m_.insert(m_.cbegin(), m_.cend()); - v.m_.insert(other.m_.cbegin(), other.m_.cend()); - for (const auto it: v.m_) - v.m_[it.first] = this->at(it.first) - other.at(it.first); - - return v; - }; - - SparseVector& - operator-=(const SparseVector& other) - { - for (const auto it: other.m_) - m_[it.first] -= it.second; - - return *this; - }; - - SparseVector - operator*(V f) const - { - SparseVector<K,V> v; - for (const auto it: m_) - v.m_[it.first] = this->at(it.first) * f; - - return v; - }; - - SparseVector& - operator*=(V f) - { - for (const auto it: m_) - m_[it.first] *= f; - - return *this; - }; - - static void - from_s(SparseVector* w, const string& s) - { - stringstream ss(s); - while (!ss.eof()) { - string t; - ss >> t; - size_t eq = t.find_first_of("="); - if (eq == string::npos) { - return; - } - t.replace(eq, 1, " "); - stringstream tt(t); - K k; V v; - tt >> k >> v; - w->m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); - } - } - - ostream& - repr(ostream& os) const - { - os << "SparseVector<{"; - for (auto it = m_.cbegin(); it != m_.cend(); it++) { - os << "'" << it->first << "'=" << it->second; - if (next(it) != m_.end()) - os << ", "; - } - os << "}>"; - - return os; - }; - - ostream& - escaped(ostream& os, bool quote_keys=false) const { - for (auto it = m_.cbegin(); it != m_.cend(); it++) { - if (quote_keys) os << '"'; - os << util::json_escape(it->first); - if (quote_keys) os << '"'; - os << "=" << it->second; - if (next(it) != m_.cend()) os << " "; - } - - return os; - }; - - friend ostream& - operator<<(ostream& os, const SparseVector& v) - { - return v.repr(os); - } -}; - -} // namespace - diff --git a/fast/test/Makefile b/fast/test/Makefile deleted file mode 100644 index 65e97ef..0000000 --- a/fast/test/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -COMPILER=g++ -CFLAGS=-std=c++11 -O3 -I../ -TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread - - -all: test_grammar test_sparse_vector test_parse - -test_grammar: test_grammar.cc ../grammar.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_grammar.cc -o test_grammar - -test_sparse_vector: test_sparse_vector.cc ../sparse_vector.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_sparse_vector.cc -o test_sparse_vector - -test_parse: test_parse.cc ../parse.hh ../grammar.hh ../util.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_parse.cc -o test_parse - -clean: - rm -f test_grammar test_sparse_vector - diff --git a/fast/test/test_grammar b/fast/test/test_grammar Binary files differdeleted file mode 100755 index 6cf7ad5..0000000 --- a/fast/test/test_grammar +++ /dev/null diff --git a/fast/test/test_grammar.cc b/fast/test/test_grammar.cc deleted file mode 100644 index bbe76e7..0000000 --- a/fast/test/test_grammar.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include <fstream> - -#include "grammar.hh" - -using namespace std; - - -int -main(int argc, char** argv) -{ - G::Vocabulary y; - G::Grammar g(argv[1], y); - for (auto it: g.rules) { - it->escaped(cout); - cout << endl; - } - - return 0; -} - diff --git a/fast/test/test_sparse_vector b/fast/test/test_sparse_vector Binary files differdeleted file mode 100755 index c06fe9e..0000000 --- a/fast/test/test_sparse_vector +++ /dev/null diff --git a/fast/test/test_sparse_vector.cc b/fast/test/test_sparse_vector.cc deleted file mode 100644 index 426bed1..0000000 --- a/fast/test/test_sparse_vector.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "sparse_vector.hh" - - -int -main(void) -{ - Sv::SparseVector<string, score_t> a; - a.insert("1", 1); - a.insert("2", 2); - cout << "a:" << a << endl; - - Sv::SparseVector<string, score_t> b; - b.insert("2", 2); - cout << "b:" << b << endl; - - Sv::SparseVector<string, score_t> c = a + b; - cout << "a+b:" << c << endl; - - a += b; - cout << "a+=b:" << a << endl; - - a -= b; - cout << "a-=b:" << a << endl; - - cout << "a*2:" << a*2 << endl; - - a *= 2; - cout << "a*=2:" << a << endl; - - string s("\"a\"=2 \"b\"=3"); - Sv::SparseVector<string, score_t>* sv = new Sv::SparseVector<string, score_t>(s); - cout << *sv << endl; - cout << sv->dot(*sv) << endl; - - return 0; -} - diff --git a/fast/util.hh b/fast/util.hh deleted file mode 100644 index 9ce19da..0000000 --- a/fast/util.hh +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include <string> - -#include "weaver.hh" - -using namespace std; - - -namespace util { - -inline string -json_escape(const string& s) -{ - ostringstream os; - for (auto it = s.cbegin(); it != s.cend(); it++) { - switch (*it) { - case '"': os << "\\\""; break; - case '\\': os << "\\\\"; break; - case '\b': os << "\\b"; break; - case '\f': os << "\\f"; break; - case '\n': os << "\\n"; break; - case '\r': os << "\\r"; break; - case '\t': os << "\\t"; break; - default: os << *it; break; - } - } - - return os.str(); -} - -inline vector<symbol_t> -tokenize(string s) -{ - istringstream ss(s); - vector<symbol_t> r; - while (ss.good()) { - string buf; - ss >> buf; - r.push_back(buf); - } - - return r; -} - -} // namespace util - diff --git a/fast/weaver.hh b/fast/weaver.hh deleted file mode 100644 index e89b4dd..0000000 --- a/fast/weaver.hh +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include <string> - -using namespace std; - - -typedef double score_t; -typedef string symbol_t; - |