summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-09-16 10:23:14 +0100
committerPatrick Simianer <p@simianer.de>2014-09-16 10:23:14 +0100
commit129a22cfcc7651daa4b11ed52e7870249f6373a5 (patch)
tree78de4649396ab0d37a325b7598f9873c2d65f4c9 /src
parentdf70006a07fb67b17fb39aa56762c50c2e7b8131 (diff)
spring cleaning
Diffstat (limited to 'src')
-rw-r--r--src/fast_weaver.cc26
-rw-r--r--src/grammar.hh334
-rw-r--r--src/hypergraph.cc362
-rw-r--r--src/hypergraph.hh102
-rw-r--r--src/make_pak.cc103
-rw-r--r--src/parse.hh301
-rw-r--r--src/read_pak.cc26
-rw-r--r--src/semiring.hh35
-rw-r--r--src/sparse_vector.hh186
-rw-r--r--src/test_grammar.cc19
-rw-r--r--src/test_parse.cc19
-rw-r--r--src/test_sparse_vector.cc36
-rw-r--r--src/types.hh10
-rw-r--r--src/util.hh47
14 files changed, 1606 insertions, 0 deletions
diff --git a/src/fast_weaver.cc b/src/fast_weaver.cc
new file mode 100644
index 0000000..4854476
--- /dev/null
+++ b/src/fast_weaver.cc
@@ -0,0 +1,26 @@
+#include "hypergraph.hh"
+#include <ctime>
+
+int
+main(int argc, char** argv)
+{
+ Hg::Hypergraph hg;
+ G::Vocabulary y;
+ G::Grammar g;
+ Hg::io::read(hg, g.rules, y, argv[1]);
+ //Hg::io::manual(hg, g.rules);
+ clock_t begin = clock();
+ Hg::Path p;
+ Hg::viterbi_path(hg, p);
+ vector<string> s;
+ Hg::derive(p, p.back()->head, s);
+ for (auto it: s)
+ cout << it << " ";
+ cout << endl;
+ clock_t end = clock();
+ double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
+ cout << elapsed_secs << " s" << endl;
+
+ return 0;
+}
+
diff --git a/src/grammar.hh b/src/grammar.hh
new file mode 100644
index 0000000..c489ec5
--- /dev/null
+++ b/src/grammar.hh
@@ -0,0 +1,334 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <set>
+
+#include <msgpack.hpp>
+
+#include "sparse_vector.hh"
+#include "util.hh"
+
+using namespace std;
+
+namespace G {
+
+enum item_type {
+ UNDEFINED,
+ TERMINAL,
+ NON_TERMINAL
+};
+
+struct Item {
+ virtual size_t index() const { return 0; }
+ virtual symbol_t symbol() const { return ""; }
+ virtual item_type type() const { return UNDEFINED; }
+
+ virtual ostream& repr(ostream& os) const { return os << "Item<>"; }
+ virtual ostream& escaped(ostream& os) const { return os << ""; }
+
+ friend ostream&
+ operator<<(ostream& os, const Item& i)
+ {
+ return i.repr(os);
+ };
+};
+
+struct NT : public Item {
+ symbol_t symbol_;
+ size_t index_;
+
+ NT() {}
+
+ NT(string const& s)
+ {
+ index_ = 0; // default
+ string t(s);
+ t.erase(0, 1); t.pop_back(); // remove '[' and ']'
+ istringstream ss(t);
+ if (ss >> index_) { // [i]
+ symbol_ = "";
+ index_ = stoi(s);
+ return;
+ } else {
+ ss.clear();
+ string buf;
+ size_t j = 0;
+ while (ss.good() && getline(ss, buf, ',')) {
+ if (j == 0) {
+ symbol_ = buf;
+ } else {
+ index_ = stoi(buf);
+ }
+ j++;
+ }
+ }
+ }
+
+ virtual size_t index() const { return index_; }
+ virtual symbol_t symbol() const { return symbol_; }
+ virtual item_type type() const { return NON_TERMINAL; }
+
+ virtual ostream&
+ repr(ostream& os) const
+ {
+ return os << "NT<" << symbol_ << "," << index_ << ">";
+ }
+
+ virtual ostream&
+ escaped(ostream& os) const
+ {
+ os << "[" << symbol_;
+ if (index_ > 0)
+ os << "," << index_;
+ os << "]";
+
+ return os;
+ }
+};
+
+struct T : public Item {
+ symbol_t symbol_;
+
+ T(string const& s)
+ {
+ symbol_ = s;
+ }
+
+ virtual symbol_t symbol() const { return symbol_; }
+ virtual item_type type() const { return TERMINAL; }
+
+ virtual ostream&
+ repr(ostream& os) const
+ {
+ return os << "T<" << symbol_ << ">";
+ }
+
+ virtual ostream&
+ escaped(ostream& os) const
+ {
+ os << util::json_escape(symbol_);
+ }
+};
+
+struct Vocabulary
+{
+ unordered_map<symbol_t, size_t> m_;
+ vector<Item*> items_;
+
+ bool
+ is_non_terminal(string const& s)
+ {
+ return s.front() == '[' && s.back() == ']';
+ }
+
+ Item*
+ get(symbol_t const& s)
+ {
+ if (is_non_terminal(s))
+ return new NT(s);
+ if (m_.find(s) != m_.end())
+ return items_[m_[s]];
+ return add(s);
+ }
+
+ Item*
+ add(symbol_t const& s)
+ {
+ size_t next_index_ = items_.size();
+ T* item = new T(s);
+ items_.push_back(item);
+ m_[s] = next_index_;
+
+ return item;
+ }
+};
+
+struct Rule {
+ NT* lhs;
+ vector<Item*> rhs;
+ vector<Item*> target;
+ size_t arity;
+Sv::SparseVector<string, score_t>* f;
+ map<size_t, size_t> order;
+ string as_str_;
+
+ Rule() {}
+
+ Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); }
+
+ static void
+ from_s(Rule* r, string const& s, Vocabulary& vocab)
+ {
+ istringstream ss(s);
+ string buf;
+ size_t j = 0, i = 1;
+ r->arity = 0;
+ vector<NT*> rhs_non_terminals;
+ r->f = new Sv::SparseVector<string, score_t>();
+ while (ss >> buf) {
+ if (buf == "|||") { j++; continue; }
+ if (j == 0) { // left-hand side
+ r->lhs = new NT(buf);
+ } else if (j == 1) { // right-hand side
+ Item* item = vocab.get(buf);
+ r->rhs.push_back(item);
+ if (item->type() == NON_TERMINAL) {
+ r->arity++;
+ rhs_non_terminals.push_back(reinterpret_cast<NT*>(item));
+ }
+ } else if (j == 2) { // target
+ Item* item = vocab.get(buf);
+ if (item->type() == NON_TERMINAL) {
+ r->order.insert(make_pair(i, item->index()));
+ i++;
+ if (item->symbol() == "") { // only [1], [2] ... on target
+ reinterpret_cast<NT*>(item)->symbol_ = \
+ rhs_non_terminals[item->index()-1]->symbol();
+ }
+ }
+ r->target.push_back(item);
+ } else if (j == 3) { // feature vector
+ Sv::SparseVector<string, score_t>::from_s(r->f, buf);
+ // FIXME: this is slow!!! ^^^
+ } else if (j == 4) { // alignment
+ } else {
+ // error
+ }
+ if (j == 4) break;
+ }
+ }
+
+ ostream&
+ repr(ostream& os) const
+ {
+ os << "Rule<lhs=";
+ lhs->repr(os);
+ os << ", rhs:{";
+ for (auto it = rhs.begin(); it != rhs.end(); it++) {
+ (**it).repr(os);
+ if (next(it) != rhs.end()) os << " ";
+ }
+ os << "}, target:{";
+ for (auto it = target.begin(); it != target.end(); it++) {
+ (**it).repr(os);
+ if (next(it) != target.end()) os << " ";
+ }
+ os << "}, f:";
+ f->repr(os);
+ os << ", arity=" << arity << \
+ ", order:{";
+ for (auto it = order.begin(); it != order.end(); it++) {
+ os << it->first << "->" << it->second;
+ if (next(it) != order.end()) os << ", ";
+ }
+ os << "}>";
+
+ return os;
+ }
+
+ ostream&
+ escaped(ostream& os) const
+ {
+ lhs->escaped(os);
+ os << " ||| ";
+ for (auto it = rhs.begin(); it != rhs.end(); it++) {
+ (**it).escaped(os);
+ if (next(it) != rhs.end()) os << " ";
+ }
+ os << " ||| ";
+ for (auto it = target.begin(); it != target.end(); it++) {
+ (**it).escaped(os);
+ if (next(it) != target.end()) os << " ";
+ }
+ os << " ||| ";
+ f->escaped(os);
+ os << " ||| " << \
+ "TODO";
+
+ return os;
+ };
+
+ friend ostream&
+ operator<<(ostream& os, Rule const& r)
+ {
+ return r.repr(os);
+ };
+
+ // --
+ void
+ prep_for_serialization_()
+ {
+ ostringstream os;
+ escaped(os);
+ as_str_ = os.str();
+ };
+ MSGPACK_DEFINE(as_str_);
+ // ^^^ FIXME
+};
+
+struct Grammar {
+ vector<Rule*> rules;
+ vector<Rule*> flat;
+ vector<Rule*> start_non_terminal;
+ vector<Rule*> start_terminal;
+ set<symbol_t> nts;
+
+ Grammar() {}
+
+ Grammar(string const& fn, Vocabulary& vocab)
+ {
+ ifstream ifs(fn);
+ string line;
+ while (getline(ifs, line)) {
+ G::Rule* r = new G::Rule(line, vocab);
+ rules.push_back(r);
+ nts.insert(r->lhs->symbol());
+ if (r->arity == 0)
+ flat.push_back(r);
+ else if (r->rhs.front()->type() == NON_TERMINAL)
+ start_non_terminal.push_back(r);
+ else
+ start_terminal.push_back(r);
+ }
+ }
+
+ void
+ add_glue(Vocabulary& vocab)
+ {
+ for (auto nt: nts) {
+ ostringstream oss_1;
+ oss_1 << "[S] ||| [" << nt << ",1] ||| [" << nt << ",1] ||| ";
+ cout << oss_1.str() << endl;
+ Rule* r1 = new Rule(oss_1.str(), vocab);
+ rules.push_back(r1); start_non_terminal.push_back(r1);
+ ostringstream oss_2;
+ oss_2 << "[S] ||| [S,1] [" << nt << ",2] ||| [S,1] [" << nt << ",2] ||| ";
+ cout << oss_2.str() << endl;
+ Rule* r2 = new Rule(oss_2.str(), vocab);
+ cout << *r2 << endl;
+ rules.push_back(r2); start_non_terminal.push_back(r2);
+ }
+ }
+
+ void add_pass_through(const string& input);
+ // ^^^ TODO
+
+ friend ostream&
+ operator<<(ostream& os, Grammar const& g)
+ {
+ for (const auto it: g.rules) {
+ it->repr(os);
+ os << endl;
+ }
+
+ return os;
+ }
+};
+
+} // namespace G
+
diff --git a/src/hypergraph.cc b/src/hypergraph.cc
new file mode 100644
index 0000000..40bcc64
--- /dev/null
+++ b/src/hypergraph.cc
@@ -0,0 +1,362 @@
+#include "hypergraph.hh"
+
+namespace Hg {
+
+template<typename Semiring> void
+init(const list<Node*>& nodes, const list<Node*>::iterator root, const Semiring& semiring)
+{
+ for (const auto it: nodes)
+ it->score = semiring.null;
+ (**root).score = semiring.one;
+}
+
+void
+reset(const list<Node*> nodes, const vector<Edge*> edges)
+{
+ for (const auto it: nodes)
+ it->mark = 0;
+ for (auto it: edges)
+ it->mark = 0;
+}
+
+void
+topological_sort(list<Node*>& nodes, const list<Node*>::iterator root)
+{
+ auto p = root;
+ auto to = nodes.begin();
+ while (to != nodes.end()) {
+ if ((**p).is_marked()) {
+ for (const auto e: (**p).outgoing) { // explore edges
+ e->mark++;
+ if (e->is_marked()) {
+ e->head->mark++;
+ }
+ }
+ }
+ if ((**p).is_marked()) {
+ nodes.splice(to, nodes, p);
+ to++;
+ p = to;
+ } else {
+ ++p;
+ }
+ }
+}
+
+void
+viterbi(Hypergraph& hg)
+{
+ list<Node*>::iterator root = \
+ find_if(hg.nodes.begin(), hg.nodes.end(), \
+ [](Node* n) { return n->incoming.size() == 0; });
+
+ Hg::topological_sort(hg.nodes, root);
+ Semiring::Viterbi<score_t> semiring;
+ Hg::init(hg.nodes, root, semiring);
+
+ for (const auto n: hg.nodes) {
+ for (const auto e: n->incoming) {
+ score_t s = semiring.one;
+ for (const auto m: e->tails) {
+ s = semiring.multiply(s, m->score);
+ }
+ n->score = semiring.add(n->score, semiring.multiply(s, e->score));
+ }
+ }
+}
+
+void
+viterbi_path(Hypergraph& hg, Path& p)
+{
+ list<Node*>::iterator root = \
+ find_if(hg.nodes.begin(), hg.nodes.end(), \
+ [](Node* n) { return n->incoming.size() == 0; });
+ //list<Node*>::iterator root = hg.nodes.begin();
+
+ Hg::topological_sort(hg.nodes, root);
+ // ^^^ FIXME do I need to do this when reading from file?
+ Semiring::Viterbi<score_t> semiring;
+ Hg::init(hg.nodes, root, semiring);
+
+ for (auto n: hg.nodes) {
+ Edge* best_edge;
+ bool best = false;
+ for (auto e: n->incoming) {
+ score_t s = semiring.one;
+ for (auto m: e->tails) {
+ s = semiring.multiply(s, m->score);
+ }
+ if (n->score < semiring.multiply(s, e->score)) { // find max
+ best_edge = e;
+ best = true;
+ }
+ n->score = semiring.add(n->score, semiring.multiply(s, e->score));
+ }
+ if (best)
+ p.push_back(best_edge);
+ }
+}
+
+
+void
+derive(const Path& p, const Node* cur, vector<string>& carry)
+{
+ Edge* next;
+ for (auto it: p) {
+ if (it->head->symbol == cur->symbol &&
+ it->head->left == cur->left &&
+ it->head->right == cur->right) {
+ next = it;
+ }
+ } // FIXME this is probably not so good
+
+ unsigned j = 0;
+ for (auto it: next->rule->target) {
+ if (it->type() == G::NON_TERMINAL) {
+ derive(p, next->tails[next->rule->order[j]], carry);
+ j++;
+ } else {
+ carry.push_back(it->symbol());
+ }
+ }
+}
+
+namespace io {
+
+void
+read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn)
+{
+ ifstream ifs(fn);
+ size_t i = 0, r, n, e;
+ msgpack::unpacker pac;
+ while(true) {
+ pac.reserve_buffer(32*1024);
+ size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity());
+ pac.buffer_consumed(bytes);
+ msgpack::unpacked result;
+ while(pac.next(&result)) {
+ msgpack::object o = result.get();
+ if (i == 0) {
+ o.convert(&r);
+ } else if (i == 1) {
+ o.convert(&n);
+ } else if (i == 2) {
+ o.convert(&e);
+ } else if (i > 2 && i <= r+2) {
+ string s;
+ o.convert(&s);
+ G::Rule* rule = new G::Rule;
+ G::Rule::from_s(rule, s, vocab);
+ rules.push_back(rule);
+ } else if (i > r+2 && i <= r+n+2) {
+ Node* n = new Node;
+ o.convert(n);
+ hg.nodes.push_back(n);
+ hg.nodes_by_id[n->id] = n;
+ } else if (i > n+2 && i <= r+n+e+2) {
+ Edge* e = new Edge;
+ e->arity = 0;
+ o.convert(e);
+ e->head = hg.nodes_by_id[e->head_id_];
+ hg.edges.push_back(e);
+ hg.nodes_by_id[e->head_id_]->incoming.push_back(e);
+ e->arity = 0;
+ for (auto it = e->tails_ids_.begin(); it != e->tails_ids_.end(); it++) {
+ hg.nodes_by_id[*it]->outgoing.push_back(e);
+ e->tails.push_back(hg.nodes_by_id[*it]);
+ e->arity++;
+ }
+ e->rule = rules[e->rule_id_];
+ } else {
+ // ERROR
+ }
+ i++;
+ }
+ if (!bytes) break;
+ }
+}
+
+void
+write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
+{
+ FILE* file = fopen(fn.c_str(), "wb");
+ msgpack::fbuffer fbuf(file);
+ msgpack::pack(fbuf, rules.size());
+ msgpack::pack(fbuf, hg.nodes.size());
+ msgpack::pack(fbuf, hg.edges.size());
+ for (auto it = rules.cbegin(); it != rules.cend(); it++)
+ msgpack::pack(fbuf, **it);
+ for (auto it = hg.nodes.cbegin(); it != hg.nodes.cend(); it++)
+ msgpack::pack(fbuf, **it);
+ for (auto it = hg.edges.cbegin(); it != hg.edges.cend(); it++)
+ msgpack::pack(fbuf, **it);
+ fclose(file);
+}
+
+void
+manual(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab)
+{
+ // nodes
+ Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0;
+ hg.nodes.push_back(a); hg.nodes_by_id[a->id] = a;
+ Node* b = new Node; b->id = 1; b->symbol = "NP"; b->left = 0; b->right = 1; b->mark = 0;
+ hg.nodes.push_back(b); hg.nodes_by_id[b->id] = b;
+ Node* c = new Node; c->id = 2; c->symbol = "V"; c->left = 1; c->right = 2; c->mark = 0;
+ hg.nodes.push_back(c); hg.nodes_by_id[c->id] = c;
+ Node* d = new Node; d->id = 3; d->symbol = "JJ"; d->left = 3; d->right = 4; d->mark = 0;
+ hg.nodes.push_back(d); hg.nodes_by_id[d->id] = d;
+ Node* e = new Node; e->id = 4; e->symbol = "NN"; e->left = 3; e->right = 5; e->mark = 0;
+ hg.nodes.push_back(e); hg.nodes_by_id[e->id] = e;
+ Node* f = new Node; f->id = 5; f->symbol = "NP"; f->left = 2; f->right = 5; f->mark = 0;
+ hg.nodes.push_back(f); hg.nodes_by_id[f->id] = f;
+ Node* g = new Node; g->id = 6; g->symbol = "NP"; g->left = 1; g->right = 5; g->mark = 0;
+ hg.nodes.push_back(g); hg.nodes_by_id[g->id] = g;
+ Node* h = new Node; h->id = 7; h->symbol = "S"; h->left = 0; h->right = 6; h->mark = 0;
+ hg.nodes.push_back(h); hg.nodes_by_id[h->id] = h;
+
+ // rules
+ vector<string> rule_strs;
+ rule_strs.push_back("[NP] ||| ich ||| i ||| ||| ");
+ rule_strs.push_back("[V] ||| sah ||| saw ||| ||| ");
+ rule_strs.push_back("[JJ] ||| kleines ||| small ||| ||| ");
+ rule_strs.push_back("[JJ] ||| kleines ||| little ||| ||| ");
+ rule_strs.push_back("[NN] ||| kleines haus ||| small house ||| ||| ");
+ rule_strs.push_back("[NN] ||| kleines haus ||| little house ||| ||| ");
+ rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| ||| ");
+ rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| ||| ");
+ rule_strs.push_back("[NP] ||| ein [NN,1] ||| a [NN,1] ||| ||| ");
+ rule_strs.push_back("[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| ||| ");
+ rule_strs.push_back("[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| ||| ");
+
+ for (auto it: rule_strs) {
+ rules.push_back(new G::Rule(it, vocab));
+ rules.back()->f = new Sv::SparseVector<string, score_t>();
+ }
+
+ // edges
+ Edge* q = new Edge; q->head = hg.nodes_by_id[1]; q->tails.push_back(hg.nodes_by_id[0]); q->score = 0.367879441171;
+ q->arity = 1; q->mark = 0;
+ hg.edges.push_back(q);
+ hg.nodes_by_id[1]->incoming.push_back(q);
+ hg.nodes_by_id[0]->outgoing.push_back(q);
+ q->rule = rules[0];
+
+ Edge* p = new Edge; p->head = hg.nodes_by_id[2]; p->tails.push_back(hg.nodes_by_id[0]); p->score = 0.606530659713;
+ p->arity = 1; p->mark = 0;
+ hg.edges.push_back(p);
+ hg.nodes_by_id[2]->incoming.push_back(p);
+ hg.nodes_by_id[0]->outgoing.push_back(p);
+ p->rule = rules[1];
+
+ Edge* r = new Edge; r->head = hg.nodes_by_id[3]; r->tails.push_back(hg.nodes_by_id[0]); r->score = 1.0;
+ r->arity = 1; r->mark = 0;
+ hg.edges.push_back(r);
+ hg.nodes_by_id[3]->incoming.push_back(r);
+ hg.nodes_by_id[0]->outgoing.push_back(r);
+ r->rule = rules[2];
+
+ Edge* s = new Edge; s->head = hg.nodes_by_id[3]; s->tails.push_back(hg.nodes_by_id[0]); s->score = 1.0;
+ s->arity = 1; s->mark = 0;
+ hg.edges.push_back(s);
+ hg.nodes_by_id[3]->incoming.push_back(s);
+ hg.nodes_by_id[0]->outgoing.push_back(s);
+ s->rule = rules[3];
+
+ Edge* t = new Edge; t->head = hg.nodes_by_id[4]; t->tails.push_back(hg.nodes_by_id[0]); t->score = 1.0;
+ t->arity = 1; t->mark = 0;
+ hg.edges.push_back(t);
+ hg.nodes_by_id[4]->incoming.push_back(t);
+ hg.nodes_by_id[0]->outgoing.push_back(t);
+ t->rule = rules[4];
+
+ Edge* u = new Edge; u->head = hg.nodes_by_id[4]; u->tails.push_back(hg.nodes_by_id[0]); u->score = 1.0;
+ u->arity = 1; u->mark = 0;
+ hg.edges.push_back(u);
+ hg.nodes_by_id[4]->incoming.push_back(u);
+ hg.nodes_by_id[0]->outgoing.push_back(u);
+ u->rule = rules[5];
+
+ Edge* v = new Edge; v->head = hg.nodes_by_id[4]; v->tails.push_back(hg.nodes_by_id[3]); v->score = 1.0;
+ v->arity = 1; v->mark = 0;
+ hg.edges.push_back(v);
+ hg.nodes_by_id[4]->incoming.push_back(v);
+ hg.nodes_by_id[3]->outgoing.push_back(v);
+ v->rule = rules[6];
+
+ Edge* w = new Edge; w->head = hg.nodes_by_id[4]; w->tails.push_back(hg.nodes_by_id[3]); w->score = 2.71828182846;
+ w->arity = 1; w->mark = 0;
+ hg.edges.push_back(w);
+ hg.nodes_by_id[4]->incoming.push_back(w);
+ hg.nodes_by_id[3]->outgoing.push_back(w);
+ w->rule = rules[7];
+
+ Edge* x = new Edge; x->head = hg.nodes_by_id[5]; x->tails.push_back(hg.nodes_by_id[4]); x->score = 1.0;
+ x->arity = 1; x->mark = 0;
+ hg.edges.push_back(x);
+ hg.nodes_by_id[5]->incoming.push_back(x);
+ hg.nodes_by_id[4]->outgoing.push_back(x);
+ x->rule = rules[8];
+
+ Edge* y = new Edge; y->head = hg.nodes_by_id[6]; y->tails.push_back(hg.nodes_by_id[2]); y->tails.push_back(hg.nodes_by_id[5]); y->score = 1.0;
+ y->arity = 2; y->mark = 0;
+ hg.edges.push_back(y);
+ hg.nodes_by_id[6]->incoming.push_back(y);
+ hg.nodes_by_id[2]->outgoing.push_back(y);
+ hg.nodes_by_id[5]->outgoing.push_back(y);
+ y->rule = rules[9];
+
+ Edge* z = new Edge; z->head = hg.nodes_by_id[7]; z->tails.push_back(hg.nodes_by_id[1]); z->tails.push_back(hg.nodes_by_id[6]); z->score = 1.0;
+ z->arity = 2; z->mark = 0;
+ hg.edges.push_back(z);
+ hg.nodes_by_id[7]->incoming.push_back(z);
+ hg.nodes_by_id[1]->outgoing.push_back(z);
+ hg.nodes_by_id[6]->outgoing.push_back(z);
+ z->rule = rules[10];
+}
+
+} // namespace Hg::io
+
+/*
+ * Hg::Node
+ *
+ */
+ostream&
+operator<<(ostream& os, const Node& n)
+{
+ os << \
+ "Node<id=" << n.id << \
+ ", symbol='" << n.symbol << "'" << \
+ ", span=(" << n.left << "," << n.right << ")" \
+ ", score=" << n.score << \
+ ", incoming:" << n.incoming.size() << \
+ ", outgoing:" << n.outgoing.size() << \
+ ", mark=" << n.mark << ">";
+ return os;
+}
+
+/*
+ * Hg::Edge
+ *
+ */
+ostream&
+operator<<(ostream& os, const Edge& e)
+{
+ ostringstream _;
+ for (auto it: e.tails) {
+ _ << it->id;
+ if (it != e.tails.back()) _ << ",";
+ }
+ os << \
+ "Edge<head=" << e.head->id << \
+ ", tails=[" << _.str() << "]" \
+ ", score=" << e.score << \
+ ", rule:'";
+ e.rule->escaped(os);
+ os << "', f=" << "TODO" << \
+ ", arity=" << e.arity << \
+ ", mark=" << e.mark << ">";
+ return os;
+}
+
+} // namespace Hg
+
diff --git a/src/hypergraph.hh b/src/hypergraph.hh
new file mode 100644
index 0000000..8e05e9f
--- /dev/null
+++ b/src/hypergraph.hh
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <algorithm>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <list>
+#include <msgpack.hpp>
+#include <msgpack/fbuffer.hpp>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "grammar.hh"
+#include "semiring.hh"
+#include "sparse_vector.hh"
+#include "types.hh"
+
+using namespace std;
+
+namespace Hg {
+
+struct Node;
+
+struct Edge {
+ Node* head;
+ vector<Node*> tails;
+ score_t score;
+ G::Rule* rule;
+ unsigned int arity = 0;
+ unsigned int mark = 0;
+
+ inline bool is_marked() { return mark >= arity; }
+ friend ostream& operator<<(ostream& os, const Edge& e);
+
+ size_t head_id_;
+ vector<size_t> tails_ids_; // node ids
+ size_t rule_id_;
+
+ MSGPACK_DEFINE(head_id_, tails_ids_, rule_id_, score, arity);
+};
+
+struct Node {
+ size_t id;
+ string symbol;
+ short left;
+ short right;
+ score_t score;
+ vector<Edge*> incoming;
+ vector<Edge*> outgoing;
+ unsigned int mark;
+
+ inline bool is_marked() { return mark >= incoming.size(); };
+ friend ostream& operator<<(ostream& os, const Node& n);
+
+ MSGPACK_DEFINE(id, symbol, left, right, score);
+};
+
+struct Hypergraph {
+ list<Node*> nodes;
+ vector<Edge*> edges;
+ unordered_map<size_t, Node*> nodes_by_id;
+ unsigned int arity;
+};
+
+template<typename Semiring> void
+init(const list<Node*>& nodes, const list<Node*>::iterator root, const Semiring& semiring);
+
+void
+reset(const list<Node*> nodes, const vector<Edge*> edges);
+
+void
+topological_sort(list<Node*>& nodes, const list<Node*>::iterator root);
+
+void
+viterbi(Hypergraph& hg);
+
+typedef vector<Edge*> Path;
+
+void
+viterbi_path(Hypergraph& hg, Path& p);
+
+void
+derive(const Path& p, const Node* cur, vector<string>& carry);
+
+namespace io {
+
+void
+read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn); // FIXME
+
+void
+write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn); // FIXME
+
+void
+manual(Hypergraph& hg, vector<G::Rule*>& rules);
+
+} // namespace
+
+} // namespace
+
diff --git a/src/make_pak.cc b/src/make_pak.cc
new file mode 100644
index 0000000..db3a8a4
--- /dev/null
+++ b/src/make_pak.cc
@@ -0,0 +1,103 @@
+#include <iostream>
+#include <fstream>
+#include <msgpack.hpp>
+#include <msgpack/fbuffer.hpp>
+#include <string>
+
+#include "json-cpp/single_include/json-cpp.hpp"
+#include "hypergraph.hh"
+#include "types.hh"
+
+using namespace std;
+
+struct DummyNode {
+ size_t id;
+ string symbol;
+ vector<short> span;
+};
+
+struct DummyEdge {
+ size_t head_id;
+ size_t rule_id;
+ vector<size_t> tails_ids;
+ string f;
+ score_t score;
+};
+
+struct DummyHg {
+ vector<string> rules;
+ vector<DummyNode> nodes;
+ vector<DummyEdge> edges;
+};
+
+template<typename X> inline void
+serialize(jsoncpp::Stream<X>& stream, DummyNode& o)
+{
+ fields(o, stream, "id", o.id, "symbol", o.symbol, "span", o.span);
+}
+
+template<typename X> inline void
+serialize(jsoncpp::Stream<X>& stream, DummyEdge& o)
+{
+ fields(o, stream, "head", o.head_id, "rule", o.rule_id, "tails", o.tails_ids, "score", o.score);
+}
+
+template<typename X> inline void
+serialize(jsoncpp::Stream<X>& stream, DummyHg& o)
+{
+ fields(o, stream, "rules", o.rules, "nodes", o.nodes, "edges", o.edges);
+}
+
+int
+main(int argc, char** argv)
+{
+ // read from json
+ ifstream ifs(argv[1]);
+ string json_str((istreambuf_iterator<char>(ifs) ),
+ (istreambuf_iterator<char>()));
+ DummyHg hg;
+ vector<string> rules;
+ hg.rules = rules;
+ vector<DummyNode> nodes;
+ hg.nodes = nodes;
+ vector<DummyEdge> edges;
+ hg.edges = edges;
+ jsoncpp::parse(hg, json_str);
+
+ // convert to proper objects
+ vector<Hg::Node*> nodes_conv;
+ for (const auto it: hg.nodes) {
+ Hg::Node* n = new Hg::Node;
+ n->id = it.id;
+ n->symbol = it.symbol;
+ n->left = it.span[0];
+ n->right = it.span[1];
+ nodes_conv.push_back(n);
+ }
+ vector<Hg::Edge*> edges_conv;
+ for (const auto it: hg.edges) {
+ Hg::Edge* e = new Hg::Edge;
+ e->head_id_ = it.head_id;
+ e->tails_ids_ = it.tails_ids;
+ e->score = it.score;
+ e->rule_id_ = it.rule_id;
+ edges_conv.push_back(e);
+ }
+
+ // write to msgpack
+ FILE* file = fopen(argv[2], "wb");
+ msgpack::fbuffer fbuf(file);
+ msgpack::pack(fbuf, hg.rules.size());
+ msgpack::pack(fbuf, hg.nodes.size());
+ msgpack::pack(fbuf, hg.edges.size());
+ for (const auto it: hg.rules)
+ msgpack::pack(fbuf, it);
+ for (const auto it: nodes_conv)
+ msgpack::pack(fbuf, *it);
+ for (const auto it: edges_conv)
+ msgpack::pack(fbuf, *it);
+ fclose(file);
+
+ return 0;
+}
+
diff --git a/src/parse.hh b/src/parse.hh
new file mode 100644
index 0000000..0dd2fc0
--- /dev/null
+++ b/src/parse.hh
@@ -0,0 +1,301 @@
+#pragma once
+
+#include <vector>
+#include <utility>
+#include <sstream>
+#include <unordered_map>
+#include <set>
+
+#include "grammar.hh"
+#include "util.hh"
+#include "types.hh"
+
+using namespace std;
+
+typedef pair<size_t,size_t> Span;
+namespace std {
+ template <>
+ struct hash<Span>
+ {
+ size_t
+ operator()(Span const& k) const
+ {
+ return ((hash<size_t>()(k.first)
+ ^ (hash<size_t>()(k.second) << 1)) >> 1);
+ }
+ };
+}
+
+namespace Parse {
+
+void visit(vector<Span>& p,
+ size_t i, size_t l, size_t r, size_t x=0)
+{
+ for (size_t s = i; s <= r-x; s++) {
+ for (size_t k = l; k <= r-s; k++) {
+ p.push_back(Span(k,k+s));
+ }
+ }
+}
+
+struct ChartItem
+{
+ Span span;
+ G::Rule const* rule;
+ vector<Span> tails_spans;
+ size_t dot;
+
+ ChartItem() {}
+
+ ChartItem(G::Rule* r) : rule(r), dot(0) {}
+
+ ChartItem(G::Rule* r, Span s, size_t dot)
+ : rule(r), span(s), dot(dot) {}
+
+ ChartItem(ChartItem const& o)
+ : span(o.span),
+ rule(o.rule),
+ tails_spans(o.tails_spans),
+ dot(o.dot)
+ {
+ }
+
+ ostream&
+ repr(ostream& os) const
+ {
+ os << "ChartItem<";
+ os << "span=(" << span.first << "," << span.second << "), lhs=";
+ rule->lhs->repr(os);
+ os << ", dot=" << dot;
+ os << ", tails=" << tails_spans.size() << ", ";
+ os << "rule=";
+ rule->repr(os);
+ os << ">";
+ os << endl;
+ }
+
+ friend ostream&
+ operator<<(ostream& os, ChartItem item)
+ {
+ item.repr(os);
+
+ return os;
+ }
+};
+
+struct Chart
+{
+ size_t n_;
+ map<Span, vector<ChartItem*> > m_;
+ unordered_map<string,bool> b_;
+
+ vector<ChartItem*>& at(Span s)
+ {
+ return m_[s];
+ }
+
+ string h(symbol_t sym, Span s)
+ {
+ ostringstream ss;
+ ss << sym;
+ ss << s.first;
+ ss << s.second;
+
+ return ss.str();
+ }
+
+ bool
+ has_at(symbol_t sym, Span s)
+ {
+ return b_[h(sym, s)];
+ }
+
+ void add(ChartItem* item, Span s)
+ {
+ if (m_.count(s) > 0)
+ m_[s].push_back(item);
+ else {
+ m_.insert(make_pair(s, vector<ChartItem*>{item}));
+ }
+ b_[h(item->rule->lhs->symbol(), s)] = true;
+ }
+
+ Chart(size_t n) : n_(n) {}
+
+ friend ostream&
+ operator<<(ostream& os, Chart const& chart)
+ {
+ for (map<Span, vector<ChartItem*> >::const_iterator it = chart.m_.cbegin();
+ it != chart.m_.cend(); it++) {
+ os << "(" << it->first.first << "," << it->first.second << ")" << endl;
+ for (auto jt: it->second)
+ jt->repr(os); os << endl;
+ }
+
+ return os;
+ }
+};
+
+bool
+scan(ChartItem* item, vector<symbol_t> in, size_t limit, Chart& passive)
+{
+ //cout << "S1" << endl;
+ while (item->dot < item->rule->rhs.size() &&
+ item->rule->rhs[item->dot]->type() == G::TERMINAL) {
+ //cout << "S2" << endl;
+ if (item->span.second == limit) return false;
+ //cout << "S3" << endl;
+ if (item->rule->rhs[item->dot]->symbol() == in[item->span.second]) {
+ //cout << "S4" << endl;
+ item->dot++;
+ //cout << "S5" << endl;
+ item->span.second++;
+ //cout << "S6" << endl;
+ } else {
+ //cout << "S7" << endl;
+ return false;
+ }
+ }
+ //cout << "S8" << endl;
+ return true;
+}
+
+
+void
+init(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g)
+{
+ for (auto rule: g.flat) {
+ size_t j = 0;
+ for (auto it: in) {
+ if (it == rule->rhs.front()->symbol()) {
+ cout << it << " " << j << j+rule->rhs.size() << endl;
+ Span span(j, j+rule->rhs.size());
+ passive.add(new ChartItem(rule, span, rule->rhs.size()), span);
+ cout << "new passive item [1] " << *passive.at(span).back() << endl;
+ }
+ j++;
+ }
+ }
+}
+
+void
+parse(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g)
+{
+ vector<Span> spans;
+ Parse::visit(spans, 1, 0, n);
+ for (auto span: spans) {
+
+ cout << "Span (" << span.first << "," << span.second << ")" << endl;
+
+ for (auto it: g.start_terminal) {
+ ChartItem* item = new ChartItem(it, Span(span.first,span.first), 0);
+ if (scan(item, in, span.second, passive)
+ && span.first + item->rule->rhs.size() <= span.second) {
+ active.add(item, span);
+ cout << "new active item [1] " << *active.at(span).back();
+ }
+ }
+
+ for (auto it: g.start_non_terminal) {
+ if (it->rhs.size() > span.second-span.first) continue;
+ active.add(new ChartItem(it, Span(span.first,span.first), 0), span);
+ cout << "new active item [2] " << *active.at(span).back();
+ }
+
+ set<symbol_t> new_symbols;
+ vector<ChartItem*> remaining_items;
+
+ while (true) {
+ cout << "active size at (" << span.first << "," << span.second << ") " << active.at(span).size() << endl;
+ cout << "passive size at (" << span.first << "," << span.second << ") " << passive.at(span).size() << endl;
+ if (active.at(span).empty()) break;
+ ChartItem* item = active.at(span).back();
+ cout << "current item " << *item;
+ active.at(span).pop_back();
+ bool advanced = false;
+ vector<Span> spans2;
+ Parse::visit(spans2, 1, span.first, span.second, 1);
+ for (auto span2: spans2) {
+ cout << "A" << endl;
+ if (item->rule->rhs[item->dot]->type() == G::NON_TERMINAL) {
+ cout << "B" << endl;
+ if (passive.has_at(item->rule->rhs[item->dot]->symbol(), span2)) {
+ cout << "C" << endl;
+ if (span2.first == item->span.second) {
+ cout << "D" << endl;
+ ChartItem* new_item = new ChartItem(*item);
+ cout << "D1" << endl;
+ new_item->span.second = span2.second;
+ cout << "D2" << endl;
+ new_item->dot++;
+ cout << "D3" << endl;
+ new_item->tails_spans.push_back(span2);
+ cout << "D4" << endl;
+ if (scan(new_item, in, span.second, passive)) {
+ cout << "E" << endl;
+ if (new_item->dot == new_item->rule->rhs.size()) {
+ cout << "F" << endl;
+ if (new_item->span.first == span.first && new_item->span.second == span.second) {
+ cout << "G" << endl;
+ cout << "H" << endl;
+ new_symbols.insert(new_item->rule->lhs->symbol());
+ passive.add(new_item, span);
+ cout << "new passive item [2] " << *new_item;
+ advanced = true;
+ }
+ } else {
+ if (new_item->span.second+(new_item->rule->rhs.size()-new_item->dot) <= span.second) {
+ active.add(new_item, span);
+ cout << "new active item [3] " << *new_item;
+ }
+ }
+ }
+ cout << "I" << endl;
+ }
+ }
+ }
+ }
+ cout << "J" << endl;
+ if (!advanced) {
+ cout << "K" << endl;
+ remaining_items.push_back(item);
+ }
+ }
+
+ for (auto new_sym: new_symbols) {
+ cout << "new sym " << new_sym << endl;
+ for (auto rem_item: remaining_items) {
+ if (rem_item->dot != 0 ||
+ rem_item->rule->rhs[rem_item->dot]->type() != G::NON_TERMINAL) {
+ continue;
+ cout << "K1" << endl;
+ }
+ cout << "K2" << endl;
+ if (rem_item->rule->rhs[rem_item->dot]->symbol() == new_sym) {
+ cout << "K3" << endl;
+ ChartItem* new_item = new ChartItem(*rem_item);
+ cout << "K31" << endl;
+ //new_item->tails_spans[new_item->dot-1] = span;
+ new_item->tails_spans.push_back(span);
+ new_item->dot++;
+ cout << "K32" << endl;
+ if (new_item->dot == new_item->rule->rhs.size()) {
+ cout << "K4" << endl;
+ new_symbols.insert(new_item->rule->lhs->symbol());
+ passive.add(new_item, span);
+ }
+ }
+ }
+ }
+
+ cout << "L" << endl;
+ cout << "-------------------" << endl;
+ cout << endl;
+ }
+
+ //cout << "ACTIVE" << endl << active << endl;
+ cout << "PASSIVE" << endl << passive << endl;
+}
+
+} //
+
diff --git a/src/read_pak.cc b/src/read_pak.cc
new file mode 100644
index 0000000..c894442
--- /dev/null
+++ b/src/read_pak.cc
@@ -0,0 +1,26 @@
+#include <iostream>
+#include <fstream>
+#include <msgpack.hpp>
+
+using namespace std;
+
+int
+main(int argc, char** argv)
+{
+ ifstream ifs(argv[1]);
+ msgpack::unpacker pac;
+ while(true) {
+ pac.reserve_buffer(32*1024);
+ size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity());
+ pac.buffer_consumed(bytes);
+ msgpack::unpacked result;
+ while(pac.next(&result)) {
+ msgpack::object o = result.get();
+ cout << o << endl;
+ }
+ if (!bytes) break;
+ }
+
+ return 0;
+}
+
diff --git a/src/semiring.hh b/src/semiring.hh
new file mode 100644
index 0000000..3f4ac08
--- /dev/null
+++ b/src/semiring.hh
@@ -0,0 +1,35 @@
+#pragma once
+
+
+namespace Semiring {
+
+template<typename T>
+struct Viterbi {
+ T one = 1.0;
+ T null = 0.0;
+
+ T add(T x, T y);
+ T multiply(T x, T y);
+ T convert(T x);
+};
+
+template<typename T> T
+Viterbi<T>::add(T x, T y)
+{
+ return max(x, y);
+}
+
+template<typename T> T
+Viterbi<T>::multiply(T x, T y)
+{
+ return x * y;
+}
+
+template<typename T> T
+Viterbi<T>::convert(T x)
+{
+ return (T)x;
+}
+
+} // namespace
+
diff --git a/src/sparse_vector.hh b/src/sparse_vector.hh
new file mode 100644
index 0000000..7fff338
--- /dev/null
+++ b/src/sparse_vector.hh
@@ -0,0 +1,186 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "util.hh"
+#include "types.hh"
+
+using namespace std;
+
+
+namespace Sv {
+
+template<typename K, typename V>
+struct SparseVector {
+ unordered_map<K,V> m_;
+ V zero = 0.f;
+
+ SparseVector() {};
+
+ SparseVector(string& s)
+ {
+ from_s(this, s);
+ };
+
+ void
+ insert(K k, V v) { m_[k] = v; };
+
+ V
+ dot(SparseVector& other)
+ {
+ V r;
+ unordered_map<K,V>* o = &m_;
+ auto b = m_.cbegin();
+ auto e = m_.cend();
+ if (other.size() < size()) {
+ b = other.m_.cbegin();
+ e = other.m_.cend();
+ o = &other.m_;
+ }
+ for (auto it = b; it != e; it++)
+ r += it->second * o->at(it->first);
+
+ return r;
+ };
+
+ size_t
+ size()
+ {
+ return m_.size();
+ }
+
+ V&
+ operator[](const K& k)
+ {
+ return at(k);
+ };
+
+ const V&
+ at(const K& k) const
+ {
+ if (m_.find(k) == m_.end())
+ return zero;
+ else
+ return m_.at(k);
+ }
+
+ SparseVector
+ operator+(const SparseVector& other) const
+ {
+ SparseVector<K,V> v;
+ v.m_.insert(m_.cbegin(), m_.cend());
+ v.m_.insert(other.m_.cbegin(), other.m_.cend());
+ for (const auto it: v.m_)
+ v.m_[it.first] = this->at(it.first) + other.at(it.first);
+
+ return v;
+ };
+
+ SparseVector&
+ operator+=(const SparseVector& other)
+ {
+ for (const auto it: other.m_)
+ m_[it.first] += it.second;
+
+ return *this;
+ };
+
+ SparseVector
+ operator-(const SparseVector& other) const
+ {
+ SparseVector<K,V> v;
+ v.m_.insert(m_.cbegin(), m_.cend());
+ v.m_.insert(other.m_.cbegin(), other.m_.cend());
+ for (const auto it: v.m_)
+ v.m_[it.first] = this->at(it.first) - other.at(it.first);
+
+ return v;
+ };
+
+ SparseVector&
+ operator-=(const SparseVector& other)
+ {
+ for (const auto it: other.m_)
+ m_[it.first] -= it.second;
+
+ return *this;
+ };
+
+ SparseVector
+ operator*(V f) const
+ {
+ SparseVector<K,V> v;
+ for (const auto it: m_)
+ v.m_[it.first] = this->at(it.first) * f;
+
+ return v;
+ };
+
+ SparseVector&
+ operator*=(V f)
+ {
+ for (const auto it: m_)
+ m_[it.first] *= f;
+
+ return *this;
+ };
+
+ static void
+ from_s(SparseVector* w, const string& s)
+ {
+ stringstream ss(s);
+ while (!ss.eof()) {
+ string t;
+ ss >> t;
+ size_t eq = t.find_first_of("=");
+ if (eq == string::npos) {
+ return;
+ }
+ t.replace(eq, 1, " ");
+ stringstream tt(t);
+ K k; V v;
+ tt >> k >> v;
+ w->m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v);
+ }
+ }
+
+ ostream&
+ repr(ostream& os) const
+ {
+ os << "SparseVector<{";
+ for (auto it = m_.cbegin(); it != m_.cend(); it++) {
+ os << "'" << it->first << "'=" << it->second;
+ if (next(it) != m_.end())
+ os << ", ";
+ }
+ os << "}>";
+
+ return os;
+ };
+
+ ostream&
+ escaped(ostream& os, bool quote_keys=false) const {
+ for (auto it = m_.cbegin(); it != m_.cend(); it++) {
+ if (quote_keys) os << '"';
+ os << util::json_escape(it->first);
+ if (quote_keys) os << '"';
+ os << "=" << it->second;
+ if (next(it) != m_.cend()) os << " ";
+ }
+
+ return os;
+ };
+
+ friend ostream&
+ operator<<(ostream& os, const SparseVector& v)
+ {
+ return v.repr(os);
+ }
+};
+
+} // namespace
+
diff --git a/src/test_grammar.cc b/src/test_grammar.cc
new file mode 100644
index 0000000..7fee79a
--- /dev/null
+++ b/src/test_grammar.cc
@@ -0,0 +1,19 @@
+#include <fstream>
+
+#include "grammar.hh"
+
+using namespace std;
+
+int
+main(int argc, char** argv)
+{
+ G::Vocabulary y;
+ G::Grammar g(argv[1], y);
+ for (auto it: g.rules) {
+ it->escaped(cout);
+ cout << endl;
+ }
+
+ return 0;
+}
+
diff --git a/src/test_parse.cc b/src/test_parse.cc
new file mode 100644
index 0000000..2d51d44
--- /dev/null
+++ b/src/test_parse.cc
@@ -0,0 +1,19 @@
+#include "parse.hh"
+
+int main(int argc, char** argv)
+{
+ //string in("ich sah ein kleines haus");
+ //string in("europa bildet den ersten oder zweiten markt für die zehn am häufigsten von indien exportierten produkte , erklärte der europäische kommissar weiter . die asiatischen und europäischen giganten tauschen jährlich güter im wert von 47 milliarden euro und dienstleistungen im wert von 10 milliarden euro aus , hatte diese woche daniéle smadja , vorsitzende der abordnung der europäischen kommission in neu delhi , erklärt , und bedauert , dass der gegenseitige handel sein potential noch nicht ausgeschöpft hat . die eu und indien treffen sich am freitag zu ihrem achten diplomatischen in neu delhi , bei dem premierminister manmohan singh und der präsident der europäischen kommission josé manuel durao barrosso anwesend sein werden .");
+ //string in("aber schon bald nach seinem eintritt kam der erste große erfolg .");
+ string in("lebensmittel schuld an europäischer inflation");
+ vector<symbol_t> tok = util::tokenize(in);
+ size_t n = tok.size();
+ G::Vocabulary v;
+ G::Grammar g(argv[1], v);
+ g.add_glue(v);
+ Parse::Chart active(n);
+ Parse::Chart passive(n);
+ init(tok, n, active, passive, g);
+ parse(tok, n, active, passive, g);
+}
+
diff --git a/src/test_sparse_vector.cc b/src/test_sparse_vector.cc
new file mode 100644
index 0000000..69aaa21
--- /dev/null
+++ b/src/test_sparse_vector.cc
@@ -0,0 +1,36 @@
+#include "sparse_vector.hh"
+
+int
+main(void)
+{
+ Sv::SparseVector<string, score_t> a;
+ a.insert("1", 1);
+ a.insert("2", 2);
+ cout << "a:" << a << endl;
+
+ Sv::SparseVector<string, score_t> b;
+ b.insert("2", 2);
+ cout << "b:" << b << endl;
+
+ Sv::SparseVector<string, score_t> c = a + b;
+ cout << "a+b:" << c << endl;
+
+ a += b;
+ cout << "a+=b:" << a << endl;
+
+ a -= b;
+ cout << "a-=b:" << a << endl;
+
+ cout << "a*2:" << a*2 << endl;
+
+ a *= 2;
+ cout << "a*=2:" << a << endl;
+
+ string s("\"a\"=2 \"b\"=3");
+ Sv::SparseVector<string, score_t>* sv = new Sv::SparseVector<string, score_t>(s);
+ cout << *sv << endl;
+ cout << sv->dot(*sv) << endl;
+
+ return 0;
+}
+
diff --git a/src/types.hh b/src/types.hh
new file mode 100644
index 0000000..e89b4dd
--- /dev/null
+++ b/src/types.hh
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <string>
+
+using namespace std;
+
+
+typedef double score_t;
+typedef string symbol_t;
+
diff --git a/src/util.hh b/src/util.hh
new file mode 100644
index 0000000..93ea320
--- /dev/null
+++ b/src/util.hh
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <string>
+
+#include "types.hh"
+
+using namespace std;
+
+
+namespace util {
+
+inline string
+json_escape(const string& s)
+{
+ ostringstream os;
+ for (auto it = s.cbegin(); it != s.cend(); it++) {
+ switch (*it) {
+ case '"': os << "\\\""; break;
+ case '\\': os << "\\\\"; break;
+ case '\b': os << "\\b"; break;
+ case '\f': os << "\\f"; break;
+ case '\n': os << "\\n"; break;
+ case '\r': os << "\\r"; break;
+ case '\t': os << "\\t"; break;
+ default: os << *it; break;
+ }
+ }
+
+ return os.str();
+}
+
+inline vector<symbol_t>
+tokenize(string s)
+{
+ istringstream ss(s);
+ vector<symbol_t> r;
+ while (ss.good()) {
+ string buf;
+ ss >> buf;
+ r.push_back(buf);
+ }
+
+ return r;
+}
+
+} // namespace util
+