summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--fast/Makefile16
-rw-r--r--fast/README.md32
-rw-r--r--fast/grammar.cc155
-rw-r--r--fast/grammar.hh58
-rw-r--r--fast/grammar.obin2928 -> 285176 bytes
-rw-r--r--fast/hypergraph.cc91
-rw-r--r--fast/hypergraph.hh11
-rw-r--r--fast/sparse_vector.hh116
-rw-r--r--fast/test/Makefile13
-rw-r--r--fast/test_grammar.cc15
-rw-r--r--fast/test_sparse_vector.cc32
12 files changed, 474 insertions, 67 deletions
diff --git a/.gitignore b/.gitignore
index 00b0e1a..94218e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
*.o
fast/example/
fast/fast_weaver
+fast/test_grammar
+fast/test_sparse_vector
util/make_pak
util/read_pak
diff --git a/fast/Makefile b/fast/Makefile
index 16bc48c..b2b697f 100644
--- a/fast/Makefile
+++ b/fast/Makefile
@@ -1,16 +1,22 @@
COMPILER=clang
-CFLAGS=-O3
+CFLAGS=-std=c++11 -O3
all: hypergraph.o main.cc
- $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack hypergraph.o main.cc -o fast_weaver
+ $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver
hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh
- $(COMPILER) $(CFLAGS) -g -std=c++11 -c hypergraph.cc
+ $(COMPILER) $(CFLAGS) -g -c hypergraph.cc
grammar.o: grammar.cc grammar.hh
- $(COMPILER) $(CFLAGS) -g -std=c++11 -c grammar.cc
+ $(COMPILER) $(CFLAGS) -g -c grammar.cc
+
+test_grammar: test_grammar.cc grammar.o
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm grammar.o test_grammar.cc -o test_grammar
+
+test_sparse_vector: test_sparse_vector.cc sparse_vector.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm test_sparse_vector.cc -o test_sparse_vector
clean:
- rm -f fast_weaver hypergraph.o grammar.o
+ rm -f fast_weaver hypergraph.o grammar.o test_grammar test_sparse_vector
diff --git a/fast/README.md b/fast/README.md
index 5bcc962..541f93f 100644
--- a/fast/README.md
+++ b/fast/README.md
@@ -1,14 +1,32 @@
TODO
- * grammar
+ * sparse vector (unordered_map) -> where to store?
* parser
+ * Rule -> ChartItem -> Node ?
+ * viterbi path/string
+ * k-best
* other semirings
- * sparse vector (unordered_map)
- * hg serialization? json/bson/msgpack/protocol buffers (no!)
- * hg: json input (jsoncpp?)
- * language model: kenlm
+ * include language model
+ * compress/hash words/feature strings?
+
+
+Dependencies:
+ * MessagePack for object serialization [1]
+ * kenlm language model [2]
+
+
+This is Linux only.
-depends on msgpack [1]
-http://jscheiny.github.io/Streams/
[1] http://msgpack.org
+[2] http://kheafield.com/code/kenlm/
+http://math.nist.gov/spblas/
+http://lapackpp.sourceforge.net/
+http://www.cvmlib.com/
+http://sourceforge.net/projects/lpp/
+http://math-atlas.sourceforge.net/
+http://www.netlib.org/lapack/
+http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c
+http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack
+http://www.osl.iu.edu/research/mtl/download.php3
+http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
diff --git a/fast/grammar.cc b/fast/grammar.cc
index 9f26bd7..a8e2747 100644
--- a/fast/grammar.cc
+++ b/fast/grammar.cc
@@ -3,5 +3,160 @@
namespace G {
+NT::NT(string& s)
+{
+ s.erase(0, 1);
+ s.pop_back();
+ stringstream ss(s);
+ string buf;
+ size_t c = 0;
+ index = 0;
+ while (ss.good() && getline(ss, buf, ',')) {
+ if (c == 0) {
+ symbol = buf;
+ } else {
+ index = stoi(buf);
+ }
+ c++;
+ }
+}
+
+T::T(string& s)
+{
+ word = s;
+}
+
+Item::Item(string& s)
+{
+ if (s.front() == '[' && s.back() == ']') {
+ type = NON_TERMINAL;
+ nt = new NT(s);
+ } else {
+ type = TERMINAL;
+ t = new T(s);
+ }
+}
+
+Rule::Rule(string& s)
+{
+ stringstream ss(s);
+ size_t c = 0;
+ string buf;
+ while (ss >> buf) {
+ if (buf == "|||") { c++; continue; }
+ if (c == 0) { // LHS
+ lhs = new NT(buf);
+ } else if (c == 1) { // RHS
+ rhs.push_back(new Item(buf));
+ if (rhs.back()->type == NON_TERMINAL) arity++;
+ } else if (c == 2) { // TARGET
+ target.push_back(new Item(buf));
+ } else if (c == 3) { // F TODO
+ } else if (c == 4) { // A TODO
+ } else { // ERROR FIXME
+ }
+ if (c == 4) break;
+ }
+ arity = 0;
+}
+
+Grammar::Grammar(string fn)
+{
+ ifstream ifs(fn);
+ string line;
+ while (getline(ifs, line)) {
+ G::Rule* r = new G::Rule(line);
+ rules.push_back(r);
+ if (r->arity == 0)
+ flat.push_back(r);
+ else if (r->rhs.front()->type == NON_TERMINAL)
+ start_nt.push_back(r);
+ else
+ start_t.push_back(r);
+ }
+}
+
+string
+Item::repr() const
+{
+ ostringstream os;
+ if (type == TERMINAL)
+ os << t->repr();
+ else
+ os << nt->repr();
+ return os.str();
+}
+
+ostream&
+operator<<(ostream& os, const Item& i)
+{
+ return os << i.repr();
+}
+
+string
+NT::repr() const
+{
+ ostringstream os;
+ os << "NT<" << symbol << "," << index << ">";
+ return os.str();
+}
+
+ostream&
+operator<<(ostream& os, const NT& nt)
+{
+ return os << nt.repr();
+}
+
+string
+T::repr() const
+{
+ ostringstream os;
+ os << "T<" << word << ">";
+ return os.str();
+}
+
+ostream&
+operator<<(ostream& os, const T& t)
+{
+ return os << t.repr();
+}
+
+string
+Rule::repr() const
+{
+ ostringstream os;
+ os << "Rule<lhs=" << lhs->repr() << \
+ ", rhs:{";
+ for (auto it = rhs.begin(); it != rhs.end(); it++) {
+ os << (**it).repr();
+ if (next(it) != rhs.end()) os << " ";
+ }
+ os << "}, target:{";
+ for (auto it = target.begin(); it != target.end(); it++) {
+ os << (**it).repr();
+ if (next(it) != target.end()) os << " ";
+ }
+ os << "}" \
+ ", f:" << "TODO" << \
+ ", arity=" << arity << \
+ ", map:" << "TODO" << \
+ ">";
+ return os.str();
+}
+
+ostream&
+operator<<(ostream& os, const Rule& r)
+{
+ return os << r.repr();
+}
+
+ostream&
+operator<<(ostream& os, const Grammar& g)
+{
+ for (auto it = g.rules.begin(); it != g.rules.end(); it++)
+ os << (**it).repr() << endl;
+ return os;
+}
+
} // namespace
diff --git a/fast/grammar.hh b/fast/grammar.hh
index d17a331..3c7f208 100644
--- a/fast/grammar.hh
+++ b/fast/grammar.hh
@@ -1,6 +1,13 @@
#pragma once
+#include <iostream>
#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+#include <map>
+
+#include "dummyvector.h"
using namespace std;
@@ -10,13 +17,60 @@ namespace G {
struct NT {
string symbol;
unsigned int index;
+
+ NT() {};
+ NT(string& s);
+ string repr() const;
+ friend ostream& operator<<(ostream& os, const NT& t);
};
-class T {
+struct T {
string word;
+
+ T(string& s);
+ string repr() const;
+ friend ostream& operator<<(ostream& os, const NT& nt);
+};
+
+enum item_type {
+ NON_TERMINAL,
+ TERMINAL
+};
+
+struct Item {
+ item_type type;
+ NT* nt;
+ T* t;
+
+ Item(string& s);
+ string repr() const;
+ friend ostream& operator<<(ostream& os, const Item& i);
+};
+
+struct Rule {
+ NT* lhs;
+ vector<Item*> rhs;
+ vector<Item*> target;
+ //map<int,int> map;
+ size_t arity;
+ DummyVector f;
+
+ Rule() {};
+ Rule(string& s);
+ string repr() const;
+ friend ostream& operator<<(ostream& os, const Rule& r);
};
-class Rule {
+struct Grammar {
+ vector<Rule*> rules;
+ vector<Rule*> flat;
+ vector<Rule*> start_nt;
+ vector<Rule*> start_t;
+
+ Grammar(string fn);
+ void add_glue();
+ void add_pass_through();
+ friend ostream& operator<<(ostream& os, const Grammar& g);
};
} // namespace
diff --git a/fast/grammar.o b/fast/grammar.o
index 705aed9..984065c 100644
--- a/fast/grammar.o
+++ b/fast/grammar.o
Binary files differ
diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc
index e6ec495..9101c92 100644
--- a/fast/hypergraph.cc
+++ b/fast/hypergraph.cc
@@ -3,36 +3,12 @@
namespace Hg {
-std::ostream&
-operator<<(std::ostream& os, const Node& n)
-{
- os << \
- "Node<id=" << n.id << \
- ", symbol='" << n.symbol << "'" << \
- ", span=(" << n.left << "," << n.right << ")" \
- ", score=" << n.score << \
- ", incoming:" << n.incoming.size() << \
- ", outgoing:" << n.outgoing.size() << \
- ", mark=" << n.mark << ">";
- return os;
-}
-
-std::ostream&
-operator<<(std::ostream& os, const Edge& e)
+template<typename Semiring> void
+init(list<Node*>& nodes, list<Node*>::iterator root, Semiring& semiring)
{
- ostringstream _;
- for (auto it = e.tails.begin(); it != e.tails.end(); it++) {
- _ << (**it).id; if (*it != e.tails.back()) _ << ",";
- }
- os << \
- "Edge<head=" << e.head->id << \
- ", tails=[" << _.str() << "]" \
- ", score=" << e.score << \
- ", rule:'" << "TODO" << "'" << \
- ", f=" << "TODO" << \
- ", arity=" << e.arity << \
- ", mark=" << e.mark << ">";
- return os;
+ for (auto it = nodes.begin(); it != nodes.end(); it++)
+ (**it).score = semiring.null;
+ (**root).score = semiring.one;
}
void
@@ -44,14 +20,6 @@ reset(list<Node*> nodes, vector<Edge*> edges)
(**it).mark = 0;
}
-template<typename Semiring> void
-init(list<Node*>& nodes, list<Node*>::iterator root, Semiring& semiring)
-{
- for (auto it = nodes.begin(); it != nodes.end(); it++)
- (**it).score = semiring.null;
- (**root).score = semiring.one;
-}
-
void
topological_sort(list<Node*>& nodes, list<Node*>::iterator root)
{
@@ -162,22 +130,21 @@ manual(Hypergraph& hg)
{
// nodes
Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0;
+ hg.nodes.push_back(a); hg.nodes_by_id[a->id] = a;
Node* b = new Node; b->id = 1; b->symbol = "NP"; b->left = 0; b->right = 1; b->mark = 0;
+ hg.nodes.push_back(b); hg.nodes_by_id[b->id] = b;
Node* c = new Node; c->id = 2; c->symbol = "V"; c->left = 1; c->right = 2; c->mark = 0;
+ hg.nodes.push_back(c); hg.nodes_by_id[c->id] = c;
Node* d = new Node; d->id = 3; d->symbol = "JJ"; d->left = 3; d->right = 4; d->mark = 0;
+ hg.nodes.push_back(d); hg.nodes_by_id[d->id] = d;
Node* e = new Node; e->id = 4; e->symbol = "NN"; e->left = 3; e->right = 5; e->mark = 0;
+ hg.nodes.push_back(e); hg.nodes_by_id[e->id] = e;
Node* f = new Node; f->id = 5; f->symbol = "NP"; f->left = 2; f->right = 5; f->mark = 0;
+ hg.nodes.push_back(f); hg.nodes_by_id[f->id] = f;
Node* g = new Node; g->id = 6; g->symbol = "NP"; g->left = 1; g->right = 5; g->mark = 0;
+ hg.nodes.push_back(g); hg.nodes_by_id[g->id] = g;
Node* h = new Node; h->id = 7; h->symbol = "S"; h->left = 0; h->right = 6; h->mark = 0;
-
- hg.add_node(a);
- hg.add_node(b);
- hg.add_node(c);
- hg.add_node(d);
- hg.add_node(e);
- hg.add_node(f);
- hg.add_node(g);
- hg.add_node(h);
+ hg.nodes.push_back(h); hg.nodes_by_id[h->id] = h;
// edges
Edge* q = new Edge; q->head = hg.nodes_by_id[1]; q->tails.push_back(hg.nodes_by_id[0]); q->score = 0.367879441171;
@@ -251,5 +218,37 @@ manual(Hypergraph& hg)
} // namespace
+ostream&
+operator<<(ostream& os, const Node& n)
+{
+ os << \
+ "Node<id=" << n.id << \
+ ", symbol='" << n.symbol << "'" << \
+ ", span=(" << n.left << "," << n.right << ")" \
+ ", score=" << n.score << \
+ ", incoming:" << n.incoming.size() << \
+ ", outgoing:" << n.outgoing.size() << \
+ ", mark=" << n.mark << ">";
+ return os;
+}
+
+ostream&
+operator<<(ostream& os, const Edge& e)
+{
+ ostringstream _;
+ for (auto it = e.tails.begin(); it != e.tails.end(); it++) {
+ _ << (**it).id; if (*it != e.tails.back()) _ << ",";
+ }
+ os << \
+ "Edge<head=" << e.head->id << \
+ ", tails=[" << _.str() << "]" \
+ ", score=" << e.score << \
+ ", rule:'" << "TODO" << "'" << \
+ ", f=" << "TODO" << \
+ ", arity=" << e.arity << \
+ ", mark=" << e.mark << ">";
+ return os;
+}
+
} // namespace
diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh
index ea940ad..86b9069 100644
--- a/fast/hypergraph.hh
+++ b/fast/hypergraph.hh
@@ -31,18 +31,17 @@ struct Edge {
Node* head;
vector<Node*> tails;
score_t score;
- string rule; //FIXME
- DummyVector f; //FIXME
+ string rule; // FIXME
unsigned int arity = 0;
unsigned int mark = 0;
inline bool is_marked() { return mark >= arity; }
- friend std::ostream& operator<<(std::ostream& os, const Edge& s);
+ friend ostream& operator<<(ostream& os, const Edge& s);
size_t head_id_;
vector<size_t> tails_ids_; // node ids
- MSGPACK_DEFINE(head_id_, tails_ids_, score, f, arity);
+ MSGPACK_DEFINE(head_id_, tails_ids_, rule, score, arity);
};
struct Node {
@@ -56,7 +55,7 @@ struct Node {
unsigned int mark;
inline bool is_marked() { return mark >= incoming.size(); };
- friend std::ostream& operator<<(std::ostream& os, const Node& n);
+ friend ostream& operator<<(ostream& os, const Node& n);
MSGPACK_DEFINE(id, symbol, left, right, score);
};
@@ -66,8 +65,6 @@ struct Hypergraph {
vector<Edge*> edges;
unordered_map<size_t, Node*> nodes_by_id;
unsigned int arity;
-
- void add_node(Node* n) { nodes.push_back(n); nodes_by_id[n->id] = n; }
};
void
diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh
new file mode 100644
index 0000000..8fdc1b9
--- /dev/null
+++ b/fast/sparse_vector.hh
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <unordered_map>
+#include <vector>
+#include <sstream>
+
+#include "hypergraph.hh" // FIXME
+
+using namespace std;
+
+
+namespace Sv {
+
+template<typename K, typename V>
+struct SparseVector {
+ unordered_map<K, V> m_;
+ V zero = 0.0;
+
+ void
+ insert(K k, V v) { m_[k] = v; };
+
+ weight_t
+ dot(SparseVector& other)
+ {
+ };
+
+ V&
+ operator[](const K& k)
+ {
+ return at(k);
+ };
+
+ const V&
+ at(const K& k) const
+ {
+ if (m_.find(k) == m_.end())
+ return zero;
+ else
+ return m_.at(k);
+ }
+
+ SparseVector
+ operator+(const SparseVector& other) const
+ {
+ SparseVector<K,V> v;
+ v.m_.insert(m_.begin(), m_.end());
+ v.m_.insert(other.m_.begin(), other.m_.end());
+ for (auto it = v.m_.begin(); it != v.m_.end(); it++)
+ v.m_[it->first] = this->at(it->first) + other.at(it->first);
+ return v;
+ };
+
+ SparseVector&
+ operator+=(const SparseVector& other)
+ {
+ for (auto it = other.m_.begin(); it != other.m_.end(); it++)
+ m_[it->first] += it->second;
+ return *this;
+ };
+
+ SparseVector
+ operator-(const SparseVector& other) const
+ {
+ SparseVector<K,V> v;
+ v.m_.insert(m_.begin(), m_.end());
+ v.m_.insert(other.m_.begin(), other.m_.end());
+ for (auto it = v.m_.begin(); it != v.m_.end(); it++)
+ v.m_[it->first] = this->at(it->first) - other.at(it->first);
+ return v;
+ };
+
+ SparseVector&
+ operator-=(const SparseVector& other)
+ {
+ for (auto it = other.m_.begin(); it != other.m_.end(); it++)
+ m_[it->first] -= it->second;
+ return *this;
+ };
+
+ SparseVector
+ operator*(V f) const
+ {
+ SparseVector<K,V> v;
+ for (auto it = m_.begin(); it != m_.end(); it++)
+ v.m_[it->first] = this->at(it->first) * f;
+ return v;
+ };
+
+ SparseVector&
+ operator*=(V f)
+ {
+ for (auto it = m_.begin(); it != m_.end(); it++)
+ m_[it->first] *= f;
+ return *this;
+ };
+
+ string
+ repr() const
+ {
+ ostringstream os;
+ os << "SparseVector<{";
+ for (auto it = m_.begin(); it != m_.end(); it ++) {
+ os << "'" << it->first << "'=" << it->second;
+ if (next(it) != m_.end())
+ os << ", ";
+ }
+ os << "}>";
+ return os.str();
+ };
+
+ friend ostream&
+ operator<<(ostream& os, const SparseVector& v) { return os << v.repr(); }
+};
+
+} // namespace
+
diff --git a/fast/test/Makefile b/fast/test/Makefile
new file mode 100644
index 0000000..3e01dc5
--- /dev/null
+++ b/fast/test/Makefile
@@ -0,0 +1,13 @@
+COMPILER=clang
+CFLAGS=-std=c++11 -O3
+
+
+test_grammar: test_grammar.cc ../grammar.o
+ $(COMPILER) $(CFLAGS) ../grammar.o test_grammar.cc
+
+test_sparse_vector: test_sparse_vector.cc
+ $(COMPILER) $(CFLAGS) -lm test_sparse_vector.cc
+
+clean:
+ rm -f test_grammar test_sparse_vector
+
diff --git a/fast/test_grammar.cc b/fast/test_grammar.cc
new file mode 100644
index 0000000..9c5b74e
--- /dev/null
+++ b/fast/test_grammar.cc
@@ -0,0 +1,15 @@
+#include <fstream>
+
+#include "grammar.hh"
+
+using namespace std;
+
+
+int
+main(int argc, char** argv)
+{
+ G::Grammar g(argv[1]);
+ cout << g << endl;
+ return 0;
+}
+
diff --git a/fast/test_sparse_vector.cc b/fast/test_sparse_vector.cc
new file mode 100644
index 0000000..f486486
--- /dev/null
+++ b/fast/test_sparse_vector.cc
@@ -0,0 +1,32 @@
+#include "sparse_vector.hh"
+
+
+int
+main(void)
+{
+ Sv::SparseVector<string, weight_t> a;
+ a.insert("1", 1);
+ a.insert("2", 2);
+ cout << "a:" << a << endl;
+
+ Sv::SparseVector<string, weight_t> b;
+ b.insert("2", 2);
+ cout << "b:" << b << endl;
+
+ Sv::SparseVector<string, weight_t> c = a + b;
+ cout << "a+b:" << c << endl;
+
+ a += b;
+ cout << "a+=b:" << a << endl;
+
+ a -= b;
+ cout << "a-=b:" << a << endl;
+
+ cout << "a*2:" << a*2 << endl;
+
+ a *= 2;
+ cout << "a*=2:" << a << endl;
+
+ return 0;
+}
+