From 4b7b2693e829166ccec8707b59fb2bc26179551b Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 22 Jul 2014 00:34:01 +0200
Subject: simple sparse vector type
---
.gitignore | 2 +
fast/Makefile | 16 +++--
fast/README.md | 32 ++++++++--
fast/grammar.cc | 155 +++++++++++++++++++++++++++++++++++++++++++++
fast/grammar.hh | 58 ++++++++++++++++-
fast/grammar.o | Bin 2928 -> 285176 bytes
fast/hypergraph.cc | 91 +++++++++++++-------------
fast/hypergraph.hh | 11 ++--
fast/sparse_vector.hh | 116 +++++++++++++++++++++++++++++++++
fast/test/Makefile | 13 ++++
fast/test_grammar.cc | 15 +++++
fast/test_sparse_vector.cc | 32 ++++++++++
12 files changed, 474 insertions(+), 67 deletions(-)
create mode 100644 fast/sparse_vector.hh
create mode 100644 fast/test/Makefile
create mode 100644 fast/test_grammar.cc
create mode 100644 fast/test_sparse_vector.cc
diff --git a/.gitignore b/.gitignore
index 00b0e1a..94218e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
*.o
fast/example/
fast/fast_weaver
+fast/test_grammar
+fast/test_sparse_vector
util/make_pak
util/read_pak
diff --git a/fast/Makefile b/fast/Makefile
index 16bc48c..b2b697f 100644
--- a/fast/Makefile
+++ b/fast/Makefile
@@ -1,16 +1,22 @@
COMPILER=clang
-CFLAGS=-O3
+CFLAGS=-std=c++11 -O3
all: hypergraph.o main.cc
- $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack hypergraph.o main.cc -o fast_weaver
+ $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver
hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh
- $(COMPILER) $(CFLAGS) -g -std=c++11 -c hypergraph.cc
+ $(COMPILER) $(CFLAGS) -g -c hypergraph.cc
grammar.o: grammar.cc grammar.hh
- $(COMPILER) $(CFLAGS) -g -std=c++11 -c grammar.cc
+ $(COMPILER) $(CFLAGS) -g -c grammar.cc
+
+test_grammar: test_grammar.cc grammar.o
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm grammar.o test_grammar.cc -o test_grammar
+
+test_sparse_vector: test_sparse_vector.cc sparse_vector.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm test_sparse_vector.cc -o test_sparse_vector
clean:
- rm -f fast_weaver hypergraph.o grammar.o
+ rm -f fast_weaver hypergraph.o grammar.o test_grammar test_sparse_vector
diff --git a/fast/README.md b/fast/README.md
index 5bcc962..541f93f 100644
--- a/fast/README.md
+++ b/fast/README.md
@@ -1,14 +1,32 @@
TODO
- * grammar
+ * sparse vector (unordered_map) -> where to store?
* parser
+ * Rule -> ChartItem -> Node ?
+ * viterbi path/string
+ * k-best
* other semirings
- * sparse vector (unordered_map)
- * hg serialization? json/bson/msgpack/protocol buffers (no!)
- * hg: json input (jsoncpp?)
- * language model: kenlm
+ * include language model
+ * compress/hash words/feature strings?
+
+
+Dependencies:
+ * MessagePack for object serialization [1]
+ * kenlm language model [2]
+
+
+This is Linux only.
-depends on msgpack [1]
-http://jscheiny.github.io/Streams/
[1] http://msgpack.org
+[2] http://kheafield.com/code/kenlm/
+http://math.nist.gov/spblas/
+http://lapackpp.sourceforge.net/
+http://www.cvmlib.com/
+http://sourceforge.net/projects/lpp/
+http://math-atlas.sourceforge.net/
+http://www.netlib.org/lapack/
+http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c
+http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack
+http://www.osl.iu.edu/research/mtl/download.php3
+http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
diff --git a/fast/grammar.cc b/fast/grammar.cc
index 9f26bd7..a8e2747 100644
--- a/fast/grammar.cc
+++ b/fast/grammar.cc
@@ -3,5 +3,160 @@
namespace G {
+// Parses a bracketed non-terminal token such as "[X]" or "[X,1]".
+//
+// The first comma-separated field becomes `symbol`; every later field is
+// converted with stoi() and stored in `index` (the last one wins). `index`
+// stays 0 when no such field is present.
+//
+// NOTE: `s` is modified in place -- its first and last characters are removed.
+// stoi() throws std::invalid_argument / std::out_of_range on a malformed index.
+NT::NT(string& s)
+{
+  index = 0;
+  s.erase(0, 1); // drop the opening bracket (no-op on an empty string)
+  // pop_back() on an empty string is undefined behaviour; a token shorter
+  // than two characters would otherwise trigger it.
+  if (!s.empty()) s.pop_back();
+  stringstream ss(s);
+  string field;
+  bool first = true;
+  while (getline(ss, field, ',')) {
+    if (first) {
+      symbol = field;
+      first = false;
+    } else {
+      index = stoi(field);
+    }
+  }
+}
+
+// Stores a copy of `s` as this terminal's `word`.
+T::T(string& s)
+  : word(s)
+{
+}
+
+// Classifies a token: text wrapped in '[' ... ']' becomes a NON_TERMINAL,
+// anything else (including an empty string) a TERMINAL.
+//
+// The matching NT or T object is allocated with `new` and stored in `nt` or
+// `t`; the other pointer is not touched here. Constructing the NT strips the
+// brackets from `s` in place.
+Item::Item(string& s)
+{
+  // front()/back() are undefined on an empty string, so rule that out first.
+  const bool bracketed = !s.empty() && s.front() == '[' && s.back() == ']';
+  if (bracketed) {
+    type = NON_TERMINAL;
+    nt = new NT(s);
+  } else {
+    type = TERMINAL;
+    t = new T(s);
+  }
+}
+
+// Parses one grammar line made of whitespace-separated tokens, where the
+// token "|||" separates the fields:
+//   field 0: LHS non-terminal
+//   field 1: RHS items (each non-terminal increments `arity`)
+//   field 2: TARGET items
+//   field 3: F (not parsed yet)
+//   field 4: A (not parsed yet; parsing stops at its first token)
+//
+// Every NT/Item is allocated with `new` and stored by pointer.
+Rule::Rule(string& s)
+{
+  // Must be zeroed *before* parsing: the RHS branch below increments it.
+  // Resetting it after the loop would throw the count away and make every
+  // rule look like it has arity 0.
+  arity = 0;
+  stringstream ss(s);
+  size_t c = 0; // index of the current "|||"-separated field
+  string buf;
+  while (ss >> buf) {
+    if (buf == "|||") { c++; continue; }
+    if (c == 0) { // LHS
+      lhs = new NT(buf);
+    } else if (c == 1) { // RHS
+      rhs.push_back(new Item(buf));
+      if (rhs.back()->type == NON_TERMINAL) arity++;
+    } else if (c == 2) { // TARGET
+      target.push_back(new Item(buf));
+    } else if (c == 3) { // F TODO
+    } else if (c == 4) { // A TODO
+    } else { // ERROR FIXME
+    }
+    if (c == 4) break;
+  }
+}
+
+// Loads a grammar from the file `fn`, one rule per line.
+//
+// Each rule is appended to `rules` and additionally to exactly one of:
+//   flat      - rules with arity 0
+//   start_nt  - rules whose first RHS item is a non-terminal
+//   start_t   - all remaining rules
+//
+// If the file cannot be opened, getline() fails immediately and all lists
+// stay empty. Rules are allocated with `new` and stored by pointer.
+Grammar::Grammar(string fn)
+{
+  ifstream ifs(fn);
+  string line;
+  while (getline(ifs, line)) {
+    // A line without any token would produce a Rule whose lhs is never
+    // assigned, so blank / whitespace-only lines are skipped.
+    if (line.find_first_not_of(" \t\r\n\v\f") == string::npos) continue;
+    G::Rule* r = new G::Rule(line);
+    rules.push_back(r);
+    if (r->arity == 0) {
+      flat.push_back(r);
+    } else if (r->rhs.front()->type == NON_TERMINAL) {
+      // arity > 0 implies at least one RHS item, so front() is valid here.
+      start_nt.push_back(r);
+    } else {
+      start_t.push_back(r);
+    }
+  }
+}
+
+// Returns the textual form of the wrapped symbol: T::repr() for a TERMINAL,
+// NT::repr() for anything else.
+string
+Item::repr() const
+{
+  return (type == TERMINAL) ? t->repr() : nt->repr();
+}
+
+// Streams Item::repr() of `i` to `os` and returns the stream.
+ostream&
+operator<<(ostream& os, const Item& i)
+{
+  os << i.repr();
+  return os;
+}
+
+// Formats this non-terminal as "NT<symbol,index>".
+string
+NT::repr() const
+{
+  ostringstream out;
+  out << "NT<" << symbol;
+  out << ',' << index;
+  out << '>';
+  return out.str();
+}
+
+// Streams NT::repr() of `nt` to `os` and returns the stream.
+ostream&
+operator<<(ostream& os, const NT& nt)
+{
+  os << nt.repr();
+  return os;
+}
+
+// Formats this terminal as "T<word>".
+string
+T::repr() const
+{
+  ostringstream out;
+  out << "T<";
+  out << word;
+  out << '>';
+  return out.str();
+}
+
+// Streams T::repr() of `t` to `os` and returns the stream.
+ostream&
+operator<<(ostream& os, const T& t)
+{
+  os << t.repr();
+  return os;
+}
+
+// Formats the rule as
+//   Rule<lhs:LHS, rhs:{I I ...}, target:{I I ...}, f:TODO, arity=N, map:TODO>
+// where each item is rendered via its own repr() and items are separated by
+// single spaces. The "f" and "map" parts are placeholders.
+//
+// Dereferences `lhs`, so the rule must have been built from a line that
+// contained an LHS token.
+string
+Rule::repr() const
+{
+  ostringstream os;
+  // NOTE(review): the start of this expression was garbled in the patch text
+  // (an angle-bracket span was stripped); reconstructed to match the closing
+  // ">" and the "rhs:" / "target:" labels below -- confirm against upstream.
+  os << "Rule<lhs:" << lhs->repr() << ", rhs:{";
+  for (auto it = rhs.begin(); it != rhs.end(); ++it) {
+    if (it != rhs.begin()) os << " ";
+    os << (*it)->repr();
+  }
+  os << "}, target:{";
+  for (auto it = target.begin(); it != target.end(); ++it) {
+    if (it != target.begin()) os << " ";
+    os << (*it)->repr();
+  }
+  os << "}"
+     << ", f:" << "TODO"
+     << ", arity=" << arity
+     << ", map:" << "TODO"
+     << ">";
+  return os.str();
+}
+
+// Streams Rule::repr() of `r` to `os` and returns the stream.
+ostream&
+operator<<(ostream& os, const Rule& r)
+{
+  os << r.repr();
+  return os;
+}
+
+// Writes every rule in `g.rules` to `os`, one repr() per line; each line is
+// terminated with endl, so the stream is flushed after every rule.
+ostream&
+operator<<(ostream& os, const Grammar& g)
+{
+  for (const auto& rule : g.rules) {
+    os << rule->repr() << endl;
+  }
+  return os;
+}
+
} // namespace
diff --git a/fast/grammar.hh b/fast/grammar.hh
index d17a331..3c7f208 100644
--- a/fast/grammar.hh
+++ b/fast/grammar.hh
@@ -1,6 +1,13 @@
#pragma once
+#include
#include
+#include
+#include
+#include
+#include