From cef65063cec641a93973b38a48e100fdd115db44 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Sat, 23 Aug 2014 22:59:16 +0100
Subject: rewritten grammar
---
.gitignore | 1 +
fast/Makefile | 23 +--
fast/README.md | 3 +-
fast/grammar.cc | 273 ------------------------------------
fast/grammar.hh | 303 ++++++++++++++++++++++++++++++++++------
fast/hypergraph.cc | 23 +--
fast/hypergraph.hh | 4 +-
fast/main.cc | 3 +-
fast/parse.cc | 55 ++++++++
fast/parse.hh | 103 ++++++++++++++
fast/semiring.hh | 1 -
fast/sparse_vector.hh | 21 +--
fast/test/Makefile | 16 +++
fast/test/test_grammar | Bin 0 -> 56832 bytes
fast/test/test_grammar.cc | 20 +++
fast/test/test_sparse_vector | Bin 0 -> 44288 bytes
fast/test/test_sparse_vector.cc | 37 +++++
fast/test_grammar.cc | 17 ---
fast/test_sparse_vector.cc | 37 -----
fast/util.hh | 35 ++---
fast/weaver.hh | 1 +
21 files changed, 548 insertions(+), 428 deletions(-)
delete mode 100644 fast/grammar.cc
create mode 100644 fast/parse.cc
create mode 100644 fast/parse.hh
create mode 100644 fast/test/Makefile
create mode 100755 fast/test/test_grammar
create mode 100644 fast/test/test_grammar.cc
create mode 100755 fast/test/test_sparse_vector
create mode 100644 fast/test/test_sparse_vector.cc
delete mode 100644 fast/test_grammar.cc
delete mode 100644 fast/test_sparse_vector.cc
diff --git a/.gitignore b/.gitignore
index 94218e0..d8a671e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ fast/test_grammar
fast/test_sparse_vector
util/make_pak
util/read_pak
+fast/gperftools-2.1/
diff --git a/fast/Makefile b/fast/Makefile
index 9e88076..1a7f5b9 100644
--- a/fast/Makefile
+++ b/fast/Makefile
@@ -1,24 +1,15 @@
COMPILER=g++
CFLAGS=-std=c++11 -O3
+TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
-all: grammar.o hypergraph.o main.cc
- $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver
+all: hypergraph.o main.cc
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver
-hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh sparse_vector.hh weaver.hh
- $(COMPILER) $(CFLAGS) -g -c hypergraph.cc
-
-grammar.o: grammar.cc grammar.hh sparse_vector.hh util.hh
- $(COMPILER) $(CFLAGS) -g -c grammar.cc
-
-test: test_grammar test_sparse_vector
-
-test_grammar: test_grammar.cc grammar.o
- $(COMPILER) $(CFLAGS) -lstdc++ -lm grammar.o test_grammar.cc -o test_grammar
-
-test_sparse_vector: test_sparse_vector.cc sparse_vector.hh
- $(COMPILER) $(CFLAGS) -lstdc++ -lm test_sparse_vector.cc -o test_sparse_vector
+hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh
+ $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc
clean:
- rm -f fast_weaver hypergraph.o grammar.o test_grammar test_sparse_vector
+ rm -f fast_weaver
+ rm -f hypergraph.o parse.o
diff --git a/fast/README.md b/fast/README.md
index 1d6bd04..f92245b 100644
--- a/fast/README.md
+++ b/fast/README.md
@@ -2,7 +2,6 @@ TODO
* sparse vector (unordered_map) -> where to store?
* parser
* Rule -> ChartItem -> Node ?
- * viterbi path/string
* k-best
* other semirings
* include language model
@@ -34,4 +33,6 @@ http://www.osl.iu.edu/research/mtl/download.php3
http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
https://software.intel.com/en-us/tbb_4.2_doc
http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+http://www.sgi.com/tech/stl/Rope.html
+http://www.cs.unc.edu/Research/compgeom/gzstream/
diff --git a/fast/grammar.cc b/fast/grammar.cc
deleted file mode 100644
index a003eb4..0000000
--- a/fast/grammar.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-#include "grammar.hh"
-
-
-namespace G {
-
-/*
- * G::NT
- *
- */
-NT::NT(string& s)
-{
- s.erase(0, 1); s.pop_back(); // remove '[' and ']'
- istringstream ss(s);
- if (ss >> index) { // [i]
- symbol = "";
- index = stoi(s);
-
- return;
- } else { // [X]
- symbol = s;
- index = 0;
-
- return;
- }
- string buf;
- size_t j = 0;
- index = 0; // default
- while (ss.good() && getline(ss, buf, ',')) {
- if (j == 0) {
- symbol = buf;
- } else {
- index = stoi(buf);
- }
- j++;
- }
-}
-
-string
-NT::repr() const
-{
- ostringstream os;
- os << "NT<" << symbol << "," << index << ">";
-
- return os.str();
-}
-
-string
-NT::escaped() const
-{
- ostringstream os;
- os << "[" << symbol;
- if (index > 0)
- os << "," << index;
- os << "]";
-
- return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const NT& nt)
-{
- return os << nt.repr();
-}
-
-/*
- * G::T
- *
- */
-T::T(const string& s)
-{
- word = s;
-}
-
-string
-T::repr() const
-{
- ostringstream os;
- os << "T<" << word << ">";
-
- return os.str();
-}
-
-string
-T::escaped() const
-{
- return util::json_escape(word);
-}
-
-ostream&
-operator<<(ostream& os, const T& t)
-{
- return os << t.repr();
-}
-
-
-/*
- * G::Item
- *
- * Better solve this by inheritance
- * -> rhs, target as vector ?
- *
- */
-Item::Item(string& s)
-{
- if (s.front() == '[' && s.back() == ']') {
- type = NON_TERMINAL;
- nt = new NT(s);
- } else {
- type = TERMINAL;
- t = new T(s);
- }
-}
-
-string
-Item::repr() const
-{
- ostringstream os;
- if (type == TERMINAL)
- os << t->repr();
- else
- os << nt->repr();
-
- return os.str();
-}
-
-string
-Item::escaped() const
-{
- ostringstream os;
- if (type == TERMINAL)
- os << t->escaped();
- else
- os << nt->escaped();
-
- return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const Item& i)
-{
- return os << i.repr();
-}
-
-/*
- * G::Rule
- *
- */
-Rule::Rule(const string& s)
-{
- from_s(this, s);
-}
-
-void
-Rule::from_s(Rule* r, const string& s)
-{
- stringstream ss(s);
- size_t j = 0;
- string buf;
- r->arity = 0;
- size_t index = 1;
- vector<NT*> rhs_nt;
- r->f = new Sv::SparseVector();
- while (ss >> buf) {
- if (buf == "|||") { j++; continue; }
- if (j == 0) { // LHS
- r->lhs = new NT(buf);
- } else if (j == 1) { // RHS
- r->rhs.push_back(new Item(buf));
- if (r->rhs.back()->type == NON_TERMINAL) {
- rhs_nt.push_back(r->rhs.back()->nt);
- r->arity++;
- }
- } else if (j == 2) { // TARGET
- r->target.push_back(new Item(buf));
- if (r->target.back()->type == NON_TERMINAL) {
- r->order.insert(make_pair(index, r->target.back()->nt->index));
- if (r->target.back()->nt->symbol == "")
- r->target.back()->nt->symbol = rhs_nt[r->target.back()->nt->index-1]->symbol;
- index++;
- }
- } else if (j == 3) { // F TODO
- Sv::SparseVector::from_s(r->f, buf); // FIXME this is slow!!!
- } else if (j == 4) { // A TODO
- } else {
- // ERROR
- }
- if (j == 4) break;
- }
-}
-
-string
-Rule::repr() const
-{
- ostringstream os;
- os << "Rule<lhs:" << lhs->repr() << \
- ", rhs:{";
- for (auto it = rhs.begin(); it != rhs.end(); it++) {
- os << (**it).repr();
- if (next(it) != rhs.end()) os << " ";
- }
- os << "}, target:{";
- for (auto it = target.begin(); it != target.end(); it++) {
- os << (**it).repr();
- if (next(it) != target.end()) os << " ";
- }
- os << "}" \
- ", f:" << f->repr() << \
- ", arity=" << arity << \
- ", map:" << "TODO" << \
- ">";
-
- return os.str();
-}
-
-string
-Rule::escaped() const
-{
- ostringstream os;
- os << lhs->escaped() << " ||| ";
- for (auto it = rhs.begin(); it != rhs.end(); it++) {
- os << (**it).escaped();
- if (next(it) != rhs.end()) os << " ";
- }
- os << " ||| ";
- for (auto it = target.begin(); it != target.end(); it++) {
- os << (**it).escaped();
- if (next(it) != target.end()) os << " ";
- }
- os << " ||| ";
- os << f->escaped();
- os << " ||| ";
- os << "TODO(alignment)";
-
- return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const Rule& r)
-{
- return os << r.repr();
-}
-
-/*
- * G::Grammmar
- *
- */
-Grammar::Grammar(const string& fn)
-{
- ifstream ifs(fn);
- string line;
- while (getline(ifs, line)) {
- G::Rule* r = new G::Rule(line);
- rules.push_back(r);
- if (r->arity == 0)
- flat.push_back(r);
- else if (r->rhs.front()->type == NON_TERMINAL)
- start_nt.push_back(r);
- else
- start_t.push_back(r);
- }
-}
-
-ostream&
-operator<<(ostream& os, const Grammar& g)
-{
- for (const auto it: g.rules)
- os << it->repr() << endl;
-
- return os;
-}
-
-} // namespace G
-
diff --git a/fast/grammar.hh b/fast/grammar.hh
index 1b9ac5a..e5acb8a 100644
--- a/fast/grammar.hh
+++ b/fast/grammar.hh
@@ -1,13 +1,14 @@
#pragma once
-#include
#include
+#include
+#include