summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-08-23 22:59:16 +0100
committerPatrick Simianer <p@simianer.de>2014-08-23 22:59:16 +0100
commitcef65063cec641a93973b38a48e100fdd115db44 (patch)
tree32d5f10757e021a9fad01156fbff62a96212f006
parent190f68c880eb27506669e95e2bc0493e2ec42c4c (diff)
rewritten grammar
-rw-r--r--.gitignore1
-rw-r--r--fast/Makefile23
-rw-r--r--fast/README.md3
-rw-r--r--fast/grammar.cc273
-rw-r--r--fast/grammar.hh303
-rw-r--r--fast/hypergraph.cc23
-rw-r--r--fast/hypergraph.hh4
-rw-r--r--fast/main.cc3
-rw-r--r--fast/parse.cc55
-rw-r--r--fast/parse.hh103
-rw-r--r--fast/semiring.hh1
-rw-r--r--fast/sparse_vector.hh21
-rw-r--r--fast/test/Makefile16
-rwxr-xr-xfast/test/test_grammarbin0 -> 56832 bytes
-rw-r--r--fast/test/test_grammar.cc20
-rwxr-xr-xfast/test/test_sparse_vectorbin0 -> 44288 bytes
-rw-r--r--fast/test/test_sparse_vector.cc (renamed from fast/test_sparse_vector.cc)0
-rw-r--r--fast/test_grammar.cc17
-rw-r--r--fast/util.hh35
-rw-r--r--fast/weaver.hh1
20 files changed, 511 insertions, 391 deletions
diff --git a/.gitignore b/.gitignore
index 94218e0..d8a671e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ fast/test_grammar
fast/test_sparse_vector
util/make_pak
util/read_pak
+fast/gperftools-2.1/
diff --git a/fast/Makefile b/fast/Makefile
index 9e88076..1a7f5b9 100644
--- a/fast/Makefile
+++ b/fast/Makefile
@@ -1,24 +1,15 @@
COMPILER=g++
CFLAGS=-std=c++11 -O3
+TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
-all: grammar.o hypergraph.o main.cc
- $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver
+all: hypergraph.o main.cc
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver
-hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh sparse_vector.hh weaver.hh
- $(COMPILER) $(CFLAGS) -g -c hypergraph.cc
-
-grammar.o: grammar.cc grammar.hh sparse_vector.hh util.hh
- $(COMPILER) $(CFLAGS) -g -c grammar.cc
-
-test: test_grammar test_sparse_vector
-
-test_grammar: test_grammar.cc grammar.o
- $(COMPILER) $(CFLAGS) -lstdc++ -lm grammar.o test_grammar.cc -o test_grammar
-
-test_sparse_vector: test_sparse_vector.cc sparse_vector.hh
- $(COMPILER) $(CFLAGS) -lstdc++ -lm test_sparse_vector.cc -o test_sparse_vector
+hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh
+ $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc
clean:
- rm -f fast_weaver hypergraph.o grammar.o test_grammar test_sparse_vector
+ rm -f fast_weaver
+ rm -f hypergraph.o parse.o
diff --git a/fast/README.md b/fast/README.md
index 1d6bd04..f92245b 100644
--- a/fast/README.md
+++ b/fast/README.md
@@ -2,7 +2,6 @@ TODO
* sparse vector (unordered_map) -> where to store?
* parser
* Rule -> ChartItem -> Node ?
- * viterbi path/string
* k-best
* other semirings
* include language model
@@ -34,4 +33,6 @@ http://www.osl.iu.edu/research/mtl/download.php3
http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
https://software.intel.com/en-us/tbb_4.2_doc
http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+http://www.sgi.com/tech/stl/Rope.html
+http://www.cs.unc.edu/Research/compgeom/gzstream/
diff --git a/fast/grammar.cc b/fast/grammar.cc
deleted file mode 100644
index a003eb4..0000000
--- a/fast/grammar.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-#include "grammar.hh"
-
-
-namespace G {
-
-/*
- * G::NT
- *
- */
-NT::NT(string& s)
-{
- s.erase(0, 1); s.pop_back(); // remove '[' and ']'
- istringstream ss(s);
- if (ss >> index) { // [i]
- symbol = "";
- index = stoi(s);
-
- return;
- } else { // [X]
- symbol = s;
- index = 0;
-
- return;
- }
- string buf;
- size_t j = 0;
- index = 0; // default
- while (ss.good() && getline(ss, buf, ',')) {
- if (j == 0) {
- symbol = buf;
- } else {
- index = stoi(buf);
- }
- j++;
- }
-}
-
-string
-NT::repr() const
-{
- ostringstream os;
- os << "NT<" << symbol << "," << index << ">";
-
- return os.str();
-}
-
-string
-NT::escaped() const
-{
- ostringstream os;
- os << "[" << symbol;
- if (index > 0)
- os << "," << index;
- os << "]";
-
- return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const NT& nt)
-{
- return os << nt.repr();
-}
-
-/*
- * G::T
- *
- */
-T::T(const string& s)
-{
- word = s;
-}
-
-string
-T::repr() const
-{
- ostringstream os;
- os << "T<" << word << ">";
-
- return os.str();
-}
-
-string
-T::escaped() const
-{
- return util::json_escape(word);
-}
-
-ostream&
-operator<<(ostream& os, const T& t)
-{
- return os << t.repr();
-}
-
-
-/*
- * G::Item
- *
- * Better solve this by inheritance
- * -> rhs, target as vector<base class> ?
- *
- */
-Item::Item(string& s)
-{
- if (s.front() == '[' && s.back() == ']') {
- type = NON_TERMINAL;
- nt = new NT(s);
- } else {
- type = TERMINAL;
- t = new T(s);
- }
-}
-
-string
-Item::repr() const
-{
- ostringstream os;
- if (type == TERMINAL)
- os << t->repr();
- else
- os << nt->repr();
-
- return os.str();
-}
-
-string
-Item::escaped() const
-{
- ostringstream os;
- if (type == TERMINAL)
- os << t->escaped();
- else
- os << nt->escaped();
-
- return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const Item& i)
-{
- return os << i.repr();
-}
-
-/*
- * G::Rule
- *
- */
-Rule::Rule(const string& s)
-{
- from_s(this, s);
-}
-
-void
-Rule::from_s(Rule* r, const string& s)
-{
- stringstream ss(s);
- size_t j = 0;
- string buf;
- r->arity = 0;
- size_t index = 1;
- vector<G::NT*> rhs_nt;
- r->f = new Sv::SparseVector<string, score_t>();
- while (ss >> buf) {
- if (buf == "|||") { j++; continue; }
- if (j == 0) { // LHS
- r->lhs = new NT(buf);
- } else if (j == 1) { // RHS
- r->rhs.push_back(new Item(buf));
- if (r->rhs.back()->type == NON_TERMINAL) {
- rhs_nt.push_back(r->rhs.back()->nt);
- r->arity++;
- }
- } else if (j == 2) { // TARGET
- r->target.push_back(new Item(buf));
- if (r->target.back()->type == NON_TERMINAL) {
- r->order.insert(make_pair(index, r->target.back()->nt->index));
- if (r->target.back()->nt->symbol == "")
- r->target.back()->nt->symbol = rhs_nt[r->target.back()->nt->index-1]->symbol;
- index++;
- }
- } else if (j == 3) { // F TODO
- Sv::SparseVector<string, score_t>::from_s(r->f, buf); // FIXME this is slow!!!
- } else if (j == 4) { // A TODO
- } else {
- // ERROR
- }
- if (j == 4) break;
- }
-}
-
-string
-Rule::repr() const
-{
- ostringstream os;
- os << "Rule<lhs=" << lhs->repr() << \
- ", rhs:{";
- for (auto it = rhs.begin(); it != rhs.end(); it++) {
- os << (**it).repr();
- if (next(it) != rhs.end()) os << " ";
- }
- os << "}, target:{";
- for (auto it = target.begin(); it != target.end(); it++) {
- os << (**it).repr();
- if (next(it) != target.end()) os << " ";
- }
- os << "}" \
- ", f:" << f->repr() << \
- ", arity=" << arity << \
- ", map:" << "TODO" << \
- ">";
-
- return os.str();
-}
-
-string
-Rule::escaped() const
-{
- ostringstream os;
- os << lhs->escaped() << " ||| ";
- for (auto it = rhs.begin(); it != rhs.end(); it++) {
- os << (**it).escaped();
- if (next(it) != rhs.end()) os << " ";
- }
- os << " ||| ";
- for (auto it = target.begin(); it != target.end(); it++) {
- os << (**it).escaped();
- if (next(it) != target.end()) os << " ";
- }
- os << " ||| ";
- os << f->escaped();
- os << " ||| ";
- os << "TODO(alignment)";
-
- return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const Rule& r)
-{
- return os << r.repr();
-}
-
-/*
- * G::Grammmar
- *
- */
-Grammar::Grammar(const string& fn)
-{
- ifstream ifs(fn);
- string line;
- while (getline(ifs, line)) {
- G::Rule* r = new G::Rule(line);
- rules.push_back(r);
- if (r->arity == 0)
- flat.push_back(r);
- else if (r->rhs.front()->type == NON_TERMINAL)
- start_nt.push_back(r);
- else
- start_t.push_back(r);
- }
-}
-
-ostream&
-operator<<(ostream& os, const Grammar& g)
-{
- for (const auto it: g.rules)
- os << it->repr() << endl;
-
- return os;
-}
-
-} // namespace G
-
diff --git a/fast/grammar.hh b/fast/grammar.hh
index 1b9ac5a..e5acb8a 100644
--- a/fast/grammar.hh
+++ b/fast/grammar.hh
@@ -1,13 +1,14 @@
#pragma once
-#include <fstream>
#include <iostream>
+#include <fstream>
+#include <map>
#include <sstream>
#include <string>
-#include <map>
-#include <msgpack.hpp>
#include <vector>
+#include <msgpack.hpp>
+
#include "sparse_vector.hh"
#include "util.hh"
@@ -16,46 +17,138 @@ using namespace std;
namespace G {
-struct NT {
- string symbol;
- size_t index;
+enum item_type {
+ UNDEFINED,
+ TERMINAL,
+ NON_TERMINAL
+};
- NT() {};
- NT(string& s);
+struct Item {
+ virtual size_t index() const { return 0; }
+ virtual symbol_t symbol() const { return ""; }
+ virtual item_type type() const { return UNDEFINED; }
- string repr() const;
- string escaped() const;
+ virtual ostream& repr(ostream& os) const { return os << "Item<>"; }
+ virtual ostream& escaped(ostream& os) const { return os << ""; }
- friend ostream& operator<<(ostream& os, const NT& t);
+ friend ostream&
+ operator<<(ostream& os, const Item& i)
+ {
+ return i.repr(os);
+ };
};
-struct T {
- string word; // use word ids instead?
+struct NT : public Item {
+ symbol_t symbol_;
+ size_t index_;
+
+ NT() {}
+
+ NT(string const& s)
+ {
+ index_ = 0; // default
+ string t(s);
+ t.erase(0, 1); t.pop_back(); // remove '[' and ']'
+ istringstream ss(s);
+ if (ss >> index_) { // [i]
+ symbol_ = "";
+ index_ = stoi(s);
+
+ return;
+ } else { // [X]
+ symbol_ = s;
+
+ return;
+ }
+ string buf;
+ size_t j = 0;
+ while (ss.good() && getline(ss, buf, ',')) {
+ if (j == 0) {
+ symbol_ = buf;
+ } else {
+ index_ = stoi(buf);
+ }
+ j++;
+ }
+ }
- T(const string& s);
+ virtual size_t index() const { return index_; }
+ virtual symbol_t symbol() const { return symbol_; }
+ virtual item_type type() { return NON_TERMINAL; }
- string repr() const;
- string escaped() const;
+ virtual ostream&
+ repr(ostream& os) const
+ {
+ return os << "NT<" << symbol_ << "," << index_ << ">";
+ }
- friend ostream& operator<<(ostream& os, const NT& nt);
+ virtual ostream&
+ escaped(ostream& os) const
+ {
+ os << "[" << symbol_;
+ if (index_ > 0)
+ os << "," << index_;
+ os << "]";
+
+ return os;
+ }
};
-enum item_type {
- NON_TERMINAL,
- TERMINAL
+struct T : public Item {
+ symbol_t symbol_;
+
+ T(string const& s)
+ {
+ symbol_ = s;
+ }
+
+ virtual symbol_t symbol() const { return symbol_; }
+ virtual item_type type() { return TERMINAL; }
+
+ virtual ostream&
+ repr(ostream& os) const
+ {
+ return os << "T<" << symbol_ << ">";
+ }
+
+ virtual ostream&
+ escaped(ostream& os) const
+ {
+ os << util::json_escape(symbol_);
+ }
};
-struct Item {
- item_type type;
- NT* nt;
- T* t;
+struct Vocabulary
+{
+ unordered_map<symbol_t, size_t> m_;
+ vector<Item*> items_;
- Item(string& s);
+ bool
+ is_non_terminal(string const& s)
+ {
+ return s.front() == '[' && s.back() == ']';
+ }
- string repr() const;
- string escaped() const;
+ Item*
+ get(symbol_t const& s)
+ {
+ if (is_non_terminal(s))
+ return new NT(s);
+ if (m_.find(s) != m_.end())
+ return items_[m_[s]];
+ return add(s);
+ }
- friend ostream& operator<<(ostream& os, const Item& i);
+ Item*
+ add(symbol_t const& s)
+ {
+ size_t next_index_ = items_.size();
+ T* item = new T(s);
+ items_.push_back(item);
+ m_[s] = next_index_;
+
+ return item;
+ }
};
struct Rule {
@@ -65,35 +158,157 @@ struct Rule {
size_t arity;
Sv::SparseVector<string, score_t>* f;
map<size_t, size_t> order;
- string as_str_; // FIXME
+ string as_str_;
+
+ Rule() {}
+
+ Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); }
+
+ static void
+ from_s(Rule* r, string const& s, Vocabulary& vocab)
+ {
+ istringstream ss(s);
+ string buf;
+ size_t j = 0, i = 0;
+ r->arity = 0;
+ vector<NT*> rhs_non_terminals;
+ r->f = new Sv::SparseVector<string, score_t>();
+ while (ss >> buf) {
+ if (buf == "|||") { j++; continue; }
+ if (j == 0) { // left-hand side
+ r->lhs = new NT(buf);
+ } else if (j == 1) { // right-hand side
+ Item* item = vocab.get(buf);
+ r->rhs.push_back(item);
+ if (item->type() == NON_TERMINAL) {
+ r->arity++;
+ rhs_non_terminals.push_back(reinterpret_cast<NT*>(item));
+ }
+ } else if (j == 2) { // target
+ Item* item = vocab.get(buf);
+ if (item->type() == NON_TERMINAL) {
+ r->order[i] = item->index();
+ i++;
+ if (item->symbol() == "") { // only [1], [2] ... on target
+ reinterpret_cast<NT*>(item)->symbol_ = \
+ rhs_non_terminals[item->index()-1]->symbol();
+ }
+ }
+ r->target.push_back(item);
+ } else if (j == 3) { // feature vector
+ Sv::SparseVector<string, score_t>::from_s(r->f, buf);
+ // FIXME: this is slow!!! ^^^
+ } else if (j == 4) { // alignment
+ } else {
+ // error
+ }
+ if (j == 4) break;
+ }
+ }
- Rule() {};
- Rule(const string& s);
- static void from_s(Rule* r, const string& s);
+ ostream&
+ repr(ostream& os) const
+ {
+ os << "Rule<lhs=";
+ lhs->repr(os);
+ os << ", rhs:{";
+ for (auto it = rhs.begin(); it != rhs.end(); it++) {
+ (**it).repr(os);
+ if (next(it) != rhs.end()) os << " ";
+ }
+ os << "}, target:{";
+ for (auto it = target.begin(); it != target.end(); it++) {
+ (**it).repr(os);
+ if (next(it) != target.end()) os << " ";
+ }
+ os << "}, f:";
+ f->repr(os);
+ os << ", arity=" << arity << \
+ ", map:" << "TODO" << \
+ ">";
- string repr() const;
- string escaped() const;
+ return os;
+ }
- friend ostream& operator<<(ostream& os, const Rule& r);
+ ostream&
+ escaped(ostream& os) const
+ {
+ lhs->escaped(os);
+ os << " ||| ";
+ for (auto it = rhs.begin(); it != rhs.end(); it++) {
+ (**it).escaped(os);
+ if (next(it) != rhs.end()) os << " ";
+ }
+ os << " ||| ";
+ for (auto it = target.begin(); it != target.end(); it++) {
+ (**it).escaped(os);
+ if (next(it) != target.end()) os << " ";
+ }
+ os << " ||| ";
+ f->escaped(os);
+ os << " ||| ";
+ os << "TODO";
- void prep_for_serialization_() { as_str_ = escaped(); }; // FIXME
+ return os;
+ };
- MSGPACK_DEFINE(as_str_); // TODO
+ friend ostream&
+ operator<<(ostream& os, const Rule& r)
+ {
+ return r.repr(os);
+ };
+
+ // --
+ void
+ prep_for_serialization_()
+ {
+ ostringstream os;
+ escaped(os);
+ as_str_ = os.str();
+ };
+ MSGPACK_DEFINE(as_str_);
+ // ^^^ FIXME
};
struct Grammar {
vector<Rule*> rules;
vector<Rule*> flat;
- vector<Rule*> start_nt;
- vector<Rule*> start_t;
+ vector<Rule*> start_non_terminal;
+ vector<Rule*> start_terminal;
+
+ Grammar() {}
+
+ Grammar(string const& fn, Vocabulary& vocab)
+ {
+ ifstream ifs(fn);
+ string line;
+ while (getline(ifs, line)) {
+ G::Rule* r = new G::Rule(line, vocab);
+ rules.push_back(r);
+ if (r->arity == 0)
+ flat.push_back(r);
+ else if (r->rhs.front()->type() == NON_TERMINAL)
+ start_non_terminal.push_back(r);
+ else
+ start_terminal.push_back(r);
+ }
+ }
- Grammar() {};
- Grammar(const string& fn);
+ void add_glue();
+ // ^^^ TODO
+ void add_pass_through(const string& input);
+ // ^^^ TODO
- void add_glue(); // TODO
- void add_pass_through(const string& input); // TODO
+ friend ostream&
+ operator<<(ostream& os, Grammar const& g)
+ {
+ for (const auto it: g.rules) {
+ it->repr(os);
+ os << endl;
+ }
- friend ostream& operator<<(ostream& os, const Grammar& g);
+ return os;
+ }
};
} // namespace G
diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc
index a9a44f9..d9a51a5 100644
--- a/fast/hypergraph.cc
+++ b/fast/hypergraph.cc
@@ -69,11 +69,13 @@ viterbi(Hypergraph& hg)
void
viterbi_path(Hypergraph& hg, Path& p)
{
- list<Node*>::iterator root = \
+ //list<Node*>::iterator root = \
find_if(hg.nodes.begin(), hg.nodes.end(), \
[](Node* n) { return n->incoming.size() == 0; });
+ list<Node*>::iterator root = hg.nodes.begin();
- Hg::topological_sort(hg.nodes, root); // FIXME do I need to do this when reading from file?
+ //Hg::topological_sort(hg.nodes, root);
+ // ^^^ FIXME do I need to do this when reading from file?
Semiring::Viterbi<score_t> semiring;
Hg::init(hg.nodes, root, semiring);
@@ -111,11 +113,11 @@ derive(const Path& p, const Node* cur, vector<string>& carry)
unsigned j = 0;
for (auto it: next->rule->target) {
- if (it->type == G::NON_TERMINAL) {
+ if (it->type() == G::NON_TERMINAL) {
derive(p, next->tails[next->rule->order[j]], carry);
j++;
} else {
- carry.push_back(it->t->word);
+ carry.push_back(it->symbol());
}
}
}
@@ -123,7 +125,7 @@ derive(const Path& p, const Node* cur, vector<string>& carry)
namespace io {
void
-read(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
+read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn)
{
ifstream ifs(fn);
size_t i = 0, r, n, e;
@@ -145,7 +147,7 @@ read(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
string s;
o.convert(&s);
G::Rule* rule = new G::Rule;
- G::Rule::from_s(rule, s);
+ G::Rule::from_s(rule, s, vocab);
rules.push_back(rule);
} else if (i > r+2 && i <= r+n+2) {
Node* n = new Node;
@@ -193,7 +195,7 @@ write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
}
void
-manual(Hypergraph& hg, vector<G::Rule*>& rules)
+manual(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab)
{
// nodes
Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0;
@@ -228,7 +230,7 @@ manual(Hypergraph& hg, vector<G::Rule*>& rules)
rule_strs.push_back("[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| ||| ");
for (auto it: rule_strs) {
- rules.push_back(new G::Rule(it));
+ rules.push_back(new G::Rule(it, vocab));
rules.back()->f = new Sv::SparseVector<string, score_t>();
}
@@ -349,8 +351,9 @@ operator<<(ostream& os, const Edge& e)
"Edge<head=" << e.head->id << \
", tails=[" << _.str() << "]" \
", score=" << e.score << \
- ", rule:'" << e.rule->escaped() << "'" << \
- ", f=" << "TODO" << \
+ ", rule:'";
+ e.rule->escaped(os);
+ os << "', f=" << "TODO" << \
", arity=" << e.arity << \
", mark=" << e.mark << ">";
return os;
diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh
index 299a62d..1c48a88 100644
--- a/fast/hypergraph.hh
+++ b/fast/hypergraph.hh
@@ -34,7 +34,7 @@ struct Edge {
unsigned int mark = 0;
inline bool is_marked() { return mark >= arity; }
- friend ostream& operator<<(ostream& os, const Edge& s);
+ friend ostream& operator<<(ostream& os, const Edge& e);
size_t head_id_;
vector<size_t> tails_ids_; // node ids
@@ -89,7 +89,7 @@ derive(const Path& p, const Node* cur, vector<string>& carry);
namespace io {
void
-read(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn); // FIXME
+read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn); // FIXME
void
write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn); // FIXME
diff --git a/fast/main.cc b/fast/main.cc
index 08fcfcf..825ad59 100644
--- a/fast/main.cc
+++ b/fast/main.cc
@@ -6,8 +6,9 @@ int
main(int argc, char** argv)
{
Hg::Hypergraph hg;
+ G::Vocabulary y;
G::Grammar g;
- Hg::io::read(hg, g.rules, argv[1]);
+ Hg::io::read(hg, g.rules, y, argv[1]);
//Hg::io::manual(hg, g.rules);
clock_t begin = clock();
Hg::Path p;
diff --git a/fast/parse.cc b/fast/parse.cc
new file mode 100644
index 0000000..06c9fa0
--- /dev/null
+++ b/fast/parse.cc
@@ -0,0 +1,55 @@
+#include "parse.hh"
+
+
+namespace Parse {
+
+
+} //
+
+
+vector<G::T> tokenize(string s)
+{
+ istringstream ss(s);
+ vector<G::T> res;
+ while (ss.good()) {
+ string t;
+ ss >> t;
+ G::T i(t);
+ cout << i.word << endl;
+ res.push_back(i);
+ }
+ return res;
+}
+
+
+bool operator==(vector<G::Item> const& a, vector<G::Item> const& b)
+{
+ if (a.size() != b.size()) return false;
+ for (auto it: a)
+}
+
+int main(int argc, char** argv)
+{
+ string in("karten haie");
+ vector<G::T> tok = tokenize(in);
+ for (auto it: tok)
+ cout << it.word << ",";
+ cout << endl;
+ size_t n = tok.size();
+
+ G::Grammar g(argv[1]);
+
+ vector<Span> spans;
+ Parse::visit(spans, 1, 0, 6);
+ for (auto it: spans) {
+ cout << "(" << it.first << "," << it.second << ")" << endl;
+ }
+
+ Parse::Chart active(n);
+ Parse::Chart passive(n);
+
+ //init(tok, n, active, passive, g);
+
+ cout << *(g.flat.at(0)) << endl;
+}
+
diff --git a/fast/parse.hh b/fast/parse.hh
new file mode 100644
index 0000000..9fbcdea
--- /dev/null
+++ b/fast/parse.hh
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <vector>
+#include <utility>
+#include <sstream>
+#include <unordered_map>
+
+#include "grammar.hh"
+
+
+using namespace std;
+
+typedef pair<size_t,size_t> Span;
+namespace std {
+ template <>
+ struct hash<Span>
+ {
+ size_t
+ operator()(Span const& k) const
+ {
+ return ((hash<size_t>()(k.first)
+ ^ (hash<size_t>()(k.second) << 1)) >> 1);
+ }
+ };
+}
+
+namespace Parse {
+
+void visit(vector<Span>& p,
+ size_t i, size_t l, size_t r, size_t x=0)
+{
+ for (size_t span = i; span <= r-x; span++) {
+ for (size_t k = l; k <= r-span; k++) {
+ p.push_back(Span(k,k+span));
+ }
+ }
+}
+
+struct ChartItem
+{
+ Span span;
+ G::Rule const* rule;
+ vector<ChartItem*> tails;
+ size_t dot;
+
+ ChartItem(G::Rule* r)
+ {
+ rule = r;
+ dot = 0;
+ }
+
+ ChartItem(ChartItem const& o)
+ {
+ span.first = o.span.first;
+ span.second = o.span.second;
+ rule = o.rule;
+ for (auto it: o.tails)
+ tails.push_back(it);
+ dot = o.dot;
+ }
+};
+
+struct Chart
+{
+ size_t n_;
+ unordered_map<Span, vector<ChartItem*> > m_;
+ unordered_map<string,bool> b_;
+
+ vector<ChartItem*>& at(Span s)
+ {
+ return m_[s];
+ }
+
+ string h(ChartItem* item, Span s)
+ {
+ ostringstream ss;
+ ss << item->rule->lhs->symbol;
+ ss << s.first;
+ ss << s.second;
+ return ss.str();
+ }
+
+ void add(ChartItem* item, Span s)
+ {
+ m_[s].push_back(item);
+ b_[h(item, s)] = true;
+ }
+
+ Chart(size_t n)
+ {
+ }
+};
+
+
+void init(vector<G::T> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g)
+{
+ for (auto rule: g.flat) {
+ }
+}
+
+
+} //
+
diff --git a/fast/semiring.hh b/fast/semiring.hh
index 1c3ff1d..3f4ac08 100644
--- a/fast/semiring.hh
+++ b/fast/semiring.hh
@@ -1,7 +1,6 @@
#pragma once
-// TODO: others
namespace Semiring {
template<typename T>
diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh
index 3583240..9d815ff 100644
--- a/fast/sparse_vector.hh
+++ b/fast/sparse_vector.hh
@@ -20,6 +20,7 @@ struct SparseVector {
V zero = 0.f;
SparseVector() {};
+
SparseVector(string& s)
{
from_s(this, s);
@@ -147,10 +148,9 @@ struct SparseVector {
}
}
- string
- repr() const
+ ostream&
+ repr(ostream& os) const
{
- ostringstream os;
os << "SparseVector<{";
for (auto it = m_.cbegin(); it != m_.cend(); it++) {
os << "'" << it->first << "'=" << it->second;
@@ -159,12 +159,11 @@ struct SparseVector {
}
os << "}>";
- return os.str();
+ return os;
};
- string
- escaped(bool quote_keys=false) const {
- ostringstream os;
+ ostream&
+ escaped(ostream& os, bool quote_keys=false) const {
for (auto it = m_.cbegin(); it != m_.cend(); it++) {
if (quote_keys) os << '"';
os << util::json_escape(it->first);
@@ -173,10 +172,14 @@ struct SparseVector {
if (next(it) != m_.cend()) os << " ";
}
- return os.str();
+ return os;
};
- friend ostream& operator<<(ostream& os, const SparseVector& v) { return os << v.repr(); }
+ friend ostream&
+ operator<<(ostream& os, const SparseVector& v)
+ {
+ return v.repr(os);
+ }
};
} // namespace
diff --git a/fast/test/Makefile b/fast/test/Makefile
new file mode 100644
index 0000000..0140f63
--- /dev/null
+++ b/fast/test/Makefile
@@ -0,0 +1,16 @@
+COMPILER=g++
+CFLAGS=-std=c++11 -O3 -I../
+TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
+
+
+all: test_grammar test_sparse_vector
+
+test_grammar: test_grammar.cc ../grammar.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_grammar.cc -o test_grammar
+
+test_sparse_vector: test_sparse_vector.cc ../sparse_vector.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_sparse_vector.cc -o test_sparse_vector
+
+clean:
+ rm -f test_grammar test_sparse_vector
+
diff --git a/fast/test/test_grammar b/fast/test/test_grammar
new file mode 100755
index 0000000..088d55a
--- /dev/null
+++ b/fast/test/test_grammar
Binary files differ
diff --git a/fast/test/test_grammar.cc b/fast/test/test_grammar.cc
new file mode 100644
index 0000000..bbe76e7
--- /dev/null
+++ b/fast/test/test_grammar.cc
@@ -0,0 +1,20 @@
+#include <fstream>
+
+#include "grammar.hh"
+
+using namespace std;
+
+
+int
+main(int argc, char** argv)
+{
+ G::Vocabulary y;
+ G::Grammar g(argv[1], y);
+ for (auto it: g.rules) {
+ it->escaped(cout);
+ cout << endl;
+ }
+
+ return 0;
+}
+
diff --git a/fast/test/test_sparse_vector b/fast/test/test_sparse_vector
new file mode 100755
index 0000000..c06fe9e
--- /dev/null
+++ b/fast/test/test_sparse_vector
Binary files differ
diff --git a/fast/test_sparse_vector.cc b/fast/test/test_sparse_vector.cc
index 426bed1..426bed1 100644
--- a/fast/test_sparse_vector.cc
+++ b/fast/test/test_sparse_vector.cc
diff --git a/fast/test_grammar.cc b/fast/test_grammar.cc
deleted file mode 100644
index 3263edd..0000000
--- a/fast/test_grammar.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <fstream>
-
-#include "grammar.hh"
-
-using namespace std;
-
-
-int
-main(int argc, char** argv)
-{
- G::Grammar g(argv[1]);
- for (auto it: g.rules)
- cout << it->escaped() << endl;
-
- return 0;
-}
-
diff --git a/fast/util.hh b/fast/util.hh
index 2a28f16..c3e087e 100644
--- a/fast/util.hh
+++ b/fast/util.hh
@@ -7,23 +7,24 @@ using namespace std;
namespace util {
- inline string
- json_escape(const string& s) { // FIXME: only inline?
- ostringstream os;
- for (auto it = s.cbegin(); it != s.cend(); it++) {
- switch (*it) {
- case '"': os << "\\\""; break;
- case '\\': os << "\\\\"; break;
- case '\b': os << "\\b"; break;
- case '\f': os << "\\f"; break;
- case '\n': os << "\\n"; break;
- case '\r': os << "\\r"; break;
- case '\t': os << "\\t"; break;
- default: os << *it; break;
- }
- }
- return os.str();
- };
+inline string
+json_escape(const string& s) { // FIXME: only inline?
+ ostringstream os;
+ for (auto it = s.cbegin(); it != s.cend(); it++) {
+ switch (*it) {
+ case '"': os << "\\\""; break;
+ case '\\': os << "\\\\"; break;
+ case '\b': os << "\\b"; break;
+ case '\f': os << "\\f"; break;
+ case '\n': os << "\\n"; break;
+ case '\r': os << "\\r"; break;
+ case '\t': os << "\\t"; break;
+ default: os << *it; break;
+ }
+ }
+
+ return os.str();
+}
} // namespace util
diff --git a/fast/weaver.hh b/fast/weaver.hh
index e7c3238..39d5391 100644
--- a/fast/weaver.hh
+++ b/fast/weaver.hh
@@ -1,4 +1,5 @@
#pragma once
typedef double score_t;
+typedef string symbol_t;