From 129a22cfcc7651daa4b11ed52e7870249f6373a5 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 16 Sep 2014 10:23:14 +0100 Subject: spring cleaning --- .gitignore | 13 +- .gitmodules | 2 +- Makefile | 58 ++++++- README.md | 39 +++++ external/json-cpp | 1 + fast/Makefile | 15 -- fast/README.md | 38 ----- fast/grammar.hh | 316 ---------------------------------- fast/hypergraph.cc | 363 ---------------------------------------- fast/hypergraph.hh | 103 ------------ fast/main.cc | 27 --- fast/parse.hh | 108 ------------ fast/semiring.hh | 35 ---- fast/sparse_vector.hh | 186 -------------------- fast/test/Makefile | 19 --- fast/test/test_grammar | Bin 60943 -> 0 bytes fast/test/test_grammar.cc | 20 --- fast/test/test_sparse_vector | Bin 44288 -> 0 bytes fast/test/test_sparse_vector.cc | 37 ---- fast/util.hh | 47 ------ fast/weaver.hh | 10 -- grammar.rb | 139 --------------- hg.rb | 222 ------------------------ main.rb | 72 -------- parse.rb | 210 ----------------------- prototype/grammar.rb | 137 +++++++++++++++ prototype/hypergraph.rb | 219 ++++++++++++++++++++++++ prototype/parse.rb | 207 +++++++++++++++++++++++ prototype/test_hg.rb | 32 ++++ prototype/test_parse.rb | 49 ++++++ prototype/weaver.rb | 70 ++++++++ src/fast_weaver.cc | 26 +++ src/grammar.hh | 334 ++++++++++++++++++++++++++++++++++++ src/hypergraph.cc | 362 +++++++++++++++++++++++++++++++++++++++ src/hypergraph.hh | 102 +++++++++++ src/make_pak.cc | 103 ++++++++++++ src/parse.hh | 301 +++++++++++++++++++++++++++++++++ src/read_pak.cc | 26 +++ src/semiring.hh | 35 ++++ src/sparse_vector.hh | 186 ++++++++++++++++++++ src/test_grammar.cc | 19 +++ src/test_parse.cc | 19 +++ src/test_sparse_vector.cc | 36 ++++ src/types.hh | 10 ++ src/util.hh | 47 ++++++ test/test_hg.rb | 32 ---- test/test_parse.rb | 49 ------ util/Makefile | 14 -- util/cdec2json.py | 96 ----------- util/json-cpp | 1 - util/make_pak.cc | 104 ------------ util/read_pak.cc | 27 --- 52 files changed, 2419 insertions(+), 2304 deletions(-) create mode 160000 external/json-cpp delete mode 100644 fast/Makefile delete mode 100644 fast/README.md delete mode 100644 fast/grammar.hh delete mode 100644 fast/hypergraph.cc delete mode 100644 fast/hypergraph.hh delete mode 100644 fast/main.cc delete mode 100644 fast/parse.hh delete mode 100644 fast/semiring.hh delete mode 100644 fast/sparse_vector.hh delete mode 100644 fast/test/Makefile delete mode 100755 fast/test/test_grammar delete mode 100644 fast/test/test_grammar.cc delete mode 100755 fast/test/test_sparse_vector delete mode 100644 fast/test/test_sparse_vector.cc delete mode 100644 fast/util.hh delete mode 100644 fast/weaver.hh delete mode 100644 grammar.rb delete mode 100644 hg.rb delete mode 100755 main.rb delete mode 100644 parse.rb create mode 100644 prototype/grammar.rb create mode 100644 prototype/hypergraph.rb create mode 100644 prototype/parse.rb create mode 100755 prototype/test_hg.rb create mode 100755 prototype/test_parse.rb create mode 100755 prototype/weaver.rb create mode 100644 src/fast_weaver.cc create mode 100644 src/grammar.hh create mode 100644 src/hypergraph.cc create mode 100644 src/hypergraph.hh create mode 100644 src/make_pak.cc create mode 100644 src/parse.hh create mode 100644 src/read_pak.cc create mode 100644 src/semiring.hh create mode 100644 src/sparse_vector.hh create mode 100644 src/test_grammar.cc create mode 100644 src/test_parse.cc create mode 100644 src/test_sparse_vector.cc create mode 100644 src/types.hh create mode 100644 src/util.hh delete mode 100755 test/test_hg.rb delete mode 100755 test/test_parse.rb delete mode 100644 util/Makefile delete mode 100755 util/cdec2json.py delete mode 160000 util/json-cpp delete mode 100644 util/make_pak.cc delete mode 100644 util/read_pak.cc diff --git a/.gitignore b/.gitignore index d8a671e..8bb6628 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,7 @@ *.o -fast/example/ -fast/fast_weaver -fast/test_grammar -fast/test_sparse_vector -util/make_pak -util/read_pak -fast/gperftools-2.1/ +fast_weaver +test_grammar +test_sparse_vector +make_pak +read_pak +external/gperftools-2.1/ diff --git a/.gitmodules b/.gitmodules index 843caa2..7fe83f1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "util/json-cpp"] - path = util/json-cpp + path = external/json-cpp url = https://github.com/ascheglov/json-cpp.git diff --git a/Makefile b/Makefile index f499591..56f89d4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,53 @@ -json_examples: - ./main.rb -w example/3/weights.init < example/3/in.sgm > example/3/3.json 2>/dev/null - ./main.rb -w example/3/weights.init -l < example/3/in.sgm > example/3/3-with-glue.json 2>/dev/null - ./main.rb -w example/glue/weights -l < example/glue/in.sgm > example/glue/glue.json 2>/dev/null - ./main.rb -w example/toy/weights < example/toy/in.sgm > example/toy/toy.json 2>/dev/null - ./main.rb -w example/toy/weights < example/toy/in-test.sgm > example/toy/toy-test.json 2>/dev/null +COMPILER=clang +CFLAGS=-std=c++11 -O3 -Wall +TCMALLOC=$(shell pwd)/external/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread +SRC=src + +all: $(SRC)/hypergraph.o $(SRC)/fast_weaver.cc + $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) $(SRC)/hypergraph.o \ + $(SRC)/fast_weaver.cc \ + -o fast_weaver + +$(SRC)/hypergraph.o: $(SRC)/hypergraph.cc $(SRC)/hypergraph.hh \ + $(SRC)/semiring.hh $(SRC)/sparse_vector.hh \ + $(SRC)/types.hh + $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) \ + $(SRC)/hypergraph.cc \ + -o $(SRC)/hypergraph.o + +util: make_pak read_pak + +make_pak: $(SRC)/make_pak.cc external/json-cpp/single_include/json-cpp.hpp \ + $(SRC)/hypergraph.hh $(SRC)/types.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack -I./external \ + $(SRC)/make_pak.cc \ + -o make_pak + +read_pak: $(SRC)/read_pak.cc + $(COMPILER) $(CFLAGS) -lstdc++ -lmsgpack \ + $(SRC)/read_pak.cc \ + -o read_pak + +test: test_grammar test_parse test_sparse_vector + +test_grammar: $(SRC)/test_grammar.cc $(SRC)/grammar.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \ + $(SRC)/test_grammar.cc \ + -o test_grammar + +test_parse: $(SRC)/test_parse.cc $(SRC)/parse.hh \ + $(SRC)/grammar.hh $(SRC)/util.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \ + $(SRC)/test_parse.cc \ + -o test_parse + +test_sparse_vector: $(SRC)/test_sparse_vector.cc $(SRC)/sparse_vector.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \ + $(SRC)/test_sparse_vector.cc \ + -o test_sparse_vector + +clean: + rm -f fast_weaver hypergraph.o + rm -f make_pak read_pak + rm -f test_grammar test_sparse_vector test_parse diff --git a/README.md b/README.md index 4cdcf31..e462d41 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,42 @@ +TODO + * sparse vector (unordered_map) -> where to store? + * parser + * Rule -> ChartItem -> Node ? + * k-best + * other semirings + * include language model + * compress/hash words/feature strings? + * cast? Rule -> Edge, ChartItem -> Node + * feature factory, observer + +Dependencies: + * MessagePack for object serialization [1] + * kenlm language model [2] + +This is Linux only. + + +[1] http://msgpack.org +[2] http://kheafield.com/code/kenlm/ + + +stuff to have a look at: +http://math.nist.gov/spblas/ +http://lapackpp.sourceforge.net/ +http://www.cvmlib.com/ +http://sourceforge.net/projects/lpp/ +http://math-atlas.sourceforge.net/ +http://www.netlib.org/lapack/ +http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c +http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack +http://www.osl.iu.edu/research/mtl/download.php3 +http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library +https://software.intel.com/en-us/tbb_4.2_doc +http://goog-perftools.sourceforge.net/doc/tcmalloc.html +http://www.sgi.com/tech/stl/Rope.html +http://www.cs.unc.edu/Research/compgeom/gzstream/ +https://github.com/facebook/folly/blob/6e46d468cf2876dd59c7a4dddcb4e37abf070b7a/folly/docs/Overview.md +--- not much to see here, yet (SCFG machine translation decoder in ruby, currently implements CKY+ parsing and hypergraph viterbi) diff --git a/external/json-cpp b/external/json-cpp new file mode 160000 index 0000000..4eb4b47 --- /dev/null +++ b/external/json-cpp @@ -0,0 +1 @@ +Subproject commit 4eb4b47cf4d622bc7bf34071d6b68fc5beb37051 diff --git a/fast/Makefile b/fast/Makefile deleted file mode 100644 index 1a7f5b9..0000000 --- a/fast/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -COMPILER=g++ -CFLAGS=-std=c++11 -O3 -TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread - - -all: hypergraph.o main.cc - $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver - -hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh - $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc - -clean: - rm -f fast_weaver - rm -f hypergraph.o parse.o - diff --git a/fast/README.md b/fast/README.md deleted file mode 100644 index f92245b..0000000 --- a/fast/README.md +++ /dev/null @@ -1,38 +0,0 @@ -TODO - * sparse vector (unordered_map) -> where to store? - * parser - * Rule -> ChartItem -> Node ? - * k-best - * other semirings - * include language model - * compress/hash words/feature strings? - * cast? Rule -> Edge, ChartItem -> Node - * feature factory, observer - -Dependencies: - * MessagePack for object serialization [1] - * kenlm language model [2] - -This is Linux only. - - -[1] http://msgpack.org -[2] http://kheafield.com/code/kenlm/ - - -stuff to have a look at: -http://math.nist.gov/spblas/ -http://lapackpp.sourceforge.net/ -http://www.cvmlib.com/ -http://sourceforge.net/projects/lpp/ -http://math-atlas.sourceforge.net/ -http://www.netlib.org/lapack/ -http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c -http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack -http://www.osl.iu.edu/research/mtl/download.php3 -http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library -https://software.intel.com/en-us/tbb_4.2_doc -http://goog-perftools.sourceforge.net/doc/tcmalloc.html -http://www.sgi.com/tech/stl/Rope.html -http://www.cs.unc.edu/Research/compgeom/gzstream/ - diff --git a/fast/grammar.hh b/fast/grammar.hh deleted file mode 100644 index 4906c46..0000000 --- a/fast/grammar.hh +++ /dev/null @@ -1,316 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -#include "sparse_vector.hh" -#include "util.hh" - -using namespace std; - - -namespace G { - -enum item_type { - UNDEFINED, - TERMINAL, - NON_TERMINAL -}; - -struct Item { - virtual size_t index() const { return 0; } - virtual symbol_t symbol() const { return ""; } - virtual item_type type() const { return UNDEFINED; } - - virtual ostream& repr(ostream& os) const { return os << "Item<>"; } - virtual ostream& escaped(ostream& os) const { return os << ""; } - - friend ostream& - operator<<(ostream& os, const Item& i) - { - return i.repr(os); - }; -}; - -struct NT : public Item { - symbol_t symbol_; - size_t index_; - - NT() {} - - NT(string const& s) - { - index_ = 0; // default - string t(s); - t.erase(0, 1); t.pop_back(); // remove '[' and ']' - istringstream ss(t); - if (ss >> index_) { // [i] - symbol_ = ""; - index_ = stoi(s); - return; - } else { - ss.clear(); - string buf; - size_t j = 0; - while (ss.good() && getline(ss, buf, ',')) { - if (j == 0) { - symbol_ = buf; - } else { - index_ = stoi(buf); - } - j++; - } - } - } - - virtual size_t index() const { return index_; } - virtual symbol_t symbol() const { return symbol_; } - virtual item_type type() const { return NON_TERMINAL; } - - virtual ostream& - repr(ostream& os) const - { - return os << "NT<" << symbol_ << "," << index_ << ">"; - } - - virtual ostream& - escaped(ostream& os) const - { - os << "[" << symbol_; - if (index_ > 0) - os << "," << index_; - os << "]"; - - return os; - } -}; - -struct T : public Item { - symbol_t symbol_; - - T(string const& s) - { - symbol_ = s; - } - - virtual symbol_t symbol() const { return symbol_; } - virtual item_type type() const { return TERMINAL; } - - virtual ostream& - repr(ostream& os) const - { - return os << "T<" << symbol_ << ">"; - } - - virtual ostream& - escaped(ostream& os) const - { - os << util::json_escape(symbol_); - } -}; - -struct Vocabulary -{ - unordered_map m_; - vector items_; - - bool - is_non_terminal(string const& s) - { - return s.front() == '[' && s.back() == ']'; - } - - Item* - get(symbol_t const& s) - { - if (is_non_terminal(s)) - return new NT(s); - if (m_.find(s) != m_.end()) - return items_[m_[s]]; - return add(s); - } - - Item* - add(symbol_t const& s) - { - size_t next_index_ = items_.size(); - T* item = new T(s); - items_.push_back(item); - m_[s] = next_index_; - - return item; - } -}; - -struct Rule { - NT* lhs; - vector rhs; - vector target; - size_t arity; -Sv::SparseVector* f; - map order; - string as_str_; - - Rule() {} - - Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); } - - static void - from_s(Rule* r, string const& s, Vocabulary& vocab) - { - istringstream ss(s); - string buf; - size_t j = 0, i = 1; - r->arity = 0; - vector rhs_non_terminals; - r->f = new Sv::SparseVector(); - while (ss >> buf) { - if (buf == "|||") { j++; continue; } - if (j == 0) { // left-hand side - r->lhs = new NT(buf); - } else if (j == 1) { // right-hand side - Item* item = vocab.get(buf); - r->rhs.push_back(item); - if (item->type() == NON_TERMINAL) { - r->arity++; - rhs_non_terminals.push_back(reinterpret_cast(item)); - } - } else if (j == 2) { // target - Item* item = vocab.get(buf); - if (item->type() == NON_TERMINAL) { - r->order.insert(make_pair(i, item->index())); - i++; - if (item->symbol() == "") { // only [1], [2] ... on target - reinterpret_cast(item)->symbol_ = \ - rhs_non_terminals[item->index()-1]->symbol(); - } - } - r->target.push_back(item); - } else if (j == 3) { // feature vector - Sv::SparseVector::from_s(r->f, buf); - // FIXME: this is slow!!! ^^^ - } else if (j == 4) { // alignment - } else { - // error - } - if (j == 4) break; - } - } - - ostream& - repr(ostream& os) const - { - os << "Rulerepr(os); - os << ", rhs:{"; - for (auto it = rhs.begin(); it != rhs.end(); it++) { - (**it).repr(os); - if (next(it) != rhs.end()) os << " "; - } - os << "}, target:{"; - for (auto it = target.begin(); it != target.end(); it++) { - (**it).repr(os); - if (next(it) != target.end()) os << " "; - } - os << "}, f:"; - f->repr(os); - os << ", arity=" << arity << \ - ", order:{"; - for (auto it = order.begin(); it != order.end(); it++) { - os << it->first << "->" << it->second; - if (next(it) != order.end()) os << ", "; - } - os << "}>"; - - return os; - } - - ostream& - escaped(ostream& os) const - { - lhs->escaped(os); - os << " ||| "; - for (auto it = rhs.begin(); it != rhs.end(); it++) { - (**it).escaped(os); - if (next(it) != rhs.end()) os << " "; - } - os << " ||| "; - for (auto it = target.begin(); it != target.end(); it++) { - (**it).escaped(os); - if (next(it) != target.end()) os << " "; - } - os << " ||| "; - f->escaped(os); - os << " ||| " << \ - "TODO"; - - return os; - }; - - friend ostream& - operator<<(ostream& os, Rule const& r) - { - return r.repr(os); - }; - - // -- - void - prep_for_serialization_() - { - ostringstream os; - escaped(os); - as_str_ = os.str(); - }; - MSGPACK_DEFINE(as_str_); - // ^^^ FIXME -}; - -struct Grammar { - vector rules; - vector flat; - vector start_non_terminal; - vector start_terminal; - - Grammar() {} - - Grammar(string const& fn, Vocabulary& vocab) - { - ifstream ifs(fn); - string line; - while (getline(ifs, line)) { - G::Rule* r = new G::Rule(line, vocab); - rules.push_back(r); - if (r->arity == 0) - flat.push_back(r); - else if (r->rhs.front()->type() == NON_TERMINAL) - start_non_terminal.push_back(r); - else - start_terminal.push_back(r); - } - } - - void add_glue(); - // ^^^ TODO - void add_pass_through(const string& input); - // ^^^ TODO - - friend ostream& - operator<<(ostream& os, Grammar const& g) - { - for (const auto it: g.rules) { - it->repr(os); - os << endl; - } - - return os; - } -}; - -} // namespace G - diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc deleted file mode 100644 index 2b33ff4..0000000 --- a/fast/hypergraph.cc +++ /dev/null @@ -1,363 +0,0 @@ -#include "hypergraph.hh" - - -namespace Hg { - -template void -init(const list& nodes, const list::iterator root, const Semiring& semiring) -{ - for (const auto it: nodes) - it->score = semiring.null; - (**root).score = semiring.one; -} - -void -reset(const list nodes, const vector edges) -{ - for (const auto it: nodes) - it->mark = 0; - for (auto it: edges) - it->mark = 0; -} - -void -topological_sort(list& nodes, const list::iterator root) -{ - auto p = root; - auto to = nodes.begin(); - while (to != nodes.end()) { - if ((**p).is_marked()) { - for (const auto e: (**p).outgoing) { // explore edges - e->mark++; - if (e->is_marked()) { - e->head->mark++; - } - } - } - if ((**p).is_marked()) { - nodes.splice(to, nodes, p); - to++; - p = to; - } else { - ++p; - } - } -} - -void -viterbi(Hypergraph& hg) -{ - list::iterator root = \ - find_if(hg.nodes.begin(), hg.nodes.end(), \ - [](Node* n) { return n->incoming.size() == 0; }); - - Hg::topological_sort(hg.nodes, root); - Semiring::Viterbi semiring; - Hg::init(hg.nodes, root, semiring); - - for (const auto n: hg.nodes) { - for (const auto e: n->incoming) { - score_t s = semiring.one; - for (const auto m: e->tails) { - s = semiring.multiply(s, m->score); - } - n->score = semiring.add(n->score, semiring.multiply(s, e->score)); - } - } -} - -void -viterbi_path(Hypergraph& hg, Path& p) -{ - list::iterator root = \ - find_if(hg.nodes.begin(), hg.nodes.end(), \ - [](Node* n) { return n->incoming.size() == 0; }); - //list::iterator root = hg.nodes.begin(); - - Hg::topological_sort(hg.nodes, root); - // ^^^ FIXME do I need to do this when reading from file? - Semiring::Viterbi semiring; - Hg::init(hg.nodes, root, semiring); - - for (auto n: hg.nodes) { - Edge* best_edge; - bool best = false; - for (auto e: n->incoming) { - score_t s = semiring.one; - for (auto m: e->tails) { - s = semiring.multiply(s, m->score); - } - if (n->score < semiring.multiply(s, e->score)) { // find max - best_edge = e; - best = true; - } - n->score = semiring.add(n->score, semiring.multiply(s, e->score)); - } - if (best) - p.push_back(best_edge); - } -} - - -void -derive(const Path& p, const Node* cur, vector& carry) -{ - Edge* next; - for (auto it: p) { - if (it->head->symbol == cur->symbol && - it->head->left == cur->left && - it->head->right == cur->right) { - next = it; - } - } // FIXME this is probably not so good - - unsigned j = 0; - for (auto it: next->rule->target) { - if (it->type() == G::NON_TERMINAL) { - derive(p, next->tails[next->rule->order[j]], carry); - j++; - } else { - carry.push_back(it->symbol()); - } - } -} - -namespace io { - -void -read(Hypergraph& hg, vector& rules, G::Vocabulary& vocab, const string& fn) -{ - ifstream ifs(fn); - size_t i = 0, r, n, e; - msgpack::unpacker pac; - while(true) { - pac.reserve_buffer(32*1024); - size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity()); - pac.buffer_consumed(bytes); - msgpack::unpacked result; - while(pac.next(&result)) { - msgpack::object o = result.get(); - if (i == 0) { - o.convert(&r); - } else if (i == 1) { - o.convert(&n); - } else if (i == 2) { - o.convert(&e); - } else if (i > 2 && i <= r+2) { - string s; - o.convert(&s); - G::Rule* rule = new G::Rule; - G::Rule::from_s(rule, s, vocab); - rules.push_back(rule); - } else if (i > r+2 && i <= r+n+2) { - Node* n = new Node; - o.convert(n); - hg.nodes.push_back(n); - hg.nodes_by_id[n->id] = n; - } else if (i > n+2 && i <= r+n+e+2) { - Edge* e = new Edge; - e->arity = 0; - o.convert(e); - e->head = hg.nodes_by_id[e->head_id_]; - hg.edges.push_back(e); - hg.nodes_by_id[e->head_id_]->incoming.push_back(e); - e->arity = 0; - for (auto it = e->tails_ids_.begin(); it != e->tails_ids_.end(); it++) { - hg.nodes_by_id[*it]->outgoing.push_back(e); - e->tails.push_back(hg.nodes_by_id[*it]); - e->arity++; - } - e->rule = rules[e->rule_id_]; - } else { - // ERROR - } - i++; - } - if (!bytes) break; - } -} - -void -write(Hypergraph& hg, vector& rules, const string& fn) // FIXME -{ - FILE* file = fopen(fn.c_str(), "wb"); - msgpack::fbuffer fbuf(file); - msgpack::pack(fbuf, rules.size()); - msgpack::pack(fbuf, hg.nodes.size()); - msgpack::pack(fbuf, hg.edges.size()); - for (auto it = rules.cbegin(); it != rules.cend(); it++) - msgpack::pack(fbuf, **it); - for (auto it = hg.nodes.cbegin(); it != hg.nodes.cend(); it++) - msgpack::pack(fbuf, **it); - for (auto it = hg.edges.cbegin(); it != hg.edges.cend(); it++) - msgpack::pack(fbuf, **it); - fclose(file); -} - -void -manual(Hypergraph& hg, vector& rules, G::Vocabulary& vocab) -{ - // nodes - Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0; - hg.nodes.push_back(a); hg.nodes_by_id[a->id] = a; - Node* b = new Node; b->id = 1; b->symbol = "NP"; b->left = 0; b->right = 1; b->mark = 0; - hg.nodes.push_back(b); hg.nodes_by_id[b->id] = b; - Node* c = new Node; c->id = 2; c->symbol = "V"; c->left = 1; c->right = 2; c->mark = 0; - hg.nodes.push_back(c); hg.nodes_by_id[c->id] = c; - Node* d = new Node; d->id = 3; d->symbol = "JJ"; d->left = 3; d->right = 4; d->mark = 0; - hg.nodes.push_back(d); hg.nodes_by_id[d->id] = d; - Node* e = new Node; e->id = 4; e->symbol = "NN"; e->left = 3; e->right = 5; e->mark = 0; - hg.nodes.push_back(e); hg.nodes_by_id[e->id] = e; - Node* f = new Node; f->id = 5; f->symbol = "NP"; f->left = 2; f->right = 5; f->mark = 0; - hg.nodes.push_back(f); hg.nodes_by_id[f->id] = f; - Node* g = new Node; g->id = 6; g->symbol = "NP"; g->left = 1; g->right = 5; g->mark = 0; - hg.nodes.push_back(g); hg.nodes_by_id[g->id] = g; - Node* h = new Node; h->id = 7; h->symbol = "S"; h->left = 0; h->right = 6; h->mark = 0; - hg.nodes.push_back(h); hg.nodes_by_id[h->id] = h; - - // rules - vector rule_strs; - rule_strs.push_back("[NP] ||| ich ||| i ||| ||| "); - rule_strs.push_back("[V] ||| sah ||| saw ||| ||| "); - rule_strs.push_back("[JJ] ||| kleines ||| small ||| ||| "); - rule_strs.push_back("[JJ] ||| kleines ||| little ||| ||| "); - rule_strs.push_back("[NN] ||| kleines haus ||| small house ||| ||| "); - rule_strs.push_back("[NN] ||| kleines haus ||| little house ||| ||| "); - rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| ||| "); - rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| ||| "); - rule_strs.push_back("[NP] ||| ein [NN,1] ||| a [NN,1] ||| ||| "); - rule_strs.push_back("[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| ||| "); - rule_strs.push_back("[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| ||| "); - - for (auto it: rule_strs) { - rules.push_back(new G::Rule(it, vocab)); - rules.back()->f = new Sv::SparseVector(); - } - - // edges - Edge* q = new Edge; q->head = hg.nodes_by_id[1]; q->tails.push_back(hg.nodes_by_id[0]); q->score = 0.367879441171; - q->arity = 1; q->mark = 0; - hg.edges.push_back(q); - hg.nodes_by_id[1]->incoming.push_back(q); - hg.nodes_by_id[0]->outgoing.push_back(q); - q->rule = rules[0]; - - Edge* p = new Edge; p->head = hg.nodes_by_id[2]; p->tails.push_back(hg.nodes_by_id[0]); p->score = 0.606530659713; - p->arity = 1; p->mark = 0; - hg.edges.push_back(p); - hg.nodes_by_id[2]->incoming.push_back(p); - hg.nodes_by_id[0]->outgoing.push_back(p); - p->rule = rules[1]; - - Edge* r = new Edge; r->head = hg.nodes_by_id[3]; r->tails.push_back(hg.nodes_by_id[0]); r->score = 1.0; - r->arity = 1; r->mark = 0; - hg.edges.push_back(r); - hg.nodes_by_id[3]->incoming.push_back(r); - hg.nodes_by_id[0]->outgoing.push_back(r); - r->rule = rules[2]; - - Edge* s = new Edge; s->head = hg.nodes_by_id[3]; s->tails.push_back(hg.nodes_by_id[0]); s->score = 1.0; - s->arity = 1; s->mark = 0; - hg.edges.push_back(s); - hg.nodes_by_id[3]->incoming.push_back(s); - hg.nodes_by_id[0]->outgoing.push_back(s); - s->rule = rules[3]; - - Edge* t = new Edge; t->head = hg.nodes_by_id[4]; t->tails.push_back(hg.nodes_by_id[0]); t->score = 1.0; - t->arity = 1; t->mark = 0; - hg.edges.push_back(t); - hg.nodes_by_id[4]->incoming.push_back(t); - hg.nodes_by_id[0]->outgoing.push_back(t); - t->rule = rules[4]; - - Edge* u = new Edge; u->head = hg.nodes_by_id[4]; u->tails.push_back(hg.nodes_by_id[0]); u->score = 1.0; - u->arity = 1; u->mark = 0; - hg.edges.push_back(u); - hg.nodes_by_id[4]->incoming.push_back(u); - hg.nodes_by_id[0]->outgoing.push_back(u); - u->rule = rules[5]; - - Edge* v = new Edge; v->head = hg.nodes_by_id[4]; v->tails.push_back(hg.nodes_by_id[3]); v->score = 1.0; - v->arity = 1; v->mark = 0; - hg.edges.push_back(v); - hg.nodes_by_id[4]->incoming.push_back(v); - hg.nodes_by_id[3]->outgoing.push_back(v); - v->rule = rules[6]; - - Edge* w = new Edge; w->head = hg.nodes_by_id[4]; w->tails.push_back(hg.nodes_by_id[3]); w->score = 2.71828182846; - w->arity = 1; w->mark = 0; - hg.edges.push_back(w); - hg.nodes_by_id[4]->incoming.push_back(w); - hg.nodes_by_id[3]->outgoing.push_back(w); - w->rule = rules[7]; - - Edge* x = new Edge; x->head = hg.nodes_by_id[5]; x->tails.push_back(hg.nodes_by_id[4]); x->score = 1.0; - x->arity = 1; x->mark = 0; - hg.edges.push_back(x); - hg.nodes_by_id[5]->incoming.push_back(x); - hg.nodes_by_id[4]->outgoing.push_back(x); - x->rule = rules[8]; - - Edge* y = new Edge; y->head = hg.nodes_by_id[6]; y->tails.push_back(hg.nodes_by_id[2]); y->tails.push_back(hg.nodes_by_id[5]); y->score = 1.0; - y->arity = 2; y->mark = 0; - hg.edges.push_back(y); - hg.nodes_by_id[6]->incoming.push_back(y); - hg.nodes_by_id[2]->outgoing.push_back(y); - hg.nodes_by_id[5]->outgoing.push_back(y); - y->rule = rules[9]; - - Edge* z = new Edge; z->head = hg.nodes_by_id[7]; z->tails.push_back(hg.nodes_by_id[1]); z->tails.push_back(hg.nodes_by_id[6]); z->score = 1.0; - z->arity = 2; z->mark = 0; - hg.edges.push_back(z); - hg.nodes_by_id[7]->incoming.push_back(z); - hg.nodes_by_id[1]->outgoing.push_back(z); - hg.nodes_by_id[6]->outgoing.push_back(z); - z->rule = rules[10]; -} - -} // namespace Hg::io - -/* - * Hg::Node - * - */ -ostream& -operator<<(ostream& os, const Node& n) -{ - os << \ - "Node"; - return os; -} - -/* - * Hg::Edge - * - */ -ostream& -operator<<(ostream& os, const Edge& e) -{ - ostringstream _; - for (auto it: e.tails) { - _ << it->id; - if (it != e.tails.back()) _ << ","; - } - os << \ - "Edgeid << \ - ", tails=[" << _.str() << "]" \ - ", score=" << e.score << \ - ", rule:'"; - e.rule->escaped(os); - os << "', f=" << "TODO" << \ - ", arity=" << e.arity << \ - ", mark=" << e.mark << ">"; - return os; -} - -} // namespace Hg - diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh deleted file mode 100644 index 1c48a88..0000000 --- a/fast/hypergraph.hh +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "grammar.hh" -#include "semiring.hh" -#include "sparse_vector.hh" -#include "weaver.hh" - -using namespace std; - - -namespace Hg { - -struct Node; - -struct Edge { - Node* head; - vector tails; - score_t score; - G::Rule* rule; - unsigned int arity = 0; - unsigned int mark = 0; - - inline bool is_marked() { return mark >= arity; } - friend ostream& operator<<(ostream& os, const Edge& e); - - size_t head_id_; - vector tails_ids_; // node ids - size_t rule_id_; - - MSGPACK_DEFINE(head_id_, tails_ids_, rule_id_, score, arity); -}; - -struct Node { - size_t id; - string symbol; - short left; - short right; - score_t score; - vector incoming; - vector outgoing; - unsigned int mark; - - inline bool is_marked() { return mark >= incoming.size(); }; - friend ostream& operator<<(ostream& os, const Node& n); - - MSGPACK_DEFINE(id, symbol, left, right, score); -}; - -struct Hypergraph { - list nodes; - vector edges; - unordered_map nodes_by_id; - unsigned int arity; -}; - -template void -init(const list& nodes, const list::iterator root, const Semiring& semiring); - -void -reset(const list nodes, const vector edges); - -void -topological_sort(list& nodes, const list::iterator root); - -void -viterbi(Hypergraph& hg); - -typedef vector Path; - -void -viterbi_path(Hypergraph& hg, Path& p); - -void -derive(const Path& p, const Node* cur, vector& carry); - -namespace io { - -void -read(Hypergraph& hg, vector& rules, G::Vocabulary& vocab, const string& fn); // FIXME - -void -write(Hypergraph& hg, vector& rules, const string& fn); // FIXME - -void -manual(Hypergraph& hg, vector& rules); - -} // namespace - -} // namespace - diff --git a/fast/main.cc b/fast/main.cc deleted file mode 100644 index 825ad59..0000000 --- a/fast/main.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "hypergraph.hh" -#include - - -int -main(int argc, char** argv) -{ - Hg::Hypergraph hg; - G::Vocabulary y; - G::Grammar g; - Hg::io::read(hg, g.rules, y, argv[1]); - //Hg::io::manual(hg, g.rules); - clock_t begin = clock(); - Hg::Path p; - Hg::viterbi_path(hg, p); - vector s; - Hg::derive(p, p.back()->head, s); - for (auto it: s) - cout << it << " "; - cout << endl; - clock_t end = clock(); - double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; - cout << elapsed_secs << " s" << endl; - - return 0; -} - diff --git a/fast/parse.hh b/fast/parse.hh deleted file mode 100644 index 33ea9ce..0000000 --- a/fast/parse.hh +++ /dev/null @@ -1,108 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "grammar.hh" -#include "util.hh" -#include "weaver.hh" - - -using namespace std; - -typedef pair Span; -namespace std { - template <> - struct hash - { - size_t - operator()(Span const& k) const - { - return ((hash()(k.first) - ^ (hash()(k.second) << 1)) >> 1); - } - }; -} - -namespace Parse { - -void visit(vector& p, - size_t i, size_t l, size_t r, size_t x=0) -{ - for (size_t span = i; span <= r-x; span++) { - for (size_t k = l; k <= r-span; k++) { - p.push_back(Span(k,k+span)); - } - } -} - -struct ChartItem -{ - Span span; - G::Rule const* rule; - vector tails; - size_t dot; - - ChartItem(G::Rule* r) - { - rule = r; - dot = 0; - } - - ChartItem(ChartItem const& o) - { - span.first = o.span.first; - span.second = o.span.second; - rule = o.rule; - for (auto it: o.tails) - tails.push_back(it); - dot = o.dot; - } -}; - -struct Chart -{ - size_t n_; - unordered_map > m_; - unordered_map b_; - - vector& at(Span s) - { - return m_[s]; - } - - string h(ChartItem* item, Span s) - { - ostringstream ss; - item->rule->lhs->symbol(); - ss << s.first; - ss << s.second; - - return ss.str(); - } - - void add(ChartItem* item, Span s) - { - m_[s].push_back(item); - b_[h(item, s)] = true; - } - - Chart(size_t n) - { - } -}; - - -void -init(vector const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) -{ - for (auto rule: g.rules) { - cout << *rule << endl; - } -} - - -} // - diff --git a/fast/semiring.hh b/fast/semiring.hh deleted file mode 100644 index 3f4ac08..0000000 --- a/fast/semiring.hh +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - - -namespace Semiring { - -template -struct Viterbi { - T one = 1.0; - T null = 0.0; - - T add(T x, T y); - T multiply(T x, T y); - T convert(T x); -}; - -template T -Viterbi::add(T x, T y) -{ - return max(x, y); -} - -template T -Viterbi::multiply(T x, T y) -{ - return x * y; -} - -template T -Viterbi::convert(T x) -{ - return (T)x; -} - -} // namespace - diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh deleted file mode 100644 index 9d815ff..0000000 --- a/fast/sparse_vector.hh +++ /dev/null @@ -1,186 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#include "util.hh" -#include "weaver.hh" - -using namespace std; - - -namespace Sv { - -template -struct SparseVector { - unordered_map m_; - V zero = 0.f; - - SparseVector() {}; - - SparseVector(string& s) - { - from_s(this, s); - }; - - void - insert(K k, V v) { m_[k] = v; }; - - V - dot(SparseVector& other) - { - V r; - unordered_map* o = &m_; - auto b = m_.cbegin(); - auto e = m_.cend(); - if (other.size() < size()) { - b = other.m_.cbegin(); - e = other.m_.cend(); - o = &other.m_; - } - for (auto it = b; it != e; it++) - r += it->second * o->at(it->first); - - return r; - }; - - size_t - size() - { - return m_.size(); - } - - V& - operator[](const K& k) - { - return at(k); - }; - - const V& - at(const K& k) const - { - if (m_.find(k) == m_.end()) - return zero; - else - return m_.at(k); - } - - SparseVector - operator+(const SparseVector& other) const - { - SparseVector v; - v.m_.insert(m_.cbegin(), m_.cend()); - v.m_.insert(other.m_.cbegin(), other.m_.cend()); - for (const auto it: v.m_) - v.m_[it.first] = this->at(it.first) + other.at(it.first); - - return v; - }; - - SparseVector& - operator+=(const SparseVector& other) - { - for (const auto it: other.m_) - m_[it.first] += it.second; - - return *this; - }; - - SparseVector - operator-(const SparseVector& other) const - { - SparseVector v; - v.m_.insert(m_.cbegin(), m_.cend()); - v.m_.insert(other.m_.cbegin(), other.m_.cend()); - for (const auto it: v.m_) - v.m_[it.first] = this->at(it.first) - other.at(it.first); - - return v; - }; - - SparseVector& - operator-=(const SparseVector& other) - { - for (const auto it: other.m_) - m_[it.first] -= it.second; - - return *this; - }; - - SparseVector - operator*(V f) const - { - SparseVector v; - for (const auto it: m_) - v.m_[it.first] = this->at(it.first) * f; - - return v; - }; - - SparseVector& - operator*=(V f) - { - for (const auto it: m_) - m_[it.first] *= f; - - return *this; - }; - - static void - from_s(SparseVector* w, const string& s) - { - stringstream ss(s); - while (!ss.eof()) { - string t; - ss >> t; - size_t eq = t.find_first_of("="); - if (eq == string::npos) { - return; - } - t.replace(eq, 1, " "); - stringstream tt(t); - K k; V v; - tt >> k >> v; - w->m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); - } - } - - ostream& - repr(ostream& os) const - { - os << "SparseVector<{"; - for (auto it = m_.cbegin(); it != m_.cend(); it++) { - os << "'" << it->first << "'=" << it->second; - if (next(it) != m_.end()) - os << ", "; - } - os << "}>"; - - return os; - }; - - ostream& - escaped(ostream& os, bool quote_keys=false) const { - for (auto it = m_.cbegin(); it != m_.cend(); it++) { - if (quote_keys) os << '"'; - os << util::json_escape(it->first); - if (quote_keys) os << '"'; - os << "=" << it->second; - if (next(it) != m_.cend()) os << " "; - } - - return os; - }; - - friend ostream& - operator<<(ostream& os, const SparseVector& v) - { - return v.repr(os); - } -}; - -} // namespace - diff --git a/fast/test/Makefile b/fast/test/Makefile deleted file mode 100644 index 65e97ef..0000000 --- a/fast/test/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -COMPILER=g++ -CFLAGS=-std=c++11 -O3 -I../ -TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread - - -all: test_grammar test_sparse_vector test_parse - -test_grammar: test_grammar.cc ../grammar.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_grammar.cc -o test_grammar - -test_sparse_vector: test_sparse_vector.cc ../sparse_vector.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_sparse_vector.cc -o test_sparse_vector - -test_parse: test_parse.cc ../parse.hh ../grammar.hh ../util.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_parse.cc -o test_parse - -clean: - rm -f test_grammar test_sparse_vector - diff --git a/fast/test/test_grammar b/fast/test/test_grammar deleted file mode 100755 index 6cf7ad5..0000000 Binary files a/fast/test/test_grammar and /dev/null differ diff --git a/fast/test/test_grammar.cc b/fast/test/test_grammar.cc deleted file mode 100644 index bbe76e7..0000000 --- a/fast/test/test_grammar.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include - -#include "grammar.hh" - -using namespace std; - - -int -main(int argc, char** argv) -{ - G::Vocabulary y; - G::Grammar g(argv[1], y); - for (auto it: g.rules) { - it->escaped(cout); - cout << endl; - } - - return 0; -} - diff --git a/fast/test/test_sparse_vector b/fast/test/test_sparse_vector deleted file mode 100755 index c06fe9e..0000000 Binary files a/fast/test/test_sparse_vector and /dev/null differ diff --git a/fast/test/test_sparse_vector.cc b/fast/test/test_sparse_vector.cc deleted file mode 100644 index 426bed1..0000000 --- a/fast/test/test_sparse_vector.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "sparse_vector.hh" - - -int -main(void) -{ - Sv::SparseVector a; - a.insert("1", 1); - a.insert("2", 2); - cout << "a:" << a << endl; - - Sv::SparseVector b; - b.insert("2", 2); - cout << "b:" << b << endl; - - Sv::SparseVector c = a + b; - cout << "a+b:" << c << endl; - - a += b; - cout << "a+=b:" << a << endl; - - a -= b; - cout << "a-=b:" << a << endl; - - cout << "a*2:" << a*2 << endl; - - a *= 2; - cout << "a*=2:" << a << endl; - - string s("\"a\"=2 \"b\"=3"); - Sv::SparseVector* sv = new Sv::SparseVector(s); - cout << *sv << endl; - cout << sv->dot(*sv) << endl; - - return 0; -} - diff --git a/fast/util.hh b/fast/util.hh deleted file mode 100644 index 9ce19da..0000000 --- a/fast/util.hh +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include - -#include "weaver.hh" - -using namespace std; - - -namespace util { - -inline string -json_escape(const string& s) -{ - ostringstream os; - for (auto it = s.cbegin(); it != s.cend(); it++) { - switch (*it) { - case '"': os << "\\\""; break; - case '\\': os << "\\\\"; break; - case '\b': os << "\\b"; break; - case '\f': os << "\\f"; break; - case '\n': os << "\\n"; break; - case '\r': os << "\\r"; break; - case '\t': os << "\\t"; break; - default: os << *it; break; - } - } - - return os.str(); -} - -inline vector -tokenize(string s) -{ - istringstream ss(s); - vector r; - while (ss.good()) { - string buf; - ss >> buf; - r.push_back(buf); - } - - return r; -} - -} // namespace util - diff --git a/fast/weaver.hh b/fast/weaver.hh deleted file mode 100644 index e89b4dd..0000000 --- a/fast/weaver.hh +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -using namespace std; - - -typedef double score_t; -typedef string symbol_t; - diff --git a/grammar.rb b/grammar.rb deleted file mode 100644 index e91c0f6..0000000 --- a/grammar.rb +++ /dev/null @@ -1,139 +0,0 @@ -module Grammar - - -class T - attr_accessor :word - - def initialize word - @word = word - end - - def to_s - @word - end -end - -class NT - attr_accessor :symbol, :index - - def initialize symbol=nil, index=-1 - @symbol = symbol - @index = index - end - - #FIXME? symbol should not contain [, ] or ", - # else we're in trouble - def from_s s - @symbol, @index = s.delete('[]').split ',' - @symbol.strip! - @index = @index.to_i-1 - end - - def self.from_s s - new = NT.new - new.from_s s - return new - end - - #FIXME? indexed by 1 - def to_s - return "[#{@symbol},#{@index+1}]" if @index>=0 - return "[#{@symbol}]" - end -end - -class Rule - attr_accessor :lhs, :rhs, :target, :map, :f - - def initialize lhs=NT.new, rhs=[], target=[], map=[], f=SparseVector.new, arity=0 - @lhs = lhs - @rhs = rhs - @target = target - @map = map - @f = f - @arity = arity - end - - def read_rhs_ s, make_meta=false - a = [] - s.split.map { |x| - x.strip! - if x[0] == '[' && x[x.size-1] == ']' - a << NT.from_s(x) - if make_meta - @map << a.last.index - @arity += 1 - end - else - a << T.new(x) - end - } - return a - end - - def from_s s - lhs, rhs, target, f = splitpipe s, 3 - @lhs = NT.from_s lhs - @rhs = read_rhs_ rhs, true - @target = read_rhs_ target - @f = (f ? SparseVector.from_kv(f, '=', ' ') : SparseVector.new) - end - - def self.from_s s - r = Rule.new - r.from_s s - return r - end - - def to_s - "#{@lhs.to_s} ||| #{@rhs.map { |x| x.to_s }.join ' '} ||| #{@target.map { |x| x.to_s }.join ' '}" - end -end - -class Grammar - attr_accessor :rules, :start_nt, :start_t, :flat - - def initialize fn - @rules = []; @start_nt = []; @start_t = []; @flat = [] - n = 0 - ReadFile.readlines_strip(fn).each_with_index { |s,i| - STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%40==0 - n += 1 - @rules << Rule.from_s(s) - if @rules.last.rhs.first.class == NT - @start_nt << @rules.last - else - if rules.last.arity == 0 - @flat << @rules.last - else - @start_t << @rules.last - end - end - } - STDERR.write " #{n}\n" - end - - def to_s - @rules.map { |r| r.to_s }.join "\n" - end - - def add_glue - @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol| - @rules << Rule.new(NT.new('S'), [NT.new(symbol, 0)], [NT.new(symbol, 0)], [0]) - @start_nt << @rules.last - @rules << Rule.new(NT.new('S'), [NT.new('S', 0), NT.new('X', 1)], [NT.new('S', 0), NT.new('X', 1)], [0, 1]) - @start_nt << @rules.last - } - end - - def add_pass_through a - a.each { |word| - @rules << Rule.new(NT.new('X'), [T.new(word)], [T.new(word)]) - @flat << @rules.last - } - end -end - - -end #module - diff --git a/hg.rb b/hg.rb deleted file mode 100644 index a68407c..0000000 --- a/hg.rb +++ /dev/null @@ -1,222 +0,0 @@ -require 'zipf' -require_relative 'grammar' - - -module HG - - -class HG::Node - attr_accessor :id, :symbol, :left, :right, :outgoing, :incoming, :score - - def initialize id=nil, symbol='', span=[-1,-1], outgoing=[], incoming=[], score=nil - @id = id - @symbol = symbol - @left = span[0] - @right = span[1] - @outgoing = outgoing - @incoming = incoming - @score = score - end - - def to_s - "Node" - end -end - -class HG::Hypergraph - attr_accessor :nodes, :edges, :nodes_by_id - - def initialize nodes=[], edges=[], nodes_by_id={} - @nodes = nodes - @edges = edges - @nodes_by_id = nodes_by_id - @arity_ = nil - end - - def arity - @arity_ = @edges.map { |e| e.arity }.max if !@arity_ - return @arity_ - end - - def reset - @edges.each { |e| e.mark = 0 } - end - - def to_s - "Hypergraph" - end - - def to_json weights=nil - json_s = "{\n" - json_s += "\"weights\":#{(weights ? weights.to_json : '{}')},\n" - json_s += "\"nodes\":\n" - json_s += "[\n" - json_s += @nodes.map { |n| - "{ \"id\":#{n.id}, \"cat\":\"#{n.symbol.to_json.slice(1..-1).chomp('"')}\", \"span\":[#{n.left},#{n.right}] }" - }.join ",\n" - json_s += "\n],\n" - json_s += "\"edges\":\n" - json_s += "[\n" - json_s += @edges.map { |e| - "{ \"head\":#{e.head.id}, \"rule\":#{e.rule.to_json}, \"tails\":#{e.tails.map{ |n| n.id }}, \"f\":#{(e.f ? e.f.to_json : '{}')} }" - }.join ",\n" - json_s += "\n]\n" - json_s += "}\n" - - return json_s - - end -end - -class HG::Hyperedge - attr_accessor :head, :tails, :score, :f, :mark, :rule - - def initialize head=Node.new, tails=[], score=0.0, f=SparseVector.new, rule=nil - @head = head - @tails = tails - @score = score - @f = f - @mark = 0 - @rule = (rule.class==String ? Grammar::Rule.from_s(rule) : rule) - end - - def arity - return @tails.size - end - - def marked? - arity == @mark - end - - def to_s - "Hyperedge" - end -end - -def HG::topological_sort nodes - sorted = [] - s = nodes.reject { |n| !n.incoming.empty? } - while !s.empty? - sorted << s.shift - sorted.last.outgoing.each { |e| - next if e.marked? - e.mark += 1 - s << e.head if e.head.incoming.reject{ |f| f.marked? }.empty? - } - end - return sorted -end - -def HG::init nodes, semiring, root - nodes.each { |n| n.score=semiring.null } - root.score = semiring.one -end - -def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - toposorted.each { |n| - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) - } - } -end - -def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - best_path = [] - toposorted.each { |n| - best_edge = nil - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - if n.score < semiring.multiply.call(s, e.score) # ViterbiSemiring add - best_edge = e - end - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) - } - best_path << best_edge if best_edge - } - return best_path, toposorted.last.score -end - -def HG::k_best hypergraph, root, semiring=nil - #TODO -end - -def HG::all_paths hypergraph, root - toposorted = topological_sort hypergraph.nodes - paths = [[]] - toposorted.each { |n| - next if n.incoming.empty? - new_paths = [] - while !paths.empty? - p = paths.pop - n.incoming.each { |e| - new_paths << p+[e] - } - end - paths = new_paths - } - return paths -end - -def HG::derive path, cur, carry - edge = path.select { |e| e.head.symbol==cur.symbol \ - && e.head.left==cur.left \ - && e.head.right==cur.right }.first - j = 0 - edge.rule.target.each { |i| - if i.class == Grammar::NT - derive path, edge.tails[edge.rule.map[j]], carry - j += 1 - else - carry << i - end - } - return carry -end - -def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false - nodes = [] - edges = [] - nodes_by_id = {} - h = JSON.parse File.new(fn).read - w = SparseVector.from_h h['weights'] - h['nodes'].each { |x| - n = Node.new x['id'], x['symbol'], x['span'] - nodes << n - nodes_by_id[n.id] = n - } - h['edges'].each { |x| - e = Hyperedge.new(nodes_by_id[x['head']], \ - x['tails'].map { |j| nodes_by_id[j] }.to_a, \ - (x['score'] ? semiring.convert.call(x['score'].to_f) : nil), \ - (x['f'] ? SparseVector.from_h(x['f']) : SparseVector.new), \ - x['rule']) - if x['f'] - if log_weights - e.score = Math.exp(w.dot(e.f)) - else - e.score = w.dot(e.f) - end - end - e.tails.each { |m| - m.outgoing << e - } - e.head.incoming << e - edges << e - } - return Hypergraph.new(nodes, edges), nodes_by_id -end - - -end #module - diff --git a/main.rb b/main.rb deleted file mode 100755 index e41d0f1..0000000 --- a/main.rb +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env ruby - -require 'trollop' -require 'xmlsimple' -require_relative 'parse' - - -def read_grammar fn, add_glue, add_pass_through, input=nil - STDERR.write "> reading grammar '#{fn}'\n" - grammar = Grammar::Grammar.new fn - if add_glue - STDERR.write ">> adding glue rules\n" - grammar.add_glue - end - if add_pass_through - STDERR.write ">> adding pass-through rules\n" - grammar.add_pass_through input - end - return grammar -end - -def main - cfg = Trollop::options do - opt :input, "", :type => :string, :default => '-', :short => '-i' - opt :grammar, "", :type => :string, :default => nil, :short => '-g' - opt :weights, "", :type => :string, :default => nil, :short => '-w' - opt :add_glue, "", :type => :bool, :default => false, :short => '-l' - opt :add_pass_through, "", :type => :bool, :default => false, :short => '-p' - end - - grammar = nil - if cfg[:grammar] - grammar = read_grammar cfg[:grammar], cfg[:add_glue], cfg[:add_pass_through] - end - - STDERR.write "> reading input from '#{cfg[:input]}'\n" - ReadFile.readlines_strip(cfg[:input]).each { |input| - - x = XmlSimple.xml_in(input) - input = x['content'].split - n = input.size - - if x['grammar'] - grammar = read_grammar x['grammar'], cfg[:add_glue], cfg[:add_pass_through], input - end - - STDERR.write "> initializing charts\n" - passive_chart = Parse::Chart.new n - active_chart = Parse::Chart.new n - Parse::init input, n, active_chart, passive_chart, grammar - - STDERR.write "> parsing\n" - Parse::parse input, n, active_chart, passive_chart, grammar - - weights = SparseVector.from_kv(ReadFile.read(cfg[:weights]), ' ', "\n") - if !weights - weights = SparseVector.new - end - - hypergraph = passive_chart.to_hg weights - - STDERR.write "> viterbi\n" - semiring = ViterbiSemiring.new - path, score = HG::viterbi_path hypergraph, hypergraph.nodes_by_id[-1], semiring - s = HG::derive path, path.last.head, [] - STDERR.write " #{s.map { |i| i.word }.join ' '} ||| #{Math.log score}\n" - } -end - - -main - diff --git a/parse.rb b/parse.rb deleted file mode 100644 index 1f69c91..0000000 --- a/parse.rb +++ /dev/null @@ -1,210 +0,0 @@ -require 'zipf' -require_relative 'grammar' -require_relative 'hg' - - -module Parse - - -def Parse::visit i, l, r, x=0 - i.upto(r-x) { |span| - l.upto(r-span) { |k| - yield k, k+span - } - } -end - -class Chart - - def initialize n - @n = n - @m = [] - (n+1).times { - a = [] - (n+1).times { a << [] } - @m << a - } - @b = {} - end - - def at i, j - @m[i][j] - end - - def add item, i, j - at(i,j) << item - @b["#{i},#{j},#{item.lhs.symbol}"] = true - end - - def has symbol, i, j - return @b["#{i},#{j},#{symbol}"] - end - - def to_hg weights=SparseVector.new - nodes = [] - edges = [] - nodes_by_id = {} - nodes << HG::Node.new(-1, "root", [-1,-1]) - nodes_by_id[-1] = nodes.last - id = 0 - seen = {} - Parse::visit(1, 0, @n) { |i,j| - self.at(i,j).each { |item| - _ = "#{item.lhs.symbol},#{i},#{j}" - if !seen[_] - nodes << HG::Node.new(id, item.lhs.symbol, [i,j]) - nodes_by_id[id] = nodes.last - seen[_] = id - id += 1 - end - } - } - - Parse::visit(1, 0, @n) { |i,j| - self.at(i,j).each { |item| - edges << HG::Hyperedge.new(nodes_by_id[seen[item.lhs.symbol+','+i.to_s+','+j.to_s]], \ - (item.tail_spans.empty? ? [nodes_by_id[-1]] : item.rhs.zip((0..item.rhs.size-1).map{|q| item.tail_spans[q] }).select{|x| x[0].class==Grammar::NT }.map{|x| nodes_by_id[seen["#{x[0].symbol},#{x[1].left},#{x[1].right}"]]}), \ - Math.exp(weights.dot(item.f)), - item.f, - Grammar::Rule.new(item.lhs, item.rhs, item.target, item.map, item.f), \ - ) - edges.last.head.incoming << edges.last - edges.last.tails.each { |n| n.outgoing << edges.last } - } - } - return HG::Hypergraph.new(nodes, edges, nodes_by_id) - end -end - -Span = Struct.new(:left, :right) - -class Item < Grammar::Rule - attr_accessor :left, :right, :tail_spans, :dot, :f - - def initialize rule_or_item, left, right, dot - @lhs = Grammar::NT.new rule_or_item.lhs.symbol, rule_or_item.lhs.index - @left = left - @right = right - @rhs = [] - @tail_spans = {} # refers to source side, use @map - @f = rule_or_item.f - @map = (rule_or_item.map ? rule_or_item.map.dup : []) - rule_or_item.rhs.each_with_index { |x,i| # duplicate rhs partially - @rhs << x - if x.class == Grammar::NT - begin - if i >= dot - @tail_spans[i] = Span.new(-1, -1) - else - @tail_spans[i] = rule_or_item.tail_spans[i].dup - end - rescue - @tail_spans[i] = Span.new(-1, -1) - end - end - } - @dot = dot - @target = rule_or_item.target - end - - def to_s - "(#{@left}, #{@right}) [#{tail_spans.map{|k,v| k.to_s+'('+v.left.to_s+','+v.right.to_s+')'}.join ' '}] {#{@map.to_s.delete('[]')}} #{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] ||| #{@target.map{|x|x.to_s}.join ' '}" - end -end - -def Parse::init input, n, active_chart, passive_chart, grammar - grammar.flat.each { |r| - input.each_index { |i| - if input[i, r.rhs.size] == r.rhs.map { |x| x.word } - passive_chart.add Item.new(r, i, i+r.rhs.size, r.rhs.size), i, i+r.rhs.size - end - } - } -end - -def Parse::scan item, input, limit, passive_chart - while item.rhs[item.dot].class == Grammar::T - return false if item.right==limit - if item.rhs[item.dot].word == input[item.right] - item.dot += 1 - item.right += 1 - else - return false - end - end - return true -end - -def Parse::parse input, n, active_chart, passive_chart, grammar - visit(1, 0, n) { |i,j| - - STDERR.write " span(#{i},#{j})\n" - - # try to apply rules starting with T - grammar.start_t.select { |r| r.rhs.first.word == input[i] }.each { |r| - new_item = Item.new r, i, i, 0 - active_chart.at(i,j) << new_item if scan new_item, input, j, passive_chart - } - - # seed active chart - grammar.start_nt.each { |r| - next if r.rhs.size > j-i - active_chart.at(i,j) << Item.new(r, i, i, 0) - } - - # parse - new_symbols = [] - remaining_items = [] - while !active_chart.at(i,j).empty? - active_item = active_chart.at(i,j).pop - advanced = false - visit(1, i, j, 1) { |k,l| - if passive_chart.has active_item.rhs[active_item.dot].symbol, k, l - if k == active_item.right - new_item = Item.new active_item, active_item.left, l, active_item.dot+1 - new_item.tail_spans[new_item.dot-1] = Span.new(k,l) - if scan new_item, input, j, passive_chart - if new_item.dot == new_item.rhs.size - if new_item.left == i && new_item.right == j - new_symbols << new_item.lhs.symbol if !new_symbols.include? new_item.lhs.symbol - passive_chart.add new_item, i, j - advanced = true - end - else - if new_item.right+(new_item.rhs.size-(new_item.dot)) <= j - active_chart.at(i,j) << new_item - advanced = true - end - end - end - end - end - } - if !advanced - remaining_items << active_item - end - end - - - # 'self-filling' step - new_symbols.each { |s| - remaining_items.each { |item| - next if item.dot!=0 - next if item.rhs[item.dot].class!=Grammar::NT - if item.rhs[item.dot].symbol == s - new_item = Item.new item, i, j, item.dot+1 - new_item.tail_spans[new_item.dot-1] = Span.new(i,j) - if new_item.dot==new_item.rhs.size - new_symbols << new_item.lhs.symbol if !new_symbols.include? new_item.lhs.symbol - passive_chart.add new_item, i, j - end - end - } - } - - } -end - - -end #module - diff --git a/prototype/grammar.rb b/prototype/grammar.rb new file mode 100644 index 0000000..42ffbc0 --- /dev/null +++ b/prototype/grammar.rb @@ -0,0 +1,137 @@ +module Grammar + +class T + attr_accessor :word + + def initialize word + @word = word + end + + def to_s + @word + end +end + +class NT + attr_accessor :symbol, :index + + def initialize symbol=nil, index=-1 + @symbol = symbol + @index = index + end + + #FIXME? symbol should not contain [, ] or ", + # else we're in trouble + def from_s s + @symbol, @index = s.delete('[]').split ',' + @symbol.strip! + @index = @index.to_i-1 + end + + def self.from_s s + new = NT.new + new.from_s s + return new + end + + #FIXME? indexed by 1 + def to_s + return "[#{@symbol},#{@index+1}]" if @index>=0 + return "[#{@symbol}]" + end +end + +class Rule + attr_accessor :lhs, :rhs, :target, :map, :f + + def initialize lhs=NT.new, rhs=[], target=[], map=[], f=SparseVector.new, arity=0 + @lhs = lhs + @rhs = rhs + @target = target + @map = map + @f = f + @arity = arity + end + + def read_rhs_ s, make_meta=false + a = [] + s.split.map { |x| + x.strip! + if x[0] == '[' && x[x.size-1] == ']' + a << NT.from_s(x) + if make_meta + @map << a.last.index + @arity += 1 + end + else + a << T.new(x) + end + } + return a + end + + def from_s s + lhs, rhs, target, f = splitpipe s, 3 + @lhs = NT.from_s lhs + @rhs = read_rhs_ rhs, true + @target = read_rhs_ target + @f = (f ? SparseVector.from_kv(f, '=', ' ') : SparseVector.new) + end + + def self.from_s s + r = Rule.new + r.from_s s + return r + end + + def to_s + "#{@lhs.to_s} ||| #{@rhs.map { |x| x.to_s }.join ' '} ||| #{@target.map { |x| x.to_s }.join ' '}" + end +end + +class Grammar + attr_accessor :rules, :start_nt, :start_t, :flat + + def initialize fn + @rules = []; @start_nt = []; @start_t = []; @flat = [] + n = 0 + ReadFile.readlines_strip(fn).each_with_index { |s,i| + STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%40==0 + n += 1 + @rules << Rule.from_s(s) + if @rules.last.rhs.first.class == NT + @start_nt << @rules.last + else + if rules.last.arity == 0 + @flat << @rules.last + else + @start_t << @rules.last + end + end + } + STDERR.write " #{n}\n" + end + + def to_s + @rules.map { |r| r.to_s }.join "\n" + end + + def add_glue + @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol| + @rules << Rule.new(NT.new('S'), [NT.new(symbol, 0)], [NT.new(symbol, 0)], [0]) + @start_nt << @rules.last + @rules << Rule.new(NT.new('S'), [NT.new('S', 0), NT.new('X', 1)], [NT.new('S', 0), NT.new('X', 1)], [0, 1]) + @start_nt << @rules.last + } + end + + def add_pass_through a + a.each { |word| + @rules << Rule.new(NT.new('X'), [T.new(word)], [T.new(word)]) + @flat << @rules.last + } + end +end + +end #module + diff --git a/prototype/hypergraph.rb b/prototype/hypergraph.rb new file mode 100644 index 0000000..d43d5ee --- /dev/null +++ b/prototype/hypergraph.rb @@ -0,0 +1,219 @@ +require 'zipf' +require_relative 'grammar' + +module HG + +class HG::Node + attr_accessor :id, :symbol, :left, :right, :outgoing, :incoming, :score + + def initialize id=nil, symbol='', span=[-1,-1], outgoing=[], incoming=[], score=nil + @id = id + @symbol = symbol + @left = span[0] + @right = span[1] + @outgoing = outgoing + @incoming = incoming + @score = score + end + + def to_s + "Node" + end +end + +class HG::Hypergraph + attr_accessor :nodes, :edges, :nodes_by_id + + def initialize nodes=[], edges=[], nodes_by_id={} + @nodes = nodes + @edges = edges + @nodes_by_id = nodes_by_id + @arity_ = nil + end + + def arity + @arity_ = @edges.map { |e| e.arity }.max if !@arity_ + return @arity_ + end + + def reset + @edges.each { |e| e.mark = 0 } + end + + def to_s + "Hypergraph" + end + + def to_json weights=nil + json_s = "{\n" + json_s += "\"weights\":#{(weights ? weights.to_json : '{}')},\n" + json_s += "\"nodes\":\n" + json_s += "[\n" + json_s += @nodes.map { |n| + "{ \"id\":#{n.id}, \"cat\":\"#{n.symbol.to_json.slice(1..-1).chomp('"')}\", \"span\":[#{n.left},#{n.right}] }" + }.join ",\n" + json_s += "\n],\n" + json_s += "\"edges\":\n" + json_s += "[\n" + json_s += @edges.map { |e| + "{ \"head\":#{e.head.id}, \"rule\":#{e.rule.to_json}, \"tails\":#{e.tails.map{ |n| n.id }}, \"f\":#{(e.f ? e.f.to_json : '{}')} }" + }.join ",\n" + json_s += "\n]\n" + json_s += "}\n" + + return json_s + + end +end + +class HG::Hyperedge + attr_accessor :head, :tails, :score, :f, :mark, :rule + + def initialize head=Node.new, tails=[], score=0.0, f=SparseVector.new, rule=nil + @head = head + @tails = tails + @score = score + @f = f + @mark = 0 + @rule = (rule.class==String ? Grammar::Rule.from_s(rule) : rule) + end + + def arity + return @tails.size + end + + def marked? + arity == @mark + end + + def to_s + "Hyperedge" + end +end + +def HG::topological_sort nodes + sorted = [] + s = nodes.reject { |n| !n.incoming.empty? } + while !s.empty? + sorted << s.shift + sorted.last.outgoing.each { |e| + next if e.marked? + e.mark += 1 + s << e.head if e.head.incoming.reject{ |f| f.marked? }.empty? + } + end + return sorted +end + +def HG::init nodes, semiring, root + nodes.each { |n| n.score=semiring.null } + root.score = semiring.one +end + +def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + init toposorted, semiring, root + toposorted.each { |n| + n.incoming.each { |e| + s = semiring.one + e.tails.each { |m| + s = semiring.multiply.call(s, m.score) + } + n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) + } + } +end + +def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + init toposorted, semiring, root + best_path = [] + toposorted.each { |n| + best_edge = nil + n.incoming.each { |e| + s = semiring.one + e.tails.each { |m| + s = semiring.multiply.call(s, m.score) + } + if n.score < semiring.multiply.call(s, e.score) # ViterbiSemiring add + best_edge = e + end + n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) + } + best_path << best_edge if best_edge + } + return best_path, toposorted.last.score +end + +def HG::k_best hypergraph, root, semiring=nil + #TODO +end + +def HG::all_paths hypergraph, root + toposorted = topological_sort hypergraph.nodes + paths = [[]] + toposorted.each { |n| + next if n.incoming.empty? + new_paths = [] + while !paths.empty? + p = paths.pop + n.incoming.each { |e| + new_paths << p+[e] + } + end + paths = new_paths + } + return paths +end + +def HG::derive path, cur, carry + edge = path.select { |e| e.head.symbol==cur.symbol \ + && e.head.left==cur.left \ + && e.head.right==cur.right }.first + j = 0 + edge.rule.target.each { |i| + if i.class == Grammar::NT + derive path, edge.tails[edge.rule.map[j]], carry + j += 1 + else + carry << i + end + } + return carry +end + +def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false + nodes = [] + edges = [] + nodes_by_id = {} + h = JSON.parse File.new(fn).read + w = SparseVector.from_h h['weights'] + h['nodes'].each { |x| + n = Node.new x['id'], x['symbol'], x['span'] + nodes << n + nodes_by_id[n.id] = n + } + h['edges'].each { |x| + e = Hyperedge.new(nodes_by_id[x['head']], \ + x['tails'].map { |j| nodes_by_id[j] }.to_a, \ + (x['score'] ? semiring.convert.call(x['score'].to_f) : nil), \ + (x['f'] ? SparseVector.from_h(x['f']) : SparseVector.new), \ + x['rule']) + if x['f'] + if log_weights + e.score = Math.exp(w.dot(e.f)) + else + e.score = w.dot(e.f) + end + end + e.tails.each { |m| + m.outgoing << e + } + e.head.incoming << e + edges << e + } + return Hypergraph.new(nodes, edges), nodes_by_id +end + +end #module + diff --git a/prototype/parse.rb b/prototype/parse.rb new file mode 100644 index 0000000..adf2b91 --- /dev/null +++ b/prototype/parse.rb @@ -0,0 +1,207 @@ +require 'zipf' +require_relative 'grammar' +require_relative 'hypergraph' + +module Parse + +def Parse::visit i, l, r, x=0 + i.upto(r-x) { |span| + l.upto(r-span) { |k| + yield k, k+span + } + } +end + +class Chart + + def initialize n + @n = n + @m = [] + (n+1).times { + a = [] + (n+1).times { a << [] } + @m << a + } + @b = {} + end + + def at i, j + @m[i][j] + end + + def add item, i, j + at(i,j) << item + @b["#{i},#{j},#{item.lhs.symbol}"] = true + end + + def has symbol, i, j + return @b["#{i},#{j},#{symbol}"] + end + + def to_hg weights=SparseVector.new + nodes = [] + edges = [] + nodes_by_id = {} + nodes << HG::Node.new(-1, "root", [-1,-1]) + nodes_by_id[-1] = nodes.last + id = 0 + seen = {} + Parse::visit(1, 0, @n) { |i,j| + self.at(i,j).each { |item| + _ = "#{item.lhs.symbol},#{i},#{j}" + if !seen[_] + nodes << HG::Node.new(id, item.lhs.symbol, [i,j]) + nodes_by_id[id] = nodes.last + seen[_] = id + id += 1 + end + } + } + + Parse::visit(1, 0, @n) { |i,j| + self.at(i,j).each { |item| + edges << HG::Hyperedge.new(nodes_by_id[seen[item.lhs.symbol+','+i.to_s+','+j.to_s]], \ + (item.tail_spans.empty? ? [nodes_by_id[-1]] : item.rhs.zip((0..item.rhs.size-1).map{|q| item.tail_spans[q] }).select{|x| x[0].class==Grammar::NT }.map{|x| nodes_by_id[seen["#{x[0].symbol},#{x[1].left},#{x[1].right}"]]}), \ + Math.exp(weights.dot(item.f)), + item.f, + Grammar::Rule.new(item.lhs, item.rhs, item.target, item.map, item.f), \ + ) + edges.last.head.incoming << edges.last + edges.last.tails.each { |n| n.outgoing << edges.last } + } + } + return HG::Hypergraph.new(nodes, edges, nodes_by_id) + end +end + +Span = Struct.new(:left, :right) + +class Item < Grammar::Rule + attr_accessor :left, :right, :tail_spans, :dot, :f + + def initialize rule_or_item, left, right, dot + @lhs = Grammar::NT.new rule_or_item.lhs.symbol, rule_or_item.lhs.index + @left = left + @right = right + @rhs = [] + @tail_spans = {} # refers to source side, use @map + @f = rule_or_item.f + @map = (rule_or_item.map ? rule_or_item.map.dup : []) + rule_or_item.rhs.each_with_index { |x,i| # duplicate rhs partially + @rhs << x + if x.class == Grammar::NT + begin + if i >= dot + @tail_spans[i] = Span.new(-1, -1) + else + @tail_spans[i] = rule_or_item.tail_spans[i].dup + end + rescue + @tail_spans[i] = Span.new(-1, -1) + end + end + } + @dot = dot + @target = rule_or_item.target + end + + def to_s + "(#{@left}, #{@right}) [#{tail_spans.map{|k,v| k.to_s+'('+v.left.to_s+','+v.right.to_s+')'}.join ' '}] {#{@map.to_s.delete('[]')}} #{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] ||| #{@target.map{|x|x.to_s}.join ' '}" + end +end + +def Parse::init input, n, active_chart, passive_chart, grammar + grammar.flat.each { |r| + input.each_index { |i| + if input[i, r.rhs.size] == r.rhs.map { |x| x.word } + passive_chart.add Item.new(r, i, i+r.rhs.size, r.rhs.size), i, i+r.rhs.size + end + } + } +end + +def Parse::scan item, input, limit, passive_chart + while item.rhs[item.dot].class == Grammar::T + return false if item.right==limit + if item.rhs[item.dot].word == input[item.right] + item.dot += 1 + item.right += 1 + else + return false + end + end + return true +end + +def Parse::parse input, n, active_chart, passive_chart, grammar + visit(1, 0, n) { |i,j| + + STDERR.write " span(#{i},#{j})\n" + + # try to apply rules starting with T + grammar.start_t.select { |r| r.rhs.first.word == input[i] }.each { |r| + new_item = Item.new r, i, i, 0 + active_chart.at(i,j) << new_item if scan new_item, input, j, passive_chart + } + + # seed active chart + grammar.start_nt.each { |r| + next if r.rhs.size > j-i + active_chart.at(i,j) << Item.new(r, i, i, 0) + } + + # parse + new_symbols = [] + remaining_items = [] + while !active_chart.at(i,j).empty? + active_item = active_chart.at(i,j).pop + advanced = false + visit(1, i, j, 1) { |k,l| + if passive_chart.has active_item.rhs[active_item.dot].symbol, k, l + if k == active_item.right + new_item = Item.new active_item, active_item.left, l, active_item.dot+1 + new_item.tail_spans[new_item.dot-1] = Span.new(k,l) + if scan new_item, input, j, passive_chart + if new_item.dot == new_item.rhs.size + if new_item.left == i && new_item.right == j + new_symbols << new_item.lhs.symbol if !new_symbols.include? new_item.lhs.symbol + passive_chart.add new_item, i, j + advanced = true + end + else + if new_item.right+(new_item.rhs.size-(new_item.dot)) <= j + active_chart.at(i,j) << new_item + advanced = true + end + end + end + end + end + } + if !advanced + remaining_items << active_item + end + end + + + # 'self-filling' step + new_symbols.each { |s| + remaining_items.each { |item| + next if item.dot!=0 + next if item.rhs[item.dot].class!=Grammar::NT + if item.rhs[item.dot].symbol == s + new_item = Item.new item, i, j, item.dot+1 + new_item.tail_spans[new_item.dot-1] = Span.new(i,j) + if new_item.dot==new_item.rhs.size + new_symbols << new_item.lhs.symbol if !new_symbols.include? new_item.lhs.symbol + passive_chart.add new_item, i, j + end + end + } + } + + } +end + +end #module + diff --git a/prototype/test_hg.rb b/prototype/test_hg.rb new file mode 100755 index 0000000..4be72bd --- /dev/null +++ b/prototype/test_hg.rb @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +require_relative 'hypergraph' + + +def main + # viterbi + semiring = ViterbiSemiring.new + hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy-test.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/glue/glue.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/3/3.json', semiring, true) + path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring + s = HG::derive path, path.last.head, [] + path.each { |e| puts "#{e.rule}" } + puts "---" + puts "#{s.map { |i| i.word }.join ' '}" + puts Math.log score + puts + + # all paths + hypergraph.reset + paths = HG::all_paths hypergraph, nodes_by_id[-1] + paths.each_with_index { |p,i| + s = HG::derive p, p.last.head, [] + puts "#{i+1}. #{s.map { |x| x.word }.join ' '}" + } +end + + +main + diff --git a/prototype/test_parse.rb b/prototype/test_parse.rb new file mode 100755 index 0000000..918101b --- /dev/null +++ b/prototype/test_parse.rb @@ -0,0 +1,49 @@ +#!/usr/bin/env ruby + +require_relative 'parse' + + +def main + STDERR.write "> reading input from TODO\n" + input = 'ich sah ein kleines haus'.split + #input = 'lebensmittel schuld an europäischer inflation'.split + #input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split + n = input.size + + STDERR.write "> reading grammar\n" + grammar = Grammar::Grammar.new '../example/toy/grammar' + #grammar = Grammar::Grammar.new '../example/toy/grammar-test' + #grammar = Grammar::Grammar.new '../example/glue/grammar' + #grammar = Grammar::Grammar.new '../example/3/grammar.3.gz' + + STDERR.write ">> adding glue grammar\n" + #grammar.add_glue_rules + + STDERR.write ">> adding pass-through grammar\n" + #grammar.add_pass_through_rules input + + STDERR.write "> initializing charts\n" + passive_chart = Parse::Chart.new n + active_chart = Parse::Chart.new n + Parse::init input, n, active_chart, passive_chart, grammar + + STDERR.write "> parsing\n" + Parse::parse input, n, active_chart, passive_chart, grammar + + puts "\n---\npassive chart" + Parse::visit(1, 0, 5) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts " #{j} #{item.to_s}" }; puts } + + weights_file = '../example/toy/weights' + #weights_file = '../example/glue/weights' + #weights_file = '../example/3/weights.init' + weights = SparseVector.from_kv(ReadFile.read(weights_file), ' ', "\n") + if !weights + weights = SparseVector.new + end + + puts passive_chart.to_hg.to_json weights +end + + +main + diff --git a/prototype/weaver.rb b/prototype/weaver.rb new file mode 100755 index 0000000..966d4c8 --- /dev/null +++ b/prototype/weaver.rb @@ -0,0 +1,70 @@ +#!/usr/bin/env ruby + +require 'trollop' +require 'xmlsimple' +require_relative 'parse' + +def read_grammar fn, add_glue, add_pass_through, input=nil + STDERR.write "> reading grammar '#{fn}'\n" + grammar = Grammar::Grammar.new fn + if add_glue + STDERR.write ">> adding glue rules\n" + grammar.add_glue + end + if add_pass_through + STDERR.write ">> adding pass-through rules\n" + grammar.add_pass_through input + end + return grammar +end + +def main + cfg = Trollop::options do + opt :input, "", :type => :string, :default => '-', :short => '-i' + opt :grammar, "", :type => :string, :default => nil, :short => '-g' + opt :weights, "", :type => :string, :default => nil, :short => '-w' + opt :add_glue, "", :type => :bool, :default => false, :short => '-l' + opt :add_pass_through, "", :type => :bool, :default => false, :short => '-p' + end + + grammar = nil + if cfg[:grammar] + grammar = read_grammar cfg[:grammar], cfg[:add_glue], cfg[:add_pass_through] + end + + STDERR.write "> reading input from '#{cfg[:input]}'\n" + ReadFile.readlines_strip(cfg[:input]).each { |input| + + x = XmlSimple.xml_in(input) + input = x['content'].split + n = input.size + + if x['grammar'] + grammar = read_grammar x['grammar'], cfg[:add_glue], cfg[:add_pass_through], input + end + + STDERR.write "> initializing charts\n" + passive_chart = Parse::Chart.new n + active_chart = Parse::Chart.new n + Parse::init input, n, active_chart, passive_chart, grammar + + STDERR.write "> parsing\n" + Parse::parse input, n, active_chart, passive_chart, grammar + + weights = SparseVector.from_kv(ReadFile.read(cfg[:weights]), ' ', "\n") + if !weights + weights = SparseVector.new + end + + hypergraph = passive_chart.to_hg weights + + STDERR.write "> viterbi\n" + semiring = ViterbiSemiring.new + path, score = HG::viterbi_path hypergraph, hypergraph.nodes_by_id[-1], semiring + s = HG::derive path, path.last.head, [] + STDERR.write " #{s.map { |i| i.word }.join ' '} ||| #{Math.log score}\n" + } +end + +main + diff --git a/src/fast_weaver.cc b/src/fast_weaver.cc new file mode 100644 index 0000000..4854476 --- /dev/null +++ b/src/fast_weaver.cc @@ -0,0 +1,26 @@ +#include "hypergraph.hh" +#include + +int +main(int argc, char** argv) +{ + Hg::Hypergraph hg; + G::Vocabulary y; + G::Grammar g; + Hg::io::read(hg, g.rules, y, argv[1]); + //Hg::io::manual(hg, g.rules); + clock_t begin = clock(); + Hg::Path p; + Hg::viterbi_path(hg, p); + vector s; + Hg::derive(p, p.back()->head, s); + for (auto it: s) + cout << it << " "; + cout << endl; + clock_t end = clock(); + double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; + cout << elapsed_secs << " s" << endl; + + return 0; +} + diff --git a/src/grammar.hh b/src/grammar.hh new file mode 100644 index 0000000..c489ec5 --- /dev/null +++ b/src/grammar.hh @@ -0,0 +1,334 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sparse_vector.hh" +#include "util.hh" + +using namespace std; + +namespace G { + +enum item_type { + UNDEFINED, + TERMINAL, + NON_TERMINAL +}; + +struct Item { + virtual size_t index() const { return 0; } + virtual symbol_t symbol() const { return ""; } + virtual item_type type() const { return UNDEFINED; } + + virtual ostream& repr(ostream& os) const { return os << "Item<>"; } + virtual ostream& escaped(ostream& os) const { return os << ""; } + + friend ostream& + operator<<(ostream& os, const Item& i) + { + return i.repr(os); + }; +}; + +struct NT : public Item { + symbol_t symbol_; + size_t index_; + + NT() {} + + NT(string const& s) + { + index_ = 0; // default + string t(s); + t.erase(0, 1); t.pop_back(); // remove '[' and ']' + istringstream ss(t); + if (ss >> index_) { // [i] + symbol_ = ""; + index_ = stoi(s); + return; + } else { + ss.clear(); + string buf; + size_t j = 0; + while (ss.good() && getline(ss, buf, ',')) { + if (j == 0) { + symbol_ = buf; + } else { + index_ = stoi(buf); + } + j++; + } + } + } + + virtual size_t index() const { return index_; } + virtual symbol_t symbol() const { return symbol_; } + virtual item_type type() const { return NON_TERMINAL; } + + virtual ostream& + repr(ostream& os) const + { + return os << "NT<" << symbol_ << "," << index_ << ">"; + } + + virtual ostream& + escaped(ostream& os) const + { + os << "[" << symbol_; + if (index_ > 0) + os << "," << index_; + os << "]"; + + return os; + } +}; + +struct T : public Item { + symbol_t symbol_; + + T(string const& s) + { + symbol_ = s; + } + + virtual symbol_t symbol() const { return symbol_; } + virtual item_type type() const { return TERMINAL; } + + virtual ostream& + repr(ostream& os) const + { + return os << "T<" << symbol_ << ">"; + } + + virtual ostream& + escaped(ostream& os) const + { + os << util::json_escape(symbol_); + } +}; + +struct Vocabulary +{ + unordered_map m_; + vector items_; + + bool + is_non_terminal(string const& s) + { + return s.front() == '[' && s.back() == ']'; + } + + Item* + get(symbol_t const& s) + { + if (is_non_terminal(s)) + return new NT(s); + if (m_.find(s) != m_.end()) + return items_[m_[s]]; + return add(s); + } + + Item* + add(symbol_t const& s) + { + size_t next_index_ = items_.size(); + T* item = new T(s); + items_.push_back(item); + m_[s] = next_index_; + + return item; + } +}; + +struct Rule { + NT* lhs; + vector rhs; + vector target; + size_t arity; +Sv::SparseVector* f; + map order; + string as_str_; + + Rule() {} + + Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); } + + static void + from_s(Rule* r, string const& s, Vocabulary& vocab) + { + istringstream ss(s); + string buf; + size_t j = 0, i = 1; + r->arity = 0; + vector rhs_non_terminals; + r->f = new Sv::SparseVector(); + while (ss >> buf) { + if (buf == "|||") { j++; continue; } + if (j == 0) { // left-hand side + r->lhs = new NT(buf); + } else if (j == 1) { // right-hand side + Item* item = vocab.get(buf); + r->rhs.push_back(item); + if (item->type() == NON_TERMINAL) { + r->arity++; + rhs_non_terminals.push_back(reinterpret_cast(item)); + } + } else if (j == 2) { // target + Item* item = vocab.get(buf); + if (item->type() == NON_TERMINAL) { + r->order.insert(make_pair(i, item->index())); + i++; + if (item->symbol() == "") { // only [1], [2] ... on target + reinterpret_cast(item)->symbol_ = \ + rhs_non_terminals[item->index()-1]->symbol(); + } + } + r->target.push_back(item); + } else if (j == 3) { // feature vector + Sv::SparseVector::from_s(r->f, buf); + // FIXME: this is slow!!! ^^^ + } else if (j == 4) { // alignment + } else { + // error + } + if (j == 4) break; + } + } + + ostream& + repr(ostream& os) const + { + os << "Rulerepr(os); + os << ", rhs:{"; + for (auto it = rhs.begin(); it != rhs.end(); it++) { + (**it).repr(os); + if (next(it) != rhs.end()) os << " "; + } + os << "}, target:{"; + for (auto it = target.begin(); it != target.end(); it++) { + (**it).repr(os); + if (next(it) != target.end()) os << " "; + } + os << "}, f:"; + f->repr(os); + os << ", arity=" << arity << \ + ", order:{"; + for (auto it = order.begin(); it != order.end(); it++) { + os << it->first << "->" << it->second; + if (next(it) != order.end()) os << ", "; + } + os << "}>"; + + return os; + } + + ostream& + escaped(ostream& os) const + { + lhs->escaped(os); + os << " ||| "; + for (auto it = rhs.begin(); it != rhs.end(); it++) { + (**it).escaped(os); + if (next(it) != rhs.end()) os << " "; + } + os << " ||| "; + for (auto it = target.begin(); it != target.end(); it++) { + (**it).escaped(os); + if (next(it) != target.end()) os << " "; + } + os << " ||| "; + f->escaped(os); + os << " ||| " << \ + "TODO"; + + return os; + }; + + friend ostream& + operator<<(ostream& os, Rule const& r) + { + return r.repr(os); + }; + + // -- + void + prep_for_serialization_() + { + ostringstream os; + escaped(os); + as_str_ = os.str(); + }; + MSGPACK_DEFINE(as_str_); + // ^^^ FIXME +}; + +struct Grammar { + vector rules; + vector flat; + vector start_non_terminal; + vector start_terminal; + set nts; + + Grammar() {} + + Grammar(string const& fn, Vocabulary& vocab) + { + ifstream ifs(fn); + string line; + while (getline(ifs, line)) { + G::Rule* r = new G::Rule(line, vocab); + rules.push_back(r); + nts.insert(r->lhs->symbol()); + if (r->arity == 0) + flat.push_back(r); + else if (r->rhs.front()->type() == NON_TERMINAL) + start_non_terminal.push_back(r); + else + start_terminal.push_back(r); + } + } + + void + add_glue(Vocabulary& vocab) + { + for (auto nt: nts) { + ostringstream oss_1; + oss_1 << "[S] ||| [" << nt << ",1] ||| [" << nt << ",1] ||| "; + cout << oss_1.str() << endl; + Rule* r1 = new Rule(oss_1.str(), vocab); + rules.push_back(r1); start_non_terminal.push_back(r1); + ostringstream oss_2; + oss_2 << "[S] ||| [S,1] [" << nt << ",2] ||| [S,1] [" << nt << ",2] ||| "; + cout << oss_2.str() << endl; + Rule* r2 = new Rule(oss_2.str(), vocab); + cout << *r2 << endl; + rules.push_back(r2); start_non_terminal.push_back(r2); + } + } + + void add_pass_through(const string& input); + // ^^^ TODO + + friend ostream& + operator<<(ostream& os, Grammar const& g) + { + for (const auto it: g.rules) { + it->repr(os); + os << endl; + } + + return os; + } +}; + +} // namespace G + diff --git a/src/hypergraph.cc b/src/hypergraph.cc new file mode 100644 index 0000000..40bcc64 --- /dev/null +++ b/src/hypergraph.cc @@ -0,0 +1,362 @@ +#include "hypergraph.hh" + +namespace Hg { + +template void +init(const list& nodes, const list::iterator root, const Semiring& semiring) +{ + for (const auto it: nodes) + it->score = semiring.null; + (**root).score = semiring.one; +} + +void +reset(const list nodes, const vector edges) +{ + for (const auto it: nodes) + it->mark = 0; + for (auto it: edges) + it->mark = 0; +} + +void +topological_sort(list& nodes, const list::iterator root) +{ + auto p = root; + auto to = nodes.begin(); + while (to != nodes.end()) { + if ((**p).is_marked()) { + for (const auto e: (**p).outgoing) { // explore edges + e->mark++; + if (e->is_marked()) { + e->head->mark++; + } + } + } + if ((**p).is_marked()) { + nodes.splice(to, nodes, p); + to++; + p = to; + } else { + ++p; + } + } +} + +void +viterbi(Hypergraph& hg) +{ + list::iterator root = \ + find_if(hg.nodes.begin(), hg.nodes.end(), \ + [](Node* n) { return n->incoming.size() == 0; }); + + Hg::topological_sort(hg.nodes, root); + Semiring::Viterbi semiring; + Hg::init(hg.nodes, root, semiring); + + for (const auto n: hg.nodes) { + for (const auto e: n->incoming) { + score_t s = semiring.one; + for (const auto m: e->tails) { + s = semiring.multiply(s, m->score); + } + n->score = semiring.add(n->score, semiring.multiply(s, e->score)); + } + } +} + +void +viterbi_path(Hypergraph& hg, Path& p) +{ + list::iterator root = \ + find_if(hg.nodes.begin(), hg.nodes.end(), \ + [](Node* n) { return n->incoming.size() == 0; }); + //list::iterator root = hg.nodes.begin(); + + Hg::topological_sort(hg.nodes, root); + // ^^^ FIXME do I need to do this when reading from file? + Semiring::Viterbi semiring; + Hg::init(hg.nodes, root, semiring); + + for (auto n: hg.nodes) { + Edge* best_edge; + bool best = false; + for (auto e: n->incoming) { + score_t s = semiring.one; + for (auto m: e->tails) { + s = semiring.multiply(s, m->score); + } + if (n->score < semiring.multiply(s, e->score)) { // find max + best_edge = e; + best = true; + } + n->score = semiring.add(n->score, semiring.multiply(s, e->score)); + } + if (best) + p.push_back(best_edge); + } +} + + +void +derive(const Path& p, const Node* cur, vector& carry) +{ + Edge* next; + for (auto it: p) { + if (it->head->symbol == cur->symbol && + it->head->left == cur->left && + it->head->right == cur->right) { + next = it; + } + } // FIXME this is probably not so good + + unsigned j = 0; + for (auto it: next->rule->target) { + if (it->type() == G::NON_TERMINAL) { + derive(p, next->tails[next->rule->order[j]], carry); + j++; + } else { + carry.push_back(it->symbol()); + } + } +} + +namespace io { + +void +read(Hypergraph& hg, vector& rules, G::Vocabulary& vocab, const string& fn) +{ + ifstream ifs(fn); + size_t i = 0, r, n, e; + msgpack::unpacker pac; + while(true) { + pac.reserve_buffer(32*1024); + size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity()); + pac.buffer_consumed(bytes); + msgpack::unpacked result; + while(pac.next(&result)) { + msgpack::object o = result.get(); + if (i == 0) { + o.convert(&r); + } else if (i == 1) { + o.convert(&n); + } else if (i == 2) { + o.convert(&e); + } else if (i > 2 && i <= r+2) { + string s; + o.convert(&s); + G::Rule* rule = new G::Rule; + G::Rule::from_s(rule, s, vocab); + rules.push_back(rule); + } else if (i > r+2 && i <= r+n+2) { + Node* n = new Node; + o.convert(n); + hg.nodes.push_back(n); + hg.nodes_by_id[n->id] = n; + } else if (i > n+2 && i <= r+n+e+2) { + Edge* e = new Edge; + e->arity = 0; + o.convert(e); + e->head = hg.nodes_by_id[e->head_id_]; + hg.edges.push_back(e); + hg.nodes_by_id[e->head_id_]->incoming.push_back(e); + e->arity = 0; + for (auto it = e->tails_ids_.begin(); it != e->tails_ids_.end(); it++) { + hg.nodes_by_id[*it]->outgoing.push_back(e); + e->tails.push_back(hg.nodes_by_id[*it]); + e->arity++; + } + e->rule = rules[e->rule_id_]; + } else { + // ERROR + } + i++; + } + if (!bytes) break; + } +} + +void +write(Hypergraph& hg, vector& rules, const string& fn) // FIXME +{ + FILE* file = fopen(fn.c_str(), "wb"); + msgpack::fbuffer fbuf(file); + msgpack::pack(fbuf, rules.size()); + msgpack::pack(fbuf, hg.nodes.size()); + msgpack::pack(fbuf, hg.edges.size()); + for (auto it = rules.cbegin(); it != rules.cend(); it++) + msgpack::pack(fbuf, **it); + for (auto it = hg.nodes.cbegin(); it != hg.nodes.cend(); it++) + msgpack::pack(fbuf, **it); + for (auto it = hg.edges.cbegin(); it != hg.edges.cend(); it++) + msgpack::pack(fbuf, **it); + fclose(file); +} + +void +manual(Hypergraph& hg, vector& rules, G::Vocabulary& vocab) +{ + // nodes + Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1; a->right = -1; a->mark = 0; + hg.nodes.push_back(a); hg.nodes_by_id[a->id] = a; + Node* b = new Node; b->id = 1; b->symbol = "NP"; b->left = 0; b->right = 1; b->mark = 0; + hg.nodes.push_back(b); hg.nodes_by_id[b->id] = b; + Node* c = new Node; c->id = 2; c->symbol = "V"; c->left = 1; c->right = 2; c->mark = 0; + hg.nodes.push_back(c); hg.nodes_by_id[c->id] = c; + Node* d = new Node; d->id = 3; d->symbol = "JJ"; d->left = 3; d->right = 4; d->mark = 0; + hg.nodes.push_back(d); hg.nodes_by_id[d->id] = d; + Node* e = new Node; e->id = 4; e->symbol = "NN"; e->left = 3; e->right = 5; e->mark = 0; + hg.nodes.push_back(e); hg.nodes_by_id[e->id] = e; + Node* f = new Node; f->id = 5; f->symbol = "NP"; f->left = 2; f->right = 5; f->mark = 0; + hg.nodes.push_back(f); hg.nodes_by_id[f->id] = f; + Node* g = new Node; g->id = 6; g->symbol = "NP"; g->left = 1; g->right = 5; g->mark = 0; + hg.nodes.push_back(g); hg.nodes_by_id[g->id] = g; + Node* h = new Node; h->id = 7; h->symbol = "S"; h->left = 0; h->right = 6; h->mark = 0; + hg.nodes.push_back(h); hg.nodes_by_id[h->id] = h; + + // rules + vector rule_strs; + rule_strs.push_back("[NP] ||| ich ||| i ||| ||| "); + rule_strs.push_back("[V] ||| sah ||| saw ||| ||| "); + rule_strs.push_back("[JJ] ||| kleines ||| small ||| ||| "); + rule_strs.push_back("[JJ] ||| kleines ||| little ||| ||| "); + rule_strs.push_back("[NN] ||| kleines haus ||| small house ||| ||| "); + rule_strs.push_back("[NN] ||| kleines haus ||| little house ||| ||| "); + rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| ||| "); + rule_strs.push_back("[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| ||| "); + rule_strs.push_back("[NP] ||| ein [NN,1] ||| a [NN,1] ||| ||| "); + rule_strs.push_back("[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| ||| "); + rule_strs.push_back("[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| ||| "); + + for (auto it: rule_strs) { + rules.push_back(new G::Rule(it, vocab)); + rules.back()->f = new Sv::SparseVector(); + } + + // edges + Edge* q = new Edge; q->head = hg.nodes_by_id[1]; q->tails.push_back(hg.nodes_by_id[0]); q->score = 0.367879441171; + q->arity = 1; q->mark = 0; + hg.edges.push_back(q); + hg.nodes_by_id[1]->incoming.push_back(q); + hg.nodes_by_id[0]->outgoing.push_back(q); + q->rule = rules[0]; + + Edge* p = new Edge; p->head = hg.nodes_by_id[2]; p->tails.push_back(hg.nodes_by_id[0]); p->score = 0.606530659713; + p->arity = 1; p->mark = 0; + hg.edges.push_back(p); + hg.nodes_by_id[2]->incoming.push_back(p); + hg.nodes_by_id[0]->outgoing.push_back(p); + p->rule = rules[1]; + + Edge* r = new Edge; r->head = hg.nodes_by_id[3]; r->tails.push_back(hg.nodes_by_id[0]); r->score = 1.0; + r->arity = 1; r->mark = 0; + hg.edges.push_back(r); + hg.nodes_by_id[3]->incoming.push_back(r); + hg.nodes_by_id[0]->outgoing.push_back(r); + r->rule = rules[2]; + + Edge* s = new Edge; s->head = hg.nodes_by_id[3]; s->tails.push_back(hg.nodes_by_id[0]); s->score = 1.0; + s->arity = 1; s->mark = 0; + hg.edges.push_back(s); + hg.nodes_by_id[3]->incoming.push_back(s); + hg.nodes_by_id[0]->outgoing.push_back(s); + s->rule = rules[3]; + + Edge* t = new Edge; t->head = hg.nodes_by_id[4]; t->tails.push_back(hg.nodes_by_id[0]); t->score = 1.0; + t->arity = 1; t->mark = 0; + hg.edges.push_back(t); + hg.nodes_by_id[4]->incoming.push_back(t); + hg.nodes_by_id[0]->outgoing.push_back(t); + t->rule = rules[4]; + + Edge* u = new Edge; u->head = hg.nodes_by_id[4]; u->tails.push_back(hg.nodes_by_id[0]); u->score = 1.0; + u->arity = 1; u->mark = 0; + hg.edges.push_back(u); + hg.nodes_by_id[4]->incoming.push_back(u); + hg.nodes_by_id[0]->outgoing.push_back(u); + u->rule = rules[5]; + + Edge* v = new Edge; v->head = hg.nodes_by_id[4]; v->tails.push_back(hg.nodes_by_id[3]); v->score = 1.0; + v->arity = 1; v->mark = 0; + hg.edges.push_back(v); + hg.nodes_by_id[4]->incoming.push_back(v); + hg.nodes_by_id[3]->outgoing.push_back(v); + v->rule = rules[6]; + + Edge* w = new Edge; w->head = hg.nodes_by_id[4]; w->tails.push_back(hg.nodes_by_id[3]); w->score = 2.71828182846; + w->arity = 1; w->mark = 0; + hg.edges.push_back(w); + hg.nodes_by_id[4]->incoming.push_back(w); + hg.nodes_by_id[3]->outgoing.push_back(w); + w->rule = rules[7]; + + Edge* x = new Edge; x->head = hg.nodes_by_id[5]; x->tails.push_back(hg.nodes_by_id[4]); x->score = 1.0; + x->arity = 1; x->mark = 0; + hg.edges.push_back(x); + hg.nodes_by_id[5]->incoming.push_back(x); + hg.nodes_by_id[4]->outgoing.push_back(x); + x->rule = rules[8]; + + Edge* y = new Edge; y->head = hg.nodes_by_id[6]; y->tails.push_back(hg.nodes_by_id[2]); y->tails.push_back(hg.nodes_by_id[5]); y->score = 1.0; + y->arity = 2; y->mark = 0; + hg.edges.push_back(y); + hg.nodes_by_id[6]->incoming.push_back(y); + hg.nodes_by_id[2]->outgoing.push_back(y); + hg.nodes_by_id[5]->outgoing.push_back(y); + y->rule = rules[9]; + + Edge* z = new Edge; z->head = hg.nodes_by_id[7]; z->tails.push_back(hg.nodes_by_id[1]); z->tails.push_back(hg.nodes_by_id[6]); z->score = 1.0; + z->arity = 2; z->mark = 0; + hg.edges.push_back(z); + hg.nodes_by_id[7]->incoming.push_back(z); + hg.nodes_by_id[1]->outgoing.push_back(z); + hg.nodes_by_id[6]->outgoing.push_back(z); + z->rule = rules[10]; +} + +} // namespace Hg::io + +/* + * Hg::Node + * + */ +ostream& +operator<<(ostream& os, const Node& n) +{ + os << \ + "Node"; + return os; +} + +/* + * Hg::Edge + * + */ +ostream& +operator<<(ostream& os, const Edge& e) +{ + ostringstream _; + for (auto it: e.tails) { + _ << it->id; + if (it != e.tails.back()) _ << ","; + } + os << \ + "Edgeid << \ + ", tails=[" << _.str() << "]" \ + ", score=" << e.score << \ + ", rule:'"; + e.rule->escaped(os); + os << "', f=" << "TODO" << \ + ", arity=" << e.arity << \ + ", mark=" << e.mark << ">"; + return os; +} + +} // namespace Hg + diff --git a/src/hypergraph.hh b/src/hypergraph.hh new file mode 100644 index 0000000..8e05e9f --- /dev/null +++ b/src/hypergraph.hh @@ -0,0 +1,102 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "grammar.hh" +#include "semiring.hh" +#include "sparse_vector.hh" +#include "types.hh" + +using namespace std; + +namespace Hg { + +struct Node; + +struct Edge { + Node* head; + vector tails; + score_t score; + G::Rule* rule; + unsigned int arity = 0; + unsigned int mark = 0; + + inline bool is_marked() { return mark >= arity; } + friend ostream& operator<<(ostream& os, const Edge& e); + + size_t head_id_; + vector tails_ids_; // node ids + size_t rule_id_; + + MSGPACK_DEFINE(head_id_, tails_ids_, rule_id_, score, arity); +}; + +struct Node { + size_t id; + string symbol; + short left; + short right; + score_t score; + vector incoming; + vector outgoing; + unsigned int mark; + + inline bool is_marked() { return mark >= incoming.size(); }; + friend ostream& operator<<(ostream& os, const Node& n); + + MSGPACK_DEFINE(id, symbol, left, right, score); +}; + +struct Hypergraph { + list nodes; + vector edges; + unordered_map nodes_by_id; + unsigned int arity; +}; + +template void +init(const list& nodes, const list::iterator root, const Semiring& semiring); + +void +reset(const list nodes, const vector edges); + +void +topological_sort(list& nodes, const list::iterator root); + +void +viterbi(Hypergraph& hg); + +typedef vector Path; + +void +viterbi_path(Hypergraph& hg, Path& p); + +void +derive(const Path& p, const Node* cur, vector& carry); + +namespace io { + +void +read(Hypergraph& hg, vector& rules, G::Vocabulary& vocab, const string& fn); // FIXME + +void +write(Hypergraph& hg, vector& rules, const string& fn); // FIXME + +void +manual(Hypergraph& hg, vector& rules); + +} // namespace + +} // namespace + diff --git a/src/make_pak.cc b/src/make_pak.cc new file mode 100644 index 0000000..db3a8a4 --- /dev/null +++ b/src/make_pak.cc @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include + +#include "json-cpp/single_include/json-cpp.hpp" +#include "hypergraph.hh" +#include "types.hh" + +using namespace std; + +struct DummyNode { + size_t id; + string symbol; + vector span; +}; + +struct DummyEdge { + size_t head_id; + size_t rule_id; + vector tails_ids; + string f; + score_t score; +}; + +struct DummyHg { + vector rules; + vector nodes; + vector edges; +}; + +template inline void +serialize(jsoncpp::Stream& stream, DummyNode& o) +{ + fields(o, stream, "id", o.id, "symbol", o.symbol, "span", o.span); +} + +template inline void +serialize(jsoncpp::Stream& stream, DummyEdge& o) +{ + fields(o, stream, "head", o.head_id, "rule", o.rule_id, "tails", o.tails_ids, "score", o.score); +} + +template inline void +serialize(jsoncpp::Stream& stream, DummyHg& o) +{ + fields(o, stream, "rules", o.rules, "nodes", o.nodes, "edges", o.edges); +} + +int +main(int argc, char** argv) +{ + // read from json + ifstream ifs(argv[1]); + string json_str((istreambuf_iterator(ifs) ), + (istreambuf_iterator())); + DummyHg hg; + vector rules; + hg.rules = rules; + vector nodes; + hg.nodes = nodes; + vector edges; + hg.edges = edges; + jsoncpp::parse(hg, json_str); + + // convert to proper objects + vector nodes_conv; + for (const auto it: hg.nodes) { + Hg::Node* n = new Hg::Node; + n->id = it.id; + n->symbol = it.symbol; + n->left = it.span[0]; + n->right = it.span[1]; + nodes_conv.push_back(n); + } + vector edges_conv; + for (const auto it: hg.edges) { + Hg::Edge* e = new Hg::Edge; + e->head_id_ = it.head_id; + e->tails_ids_ = it.tails_ids; + e->score = it.score; + e->rule_id_ = it.rule_id; + edges_conv.push_back(e); + } + + // write to msgpack + FILE* file = fopen(argv[2], "wb"); + msgpack::fbuffer fbuf(file); + msgpack::pack(fbuf, hg.rules.size()); + msgpack::pack(fbuf, hg.nodes.size()); + msgpack::pack(fbuf, hg.edges.size()); + for (const auto it: hg.rules) + msgpack::pack(fbuf, it); + for (const auto it: nodes_conv) + msgpack::pack(fbuf, *it); + for (const auto it: edges_conv) + msgpack::pack(fbuf, *it); + fclose(file); + + return 0; +} + diff --git a/src/parse.hh b/src/parse.hh new file mode 100644 index 0000000..0dd2fc0 --- /dev/null +++ b/src/parse.hh @@ -0,0 +1,301 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "grammar.hh" +#include "util.hh" +#include "types.hh" + +using namespace std; + +typedef pair Span; +namespace std { + template <> + struct hash + { + size_t + operator()(Span const& k) const + { + return ((hash()(k.first) + ^ (hash()(k.second) << 1)) >> 1); + } + }; +} + +namespace Parse { + +void visit(vector& p, + size_t i, size_t l, size_t r, size_t x=0) +{ + for (size_t s = i; s <= r-x; s++) { + for (size_t k = l; k <= r-s; k++) { + p.push_back(Span(k,k+s)); + } + } +} + +struct ChartItem +{ + Span span; + G::Rule const* rule; + vector tails_spans; + size_t dot; + + ChartItem() {} + + ChartItem(G::Rule* r) : rule(r), dot(0) {} + + ChartItem(G::Rule* r, Span s, size_t dot) + : rule(r), span(s), dot(dot) {} + + ChartItem(ChartItem const& o) + : span(o.span), + rule(o.rule), + tails_spans(o.tails_spans), + dot(o.dot) + { + } + + ostream& + repr(ostream& os) const + { + os << "ChartItem<"; + os << "span=(" << span.first << "," << span.second << "), lhs="; + rule->lhs->repr(os); + os << ", dot=" << dot; + os << ", tails=" << tails_spans.size() << ", "; + os << "rule="; + rule->repr(os); + os << ">"; + os << endl; + } + + friend ostream& + operator<<(ostream& os, ChartItem item) + { + item.repr(os); + + return os; + } +}; + +struct Chart +{ + size_t n_; + map > m_; + unordered_map b_; + + vector& at(Span s) + { + return m_[s]; + } + + string h(symbol_t sym, Span s) + { + ostringstream ss; + ss << sym; + ss << s.first; + ss << s.second; + + return ss.str(); + } + + bool + has_at(symbol_t sym, Span s) + { + return b_[h(sym, s)]; + } + + void add(ChartItem* item, Span s) + { + if (m_.count(s) > 0) + m_[s].push_back(item); + else { + m_.insert(make_pair(s, vector{item})); + } + b_[h(item->rule->lhs->symbol(), s)] = true; + } + + Chart(size_t n) : n_(n) {} + + friend ostream& + operator<<(ostream& os, Chart const& chart) + { + for (map >::const_iterator it = chart.m_.cbegin(); + it != chart.m_.cend(); it++) { + os << "(" << it->first.first << "," << it->first.second << ")" << endl; + for (auto jt: it->second) + jt->repr(os); os << endl; + } + + return os; + } +}; + +bool +scan(ChartItem* item, vector in, size_t limit, Chart& passive) +{ + //cout << "S1" << endl; + while (item->dot < item->rule->rhs.size() && + item->rule->rhs[item->dot]->type() == G::TERMINAL) { + //cout << "S2" << endl; + if (item->span.second == limit) return false; + //cout << "S3" << endl; + if (item->rule->rhs[item->dot]->symbol() == in[item->span.second]) { + //cout << "S4" << endl; + item->dot++; + //cout << "S5" << endl; + item->span.second++; + //cout << "S6" << endl; + } else { + //cout << "S7" << endl; + return false; + } + } + //cout << "S8" << endl; + return true; +} + + +void +init(vector const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) +{ + for (auto rule: g.flat) { + size_t j = 0; + for (auto it: in) { + if (it == rule->rhs.front()->symbol()) { + cout << it << " " << j << j+rule->rhs.size() << endl; + Span span(j, j+rule->rhs.size()); + passive.add(new ChartItem(rule, span, rule->rhs.size()), span); + cout << "new passive item [1] " << *passive.at(span).back() << endl; + } + j++; + } + } +} + +void +parse(vector const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) +{ + vector spans; + Parse::visit(spans, 1, 0, n); + for (auto span: spans) { + + cout << "Span (" << span.first << "," << span.second << ")" << endl; + + for (auto it: g.start_terminal) { + ChartItem* item = new ChartItem(it, Span(span.first,span.first), 0); + if (scan(item, in, span.second, passive) + && span.first + item->rule->rhs.size() <= span.second) { + active.add(item, span); + cout << "new active item [1] " << *active.at(span).back(); + } + } + + for (auto it: g.start_non_terminal) { + if (it->rhs.size() > span.second-span.first) continue; + active.add(new ChartItem(it, Span(span.first,span.first), 0), span); + cout << "new active item [2] " << *active.at(span).back(); + } + + set new_symbols; + vector remaining_items; + + while (true) { + cout << "active size at (" << span.first << "," << span.second << ") " << active.at(span).size() << endl; + cout << "passive size at (" << span.first << "," << span.second << ") " << passive.at(span).size() << endl; + if (active.at(span).empty()) break; + ChartItem* item = active.at(span).back(); + cout << "current item " << *item; + active.at(span).pop_back(); + bool advanced = false; + vector spans2; + Parse::visit(spans2, 1, span.first, span.second, 1); + for (auto span2: spans2) { + cout << "A" << endl; + if (item->rule->rhs[item->dot]->type() == G::NON_TERMINAL) { + cout << "B" << endl; + if (passive.has_at(item->rule->rhs[item->dot]->symbol(), span2)) { + cout << "C" << endl; + if (span2.first == item->span.second) { + cout << "D" << endl; + ChartItem* new_item = new ChartItem(*item); + cout << "D1" << endl; + new_item->span.second = span2.second; + cout << "D2" << endl; + new_item->dot++; + cout << "D3" << endl; + new_item->tails_spans.push_back(span2); + cout << "D4" << endl; + if (scan(new_item, in, span.second, passive)) { + cout << "E" << endl; + if (new_item->dot == new_item->rule->rhs.size()) { + cout << "F" << endl; + if (new_item->span.first == span.first && new_item->span.second == span.second) { + cout << "G" << endl; + cout << "H" << endl; + new_symbols.insert(new_item->rule->lhs->symbol()); + passive.add(new_item, span); + cout << "new passive item [2] " << *new_item; + advanced = true; + } + } else { + if (new_item->span.second+(new_item->rule->rhs.size()-new_item->dot) <= span.second) { + active.add(new_item, span); + cout << "new active item [3] " << *new_item; + } + } + } + cout << "I" << endl; + } + } + } + } + cout << "J" << endl; + if (!advanced) { + cout << "K" << endl; + remaining_items.push_back(item); + } + } + + for (auto new_sym: new_symbols) { + cout << "new sym " << new_sym << endl; + for (auto rem_item: remaining_items) { + if (rem_item->dot != 0 || + rem_item->rule->rhs[rem_item->dot]->type() != G::NON_TERMINAL) { + continue; + cout << "K1" << endl; + } + cout << "K2" << endl; + if (rem_item->rule->rhs[rem_item->dot]->symbol() == new_sym) { + cout << "K3" << endl; + ChartItem* new_item = new ChartItem(*rem_item); + cout << "K31" << endl; + //new_item->tails_spans[new_item->dot-1] = span; + new_item->tails_spans.push_back(span); + new_item->dot++; + cout << "K32" << endl; + if (new_item->dot == new_item->rule->rhs.size()) { + cout << "K4" << endl; + new_symbols.insert(new_item->rule->lhs->symbol()); + passive.add(new_item, span); + } + } + } + } + + cout << "L" << endl; + cout << "-------------------" << endl; + cout << endl; + } + + //cout << "ACTIVE" << endl << active << endl; + cout << "PASSIVE" << endl << passive << endl; +} + +} // + diff --git a/src/read_pak.cc b/src/read_pak.cc new file mode 100644 index 0000000..c894442 --- /dev/null +++ b/src/read_pak.cc @@ -0,0 +1,26 @@ +#include +#include +#include + +using namespace std; + +int +main(int argc, char** argv) +{ + ifstream ifs(argv[1]); + msgpack::unpacker pac; + while(true) { + pac.reserve_buffer(32*1024); + size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity()); + pac.buffer_consumed(bytes); + msgpack::unpacked result; + while(pac.next(&result)) { + msgpack::object o = result.get(); + cout << o << endl; + } + if (!bytes) break; + } + + return 0; +} + diff --git a/src/semiring.hh b/src/semiring.hh new file mode 100644 index 0000000..3f4ac08 --- /dev/null +++ b/src/semiring.hh @@ -0,0 +1,35 @@ +#pragma once + + +namespace Semiring { + +template +struct Viterbi { + T one = 1.0; + T null = 0.0; + + T add(T x, T y); + T multiply(T x, T y); + T convert(T x); +}; + +template T +Viterbi::add(T x, T y) +{ + return max(x, y); +} + +template T +Viterbi::multiply(T x, T y) +{ + return x * y; +} + +template T +Viterbi::convert(T x) +{ + return (T)x; +} + +} // namespace + diff --git a/src/sparse_vector.hh b/src/sparse_vector.hh new file mode 100644 index 0000000..7fff338 --- /dev/null +++ b/src/sparse_vector.hh @@ -0,0 +1,186 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "util.hh" +#include "types.hh" + +using namespace std; + + +namespace Sv { + +template +struct SparseVector { + unordered_map m_; + V zero = 0.f; + + SparseVector() {}; + + SparseVector(string& s) + { + from_s(this, s); + }; + + void + insert(K k, V v) { m_[k] = v; }; + + V + dot(SparseVector& other) + { + V r; + unordered_map* o = &m_; + auto b = m_.cbegin(); + auto e = m_.cend(); + if (other.size() < size()) { + b = other.m_.cbegin(); + e = other.m_.cend(); + o = &other.m_; + } + for (auto it = b; it != e; it++) + r += it->second * o->at(it->first); + + return r; + }; + + size_t + size() + { + return m_.size(); + } + + V& + operator[](const K& k) + { + return at(k); + }; + + const V& + at(const K& k) const + { + if (m_.find(k) == m_.end()) + return zero; + else + return m_.at(k); + } + + SparseVector + operator+(const SparseVector& other) const + { + SparseVector v; + v.m_.insert(m_.cbegin(), m_.cend()); + v.m_.insert(other.m_.cbegin(), other.m_.cend()); + for (const auto it: v.m_) + v.m_[it.first] = this->at(it.first) + other.at(it.first); + + return v; + }; + + SparseVector& + operator+=(const SparseVector& other) + { + for (const auto it: other.m_) + m_[it.first] += it.second; + + return *this; + }; + + SparseVector + operator-(const SparseVector& other) const + { + SparseVector v; + v.m_.insert(m_.cbegin(), m_.cend()); + v.m_.insert(other.m_.cbegin(), other.m_.cend()); + for (const auto it: v.m_) + v.m_[it.first] = this->at(it.first) - other.at(it.first); + + return v; + }; + + SparseVector& + operator-=(const SparseVector& other) + { + for (const auto it: other.m_) + m_[it.first] -= it.second; + + return *this; + }; + + SparseVector + operator*(V f) const + { + SparseVector v; + for (const auto it: m_) + v.m_[it.first] = this->at(it.first) * f; + + return v; + }; + + SparseVector& + operator*=(V f) + { + for (const auto it: m_) + m_[it.first] *= f; + + return *this; + }; + + static void + from_s(SparseVector* w, const string& s) + { + stringstream ss(s); + while (!ss.eof()) { + string t; + ss >> t; + size_t eq = t.find_first_of("="); + if (eq == string::npos) { + return; + } + t.replace(eq, 1, " "); + stringstream tt(t); + K k; V v; + tt >> k >> v; + w->m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); + } + } + + ostream& + repr(ostream& os) const + { + os << "SparseVector<{"; + for (auto it = m_.cbegin(); it != m_.cend(); it++) { + os << "'" << it->first << "'=" << it->second; + if (next(it) != m_.end()) + os << ", "; + } + os << "}>"; + + return os; + }; + + ostream& + escaped(ostream& os, bool quote_keys=false) const { + for (auto it = m_.cbegin(); it != m_.cend(); it++) { + if (quote_keys) os << '"'; + os << util::json_escape(it->first); + if (quote_keys) os << '"'; + os << "=" << it->second; + if (next(it) != m_.cend()) os << " "; + } + + return os; + }; + + friend ostream& + operator<<(ostream& os, const SparseVector& v) + { + return v.repr(os); + } +}; + +} // namespace + diff --git a/src/test_grammar.cc b/src/test_grammar.cc new file mode 100644 index 0000000..7fee79a --- /dev/null +++ b/src/test_grammar.cc @@ -0,0 +1,19 @@ +#include + +#include "grammar.hh" + +using namespace std; + +int +main(int argc, char** argv) +{ + G::Vocabulary y; + G::Grammar g(argv[1], y); + for (auto it: g.rules) { + it->escaped(cout); + cout << endl; + } + + return 0; +} + diff --git a/src/test_parse.cc b/src/test_parse.cc new file mode 100644 index 0000000..2d51d44 --- /dev/null +++ b/src/test_parse.cc @@ -0,0 +1,19 @@ +#include "parse.hh" + +int main(int argc, char** argv) +{ + //string in("ich sah ein kleines haus"); + //string in("europa bildet den ersten oder zweiten markt für die zehn am häufigsten von indien exportierten produkte , erklärte der europäische kommissar weiter . die asiatischen und europäischen giganten tauschen jährlich güter im wert von 47 milliarden euro und dienstleistungen im wert von 10 milliarden euro aus , hatte diese woche daniéle smadja , vorsitzende der abordnung der europäischen kommission in neu delhi , erklärt , und bedauert , dass der gegenseitige handel sein potential noch nicht ausgeschöpft hat . die eu und indien treffen sich am freitag zu ihrem achten diplomatischen in neu delhi , bei dem premierminister manmohan singh und der präsident der europäischen kommission josé manuel durao barrosso anwesend sein werden ."); + //string in("aber schon bald nach seinem eintritt kam der erste große erfolg ."); + string in("lebensmittel schuld an europäischer inflation"); + vector tok = util::tokenize(in); + size_t n = tok.size(); + G::Vocabulary v; + G::Grammar g(argv[1], v); + g.add_glue(v); + Parse::Chart active(n); + Parse::Chart passive(n); + init(tok, n, active, passive, g); + parse(tok, n, active, passive, g); +} + diff --git a/src/test_sparse_vector.cc b/src/test_sparse_vector.cc new file mode 100644 index 0000000..69aaa21 --- /dev/null +++ b/src/test_sparse_vector.cc @@ -0,0 +1,36 @@ +#include "sparse_vector.hh" + +int +main(void) +{ + Sv::SparseVector a; + a.insert("1", 1); + a.insert("2", 2); + cout << "a:" << a << endl; + + Sv::SparseVector b; + b.insert("2", 2); + cout << "b:" << b << endl; + + Sv::SparseVector c = a + b; + cout << "a+b:" << c << endl; + + a += b; + cout << "a+=b:" << a << endl; + + a -= b; + cout << "a-=b:" << a << endl; + + cout << "a*2:" << a*2 << endl; + + a *= 2; + cout << "a*=2:" << a << endl; + + string s("\"a\"=2 \"b\"=3"); + Sv::SparseVector* sv = new Sv::SparseVector(s); + cout << *sv << endl; + cout << sv->dot(*sv) << endl; + + return 0; +} + diff --git a/src/types.hh b/src/types.hh new file mode 100644 index 0000000..e89b4dd --- /dev/null +++ b/src/types.hh @@ -0,0 +1,10 @@ +#pragma once + +#include + +using namespace std; + + +typedef double score_t; +typedef string symbol_t; + diff --git a/src/util.hh b/src/util.hh new file mode 100644 index 0000000..93ea320 --- /dev/null +++ b/src/util.hh @@ -0,0 +1,47 @@ +#pragma once + +#include + +#include "types.hh" + +using namespace std; + + +namespace util { + +inline string +json_escape(const string& s) +{ + ostringstream os; + for (auto it = s.cbegin(); it != s.cend(); it++) { + switch (*it) { + case '"': os << "\\\""; break; + case '\\': os << "\\\\"; break; + case '\b': os << "\\b"; break; + case '\f': os << "\\f"; break; + case '\n': os << "\\n"; break; + case '\r': os << "\\r"; break; + case '\t': os << "\\t"; break; + default: os << *it; break; + } + } + + return os.str(); +} + +inline vector +tokenize(string s) +{ + istringstream ss(s); + vector r; + while (ss.good()) { + string buf; + ss >> buf; + r.push_back(buf); + } + + return r; +} + +} // namespace util + diff --git a/test/test_hg.rb b/test/test_hg.rb deleted file mode 100755 index 6311bac..0000000 --- a/test/test_hg.rb +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env ruby - -require_relative '../hg' - - -def main - # viterbi - semiring = ViterbiSemiring.new - hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy.json', semiring, true) - #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy-test.json', semiring, true) - #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/glue/glue.json', semiring, true) - #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/3/3.json', semiring, true) - path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring - s = HG::derive path, path.last.head, [] - path.each { |e| puts "#{e.rule}" } - puts "---" - puts "#{s.map { |i| i.word }.join ' '}" - puts Math.log score - puts - - # all paths - hypergraph.reset - paths = HG::all_paths hypergraph, nodes_by_id[-1] - paths.each_with_index { |p,i| - s = HG::derive p, p.last.head, [] - puts "#{i+1}. #{s.map { |x| x.word }.join ' '}" - } -end - - -main - diff --git a/test/test_parse.rb b/test/test_parse.rb deleted file mode 100755 index c3be5ae..0000000 --- a/test/test_parse.rb +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env ruby - -require_relative '../parse' - - -def main - STDERR.write "> reading input from TODO\n" - input = 'ich sah ein kleines haus'.split - #input = 'lebensmittel schuld an europäischer inflation'.split - #input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split - n = input.size - - STDERR.write "> reading grammar\n" - grammar = Grammar::Grammar.new '../example/toy/grammar' - #grammar = Grammar::Grammar.new '../example/toy/grammar-test' - #grammar = Grammar::Grammar.new '../example/glue/grammar' - #grammar = Grammar::Grammar.new '../example/3/grammar.3.gz' - - STDERR.write ">> adding glue grammar\n" - #grammar.add_glue_rules - - STDERR.write ">> adding pass-through grammar\n" - #grammar.add_pass_through_rules input - - STDERR.write "> initializing charts\n" - passive_chart = Parse::Chart.new n - active_chart = Parse::Chart.new n - Parse::init input, n, active_chart, passive_chart, grammar - - STDERR.write "> parsing\n" - Parse::parse input, n, active_chart, passive_chart, grammar - - puts "\n---\npassive chart" - Parse::visit(1, 0, 5) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts " #{j} #{item.to_s}" }; puts } - - weights_file = '../example/toy/weights' - #weights_file = '../example/glue/weights' - #weights_file = '../example/3/weights.init' - weights = SparseVector.from_kv(ReadFile.read(weights_file), ' ', "\n") - if !weights - weights = SparseVector.new - end - - puts passive_chart.to_hg.to_json weights -end - - -main - diff --git a/util/Makefile b/util/Makefile deleted file mode 100644 index 30564fe..0000000 --- a/util/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -COMPILER=clang - - -all: make_pak read_pak - -make_pak: make_pak.cc json-cpp/single_include/json-cpp.hpp ../fast/hypergraph.hh ../fast/weaver.hh - $(COMPILER) -std=c++11 -lstdc++ -lm -lmsgpack make_pak.cc -o make_pak - -read_pak: read_pak.cc - $(COMPILER) -std=c++11 -lstdc++ -lmsgpack read_pak.cc -o read_pak - -clean: - rm -f make_pak read_pak - diff --git a/util/cdec2json.py b/util/cdec2json.py deleted file mode 100755 index e7c8e93..0000000 --- a/util/cdec2json.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python2 - -import cdec -import sys, argparse -import json -import gzip - - -#FIXME new format -# strings? -# map? -def hg2json(hg, weights): - """ - output a JSON representation of a cdec hypegraph - """ - res = '' - res += "{\n" - res += '"rules":[\n' - rules = [] - for i in hg.edges: - s = json.dumps(str(i.trule)) - try: - rules.index(s) - except: - rules.append(s) - res += ",\n".join(rules) - res += "\n],\n" - res += '"nodes":'+"\n" - res += "[\n" - a = [] - a.append( '{ "id":0, "symbol":"root", "span":[-1,-1] }' ) - for i in hg.nodes: - a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) - res += ",\n".join(a)+"\n" - res += "],\n" - res += '"edges":'+"\n" - res += "[\n" - a = [] - for i in hg.edges: - s = "{" - s += '"head":%d'%(i.head_node.id+1) - s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule)))) - # f - #xs = ' "f":{' - #b = [] - #for j in i.feature_values: - # b.append( '"%s":%s'%(j[0], j[1]) ) - #xs += ", ".join(b) - #xs += "}," - # tails - if len(list(i.tail_nodes)) > 0: - s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes])) - else: - s += ', "tails":[ 0 ],' - #s += xs - s += ' "score":%s }'%(i.prob) - a.append(s) - res += ",\n".join(a)+"\n" - res += "]\n" - res += "}\n" - return res - -def main(): - parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') - parser.add_argument('-c', '--config', required=True, help='decoder configuration') - parser.add_argument('-w', '--weights', required=True, help='feature weights') - parser.add_argument('-g', '--grammar', required=False, help='grammar') - args = parser.parse_args() - with open(args.config) as config: - config = config.read() - decoder = cdec.Decoder(config) - decoder.read_weights(args.weights) - ins = sys.stdin.readline().strip() - if args.grammar: - with gzip.open(args.grammar) as grammar: - grammar = grammar.read() - hg = decoder.translate(ins, grammar=grammar) - else: - hg = decoder.translate(ins) - - sys.stderr.write( "input:\n '%s'\n"%(ins) ) - sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) - num_nodes = 0 - for i in hg.nodes: num_nodes+=1 - sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) - num_edges = 0 - for i in hg.edges: num_edges+=1 - sys.stderr.write( "# edges = %s\n"%(num_edges) ) - sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) - - print hg2json(hg, decoder.weights) - - -if __name__=="__main__": - main() - diff --git a/util/json-cpp b/util/json-cpp deleted file mode 160000 index 4eb4b47..0000000 --- a/util/json-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4eb4b47cf4d622bc7bf34071d6b68fc5beb37051 diff --git a/util/make_pak.cc b/util/make_pak.cc deleted file mode 100644 index e858155..0000000 --- a/util/make_pak.cc +++ /dev/null @@ -1,104 +0,0 @@ -#include -#include -#include -#include -#include - -#include "json-cpp/single_include/json-cpp.hpp" -#include "../fast/hypergraph.hh" -#include "../fast/weaver.hh" - -using namespace std; - - -struct DummyNode { - size_t id; - string symbol; - vector span; -}; - -struct DummyEdge { - size_t head_id; - size_t rule_id; - vector tails_ids; - string f; - score_t score; -}; - -struct DummyHg { - vector rules; - vector nodes; - vector edges; -}; - -template inline void -serialize(jsoncpp::Stream& stream, DummyNode& o) -{ - fields(o, stream, "id", o.id, "symbol", o.symbol, "span", o.span); -} - -template inline void -serialize(jsoncpp::Stream& stream, DummyEdge& o) -{ - fields(o, stream, "head", o.head_id, "rule", o.rule_id, "tails", o.tails_ids, "score", o.score); -} - -template inline void -serialize(jsoncpp::Stream& stream, DummyHg& o) -{ - fields(o, stream, "rules", o.rules, "nodes", o.nodes, "edges", o.edges); -} - -int -main(int argc, char** argv) -{ - // read from json - ifstream ifs(argv[1]); - string json_str((istreambuf_iterator(ifs) ), - (istreambuf_iterator())); - DummyHg hg; - vector rules; - hg.rules = rules; - vector nodes; - hg.nodes = nodes; - vector edges; - hg.edges = edges; - jsoncpp::parse(hg, json_str); - - // convert to proper objects - vector nodes_conv; - for (const auto it: hg.nodes) { - Hg::Node* n = new Hg::Node; - n->id = it.id; - n->symbol = it.symbol; - n->left = it.span[0]; - n->right = it.span[1]; - nodes_conv.push_back(n); - } - vector edges_conv; - for (const auto it: hg.edges) { - Hg::Edge* e = new Hg::Edge; - e->head_id_ = it.head_id; - e->tails_ids_ = it.tails_ids; - e->score = it.score; - e->rule_id_ = it.rule_id; - edges_conv.push_back(e); - } - - // write to msgpack - FILE* file = fopen(argv[2], "wb"); - msgpack::fbuffer fbuf(file); - msgpack::pack(fbuf, hg.rules.size()); - msgpack::pack(fbuf, hg.nodes.size()); - msgpack::pack(fbuf, hg.edges.size()); - for (const auto it: hg.rules) - msgpack::pack(fbuf, it); - for (const auto it: nodes_conv) - msgpack::pack(fbuf, *it); - for (const auto it: edges_conv) - msgpack::pack(fbuf, *it); - fclose(file); - - return 0; -} - diff --git a/util/read_pak.cc b/util/read_pak.cc deleted file mode 100644 index afd6e6a..0000000 --- a/util/read_pak.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include -#include -#include - -using namespace std; - - -int -main(int argc, char** argv) -{ - ifstream ifs(argv[1]); - msgpack::unpacker pac; - while(true) { - pac.reserve_buffer(32*1024); - size_t bytes = ifs.readsome(pac.buffer(), pac.buffer_capacity()); - pac.buffer_consumed(bytes); - msgpack::unpacked result; - while(pac.next(&result)) { - msgpack::object o = result.get(); - cout << o << endl; - } - if (!bytes) break; - } - - return 0; -} - -- cgit v1.2.3