From 190f68c880eb27506669e95e2bc0493e2ec42c4c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 17 Aug 2014 07:51:16 +0100 Subject: functional again --- fast/Makefile | 3 +-- fast/README.md | 9 ++++++--- fast/grammar.cc | 46 ++++++++++++++++++++++++++++++++++++---------- fast/grammar.hh | 1 + fast/hypergraph.cc | 28 +++++++++++++++++++--------- fast/hypergraph.hh | 2 +- fast/main.cc | 11 +++++++---- fast/sparse_vector.hh | 38 +++++++++++++++++++++++++------------- 8 files changed, 96 insertions(+), 42 deletions(-) (limited to 'fast') diff --git a/fast/Makefile b/fast/Makefile index 40ce0eb..9e88076 100644 --- a/fast/Makefile +++ b/fast/Makefile @@ -1,11 +1,10 @@ -COMPILER=clang +COMPILER=g++ CFLAGS=-std=c++11 -O3 all: grammar.o hypergraph.o main.cc $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver - hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh sparse_vector.hh weaver.hh $(COMPILER) $(CFLAGS) -g -c hypergraph.cc diff --git a/fast/README.md b/fast/README.md index a11bd85..1d6bd04 100644 --- a/fast/README.md +++ b/fast/README.md @@ -7,12 +7,12 @@ TODO * other semirings * include language model * compress/hash words/feature strings? - + * cast? Rule -> Edge, ChartItem -> Node + * feature factory, observer Dependencies: * MessagePack for object serialization [1] * kenlm language model [2] - This is Linux only. @@ -20,6 +20,8 @@ This is Linux only. [1] http://msgpack.org [2] http://kheafield.com/code/kenlm/ + +stuff to have a look at: http://math.nist.gov/spblas/ http://lapackpp.sourceforge.net/ http://www.cvmlib.com/ @@ -30,5 +32,6 @@ http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack http://www.osl.iu.edu/research/mtl/download.php3 http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library - +https://software.intel.com/en-us/tbb_4.2_doc http://goog-perftools.sourceforge.net/doc/tcmalloc.html + diff --git a/fast/grammar.cc b/fast/grammar.cc index 558f6e6..a003eb4 100644 --- a/fast/grammar.cc +++ b/fast/grammar.cc @@ -10,7 +10,18 @@ namespace G { NT::NT(string& s) { s.erase(0, 1); s.pop_back(); // remove '[' and ']' - stringstream ss(s); + istringstream ss(s); + if (ss >> index) { // [i] + symbol = ""; + index = stoi(s); + + return; + } else { // [X] + symbol = s; + index = 0; + + return; + } string buf; size_t j = 0; index = 0; // default @@ -135,28 +146,43 @@ operator<<(ostream& os, const Item& i) * */ Rule::Rule(const string& s) +{ + from_s(this, s); +} + +void +Rule::from_s(Rule* r, const string& s) { stringstream ss(s); size_t j = 0; string buf; - arity = 0; + r->arity = 0; size_t index = 1; + vector rhs_nt; + r->f = new Sv::SparseVector(); while (ss >> buf) { if (buf == "|||") { j++; continue; } if (j == 0) { // LHS - lhs = new NT(buf); + r->lhs = new NT(buf); } else if (j == 1) { // RHS - rhs.push_back(new Item(buf)); - if (rhs.back()->type == NON_TERMINAL) arity++; + r->rhs.push_back(new Item(buf)); + if (r->rhs.back()->type == NON_TERMINAL) { + rhs_nt.push_back(r->rhs.back()->nt); + r->arity++; + } } else if (j == 2) { // TARGET - target.push_back(new Item(buf)); - if (target.back()->type == NON_TERMINAL) { - order.insert(make_pair(index, target.back()->nt->index)); + r->target.push_back(new Item(buf)); + if (r->target.back()->type == NON_TERMINAL) { + r->order.insert(make_pair(index, r->target.back()->nt->index)); + if (r->target.back()->nt->symbol == "") + r->target.back()->nt->symbol = rhs_nt[r->target.back()->nt->index-1]->symbol; index++; } } else if (j == 3) { // F TODO + Sv::SparseVector::from_s(r->f, buf); // FIXME this is slow!!! } else if (j == 4) { // A TODO - } else { // ERROR + } else { + // ERROR } if (j == 4) break; } @@ -203,7 +229,7 @@ Rule::escaped() const os << " ||| "; os << f->escaped(); os << " ||| "; - os << "TODO"; + os << "TODO(alignment)"; return os.str(); } diff --git a/fast/grammar.hh b/fast/grammar.hh index 48a5116..1b9ac5a 100644 --- a/fast/grammar.hh +++ b/fast/grammar.hh @@ -69,6 +69,7 @@ Sv::SparseVector* f; Rule() {}; Rule(const string& s); + static void from_s(Rule* r, const string& s); string repr() const; string escaped() const; diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc index e1debb1..a9a44f9 100644 --- a/fast/hypergraph.cc +++ b/fast/hypergraph.cc @@ -73,7 +73,7 @@ viterbi_path(Hypergraph& hg, Path& p) find_if(hg.nodes.begin(), hg.nodes.end(), \ [](Node* n) { return n->incoming.size() == 0; }); - Hg::topological_sort(hg.nodes, root); + Hg::topological_sort(hg.nodes, root); // FIXME do I need to do this when reading from file? Semiring::Viterbi semiring; Hg::init(hg.nodes, root, semiring); @@ -107,7 +107,8 @@ derive(const Path& p, const Node* cur, vector& carry) it->head->right == cur->right) { next = it; } - } + } // FIXME this is probably not so good + unsigned j = 0; for (auto it: next->rule->target) { if (it->type == G::NON_TERMINAL) { @@ -125,7 +126,7 @@ void read(Hypergraph& hg, vector& rules, const string& fn) // FIXME { ifstream ifs(fn); - size_t i = 0, nr, nn, ne; + size_t i = 0, r, n, e; msgpack::unpacker pac; while(true) { pac.reserve_buffer(32*1024); @@ -135,17 +136,23 @@ read(Hypergraph& hg, vector& rules, const string& fn) // FIXME while(pac.next(&result)) { msgpack::object o = result.get(); if (i == 0) { - o.convert(&nn); - nn += 1; + o.convert(&r); } else if (i == 1) { - o.convert(&ne); - ne += 1; - } else if (i > 1 && i <= nn) { + o.convert(&n); + } else if (i == 2) { + o.convert(&e); + } else if (i > 2 && i <= r+2) { + string s; + o.convert(&s); + G::Rule* rule = new G::Rule; + G::Rule::from_s(rule, s); + rules.push_back(rule); + } else if (i > r+2 && i <= r+n+2) { Node* n = new Node; o.convert(n); hg.nodes.push_back(n); hg.nodes_by_id[n->id] = n; - } else if (i > nn && i <= nn+ne+1) { + } else if (i > n+2 && i <= r+n+e+2) { Edge* e = new Edge; e->arity = 0; o.convert(e); @@ -158,6 +165,9 @@ read(Hypergraph& hg, vector& rules, const string& fn) // FIXME e->tails.push_back(hg.nodes_by_id[*it]); e->arity++; } + e->rule = rules[e->rule_id_]; + } else { + // ERROR } i++; } diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh index 699bfdf..299a62d 100644 --- a/fast/hypergraph.hh +++ b/fast/hypergraph.hh @@ -92,7 +92,7 @@ void read(Hypergraph& hg, vector& rules, const string& fn); // FIXME void -write(Hypergraph& hg, vector& rules, const string& fn); // TODO +write(Hypergraph& hg, vector& rules, const string& fn); // FIXME void manual(Hypergraph& hg, vector& rules); diff --git a/fast/main.cc b/fast/main.cc index 59e25d5..08fcfcf 100644 --- a/fast/main.cc +++ b/fast/main.cc @@ -1,4 +1,5 @@ #include "hypergraph.hh" +#include int @@ -6,9 +7,9 @@ main(int argc, char** argv) { Hg::Hypergraph hg; G::Grammar g; -//Hg::io::read(hg, g.rules, argv[1]); - Hg::io::manual(hg, g.rules); - + Hg::io::read(hg, g.rules, argv[1]); + //Hg::io::manual(hg, g.rules); + clock_t begin = clock(); Hg::Path p; Hg::viterbi_path(hg, p); vector s; @@ -16,7 +17,9 @@ main(int argc, char** argv) for (auto it: s) cout << it << " "; cout << endl; - + clock_t end = clock(); + double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; + cout << elapsed_secs << " s" << endl; return 0; } diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh index e497769..3583240 100644 --- a/fast/sparse_vector.hh +++ b/fast/sparse_vector.hh @@ -22,17 +22,7 @@ struct SparseVector { SparseVector() {}; SparseVector(string& s) { - stringstream ss(s); - while (!ss.eof()) { - string t; - ss >> t; - size_t eq = t.find_first_of("="); - t.replace(eq, 1, " "); - stringstream tt(t); - K k; V v; - tt >> k >> v; - m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); - } + from_s(this, s); }; void @@ -138,6 +128,25 @@ struct SparseVector { return *this; }; + static void + from_s(SparseVector* w, const string& s) + { + stringstream ss(s); + while (!ss.eof()) { + string t; + ss >> t; + size_t eq = t.find_first_of("="); + if (eq == string::npos) { + return; + } + t.replace(eq, 1, " "); + stringstream tt(t); + K k; V v; + tt >> k >> v; + w->m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); + } + } + string repr() const { @@ -154,10 +163,13 @@ struct SparseVector { }; string - escaped() const { + escaped(bool quote_keys=false) const { ostringstream os; for (auto it = m_.cbegin(); it != m_.cend(); it++) { - os << '"' << util::json_escape(it->first) << '"' << "=" << it->second; + if (quote_keys) os << '"'; + os << util::json_escape(it->first); + if (quote_keys) os << '"'; + os << "=" << it->second; if (next(it) != m_.cend()) os << " "; } -- cgit v1.2.3