From 190f68c880eb27506669e95e2bc0493e2ec42c4c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 17 Aug 2014 07:51:16 +0100 Subject: functional again --- .gitmodules | 3 + fast/Makefile | 3 +- fast/README.md | 9 +- fast/grammar.cc | 46 +- fast/grammar.hh | 1 + fast/hypergraph.cc | 28 +- fast/hypergraph.hh | 2 +- fast/main.cc | 11 +- fast/sparse_vector.hh | 38 +- util/Makefile | 2 +- util/cdec2json.py | 11 +- util/json-cpp | 1 + util/json-cpp.hpp | 1231 ------------------------------------------------- util/make_pak.cc | 72 ++- util/read_pak.cc | 1 - 15 files changed, 137 insertions(+), 1322 deletions(-) create mode 100644 .gitmodules create mode 160000 util/json-cpp delete mode 100644 util/json-cpp.hpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..843caa2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "util/json-cpp"] + path = util/json-cpp + url = https://github.com/ascheglov/json-cpp.git diff --git a/fast/Makefile b/fast/Makefile index 40ce0eb..9e88076 100644 --- a/fast/Makefile +++ b/fast/Makefile @@ -1,11 +1,10 @@ -COMPILER=clang +COMPILER=g++ CFLAGS=-std=c++11 -O3 all: grammar.o hypergraph.o main.cc $(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver - hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh sparse_vector.hh weaver.hh $(COMPILER) $(CFLAGS) -g -c hypergraph.cc diff --git a/fast/README.md b/fast/README.md index a11bd85..1d6bd04 100644 --- a/fast/README.md +++ b/fast/README.md @@ -7,12 +7,12 @@ TODO * other semirings * include language model * compress/hash words/feature strings? - + * cast? Rule -> Edge, ChartItem -> Node + * feature factory, observer Dependencies: * MessagePack for object serialization [1] * kenlm language model [2] - This is Linux only. @@ -20,6 +20,8 @@ This is Linux only. [1] http://msgpack.org [2] http://kheafield.com/code/kenlm/ + +stuff to have a look at: http://math.nist.gov/spblas/ http://lapackpp.sourceforge.net/ http://www.cvmlib.com/ @@ -30,5 +32,6 @@ http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack http://www.osl.iu.edu/research/mtl/download.php3 http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library - +https://software.intel.com/en-us/tbb_4.2_doc http://goog-perftools.sourceforge.net/doc/tcmalloc.html + diff --git a/fast/grammar.cc b/fast/grammar.cc index 558f6e6..a003eb4 100644 --- a/fast/grammar.cc +++ b/fast/grammar.cc @@ -10,7 +10,18 @@ namespace G { NT::NT(string& s) { s.erase(0, 1); s.pop_back(); // remove '[' and ']' - stringstream ss(s); + istringstream ss(s); + if (ss >> index) { // [i] + symbol = ""; + index = stoi(s); + + return; + } else { // [X] + symbol = s; + index = 0; + + return; + } string buf; size_t j = 0; index = 0; // default @@ -135,28 +146,43 @@ operator<<(ostream& os, const Item& i) * */ Rule::Rule(const string& s) +{ + from_s(this, s); +} + +void +Rule::from_s(Rule* r, const string& s) { stringstream ss(s); size_t j = 0; string buf; - arity = 0; + r->arity = 0; size_t index = 1; + vector rhs_nt; + r->f = new Sv::SparseVector(); while (ss >> buf) { if (buf == "|||") { j++; continue; } if (j == 0) { // LHS - lhs = new NT(buf); + r->lhs = new NT(buf); } else if (j == 1) { // RHS - rhs.push_back(new Item(buf)); - if (rhs.back()->type == NON_TERMINAL) arity++; + r->rhs.push_back(new Item(buf)); + if (r->rhs.back()->type == NON_TERMINAL) { + rhs_nt.push_back(r->rhs.back()->nt); + r->arity++; + } } else if (j == 2) { // TARGET - target.push_back(new Item(buf)); - if (target.back()->type == NON_TERMINAL) { - order.insert(make_pair(index, target.back()->nt->index)); + r->target.push_back(new Item(buf)); + if (r->target.back()->type == NON_TERMINAL) { + r->order.insert(make_pair(index, r->target.back()->nt->index)); + if (r->target.back()->nt->symbol == "") + r->target.back()->nt->symbol = rhs_nt[r->target.back()->nt->index-1]->symbol; index++; } } else if (j == 3) { // F TODO + Sv::SparseVector::from_s(r->f, buf); // FIXME this is slow!!! } else if (j == 4) { // A TODO - } else { // ERROR + } else { + // ERROR } if (j == 4) break; } @@ -203,7 +229,7 @@ Rule::escaped() const os << " ||| "; os << f->escaped(); os << " ||| "; - os << "TODO"; + os << "TODO(alignment)"; return os.str(); } diff --git a/fast/grammar.hh b/fast/grammar.hh index 48a5116..1b9ac5a 100644 --- a/fast/grammar.hh +++ b/fast/grammar.hh @@ -69,6 +69,7 @@ Sv::SparseVector* f; Rule() {}; Rule(const string& s); + static void from_s(Rule* r, const string& s); string repr() const; string escaped() const; diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc index e1debb1..a9a44f9 100644 --- a/fast/hypergraph.cc +++ b/fast/hypergraph.cc @@ -73,7 +73,7 @@ viterbi_path(Hypergraph& hg, Path& p) find_if(hg.nodes.begin(), hg.nodes.end(), \ [](Node* n) { return n->incoming.size() == 0; }); - Hg::topological_sort(hg.nodes, root); + Hg::topological_sort(hg.nodes, root); // FIXME do I need to do this when reading from file? Semiring::Viterbi semiring; Hg::init(hg.nodes, root, semiring); @@ -107,7 +107,8 @@ derive(const Path& p, const Node* cur, vector& carry) it->head->right == cur->right) { next = it; } - } + } // FIXME this is probably not so good + unsigned j = 0; for (auto it: next->rule->target) { if (it->type == G::NON_TERMINAL) { @@ -125,7 +126,7 @@ void read(Hypergraph& hg, vector& rules, const string& fn) // FIXME { ifstream ifs(fn); - size_t i = 0, nr, nn, ne; + size_t i = 0, r, n, e; msgpack::unpacker pac; while(true) { pac.reserve_buffer(32*1024); @@ -135,17 +136,23 @@ read(Hypergraph& hg, vector& rules, const string& fn) // FIXME while(pac.next(&result)) { msgpack::object o = result.get(); if (i == 0) { - o.convert(&nn); - nn += 1; + o.convert(&r); } else if (i == 1) { - o.convert(&ne); - ne += 1; - } else if (i > 1 && i <= nn) { + o.convert(&n); + } else if (i == 2) { + o.convert(&e); + } else if (i > 2 && i <= r+2) { + string s; + o.convert(&s); + G::Rule* rule = new G::Rule; + G::Rule::from_s(rule, s); + rules.push_back(rule); + } else if (i > r+2 && i <= r+n+2) { Node* n = new Node; o.convert(n); hg.nodes.push_back(n); hg.nodes_by_id[n->id] = n; - } else if (i > nn && i <= nn+ne+1) { + } else if (i > n+2 && i <= r+n+e+2) { Edge* e = new Edge; e->arity = 0; o.convert(e); @@ -158,6 +165,9 @@ read(Hypergraph& hg, vector& rules, const string& fn) // FIXME e->tails.push_back(hg.nodes_by_id[*it]); e->arity++; } + e->rule = rules[e->rule_id_]; + } else { + // ERROR } i++; } diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh index 699bfdf..299a62d 100644 --- a/fast/hypergraph.hh +++ b/fast/hypergraph.hh @@ -92,7 +92,7 @@ void read(Hypergraph& hg, vector& rules, const string& fn); // FIXME void -write(Hypergraph& hg, vector& rules, const string& fn); // TODO +write(Hypergraph& hg, vector& rules, const string& fn); // FIXME void manual(Hypergraph& hg, vector& rules); diff --git a/fast/main.cc b/fast/main.cc index 59e25d5..08fcfcf 100644 --- a/fast/main.cc +++ b/fast/main.cc @@ -1,4 +1,5 @@ #include "hypergraph.hh" +#include int @@ -6,9 +7,9 @@ main(int argc, char** argv) { Hg::Hypergraph hg; G::Grammar g; -//Hg::io::read(hg, g.rules, argv[1]); - Hg::io::manual(hg, g.rules); - + Hg::io::read(hg, g.rules, argv[1]); + //Hg::io::manual(hg, g.rules); + clock_t begin = clock(); Hg::Path p; Hg::viterbi_path(hg, p); vector s; @@ -16,7 +17,9 @@ main(int argc, char** argv) for (auto it: s) cout << it << " "; cout << endl; - + clock_t end = clock(); + double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; + cout << elapsed_secs << " s" << endl; return 0; } diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh index e497769..3583240 100644 --- a/fast/sparse_vector.hh +++ b/fast/sparse_vector.hh @@ -22,17 +22,7 @@ struct SparseVector { SparseVector() {}; SparseVector(string& s) { - stringstream ss(s); - while (!ss.eof()) { - string t; - ss >> t; - size_t eq = t.find_first_of("="); - t.replace(eq, 1, " "); - stringstream tt(t); - K k; V v; - tt >> k >> v; - m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); - } + from_s(this, s); }; void @@ -138,6 +128,25 @@ struct SparseVector { return *this; }; + static void + from_s(SparseVector* w, const string& s) + { + stringstream ss(s); + while (!ss.eof()) { + string t; + ss >> t; + size_t eq = t.find_first_of("="); + if (eq == string::npos) { + return; + } + t.replace(eq, 1, " "); + stringstream tt(t); + K k; V v; + tt >> k >> v; + w->m_.emplace(k.substr(k.find_first_of("\"")+1, k.find_last_of("\"")-1), v); + } + } + string repr() const { @@ -154,10 +163,13 @@ struct SparseVector { }; string - escaped() const { + escaped(bool quote_keys=false) const { ostringstream os; for (auto it = m_.cbegin(); it != m_.cend(); it++) { - os << '"' << util::json_escape(it->first) << '"' << "=" << it->second; + if (quote_keys) os << '"'; + os << util::json_escape(it->first); + if (quote_keys) os << '"'; + os << "=" << it->second; if (next(it) != m_.cend()) os << " "; } diff --git a/util/Makefile b/util/Makefile index 08ead26..30564fe 100644 --- a/util/Makefile +++ b/util/Makefile @@ -3,7 +3,7 @@ COMPILER=clang all: make_pak read_pak -make_pak: make_pak.cc +make_pak: make_pak.cc json-cpp/single_include/json-cpp.hpp ../fast/hypergraph.hh ../fast/weaver.hh $(COMPILER) -std=c++11 -lstdc++ -lm -lmsgpack make_pak.cc -o make_pak read_pak: read_pak.cc diff --git a/util/cdec2json.py b/util/cdec2json.py index adddb64..e7c8e93 100755 --- a/util/cdec2json.py +++ b/util/cdec2json.py @@ -15,13 +15,6 @@ def hg2json(hg, weights): """ res = '' res += "{\n" - res += '"weights":{'+"\n" - a = [] - for i in weights: - if i[1] != 0: - a.append( '"%s":%s'%(i[0], i[1]) ) - res += ", ".join(a)+"\n" - res += "},\n" res += '"rules":[\n' rules = [] for i in hg.edges: @@ -35,9 +28,9 @@ def hg2json(hg, weights): res += '"nodes":'+"\n" res += "[\n" a = [] - a.append( '{ "id":0, "cat":"root", "span":[-1,-1] }' ) + a.append( '{ "id":0, "symbol":"root", "span":[-1,-1] }' ) for i in hg.nodes: - a.append('{ "id":%d, "cat":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) + a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) res += ",\n".join(a)+"\n" res += "],\n" res += '"edges":'+"\n" diff --git a/util/json-cpp b/util/json-cpp new file mode 160000 index 0000000..4eb4b47 --- /dev/null +++ b/util/json-cpp @@ -0,0 +1 @@ +Subproject commit 4eb4b47cf4d622bc7bf34071d6b68fc5beb37051 diff --git a/util/json-cpp.hpp b/util/json-cpp.hpp deleted file mode 100644 index 851a4f4..0000000 --- a/util/json-cpp.hpp +++ /dev/null @@ -1,1231 +0,0 @@ -// -// DO NOT EDIT !!! This file was generated with a script. -// -// JSON for C++ -// https://github.com/ascheglov/json-cpp -// Version 0.1 alpha, rev. 170121e2dc099895064305e38bfb25d90a807ce3 -// Generated 2014-03-27 17:16:47.104492 UTC -// -// Belongs to the public domain - -#pragma once - -//---------------------------------------------------------------------- -// json-cpp.hpp begin - -//---------------------------------------------------------------------- -// json-cpp/parse.hpp begin - -#include -#include -#include -#include -#include - -//---------------------------------------------------------------------- -// json-cpp/ParserError.hpp begin - -#include -#include -#include -#include - -#if defined _MSC_VER -# define JSONCPP_INTERNAL_NOEXCEPT_ throw() -#else -# define JSONCPP_INTERNAL_NOEXCEPT_ noexcept -#endif - -namespace jsoncpp -{ - class ParserError : public std::exception - { - public: - enum Type - { - NoError, - Eof, UnexpectedCharacter, - InvalidEscapeSequence, NoTrailSurrogate, - UnexpectedType, UnknownField, - NumberIsOutOfRange, - }; - - ParserError(Type type, std::size_t line, std::size_t column) - : m_type{type}, m_line{line}, m_column{column} - { - assert(type != NoError); - } - - virtual const char* what() const JSONCPP_INTERNAL_NOEXCEPT_ override - { - if (m_what.empty()) - { - m_what = "JSON parser error at line "; - m_what += std::to_string(m_line); - m_what += ", column "; - m_what += std::to_string(m_column); - switch (m_type) - { - case Eof: m_what += ": unexpected end of file"; break; - case UnexpectedCharacter: m_what += ": unexpected character"; break; - case InvalidEscapeSequence: m_what += ": invalid escape sequence"; break; - case NoTrailSurrogate: m_what += ": no UTF-16 trail surrogate"; break; - case UnexpectedType: m_what += ": unexpected value type"; break; - case UnknownField: m_what += ": unknown field name"; break; - case NumberIsOutOfRange: m_what += ": number is out of range"; break; - case NoError: - default: - m_what += ": INTERNAL ERROR"; break; - } - } - - return m_what.c_str(); - } - - Type type() const { return m_type; } - std::size_t line() const { return m_line; } - std::size_t column() const { return m_column; } - - private: - Type m_type; - std::size_t m_line; - std::size_t m_column; - - mutable std::string m_what; - }; -} - -#undef JSONCPP_INTERNAL_NOEXCEPT_ - -// json-cpp/ParserError.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/Stream.hpp begin - -namespace jsoncpp -{ - template - class Stream; - - namespace details - { - template - struct Traits2 {}; - - template - struct ParserTraits {}; - - template - struct GeneratorTraits {}; - } - - template - using Parser = Stream>; - - template - using Generator = Stream>; - - template - inline auto serialize(Stream& stream, T& value) -> decltype(value.serialize(stream), void()) - { - value.serialize(stream); - } -} -// json-cpp/Stream.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/value_types.hpp begin - -namespace jsoncpp -{ - // Helper masks - const auto TypeIsNotFundamental = 0x40; - const auto TypeIsCollection = 0x80; - - enum class Type - { - Undefined = 0, // Helper type for debugging variant-like types - Null = 0x01, - Boolean = 0x02, - Number = 0x04, - String = 0x08 | TypeIsNotFundamental, - Array = 0x10 | TypeIsNotFundamental | TypeIsCollection, - Object = 0x20 | TypeIsNotFundamental | TypeIsCollection, - }; -} -// json-cpp/value_types.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/details/parser_utility.hpp begin - -#include -#include -#include - -namespace jsoncpp { namespace details -{ - template - struct CStrIterator - { - using this_type = CStrIterator; - - CStrIterator() - { - static CharT null{0}; - m_ptr = &null; - } - - CStrIterator(const CharT* ptr) : m_ptr{ptr} {} - - const CharT& operator*() { return *m_ptr; } - const CharT* operator->() { return m_ptr; } - - this_type& operator++() - { - assert(!isEnd()); - ++m_ptr; - return *this; - } - - this_type operator++(int) { auto temp = *this; ++*this; return temp; } - - bool operator==(const this_type& rhs) const { return isEnd() == rhs.isEnd(); } - bool operator!=(const this_type& rhs) const { return !this->operator==(rhs); } - - private: - const CharT* m_ptr; - - bool isEnd() const { return *m_ptr == 0; } - }; - - class Diagnostics - { - public: - void nextColumn() { ++m_column; } - void newLine() { ++m_line; m_column = 0; } - - ParserError makeError(ParserError::Type type) const - { - return{type, m_line, m_column}; - } - - private: - std::size_t m_column{0}; - std::size_t m_line{1}; - }; - - template - struct Reader - { - using this_type = Reader; - - Reader(InputIterator first, InputIterator last) : m_iter(first), m_end(last) - { - checkEnd(); - } - - char operator*() { return *m_iter; } - this_type& operator++() - { - checkEnd(); - ++m_iter; - m_diag.nextColumn(); - return *this; - } - - void checkEnd() - { - if (m_iter == m_end) - throw m_diag.makeError(ParserError::Eof); - } - - char getNextChar() - { - auto prev = *m_iter; - ++*this; - return prev; - } - - Diagnostics m_diag; - InputIterator m_iter, m_end; - }; -}} - -// json-cpp/details/parser_utility.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/details/number_parser.hpp begin - -#include - -namespace jsoncpp { namespace details -{ - inline bool isDigit(char c) { return c >= '0' && c <= '9'; } - - template - inline unsigned parseIntNumber(Iterator& iter) - { - auto intPart = 0U; // TBD: 0ULL ? - - do - { - intPart = intPart * 10 + (*iter - '0'); - - ++iter; - } - while (isDigit(*iter)); - - return intPart; - } - - template - inline double parseRealNumber(Iterator& iter) - { - double number = 0; - - if (*iter == '0') - { - ++iter; - } - else - { - number = parseIntNumber(iter); - } - - // here `ch` is a peeked character, need to call eat() - - if (*iter == '.') - { - ++iter; - - auto mul = 0.1; - while (isDigit(*iter)) - { - number += (*iter - '0') * mul; - mul /= 10; - ++iter; - } - } - - // here `ch` is a peeked character, need to call eat() - - if (*iter == 'e' || *iter == 'E') - { - ++iter; - - auto negate = *iter == '-'; - if (negate || *iter == '+') - ++iter; - // FIXME: check `ch` for non-digit - - auto e = parseIntNumber(iter); - - if (negate) - number /= std::pow(10, e); - else - number *= std::pow(10, e); - } - - return number; - } -}} -// json-cpp/details/number_parser.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/details/string_parser.hpp begin - -#include - -namespace jsoncpp { namespace details -{ - inline char32_t utf16SurrogatePairToUtf32(char32_t lead, char32_t trail) - { - return 0x10000 | (lead - 0xD800) << 10 | (trail - 0xDC00); - } - - inline void utf32ToUtf8(char32_t c, std::string& str) - { - auto add = [&str](char32_t c){ str.push_back(static_cast(c)); }; - - if (c < 0x80) - { - add(c); - } - else if (c < 0x800) - { - add(0xC0 | c >> 6); - add(0x80 | (c & 0x3f)); - } - else if (c < 0x10000) - { - add(0xE0 | c >> 12); - add(0x80 | ((c >> 6) & 0x3f)); - add(0x80 | (c & 0x3f)); - } - else if (c < 0x200000) - { - add(0xF0 | c >> 18); - add(0x80 | ((c >> 12) & 0x3f)); - add(0x80 | ((c >> 6) & 0x3f)); - add(0x80 | (c & 0x3f)); - } - else if (c < 0x4000000) - { - add(0xF8 | c >> 24); - add(0x80 | ((c >> 18) & 0x3f)); - add(0x80 | ((c >> 12) & 0x3f)); - add(0x80 | ((c >> 6) & 0x3f)); - add(0x80 | (c & 0x3f)); - } - else - { - add(0xFC | c >> 30); - add(0x80 | ((c >> 24) & 0x3f)); - add(0x80 | ((c >> 18) & 0x3f)); - add(0x80 | ((c >> 12) & 0x3f)); - add(0x80 | ((c >> 6) & 0x3f)); - add(0x80 | (c & 0x3f)); - } - } - - enum class CharType { Raw, CodePoint, UTF16Pair }; - - template - inline void addToStr(std::basic_string& str, CharType type, char32_t c1, char32_t c2); - - template<> - inline void addToStr(std::basic_string& str, CharType type, char32_t c1, char32_t c2) - { - if (type == CharType::Raw) - { - str.push_back(static_cast(c1)); - } - else if (type == CharType::CodePoint) - { - utf32ToUtf8(c1, str); - } - else - { - auto c32 = utf16SurrogatePairToUtf32(c1, c2); - utf32ToUtf8(c32, str); - } - } - - template<> - inline void addToStr(std::basic_string& str, CharType type, char32_t c1, char32_t c2) - { - str.push_back(static_cast(c1)); - if (type == CharType::UTF16Pair) - str.push_back(static_cast(c2)); - } - - template<> - inline void addToStr(std::basic_string& str, CharType type, char32_t c1, char32_t c2) - { - auto c = (type == CharType::UTF16Pair) ? utf16SurrogatePairToUtf32(c1, c2) : c1; - str.push_back(static_cast(c)); - } - - template - inline int parseHexDigit(Iterator& iter, ParserError::Type& err) - { - auto ch = *iter; - ++iter; - if (ch >= '0' && ch <= '9') return ch - '0'; - if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10; - if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10; - - err = ParserError::InvalidEscapeSequence; - return 0; - } - - template - inline char32_t parseUTF16CodeUnit(Iterator& iter, ParserError::Type& err) - { - auto n = parseHexDigit(iter, err) << 12; - n |= parseHexDigit(iter, err) << 8; - n |= parseHexDigit(iter, err) << 4; - n |= parseHexDigit(iter, err); - return static_cast(n); - } - - template - inline ParserError::Type parseStringImpl(Iterator& iter, std::basic_string& str) - { - str.clear(); - auto add = [&str](CharType type, char32_t c1, char32_t c2) - { - addToStr(str, type, c1, c2); - }; - - for (;;) - { - auto ch = static_cast(*iter); - ++iter; - if (ch == '"') - return ParserError::NoError; - - if (ch == '\\') - { - ch = static_cast(*iter); - ++iter; - switch (ch) - { - case '\\': case '"': case '/': - break; - - case 'b': ch = '\b'; break; - case 'f': ch = '\f'; break; - case 'n': ch = '\n'; break; - case 'r': ch = '\r'; break; - case 't': ch = '\t'; break; - - case 'u': - { - ParserError::Type err{ParserError::NoError}; - auto codeUnit = parseUTF16CodeUnit(iter, err); - if (err != ParserError::NoError) - return err; - - if (codeUnit >= 0xD800 && codeUnit < 0xDC00) - { - if (*iter != '\\') return ParserError::NoTrailSurrogate; - ++iter; - if (*iter != 'u') return ParserError::NoTrailSurrogate; - ++iter; - - auto trailSurrogate = parseUTF16CodeUnit(iter, err); - if (err != ParserError::NoError) - return err; - - add(CharType::UTF16Pair, codeUnit, trailSurrogate); - } - else - { - add(CharType::CodePoint, codeUnit, 0); - } - } - continue; - - default: - return ParserError::InvalidEscapeSequence; - } - } - - add(CharType::Raw, ch, 0); - } - } -}} - -// json-cpp/details/string_parser.hpp end -//---------------------------------------------------------------------- - -namespace jsoncpp -{ - template - class Stream>> - { - public: - using this_type = Parser>; - - explicit Stream(InputIterator first, InputIterator last) - : m_reader{first, last} - { - nextValue(); - } - - Type getType() const { return m_type; } - bool getBoolean() const { return m_boolean; } - double getNumber() const { return m_number; } - const std::string& getFieldName() const { return m_fieldName; } - - void checkType(Type type) const - { - if (getType() != type) - throw makeError(ParserError::UnexpectedType); - } - - bool isListEnd(char terminator) - { - eatWhitespace(); - if (*m_reader != terminator) - return false; - - ++m_reader; - return true; - } - - void eatListSeparator() - { - eatWhitespace(); - check(','); - eatWhitespace(); - } - - void nextNameValuePair() - { - eatWhitespace(); - check('"'); - parseString(m_fieldName); - eatWhitespace(); - check(':'); - nextValue(); - } - - void nextValue() - { - eatWhitespace(); - m_type = nextValueImpl(); - } - - template - void parseString(std::basic_string& str) - { - auto err = parseStringImpl(m_reader, str); - if (err != ParserError::NoError) - throw m_reader.m_diag.makeError(err); - } - - ParserError makeError(ParserError::Type type) const - { - return m_reader.m_diag.makeError(type); - } - - private: - Type nextValueImpl() - { - switch (*m_reader) - { - case '{': ++m_reader; return Type::Object; - case '[': ++m_reader; return Type::Array; - case 't': ++m_reader; checkLiteral("true"); m_boolean = true; return Type::Boolean; - case 'f': ++m_reader; checkLiteral("false"); m_boolean = false; return Type::Boolean; - case 'n': ++m_reader; checkLiteral("null"); return Type::Null; - case '"': ++m_reader; return Type::String; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - m_number = parseRealNumber(m_reader); - return Type::Number; - - case '-': - ++m_reader; - m_number = -parseRealNumber(m_reader); - return Type::Number; - } - - throw unexpectedCharacter(); - } - - ParserError unexpectedCharacter() const - { - return makeError(ParserError::UnexpectedCharacter); - } - - void check(char expectedChar) - { - if (*m_reader != expectedChar) - throw unexpectedCharacter(); - - ++m_reader; - } - - template - void checkLiteral(const char(&literal)[N]) - { - static_assert(N > 2, ""); - for (auto i = 1; i != N - 1; ++i, ++m_reader) - if (*m_reader != literal[i]) - throw unexpectedCharacter(); - } - - void eatWhitespace() - { - for (;; ++m_reader) - { - switch (*m_reader) - { - case '/': - ++m_reader; - check('/'); - while (*m_reader != '\n') - ++m_reader; - - // no break here - case '\n': - m_reader.m_diag.newLine(); - break; - - case ' ': case '\t': case '\r': - break; - - default: - return; - } - } - } - - details::Reader m_reader; - - Type m_type; - double m_number; - bool m_boolean; - std::string m_fieldName; - }; - - template - inline void serialize(Parser& parser, bool& value) - { - parser.checkType(Type::Boolean); - value = parser.getBoolean(); - } - - template - inline typename std::enable_if::value>::type - serialize(Parser& parser, T& value) - { - parser.checkType(Type::Number); - auto number = parser.getNumber(); - value = static_cast(number); - if (value != number) - throw parser.makeError(ParserError::NumberIsOutOfRange); - } - - template - inline void serialize(Parser& parser, std::basic_string& value) - { - parser.checkType(Type::String); - parser.parseString(value); - } - - namespace details - { - template - inline void parseList(Parser& parser, Type type, char terminator, Callback&& callback) - { - parser.checkType(type); - - while (!parser.isListEnd(terminator)) - { - callback(); - - if (parser.isListEnd(terminator)) - return; - - parser.eatListSeparator(); - } - } - } - - template - inline void parseObject(Parser& parser, Callback&& callback) - { - details::parseList(parser, Type::Object, '}', [&] - { - parser.nextNameValuePair(); - callback(parser.getFieldName()); - }); - } - - template - void parseArray(Parser& parser, Callback&& callback) - { - details::parseList(parser, Type::Array, ']', [&] - { - parser.nextValue(); - callback(); - }); - } - - template - inline void parse(T& object, InputIterator first, InputIterator last) - { - Parser> stream{first, last}; - serialize(stream, object); - } - - template - inline void parse(T& object, const CharT* str) - { - details::CStrIterator first{str}, last; - parse(object, first, last); - } - - template - inline void parse(T& object, std::basic_string& str) - { - parse(object, std::begin(str), std::end(str)); - } - - template - inline void parse(T& object, std::basic_istream& stream) - { - std::istreambuf_iterator first{stream}, last; - parse(object, first, last); - } -} - -// json-cpp/parse.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/std_types.hpp begin - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//---------------------------------------------------------------------- -// json-cpp/generate.hpp begin - -#include -#include - -//---------------------------------------------------------------------- -// json-cpp/details/string_writer.hpp begin - -#include - -namespace jsoncpp { namespace details -{ - template - inline void writeString(const std::basic_string& str, Sink&& sink) - { - sink('"'); - for (auto iter = std::begin(str), last = std::end(str); iter != last; ++iter) - { - switch (char32_t ch = static_cast(*iter)) - { - case '"': sink('\\'); sink('"'); break; - case '\\': sink('\\'); sink('\\'); break; - case '\b': sink('\\'); sink('b'); break; - case '\f': sink('\\'); sink('f'); break; - case '\n': sink('\\'); sink('n'); break; - case '\r': sink('\\'); sink('r'); break; - case '\t': sink('\\'); sink('t'); break; - default: - if (ch < '\x20') - { - const auto table = "0123456789ABCDEF"; - unsigned n = static_cast(ch); - sink('\\'); - sink('u'); - sink('0'); - sink('0'); - sink(table[n >> 4]); - sink(table[n & 15]); - } - else - { - sink(static_cast(ch)); - } - } - } - sink('"'); - } -}} - -// json-cpp/details/string_writer.hpp end -//---------------------------------------------------------------------- - -namespace jsoncpp -{ - template - class Stream>> - { - public: - using this_type = Generator>; - - explicit Stream(Sink& sink) : m_sink(&sink) {} - - void objectBegin() - { - (*m_sink) << "{"; - } - - void fieldName(const char* name) - { - (*m_sink) << '"' << name << "\": "; - // TODO: use writeString (?) - } - - template - void fieldName(const std::basic_string& name) - { - (*m_sink) << '"' << name << "\": "; - // TODO: use writeString (?) - } - - void separator() - { - (*m_sink) << ", "; - } - - void objectEnd() - { - (*m_sink) << '}'; - } - - void arrayBegin() - { - (*m_sink) << '['; - } - - void arrayEnd() - { - (*m_sink) << ']'; - } - - friend void serialize(this_type& stream, std::nullptr_t) - { - (*stream.m_sink) << "null"; - } - - friend void serialize(this_type& stream, bool value) - { - (*stream.m_sink) << (value ? "true" : "false"); - } - - template - friend typename std::enable_if::value>::type serialize(this_type& stream, T& value) - { - (*stream.m_sink) << value; - } - - template - friend void serialize(this_type& stream, const std::basic_string& value) - { - details::writeString(value, [&stream](char c){ stream.m_sink->put(c); }); - } - - private: - Sink* m_sink; - }; - - template - inline void writePointer(Generator& generator, Pointer& ptr) - { - if (ptr) - { - serialize(generator, *ptr); - } - else - { - serialize(generator, nullptr); - } - } - - template - inline void writeRange(Generator& generator, Range& range) - { - generator.arrayBegin(); - - auto iter = std::begin(range); - const auto& last = std::end(range); - if (iter != last) - { - for (;;) - { - serialize(generator, *iter); - - ++iter; - if (iter == last) - break; - - generator.separator(); - } - } - - generator.arrayEnd(); - } - - template - inline std::string to_string(const T& object) - { - std::ostringstream rawStream; - Generator> stream{rawStream}; - serialize(stream, const_cast(object)); - return rawStream.str(); - } -} - -// json-cpp/generate.hpp end -//---------------------------------------------------------------------- - -namespace jsoncpp -{ - template - inline void serialize(Parser& parser, std::shared_ptr& obj) - { - if (parser.getType() != jsoncpp::Type::Null) - { - obj = std::make_shared(); - serialize(parser, *obj); - } - else - { - obj.reset(); - } - } - - template - inline void serialize(Generator& generator, std::shared_ptr& obj) - { - writePointer(generator, obj); - } - - template - inline void serialize(Parser& parser, std::unique_ptr& obj) - { - if (parser.getType() != jsoncpp::Type::Null) - { - obj->reset(new T()); - serialize(parser, *obj); - } - else - { - obj.reset(); - } - } - - template - inline void serialize(Generator& generator, std::unique_ptr& obj) - { - writePointer(generator, obj); - } - - namespace details - { - template - inline void serializeContainer(Parser& parser, C& c) - { - c.clear(); - - parseArray(parser, [&] - { - c.emplace_back(); - serialize(parser, c.back()); - }); - } - - template - inline void serializeContainer(Generator& generator, C& c) - { - writeRange(generator, c); - } - - template - inline void serializeSet(Parser& parser, C& c) - { - c.clear(); - - parseArray(parser, [&] - { - typename C::value_type value; - serialize(parser, value); - c.insert(value); - }); - } - - template - inline void serializeSet(Generator& generator, C& c) - { - writeRange(generator, c); - } - - template - inline void serializeStrMap(Parser& parser, C& c) - { - c.clear(); - - parseObject(parser, [&](const std::string& name) - { - serialize(parser, c[name]); - }); - } - - template - inline void serializeStrMap(Generator& generator, C& c) - { - generator.objectBegin(); - - auto iter = std::begin(c); - const auto& last = std::end(c); - if (iter != last) - { - for (;;) - { - generator.fieldName(iter->first); - serialize(generator, iter->second); - - ++iter; - if (iter == last) - break; - - generator.separator(); - } - } - - generator.objectEnd(); - } - } - - template - inline void serialize(Stream& stream, std::vector& arr) - { details::serializeContainer(stream, arr); } - - template - inline void serialize(Stream& stream, std::list& arr) - { details::serializeContainer(stream, arr); } - - template - inline void serialize(Stream& stream, std::forward_list& arr) - { details::serializeContainer(stream, arr); } - - template - inline void serialize(Stream& stream, std::deque& arr) - { details::serializeContainer(stream, arr); } - - template - inline void serialize(Stream& stream, std::set& arr) - { details::serializeSet(stream, arr); } - - template - inline void serialize(Stream& stream, std::unordered_set& arr) - { details::serializeSet(stream, arr); } - - template - inline void serialize(Stream& stream, std::map& t) - { details::serializeStrMap(stream, t); } - - template - inline void serialize(Stream& stream, std::unordered_map& t) - { details::serializeStrMap(stream, t); } -} -// json-cpp/std_types.hpp end -//---------------------------------------------------------------------- - -//---------------------------------------------------------------------- -// json-cpp/serialization_helpers.hpp begin - -#include -#include - -namespace jsoncpp -{ - namespace details - { - template - inline void writeField(Generator& generator, const char* name, T& value) - { - generator.fieldName(name); - serialize(generator, value); - } - - template - inline void writeField(Generator& generator, const char* name, T& value, F&&... fieldsDef) - { - writeField(generator, name, value); - generator.separator(); - writeField(generator, fieldsDef...); - } - - template - class FieldsTable - { - public: - template - FieldsTable(F&&... fieldsDef) - { - m_map.reserve(sizeof...(fieldsDef) / 2); - add(1, fieldsDef...); - } - - struct FieldInfo - { - template - FieldInfo(T&, std::size_t idx) - { - m_fieldIdx = idx; - m_parseFn = [](ParserT& parser, void* fieldPtr) - { - serialize(parser, static_cast(*reinterpret_cast(fieldPtr))); - }; - } - - std::size_t m_fieldIdx; - void(*m_parseFn)(ParserT& parser, void* fieldPtr); - }; - - const FieldInfo* find(const std::string& name) const - { - auto it = m_map.find(name); - return it == m_map.end() ? nullptr : &it->second; - } - - private: - template - void add(std::size_t idx, const char* name, T& value, F&&... otherFields) - { - m_map.emplace(name, FieldInfo(value, idx)); - add(idx + 2, otherFields...); - } - - void add(std::size_t /*idx*/) {} - - std::unordered_map m_map; - }; - - inline void* makePtrs(const char*) { return nullptr; } - - template - inline void* makePtrs(T& obj) { return &obj; } - } - - template - inline void fields(Cls&, Parser& parser, F&&... fieldsDef) - { - std::array ptrs{details::makePtrs(fieldsDef)...}; - - static const details::FieldsTable> table{fieldsDef...}; - - auto&& handler = [&](const std::string& fieldName) - { - auto fieldInfo = table.find(fieldName); - if (fieldInfo == nullptr) - throw parser.makeError(ParserError::UnknownField); - - auto fieldPtr = ptrs[fieldInfo->m_fieldIdx]; - fieldInfo->m_parseFn(parser, fieldPtr); - }; - - parseObject(parser, handler); - } - - template - inline void fields(Cls&, Generator& generator, F&&... fieldsDef) - { - generator.objectBegin(); - details::writeField(generator, fieldsDef...); - generator.objectEnd(); - } -} - -// json-cpp/serialization_helpers.hpp end -//---------------------------------------------------------------------- - -// json-cpp.hpp end -//---------------------------------------------------------------------- - diff --git a/util/make_pak.cc b/util/make_pak.cc index f09c17d..e858155 100644 --- a/util/make_pak.cc +++ b/util/make_pak.cc @@ -1,58 +1,52 @@ #include #include -#include #include #include +#include -#include "json-cpp.hpp" -#include "../fast/dummyvector.h" +#include "json-cpp/single_include/json-cpp.hpp" #include "../fast/hypergraph.hh" +#include "../fast/weaver.hh" using namespace std; struct DummyNode { - size_t id; - string cat; + size_t id; + string symbol; vector span; }; struct DummyEdge { - size_t head; - string rule; - vector tails; - DummyVector f; - score_t weight; + size_t head_id; + size_t rule_id; + vector tails_ids; + string f; + score_t score; }; struct DummyHg { + vector rules; vector nodes; vector edges; - DummyVector weights; }; template inline void serialize(jsoncpp::Stream& stream, DummyNode& o) { - fields(o, stream, "id", o.id, "cat", o.cat, "span", o.span); + fields(o, stream, "id", o.id, "symbol", o.symbol, "span", o.span); } template inline void serialize(jsoncpp::Stream& stream, DummyEdge& o) { - fields(o, stream, "head", o.head, "rule", o.rule, "tails", o.tails, "f", o.f, "weight", o.weight); + fields(o, stream, "head", o.head_id, "rule", o.rule_id, "tails", o.tails_ids, "score", o.score); } template inline void serialize(jsoncpp::Stream& stream, DummyHg& o) { - fields(o, stream, "nodes", o.nodes, "edges", o.edges, "weights", o.weights); -} - -template inline void -serialize(jsoncpp::Stream& stream, DummyVector& o) -{ - fields(o, stream, "EgivenFCoherent", o.EgivenFCoherent, "SampleCountF", o.SampleCountF, "CountEF", o.CountEF, "MaxLexFgivenE", o.MaxLexFgivenE, "MaxLexEgivenF", o.MaxLexEgivenF, "IsSingletonF", o.IsSingletonF, "IsSingletonFE", o.IsSingletonFE, "LanguageModel", o.LanguageModel, "LanguageModel_OOV", o.LanguageModel_OOV, "PassThrough", o.PassThrough, "PassThrough_1", o.PassThrough_1, "PassThrough_2", o.PassThrough_2, "PassThrough_3", o.PassThrough_3, "PassThrough_4", o.PassThrough_4, "PassThrough_5", o.PassThrough_5, "PassThrough_6", o.PassThrough_6, "WordPenalty", o.WordPenalty, "Glue", o.Glue); + fields(o, stream, "rules", o.rules, "nodes", o.nodes, "edges", o.edges); } int @@ -63,44 +57,46 @@ main(int argc, char** argv) string json_str((istreambuf_iterator(ifs) ), (istreambuf_iterator())); DummyHg hg; + vector rules; + hg.rules = rules; vector nodes; hg.nodes = nodes; vector edges; hg.edges = edges; - DummyVector w; - hg.weights = w; jsoncpp::parse(hg, json_str); - // convert objects + // convert to proper objects vector nodes_conv; - for (auto it = hg.nodes.begin(); it != hg.nodes.end(); ++it) { + for (const auto it: hg.nodes) { Hg::Node* n = new Hg::Node; - n->id = it->id; - n->symbol = it->cat; - n->left = it->span[0]; - n->right = it->span[1]; + n->id = it.id; + n->symbol = it.symbol; + n->left = it.span[0]; + n->right = it.span[1]; nodes_conv.push_back(n); } vector edges_conv; - for (auto it = hg.edges.begin(); it != hg.edges.end(); ++it) { + for (const auto it: hg.edges) { Hg::Edge* e = new Hg::Edge; - e->head_id_ = it->head; - e->tails_ids_ = it->tails; - e->score = it->weight; - e->rule = it->rule; - e->f = it->f; + e->head_id_ = it.head_id; + e->tails_ids_ = it.tails_ids; + e->score = it.score; + e->rule_id_ = it.rule_id; edges_conv.push_back(e); } // write to msgpack FILE* file = fopen(argv[2], "wb"); msgpack::fbuffer fbuf(file); + msgpack::pack(fbuf, hg.rules.size()); msgpack::pack(fbuf, hg.nodes.size()); msgpack::pack(fbuf, hg.edges.size()); - for (auto it = nodes_conv.begin(); it != nodes_conv.end(); ++it) - msgpack::pack(fbuf, **it); - for (auto it = edges_conv.begin(); it != edges_conv.end(); ++it) - msgpack::pack(fbuf, **it); + for (const auto it: hg.rules) + msgpack::pack(fbuf, it); + for (const auto it: nodes_conv) + msgpack::pack(fbuf, *it); + for (const auto it: edges_conv) + msgpack::pack(fbuf, *it); fclose(file); return 0; diff --git a/util/read_pak.cc b/util/read_pak.cc index d4bff91..afd6e6a 100644 --- a/util/read_pak.cc +++ b/util/read_pak.cc @@ -9,7 +9,6 @@ int main(int argc, char** argv) { ifstream ifs(argv[1]); - size_t i = 0, nn, ne; msgpack::unpacker pac; while(true) { pac.reserve_buffer(32*1024); -- cgit v1.2.3