#pragma once #include #include #include #include #include #include #include #include "sparse_vector.hh" #include "util.hh" using namespace std; namespace G { enum item_type { UNDEFINED, TERMINAL, NON_TERMINAL }; struct Item { virtual size_t index() const { return 0; } virtual symbol_t symbol() const { return ""; } virtual item_type type() const { return UNDEFINED; } virtual ostream& repr(ostream& os) const { return os << "Item<>"; } virtual ostream& escaped(ostream& os) const { return os << ""; } friend ostream& operator<<(ostream& os, const Item& i) { return i.repr(os); }; }; struct NT : public Item { symbol_t symbol_; size_t index_; NT() {} NT(string const& s) { index_ = 0; // default string t(s); t.erase(0, 1); t.pop_back(); // remove '[' and ']' istringstream ss(s); if (ss >> index_) { // [i] symbol_ = ""; index_ = stoi(s); return; } else { // [X] symbol_ = s; return; } string buf; size_t j = 0; while (ss.good() && getline(ss, buf, ',')) { if (j == 0) { symbol_ = buf; } else { index_ = stoi(buf); } j++; } } virtual size_t index() const { return index_; } virtual symbol_t symbol() const { return symbol_; } virtual item_type type() { return NON_TERMINAL; } virtual ostream& repr(ostream& os) const { return os << "NT<" << symbol_ << "," << index_ << ">"; } virtual ostream& escaped(ostream& os) const { os << "[" << symbol_; if (index_ > 0) os << "," << index_; os << "]"; return os; } }; struct T : public Item { symbol_t symbol_; T(string const& s) { symbol_ = s; } virtual symbol_t symbol() const { return symbol_; } virtual item_type type() { return TERMINAL; } virtual ostream& repr(ostream& os) const { return os << "T<" << symbol_ << ">"; } virtual ostream& escaped(ostream& os) const { os << util::json_escape(symbol_); } }; struct Vocabulary { unordered_map m_; vector items_; bool is_non_terminal(string const& s) { return s.front() == '[' && s.back() == ']'; } Item* get(symbol_t const& s) { if (is_non_terminal(s)) return new NT(s); if (m_.find(s) != m_.end()) return items_[m_[s]]; return add(s); } Item* add(symbol_t const& s) { size_t next_index_ = items_.size(); T* item = new T(s); items_.push_back(item); m_[s] = next_index_; return item; } }; struct Rule { NT* lhs; vector rhs; vector target; size_t arity; Sv::SparseVector* f; map order; string as_str_; Rule() {} Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); } static void from_s(Rule* r, string const& s, Vocabulary& vocab) { istringstream ss(s); string buf; size_t j = 0, i = 0; r->arity = 0; vector rhs_non_terminals; r->f = new Sv::SparseVector(); while (ss >> buf) { if (buf == "|||") { j++; continue; } if (j == 0) { // left-hand side r->lhs = new NT(buf); } else if (j == 1) { // right-hand side Item* item = vocab.get(buf); r->rhs.push_back(item); if (item->type() == NON_TERMINAL) { r->arity++; rhs_non_terminals.push_back(reinterpret_cast(item)); } } else if (j == 2) { // target Item* item = vocab.get(buf); if (item->type() == NON_TERMINAL) { r->order[i] = item->index(); i++; if (item->symbol() == "") { // only [1], [2] ... on target reinterpret_cast(item)->symbol_ = \ rhs_non_terminals[item->index()-1]->symbol(); } } r->target.push_back(item); } else if (j == 3) { // feature vector Sv::SparseVector::from_s(r->f, buf); // FIXME: this is slow!!! ^^^ } else if (j == 4) { // alignment } else { // error } if (j == 4) break; } } ostream& repr(ostream& os) const { os << "Rulerepr(os); os << ", rhs:{"; for (auto it = rhs.begin(); it != rhs.end(); it++) { (**it).repr(os); if (next(it) != rhs.end()) os << " "; } os << "}, target:{"; for (auto it = target.begin(); it != target.end(); it++) { (**it).repr(os); if (next(it) != target.end()) os << " "; } os << "}, f:"; f->repr(os); os << ", arity=" << arity << \ ", map:" << "TODO" << \ ">"; return os; } ostream& escaped(ostream& os) const { lhs->escaped(os); os << " ||| "; for (auto it = rhs.begin(); it != rhs.end(); it++) { (**it).escaped(os); if (next(it) != rhs.end()) os << " "; } os << " ||| "; for (auto it = target.begin(); it != target.end(); it++) { (**it).escaped(os); if (next(it) != target.end()) os << " "; } os << " ||| "; f->escaped(os); os << " ||| "; os << "TODO"; return os; }; friend ostream& operator<<(ostream& os, const Rule& r) { return r.repr(os); }; // -- void prep_for_serialization_() { ostringstream os; escaped(os); as_str_ = os.str(); }; MSGPACK_DEFINE(as_str_); // ^^^ FIXME }; struct Grammar { vector rules; vector flat; vector start_non_terminal; vector start_terminal; Grammar() {} Grammar(string const& fn, Vocabulary& vocab) { ifstream ifs(fn); string line; while (getline(ifs, line)) { G::Rule* r = new G::Rule(line, vocab); rules.push_back(r); if (r->arity == 0) flat.push_back(r); else if (r->rhs.front()->type() == NON_TERMINAL) start_non_terminal.push_back(r); else start_terminal.push_back(r); } } void add_glue(); // ^^^ TODO void add_pass_through(const string& input); // ^^^ TODO friend ostream& operator<<(ostream& os, Grammar const& g) { for (const auto it: g.rules) { it->repr(os); os << endl; } return os; } }; } // namespace G