rewritten grammar

author: Patrick Simianer <p@simianer.de> 2014-08-23 22:59:16 +0100
committer: Patrick Simianer <p@simianer.de> 2014-08-23 22:59:16 +0100
commit: cef65063cec641a93973b38a48e100fdd115db44 (patch)
tree: 32d5f10757e021a9fad01156fbff62a96212f006
parent: 190f68c880eb27506669e95e2bc0493e2ec42c4c (diff)
20 files changed, 511 insertions, 391 deletions
diff --git a/.gitignore b/.gitignore
index 94218e0..d8a671e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ fast/test_grammar
 fast/test_sparse_vector
 util/make_pak
 util/read_pak
+fast/gperftools-2.1/
diff --git a/fast/Makefile b/fast/Makefile
index 9e88076..1a7f5b9 100644
--- a/fast/Makefile
+++ b/fast/Makefile
@@ -1,24 +1,15 @@
 COMPILER=g++
 CFLAGS=-std=c++11 -O3
+TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
 
 
-all: grammar.o hypergraph.o main.cc
-	$(COMPILER) $(CFLAGS) -std=c++11 -lstdc++ -lm -lmsgpack grammar.o hypergraph.o main.cc -o fast_weaver
+all: hypergraph.o main.cc
+	$(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver
 
-hypergraph.o: hypergraph.cc hypergraph.hh grammar.o semiring.hh sparse_vector.hh weaver.hh
-	$(COMPILER) $(CFLAGS) -g -c hypergraph.cc
-
-grammar.o: grammar.cc grammar.hh sparse_vector.hh util.hh
-	$(COMPILER) $(CFLAGS) -g -c grammar.cc
-
-test: test_grammar test_sparse_vector
-
-test_grammar: test_grammar.cc grammar.o
-	$(COMPILER) $(CFLAGS) -lstdc++ -lm grammar.o test_grammar.cc -o test_grammar
-
-test_sparse_vector: test_sparse_vector.cc sparse_vector.hh
-	$(COMPILER) $(CFLAGS) -lstdc++ -lm test_sparse_vector.cc -o test_sparse_vector
+hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh
+	$(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc
 
 clean:
-	rm -f fast_weaver hypergraph.o grammar.o test_grammar test_sparse_vector
+	rm -f fast_weaver
+	rm -f hypergraph.o parse.o
 
diff --git a/fast/README.md b/fast/README.md
index 1d6bd04..f92245b 100644
--- a/fast/README.md
+++ b/fast/README.md
@@ -2,7 +2,6 @@ TODO
   * sparse vector (unordered_map) -> where to store?
   * parser
   * Rule -> ChartItem -> Node ?
-  * viterbi path/string
   * k-best
   * other semirings
   * include language model
@@ -34,4 +33,6 @@ http://www.osl.iu.edu/research/mtl/download.php3
 http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
 https://software.intel.com/en-us/tbb_4.2_doc
 http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+http://www.sgi.com/tech/stl/Rope.html
+http://www.cs.unc.edu/Research/compgeom/gzstream/
 
diff --git a/fast/grammar.cc b/fast/grammar.cc
deleted file mode 100644
index a003eb4..0000000
--- a/fast/grammar.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-#include "grammar.hh"
-
-
-namespace G {
-
-/*
- * G::NT
- *
- */
-NT::NT(string& s)
-{
-  s.erase(0, 1); s.pop_back(); // remove '[' and ']'
-  istringstream ss(s);
-  if (ss >> index) { // [i]
-    symbol = "";
-    index = stoi(s);
-
-    return;
-  } else { // [X]
-    symbol = s;
-    index = 0;
-
-    return;
-  }
-  string buf;
-  size_t j = 0;
-  index = 0; // default
-  while (ss.good() && getline(ss, buf, ',')) {
-    if (j == 0) {
-      symbol = buf;
-    } else {
-      index = stoi(buf);
-    }
-    j++;
-  }
-}
-
-string
-NT::repr() const
-{
-  ostringstream os;
-  os << "NT<" << symbol << "," << index << ">";
-
-  return os.str();
-}
-
-string
-NT::escaped() const
-{
-  ostringstream os;
-  os << "[" << symbol;
-  if (index > 0)
-    os << "," << index;
-  os << "]";
-
-  return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const NT& nt)
-{
-  return os << nt.repr();
-}
-
-/*
- * G::T
- *
- */
-T::T(const string& s)
-{
-  word = s;
-}
-
-string
-T::repr() const
-{
-  ostringstream os;
-  os << "T<" << word << ">";
-
-  return os.str();
-}
-
-string
-T::escaped() const
-{
-  return util::json_escape(word);
-}
-
-ostream&
-operator<<(ostream& os, const T& t)
-{
-  return os << t.repr();
-}
-
-
-/*
- * G::Item
- *
- * Better solve this by inheritance
- *  -> rhs, target as vector<base class> ?
- *
- */
-Item::Item(string& s)
-{
-  if (s.front() == '[' && s.back() == ']') {
-    type = NON_TERMINAL;
-    nt = new NT(s);
-  } else {
-    type = TERMINAL;
-    t = new T(s);
-  }
-}
-
-string
-Item::repr() const
-{
-  ostringstream os;
-  if (type == TERMINAL)
-    os << t->repr();
-  else
-    os << nt->repr();
-
-  return os.str();
-}
-
-string
-Item::escaped() const
-{
-  ostringstream os;
-  if (type == TERMINAL)
-    os << t->escaped();
-  else
-    os << nt->escaped();
-
-  return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const Item& i)
-{
-  return os << i.repr();
-}
-
-/*
- * G::Rule
- *
- */
-Rule::Rule(const string& s)
-{
-  from_s(this, s);
-}
-
-void
-Rule::from_s(Rule* r, const string& s)
-{
-  stringstream ss(s);
-  size_t j = 0;
-  string buf;
-  r->arity = 0;
-  size_t index = 1;
-  vector<G::NT*> rhs_nt;
-  r->f = new Sv::SparseVector<string, score_t>();
-  while (ss >> buf) {
-    if (buf == "|||") { j++; continue; }
-    if (j == 0) {        // LHS
-      r->lhs = new NT(buf);
-    } else if (j == 1) { // RHS
-      r->rhs.push_back(new Item(buf));
-      if (r->rhs.back()->type == NON_TERMINAL) {
-        rhs_nt.push_back(r->rhs.back()->nt);
-        r->arity++;
-      }
-    } else if (j == 2) { // TARGET
-      r->target.push_back(new Item(buf));
-      if (r->target.back()->type == NON_TERMINAL) {
-        r->order.insert(make_pair(index, r->target.back()->nt->index));
-        if (r->target.back()->nt->symbol == "")
-          r->target.back()->nt->symbol = rhs_nt[r->target.back()->nt->index-1]->symbol;
-        index++;
-      }
-    } else if (j == 3) { // F TODO
-      Sv::SparseVector<string, score_t>::from_s(r->f, buf); // FIXME this is slow!!!
-    } else if (j == 4) { // A TODO
-    } else { 
-      // ERROR
-    }
-    if (j == 4) break;
-  }
-}
-
-string
-Rule::repr() const
-{
-  ostringstream os;
-  os << "Rule<lhs=" << lhs->repr() << \
-   ", rhs:{";
-  for (auto it = rhs.begin(); it != rhs.end(); it++) {
-    os << (**it).repr();
-    if (next(it) != rhs.end()) os << " ";
-  }
-  os << "}, target:{";
-  for (auto it = target.begin(); it != target.end(); it++) {
-    os << (**it).repr();
-    if (next(it) != target.end()) os << " ";
-  }
-  os << "}" \
-   ", f:" << f->repr() << \
-   ", arity=" << arity << \
-   ", map:" << "TODO" << \
-   ">";
-
-  return os.str();
-}
-
-string
-Rule::escaped() const
-{
-  ostringstream os;
-  os << lhs->escaped() << " ||| ";
-  for (auto it = rhs.begin(); it != rhs.end(); it++) {
-    os << (**it).escaped();
-    if (next(it) != rhs.end()) os << " ";
-  }
-  os << " ||| ";
-  for (auto it = target.begin(); it != target.end(); it++) {
-    os << (**it).escaped();
-    if (next(it) != target.end()) os << " ";
-  }
-  os << " ||| ";
-  os << f->escaped();
-  os << " ||| ";
-  os << "TODO(alignment)";
-
-  return os.str();
-}
-
-ostream&
-operator<<(ostream& os, const Rule& r)
-{
-  return os << r.repr();
-}
-
-/*
- * G::Grammmar
- *
- */
-Grammar::Grammar(const string& fn)
-{
-  ifstream ifs(fn);
-  string line;
-  while (getline(ifs, line)) {
-    G::Rule* r = new G::Rule(line);
-    rules.push_back(r);
-    if (r->arity == 0)
-      flat.push_back(r);
-    else if (r->rhs.front()->type == NON_TERMINAL)
-      start_nt.push_back(r);
-    else
-      start_t.push_back(r);
-  }
-}
-
-ostream&
-operator<<(ostream& os, const Grammar& g)
-{
-  for (const auto it: g.rules)
-    os << it->repr() << endl;
-
-  return os;
-}
-
-} // namespace G
-
diff --git a/fast/grammar.hh b/fast/grammar.hh
index 1b9ac5a..e5acb8a 100644
--- a/fast/grammar.hh
+++ b/fast/grammar.hh
@@ -1,13 +1,14 @@
 #pragma once
 
-#include <fstream>
 #include <iostream>
+#include <fstream>
+#include <map>
 #include <sstream>
 #include <string>
-#include <map>
-#include <msgpack.hpp>
 #include <vector>
 
+#include <msgpack.hpp>
+
 #include "sparse_vector.hh"
 #include "util.hh"
 
@@ -16,46 +17,138 @@ using namespace std;
 
 namespace G {
 
-struct NT {
-  string symbol;
-  size_t index;
+enum item_type {
+  UNDEFINED,
+  TERMINAL,
+  NON_TERMINAL
+};
 
-  NT() {};
-  NT(string& s);
+struct Item {
+  virtual size_t index() const { return 0; }
+  virtual symbol_t symbol() const { return ""; }
+  virtual item_type type() const { return UNDEFINED; }
 
-  string repr() const;
-  string escaped() const;
+  virtual ostream& repr(ostream& os) const { return os << "Item<>"; }
+  virtual ostream& escaped(ostream& os) const { return os << ""; }
 
-  friend ostream& operator<<(ostream& os, const NT& t);
+  friend ostream&
+  operator<<(ostream& os, const Item& i)
+  {
+    return i.repr(os);
+  };
 };
 
-struct T {
-  string word; // use word ids instead?
+struct NT : public Item {
+  symbol_t symbol_;
+  size_t index_;
+
+  NT() {}
+
+  NT(string const& s)
+  {
+    index_ = 0; // default
+    string t(s);
+    t.erase(0, 1); t.pop_back(); // remove '[' and ']'
+    istringstream ss(s);
+    if (ss >> index_) { // [i]
+      symbol_ = "";
+      index_ = stoi(s);
+
+      return;
+    } else { // [X]
+      symbol_ = s;
+
+      return;
+    }
+    string buf;
+    size_t j = 0;
+    while (ss.good() && getline(ss, buf, ',')) {
+      if (j == 0) {
+        symbol_ = buf;
+      } else {
+        index_ = stoi(buf);
+      }
+      j++;
+    }
+  }
 
-  T(const string& s);
+  virtual size_t index() const { return index_; }
+  virtual symbol_t symbol() const { return symbol_; }
+  virtual item_type type() { return NON_TERMINAL; }
 
-  string repr() const;
-  string escaped() const;
+  virtual ostream&
+  repr(ostream& os) const
+  {
+    return os << "NT<" << symbol_ << "," << index_ << ">";
+  }
 
-  friend ostream& operator<<(ostream& os, const NT& nt);
+  virtual ostream&
+  escaped(ostream& os) const
+  {
+    os << "[" << symbol_;
+    if (index_ > 0)
+      os << "," << index_;
+    os << "]";
+
+    return os;
+  }
 };
 
-enum item_type {
-  NON_TERMINAL,
-  TERMINAL
+struct T : public Item {
+  symbol_t symbol_;
+
+  T(string const& s)
+  {
+    symbol_ = s;
+  }
+
+  virtual symbol_t symbol() const { return symbol_; }
+  virtual item_type type() { return TERMINAL; }
+
+  virtual ostream&
+  repr(ostream& os) const
+  {
+    return os << "T<" << symbol_ << ">";
+  }
+
+  virtual ostream&
+  escaped(ostream& os) const
+  {
+    os << util::json_escape(symbol_);
+  }
 };
 
-struct Item {
- item_type type;
-       NT* nt;
-        T* t;
+struct Vocabulary
+{
+  unordered_map<symbol_t, size_t> m_;
+  vector<Item*> items_;
 
-  Item(string& s);
+  bool
+  is_non_terminal(string const& s)
+  {
+    return s.front() == '[' && s.back() == ']';
+  }
 
-  string repr() const;
-  string escaped() const;
+  Item*
+  get(symbol_t const& s)
+  {
+    if (is_non_terminal(s))
+      return new NT(s);
+    if (m_.find(s) != m_.end())
+      return items_[m_[s]];
+    return add(s);
+  }
 
-  friend ostream& operator<<(ostream& os, const Item& i);
+  Item*
+  add(symbol_t const& s)
+  {
+    size_t next_index_ = items_.size();
+    T* item = new T(s);
+    items_.push_back(item);
+    m_[s] = next_index_;
+
+    return item;
+  }
 };
 
 struct Rule {
@@ -65,35 +158,157 @@ struct Rule {
                             size_t arity;
 Sv::SparseVector<string, score_t>* f;
                map<size_t, size_t> order;
-                            string as_str_; // FIXME
+                            string as_str_;
+
+  Rule() {}
+
+  Rule(string const& s, Vocabulary& vocab) { from_s(this, s, vocab); }
+
+  static void
+  from_s(Rule* r, string const& s, Vocabulary& vocab)
+  {
+    istringstream ss(s);
+    string buf;
+    size_t j = 0, i = 0;
+    r->arity = 0;
+    vector<NT*> rhs_non_terminals;
+    r->f = new Sv::SparseVector<string, score_t>();
+    while (ss >> buf) {
+      if (buf == "|||") { j++; continue; }
+      if (j == 0) {        // left-hand side
+        r->lhs = new NT(buf);
+      } else if (j == 1) { // right-hand side
+        Item* item = vocab.get(buf);
+        r->rhs.push_back(item);
+        if (item->type() == NON_TERMINAL) {
+          r->arity++;
+          rhs_non_terminals.push_back(reinterpret_cast<NT*>(item));
+        }
+      } else if (j == 2) { // target
+        Item* item = vocab.get(buf);
+        if (item->type() == NON_TERMINAL) {
+          r->order[i] = item->index();
+          i++;
+          if (item->symbol() == "") { // only [1], [2] ... on target
+            reinterpret_cast<NT*>(item)->symbol_ = \
+             rhs_non_terminals[item->index()-1]->symbol();
+          }
+        }
+        r->target.push_back(item);
+      } else if (j == 3) { // feature vector
+        Sv::SparseVector<string, score_t>::from_s(r->f, buf);
+        // FIXME: this is slow!!!          ^^^ 
+      } else if (j == 4) { // alignment
+      } else {
+        // error
+      }
+      if (j == 4) break;
+    }
+  }
 
-  Rule() {};
-  Rule(const string& s);
-  static void from_s(Rule* r, const string& s);
+  ostream&
+  repr(ostream& os) const
+  {
+    os << "Rule<lhs=";
+    lhs->repr(os);
+    os << ", rhs:{";
+    for (auto it = rhs.begin(); it != rhs.end(); it++) {
+      (**it).repr(os);
+      if (next(it) != rhs.end()) os << " ";
+    }
+    os << "}, target:{";
+    for (auto it = target.begin(); it != target.end(); it++) {
+      (**it).repr(os);
+      if (next(it) != target.end()) os << " ";
+    }
+    os << "}, f:";
+    f->repr(os);
+    os <<  ", arity=" << arity << \
+     ", map:" << "TODO" << \
+     ">";
 
-  string repr() const;
-  string escaped() const;
+    return os;
+  }
 
-  friend ostream& operator<<(ostream& os, const Rule& r);
+  ostream&
+  escaped(ostream& os) const
+  {
+    lhs->escaped(os);
+    os << " ||| ";
+    for (auto it = rhs.begin(); it != rhs.end(); it++) {
+      (**it).escaped(os);
+      if (next(it) != rhs.end()) os << " ";
+    }
+    os << " ||| ";
+    for (auto it = target.begin(); it != target.end(); it++) {
+      (**it).escaped(os);
+      if (next(it) != target.end()) os << " ";
+    }
+    os << " ||| ";
+    f->escaped(os);
+    os << " ||| ";
+    os << "TODO";
 
-  void prep_for_serialization_() { as_str_ = escaped(); }; // FIXME
+    return os;
+  };
 
-  MSGPACK_DEFINE(as_str_); // TODO
+  friend ostream&
+  operator<<(ostream& os, const Rule& r)
+  {
+    return r.repr(os);
+  };
+
+  // --
+  void
+  prep_for_serialization_()
+  {
+    ostringstream os;
+    escaped(os);
+    as_str_ = os.str();
+  };
+  MSGPACK_DEFINE(as_str_);
+  //             ^^^ FIXME
 };
 
 struct Grammar {
   vector<Rule*> rules;
   vector<Rule*> flat;
-  vector<Rule*> start_nt;
-  vector<Rule*> start_t;
+  vector<Rule*> start_non_terminal;
+  vector<Rule*> start_terminal;
+
+  Grammar() {}
+
+  Grammar(string const& fn, Vocabulary& vocab)
+  {
+    ifstream ifs(fn);
+    string line;
+    while (getline(ifs, line)) {
+      G::Rule* r = new G::Rule(line, vocab);
+      rules.push_back(r);
+      if (r->arity == 0)
+        flat.push_back(r);
+      else if (r->rhs.front()->type() == NON_TERMINAL)
+        start_non_terminal.push_back(r);
+      else
+        start_terminal.push_back(r);
+    }
+  }
 
-  Grammar() {};
-  Grammar(const string& fn);
+  void add_glue();
+  //   ^^^ TODO
+  void add_pass_through(const string& input);
+  //   ^^^ TODO
 
-  void add_glue(); // TODO
-  void add_pass_through(const string& input); // TODO
+  friend ostream&
+  operator<<(ostream& os, Grammar const& g)
+  {
+    for (const auto it: g.rules) {
+      it->repr(os);
+      os << endl;
+    }
 
-  friend ostream& operator<<(ostream& os, const Grammar& g);
+    return os;
+  }
 };
 
 } // namespace G
diff --git a/fast/hypergraph.cc b/fast/hypergraph.cc
index a9a44f9..d9a51a5 100644
--- a/fast/hypergraph.cc
+++ b/fast/hypergraph.cc
@@ -69,11 +69,13 @@ viterbi(Hypergraph& hg)
 void
 viterbi_path(Hypergraph& hg, Path& p)
 {
-  list<Node*>::iterator root = \
+  //list<Node*>::iterator root = \
     find_if(hg.nodes.begin(), hg.nodes.end(), \
     [](Node* n) { return n->incoming.size() == 0; });
+  list<Node*>::iterator root = hg.nodes.begin();
 
-  Hg::topological_sort(hg.nodes, root); // FIXME do I need to do this when reading from file?
+  //Hg::topological_sort(hg.nodes, root);
+  //  ^^^ FIXME do I need to do this when reading from file?
   Semiring::Viterbi<score_t> semiring;
   Hg::init(hg.nodes, root, semiring);
 
@@ -111,11 +113,11 @@ derive(const Path& p, const Node* cur, vector<string>& carry)
 
   unsigned j = 0;
   for (auto it: next->rule->target) {
-    if (it->type == G::NON_TERMINAL) {
+    if (it->type() == G::NON_TERMINAL) {
       derive(p, next->tails[next->rule->order[j]], carry);
       j++;
     } else {
-      carry.push_back(it->t->word);
+      carry.push_back(it->symbol());
     }
   }
 }
@@ -123,7 +125,7 @@ derive(const Path& p, const Node* cur, vector<string>& carry)
 namespace io {
 
 void
-read(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
+read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn)
 {
   ifstream ifs(fn);
   size_t i = 0, r, n, e;
@@ -145,7 +147,7 @@ read(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
         string s;
         o.convert(&s);
         G::Rule* rule = new G::Rule;
-        G::Rule::from_s(rule, s);
+        G::Rule::from_s(rule, s, vocab);
         rules.push_back(rule);
       } else if (i > r+2 && i <= r+n+2) {
         Node* n = new Node;
@@ -193,7 +195,7 @@ write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn) // FIXME
 }
 
 void
-manual(Hypergraph& hg, vector<G::Rule*>& rules)
+manual(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab)
 {
   // nodes
   Node* a = new Node; a->id = 0; a->symbol = "root"; a->left = -1;    a->right = -1; a->mark = 0;
@@ -228,7 +230,7 @@ manual(Hypergraph& hg, vector<G::Rule*>& rules)
   rule_strs.push_back("[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| ||| ");
 
   for (auto it: rule_strs) {
-    rules.push_back(new G::Rule(it));
+    rules.push_back(new G::Rule(it, vocab));
     rules.back()->f = new Sv::SparseVector<string, score_t>();
   }
 
@@ -349,8 +351,9 @@ operator<<(ostream& os, const Edge& e)
     "Edge<head=" << e.head->id << \
     ", tails=[" << _.str() << "]" \
     ", score=" << e.score << \
-    ", rule:'" << e.rule->escaped() << "'" <<  \
-    ", f=" << "TODO" << \
+    ", rule:'";
+  e.rule->escaped(os);
+  os << "', f=" << "TODO" << \
     ", arity=" << e.arity << \
     ", mark=" << e.mark << ">";
   return os;
diff --git a/fast/hypergraph.hh b/fast/hypergraph.hh
index 299a62d..1c48a88 100644
--- a/fast/hypergraph.hh
+++ b/fast/hypergraph.hh
@@ -34,7 +34,7 @@ struct Edge {
       unsigned int mark = 0;
 
   inline bool is_marked() { return mark >= arity; }
-  friend ostream& operator<<(ostream& os, const Edge& s);
+  friend ostream& operator<<(ostream& os, const Edge& e);
 
           size_t head_id_;
   vector<size_t> tails_ids_; // node ids
@@ -89,7 +89,7 @@ derive(const Path& p, const Node* cur, vector<string>& carry);
 namespace io {
 
 void
-read(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn); // FIXME
+read(Hypergraph& hg, vector<G::Rule*>& rules, G::Vocabulary& vocab, const string& fn); // FIXME
 
 void
 write(Hypergraph& hg, vector<G::Rule*>& rules, const string& fn); // FIXME
diff --git a/fast/main.cc b/fast/main.cc
index 08fcfcf..825ad59 100644
--- a/fast/main.cc
+++ b/fast/main.cc
@@ -6,8 +6,9 @@ int
 main(int argc, char** argv)
 {
   Hg::Hypergraph hg;
+  G::Vocabulary y;
   G::Grammar g;
-  Hg::io::read(hg, g.rules, argv[1]);
+  Hg::io::read(hg, g.rules, y, argv[1]);
   //Hg::io::manual(hg, g.rules);
   clock_t begin = clock();
   Hg::Path p;
diff --git a/fast/parse.cc b/fast/parse.cc
new file mode 100644
index 0000000..06c9fa0
--- /dev/null
+++ b/fast/parse.cc
@@ -0,0 +1,55 @@
+#include "parse.hh"
+
+
+namespace Parse {
+
+
+} // 
+
+
+vector<G::T> tokenize(string s)
+{
+  istringstream ss(s);
+  vector<G::T> res; 
+  while (ss.good()) {
+    string t;
+    ss >> t;
+    G::T i(t);
+    cout << i.word << endl;
+    res.push_back(i); 
+  }
+  return res;
+}
+
+
+bool operator==(vector<G::Item> const& a, vector<G::Item> const& b)
+{
+  if (a.size() != b.size()) return false;
+  for (auto it: a)
+}
+
+int main(int argc, char** argv)
+{
+  string in("karten haie");
+  vector<G::T> tok = tokenize(in);
+  for (auto it: tok)
+    cout << it.word << ",";
+  cout << endl;
+  size_t n = tok.size();
+
+  G::Grammar g(argv[1]);
+
+  vector<Span> spans; 
+  Parse::visit(spans, 1, 0, 6);
+  for (auto it: spans) {
+    cout << "(" << it.first << "," << it.second << ")" << endl;
+  }
+
+  Parse::Chart active(n);
+  Parse::Chart passive(n);
+
+  //init(tok, n, active, passive, g);
+  
+  cout << *(g.flat.at(0)) << endl;
+}
+
diff --git a/fast/parse.hh b/fast/parse.hh
new file mode 100644
index 0000000..9fbcdea
--- /dev/null
+++ b/fast/parse.hh
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <vector>
+#include <utility>
+#include <sstream>
+#include <unordered_map>
+
+#include "grammar.hh"
+
+
+using namespace std;
+
+typedef pair<size_t,size_t> Span;
+namespace std {
+  template <>
+  struct hash<Span>
+  {
+    size_t
+    operator()(Span const& k) const
+    {
+      return ((hash<size_t>()(k.first)
+               ^ (hash<size_t>()(k.second) << 1)) >> 1);
+    }
+  };
+}
+
+namespace Parse {
+
+void visit(vector<Span>& p,
+           size_t i, size_t l, size_t r, size_t x=0)
+{
+  for (size_t span = i; span <= r-x; span++) {
+    for (size_t k = l; k <= r-span; k++) {
+      p.push_back(Span(k,k+span));
+    }
+  }
+}
+
+struct ChartItem
+{
+             Span span;
+   G::Rule const* rule;
+    vector<ChartItem*> tails;
+           size_t dot;
+
+  ChartItem(G::Rule* r)
+  {
+    rule = r;
+    dot = 0;
+  }
+
+  ChartItem(ChartItem const& o)
+  {
+    span.first = o.span.first; 
+    span.second = o.span.second; 
+    rule = o.rule;
+    for (auto it: o.tails)
+      tails.push_back(it);
+    dot = o.dot;
+  }
+};
+
+struct Chart
+{
+  size_t n_;
+  unordered_map<Span, vector<ChartItem*> > m_;
+  unordered_map<string,bool> b_;
+
+  vector<ChartItem*>& at(Span s)
+  {
+    return m_[s];
+  }
+
+  string h(ChartItem* item, Span s)
+  {
+    ostringstream ss;
+    ss << item->rule->lhs->symbol;
+    ss << s.first;
+    ss << s.second;
+    return ss.str();
+  }
+
+  void add(ChartItem* item, Span s)
+  {
+    m_[s].push_back(item); 
+    b_[h(item, s)] = true;
+  }
+
+  Chart(size_t n)
+  {
+  }
+};
+
+
+void init(vector<G::T> const& in, size_t n, Chart& active,  Chart& passive, G::Grammar const& g)
+{
+  for (auto rule: g.flat) {
+  }
+}
+
+
+} // 
+
diff --git a/fast/semiring.hh b/fast/semiring.hh
index 1c3ff1d..3f4ac08 100644
--- a/fast/semiring.hh
+++ b/fast/semiring.hh
@@ -1,7 +1,6 @@
 #pragma once
 
 
-// TODO: others
 namespace Semiring {
 
 template<typename T>
diff --git a/fast/sparse_vector.hh b/fast/sparse_vector.hh
index 3583240..9d815ff 100644
--- a/fast/sparse_vector.hh
+++ b/fast/sparse_vector.hh
@@ -20,6 +20,7 @@ struct SparseVector {
                    V zero = 0.f;
 
   SparseVector() {};
+
   SparseVector(string& s)
   {
     from_s(this, s);
@@ -147,10 +148,9 @@ struct SparseVector {
     }
   }
 
-  string
-  repr() const
+  ostream&
+  repr(ostream& os) const
   {
-    ostringstream os;
     os << "SparseVector<{";
     for (auto it = m_.cbegin(); it != m_.cend(); it++) {
       os << "'" << it->first << "'=" << it->second;
@@ -159,12 +159,11 @@ struct SparseVector {
     }
     os << "}>";
 
-    return os.str();
+    return os;
   };
 
-  string
-  escaped(bool quote_keys=false) const {
-    ostringstream os;
+  ostream&
+  escaped(ostream& os, bool quote_keys=false) const {
     for (auto it = m_.cbegin(); it != m_.cend(); it++) {
       if (quote_keys) os << '"';
       os << util::json_escape(it->first);
@@ -173,10 +172,14 @@ struct SparseVector {
       if (next(it) != m_.cend()) os << " ";
     }
 
-    return os.str();
+    return os;
   };
 
-  friend ostream& operator<<(ostream& os, const SparseVector& v) { return os << v.repr(); }
+  friend ostream&
+  operator<<(ostream& os, const SparseVector& v)
+  {
+    return v.repr(os);
+  }
 };
 
 } // namespace
diff --git a/fast/test/Makefile b/fast/test/Makefile
new file mode 100644
index 0000000..0140f63
--- /dev/null
+++ b/fast/test/Makefile
@@ -0,0 +1,16 @@
+COMPILER=g++
+CFLAGS=-std=c++11 -O3 -I../
+TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
+
+
+all: test_grammar test_sparse_vector
+
+test_grammar: test_grammar.cc ../grammar.hh
+	$(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_grammar.cc -o test_grammar
+
+test_sparse_vector: test_sparse_vector.cc ../sparse_vector.hh
+	$(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_sparse_vector.cc -o test_sparse_vector
+
+clean:
+	rm -f test_grammar test_sparse_vector
+
diff --git a/fast/test/test_grammar b/fast/test/test_grammar
new file mode 100755
index 0000000..088d55a
--- /dev/null
+++ b/fast/test/test_grammar
diff --git a/fast/test/test_grammar.cc b/fast/test/test_grammar.cc
new file mode 100644
index 0000000..bbe76e7
--- /dev/null
+++ b/fast/test/test_grammar.cc
@@ -0,0 +1,20 @@
+#include <fstream>
+
+#include "grammar.hh"
+
+using namespace std;
+
+
+int
+main(int argc, char** argv)
+{
+  G::Vocabulary y;
+  G::Grammar g(argv[1], y);
+  for (auto it: g.rules) {
+    it->escaped(cout);
+    cout << endl;
+  }
+
+  return 0;
+}
+
diff --git a/fast/test/test_sparse_vector b/fast/test/test_sparse_vector
new file mode 100755
index 0000000..c06fe9e
--- /dev/null
+++ b/fast/test/test_sparse_vector
diff --git a/fast/test_sparse_vector.cc b/fast/test/test_sparse_vector.cc
index 426bed1..426bed1 100644
--- a/fast/test_sparse_vector.cc
+++ b/fast/test/test_sparse_vector.cc
diff --git a/fast/test_grammar.cc b/fast/test_grammar.cc
deleted file mode 100644
index 3263edd..0000000
--- a/fast/test_grammar.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <fstream>
-
-#include "grammar.hh"
-
-using namespace std;
-
-
-int
-main(int argc, char** argv)
-{
-  G::Grammar g(argv[1]);
-  for (auto it: g.rules)
-    cout << it->escaped() << endl;
-
-  return 0;
-}
-
diff --git a/fast/util.hh b/fast/util.hh
index 2a28f16..c3e087e 100644
--- a/fast/util.hh
+++ b/fast/util.hh
@@ -7,23 +7,24 @@ using namespace std;
 
 namespace util {
 
-  inline string
-  json_escape(const string& s) { // FIXME: only inline?
-    ostringstream os;
-    for (auto it = s.cbegin(); it != s.cend(); it++) {
-      switch (*it) {
-        case '"':  os << "\\\""; break;
-        case '\\': os << "\\\\"; break;
-        case '\b': os << "\\b";  break;
-        case '\f': os << "\\f";  break;
-        case '\n': os << "\\n";  break;
-        case '\r': os << "\\r";  break;
-        case '\t': os << "\\t";  break;
-        default:   os << *it;    break;
-     }
-   }
-   return os.str();
- };
+inline string
+json_escape(const string& s) { // FIXME: only inline?
+  ostringstream os;
+  for (auto it = s.cbegin(); it != s.cend(); it++) {
+    switch (*it) {
+      case '"':  os << "\\\""; break;
+      case '\\': os << "\\\\"; break;
+      case '\b': os << "\\b";  break;
+      case '\f': os << "\\f";  break;
+      case '\n': os << "\\n";  break;
+      case '\r': os << "\\r";  break;
+      case '\t': os << "\\t";  break;
+      default:   os << *it;    break;
+    }
+  }
+
+  return os.str();
+}
 
 } // namespace util
 
diff --git a/fast/weaver.hh b/fast/weaver.hh
index e7c3238..39d5391 100644
--- a/fast/weaver.hh
+++ b/fast/weaver.hh
@@ -1,4 +1,5 @@
 #pragma once
 
 typedef double score_t;
+typedef string symbol_t;
author	Patrick Simianer <p@simianer.de>	2014-08-23 22:59:16 +0100
committer	Patrick Simianer <p@simianer.de>	2014-08-23 22:59:16 +0100
commit	cef65063cec641a93973b38a48e100fdd115db44 (patch)
tree	32d5f10757e021a9fad01156fbff62a96212f006
parent	190f68c880eb27506669e95e2bc0493e2ec42c4c (diff)