summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore13
-rw-r--r--.gitmodules2
-rw-r--r--Makefile58
-rw-r--r--README.md39
m---------external/json-cpp (renamed from util/json-cpp)0
-rw-r--r--fast/Makefile15
-rw-r--r--fast/README.md38
-rw-r--r--fast/parse.hh108
-rw-r--r--fast/test/Makefile19
-rwxr-xr-xfast/test/test_grammarbin60943 -> 0 bytes
-rwxr-xr-xfast/test/test_sparse_vectorbin44288 -> 0 bytes
-rw-r--r--prototype/grammar.rb (renamed from grammar.rb)2
-rw-r--r--prototype/hypergraph.rb (renamed from hg.rb)3
-rw-r--r--prototype/parse.rb (renamed from parse.rb)5
-rwxr-xr-xprototype/test_hg.rb (renamed from test/test_hg.rb)2
-rwxr-xr-xprototype/test_parse.rb (renamed from test/test_parse.rb)2
-rwxr-xr-xprototype/weaver.rb (renamed from main.rb)2
-rw-r--r--src/fast_weaver.cc (renamed from fast/main.cc)1
-rw-r--r--src/grammar.hh (renamed from fast/grammar.hh)26
-rw-r--r--src/hypergraph.cc (renamed from fast/hypergraph.cc)1
-rw-r--r--src/hypergraph.hh (renamed from fast/hypergraph.hh)3
-rw-r--r--src/make_pak.cc (renamed from util/make_pak.cc)5
-rw-r--r--src/parse.hh301
-rw-r--r--src/read_pak.cc (renamed from util/read_pak.cc)1
-rw-r--r--src/semiring.hh (renamed from fast/semiring.hh)0
-rw-r--r--src/sparse_vector.hh (renamed from fast/sparse_vector.hh)2
-rw-r--r--src/test_grammar.cc (renamed from fast/test/test_grammar.cc)1
-rw-r--r--src/test_parse.cc19
-rw-r--r--src/test_sparse_vector.cc (renamed from fast/test/test_sparse_vector.cc)1
-rw-r--r--src/types.hh (renamed from fast/weaver.hh)0
-rw-r--r--src/util.hh (renamed from fast/util.hh)2
-rw-r--r--util/Makefile14
-rwxr-xr-xutil/cdec2json.py96
33 files changed, 448 insertions, 333 deletions
diff --git a/.gitignore b/.gitignore
index d8a671e..8bb6628 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,7 @@
*.o
-fast/example/
-fast/fast_weaver
-fast/test_grammar
-fast/test_sparse_vector
-util/make_pak
-util/read_pak
-fast/gperftools-2.1/
+fast_weaver
+test_grammar
+test_sparse_vector
+make_pak
+read_pak
+external/gperftools-2.1/
diff --git a/.gitmodules b/.gitmodules
index 843caa2..7fe83f1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
[submodule "util/json-cpp"]
- path = util/json-cpp
+ path = external/json-cpp
url = https://github.com/ascheglov/json-cpp.git
diff --git a/Makefile b/Makefile
index f499591..56f89d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,53 @@
-json_examples:
- ./main.rb -w example/3/weights.init < example/3/in.sgm > example/3/3.json 2>/dev/null
- ./main.rb -w example/3/weights.init -l < example/3/in.sgm > example/3/3-with-glue.json 2>/dev/null
- ./main.rb -w example/glue/weights -l < example/glue/in.sgm > example/glue/glue.json 2>/dev/null
- ./main.rb -w example/toy/weights < example/toy/in.sgm > example/toy/toy.json 2>/dev/null
- ./main.rb -w example/toy/weights < example/toy/in-test.sgm > example/toy/toy-test.json 2>/dev/null
+COMPILER=clang
+CFLAGS=-std=c++11 -O3 -Wall
+TCMALLOC=$(shell pwd)/external/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
+SRC=src
+
+all: $(SRC)/hypergraph.o $(SRC)/fast_weaver.cc
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) $(SRC)/hypergraph.o \
+ $(SRC)/fast_weaver.cc \
+ -o fast_weaver
+
+$(SRC)/hypergraph.o: $(SRC)/hypergraph.cc $(SRC)/hypergraph.hh \
+ $(SRC)/semiring.hh $(SRC)/sparse_vector.hh \
+ $(SRC)/types.hh
+ $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) \
+ $(SRC)/hypergraph.cc \
+ -o $(SRC)/hypergraph.o
+
+util: make_pak read_pak
+
+make_pak: $(SRC)/make_pak.cc external/json-cpp/single_include/json-cpp.hpp \
+ $(SRC)/hypergraph.hh $(SRC)/types.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack -I./external \
+ $(SRC)/make_pak.cc \
+ -o make_pak
+
+read_pak: $(SRC)/read_pak.cc
+ $(COMPILER) $(CFLAGS) -lstdc++ -lmsgpack \
+ $(SRC)/read_pak.cc \
+ -o read_pak
+
+test: test_grammar test_parse test_sparse_vector
+
+test_grammar: $(SRC)/test_grammar.cc $(SRC)/grammar.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \
+ $(SRC)/test_grammar.cc \
+ -o test_grammar
+
+test_parse: $(SRC)/test_parse.cc $(SRC)/parse.hh \
+ $(SRC)/grammar.hh $(SRC)/util.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \
+ $(SRC)/test_parse.cc \
+ -o test_parse
+
+test_sparse_vector: $(SRC)/test_sparse_vector.cc $(SRC)/sparse_vector.hh
+ $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \
+ $(SRC)/test_sparse_vector.cc \
+ -o test_sparse_vector
+
+clean:
+ rm -f fast_weaver hypergraph.o
+ rm -f make_pak read_pak
+ rm -f test_grammar test_sparse_vector test_parse
diff --git a/README.md b/README.md
index 4cdcf31..e462d41 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,42 @@
+TODO
+ * sparse vector (unordered_map) -> where to store?
+ * parser
+ * Rule -> ChartItem -> Node ?
+ * k-best
+ * other semirings
+ * include language model
+ * compress/hash words/feature strings?
+ * cast? Rule -> Edge, ChartItem -> Node
+ * feature factory, observer
+
+Dependencies:
+ * MessagePack for object serialization [1]
+ * kenlm language model [2]
+
+This is Linux only.
+
+
+[1] http://msgpack.org
+[2] http://kheafield.com/code/kenlm/
+
+
+stuff to have a look at:
+http://math.nist.gov/spblas/
+http://lapackpp.sourceforge.net/
+http://www.cvmlib.com/
+http://sourceforge.net/projects/lpp/
+http://math-atlas.sourceforge.net/
+http://www.netlib.org/lapack/
+http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c
+http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack
+http://www.osl.iu.edu/research/mtl/download.php3
+http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
+https://software.intel.com/en-us/tbb_4.2_doc
+http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+http://www.sgi.com/tech/stl/Rope.html
+http://www.cs.unc.edu/Research/compgeom/gzstream/
+https://github.com/facebook/folly/blob/6e46d468cf2876dd59c7a4dddcb4e37abf070b7a/folly/docs/Overview.md
+---
not much to see here, yet
(SCFG machine translation decoder in ruby, currently implements CKY+ parsing and hypergraph viterbi)
diff --git a/util/json-cpp b/external/json-cpp
-Subproject 4eb4b47cf4d622bc7bf34071d6b68fc5beb3705
+Subproject 4eb4b47cf4d622bc7bf34071d6b68fc5beb3705
diff --git a/fast/Makefile b/fast/Makefile
deleted file mode 100644
index 1a7f5b9..0000000
--- a/fast/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-COMPILER=g++
-CFLAGS=-std=c++11 -O3
-TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
-
-
-all: hypergraph.o main.cc
- $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver
-
-hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh
- $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc
-
-clean:
- rm -f fast_weaver
- rm -f hypergraph.o parse.o
-
diff --git a/fast/README.md b/fast/README.md
deleted file mode 100644
index f92245b..0000000
--- a/fast/README.md
+++ /dev/null
@@ -1,38 +0,0 @@
-TODO
- * sparse vector (unordered_map) -> where to store?
- * parser
- * Rule -> ChartItem -> Node ?
- * k-best
- * other semirings
- * include language model
- * compress/hash words/feature strings?
- * cast? Rule -> Edge, ChartItem -> Node
- * feature factory, observer
-
-Dependencies:
- * MessagePack for object serialization [1]
- * kenlm language model [2]
-
-This is Linux only.
-
-
-[1] http://msgpack.org
-[2] http://kheafield.com/code/kenlm/
-
-
-stuff to have a look at:
-http://math.nist.gov/spblas/
-http://lapackpp.sourceforge.net/
-http://www.cvmlib.com/
-http://sourceforge.net/projects/lpp/
-http://math-atlas.sourceforge.net/
-http://www.netlib.org/lapack/
-http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c
-http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack
-http://www.osl.iu.edu/research/mtl/download.php3
-http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library
-https://software.intel.com/en-us/tbb_4.2_doc
-http://goog-perftools.sourceforge.net/doc/tcmalloc.html
-http://www.sgi.com/tech/stl/Rope.html
-http://www.cs.unc.edu/Research/compgeom/gzstream/
-
diff --git a/fast/parse.hh b/fast/parse.hh
deleted file mode 100644
index 33ea9ce..0000000
--- a/fast/parse.hh
+++ /dev/null
@@ -1,108 +0,0 @@
-#pragma once
-
-#include <vector>
-#include <utility>
-#include <sstream>
-#include <unordered_map>
-
-#include "grammar.hh"
-#include "util.hh"
-#include "weaver.hh"
-
-
-using namespace std;
-
-typedef pair<size_t,size_t> Span;
-namespace std {
- template <>
- struct hash<Span>
- {
- size_t
- operator()(Span const& k) const
- {
- return ((hash<size_t>()(k.first)
- ^ (hash<size_t>()(k.second) << 1)) >> 1);
- }
- };
-}
-
-namespace Parse {
-
-void visit(vector<Span>& p,
- size_t i, size_t l, size_t r, size_t x=0)
-{
- for (size_t span = i; span <= r-x; span++) {
- for (size_t k = l; k <= r-span; k++) {
- p.push_back(Span(k,k+span));
- }
- }
-}
-
-struct ChartItem
-{
- Span span;
- G::Rule const* rule;
- vector<ChartItem*> tails;
- size_t dot;
-
- ChartItem(G::Rule* r)
- {
- rule = r;
- dot = 0;
- }
-
- ChartItem(ChartItem const& o)
- {
- span.first = o.span.first;
- span.second = o.span.second;
- rule = o.rule;
- for (auto it: o.tails)
- tails.push_back(it);
- dot = o.dot;
- }
-};
-
-struct Chart
-{
- size_t n_;
- unordered_map<Span, vector<ChartItem*> > m_;
- unordered_map<string,bool> b_;
-
- vector<ChartItem*>& at(Span s)
- {
- return m_[s];
- }
-
- string h(ChartItem* item, Span s)
- {
- ostringstream ss;
- item->rule->lhs->symbol();
- ss << s.first;
- ss << s.second;
-
- return ss.str();
- }
-
- void add(ChartItem* item, Span s)
- {
- m_[s].push_back(item);
- b_[h(item, s)] = true;
- }
-
- Chart(size_t n)
- {
- }
-};
-
-
-void
-init(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g)
-{
- for (auto rule: g.rules) {
- cout << *rule << endl;
- }
-}
-
-
-} //
-
diff --git a/fast/test/Makefile b/fast/test/Makefile
deleted file mode 100644
index 65e97ef..0000000
--- a/fast/test/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-COMPILER=g++
-CFLAGS=-std=c++11 -O3 -I../
-TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread
-
-
-all: test_grammar test_sparse_vector test_parse
-
-test_grammar: test_grammar.cc ../grammar.hh
- $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_grammar.cc -o test_grammar
-
-test_sparse_vector: test_sparse_vector.cc ../sparse_vector.hh
- $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_sparse_vector.cc -o test_sparse_vector
-
-test_parse: test_parse.cc ../parse.hh ../grammar.hh ../util.hh
- $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_parse.cc -o test_parse
-
-clean:
- rm -f test_grammar test_sparse_vector
-
diff --git a/fast/test/test_grammar b/fast/test/test_grammar
deleted file mode 100755
index 6cf7ad5..0000000
--- a/fast/test/test_grammar
+++ /dev/null
Binary files differ
diff --git a/fast/test/test_sparse_vector b/fast/test/test_sparse_vector
deleted file mode 100755
index c06fe9e..0000000
--- a/fast/test/test_sparse_vector
+++ /dev/null
Binary files differ
diff --git a/grammar.rb b/prototype/grammar.rb
index e91c0f6..42ffbc0 100644
--- a/grammar.rb
+++ b/prototype/grammar.rb
@@ -1,6 +1,5 @@
module Grammar
-
class T
attr_accessor :word
@@ -134,6 +133,5 @@ class Grammar
end
end
-
end #module
diff --git a/hg.rb b/prototype/hypergraph.rb
index a68407c..d43d5ee 100644
--- a/hg.rb
+++ b/prototype/hypergraph.rb
@@ -1,10 +1,8 @@
require 'zipf'
require_relative 'grammar'
-
module HG
-
class HG::Node
attr_accessor :id, :symbol, :left, :right, :outgoing, :incoming, :score
@@ -217,6 +215,5 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal
return Hypergraph.new(nodes, edges), nodes_by_id
end
-
end #module
diff --git a/parse.rb b/prototype/parse.rb
index 1f69c91..adf2b91 100644
--- a/parse.rb
+++ b/prototype/parse.rb
@@ -1,11 +1,9 @@
require 'zipf'
require_relative 'grammar'
-require_relative 'hg'
-
+require_relative 'hypergraph'
module Parse
-
def Parse::visit i, l, r, x=0
i.upto(r-x) { |span|
l.upto(r-span) { |k|
@@ -205,6 +203,5 @@ def Parse::parse input, n, active_chart, passive_chart, grammar
}
end
-
end #module
diff --git a/test/test_hg.rb b/prototype/test_hg.rb
index 6311bac..4be72bd 100755
--- a/test/test_hg.rb
+++ b/prototype/test_hg.rb
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require_relative '../hg'
+require_relative 'hypergraph'
def main
diff --git a/test/test_parse.rb b/prototype/test_parse.rb
index c3be5ae..918101b 100755
--- a/test/test_parse.rb
+++ b/prototype/test_parse.rb
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require_relative '../parse'
+require_relative 'parse'
def main
diff --git a/main.rb b/prototype/weaver.rb
index e41d0f1..966d4c8 100755
--- a/main.rb
+++ b/prototype/weaver.rb
@@ -4,7 +4,6 @@ require 'trollop'
require 'xmlsimple'
require_relative 'parse'
-
def read_grammar fn, add_glue, add_pass_through, input=nil
STDERR.write "> reading grammar '#{fn}'\n"
grammar = Grammar::Grammar.new fn
@@ -67,6 +66,5 @@ def main
}
end
-
main
diff --git a/fast/main.cc b/src/fast_weaver.cc
index 825ad59..4854476 100644
--- a/fast/main.cc
+++ b/src/fast_weaver.cc
@@ -1,7 +1,6 @@
#include "hypergraph.hh"
#include <ctime>
-
int
main(int argc, char** argv)
{
diff --git a/fast/grammar.hh b/src/grammar.hh
index 4906c46..c489ec5 100644
--- a/fast/grammar.hh
+++ b/src/grammar.hh
@@ -6,6 +6,7 @@
#include <sstream>
#include <string>
#include <vector>
+#include <set>
#include <msgpack.hpp>
@@ -14,7 +15,6 @@
using namespace std;
-
namespace G {
enum item_type {
@@ -40,7 +40,7 @@ struct Item {
struct NT : public Item {
symbol_t symbol_;
- size_t index_;
+ size_t index_;
NT() {}
@@ -276,6 +276,7 @@ struct Grammar {
vector<Rule*> flat;
vector<Rule*> start_non_terminal;
vector<Rule*> start_terminal;
+ set<symbol_t> nts;
Grammar() {}
@@ -286,6 +287,7 @@ struct Grammar {
while (getline(ifs, line)) {
G::Rule* r = new G::Rule(line, vocab);
rules.push_back(r);
+ nts.insert(r->lhs->symbol());
if (r->arity == 0)
flat.push_back(r);
else if (r->rhs.front()->type() == NON_TERMINAL)
@@ -295,8 +297,24 @@ struct Grammar {
}
}
- void add_glue();
- // ^^^ TODO
+ void
+ add_glue(Vocabulary& vocab)
+ {
+ for (auto nt: nts) {
+ ostringstream oss_1;
+ oss_1 << "[S] ||| [" << nt << ",1] ||| [" << nt << ",1] ||| ";
+ cout << oss_1.str() << endl;
+ Rule* r1 = new Rule(oss_1.str(), vocab);
+ rules.push_back(r1); start_non_terminal.push_back(r1);
+ ostringstream oss_2;
+ oss_2 << "[S] ||| [S,1] [" << nt << ",2] ||| [S,1] [" << nt << ",2] ||| ";
+ cout << oss_2.str() << endl;
+ Rule* r2 = new Rule(oss_2.str(), vocab);
+ cout << *r2 << endl;
+ rules.push_back(r2); start_non_terminal.push_back(r2);
+ }
+ }
+
void add_pass_through(const string& input);
// ^^^ TODO
diff --git a/fast/hypergraph.cc b/src/hypergraph.cc
index 2b33ff4..40bcc64 100644
--- a/fast/hypergraph.cc
+++ b/src/hypergraph.cc
@@ -1,6 +1,5 @@
#include "hypergraph.hh"
-
namespace Hg {
template<typename Semiring> void
diff --git a/fast/hypergraph.hh b/src/hypergraph.hh
index 1c48a88..8e05e9f 100644
--- a/fast/hypergraph.hh
+++ b/src/hypergraph.hh
@@ -16,11 +16,10 @@
#include "grammar.hh"
#include "semiring.hh"
#include "sparse_vector.hh"
-#include "weaver.hh"
+#include "types.hh"
using namespace std;
-
namespace Hg {
struct Node;
diff --git a/util/make_pak.cc b/src/make_pak.cc
index e858155..db3a8a4 100644
--- a/util/make_pak.cc
+++ b/src/make_pak.cc
@@ -5,12 +5,11 @@
#include <string>
#include "json-cpp/single_include/json-cpp.hpp"
-#include "../fast/hypergraph.hh"
-#include "../fast/weaver.hh"
+#include "hypergraph.hh"
+#include "types.hh"
using namespace std;
-
struct DummyNode {
size_t id;
string symbol;
diff --git a/src/parse.hh b/src/parse.hh
new file mode 100644
index 0000000..0dd2fc0
--- /dev/null
+++ b/src/parse.hh
@@ -0,0 +1,301 @@
+#pragma once
+
+#include <vector>
+#include <utility>
+#include <sstream>
+#include <unordered_map>
+#include <set>
+
+#include "grammar.hh"
+#include "util.hh"
+#include "types.hh"
+
+using namespace std;
+
+typedef pair<size_t,size_t> Span;
+namespace std {
+ template <>
+ struct hash<Span>
+ {
+ size_t
+ operator()(Span const& k) const
+ {
+ return ((hash<size_t>()(k.first)
+ ^ (hash<size_t>()(k.second) << 1)) >> 1);
+ }
+ };
+}
+
+namespace Parse {
+
+void visit(vector<Span>& p,
+ size_t i, size_t l, size_t r, size_t x=0)
+{
+ for (size_t s = i; s <= r-x; s++) {
+ for (size_t k = l; k <= r-s; k++) {
+ p.push_back(Span(k,k+s));
+ }
+ }
+}
+
+struct ChartItem
+{
+ Span span;
+ G::Rule const* rule;
+ vector<Span> tails_spans;
+ size_t dot;
+
+ ChartItem() {}
+
+ ChartItem(G::Rule* r) : rule(r), dot(0) {}
+
+ ChartItem(G::Rule* r, Span s, size_t dot)
+ : rule(r), span(s), dot(dot) {}
+
+ ChartItem(ChartItem const& o)
+ : span(o.span),
+ rule(o.rule),
+ tails_spans(o.tails_spans),
+ dot(o.dot)
+ {
+ }
+
+ ostream&
+ repr(ostream& os) const
+ {
+ os << "ChartItem<";
+ os << "span=(" << span.first << "," << span.second << "), lhs=";
+ rule->lhs->repr(os);
+ os << ", dot=" << dot;
+ os << ", tails=" << tails_spans.size() << ", ";
+ os << "rule=";
+ rule->repr(os);
+ os << ">";
+ os << endl;
+ }
+
+ friend ostream&
+ operator<<(ostream& os, ChartItem item)
+ {
+ item.repr(os);
+
+ return os;
+ }
+};
+
+struct Chart
+{
+ size_t n_;
+ map<Span, vector<ChartItem*> > m_;
+ unordered_map<string,bool> b_;
+
+ vector<ChartItem*>& at(Span s)
+ {
+ return m_[s];
+ }
+
+ string h(symbol_t sym, Span s)
+ {
+ ostringstream ss;
+ ss << sym;
+ ss << s.first;
+ ss << s.second;
+
+ return ss.str();
+ }
+
+ bool
+ has_at(symbol_t sym, Span s)
+ {
+ return b_[h(sym, s)];
+ }
+
+ void add(ChartItem* item, Span s)
+ {
+ if (m_.count(s) > 0)
+ m_[s].push_back(item);
+ else {
+ m_.insert(make_pair(s, vector<ChartItem*>{item}));
+ }
+ b_[h(item->rule->lhs->symbol(), s)] = true;
+ }
+
+ Chart(size_t n) : n_(n) {}
+
+ friend ostream&
+ operator<<(ostream& os, Chart const& chart)
+ {
+ for (map<Span, vector<ChartItem*> >::const_iterator it = chart.m_.cbegin();
+ it != chart.m_.cend(); it++) {
+ os << "(" << it->first.first << "," << it->first.second << ")" << endl;
+ for (auto jt: it->second)
+ jt->repr(os); os << endl;
+ }
+
+ return os;
+ }
+};
+
+bool
+scan(ChartItem* item, vector<symbol_t> in, size_t limit, Chart& passive)
+{
+ //cout << "S1" << endl;
+ while (item->dot < item->rule->rhs.size() &&
+ item->rule->rhs[item->dot]->type() == G::TERMINAL) {
+ //cout << "S2" << endl;
+ if (item->span.second == limit) return false;
+ //cout << "S3" << endl;
+ if (item->rule->rhs[item->dot]->symbol() == in[item->span.second]) {
+ //cout << "S4" << endl;
+ item->dot++;
+ //cout << "S5" << endl;
+ item->span.second++;
+ //cout << "S6" << endl;
+ } else {
+ //cout << "S7" << endl;
+ return false;
+ }
+ }
+ //cout << "S8" << endl;
+ return true;
+}
+
+
+void
+init(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g)
+{
+ for (auto rule: g.flat) {
+ size_t j = 0;
+ for (auto it: in) {
+ if (it == rule->rhs.front()->symbol()) {
+ cout << it << " " << j << j+rule->rhs.size() << endl;
+ Span span(j, j+rule->rhs.size());
+ passive.add(new ChartItem(rule, span, rule->rhs.size()), span);
+ cout << "new passive item [1] " << *passive.at(span).back() << endl;
+ }
+ j++;
+ }
+ }
+}
+
+void
+parse(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g)
+{
+ vector<Span> spans;
+ Parse::visit(spans, 1, 0, n);
+ for (auto span: spans) {
+
+ cout << "Span (" << span.first << "," << span.second << ")" << endl;
+
+ for (auto it: g.start_terminal) {
+ ChartItem* item = new ChartItem(it, Span(span.first,span.first), 0);
+ if (scan(item, in, span.second, passive)
+ && span.first + item->rule->rhs.size() <= span.second) {
+ active.add(item, span);
+ cout << "new active item [1] " << *active.at(span).back();
+ }
+ }
+
+ for (auto it: g.start_non_terminal) {
+ if (it->rhs.size() > span.second-span.first) continue;
+ active.add(new ChartItem(it, Span(span.first,span.first), 0), span);
+ cout << "new active item [2] " << *active.at(span).back();
+ }
+
+ set<symbol_t> new_symbols;
+ vector<ChartItem*> remaining_items;
+
+ while (true) {
+ cout << "active size at (" << span.first << "," << span.second << ") " << active.at(span).size() << endl;
+ cout << "passive size at (" << span.first << "," << span.second << ") " << passive.at(span).size() << endl;
+ if (active.at(span).empty()) break;
+ ChartItem* item = active.at(span).back();
+ cout << "current item " << *item;
+ active.at(span).pop_back();
+ bool advanced = false;
+ vector<Span> spans2;
+ Parse::visit(spans2, 1, span.first, span.second, 1);
+ for (auto span2: spans2) {
+ cout << "A" << endl;
+ if (item->rule->rhs[item->dot]->type() == G::NON_TERMINAL) {
+ cout << "B" << endl;
+ if (passive.has_at(item->rule->rhs[item->dot]->symbol(), span2)) {
+ cout << "C" << endl;
+ if (span2.first == item->span.second) {
+ cout << "D" << endl;
+ ChartItem* new_item = new ChartItem(*item);
+ cout << "D1" << endl;
+ new_item->span.second = span2.second;
+ cout << "D2" << endl;
+ new_item->dot++;
+ cout << "D3" << endl;
+ new_item->tails_spans.push_back(span2);
+ cout << "D4" << endl;
+ if (scan(new_item, in, span.second, passive)) {
+ cout << "E" << endl;
+ if (new_item->dot == new_item->rule->rhs.size()) {
+ cout << "F" << endl;
+ if (new_item->span.first == span.first && new_item->span.second == span.second) {
+ cout << "G" << endl;
+ cout << "H" << endl;
+ new_symbols.insert(new_item->rule->lhs->symbol());
+ passive.add(new_item, span);
+ cout << "new passive item [2] " << *new_item;
+ advanced = true;
+ }
+ } else {
+ if (new_item->span.second+(new_item->rule->rhs.size()-new_item->dot) <= span.second) {
+ active.add(new_item, span);
+ cout << "new active item [3] " << *new_item;
+ }
+ }
+ }
+ cout << "I" << endl;
+ }
+ }
+ }
+ }
+ cout << "J" << endl;
+ if (!advanced) {
+ cout << "K" << endl;
+ remaining_items.push_back(item);
+ }
+ }
+
+ for (auto new_sym: new_symbols) {
+ cout << "new sym " << new_sym << endl;
+ for (auto rem_item: remaining_items) {
+ if (rem_item->dot != 0 ||
+ rem_item->rule->rhs[rem_item->dot]->type() != G::NON_TERMINAL) {
+ continue;
+ cout << "K1" << endl;
+ }
+ cout << "K2" << endl;
+ if (rem_item->rule->rhs[rem_item->dot]->symbol() == new_sym) {
+ cout << "K3" << endl;
+ ChartItem* new_item = new ChartItem(*rem_item);
+ cout << "K31" << endl;
+ //new_item->tails_spans[new_item->dot-1] = span;
+ new_item->tails_spans.push_back(span);
+ new_item->dot++;
+ cout << "K32" << endl;
+ if (new_item->dot == new_item->rule->rhs.size()) {
+ cout << "K4" << endl;
+ new_symbols.insert(new_item->rule->lhs->symbol());
+ passive.add(new_item, span);
+ }
+ }
+ }
+ }
+
+ cout << "L" << endl;
+ cout << "-------------------" << endl;
+ cout << endl;
+ }
+
+ //cout << "ACTIVE" << endl << active << endl;
+ cout << "PASSIVE" << endl << passive << endl;
+}
+
+} //
+
diff --git a/util/read_pak.cc b/src/read_pak.cc
index afd6e6a..c894442 100644
--- a/util/read_pak.cc
+++ b/src/read_pak.cc
@@ -4,7 +4,6 @@
using namespace std;
-
int
main(int argc, char** argv)
{
diff --git a/fast/semiring.hh b/src/semiring.hh
index 3f4ac08..3f4ac08 100644
--- a/fast/semiring.hh
+++ b/src/semiring.hh
diff --git a/fast/sparse_vector.hh b/src/sparse_vector.hh
index 9d815ff..7fff338 100644
--- a/fast/sparse_vector.hh
+++ b/src/sparse_vector.hh
@@ -7,7 +7,7 @@
#include <vector>
#include "util.hh"
-#include "weaver.hh"
+#include "types.hh"
using namespace std;
diff --git a/fast/test/test_grammar.cc b/src/test_grammar.cc
index bbe76e7..7fee79a 100644
--- a/fast/test/test_grammar.cc
+++ b/src/test_grammar.cc
@@ -4,7 +4,6 @@
using namespace std;
-
int
main(int argc, char** argv)
{
diff --git a/src/test_parse.cc b/src/test_parse.cc
new file mode 100644
index 0000000..2d51d44
--- /dev/null
+++ b/src/test_parse.cc
@@ -0,0 +1,19 @@
+#include "parse.hh"
+
+int main(int argc, char** argv)
+{
+ //string in("ich sah ein kleines haus");
+ //string in("europa bildet den ersten oder zweiten markt für die zehn am häufigsten von indien exportierten produkte , erklärte der europäische kommissar weiter . die asiatischen und europäischen giganten tauschen jährlich güter im wert von 47 milliarden euro und dienstleistungen im wert von 10 milliarden euro aus , hatte diese woche daniéle smadja , vorsitzende der abordnung der europäischen kommission in neu delhi , erklärt , und bedauert , dass der gegenseitige handel sein potential noch nicht ausgeschöpft hat . die eu und indien treffen sich am freitag zu ihrem achten diplomatischen in neu delhi , bei dem premierminister manmohan singh und der präsident der europäischen kommission josé manuel durao barrosso anwesend sein werden .");
+ //string in("aber schon bald nach seinem eintritt kam der erste große erfolg .");
+ string in("lebensmittel schuld an europäischer inflation");
+ vector<symbol_t> tok = util::tokenize(in);
+ size_t n = tok.size();
+ G::Vocabulary v;
+ G::Grammar g(argv[1], v);
+ g.add_glue(v);
+ Parse::Chart active(n);
+ Parse::Chart passive(n);
+ init(tok, n, active, passive, g);
+ parse(tok, n, active, passive, g);
+}
+
diff --git a/fast/test/test_sparse_vector.cc b/src/test_sparse_vector.cc
index 426bed1..69aaa21 100644
--- a/fast/test/test_sparse_vector.cc
+++ b/src/test_sparse_vector.cc
@@ -1,6 +1,5 @@
#include "sparse_vector.hh"
-
int
main(void)
{
diff --git a/fast/weaver.hh b/src/types.hh
index e89b4dd..e89b4dd 100644
--- a/fast/weaver.hh
+++ b/src/types.hh
diff --git a/fast/util.hh b/src/util.hh
index 9ce19da..93ea320 100644
--- a/fast/util.hh
+++ b/src/util.hh
@@ -2,7 +2,7 @@
#include <string>
-#include "weaver.hh"
+#include "types.hh"
using namespace std;
diff --git a/util/Makefile b/util/Makefile
deleted file mode 100644
index 30564fe..0000000
--- a/util/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-COMPILER=clang
-
-
-all: make_pak read_pak
-
-make_pak: make_pak.cc json-cpp/single_include/json-cpp.hpp ../fast/hypergraph.hh ../fast/weaver.hh
- $(COMPILER) -std=c++11 -lstdc++ -lm -lmsgpack make_pak.cc -o make_pak
-
-read_pak: read_pak.cc
- $(COMPILER) -std=c++11 -lstdc++ -lmsgpack read_pak.cc -o read_pak
-
-clean:
- rm -f make_pak read_pak
-
diff --git a/util/cdec2json.py b/util/cdec2json.py
deleted file mode 100755
index e7c8e93..0000000
--- a/util/cdec2json.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python2
-
-import cdec
-import sys, argparse
-import json
-import gzip
-
-
-#FIXME new format
-# strings?
-# map?
-def hg2json(hg, weights):
- """
- output a JSON representation of a cdec hypegraph
- """
- res = ''
- res += "{\n"
- res += '"rules":[\n'
- rules = []
- for i in hg.edges:
- s = json.dumps(str(i.trule))
- try:
- rules.index(s)
- except:
- rules.append(s)
- res += ",\n".join(rules)
- res += "\n],\n"
- res += '"nodes":'+"\n"
- res += "[\n"
- a = []
- a.append( '{ "id":0, "symbol":"root", "span":[-1,-1] }' )
- for i in hg.nodes:
- a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1]))
- res += ",\n".join(a)+"\n"
- res += "],\n"
- res += '"edges":'+"\n"
- res += "[\n"
- a = []
- for i in hg.edges:
- s = "{"
- s += '"head":%d'%(i.head_node.id+1)
- s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule))))
- # f
- #xs = ' "f":{'
- #b = []
- #for j in i.feature_values:
- # b.append( '"%s":%s'%(j[0], j[1]) )
- #xs += ", ".join(b)
- #xs += "},"
- # tails
- if len(list(i.tail_nodes)) > 0:
- s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes]))
- else:
- s += ', "tails":[ 0 ],'
- #s += xs
- s += ' "score":%s }'%(i.prob)
- a.append(s)
- res += ",\n".join(a)+"\n"
- res += "]\n"
- res += "}\n"
- return res
-
-def main():
- parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
- parser.add_argument('-c', '--config', required=True, help='decoder configuration')
- parser.add_argument('-w', '--weights', required=True, help='feature weights')
- parser.add_argument('-g', '--grammar', required=False, help='grammar')
- args = parser.parse_args()
- with open(args.config) as config:
- config = config.read()
- decoder = cdec.Decoder(config)
- decoder.read_weights(args.weights)
- ins = sys.stdin.readline().strip()
- if args.grammar:
- with gzip.open(args.grammar) as grammar:
- grammar = grammar.read()
- hg = decoder.translate(ins, grammar=grammar)
- else:
- hg = decoder.translate(ins)
-
- sys.stderr.write( "input:\n '%s'\n"%(ins) )
- sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
- num_nodes = 0
- for i in hg.nodes: num_nodes+=1
- sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
- num_edges = 0
- for i in hg.edges: num_edges+=1
- sys.stderr.write( "# edges = %s\n"%(num_edges) )
- sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
-
- print hg2json(hg, decoder.weights)
-
-
-if __name__=="__main__":
- main()
-