diff options
author | Patrick Simianer <p@simianer.de> | 2014-09-16 10:23:14 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-09-16 10:23:14 +0100 |
commit | 129a22cfcc7651daa4b11ed52e7870249f6373a5 (patch) | |
tree | 78de4649396ab0d37a325b7598f9873c2d65f4c9 | |
parent | df70006a07fb67b17fb39aa56762c50c2e7b8131 (diff) |
spring cleaning
-rw-r--r-- | .gitignore | 13 | ||||
-rw-r--r-- | .gitmodules | 2 | ||||
-rw-r--r-- | Makefile | 58 | ||||
-rw-r--r-- | README.md | 39 | ||||
m--------- | external/json-cpp (renamed from util/json-cpp) | 0 | ||||
-rw-r--r-- | fast/Makefile | 15 | ||||
-rw-r--r-- | fast/README.md | 38 | ||||
-rw-r--r-- | fast/parse.hh | 108 | ||||
-rw-r--r-- | fast/test/Makefile | 19 | ||||
-rwxr-xr-x | fast/test/test_grammar | bin | 60943 -> 0 bytes | |||
-rwxr-xr-x | fast/test/test_sparse_vector | bin | 44288 -> 0 bytes | |||
-rw-r--r-- | prototype/grammar.rb (renamed from grammar.rb) | 2 | ||||
-rw-r--r-- | prototype/hypergraph.rb (renamed from hg.rb) | 3 | ||||
-rw-r--r-- | prototype/parse.rb (renamed from parse.rb) | 5 | ||||
-rwxr-xr-x | prototype/test_hg.rb (renamed from test/test_hg.rb) | 2 | ||||
-rwxr-xr-x | prototype/test_parse.rb (renamed from test/test_parse.rb) | 2 | ||||
-rwxr-xr-x | prototype/weaver.rb (renamed from main.rb) | 2 | ||||
-rw-r--r-- | src/fast_weaver.cc (renamed from fast/main.cc) | 1 | ||||
-rw-r--r-- | src/grammar.hh (renamed from fast/grammar.hh) | 26 | ||||
-rw-r--r-- | src/hypergraph.cc (renamed from fast/hypergraph.cc) | 1 | ||||
-rw-r--r-- | src/hypergraph.hh (renamed from fast/hypergraph.hh) | 3 | ||||
-rw-r--r-- | src/make_pak.cc (renamed from util/make_pak.cc) | 5 | ||||
-rw-r--r-- | src/parse.hh | 301 | ||||
-rw-r--r-- | src/read_pak.cc (renamed from util/read_pak.cc) | 1 | ||||
-rw-r--r-- | src/semiring.hh (renamed from fast/semiring.hh) | 0 | ||||
-rw-r--r-- | src/sparse_vector.hh (renamed from fast/sparse_vector.hh) | 2 | ||||
-rw-r--r-- | src/test_grammar.cc (renamed from fast/test/test_grammar.cc) | 1 | ||||
-rw-r--r-- | src/test_parse.cc | 19 | ||||
-rw-r--r-- | src/test_sparse_vector.cc (renamed from fast/test/test_sparse_vector.cc) | 1 | ||||
-rw-r--r-- | src/types.hh (renamed from fast/weaver.hh) | 0 | ||||
-rw-r--r-- | src/util.hh (renamed from fast/util.hh) | 2 | ||||
-rw-r--r-- | util/Makefile | 14 | ||||
-rwxr-xr-x | util/cdec2json.py | 96 |
33 files changed, 448 insertions, 333 deletions
@@ -1,8 +1,7 @@ *.o -fast/example/ -fast/fast_weaver -fast/test_grammar -fast/test_sparse_vector -util/make_pak -util/read_pak -fast/gperftools-2.1/ +fast_weaver +test_grammar +test_sparse_vector +make_pak +read_pak +external/gperftools-2.1/ diff --git a/.gitmodules b/.gitmodules index 843caa2..7fe83f1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "util/json-cpp"] - path = util/json-cpp + path = external/json-cpp url = https://github.com/ascheglov/json-cpp.git @@ -1,7 +1,53 @@ -json_examples: - ./main.rb -w example/3/weights.init < example/3/in.sgm > example/3/3.json 2>/dev/null - ./main.rb -w example/3/weights.init -l < example/3/in.sgm > example/3/3-with-glue.json 2>/dev/null - ./main.rb -w example/glue/weights -l < example/glue/in.sgm > example/glue/glue.json 2>/dev/null - ./main.rb -w example/toy/weights < example/toy/in.sgm > example/toy/toy.json 2>/dev/null - ./main.rb -w example/toy/weights < example/toy/in-test.sgm > example/toy/toy-test.json 2>/dev/null +COMPILER=clang +CFLAGS=-std=c++11 -O3 -Wall +TCMALLOC=$(shell pwd)/external/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread +SRC=src + +all: $(SRC)/hypergraph.o $(SRC)/fast_weaver.cc + $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) $(SRC)/hypergraph.o \ + $(SRC)/fast_weaver.cc \ + -o fast_weaver + +$(SRC)/hypergraph.o: $(SRC)/hypergraph.cc $(SRC)/hypergraph.hh \ + $(SRC)/semiring.hh $(SRC)/sparse_vector.hh \ + $(SRC)/types.hh + $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) \ + $(SRC)/hypergraph.cc \ + -o $(SRC)/hypergraph.o + +util: make_pak read_pak + +make_pak: $(SRC)/make_pak.cc external/json-cpp/single_include/json-cpp.hpp \ + $(SRC)/hypergraph.hh $(SRC)/types.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack -I./external \ + $(SRC)/make_pak.cc \ + -o make_pak + +read_pak: $(SRC)/read_pak.cc + $(COMPILER) $(CFLAGS) -lstdc++ -lmsgpack \ + $(SRC)/read_pak.cc \ + -o read_pak + +test: test_grammar test_parse test_sparse_vector + +test_grammar: $(SRC)/test_grammar.cc $(SRC)/grammar.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \ + $(SRC)/test_grammar.cc \ + -o test_grammar + +test_parse: $(SRC)/test_parse.cc $(SRC)/parse.hh \ + $(SRC)/grammar.hh $(SRC)/util.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \ + $(SRC)/test_parse.cc \ + -o test_parse + +test_sparse_vector: $(SRC)/test_sparse_vector.cc $(SRC)/sparse_vector.hh + $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) \ + $(SRC)/test_sparse_vector.cc \ + -o test_sparse_vector + +clean: + rm -f fast_weaver hypergraph.o + rm -f make_pak read_pak + rm -f test_grammar test_sparse_vector test_parse @@ -1,3 +1,42 @@ +TODO + * sparse vector (unordered_map) -> where to store? + * parser + * Rule -> ChartItem -> Node ? + * k-best + * other semirings + * include language model + * compress/hash words/feature strings? + * cast? Rule -> Edge, ChartItem -> Node + * feature factory, observer + +Dependencies: + * MessagePack for object serialization [1] + * kenlm language model [2] + +This is Linux only. + + +[1] http://msgpack.org +[2] http://kheafield.com/code/kenlm/ + + +stuff to have a look at: +http://math.nist.gov/spblas/ +http://lapackpp.sourceforge.net/ +http://www.cvmlib.com/ +http://sourceforge.net/projects/lpp/ +http://math-atlas.sourceforge.net/ +http://www.netlib.org/lapack/ +http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c +http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack +http://www.osl.iu.edu/research/mtl/download.php3 +http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library +https://software.intel.com/en-us/tbb_4.2_doc +http://goog-perftools.sourceforge.net/doc/tcmalloc.html +http://www.sgi.com/tech/stl/Rope.html +http://www.cs.unc.edu/Research/compgeom/gzstream/ +https://github.com/facebook/folly/blob/6e46d468cf2876dd59c7a4dddcb4e37abf070b7a/folly/docs/Overview.md +--- not much to see here, yet (SCFG machine translation decoder in ruby, currently implements CKY+ parsing and hypergraph viterbi) diff --git a/util/json-cpp b/external/json-cpp -Subproject 4eb4b47cf4d622bc7bf34071d6b68fc5beb3705 +Subproject 4eb4b47cf4d622bc7bf34071d6b68fc5beb3705 diff --git a/fast/Makefile b/fast/Makefile deleted file mode 100644 index 1a7f5b9..0000000 --- a/fast/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -COMPILER=g++ -CFLAGS=-std=c++11 -O3 -TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread - - -all: hypergraph.o main.cc - $(COMPILER) $(CFLAGS) -lstdc++ -lm -lmsgpack $(TCMALLOC) hypergraph.o main.cc -o fast_weaver - -hypergraph.o: hypergraph.cc hypergraph.hh semiring.hh sparse_vector.hh weaver.hh - $(COMPILER) $(CFLAGS) -g -c $(TCMALLOC) hypergraph.cc - -clean: - rm -f fast_weaver - rm -f hypergraph.o parse.o - diff --git a/fast/README.md b/fast/README.md deleted file mode 100644 index f92245b..0000000 --- a/fast/README.md +++ /dev/null @@ -1,38 +0,0 @@ -TODO - * sparse vector (unordered_map) -> where to store? - * parser - * Rule -> ChartItem -> Node ? - * k-best - * other semirings - * include language model - * compress/hash words/feature strings? - * cast? Rule -> Edge, ChartItem -> Node - * feature factory, observer - -Dependencies: - * MessagePack for object serialization [1] - * kenlm language model [2] - -This is Linux only. - - -[1] http://msgpack.org -[2] http://kheafield.com/code/kenlm/ - - -stuff to have a look at: -http://math.nist.gov/spblas/ -http://lapackpp.sourceforge.net/ -http://www.cvmlib.com/ -http://sourceforge.net/projects/lpp/ -http://math-atlas.sourceforge.net/ -http://www.netlib.org/lapack/ -http://bytes.com/topic/c/answers/702569-blas-vs-cblas-c -http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack -http://www.osl.iu.edu/research/mtl/download.php3 -http://scicomp.stackexchange.com/questions/351/recommendations-for-a-usable-fast-c-matrix-library -https://software.intel.com/en-us/tbb_4.2_doc -http://goog-perftools.sourceforge.net/doc/tcmalloc.html -http://www.sgi.com/tech/stl/Rope.html -http://www.cs.unc.edu/Research/compgeom/gzstream/ - diff --git a/fast/parse.hh b/fast/parse.hh deleted file mode 100644 index 33ea9ce..0000000 --- a/fast/parse.hh +++ /dev/null @@ -1,108 +0,0 @@ -#pragma once - -#include <vector> -#include <utility> -#include <sstream> -#include <unordered_map> - -#include "grammar.hh" -#include "util.hh" -#include "weaver.hh" - - -using namespace std; - -typedef pair<size_t,size_t> Span; -namespace std { - template <> - struct hash<Span> - { - size_t - operator()(Span const& k) const - { - return ((hash<size_t>()(k.first) - ^ (hash<size_t>()(k.second) << 1)) >> 1); - } - }; -} - -namespace Parse { - -void visit(vector<Span>& p, - size_t i, size_t l, size_t r, size_t x=0) -{ - for (size_t span = i; span <= r-x; span++) { - for (size_t k = l; k <= r-span; k++) { - p.push_back(Span(k,k+span)); - } - } -} - -struct ChartItem -{ - Span span; - G::Rule const* rule; - vector<ChartItem*> tails; - size_t dot; - - ChartItem(G::Rule* r) - { - rule = r; - dot = 0; - } - - ChartItem(ChartItem const& o) - { - span.first = o.span.first; - span.second = o.span.second; - rule = o.rule; - for (auto it: o.tails) - tails.push_back(it); - dot = o.dot; - } -}; - -struct Chart -{ - size_t n_; - unordered_map<Span, vector<ChartItem*> > m_; - unordered_map<string,bool> b_; - - vector<ChartItem*>& at(Span s) - { - return m_[s]; - } - - string h(ChartItem* item, Span s) - { - ostringstream ss; - item->rule->lhs->symbol(); - ss << s.first; - ss << s.second; - - return ss.str(); - } - - void add(ChartItem* item, Span s) - { - m_[s].push_back(item); - b_[h(item, s)] = true; - } - - Chart(size_t n) - { - } -}; - - -void -init(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) -{ - for (auto rule: g.rules) { - cout << *rule << endl; - } -} - - -} // - diff --git a/fast/test/Makefile b/fast/test/Makefile deleted file mode 100644 index 65e97ef..0000000 --- a/fast/test/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -COMPILER=g++ -CFLAGS=-std=c++11 -O3 -I../ -TCMALLOC=/home/pks/src/weaver/fast/gperftools-2.1/lib/libtcmalloc_minimal.a -pthread - - -all: test_grammar test_sparse_vector test_parse - -test_grammar: test_grammar.cc ../grammar.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_grammar.cc -o test_grammar - -test_sparse_vector: test_sparse_vector.cc ../sparse_vector.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_sparse_vector.cc -o test_sparse_vector - -test_parse: test_parse.cc ../parse.hh ../grammar.hh ../util.hh - $(COMPILER) $(CFLAGS) -lstdc++ -lm $(TCMALLOC) test_parse.cc -o test_parse - -clean: - rm -f test_grammar test_sparse_vector - diff --git a/fast/test/test_grammar b/fast/test/test_grammar Binary files differdeleted file mode 100755 index 6cf7ad5..0000000 --- a/fast/test/test_grammar +++ /dev/null diff --git a/fast/test/test_sparse_vector b/fast/test/test_sparse_vector Binary files differdeleted file mode 100755 index c06fe9e..0000000 --- a/fast/test/test_sparse_vector +++ /dev/null diff --git a/grammar.rb b/prototype/grammar.rb index e91c0f6..42ffbc0 100644 --- a/grammar.rb +++ b/prototype/grammar.rb @@ -1,6 +1,5 @@ module Grammar - class T attr_accessor :word @@ -134,6 +133,5 @@ class Grammar end end - end #module diff --git a/hg.rb b/prototype/hypergraph.rb index a68407c..d43d5ee 100644 --- a/hg.rb +++ b/prototype/hypergraph.rb @@ -1,10 +1,8 @@ require 'zipf' require_relative 'grammar' - module HG - class HG::Node attr_accessor :id, :symbol, :left, :right, :outgoing, :incoming, :score @@ -217,6 +215,5 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal return Hypergraph.new(nodes, edges), nodes_by_id end - end #module diff --git a/parse.rb b/prototype/parse.rb index 1f69c91..adf2b91 100644 --- a/parse.rb +++ b/prototype/parse.rb @@ -1,11 +1,9 @@ require 'zipf' require_relative 'grammar' -require_relative 'hg' - +require_relative 'hypergraph' module Parse - def Parse::visit i, l, r, x=0 i.upto(r-x) { |span| l.upto(r-span) { |k| @@ -205,6 +203,5 @@ def Parse::parse input, n, active_chart, passive_chart, grammar } end - end #module diff --git a/test/test_hg.rb b/prototype/test_hg.rb index 6311bac..4be72bd 100755 --- a/test/test_hg.rb +++ b/prototype/test_hg.rb @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require_relative '../hg' +require_relative 'hypergraph' def main diff --git a/test/test_parse.rb b/prototype/test_parse.rb index c3be5ae..918101b 100755 --- a/test/test_parse.rb +++ b/prototype/test_parse.rb @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require_relative '../parse' +require_relative 'parse' def main diff --git a/main.rb b/prototype/weaver.rb index e41d0f1..966d4c8 100755 --- a/main.rb +++ b/prototype/weaver.rb @@ -4,7 +4,6 @@ require 'trollop' require 'xmlsimple' require_relative 'parse' - def read_grammar fn, add_glue, add_pass_through, input=nil STDERR.write "> reading grammar '#{fn}'\n" grammar = Grammar::Grammar.new fn @@ -67,6 +66,5 @@ def main } end - main diff --git a/fast/main.cc b/src/fast_weaver.cc index 825ad59..4854476 100644 --- a/fast/main.cc +++ b/src/fast_weaver.cc @@ -1,7 +1,6 @@ #include "hypergraph.hh" #include <ctime> - int main(int argc, char** argv) { diff --git a/fast/grammar.hh b/src/grammar.hh index 4906c46..c489ec5 100644 --- a/fast/grammar.hh +++ b/src/grammar.hh @@ -6,6 +6,7 @@ #include <sstream> #include <string> #include <vector> +#include <set> #include <msgpack.hpp> @@ -14,7 +15,6 @@ using namespace std; - namespace G { enum item_type { @@ -40,7 +40,7 @@ struct Item { struct NT : public Item { symbol_t symbol_; - size_t index_; + size_t index_; NT() {} @@ -276,6 +276,7 @@ struct Grammar { vector<Rule*> flat; vector<Rule*> start_non_terminal; vector<Rule*> start_terminal; + set<symbol_t> nts; Grammar() {} @@ -286,6 +287,7 @@ struct Grammar { while (getline(ifs, line)) { G::Rule* r = new G::Rule(line, vocab); rules.push_back(r); + nts.insert(r->lhs->symbol()); if (r->arity == 0) flat.push_back(r); else if (r->rhs.front()->type() == NON_TERMINAL) @@ -295,8 +297,24 @@ struct Grammar { } } - void add_glue(); - // ^^^ TODO + void + add_glue(Vocabulary& vocab) + { + for (auto nt: nts) { + ostringstream oss_1; + oss_1 << "[S] ||| [" << nt << ",1] ||| [" << nt << ",1] ||| "; + cout << oss_1.str() << endl; + Rule* r1 = new Rule(oss_1.str(), vocab); + rules.push_back(r1); start_non_terminal.push_back(r1); + ostringstream oss_2; + oss_2 << "[S] ||| [S,1] [" << nt << ",2] ||| [S,1] [" << nt << ",2] ||| "; + cout << oss_2.str() << endl; + Rule* r2 = new Rule(oss_2.str(), vocab); + cout << *r2 << endl; + rules.push_back(r2); start_non_terminal.push_back(r2); + } + } + void add_pass_through(const string& input); // ^^^ TODO diff --git a/fast/hypergraph.cc b/src/hypergraph.cc index 2b33ff4..40bcc64 100644 --- a/fast/hypergraph.cc +++ b/src/hypergraph.cc @@ -1,6 +1,5 @@ #include "hypergraph.hh" - namespace Hg { template<typename Semiring> void diff --git a/fast/hypergraph.hh b/src/hypergraph.hh index 1c48a88..8e05e9f 100644 --- a/fast/hypergraph.hh +++ b/src/hypergraph.hh @@ -16,11 +16,10 @@ #include "grammar.hh" #include "semiring.hh" #include "sparse_vector.hh" -#include "weaver.hh" +#include "types.hh" using namespace std; - namespace Hg { struct Node; diff --git a/util/make_pak.cc b/src/make_pak.cc index e858155..db3a8a4 100644 --- a/util/make_pak.cc +++ b/src/make_pak.cc @@ -5,12 +5,11 @@ #include <string> #include "json-cpp/single_include/json-cpp.hpp" -#include "../fast/hypergraph.hh" -#include "../fast/weaver.hh" +#include "hypergraph.hh" +#include "types.hh" using namespace std; - struct DummyNode { size_t id; string symbol; diff --git a/src/parse.hh b/src/parse.hh new file mode 100644 index 0000000..0dd2fc0 --- /dev/null +++ b/src/parse.hh @@ -0,0 +1,301 @@ +#pragma once + +#include <vector> +#include <utility> +#include <sstream> +#include <unordered_map> +#include <set> + +#include "grammar.hh" +#include "util.hh" +#include "types.hh" + +using namespace std; + +typedef pair<size_t,size_t> Span; +namespace std { + template <> + struct hash<Span> + { + size_t + operator()(Span const& k) const + { + return ((hash<size_t>()(k.first) + ^ (hash<size_t>()(k.second) << 1)) >> 1); + } + }; +} + +namespace Parse { + +void visit(vector<Span>& p, + size_t i, size_t l, size_t r, size_t x=0) +{ + for (size_t s = i; s <= r-x; s++) { + for (size_t k = l; k <= r-s; k++) { + p.push_back(Span(k,k+s)); + } + } +} + +struct ChartItem +{ + Span span; + G::Rule const* rule; + vector<Span> tails_spans; + size_t dot; + + ChartItem() {} + + ChartItem(G::Rule* r) : rule(r), dot(0) {} + + ChartItem(G::Rule* r, Span s, size_t dot) + : rule(r), span(s), dot(dot) {} + + ChartItem(ChartItem const& o) + : span(o.span), + rule(o.rule), + tails_spans(o.tails_spans), + dot(o.dot) + { + } + + ostream& + repr(ostream& os) const + { + os << "ChartItem<"; + os << "span=(" << span.first << "," << span.second << "), lhs="; + rule->lhs->repr(os); + os << ", dot=" << dot; + os << ", tails=" << tails_spans.size() << ", "; + os << "rule="; + rule->repr(os); + os << ">"; + os << endl; + } + + friend ostream& + operator<<(ostream& os, ChartItem item) + { + item.repr(os); + + return os; + } +}; + +struct Chart +{ + size_t n_; + map<Span, vector<ChartItem*> > m_; + unordered_map<string,bool> b_; + + vector<ChartItem*>& at(Span s) + { + return m_[s]; + } + + string h(symbol_t sym, Span s) + { + ostringstream ss; + ss << sym; + ss << s.first; + ss << s.second; + + return ss.str(); + } + + bool + has_at(symbol_t sym, Span s) + { + return b_[h(sym, s)]; + } + + void add(ChartItem* item, Span s) + { + if (m_.count(s) > 0) + m_[s].push_back(item); + else { + m_.insert(make_pair(s, vector<ChartItem*>{item})); + } + b_[h(item->rule->lhs->symbol(), s)] = true; + } + + Chart(size_t n) : n_(n) {} + + friend ostream& + operator<<(ostream& os, Chart const& chart) + { + for (map<Span, vector<ChartItem*> >::const_iterator it = chart.m_.cbegin(); + it != chart.m_.cend(); it++) { + os << "(" << it->first.first << "," << it->first.second << ")" << endl; + for (auto jt: it->second) + jt->repr(os); os << endl; + } + + return os; + } +}; + +bool +scan(ChartItem* item, vector<symbol_t> in, size_t limit, Chart& passive) +{ + //cout << "S1" << endl; + while (item->dot < item->rule->rhs.size() && + item->rule->rhs[item->dot]->type() == G::TERMINAL) { + //cout << "S2" << endl; + if (item->span.second == limit) return false; + //cout << "S3" << endl; + if (item->rule->rhs[item->dot]->symbol() == in[item->span.second]) { + //cout << "S4" << endl; + item->dot++; + //cout << "S5" << endl; + item->span.second++; + //cout << "S6" << endl; + } else { + //cout << "S7" << endl; + return false; + } + } + //cout << "S8" << endl; + return true; +} + + +void +init(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) +{ + for (auto rule: g.flat) { + size_t j = 0; + for (auto it: in) { + if (it == rule->rhs.front()->symbol()) { + cout << it << " " << j << j+rule->rhs.size() << endl; + Span span(j, j+rule->rhs.size()); + passive.add(new ChartItem(rule, span, rule->rhs.size()), span); + cout << "new passive item [1] " << *passive.at(span).back() << endl; + } + j++; + } + } +} + +void +parse(vector<symbol_t> const& in, size_t n, Chart& active, Chart& passive, G::Grammar const& g) +{ + vector<Span> spans; + Parse::visit(spans, 1, 0, n); + for (auto span: spans) { + + cout << "Span (" << span.first << "," << span.second << ")" << endl; + + for (auto it: g.start_terminal) { + ChartItem* item = new ChartItem(it, Span(span.first,span.first), 0); + if (scan(item, in, span.second, passive) + && span.first + item->rule->rhs.size() <= span.second) { + active.add(item, span); + cout << "new active item [1] " << *active.at(span).back(); + } + } + + for (auto it: g.start_non_terminal) { + if (it->rhs.size() > span.second-span.first) continue; + active.add(new ChartItem(it, Span(span.first,span.first), 0), span); + cout << "new active item [2] " << *active.at(span).back(); + } + + set<symbol_t> new_symbols; + vector<ChartItem*> remaining_items; + + while (true) { + cout << "active size at (" << span.first << "," << span.second << ") " << active.at(span).size() << endl; + cout << "passive size at (" << span.first << "," << span.second << ") " << passive.at(span).size() << endl; + if (active.at(span).empty()) break; + ChartItem* item = active.at(span).back(); + cout << "current item " << *item; + active.at(span).pop_back(); + bool advanced = false; + vector<Span> spans2; + Parse::visit(spans2, 1, span.first, span.second, 1); + for (auto span2: spans2) { + cout << "A" << endl; + if (item->rule->rhs[item->dot]->type() == G::NON_TERMINAL) { + cout << "B" << endl; + if (passive.has_at(item->rule->rhs[item->dot]->symbol(), span2)) { + cout << "C" << endl; + if (span2.first == item->span.second) { + cout << "D" << endl; + ChartItem* new_item = new ChartItem(*item); + cout << "D1" << endl; + new_item->span.second = span2.second; + cout << "D2" << endl; + new_item->dot++; + cout << "D3" << endl; + new_item->tails_spans.push_back(span2); + cout << "D4" << endl; + if (scan(new_item, in, span.second, passive)) { + cout << "E" << endl; + if (new_item->dot == new_item->rule->rhs.size()) { + cout << "F" << endl; + if (new_item->span.first == span.first && new_item->span.second == span.second) { + cout << "G" << endl; + cout << "H" << endl; + new_symbols.insert(new_item->rule->lhs->symbol()); + passive.add(new_item, span); + cout << "new passive item [2] " << *new_item; + advanced = true; + } + } else { + if (new_item->span.second+(new_item->rule->rhs.size()-new_item->dot) <= span.second) { + active.add(new_item, span); + cout << "new active item [3] " << *new_item; + } + } + } + cout << "I" << endl; + } + } + } + } + cout << "J" << endl; + if (!advanced) { + cout << "K" << endl; + remaining_items.push_back(item); + } + } + + for (auto new_sym: new_symbols) { + cout << "new sym " << new_sym << endl; + for (auto rem_item: remaining_items) { + if (rem_item->dot != 0 || + rem_item->rule->rhs[rem_item->dot]->type() != G::NON_TERMINAL) { + continue; + cout << "K1" << endl; + } + cout << "K2" << endl; + if (rem_item->rule->rhs[rem_item->dot]->symbol() == new_sym) { + cout << "K3" << endl; + ChartItem* new_item = new ChartItem(*rem_item); + cout << "K31" << endl; + //new_item->tails_spans[new_item->dot-1] = span; + new_item->tails_spans.push_back(span); + new_item->dot++; + cout << "K32" << endl; + if (new_item->dot == new_item->rule->rhs.size()) { + cout << "K4" << endl; + new_symbols.insert(new_item->rule->lhs->symbol()); + passive.add(new_item, span); + } + } + } + } + + cout << "L" << endl; + cout << "-------------------" << endl; + cout << endl; + } + + //cout << "ACTIVE" << endl << active << endl; + cout << "PASSIVE" << endl << passive << endl; +} + +} // + diff --git a/util/read_pak.cc b/src/read_pak.cc index afd6e6a..c894442 100644 --- a/util/read_pak.cc +++ b/src/read_pak.cc @@ -4,7 +4,6 @@ using namespace std; - int main(int argc, char** argv) { diff --git a/fast/semiring.hh b/src/semiring.hh index 3f4ac08..3f4ac08 100644 --- a/fast/semiring.hh +++ b/src/semiring.hh diff --git a/fast/sparse_vector.hh b/src/sparse_vector.hh index 9d815ff..7fff338 100644 --- a/fast/sparse_vector.hh +++ b/src/sparse_vector.hh @@ -7,7 +7,7 @@ #include <vector> #include "util.hh" -#include "weaver.hh" +#include "types.hh" using namespace std; diff --git a/fast/test/test_grammar.cc b/src/test_grammar.cc index bbe76e7..7fee79a 100644 --- a/fast/test/test_grammar.cc +++ b/src/test_grammar.cc @@ -4,7 +4,6 @@ using namespace std; - int main(int argc, char** argv) { diff --git a/src/test_parse.cc b/src/test_parse.cc new file mode 100644 index 0000000..2d51d44 --- /dev/null +++ b/src/test_parse.cc @@ -0,0 +1,19 @@ +#include "parse.hh" + +int main(int argc, char** argv) +{ + //string in("ich sah ein kleines haus"); + //string in("europa bildet den ersten oder zweiten markt für die zehn am häufigsten von indien exportierten produkte , erklärte der europäische kommissar weiter . die asiatischen und europäischen giganten tauschen jährlich güter im wert von 47 milliarden euro und dienstleistungen im wert von 10 milliarden euro aus , hatte diese woche daniéle smadja , vorsitzende der abordnung der europäischen kommission in neu delhi , erklärt , und bedauert , dass der gegenseitige handel sein potential noch nicht ausgeschöpft hat . die eu und indien treffen sich am freitag zu ihrem achten diplomatischen in neu delhi , bei dem premierminister manmohan singh und der präsident der europäischen kommission josé manuel durao barrosso anwesend sein werden ."); + //string in("aber schon bald nach seinem eintritt kam der erste große erfolg ."); + string in("lebensmittel schuld an europäischer inflation"); + vector<symbol_t> tok = util::tokenize(in); + size_t n = tok.size(); + G::Vocabulary v; + G::Grammar g(argv[1], v); + g.add_glue(v); + Parse::Chart active(n); + Parse::Chart passive(n); + init(tok, n, active, passive, g); + parse(tok, n, active, passive, g); +} + diff --git a/fast/test/test_sparse_vector.cc b/src/test_sparse_vector.cc index 426bed1..69aaa21 100644 --- a/fast/test/test_sparse_vector.cc +++ b/src/test_sparse_vector.cc @@ -1,6 +1,5 @@ #include "sparse_vector.hh" - int main(void) { diff --git a/fast/weaver.hh b/src/types.hh index e89b4dd..e89b4dd 100644 --- a/fast/weaver.hh +++ b/src/types.hh diff --git a/fast/util.hh b/src/util.hh index 9ce19da..93ea320 100644 --- a/fast/util.hh +++ b/src/util.hh @@ -2,7 +2,7 @@ #include <string> -#include "weaver.hh" +#include "types.hh" using namespace std; diff --git a/util/Makefile b/util/Makefile deleted file mode 100644 index 30564fe..0000000 --- a/util/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -COMPILER=clang - - -all: make_pak read_pak - -make_pak: make_pak.cc json-cpp/single_include/json-cpp.hpp ../fast/hypergraph.hh ../fast/weaver.hh - $(COMPILER) -std=c++11 -lstdc++ -lm -lmsgpack make_pak.cc -o make_pak - -read_pak: read_pak.cc - $(COMPILER) -std=c++11 -lstdc++ -lmsgpack read_pak.cc -o read_pak - -clean: - rm -f make_pak read_pak - diff --git a/util/cdec2json.py b/util/cdec2json.py deleted file mode 100755 index e7c8e93..0000000 --- a/util/cdec2json.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python2 - -import cdec -import sys, argparse -import json -import gzip - - -#FIXME new format -# strings? -# map? -def hg2json(hg, weights): - """ - output a JSON representation of a cdec hypegraph - """ - res = '' - res += "{\n" - res += '"rules":[\n' - rules = [] - for i in hg.edges: - s = json.dumps(str(i.trule)) - try: - rules.index(s) - except: - rules.append(s) - res += ",\n".join(rules) - res += "\n],\n" - res += '"nodes":'+"\n" - res += "[\n" - a = [] - a.append( '{ "id":0, "symbol":"root", "span":[-1,-1] }' ) - for i in hg.nodes: - a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) - res += ",\n".join(a)+"\n" - res += "],\n" - res += '"edges":'+"\n" - res += "[\n" - a = [] - for i in hg.edges: - s = "{" - s += '"head":%d'%(i.head_node.id+1) - s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule)))) - # f - #xs = ' "f":{' - #b = [] - #for j in i.feature_values: - # b.append( '"%s":%s'%(j[0], j[1]) ) - #xs += ", ".join(b) - #xs += "}," - # tails - if len(list(i.tail_nodes)) > 0: - s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes])) - else: - s += ', "tails":[ 0 ],' - #s += xs - s += ' "score":%s }'%(i.prob) - a.append(s) - res += ",\n".join(a)+"\n" - res += "]\n" - res += "}\n" - return res - -def main(): - parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') - parser.add_argument('-c', '--config', required=True, help='decoder configuration') - parser.add_argument('-w', '--weights', required=True, help='feature weights') - parser.add_argument('-g', '--grammar', required=False, help='grammar') - args = parser.parse_args() - with open(args.config) as config: - config = config.read() - decoder = cdec.Decoder(config) - decoder.read_weights(args.weights) - ins = sys.stdin.readline().strip() - if args.grammar: - with gzip.open(args.grammar) as grammar: - grammar = grammar.read() - hg = decoder.translate(ins, grammar=grammar) - else: - hg = decoder.translate(ins) - - sys.stderr.write( "input:\n '%s'\n"%(ins) ) - sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) - num_nodes = 0 - for i in hg.nodes: num_nodes+=1 - sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) - num_edges = 0 - for i in hg.edges: num_edges+=1 - sys.stderr.write( "# edges = %s\n"%(num_edges) ) - sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) - - print hg2json(hg, decoder.weights) - - -if __name__=="__main__": - main() - |