summaryrefslogtreecommitdiff
path: root/example
diff options
context:
space:
mode:
Diffstat (limited to 'example')
-rw-r--r--example/1020/cdec.json.gzbin5379230 -> 0 bytes
-rw-r--r--example/1020/weaver.pakbin15604656 -> 0 bytes
-rw-r--r--example/1391/cdec.json.gzbin1226399 -> 0 bytes
-rw-r--r--example/1391/weaver.pakbin3352772 -> 0 bytes
-rw-r--r--example/1495/cdec.json.gzbin4186 -> 0 bytes
-rw-r--r--example/1495/weaver.pakbin22670 -> 0 bytes
-rw-r--r--example/1570/cdec.json.gzbin1802121 -> 0 bytes
-rw-r--r--example/1570/weaver.pakbin5388695 -> 0 bytes
-rw-r--r--example/1889/cdec.json.gzbin45579 -> 0 bytes
-rw-r--r--example/1889/weaver.pakbin227174 -> 0 bytes
-rw-r--r--example/2002/cdec.json.gzbin200453 -> 0 bytes
-rw-r--r--example/2002/weaver.pakbin738937 -> 0 bytes
-rw-r--r--example/3/cdec.json.gzbin327386 -> 0 bytes
-rw-r--r--example/3/in.sgm1
-rw-r--r--example/3/weaver.pakbin998508 -> 0 bytes
-rw-r--r--example/429/cdec.json.gzbin385743 -> 0 bytes
-rw-r--r--example/429/weaver.pakbin1228069 -> 0 bytes
-rw-r--r--example/748/cdec.json.gzbin648 -> 0 bytes
-rw-r--r--example/748/weaver.pakbin431 -> 0 bytes
-rwxr-xr-xexample/cdec/cdec2json.py (renamed from example/cdec2json.py)41
-rw-r--r--example/cdec/default.ini2
-rw-r--r--example/cdec/passthrough.ini2
-rw-r--r--example/glue/in.sgm1
-rwxr-xr-xexample/run172
-rw-r--r--example/toy/in-test.sgm1
-rw-r--r--example/toy/in.sgm1
-rw-r--r--example/toy/toy.json28
-rw-r--r--example/toy/weights.toy (renamed from example/toy/weights)0
28 files changed, 164 insertions, 85 deletions
diff --git a/example/1020/cdec.json.gz b/example/1020/cdec.json.gz
deleted file mode 100644
index e3ff24f..0000000
--- a/example/1020/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/1020/weaver.pak b/example/1020/weaver.pak
deleted file mode 100644
index 5b33440..0000000
--- a/example/1020/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/1391/cdec.json.gz b/example/1391/cdec.json.gz
deleted file mode 100644
index 824bfc8..0000000
--- a/example/1391/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/1391/weaver.pak b/example/1391/weaver.pak
deleted file mode 100644
index cc24919..0000000
--- a/example/1391/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/1495/cdec.json.gz b/example/1495/cdec.json.gz
deleted file mode 100644
index ac0f3da..0000000
--- a/example/1495/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/1495/weaver.pak b/example/1495/weaver.pak
deleted file mode 100644
index 8307d51..0000000
--- a/example/1495/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/1570/cdec.json.gz b/example/1570/cdec.json.gz
deleted file mode 100644
index 4fa5096..0000000
--- a/example/1570/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/1570/weaver.pak b/example/1570/weaver.pak
deleted file mode 100644
index f337dd2..0000000
--- a/example/1570/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/1889/cdec.json.gz b/example/1889/cdec.json.gz
deleted file mode 100644
index 74b6f3e..0000000
--- a/example/1889/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/1889/weaver.pak b/example/1889/weaver.pak
deleted file mode 100644
index b62bc80..0000000
--- a/example/1889/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/2002/cdec.json.gz b/example/2002/cdec.json.gz
deleted file mode 100644
index 6ca177d..0000000
--- a/example/2002/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/2002/weaver.pak b/example/2002/weaver.pak
deleted file mode 100644
index 9b2fccd..0000000
--- a/example/2002/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/3/cdec.json.gz b/example/3/cdec.json.gz
deleted file mode 100644
index 6049a2c..0000000
--- a/example/3/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/3/in.sgm b/example/3/in.sgm
deleted file mode 100644
index a609b54..0000000
--- a/example/3/in.sgm
+++ /dev/null
@@ -1 +0,0 @@
-<seg id='0' grammar='example/3/grammar'>offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .</seg>
diff --git a/example/3/weaver.pak b/example/3/weaver.pak
deleted file mode 100644
index 8d93900..0000000
--- a/example/3/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/429/cdec.json.gz b/example/429/cdec.json.gz
deleted file mode 100644
index a5fe5ad..0000000
--- a/example/429/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/429/weaver.pak b/example/429/weaver.pak
deleted file mode 100644
index ba00865..0000000
--- a/example/429/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/748/cdec.json.gz b/example/748/cdec.json.gz
deleted file mode 100644
index ca169b3..0000000
--- a/example/748/cdec.json.gz
+++ /dev/null
Binary files differ
diff --git a/example/748/weaver.pak b/example/748/weaver.pak
deleted file mode 100644
index 26276f5..0000000
--- a/example/748/weaver.pak
+++ /dev/null
Binary files differ
diff --git a/example/cdec2json.py b/example/cdec/cdec2json.py
index cd2a846..66bc8b4 100755
--- a/example/cdec2json.py
+++ b/example/cdec/cdec2json.py
@@ -2,19 +2,19 @@
import cdec
import sys, argparse
-import json
-import gzip
+import json, gzip
-
-#FIXME new format
-# strings?
-# map?
def hg2json(hg, weights):
"""
- output a JSON representation of a cdec hypegraph
+ hackishly output a JSON representation of a cdec hypegraph
+
+ TODO: cdec now uses boost's serialization instead of JSON
+ (github repo @b88176dc4fd53480e77d601ff63bf5300cf8fc7f
+ still uses the old format)
"""
res = ''
res += "{\n"
+ # rules
res += '"rules":[\n'
rules = []
for i in hg.edges:
@@ -25,14 +25,16 @@ def hg2json(hg, weights):
rules.append(s)
res += ",\n".join(rules)
res += "\n],\n"
+ # nodes
res += '"nodes":'+"\n"
res += "[\n"
a = []
- a.append( '{ "id":0, "symbol":"root", "span":[-1,-1] }' )
+ a.append('{ "id":0, "symbol":"root", "span":[-1,-1] }')
for i in hg.nodes:
a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1]))
res += ",\n".join(a)+"\n"
res += "],\n"
+ # edges
res += '"edges":'+"\n"
res += "[\n"
a = []
@@ -40,19 +42,11 @@ def hg2json(hg, weights):
s = "{"
s += '"head":%d'%(i.head_node.id+1)
s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule))))
- # f
- #xs = ' "f":{'
- #b = []
- #for j in i.feature_values:
- # b.append( '"%s":%s'%(j[0], j[1]) )
- #xs += ", ".join(b)
- #xs += "},"
# tails
if len(list(i.tail_nodes)) > 0:
s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes]))
else:
s += ', "tails":[ 0 ],'
- #s += xs
s += ' "score":%s }'%(i.prob)
a.append(s)
res += ",\n".join(a)+"\n"
@@ -61,7 +55,7 @@ def hg2json(hg, weights):
return res
def main():
- parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
+ parser = argparse.ArgumentParser(description='get a nice json representation of cdec hypergraphs')
parser.add_argument('-c', '--config', required=True, help='decoder configuration')
parser.add_argument('-w', '--weights', required=True, help='feature weights')
parser.add_argument('-g', '--grammar', required=False, help='grammar')
@@ -82,18 +76,17 @@ def main():
else:
hg = decoder.translate(ins)
- sys.stderr.write( "input:\n '%s'\n"%(ins) )
- sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
+ sys.stderr.write("input:\n '%s'\n"%(ins))
+ sys.stderr.write("viterbi translation:\n '%s'\n"%(hg.viterbi()))
num_nodes = 0
for i in hg.nodes: num_nodes+=1
- sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
+ sys.stderr.write("# nodes = %s\n"%(num_nodes))
num_edges = 0
for i in hg.edges: num_edges+=1
- sys.stderr.write( "# edges = %s\n"%(num_edges) )
- sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
-
- print hg2json(hg, decoder.weights)
+ sys.stderr.write("# edges = %s\n"%(num_edges))
+ sys.stderr.write("viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)))
+ print hg2json(hg, decoder.weights).encode('utf-8')
if __name__=="__main__":
main()
diff --git a/example/cdec/default.ini b/example/cdec/default.ini
index 10ea512..8174118 100644
--- a/example/cdec/default.ini
+++ b/example/cdec/default.ini
@@ -1,3 +1,3 @@
formalism=scfg
intersection_strategy=full
-scfg_max_span_limit=999999999
+scfg_max_span_limit=2147483647 # std::numeric_limits<int>::max() on my machine
diff --git a/example/cdec/passthrough.ini b/example/cdec/passthrough.ini
index d9280ad..15ed4b6 100644
--- a/example/cdec/passthrough.ini
+++ b/example/cdec/passthrough.ini
@@ -1,4 +1,4 @@
formalism=scfg
intersection_strategy=full
-scfg_max_span_limit=999999999
+scfg_max_span_limit=2147483647 # std::numeric_limits<int>::max() on my machine
add_pass_through_rules=true
diff --git a/example/glue/in.sgm b/example/glue/in.sgm
deleted file mode 100644
index 2f1a89b..0000000
--- a/example/glue/in.sgm
+++ /dev/null
@@ -1 +0,0 @@
-<seg id='0' grammar='example/glue/grammar'>lebensmittel schuld an europäischer inflation</seg>
diff --git a/example/run b/example/run
index 4440634..5149115 100755
--- a/example/run
+++ b/example/run
@@ -1,65 +1,127 @@
#!/bin/zsh -x
-CDEC_BIN=~/src/cdec-dtrain/decoder/cdec
-CDEC_MINIMAL_BIN=~/src/cdec-dtrain/decoder/minimal_decoder
-WEAVER_PROTOTYPE_BIN=../prototype/weaver.rb
-FAST_WEAVER_BIN=../fast_weaver
+ CDEC=~/src/cdec_json_serialization/decoder/cdec
+ CDEC_MINIMAL=~/src/cdec_json_serialization/decoder/minimal_decoder
+WEAVER_PROTOTYPE=../prototype/weaver_proto.rb
+ FAST_WEAVER=../bin/fast_weaver
+ CDEC2JSON=./cdec/cdec2json.py
+ MAKE_PAK=../bin/make_pak
+# 1020 \
for example in \
+ 1391 \
+ 1495 \
+ 1570 \
+ 1889 \
+ 2002 \
3 \
+ 429 \
+ 748 \
; do
-mkdir -p $example/output
-
-$CDEC_BIN -c cdec/default.ini -w weights/weights -n \
- -g $example/grammar < $example/in \
- > $example/output/cdec.nothing.out \
- 2>$example/output/cdec.nothing.err
-$CDEC_BIN -c cdec/default.ini -w weights/weights \
- -g $example/grammar < $example/in \
- > $example/output/cdec.glue.out \
- 2>$example/output/cdec.glue.err
-$CDEC_BIN -c cdec/passthrough.ini -w weights/weights -n \
- -g $example/grammar < $example/in \
- > $example/output/cdec.passthrough.out \
- 2>$example/output/cdec.passthrough.err
-$CDEC_BIN -c cdec/default.ini -w weights/weights \
- -g $example/grammar < $example/in \
- > $example/output/cdec.default.out \
- 2>$example/output/cdec.default.err
-$CDEC_BIN -c cdec/default.ini -w weights/weights.0 \
- -g $example/grammar < $example/in \
- > $example/output/cdec.default-0.out \
- 2>$example/output/cdec.default-0.err
-
-$CDEC_MINIMAL_BIN $example/cdec.json.gz weights/weights \
- > $example/output/cdec-minimal.out \
- 2>$example/output/cdec-minimal.err
-$CDEC_MINIMAL_BIN $example/cdec.json.gz weights/weights.0 \
- > $example/output/cdec-minimal.0.out \
- 2>$example/output/cdec-minimal.0.err
-
-$FAST_WEAVER_BIN $example/weaver.pak \
- > $example/output/fast_weaver.out \
- 2>$example/output/fast_weaver.err
-
-$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in \
- > $example/output/weaver-prototype.nothing.out \
- 2>$example/output/weaver-prototype.nothing.err
-$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in -l \
- > $example/output/weaver-prototype.glue.out \
- 2>$example/output/weaver-prototype.glue.err
-$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in -p \
- > $example/output/weaver-prototype.passthrough.out \
- 2>$example/output/weaver-prototype.passthrough.err
-$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in \
- -l -p \
- > $example/output/weaver-prototype.default.out \
- 2>$example/output/weaver-prototype.default.err
-$WEAVER_PROTOTYPE_BIN -w weights/weights.0 -g $example/grammar -i $example/in \
- -l -p \
- > $example/output/weaver-prototype.default-0.out \
- 2>$example/output/weaver-prototype.default-0.err
+OUT_DIR=$example/output
+mkdir -p $OUT_DIR
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+# cdec
+#
+$CDEC \
+ -c cdec/default.ini \
+ -w weights/weights \
+ -g $example/grammar \
+ -O $OUT_DIR \
+ < $example/in \
+ > $OUT_DIR/cdec.out \
+ 2>$OUT_DIR/cdec.err
+mv $OUT_DIR/0.json.gz $OUT_DIR/cdec.json.gz
+
+# +passthrough
+$CDEC \
+ -c cdec/passthrough.ini \
+ -w weights/weights \
+ -g $example/grammar \
+ -O $OUT_DIR \
+ < $example/in \
+ > $OUT_DIR/cdec.passthrough.out \
+ 2>$OUT_DIR/cdec.passthrough.err
+mv $OUT_DIR/0.json.gz $OUT_DIR/cdec.passthrough.json.gz
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+# cdec2json
+#
+$CDEC2JSON \
+ -c cdec/default.ini \
+ -w weights/weights \
+ -g $example/grammar \
+ < $example/in \
+ > $OUT_DIR/cdec2json.json \
+ 2>$OUT_DIR/cdec2json.err
+
+# +passthrough
+$CDEC2JSON \
+ -c cdec/passthrough.ini \
+ -w weights/weights \
+ -g $example/grammar \
+ < $example/in \
+ > $OUT_DIR/cdec2json.passthrough.json \
+ 2>$OUT_DIR/cdec2json.passthrough.err
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+# cdec minimal_decoder
+#
+$CDEC_MINIMAL \
+ $OUT_DIR/cdec.json.gz \
+ weights/weights \
+ > $OUT_DIR/cdec_minimal.out \
+ 2>$OUT_DIR/cdec_minimal.err
+
+$CDEC_MINIMAL \
+ $OUT_DIR/cdec.passthrough.json.gz \
+ weights/weights \
+ > $OUT_DIR/cdec_minimal.passthrough.out \
+ 2>$OUT_DIR/cdec_minimal.passthrough.err
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+# fast_weaver
+#
+# make pak files first
+$MAKE_PAK \
+ $OUT_DIR/cdec2json.json \
+ $OUT_DIR/weaver.pak
+$MAKE_PAK \
+ $OUT_DIR/cdec2json.passthrough.json \
+ $OUT_DIR/weaver.passthrough.pak
+
+$FAST_WEAVER \
+ $OUT_DIR/weaver.pak \
+ > $OUT_DIR/fast_weaver.out \
+ 2>$OUT_DIR/fast_weaver.err
+
+$FAST_WEAVER \
+ $OUT_DIR/weaver.passthrough.pak \
+ > $OUT_DIR/fast_weaver.passthrough.out \
+ 2>$OUT_DIR/fast_weaver.passthrough.err
+
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+# weaver prototype
+#
+$WEAVER_PROTOTYPE \
+ -w weights/weights \
+ -g $example/grammar \
+ -i $example/in \
+ -l \
+ > $OUT_DIR/weaver_proto.out \
+ 2>$OUT_DIR/weaver_proto.err
+
+# +passthrough
+$WEAVER_PROTOTYPE \
+ -w weights/weights \
+ -g $example/grammar \
+ -i $example/in \
+ -l \
+ -p \
+ > $OUT_DIR/weaver_proto.passthrough.out \
+ 2>$OUT_DIR/weaver_proto.passthrough.err
done
diff --git a/example/toy/in-test.sgm b/example/toy/in-test.sgm
deleted file mode 100644
index a13f93c..0000000
--- a/example/toy/in-test.sgm
+++ /dev/null
@@ -1 +0,0 @@
-<seg id='0' grammar='example/toy/grammar-test'>ich sah ein kleines haus</seg>
diff --git a/example/toy/in.sgm b/example/toy/in.sgm
deleted file mode 100644
index 561d346..0000000
--- a/example/toy/in.sgm
+++ /dev/null
@@ -1 +0,0 @@
-<seg id='0' grammar='example/toy/grammar'>ich sah ein kleines haus</seg>
diff --git a/example/toy/toy.json b/example/toy/toy.json
new file mode 100644
index 0000000..c1b5c55
--- /dev/null
+++ b/example/toy/toy.json
@@ -0,0 +1,28 @@
+{
+"weights":{"logp":2.0,"use_house":0.0,"use_shell":1.0},
+"nodes":
+[
+{ "id":-1, "cat":"root", "span":[-1,-1] },
+{ "id":0, "cat":"NP", "span":[0,1] },
+{ "id":1, "cat":"V", "span":[1,2] },
+{ "id":2, "cat":"JJ", "span":[3,4] },
+{ "id":3, "cat":"NN", "span":[3,5] },
+{ "id":4, "cat":"NP", "span":[2,5] },
+{ "id":5, "cat":"VP", "span":[1,5] },
+{ "id":6, "cat":"S", "span":[0,5] }
+],
+"edges":
+[
+{ "head":0, "rule":"[NP] ||| ich ||| i", "tails":[-1], "f":{"logp":-0.5,"use_i":1.0} },
+{ "head":1, "rule":"[V] ||| sah ||| saw", "tails":[-1], "f":{"logp":-0.25,"use_saw":1.0} },
+{ "head":2, "rule":"[JJ] ||| kleines ||| small", "tails":[-1], "f":{"logp":0.0,"use_small":1.0} },
+{ "head":2, "rule":"[JJ] ||| kleines ||| little", "tails":[-1], "f":{"logp":0.0,"use_little":1.0} },
+{ "head":3, "rule":"[NN] ||| kleines haus ||| small house", "tails":[-1], "f":{"logp":0.0,"use_house":1.0} },
+{ "head":3, "rule":"[NN] ||| kleines haus ||| little house", "tails":[-1], "f":{"logp":0.0,"use_house":1.0} },
+{ "head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] shell", "tails":[2], "f":{"logp":0.0,"use_shell":1.0} },
+{ "head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] house", "tails":[2], "f":{"logp":0.0,"use_house":1.0} },
+{ "head":4, "rule":"[NP] ||| ein [NN,1] ||| a [NN,1]", "tails":[3], "f":{"logp":0.0,"use_a":1.0} },
+{ "head":5, "rule":"[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2]", "tails":[1, 4], "f":{"logp":0.0} },
+{ "head":6, "rule":"[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2]", "tails":[0, 5], "f":{"logp":0.0} }
+]
+}
diff --git a/example/toy/weights b/example/toy/weights.toy
index 70075b7..70075b7 100644
--- a/example/toy/weights
+++ b/example/toy/weights.toy