diff options
-rw-r--r-- | example/1020/cdec.json.gz | bin | 5379230 -> 0 bytes | |||
-rw-r--r-- | example/1020/weaver.pak | bin | 15604656 -> 0 bytes | |||
-rw-r--r-- | example/1391/cdec.json.gz | bin | 1226399 -> 0 bytes | |||
-rw-r--r-- | example/1391/weaver.pak | bin | 3352772 -> 0 bytes | |||
-rw-r--r-- | example/1495/cdec.json.gz | bin | 4186 -> 0 bytes | |||
-rw-r--r-- | example/1495/weaver.pak | bin | 22670 -> 0 bytes | |||
-rw-r--r-- | example/1570/cdec.json.gz | bin | 1802121 -> 0 bytes | |||
-rw-r--r-- | example/1570/weaver.pak | bin | 5388695 -> 0 bytes | |||
-rw-r--r-- | example/1889/cdec.json.gz | bin | 45579 -> 0 bytes | |||
-rw-r--r-- | example/1889/weaver.pak | bin | 227174 -> 0 bytes | |||
-rw-r--r-- | example/2002/cdec.json.gz | bin | 200453 -> 0 bytes | |||
-rw-r--r-- | example/2002/weaver.pak | bin | 738937 -> 0 bytes | |||
-rw-r--r-- | example/3/cdec.json.gz | bin | 327386 -> 0 bytes | |||
-rw-r--r-- | example/3/in.sgm | 1 | ||||
-rw-r--r-- | example/3/weaver.pak | bin | 998508 -> 0 bytes | |||
-rw-r--r-- | example/429/cdec.json.gz | bin | 385743 -> 0 bytes | |||
-rw-r--r-- | example/429/weaver.pak | bin | 1228069 -> 0 bytes | |||
-rw-r--r-- | example/748/cdec.json.gz | bin | 648 -> 0 bytes | |||
-rw-r--r-- | example/748/weaver.pak | bin | 431 -> 0 bytes | |||
-rwxr-xr-x | example/cdec/cdec2json.py (renamed from example/cdec2json.py) | 41 | ||||
-rw-r--r-- | example/cdec/default.ini | 2 | ||||
-rw-r--r-- | example/cdec/passthrough.ini | 2 | ||||
-rw-r--r-- | example/glue/in.sgm | 1 | ||||
-rwxr-xr-x | example/run | 172 | ||||
-rw-r--r-- | example/toy/in-test.sgm | 1 | ||||
-rw-r--r-- | example/toy/in.sgm | 1 | ||||
-rw-r--r-- | example/toy/toy.json | 28 | ||||
-rw-r--r-- | example/toy/weights.toy (renamed from example/toy/weights) | 0 |
28 files changed, 164 insertions, 85 deletions
diff --git a/example/1020/cdec.json.gz b/example/1020/cdec.json.gz Binary files differdeleted file mode 100644 index e3ff24f..0000000 --- a/example/1020/cdec.json.gz +++ /dev/null diff --git a/example/1020/weaver.pak b/example/1020/weaver.pak Binary files differdeleted file mode 100644 index 5b33440..0000000 --- a/example/1020/weaver.pak +++ /dev/null diff --git a/example/1391/cdec.json.gz b/example/1391/cdec.json.gz Binary files differdeleted file mode 100644 index 824bfc8..0000000 --- a/example/1391/cdec.json.gz +++ /dev/null diff --git a/example/1391/weaver.pak b/example/1391/weaver.pak Binary files differdeleted file mode 100644 index cc24919..0000000 --- a/example/1391/weaver.pak +++ /dev/null diff --git a/example/1495/cdec.json.gz b/example/1495/cdec.json.gz Binary files differdeleted file mode 100644 index ac0f3da..0000000 --- a/example/1495/cdec.json.gz +++ /dev/null diff --git a/example/1495/weaver.pak b/example/1495/weaver.pak Binary files differdeleted file mode 100644 index 8307d51..0000000 --- a/example/1495/weaver.pak +++ /dev/null diff --git a/example/1570/cdec.json.gz b/example/1570/cdec.json.gz Binary files differdeleted file mode 100644 index 4fa5096..0000000 --- a/example/1570/cdec.json.gz +++ /dev/null diff --git a/example/1570/weaver.pak b/example/1570/weaver.pak Binary files differdeleted file mode 100644 index f337dd2..0000000 --- a/example/1570/weaver.pak +++ /dev/null diff --git a/example/1889/cdec.json.gz b/example/1889/cdec.json.gz Binary files differdeleted file mode 100644 index 74b6f3e..0000000 --- a/example/1889/cdec.json.gz +++ /dev/null diff --git a/example/1889/weaver.pak b/example/1889/weaver.pak Binary files differdeleted file mode 100644 index b62bc80..0000000 --- a/example/1889/weaver.pak +++ /dev/null diff --git a/example/2002/cdec.json.gz b/example/2002/cdec.json.gz Binary files differdeleted file mode 100644 index 6ca177d..0000000 --- a/example/2002/cdec.json.gz +++ /dev/null diff --git a/example/2002/weaver.pak b/example/2002/weaver.pak Binary files differdeleted file mode 100644 index 9b2fccd..0000000 --- a/example/2002/weaver.pak +++ /dev/null diff --git a/example/3/cdec.json.gz b/example/3/cdec.json.gz Binary files differdeleted file mode 100644 index 6049a2c..0000000 --- a/example/3/cdec.json.gz +++ /dev/null diff --git a/example/3/in.sgm b/example/3/in.sgm deleted file mode 100644 index a609b54..0000000 --- a/example/3/in.sgm +++ /dev/null @@ -1 +0,0 @@ -<seg id='0' grammar='example/3/grammar'>offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .</seg> diff --git a/example/3/weaver.pak b/example/3/weaver.pak Binary files differdeleted file mode 100644 index 8d93900..0000000 --- a/example/3/weaver.pak +++ /dev/null diff --git a/example/429/cdec.json.gz b/example/429/cdec.json.gz Binary files differdeleted file mode 100644 index a5fe5ad..0000000 --- a/example/429/cdec.json.gz +++ /dev/null diff --git a/example/429/weaver.pak b/example/429/weaver.pak Binary files differdeleted file mode 100644 index ba00865..0000000 --- a/example/429/weaver.pak +++ /dev/null diff --git a/example/748/cdec.json.gz b/example/748/cdec.json.gz Binary files differdeleted file mode 100644 index ca169b3..0000000 --- a/example/748/cdec.json.gz +++ /dev/null diff --git a/example/748/weaver.pak b/example/748/weaver.pak Binary files differdeleted file mode 100644 index 26276f5..0000000 --- a/example/748/weaver.pak +++ /dev/null diff --git a/example/cdec2json.py b/example/cdec/cdec2json.py index cd2a846..66bc8b4 100755 --- a/example/cdec2json.py +++ b/example/cdec/cdec2json.py @@ -2,19 +2,19 @@ import cdec import sys, argparse -import json -import gzip +import json, gzip - -#FIXME new format -# strings? -# map? def hg2json(hg, weights): """ - output a JSON representation of a cdec hypegraph + hackishly output a JSON representation of a cdec hypegraph + + TODO: cdec now uses boost's serialization instead of JSON + (github repo @b88176dc4fd53480e77d601ff63bf5300cf8fc7f + still uses the old format) """ res = '' res += "{\n" + # rules res += '"rules":[\n' rules = [] for i in hg.edges: @@ -25,14 +25,16 @@ def hg2json(hg, weights): rules.append(s) res += ",\n".join(rules) res += "\n],\n" + # nodes res += '"nodes":'+"\n" res += "[\n" a = [] - a.append( '{ "id":0, "symbol":"root", "span":[-1,-1] }' ) + a.append('{ "id":0, "symbol":"root", "span":[-1,-1] }') for i in hg.nodes: a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) res += ",\n".join(a)+"\n" res += "],\n" + # edges res += '"edges":'+"\n" res += "[\n" a = [] @@ -40,19 +42,11 @@ def hg2json(hg, weights): s = "{" s += '"head":%d'%(i.head_node.id+1) s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule)))) - # f - #xs = ' "f":{' - #b = [] - #for j in i.feature_values: - # b.append( '"%s":%s'%(j[0], j[1]) ) - #xs += ", ".join(b) - #xs += "}," # tails if len(list(i.tail_nodes)) > 0: s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes])) else: s += ', "tails":[ 0 ],' - #s += xs s += ' "score":%s }'%(i.prob) a.append(s) res += ",\n".join(a)+"\n" @@ -61,7 +55,7 @@ def hg2json(hg, weights): return res def main(): - parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') + parser = argparse.ArgumentParser(description='get a nice json representation of cdec hypergraphs') parser.add_argument('-c', '--config', required=True, help='decoder configuration') parser.add_argument('-w', '--weights', required=True, help='feature weights') parser.add_argument('-g', '--grammar', required=False, help='grammar') @@ -82,18 +76,17 @@ def main(): else: hg = decoder.translate(ins) - sys.stderr.write( "input:\n '%s'\n"%(ins) ) - sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) + sys.stderr.write("input:\n '%s'\n"%(ins)) + sys.stderr.write("viterbi translation:\n '%s'\n"%(hg.viterbi())) num_nodes = 0 for i in hg.nodes: num_nodes+=1 - sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) + sys.stderr.write("# nodes = %s\n"%(num_nodes)) num_edges = 0 for i in hg.edges: num_edges+=1 - sys.stderr.write( "# edges = %s\n"%(num_edges) ) - sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) - - print hg2json(hg, decoder.weights) + sys.stderr.write("# edges = %s\n"%(num_edges)) + sys.stderr.write("viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2))) + print hg2json(hg, decoder.weights).encode('utf-8') if __name__=="__main__": main() diff --git a/example/cdec/default.ini b/example/cdec/default.ini index 10ea512..8174118 100644 --- a/example/cdec/default.ini +++ b/example/cdec/default.ini @@ -1,3 +1,3 @@ formalism=scfg intersection_strategy=full -scfg_max_span_limit=999999999 +scfg_max_span_limit=2147483647 # std::numeric_limits<int>::max() on my machine diff --git a/example/cdec/passthrough.ini b/example/cdec/passthrough.ini index d9280ad..15ed4b6 100644 --- a/example/cdec/passthrough.ini +++ b/example/cdec/passthrough.ini @@ -1,4 +1,4 @@ formalism=scfg intersection_strategy=full -scfg_max_span_limit=999999999 +scfg_max_span_limit=2147483647 # std::numeric_limits<int>::max() on my machine add_pass_through_rules=true diff --git a/example/glue/in.sgm b/example/glue/in.sgm deleted file mode 100644 index 2f1a89b..0000000 --- a/example/glue/in.sgm +++ /dev/null @@ -1 +0,0 @@ -<seg id='0' grammar='example/glue/grammar'>lebensmittel schuld an europäischer inflation</seg> diff --git a/example/run b/example/run index 4440634..5149115 100755 --- a/example/run +++ b/example/run @@ -1,65 +1,127 @@ #!/bin/zsh -x -CDEC_BIN=~/src/cdec-dtrain/decoder/cdec -CDEC_MINIMAL_BIN=~/src/cdec-dtrain/decoder/minimal_decoder -WEAVER_PROTOTYPE_BIN=../prototype/weaver.rb -FAST_WEAVER_BIN=../fast_weaver + CDEC=~/src/cdec_json_serialization/decoder/cdec + CDEC_MINIMAL=~/src/cdec_json_serialization/decoder/minimal_decoder +WEAVER_PROTOTYPE=../prototype/weaver_proto.rb + FAST_WEAVER=../bin/fast_weaver + CDEC2JSON=./cdec/cdec2json.py + MAKE_PAK=../bin/make_pak +# 1020 \ for example in \ + 1391 \ + 1495 \ + 1570 \ + 1889 \ + 2002 \ 3 \ + 429 \ + 748 \ ; do -mkdir -p $example/output - -$CDEC_BIN -c cdec/default.ini -w weights/weights -n \ - -g $example/grammar < $example/in \ - > $example/output/cdec.nothing.out \ - 2>$example/output/cdec.nothing.err -$CDEC_BIN -c cdec/default.ini -w weights/weights \ - -g $example/grammar < $example/in \ - > $example/output/cdec.glue.out \ - 2>$example/output/cdec.glue.err -$CDEC_BIN -c cdec/passthrough.ini -w weights/weights -n \ - -g $example/grammar < $example/in \ - > $example/output/cdec.passthrough.out \ - 2>$example/output/cdec.passthrough.err -$CDEC_BIN -c cdec/default.ini -w weights/weights \ - -g $example/grammar < $example/in \ - > $example/output/cdec.default.out \ - 2>$example/output/cdec.default.err -$CDEC_BIN -c cdec/default.ini -w weights/weights.0 \ - -g $example/grammar < $example/in \ - > $example/output/cdec.default-0.out \ - 2>$example/output/cdec.default-0.err - -$CDEC_MINIMAL_BIN $example/cdec.json.gz weights/weights \ - > $example/output/cdec-minimal.out \ - 2>$example/output/cdec-minimal.err -$CDEC_MINIMAL_BIN $example/cdec.json.gz weights/weights.0 \ - > $example/output/cdec-minimal.0.out \ - 2>$example/output/cdec-minimal.0.err - -$FAST_WEAVER_BIN $example/weaver.pak \ - > $example/output/fast_weaver.out \ - 2>$example/output/fast_weaver.err - -$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in \ - > $example/output/weaver-prototype.nothing.out \ - 2>$example/output/weaver-prototype.nothing.err -$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in -l \ - > $example/output/weaver-prototype.glue.out \ - 2>$example/output/weaver-prototype.glue.err -$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in -p \ - > $example/output/weaver-prototype.passthrough.out \ - 2>$example/output/weaver-prototype.passthrough.err -$WEAVER_PROTOTYPE_BIN -w weights/weights -g $example/grammar -i $example/in \ - -l -p \ - > $example/output/weaver-prototype.default.out \ - 2>$example/output/weaver-prototype.default.err -$WEAVER_PROTOTYPE_BIN -w weights/weights.0 -g $example/grammar -i $example/in \ - -l -p \ - > $example/output/weaver-prototype.default-0.out \ - 2>$example/output/weaver-prototype.default-0.err +OUT_DIR=$example/output +mkdir -p $OUT_DIR + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# cdec +# +$CDEC \ + -c cdec/default.ini \ + -w weights/weights \ + -g $example/grammar \ + -O $OUT_DIR \ + < $example/in \ + > $OUT_DIR/cdec.out \ + 2>$OUT_DIR/cdec.err +mv $OUT_DIR/0.json.gz $OUT_DIR/cdec.json.gz + +# +passthrough +$CDEC \ + -c cdec/passthrough.ini \ + -w weights/weights \ + -g $example/grammar \ + -O $OUT_DIR \ + < $example/in \ + > $OUT_DIR/cdec.passthrough.out \ + 2>$OUT_DIR/cdec.passthrough.err +mv $OUT_DIR/0.json.gz $OUT_DIR/cdec.passthrough.json.gz + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# cdec2json +# +$CDEC2JSON \ + -c cdec/default.ini \ + -w weights/weights \ + -g $example/grammar \ + < $example/in \ + > $OUT_DIR/cdec2json.json \ + 2>$OUT_DIR/cdec2json.err + +# +passthrough +$CDEC2JSON \ + -c cdec/passthrough.ini \ + -w weights/weights \ + -g $example/grammar \ + < $example/in \ + > $OUT_DIR/cdec2json.passthrough.json \ + 2>$OUT_DIR/cdec2json.passthrough.err + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# cdec minimal_decoder +# +$CDEC_MINIMAL \ + $OUT_DIR/cdec.json.gz \ + weights/weights \ + > $OUT_DIR/cdec_minimal.out \ + 2>$OUT_DIR/cdec_minimal.err + +$CDEC_MINIMAL \ + $OUT_DIR/cdec.passthrough.json.gz \ + weights/weights \ + > $OUT_DIR/cdec_minimal.passthrough.out \ + 2>$OUT_DIR/cdec_minimal.passthrough.err + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# fast_weaver +# +# make pak files first +$MAKE_PAK \ + $OUT_DIR/cdec2json.json \ + $OUT_DIR/weaver.pak +$MAKE_PAK \ + $OUT_DIR/cdec2json.passthrough.json \ + $OUT_DIR/weaver.passthrough.pak + +$FAST_WEAVER \ + $OUT_DIR/weaver.pak \ + > $OUT_DIR/fast_weaver.out \ + 2>$OUT_DIR/fast_weaver.err + +$FAST_WEAVER \ + $OUT_DIR/weaver.passthrough.pak \ + > $OUT_DIR/fast_weaver.passthrough.out \ + 2>$OUT_DIR/fast_weaver.passthrough.err + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +# weaver prototype +# +$WEAVER_PROTOTYPE \ + -w weights/weights \ + -g $example/grammar \ + -i $example/in \ + -l \ + > $OUT_DIR/weaver_proto.out \ + 2>$OUT_DIR/weaver_proto.err + +# +passthrough +$WEAVER_PROTOTYPE \ + -w weights/weights \ + -g $example/grammar \ + -i $example/in \ + -l \ + -p \ + > $OUT_DIR/weaver_proto.passthrough.out \ + 2>$OUT_DIR/weaver_proto.passthrough.err done diff --git a/example/toy/in-test.sgm b/example/toy/in-test.sgm deleted file mode 100644 index a13f93c..0000000 --- a/example/toy/in-test.sgm +++ /dev/null @@ -1 +0,0 @@ -<seg id='0' grammar='example/toy/grammar-test'>ich sah ein kleines haus</seg> diff --git a/example/toy/in.sgm b/example/toy/in.sgm deleted file mode 100644 index 561d346..0000000 --- a/example/toy/in.sgm +++ /dev/null @@ -1 +0,0 @@ -<seg id='0' grammar='example/toy/grammar'>ich sah ein kleines haus</seg> diff --git a/example/toy/toy.json b/example/toy/toy.json new file mode 100644 index 0000000..c1b5c55 --- /dev/null +++ b/example/toy/toy.json @@ -0,0 +1,28 @@ +{ +"weights":{"logp":2.0,"use_house":0.0,"use_shell":1.0}, +"nodes": +[ +{ "id":-1, "cat":"root", "span":[-1,-1] }, +{ "id":0, "cat":"NP", "span":[0,1] }, +{ "id":1, "cat":"V", "span":[1,2] }, +{ "id":2, "cat":"JJ", "span":[3,4] }, +{ "id":3, "cat":"NN", "span":[3,5] }, +{ "id":4, "cat":"NP", "span":[2,5] }, +{ "id":5, "cat":"VP", "span":[1,5] }, +{ "id":6, "cat":"S", "span":[0,5] } +], +"edges": +[ +{ "head":0, "rule":"[NP] ||| ich ||| i", "tails":[-1], "f":{"logp":-0.5,"use_i":1.0} }, +{ "head":1, "rule":"[V] ||| sah ||| saw", "tails":[-1], "f":{"logp":-0.25,"use_saw":1.0} }, +{ "head":2, "rule":"[JJ] ||| kleines ||| small", "tails":[-1], "f":{"logp":0.0,"use_small":1.0} }, +{ "head":2, "rule":"[JJ] ||| kleines ||| little", "tails":[-1], "f":{"logp":0.0,"use_little":1.0} }, +{ "head":3, "rule":"[NN] ||| kleines haus ||| small house", "tails":[-1], "f":{"logp":0.0,"use_house":1.0} }, +{ "head":3, "rule":"[NN] ||| kleines haus ||| little house", "tails":[-1], "f":{"logp":0.0,"use_house":1.0} }, +{ "head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] shell", "tails":[2], "f":{"logp":0.0,"use_shell":1.0} }, +{ "head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] house", "tails":[2], "f":{"logp":0.0,"use_house":1.0} }, +{ "head":4, "rule":"[NP] ||| ein [NN,1] ||| a [NN,1]", "tails":[3], "f":{"logp":0.0,"use_a":1.0} }, +{ "head":5, "rule":"[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2]", "tails":[1, 4], "f":{"logp":0.0} }, +{ "head":6, "rule":"[S] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2]", "tails":[0, 5], "f":{"logp":0.0} } +] +} diff --git a/example/toy/weights b/example/toy/weights.toy index 70075b7..70075b7 100644 --- a/example/toy/weights +++ b/example/toy/weights.toy |