From e834fd628f61aca04f98691eb56e9808c33c6787 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Thu, 12 Jun 2014 14:39:37 +0200 Subject: cleanup --- README.md | 7 ++-- example/3/3.in | 1 - example/3/cdec.ini | 5 ++- example/3/in | 1 + example/toy/cdec.ini | 2 +- example/toy/grammar-test | 24 +++++++------- example/toy/toy-cdec.json | 33 +++++++++++++++++++ example/toy/toy1.json | 33 ------------------- util/cdec2json.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++ util/cdec_hg_to_json.py | 82 ----------------------------------------------- 10 files changed, 135 insertions(+), 135 deletions(-) delete mode 100644 example/3/3.in create mode 100644 example/3/in create mode 100644 example/toy/toy-cdec.json delete mode 100644 example/toy/toy1.json create mode 100755 util/cdec2json.py delete mode 100755 util/cdec_hg_to_json.py diff --git a/README.md b/README.md index 98065f1..edf89bd 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ -nothing to see here +not much to see here, yet (SCFG machine translation decoder in ruby, currently implements CKY+ parsing and hypergraph viterbi) helpful stuff * https://github.com/jweese/thrax/wiki/Glue-grammar + * http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format todo - * animate? - * to json +==== + * animate parsing? * integrate with HG diff --git a/example/3/3.in b/example/3/3.in deleted file mode 100644 index ee3509c..0000000 --- a/example/3/3.in +++ /dev/null @@ -1 +0,0 @@ -offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg . diff --git a/example/3/cdec.ini b/example/3/cdec.ini index 23c6ad3..ee65b4e 100644 --- a/example/3/cdec.ini +++ b/example/3/cdec.ini @@ -1,6 +1,5 @@ formalism=scfg -grammar=grammars/grammar.3.gz intersection_strategy=full -#grammar=grammar-test +grammar=grammars/grammar.3.gz #add_pass_through_rules=true -#weights=weights + diff --git a/example/3/in b/example/3/in new file mode 100644 index 0000000..ee3509c --- /dev/null +++ b/example/3/in @@ -0,0 +1 @@ +offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg . diff --git a/example/toy/cdec.ini b/example/toy/cdec.ini index 8276d9b..f12de54 100644 --- a/example/toy/cdec.ini +++ b/example/toy/cdec.ini @@ -1,5 +1,5 @@ formalism=scfg +intersection_strategy=full grammar=grammar #grammar=grammar-test #add_pass_through_rules=true -#weights=weights diff --git a/example/toy/grammar-test b/example/toy/grammar-test index 4027c71..18900ae 100644 --- a/example/toy/grammar-test +++ b/example/toy/grammar-test @@ -1,22 +1,22 @@ -[S] ||| [B,1] ||| [1] ||| logp=0 -[S] ||| ich [V,1] ein [JJ,2] haus ||| i [1] a [2] ||| logp=0 +[S] ||| [B,1] ||| [B,1] ||| logp=0 +[S] ||| ich [V,1] ein [JJ,2] haus ||| i [V,1] a [JJ,2] ||| logp=0 [S] ||| ich sah ein kleines haus ||| i saw a small house ||| logp=0 -[S] ||| ich sah ein [JJ,1] ||| i saw a [1] ||| logp=0 -[B] ||| [C,1] ||| [1] ||| logp=0 -[C] ||| [Q,1] ||| [1] ||| logp=0 -[Q] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 +[S] ||| ich sah ein [JJ,1] ||| i saw a [JJ,1] ||| logp=0 +[B] ||| [C,1] ||| [C,1] ||| logp=0 +[C] ||| [Q,1] ||| [Q,1] ||| logp=0 +[Q] ||| [NP,1] [VP,2] ||| [NP,1] [VP,2] ||| logp=0 [NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0 -[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0 -[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 -[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 -[JJ] ||| [H,1] ||| [1] ||| logp=0 +[NP] ||| ein [NN,1] ||| a [NN,1] ||| logp=0 use_a=1.0 +[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| logp=0 use_house=1 +[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| logp=0 use_shell=1 +[JJ] ||| [H,1] ||| [H,1] ||| logp=0 [H] ||| kleines [Z,1] ||| small [1] ||| logp=0 -[Z] ||| [I,1] ||| [1] ||| logp=0 +[Z] ||| [I,1] ||| [I,1] ||| logp=0 [I] ||| haus ||| house ||| logp=0 [JJ] ||| kleines ||| small ||| logp=0 use_small=1.0 [JJ] ||| kleines ||| little ||| logp=0 use_little=1.0 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 -[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 +[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| logp=0 [V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0 [V] ||| fand ||| found ||| logp=0 diff --git a/example/toy/toy-cdec.json b/example/toy/toy-cdec.json new file mode 100644 index 0000000..c58c249 --- /dev/null +++ b/example/toy/toy-cdec.json @@ -0,0 +1,33 @@ +{ +"weights":{ +"logp":2.0, "use_shell":1.0 +}, +"nodes": +[ +{ "id":-1, "cat":"root", "span":[-1,-1] }, +{ "id":0, "cat":"NP", "span":[0,1] }, +{ "id":1, "cat":"V", "span":[1,2] }, +{ "id":2, "cat":"JJ", "span":[3,4] }, +{ "id":3, "cat":"NN", "span":[3,5] }, +{ "id":4, "cat":"NP", "span":[2,5] }, +{ "id":5, "cat":"VP", "span":[1,5] }, +{ "id":6, "cat":"S", "span":[0,5] }, +{ "id":7, "cat":"Goal", "span":[0,5] } +], +"edges": +[ +{"head":0, "rule":"[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0", "tails":[ -1 ], "f":{"logp":-0.5, "use_i":1.0}, "weight":0.367879441171 }, +{"head":1, "rule":"[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0", "tails":[ -1 ], "f":{"logp":-0.25, "use_saw":1.0}, "weight":0.606530659713 }, +{"head":2, "rule":"[JJ] ||| kleines ||| small ||| logp=0.0 use_small=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_small":1.0}, "weight":1.0 }, +{"head":2, "rule":"[JJ] ||| kleines ||| little ||| logp=0.0 use_little=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_little":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| kleines haus ||| small house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| kleines haus ||| little house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| logp=0.0 use_house=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| logp=0.0 use_shell=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_shell":1.0}, "weight":2.71828182846 }, +{"head":4, "rule":"[NP] ||| ein [NN,1] ||| a [NN,1] ||| logp=0.0 use_a=1.0", "tails":[ 3 ], "f":{"logp":0.0, "use_a":1.0}, "weight":1.0 }, +{"head":5, "rule":"[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| logp=0.0", "tails":[ 1,4 ], "f":{"logp":0.0}, "weight":1.0 }, +{"head":6, "rule":"[S] ||| [NP,1] [VP,2] ||| [VP,1] [NP,2] ||| logp=0.0", "tails":[ 0,5 ], "f":{"logp":0.0}, "weight":1.0 }, +{"head":7, "rule":"[Goal] ||| [S,1] ||| [S,1] ||| ", "tails":[ 6 ], "f":{}, "weight":1.0 } +] +} + diff --git a/example/toy/toy1.json b/example/toy/toy1.json deleted file mode 100644 index c58c249..0000000 --- a/example/toy/toy1.json +++ /dev/null @@ -1,33 +0,0 @@ -{ -"weights":{ -"logp":2.0, "use_shell":1.0 -}, -"nodes": -[ -{ "id":-1, "cat":"root", "span":[-1,-1] }, -{ "id":0, "cat":"NP", "span":[0,1] }, -{ "id":1, "cat":"V", "span":[1,2] }, -{ "id":2, "cat":"JJ", "span":[3,4] }, -{ "id":3, "cat":"NN", "span":[3,5] }, -{ "id":4, "cat":"NP", "span":[2,5] }, -{ "id":5, "cat":"VP", "span":[1,5] }, -{ "id":6, "cat":"S", "span":[0,5] }, -{ "id":7, "cat":"Goal", "span":[0,5] } -], -"edges": -[ -{"head":0, "rule":"[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0", "tails":[ -1 ], "f":{"logp":-0.5, "use_i":1.0}, "weight":0.367879441171 }, -{"head":1, "rule":"[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0", "tails":[ -1 ], "f":{"logp":-0.25, "use_saw":1.0}, "weight":0.606530659713 }, -{"head":2, "rule":"[JJ] ||| kleines ||| small ||| logp=0.0 use_small=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_small":1.0}, "weight":1.0 }, -{"head":2, "rule":"[JJ] ||| kleines ||| little ||| logp=0.0 use_little=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_little":1.0}, "weight":1.0 }, -{"head":3, "rule":"[NN] ||| kleines haus ||| small house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, -{"head":3, "rule":"[NN] ||| kleines haus ||| little house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, -{"head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] house ||| logp=0.0 use_house=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, -{"head":3, "rule":"[NN] ||| [JJ,1] haus ||| [JJ,1] shell ||| logp=0.0 use_shell=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_shell":1.0}, "weight":2.71828182846 }, -{"head":4, "rule":"[NP] ||| ein [NN,1] ||| a [NN,1] ||| logp=0.0 use_a=1.0", "tails":[ 3 ], "f":{"logp":0.0, "use_a":1.0}, "weight":1.0 }, -{"head":5, "rule":"[VP] ||| [V,1] [NP,2] ||| [V,1] [NP,2] ||| logp=0.0", "tails":[ 1,4 ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":6, "rule":"[S] ||| [NP,1] [VP,2] ||| [VP,1] [NP,2] ||| logp=0.0", "tails":[ 0,5 ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":7, "rule":"[Goal] ||| [S,1] ||| [S,1] ||| ", "tails":[ 6 ], "f":{}, "weight":1.0 } -] -} - diff --git a/util/cdec2json.py b/util/cdec2json.py new file mode 100755 index 0000000..76e2cd4 --- /dev/null +++ b/util/cdec2json.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python2 + +import cdec +import sys, argparse + + +def hg2json(hg, weights): + """ + output a JSON representation of a cdec hypegraph + """ + res = '' + res += "{\n" + res += '"weights":{'+"\n" + a = [] + for i in weights: + if i[1] != 0: + a.append( '"%s":%s'%(i[0], i[1]) ) + res += ", ".join(a)+"\n" + res += "},\n" + res += '"nodes":'+"\n" + res += "[\n" + a = [] + a.append( '{ "id":-1, "cat":"root", "span":[-1,-1] }' ) + for i in hg.nodes: + a.append('{ "id":%d, "cat":"%s", "span":[%d,%d] }'%(i.id, i.cat, i.span[0], i.span[1])) + res += ",\n".join(a)+"\n" + res += "],\n" + res += '"edges":'+"\n" + res += "[\n" + a = [] + for i in hg.edges: + s = "{" + s += '"head":%d'%(i.head_node.id) + s += ', "rule":"%s"'%(i.trule) + # f + xs = ' "f":{' + b = [] + for j in i.feature_values: + b.append( '"%s":%s'%(j[0], j[1]) ) + xs += ", ".join(b) + xs += "}," + # tails + if len(list(i.tail_nodes)) > 0: + s += ', "tails":[ %s ],'%(",".join([str(n.id) for n in i.tail_nodes])) + else: + s += ', "tails":[ -1 ],' + s += xs + s += ' "weight":%s }'%(i.prob) + a.append(s) + res += ",\n".join(a)+"\n" + res += "]\n" + res += "}\n" + return res + +def main(): + parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') + parser.add_argument('-c', '--config', required=True, help='decoder configuration') + parser.add_argument('-w', '--weights', required=True, help='feature weights') + args = parser.parse_args() + with open(args.config) as config: + config = config.read() + decoder = cdec.Decoder(config) + decoder.read_weights(args.weights) + ins = sys.stdin.readline().strip() + hg = decoder.translate(ins) + + sys.stderr.write( "input:\n '%s'\n"%(ins) ) + sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) + num_nodes = 0 + for i in hg.nodes: num_nodes+=1 + sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) + num_edges = 0 + for i in hg.edges: num_edges+=1 + sys.stderr.write( "# edges = %s\n"%(num_edges) ) + sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) + + print hg2json(hg, decoder.weights) + + +if __name__=="__main__": + main() + diff --git a/util/cdec_hg_to_json.py b/util/cdec_hg_to_json.py deleted file mode 100755 index 2fcc409..0000000 --- a/util/cdec_hg_to_json.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python2 - -import cdec -import sys, argparse - -def hg2json(hg, weights): - """ - output a JSON representation of a cdec hypegraph - (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format ) - """ - res = '' - res += "{\n" - res += '"weights":{'+"\n" - a = [] - for i in weights: - if i[1] != 0: - a.append( '"%s":%s'%(i[0], i[1]) ) - res += ", ".join(a)+"\n" - res += "},\n" - res += '"nodes":'+"\n" - res += "[\n" - a = [] - a.append( '{ "id":-1, "cat":"root", "span":[-1,-1] }' ) - for i in hg.nodes: - a.append('{ "id":%d, "cat":"%s", "span":[%d,%d] }'%(i.id, i.cat, i.span[0], i.span[1])) - res += ",\n".join(a)+"\n" - res += "],\n" - res += '"edges":'+"\n" - res += "[\n" - a = [] - for i in hg.edges: - s = "{" - s += '"head":%d'%(i.head_node.id) - s += ', "rule":"%s"'%(i.trule) - # f - xs = ' "f":{' - b = [] - for j in i.feature_values: - b.append( '"%s":%s'%(j[0], j[1]) ) - xs += ", ".join(b) - xs += "}," - # tails - if len(list(i.tail_nodes)) > 0: - s += ', "tails":[ %s ],'%(",".join([str(n.id) for n in i.tail_nodes])) - else: - s += ', "tails":[ -1 ],' - s += xs - s += ' "weight":%s }'%(i.prob) - a.append(s) - res += ",\n".join(a)+"\n" - res += "]\n" - res += "}\n" - return res - -def main(): - parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') - parser.add_argument('-c', '--config', required=True, help='decoder configuration') - parser.add_argument('-w', '--weights', required=True, help='feature weights') - args = parser.parse_args() - with open(args.config) as config: - config = config.read() - decoder = cdec.Decoder(config) - decoder.read_weights(args.weights) - ins = sys.stdin.readline().strip() - hg = decoder.translate(ins) - - sys.stderr.write( "input:\n '%s'\n"%(ins) ) - sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) - num_nodes = 0 - for i in hg.nodes: num_nodes+=1 - sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) - num_edges = 0 - for i in hg.edges: num_edges+=1 - sys.stderr.write( "# edges = %s\n"%(num_edges) ) - sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) - - print hg2json(hg, decoder.weights) - - -if __name__=="__main__": - main() - -- cgit v1.2.3