summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-11-16 17:16:28 +0100
committer Patrick Simianer <p@simianer.de> 2015-11-16 17:16:28 +0100
commit dc078a058595ddb6b6b35d86d0cefc5d0e631b04 (patch)
tree 88884cb208f9fa4be49a7ff4e39eef8d76e35048
parent 2497837c9d470b51a87060a0af6041f9a6c11063 (diff)
new cdec2json.py
-rwxr-xr-x example/cdec/cdec2json.py 94
1 files changed, 48 insertions, 46 deletions
diff --git a/example/cdec/cdec2json.py b/example/cdec/cdec2json.py
index 66bc8b4..fda5604 100755
--- a/example/cdec/cdec2json.py
+++ b/example/cdec/cdec2json.py
@@ -2,52 +2,64 @@
import cdec
import sys, argparse
-import json, gzip
+import json
+import codecs
def hg2json(hg, weights):
"""
- hackishly output a JSON representation of a cdec hypegraph
-
- TODO: cdec now uses boost's serialization instead of JSON
- (github repo @b88176dc4fd53480e77d601ff63bf5300cf8fc7f
- still uses the old format)
+ output a JSON representation of a cdec hypegraph
+ (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format )
"""
res = ''
res += "{\n"
- # rules
- res += '"rules":[\n'
- rules = []
- for i in hg.edges:
- s = json.dumps(str(i.trule))
- try:
- rules.index(s)
- except:
- rules.append(s)
- res += ",\n".join(rules)
- res += "\n],\n"
- # nodes
+ res += '"weights":{'+"\n"
+ a = []
+ for i in weights:
+ a.append( '%s:%f'%(json.dumps(i[0]), i[1]) )
+ res += ", ".join(a)+"\n"
+ res += "},\n"
res += '"nodes":'+"\n"
res += "[\n"
a = []
- a.append('{ "id":0, "symbol":"root", "span":[-1,-1] }')
+ a.append( '{ "id":-1, "cat":"root", "span":[-1,-1] }' )
for i in hg.nodes:
- a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1]))
+ a.append( '{ "id":%d, "cat":"%s", "span":[%d, %d] }'%(i.id, i.cat, i.span[0], i.span[1]) )
res += ",\n".join(a)+"\n"
res += "],\n"
- # edges
res += '"edges":'+"\n"
res += "[\n"
a = []
for i in hg.edges:
s = "{"
- s += '"head":%d'%(i.head_node.id+1)
- s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule))))
- # tails
- if len(list(i.tail_nodes)) > 0:
- s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes]))
+ s += '"head":%d'%(i.head_node.id)
+ xs = ' "f":{'
+ b = []
+ for j in i.feature_values:
+ b.append( '"%s":%s'%(j[0], j[1]) )
+ xs += ", ".join(b)
+ xs += "},"
+ c = []
+ for j in i.tail_nodes:
+ c.append(str(j.id))
+ if len(c) > 0:
+ s += ', "tails":[ %s ],'%(",".join(c))
else:
- s += ', "tails":[ 0 ],'
- s += ' "score":%s }'%(i.prob)
+ s += ', "tails":[ -1 ],'
+ s += xs
+ f = []
+ for x in i.trule.f:
+ if type(x) == type(u'x'):
+ f.append(codecs.encode(x, 'utf-8'))
+ else:
+ f.append(str(x))
+ e = []
+ for x in i.trule.e:
+ if type(x) == type(u'x'):
+ e.append(codecs.encode(x, 'utf-8'))
+ else:
+ e.append(str(x))
+ s += " \"rule\":\"%s ||| %s ||| %s\""%(str(i.trule.lhs), json.dumps(" ".join(f))[1:-1], json.dumps(" ".join(e))[1:-1])
+ s += ' }'
a.append(s)
res += ",\n".join(a)+"\n"
res += "]\n"
@@ -55,38 +67,28 @@ def hg2json(hg, weights):
return res
def main():
- parser = argparse.ArgumentParser(description='get a nice json representation of cdec hypergraphs')
+ parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
parser.add_argument('-c', '--config', required=True, help='decoder configuration')
parser.add_argument('-w', '--weights', required=True, help='feature weights')
- parser.add_argument('-g', '--grammar', required=False, help='grammar')
args = parser.parse_args()
with open(args.config) as config:
config = config.read()
decoder = cdec.Decoder(config)
decoder.read_weights(args.weights)
ins = sys.stdin.readline().strip()
- if args.grammar:
- if args.grammar.split('.')[-1] == 'gz':
- with gzip.open(args.grammar) as grammar:
- grammar = grammar.read()
- else:
- with open(args.grammar) as grammar:
- grammar = grammar.read()
- hg = decoder.translate(ins, grammar=grammar)
- else:
- hg = decoder.translate(ins)
+ hg = decoder.translate(ins)
- sys.stderr.write("input:\n '%s'\n"%(ins))
- sys.stderr.write("viterbi translation:\n '%s'\n"%(hg.viterbi()))
+ sys.stderr.write( "input:\n '%s'\n"%(ins) )
+ sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
num_nodes = 0
for i in hg.nodes: num_nodes+=1
- sys.stderr.write("# nodes = %s\n"%(num_nodes))
+ sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
num_edges = 0
for i in hg.edges: num_edges+=1
- sys.stderr.write("# edges = %s\n"%(num_edges))
- sys.stderr.write("viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)))
+ sys.stderr.write( "# edges = %s\n"%(num_edges) )
+ sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
- print hg2json(hg, decoder.weights).encode('utf-8')
+ print hg2json(hg, decoder.weights)
if __name__=="__main__":
main()