diff options
| -rwxr-xr-x | example/cdec/cdec2json.py | 94 | 
1 files changed, 48 insertions, 46 deletions
| diff --git a/example/cdec/cdec2json.py b/example/cdec/cdec2json.py index 66bc8b4..fda5604 100755 --- a/example/cdec/cdec2json.py +++ b/example/cdec/cdec2json.py @@ -2,52 +2,64 @@  import cdec  import sys, argparse -import json, gzip +import json +import codecs  def hg2json(hg, weights):    """ -  hackishly output a JSON representation of a cdec hypegraph - -  TODO: cdec now uses boost's serialization instead of JSON -        (github repo @b88176dc4fd53480e77d601ff63bf5300cf8fc7f -         still uses the old format) +  output a JSON representation of a cdec hypegraph +  (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format )    """    res = ''    res += "{\n" -  # rules -  res += '"rules":[\n' -  rules = [] -  for i in hg.edges: -    s = json.dumps(str(i.trule)) -    try: -      rules.index(s) -    except: -      rules.append(s) -  res += ",\n".join(rules) -  res += "\n],\n" -  # nodes +  res += '"weights":{'+"\n" +  a = [] +  for i in weights: +    a.append( '%s:%f'%(json.dumps(i[0]), i[1]) ) +  res += ", ".join(a)+"\n" +  res += "},\n"    res += '"nodes":'+"\n"    res += "[\n"    a = [] -  a.append('{ "id":0, "symbol":"root", "span":[-1,-1] }') +  a.append( '{ "id":-1, "cat":"root", "span":[-1,-1] }' )    for i in hg.nodes: -    a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) +    a.append( '{ "id":%d, "cat":"%s", "span":[%d, %d] }'%(i.id, i.cat, i.span[0], i.span[1]) )    res += ",\n".join(a)+"\n"    res += "],\n" -  # edges    res += '"edges":'+"\n"    res += "[\n"    a = []    for i in hg.edges:      s = "{" -    s += '"head":%d'%(i.head_node.id+1) -    s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule)))) -    # tails -    if len(list(i.tail_nodes)) > 0: -      s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes])) +    s += '"head":%d'%(i.head_node.id) +    xs = ' "f":{' +    b = [] +    for j in i.feature_values: +      b.append( '"%s":%s'%(j[0], j[1]) ) +    xs += ", ".join(b) +    xs += "}," +    c = [] +    for j in i.tail_nodes: +      c.append(str(j.id)) +    if len(c) > 0: +      s += ', "tails":[ %s ],'%(",".join(c))      else: -      s += ', "tails":[ 0 ],' -    s += ' "score":%s }'%(i.prob) +      s += ', "tails":[ -1 ],' +    s += xs +    f =  [] +    for x in i.trule.f: +      if type(x) == type(u'x'): +        f.append(codecs.encode(x, 'utf-8')) +      else: +        f.append(str(x)) +    e = [] +    for x in i.trule.e: +      if type(x) == type(u'x'): +        e.append(codecs.encode(x, 'utf-8')) +      else: +        e.append(str(x)) +    s += " \"rule\":\"%s ||| %s ||| %s\""%(str(i.trule.lhs), json.dumps(" ".join(f))[1:-1], json.dumps(" ".join(e))[1:-1]) +    s += ' }'      a.append(s)    res += ",\n".join(a)+"\n"    res += "]\n" @@ -55,38 +67,28 @@ def hg2json(hg, weights):    return res  def main(): -  parser = argparse.ArgumentParser(description='get a nice json representation of cdec hypergraphs') +  parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')    parser.add_argument('-c', '--config', required=True, help='decoder configuration')    parser.add_argument('-w', '--weights', required=True, help='feature weights') -  parser.add_argument('-g', '--grammar', required=False, help='grammar')    args = parser.parse_args()    with open(args.config) as config:      config = config.read()    decoder = cdec.Decoder(config)    decoder.read_weights(args.weights)    ins = sys.stdin.readline().strip() -  if args.grammar: -    if args.grammar.split('.')[-1] == 'gz': -      with gzip.open(args.grammar) as grammar: -        grammar = grammar.read() -    else: -      with open(args.grammar) as grammar: -        grammar = grammar.read() -    hg = decoder.translate(ins, grammar=grammar) -  else: -    hg = decoder.translate(ins) +  hg = decoder.translate(ins) -  sys.stderr.write("input:\n '%s'\n"%(ins)) -  sys.stderr.write("viterbi translation:\n '%s'\n"%(hg.viterbi())) +  sys.stderr.write( "input:\n '%s'\n"%(ins) ) +  sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )    num_nodes = 0    for i in hg.nodes: num_nodes+=1 -  sys.stderr.write("# nodes = %s\n"%(num_nodes)) +  sys.stderr.write( "# nodes = %s\n"%(num_nodes) )    num_edges = 0    for i in hg.edges: num_edges+=1 -  sys.stderr.write("# edges = %s\n"%(num_edges)) -  sys.stderr.write("viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2))) +  sys.stderr.write( "# edges = %s\n"%(num_edges) ) +  sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) -  print hg2json(hg, decoder.weights).encode('utf-8') +  print hg2json(hg, decoder.weights)  if __name__=="__main__":    main() | 
