diff options
Diffstat (limited to 'example/cdec')
| -rwxr-xr-x | example/cdec/cdec2json.py | 93 | ||||
| -rw-r--r-- | example/cdec/default.ini | 2 | ||||
| -rw-r--r-- | example/cdec/passthrough.ini | 2 | 
3 files changed, 95 insertions, 2 deletions
| diff --git a/example/cdec/cdec2json.py b/example/cdec/cdec2json.py new file mode 100755 index 0000000..66bc8b4 --- /dev/null +++ b/example/cdec/cdec2json.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python2 + +import cdec +import sys, argparse +import json, gzip + +def hg2json(hg, weights): +  """ +  hackishly output a JSON representation of a cdec hypegraph + +  TODO: cdec now uses boost's serialization instead of JSON +        (github repo @b88176dc4fd53480e77d601ff63bf5300cf8fc7f +         still uses the old format) +  """ +  res = '' +  res += "{\n" +  # rules +  res += '"rules":[\n' +  rules = [] +  for i in hg.edges: +    s = json.dumps(str(i.trule)) +    try: +      rules.index(s) +    except: +      rules.append(s) +  res += ",\n".join(rules) +  res += "\n],\n" +  # nodes +  res += '"nodes":'+"\n" +  res += "[\n" +  a = [] +  a.append('{ "id":0, "symbol":"root", "span":[-1,-1] }') +  for i in hg.nodes: +    a.append('{ "id":%d, "symbol":"%s", "span":[%d,%d] }'%(i.id+1, i.cat, i.span[0], i.span[1])) +  res += ",\n".join(a)+"\n" +  res += "],\n" +  # edges +  res += '"edges":'+"\n" +  res += "[\n" +  a = [] +  for i in hg.edges: +    s = "{" +    s += '"head":%d'%(i.head_node.id+1) +    s += ', "rule":%s'%(rules.index(json.dumps(str(i.trule)))) +    # tails +    if len(list(i.tail_nodes)) > 0: +      s += ', "tails":[ %s ],'%(",".join([str(n.id+1) for n in i.tail_nodes])) +    else: +      s += ', "tails":[ 0 ],' +    s += ' "score":%s }'%(i.prob) +    a.append(s) +  res += ",\n".join(a)+"\n" +  res += "]\n" +  res += "}\n" +  return res + +def main(): +  parser = argparse.ArgumentParser(description='get a nice json representation of cdec hypergraphs') +  parser.add_argument('-c', '--config', required=True, help='decoder configuration') +  parser.add_argument('-w', '--weights', required=True, help='feature weights') +  parser.add_argument('-g', '--grammar', required=False, help='grammar') +  args = parser.parse_args() +  with open(args.config) as config: +    config = config.read() +  decoder = cdec.Decoder(config) +  decoder.read_weights(args.weights) +  ins = sys.stdin.readline().strip() +  if args.grammar: +    if args.grammar.split('.')[-1] == 'gz': +      with gzip.open(args.grammar) as grammar: +        grammar = grammar.read() +    else: +      with open(args.grammar) as grammar: +        grammar = grammar.read() +    hg = decoder.translate(ins, grammar=grammar) +  else: +    hg = decoder.translate(ins) + +  sys.stderr.write("input:\n '%s'\n"%(ins)) +  sys.stderr.write("viterbi translation:\n '%s'\n"%(hg.viterbi())) +  num_nodes = 0 +  for i in hg.nodes: num_nodes+=1 +  sys.stderr.write("# nodes = %s\n"%(num_nodes)) +  num_edges = 0 +  for i in hg.edges: num_edges+=1 +  sys.stderr.write("# edges = %s\n"%(num_edges)) +  sys.stderr.write("viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2))) + +  print hg2json(hg, decoder.weights).encode('utf-8') + +if __name__=="__main__": +  main() + diff --git a/example/cdec/default.ini b/example/cdec/default.ini index 10ea512..8174118 100644 --- a/example/cdec/default.ini +++ b/example/cdec/default.ini @@ -1,3 +1,3 @@  formalism=scfg  intersection_strategy=full -scfg_max_span_limit=999999999 +scfg_max_span_limit=2147483647 # std::numeric_limits<int>::max() on my machine diff --git a/example/cdec/passthrough.ini b/example/cdec/passthrough.ini index d9280ad..15ed4b6 100644 --- a/example/cdec/passthrough.ini +++ b/example/cdec/passthrough.ini @@ -1,4 +1,4 @@  formalism=scfg  intersection_strategy=full -scfg_max_span_limit=999999999 +scfg_max_span_limit=2147483647 # std::numeric_limits<int>::max() on my machine  add_pass_through_rules=true | 
