summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xhg2json.py81
-rw-r--r--test/hg2json/cdec.ini5
-rw-r--r--test/hg2json/grammar.gzbin0 -> 1399915 bytes
-rw-r--r--test/hg2json/hg.json.gzbin0 -> 318029 bytes
-rw-r--r--test/hg2json/hg.meta7
-rw-r--r--test/hg2json/in1
-rw-r--r--test/hg2json/toy.cdec.ini2
-rw-r--r--test/hg2json/toy.grammar12
-rw-r--r--test/hg2json/toy.in1
-rw-r--r--test/hg2json/toy.weights3
-rw-r--r--test/hg2json/weights17
11 files changed, 129 insertions, 0 deletions
diff --git a/hg2json.py b/hg2json.py
new file mode 100755
index 0000000..5bd5c2c
--- /dev/null
+++ b/hg2json.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python2
+
+import cdec
+import sys, argparse
+
+def hg2json(hg, weights):
+    """
+    Serialize a cdec hypergraph and its feature weights as a JSON string
+    (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format ).
+
+    hg: object whose `nodes` yield items with `id` and `cat`, and whose
+        `edges` yield items with `head_node`, `tail_nodes`,
+        `feature_values` (name/value pairs) and `prob`.
+    weights: iterable of (feature name, value) pairs.
+
+    Returns the document text, ending in a newline. An artificial "root"
+    node is listed first and serves as the tail of edges without tails.
+
+    Names, ids and categories are escaped with json.dumps, so quotes or
+    backslashes in them cannot break the document; non-ASCII characters
+    come out as ASCII escape sequences. Numbers are written with '%s'.
+    """
+    import json  # local import: nothing else in this file needs it
+
+    def quote(value):
+        # Same text '%s' would produce, as an escaped JSON string literal.
+        return json.dumps('%s' % (value,))
+
+    def pairs(items):
+        # '"name":value, ...' -- the inside of a JSON object.
+        return ", ".join('%s:%s' % (quote(p[0]), p[1]) for p in items)
+
+    nodes = ['{ "label":"root", "cat":"root" }']
+    for node in hg.nodes:
+        nodes.append('{ "label":%s, "cat":%s }'
+                     % (quote(node.id), quote(node.cat)))
+
+    edges = []
+    for edge in hg.edges:
+        # An edge without tail nodes is attached to the artificial root.
+        tails = ",".join(quote(t.id) for t in edge.tail_nodes) or '"root"'
+        edges.append(
+            '{"head":%s, "tails":[ %s ], "f":{%s}, "weight":%s }'
+            % (quote(edge.head_node.id), tails,
+               pairs(edge.feature_values), edge.prob))
+
+    # Assemble the document: one weights line, one node/edge per line.
+    return "".join([
+        '{\n"weights":{\n', pairs(weights), "\n},\n",
+        '"nodes":\n[\n', ",\n".join(nodes), "\n],\n",
+        '"edges":\n[\n', ",\n".join(edges), "\n]\n}\n",
+    ])
+
+def main():
+    """
+    Decode the first line of stdin with cdec, report input, Viterbi
+    translation, node/edge counts and Viterbi score on stderr, and print
+    the hypergraph as JSON on stdout.
+    """
+    ap = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
+    ap.add_argument('-c', '--config', required=True, help='decoder configuration')
+    ap.add_argument('-w', '--weights', required=True, help='feature weights')
+    opts = ap.parse_args()
+    with open(opts.config) as ini:
+        decoder = cdec.Decoder(ini.read())
+    decoder.read_weights(opts.weights)
+    sentence = sys.stdin.readline().strip()
+    hg = decoder.translate(sentence)
+
+    log = sys.stderr.write
+    log("input:\n '%s'\n" % (sentence,))
+    log("viterbi translation:\n '%s'\n" % (hg.viterbi(),))
+    log("# nodes = %s\n" % (sum(1 for _ in hg.nodes),))
+    log("# edges = %s\n" % (sum(1 for _ in hg.edges),))
+    log("viterbi score = %s\n" % (round(hg.viterbi_features().dot(decoder.weights), 2),))
+    print(hg2json(hg, decoder.weights))
+
+
+if __name__=="__main__":
+ main()
+
diff --git a/test/hg2json/cdec.ini b/test/hg2json/cdec.ini
new file mode 100644
index 0000000..1ad25b5
--- /dev/null
+++ b/test/hg2json/cdec.ini
@@ -0,0 +1,5 @@
+formalism=scfg
+grammar=test/hg2json/grammar.gz
+add_pass_through_rules=true
+feature_function=WordPenalty
+intersection_strategy=full
diff --git a/test/hg2json/grammar.gz b/test/hg2json/grammar.gz
new file mode 100644
index 0000000..78dda98
--- /dev/null
+++ b/test/hg2json/grammar.gz
Binary files differ
diff --git a/test/hg2json/hg.json.gz b/test/hg2json/hg.json.gz
new file mode 100644
index 0000000..ed178c6
--- /dev/null
+++ b/test/hg2json/hg.json.gz
Binary files differ
diff --git a/test/hg2json/hg.meta b/test/hg2json/hg.meta
new file mode 100644
index 0000000..d33a54c
--- /dev/null
+++ b/test/hg2json/hg.meta
@@ -0,0 +1,7 @@
+input:
+ 'in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .'
+viterbi translation:
+ 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .'
+# nodes = 220
+# edges = 16640
+viterbi score = 228.95
diff --git a/test/hg2json/in b/test/hg2json/in
new file mode 100644
index 0000000..7dc411d
--- /dev/null
+++ b/test/hg2json/in
@@ -0,0 +1 @@
+in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .
diff --git a/test/hg2json/toy.cdec.ini b/test/hg2json/toy.cdec.ini
new file mode 100644
index 0000000..d4a2896
--- /dev/null
+++ b/test/hg2json/toy.cdec.ini
@@ -0,0 +1,2 @@
+formalism=scfg
+grammar=test/hg2json/toy.grammar
diff --git a/test/hg2json/toy.grammar b/test/hg2json/toy.grammar
new file mode 100644
index 0000000..382c94f
--- /dev/null
+++ b/test/hg2json/toy.grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
+[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
+[V] ||| fand ||| found ||| logp=0
diff --git a/test/hg2json/toy.in b/test/hg2json/toy.in
new file mode 100644
index 0000000..e6df927
--- /dev/null
+++ b/test/hg2json/toy.in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/test/hg2json/toy.weights b/test/hg2json/toy.weights
new file mode 100644
index 0000000..70075b7
--- /dev/null
+++ b/test/hg2json/toy.weights
@@ -0,0 +1,3 @@
+logp 2
+use_house 0
+use_shell 1
diff --git a/test/hg2json/weights b/test/hg2json/weights
new file mode 100644
index 0000000..7f96f1d
--- /dev/null
+++ b/test/hg2json/weights
@@ -0,0 +1,17 @@
+PhraseModel_0 1.0
+PhraseModel_1 1.0
+PhraseModel_2 1.0
+PhraseModel_3 1.0
+PhraseModel_4 1.0
+PhraseModel_5 1.0
+PhraseModel_6 1.0
+PassThrough -1.0
+PassThrough_1 -1.0
+PassThrough_2 -1.0
+PassThrough_3 -1.0
+PassThrough_4 -1.0
+PassThrough_5 -1.0
+PassThrough_6 -1.0
+Glue 0.1
+LanguageModel 10.0
+LanguageModel_OOV -10