summaryrefslogtreecommitdiff
path: root/gi/evaluation/extract_ccg_labels.py
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
committerKenneth Heafield <github@kheafield.com>2012-10-22 12:07:20 +0100
commitac586bc9b156b4ae687cd5961ba1fe7b20ec57d6 (patch)
tree052473b46d7fa18d51f897cdb9e7c93a7186dafd /gi/evaluation/extract_ccg_labels.py
parent97b85c082b3e55c28a8b0c0eb762483ac84a1577 (diff)
parentad6d4a1b2519896f2b16a282699ce4e64041fab8 (diff)
Merge remote branch 'upstream/master'
Conflicts: Jamroot bjam decoder/Jamfile decoder/cdec.cc dpmert/Jamfile jam-files/sanity.jam klm/lm/Jamfile klm/util/Jamfile mira/Jamfile
Diffstat (limited to 'gi/evaluation/extract_ccg_labels.py')
-rw-r--r--gi/evaluation/extract_ccg_labels.py129
1 files changed, 0 insertions, 129 deletions
diff --git a/gi/evaluation/extract_ccg_labels.py b/gi/evaluation/extract_ccg_labels.py
deleted file mode 100644
index e0034648..00000000
--- a/gi/evaluation/extract_ccg_labels.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Takes spans input along with treebank and spits out CG style categories for each span.
-# spans = output from CDEC's extools/extractor with --base_phrase_spans option
-# treebank = PTB format, one tree per line
-#
-# Output is in CDEC labelled-span format
-#
-
-import sys, itertools, tree
-
-tinfile = open(sys.argv[1])
-einfile = open(sys.argv[2])
-
-def number_leaves(node, next=0):
- left, right = None, None
- for child in node.children:
- l, r = number_leaves(child, next)
- next = max(next, r+1)
- if left == None or l < left:
- left = l
- if right == None or r > right:
- right = r
-
- #print node, left, right, next
- if left == None or right == None:
- assert not node.children
- left = right = next
-
- node.left = left
- node.right = right
-
- return left, right
-
-def ancestor(node, indices):
- #print node, node.left, node.right, indices
- # returns the deepest node covering all the indices
- if min(indices) >= node.left and max(indices) <= node.right:
- # try the children
- for child in node.children:
- x = ancestor(child, indices)
- if x: return x
- return node
- else:
- return None
-
-def frontier(node, indices):
- #print 'frontier for node', node, 'indices', indices
- if node.left > max(indices) or node.right < min(indices):
- #print '\toutside'
- return [node]
- elif node.children:
- #print '\tcovering at least part'
- ns = []
- for child in node.children:
- n = frontier(child, indices)
- ns.extend(n)
- return ns
- else:
- return [node]
-
-def project_heads(node):
- #print 'project_heads', node
- is_head = node.data.tag.endswith('-HEAD')
- if node.children:
- found = 0
- for child in node.children:
- x = project_heads(child)
- if x:
- node.data.tag = x
- found += 1
- assert found == 1
- elif is_head:
- node.data.tag = node.data.tag[:-len('-HEAD')]
-
- if is_head:
- return node.data.tag
- else:
- return None
-
-for tline, eline in itertools.izip(tinfile, einfile):
- if tline.strip() != '(())':
- if tline.startswith('( '):
- tline = tline[2:-1].strip()
- tr = tree.parse_PST(tline)
- if tr != None:
- number_leaves(tr)
- #project_heads(tr) # assumes Bikel-style head annotation for the input trees
- else:
- tr = None
-
- parts = eline.strip().split(" ||| ")
- zh, en = parts[:2]
- spans = parts[-1]
- print '|||',
- for span in spans.split():
- sps = span.split(":")
- i, j, x, y = map(int, sps[0].split("-"))
-
- if tr:
- a = ancestor(tr, range(x,y))
- try:
- fs = frontier(a, range(x,y))
- except:
- print >>sys.stderr, "problem with line", tline.strip(), "--", eline.strip()
- raise
-
- #print x, y
- #print 'ancestor', a
- #print 'frontier', fs
-
- cat = a.data.tag
- for f in fs:
- if f.right < x:
- cat += '\\' + f.data.tag
- else:
- break
- fs.reverse()
- for f in fs:
- if f.left >= y:
- cat += '/' + f.data.tag
- else:
- break
- else:
- cat = 'FAIL'
-
- print '%d-%d:%s' % (x, y, cat),
- print