summary refs log tree commit diff
path: root/sa-extract/sgml.py
diff options
context:
space:
mode:
Diffstat (limited to 'sa-extract/sgml.py')
-rw-r--r-- sa-extract/sgml.py 194
1 files changed, 0 insertions, 194 deletions
diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py
deleted file mode 100644
index 2db8b5dc..00000000
--- a/sa-extract/sgml.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import sys, sgmllib, xml.sax.saxutils, sym
-
def attrs_to_str(d):
    """Render attributes as tag-attribute text, each preceded by one space.

    Args:
        d: a sequence of ``(name, value)`` pairs, or a mapping from names to
            values. ``None`` is treated like an empty collection.

    Returns:
        ``""`` when there are no attributes; otherwise the concatenation of
        `` name=<quoted value>`` for every attribute, in iteration order.
        Values are quoted with ``xml.sax.saxutils.quoteattr``, so they must
        be strings.
    """
    if not d:
        return ""
    # Accept a mapping as well as the list-of-pairs form.
    pairs = d.items() if hasattr(d, "items") else d
    return "".join(
        " %s=%s" % (name, xml.sax.saxutils.quoteattr(value))
        for (name, value) in pairs
    )
-
def attrs_to_dict(a):
    """Convert ``(name, value)`` pairs into a dict keyed by lower-cased name.

    Args:
        a: an iterable of ``(name, value)`` pairs.

    Returns:
        A new dict mapping ``name.lower()`` to ``value``.

    Raises:
        ValueError: if two names are equal after lower-casing.
    """
    d = {}
    for (name, value) in a:
        key = name.lower()
        if key in d:
            raise ValueError("duplicate attribute names")
        d[key] = value
    return d
-
class Sentence(object):
    """A sequence of words together with markup spans over those words.

    ``words`` is a list of word tokens. ``meta`` is a list of
    ``(tag, attrs, i, j)`` tuples; each one is a tag whose opening is written
    before word ``i`` and whose closing is written after word ``j - 1``.
    A tuple with ``i == j`` is an empty element.
    """

    def __init__(self, words=None, meta=None):
        """Create a sentence.

        ``words`` is copied into a new list. ``meta`` is stored as given
        (not copied), so the caller's list is shared with the sentence.
        """
        self.words = list(words) if words is not None else []
        self.meta = meta if meta is not None else []

    def copy(self):
        """Return a new Sentence with its own ``words`` and ``meta`` lists."""
        return Sentence(self.words, list(self.meta))

    def mark(self, tag, attrs):
        """Append a span for ``tag`` covering the whole current sentence."""
        self.meta.append((tag, attrs, 0, len(self.words)))

    def getmark(self):
        """Return ``(tag, attrs)`` of the last span if it covers every word.

        Returns ``None`` when there are no spans or the last span does not
        run from 0 to ``len(self.words)``.
        """
        if self.meta:
            (tag, attrs, i, j) = self.meta[-1]
            if i == 0 and j == len(self.words):
                return (tag, attrs)
        return None

    def unmark(self):
        """Remove and return the mark reported by ``getmark()``, if any."""
        mark = self.getmark()
        if mark is not None:
            # Rebind rather than pop in place: ``meta`` may be shared with
            # whoever passed it to the constructor.
            self.meta = self.meta[:-1]
        return mark

    def __cmp__(self, other):
        """Three-way comparison on ``(words, meta)``; returns -1, 0 or 1."""
        mine = (self.words, self.meta)
        theirs = (other.words, other.meta)
        return (mine > theirs) - (mine < theirs)

    def __str__(self):
        """Serialize the words, separated by spaces, wrapped in their tags.

        Every tag is followed by a newline. Word text is passed through
        ``sym.tostring`` and then XML-escaped.
        """
        # Local import keeps this block self-contained (needs Python 2.7+).
        from functools import cmp_to_key

        def cmp_spans(span1, span2):
            (_tag1, _attr1, i1, j1) = span1
            (_tag2, _attr2, i2, j2) = span2
            if i1 == i2 <= j1 == j2:
                return 0
            elif i2 <= i1 <= j1 <= j2:
                return -1
            elif i1 <= i2 <= j2 <= j1:
                return 1
            else:
                a, b = (i1, j1), (i2, j2)
                return (a > b) - (a < b)  # don't care

        # this guarantees that equal spans will come out nested
        # we want the later spans to be outer
        # this relies on stable sort
        slots = len(self.words) + 1
        opening = [[] for _ in range(slots)]
        # there seems to be a bug still with empty spans
        empty = [[] for _ in range(slots)]
        closing = [[] for _ in range(slots)]
        for (tag, attrs, i, j) in sorted(self.meta, key=cmp_to_key(cmp_spans)):
            if i == j:
                # do we want these to nest?
                empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag))
            else:
                opening[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs)))
                closing[j].append("</%s>\n" % tag)

        result = list(empty[0])
        for i, word in enumerate(self.words):
            if i > 0:
                result.append(" ")
            # Inner spans sort first, so reverse to print outer tags first.
            result.extend(reversed(opening[i]))
            result.append(xml.sax.saxutils.escape(sym.tostring(word)))
            result.extend(closing[i + 1])
            result.extend(empty[i + 1])

        return "".join(result)

    def __add__(self, other):
        """Concatenate with a list/tuple of words or with another Sentence.

        When ``other`` is a Sentence its spans are shifted by
        ``len(self.words)``. The result always gets its own ``meta`` list,
        so marking it never changes ``self``.
        """
        if isinstance(other, (list, tuple)):
            return Sentence(self.words + list(other), list(self.meta))
        shift = len(self.words)
        othermeta = [(tag, attrs, i + shift, j + shift)
                     for (tag, attrs, i, j) in other.meta]
        return Sentence(self.words + other.words, self.meta + othermeta)
-
def read_raw(f):
    """Yield one Sentence per line of a raw file.

    Args:
        f: a path string, or an object with a ``readline()`` method. A path
            is opened here for reading and closed when the generator
            finishes or is closed; a file object passed in is left open.

    Yields:
        The Sentence parsed from each line by ``process_sgml_line``. Each
        one ends with a whole-sentence ``seg`` mark that has an ``id``
        attribute; when the input did not supply one, the 0-based line
        number is used as the id.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        f = open(f, "r")
    try:
        i = 0
        while True:
            line = f.readline()
            if not line:
                break
            sent = process_sgml_line(line, i)
            mark = sent.getmark()
            if mark is None:
                sent.mark('seg', [('id', str(i))])
            else:
                (tag, attrs) = mark
                if tag != "seg":
                    sent.mark('seg', [('id', str(i))])
                elif 'id' not in attrs_to_dict(attrs):
                    # Replace the existing seg mark instead of adding a
                    # second, nested one around the same words.
                    sent.unmark()
                    sent.mark('seg', list(attrs) + [('id', str(i))])
            yield sent
            i += 1
    finally:
        if opened_here:
            f.close()
-
def process_sgml_line(line, id=None):
    """Parse one line of marked-up text and return it as a Sentence.

    ``id`` is accepted but not used by this function.
    """
    parser = DatasetParser(None)
    # Give the parser fresh word/span collectors and a zero word position
    # before feeding it the line.
    parser.words, parser.meta, parser.pos = [], [], 0
    parser.feed(line)
    parser.close()
    return Sentence(parser.words, parser.meta)
-
class DatasetParser(sgmllib.SGMLParser):
    """SGML parser that collects whitespace-separated words and tag spans.

    Collection is active only while ``self.words`` is a list; with
    ``self.words`` set to ``None`` (the default) text and tags are ignored.
    Completed tags are appended to ``self.meta`` as
    ``(tag, attrs, start, end)`` tuples, where ``start`` and ``end`` are word
    counts taken from ``self.pos`` at the opening and closing tag.
    """

    def __init__(self, set):
        """Initialise parser state; ``set`` is accepted but not used."""
        sgmllib.SGMLParser.__init__(self)
        self.words = None      # list of words, or None to disable collection
        self.meta = []         # completed (tag, attrs, start, end) spans
        self.pos = 0           # number of words collected so far
        self.mystack = []      # values returned by start handlers, innermost last
        self.string = None     # text seen since the last convert()
        self.convref = {"amp": "&", "lt": "<", "gt": ">", "quot": '"',
                        "squot": "'", "apos": "'"}

    def close(self):
        """Flush pending text into words, then close the underlying parser."""
        self.convert()
        sgmllib.SGMLParser.close(self)

    def handle_starttag(self, tag, method, attrs):
        """Call the tag's start method and push its result on the stack."""
        thing = method(attrs)
        self.mystack.append(thing)

    def handle_endtag(self, tag, method):
        """Pop the matching start result and pass it to the end method."""
        thing = self.mystack.pop()
        method(thing)

    def unknown_starttag(self, tag, attrs):
        """Record the start of a tag that has no dedicated handler."""
        thing = self.start(tag, attrs)
        self.mystack.append(thing)

    def unknown_endtag(self, tag):
        """Finish a tag that has no dedicated handler."""
        thing = self.mystack.pop()
        self.end(tag, thing)

    def start(self, tag, attrs):
        """Flush pending text and return an open span, or None if disabled."""
        self.convert()
        if self.words is not None:
            return (tag, attrs, self.pos, None)
        else:
            return None

    def convert(self):
        """Split the pending text on whitespace and append it to ``words``."""
        if self.words is not None and self.string is not None:
            words = self.string.split()
            self.pos += len(words)
            self.words.extend(words)
            self.string = None

    def end(self, tag, thing):
        """Flush pending text and record the finished span in ``meta``.

        The tag name and attributes are taken from ``thing`` (the value
        produced by ``start``), not from the ``tag`` argument.
        """
        self.convert()
        if self.words is not None:
            (tag, attrs, i, _) = thing
            self.meta.append((tag, attrs, i, self.pos))

    def _append_text(self, s):
        """Add ``s`` to the pending text when collection is enabled."""
        if self.words is not None:
            if self.string is None:
                self.string = s
            else:
                self.string += s

    def handle_data(self, s):
        """Accumulate character data."""
        self._append_text(s)

    def handle_entityref(self, ref):
        """Accumulate the replacement text for the entity named ``ref``.

        Names missing from ``self.convref`` are kept as literal ``&name;``
        text instead of raising ``KeyError``.
        """
        # s=self.convert_entityref(ref) # if python 2.5
        s = self.convref.get(ref, "&%s;" % ref)
        self._append_text(s)
-