diff options
Diffstat (limited to 'sa-extract/sgml.py')
-rw-r--r-- | sa-extract/sgml.py | 194 |
1 files changed, 0 insertions, 194 deletions
# Reconstructed from the deletion diff of sa-extract/sgml.py
# (git blob 2db8b5dc, 194 lines); diff markers have been stripped.
#
# NOTE: this module targets Python 2 -- it depends on ``sgmllib``, the
# builtin ``cmp`` and ``xrange``.
"""Sentences carrying SGML-style span annotations.

A ``Sentence`` is a list of words plus a list of marks.  Each mark is a
tuple ``(tag, attrs, i, j)`` where ``attrs`` is a list of ``(name, value)``
pairs and ``[i, j)`` is the range of word positions the tag covers.
``read_raw`` / ``process_sgml_line`` build sentences from marked-up text
and ``str(sentence)`` renders the markup again.
"""

import sgmllib
import sys
import xml.sax.saxutils

import sym


def attrs_to_str(d):
    """Render attribute pairs as SGML attribute text.

    ``d`` is an iterable of ``(name, value)`` pairs.  Every pair is rendered
    as `` name="value"`` (note the leading space, values quoted with
    ``xml.sax.saxutils.quoteattr``), so the result can be appended directly
    after a tag name.  No pairs yield the empty string.
    """
    return "".join(" %s=%s" % (name, xml.sax.saxutils.quoteattr(value))
                   for (name, value) in d)


def attrs_to_dict(a):
    """Convert ``(name, value)`` pairs into a dict keyed by lower-cased name.

    Raises:
        ValueError: if two names are equal after lower-casing.
    """
    d = {}
    for (name, value) in a:
        key = name.lower()
        if key in d:
            raise ValueError("duplicate attribute names")
        d[key] = value
    return d


class Sentence(object):
    """A sequence of words together with span marks over those words."""

    def __init__(self, words=None, meta=None):
        """Create a sentence.

        Both arguments are copied (shallowly), so the new sentence never
        shares its ``words`` or ``meta`` list with the caller.  The attribute
        lists inside the individual marks are still shared.
        """
        if words is not None:
            self.words = list(words)
        else:
            self.words = []
        if meta is not None:
            # Copy, like ``words``: otherwise ``sentence + [...]`` would hand
            # back a sentence whose marks alias the original's.
            self.meta = list(meta)
        else:
            self.meta = []

    def copy(self):
        """Return a new Sentence with its own word and mark lists."""
        return Sentence(self.words, list(self.meta))

    def mark(self, tag, attrs):
        """Add a mark that spans the whole sentence."""
        self.meta.append((tag, attrs, 0, len(self.words)))

    def getmark(self):
        """Return ``(tag, attrs)`` of the most recently added mark.

        Only marks that span the whole sentence qualify; if there are no
        marks, or the last one covers only part of the sentence, the result
        is ``None``.
        """
        if len(self.meta) == 0:
            return None
        (tag, attrs, i, j) = self.meta[-1]
        if i == 0 and j == len(self.words):
            return (tag, attrs)
        return None

    def unmark(self):
        """Remove and return the mark reported by ``getmark`` (or ``None``)."""
        mark = self.getmark()
        if mark is not None:
            self.meta = self.meta[:-1]
        return mark

    def __cmp__(self, other):
        return cmp((self.words, self.meta), (other.words, other.meta))

    def __str__(self):
        """Render the words with their marks as nested SGML tags.

        Words are separated by single spaces and escaped with
        ``xml.sax.saxutils.escape``; every tag is followed by a newline.
        """
        def cmp_spans(span1, span2):
            # Order spans so that a span contained in another sorts first.
            (_tag1, _attrs1, i1, j1) = span1
            (_tag2, _attrs2, i2, j2) = span2
            if i1 == i2 <= j1 == j2:
                return 0
            elif i2 <= i1 <= j1 <= j2:
                return -1
            elif i1 <= i2 <= j2 <= j1:
                return 1
            else:
                return cmp((i1, j1), (i2, j2))  # don't care
        # this guarantees that equal spans will come out nested
        # we want the later spans to be outer
        # this relies on stable sort
        n = len(self.words)
        opening = [[] for _ in xrange(n + 1)]
        # there seems to be a bug still with empty spans
        empty = [[] for _ in xrange(n + 1)]
        closing = [[] for _ in xrange(n + 1)]
        for (tag, attrs, i, j) in sorted(self.meta, cmp=cmp_spans):
            if i == j:
                # do we want these to nest?
                empty[i].append("<%s%s></%s>\n"
                                % (tag, attrs_to_str(attrs), tag))
            else:
                opening[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs)))
                closing[j].append("</%s>\n" % tag)

        result = []
        result.extend(empty[0])
        for i in xrange(n):
            if i > 0:
                result.append(" ")
            # Inner spans were collected first, so reverse to emit the
            # outermost opening tag first; closing tags are already
            # innermost-first.
            result.extend(reversed(opening[i]))
            # NOTE(review): sym.tostring presumably maps a word/symbol to its
            # surface string -- confirm against the sym module.
            result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i])))
            result.extend(closing[i + 1])
            result.extend(empty[i + 1])

        return "".join(result)

    def __add__(self, other):
        """Concatenate with a list/tuple of words or with another Sentence.

        When ``other`` is a Sentence its marks are shifted by the length of
        this sentence so that they still cover the same words.
        """
        if isinstance(other, (list, tuple)):
            return Sentence(self.words + list(other), self.meta)
        offset = len(self.words)
        othermeta = [(tag, attrs, i + offset, j + offset)
                     for (tag, attrs, i, j) in other.meta]
        return Sentence(self.words + other.words, self.meta + othermeta)


def read_raw(f):
    """Yield one Sentence per line of a raw file.

    ``f`` is either a file name or an object with a ``readline`` method.
    Every yielded sentence ends up with an outermost ``seg`` mark carrying an
    ``id`` attribute; when the line does not supply one, the zero-based line
    number is used.  A file opened here (because a name was given) is closed
    when the generator finishes or is closed.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        f = open(f, "r")
    try:
        for (i, line) in enumerate(iter(f.readline, "")):
            sent = process_sgml_line(line, i)
            mark = sent.getmark()
            if mark is None:
                sent.mark('seg', [('id', str(i))])
            else:
                (tag, attrs) = mark
                if tag == "seg" and 'id' not in attrs_to_dict(attrs):
                    # NOTE(review): ``attrs`` is the very list stored in the
                    # existing mark, so the append already gives that mark an
                    # id; the mark() call then adds a second, identical seg
                    # mark.  Kept as in the original -- confirm it is
                    # intended.
                    attrs.append(('id', str(i)))
                    sent.mark('seg', attrs)
                if tag != "seg":
                    sent.mark('seg', [('id', str(i))])
            yield sent
    finally:
        if opened_here:
            f.close()


def process_sgml_line(line, id=None):
    """Parse one line of marked-up text into a Sentence.

    ``id`` is accepted for interface compatibility and is not used.
    """
    p = DatasetParser(None)
    p.pos = 0
    p.words = []
    p.meta = []
    p.feed(line)
    p.close()
    return Sentence(p.words, p.meta)


class DatasetParser(sgmllib.SGMLParser):
    """SGML parser that collects whitespace-separated words and tag spans.

    Collection is off while ``words`` is ``None``; callers enable it by
    assigning a list to ``words`` before feeding text (see
    ``process_sgml_line``).  Finished spans are appended to ``meta`` as
    ``(tag, attrs, start, end)`` word offsets.
    """

    def __init__(self, set):
        # ``set`` is unused; it is kept for interface compatibility.
        sgmllib.SGMLParser.__init__(self)
        self.words = None
        self.pos = 0          # number of words collected so far
        self.meta = []        # finished (tag, attrs, start, end) spans
        self.mystack = []     # values returned by start(), one per open tag
        self.string = None    # text seen since the last convert()
        self.convref = {"amp": "&", "lt": "<", "gt": ">", "quot": '"',
                        "squot": "'"}

    def close(self):
        """Flush pending text into words, then close the underlying parser."""
        self.convert()
        sgmllib.SGMLParser.close(self)

    # The two handle_* hooks below are used by sgmllib for tags that have a
    # dedicated handler method (per the sgmllib documentation); the
    # unknown_* hooks cover every other tag.  An end tag without a matching
    # start tag makes ``mystack.pop()`` raise IndexError.

    def handle_starttag(self, tag, method, attrs):
        thing = method(attrs)
        self.mystack.append(thing)

    def handle_endtag(self, tag, method):
        thing = self.mystack.pop()
        method(thing)

    def unknown_starttag(self, tag, attrs):
        thing = self.start(tag, attrs)
        self.mystack.append(thing)

    def unknown_endtag(self, tag):
        thing = self.mystack.pop()
        self.end(tag, thing)

    def start(self, tag, attrs):
        """Record where a tag opens; returns ``None`` when not collecting."""
        self.convert()
        if self.words is not None:
            return (tag, attrs, self.pos, None)
        else:
            return None

    def convert(self):
        """Split buffered text on whitespace and append it to ``words``."""
        if self.words is not None and self.string is not None:
            words = self.string.split()
            self.pos += len(words)
            self.words.extend(words)
            self.string = None

    def end(self, tag, thing):
        """Close the span described by ``thing`` (as returned by ``start``)."""
        self.convert()
        if self.words is not None:
            # The tag name recorded at the start tag wins over ``tag``.
            (tag, attrs, i, j) = thing
            self.meta.append((tag, attrs, i, self.pos))

    def _buffer(self, s):
        """Append ``s`` to the pending text when collection is enabled."""
        if self.words is not None:
            if self.string is None:
                self.string = s
            else:
                self.string += s

    def handle_data(self, s):
        self._buffer(s)

    def handle_entityref(self, ref):
        """Buffer the replacement text of a named entity.

        Raises:
            KeyError: if ``ref`` is not one of the names in ``convref``.
        """
        # s=self.convert_entityref(ref) # if python 2.5
        self._buffer(self.convref[ref])