Lopez suffix array extractor with copyrighted David Chiang code excised

author: Chris Dyer <prguest11@taipan.cs> 2012-02-02 06:29:50 +0000
committer: Chris Dyer <prguest11@taipan.cs> 2012-02-02 06:29:50 +0000
commit: 8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce (patch)
tree: 9c812b3f267aa1975cdf8b7af928c4b20eb36f93 /sa-extract/sgml.py
parent: ff496d3089e84846c8562c574155d8df1e4d911c (diff)
1 files changed, 194 insertions, 0 deletions
diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py
new file mode 100644
index 00000000..2db8b5dc
--- /dev/null
+++ b/sa-extract/sgml.py
@@ -0,0 +1,194 @@
+import sys, sgmllib, xml.sax.saxutils, sym
+
+def attrs_to_str(d):
+    if len(d) == 0:
+        return ""
+    l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d]
+    return " ".join(l)
+
+def attrs_to_dict(a):
+    d = {}
+    for (name, value) in a:
+	if d.has_key(name.lower()):
+	    raise ValueError, "duplicate attribute names"
+	d[name.lower()] = value
+    return d
+
+class Sentence(object):
+    def __init__(self, words=None, meta=None):
+        if words is not None:
+            self.words = list(words)
+        else:
+            self.words = []
+        if meta is not None:
+            self.meta = meta
+        else:
+            self.meta = []
+
+    def copy(self):
+        return Sentence(self.words, list(self.meta))
+
+    def mark(self, tag, attrs):
+        self.meta.append((tag, attrs, 0, len(self.words)))
+
+    def getmark(self):
+        if len(self.meta) > 0:
+            (tag, attrs, i, j) = self.meta[-1]
+            if i == 0 and j == len(self.words):
+                return (tag, attrs)
+            else:
+                return None
+        else:
+            return None
+
+    def unmark(self):
+        mark = self.getmark()
+        if mark is not None:
+            self.meta = self.meta[:-1]
+        return mark
+
+    def __cmp__(self, other):
+        return cmp((self.words, self.meta), (other.words, other.meta))
+
+    def __str__(self):
+        def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)):
+            if i1==i2<=j1==j2:
+                return 0
+            elif i2<=i1<=j1<=j2:
+                return -1
+            elif i1<=i2<=j2<=j1:
+                return 1
+            else:
+                return cmp((i1,j1),(i2,j2)) # don't care
+        # this guarantees that equal spans will come out nested
+        # we want the later spans to be outer
+        # this relies on stable sort
+        open = [[] for i in xrange(len(self.words)+1)]
+        # there seems to be a bug still with empty spans
+        empty = [[] for i in xrange(len(self.words)+1)]
+        close = [[] for j in xrange(len(self.words)+1)]
+        for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans):
+            if i == j:
+                # do we want these to nest?
+                empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag))
+            else:
+                open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs)))
+                close[j].append("</%s>\n" % tag)
+
+        result = []
+        if len(empty[0]) > 0:
+            result.extend(empty[0])
+        for i in xrange(len(self.words)):
+            if i > 0:
+                result.append(" ")
+            result.extend(reversed(open[i]))
+            result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i])))
+            result.extend(close[i+1])
+            if len(empty[i+1]) > 0:
+                result.extend(empty[i+1])
+
+        return "".join(result)
+
+    def __add__(self, other):
+        if type(other) in (list, tuple):
+            return Sentence(self.words + list(other), self.meta)
+        else:
+            othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta]
+            return Sentence(self.words + other.words, self.meta+othermeta)
+
+def read_raw(f):
+    """Read a raw file into a list of Sentences."""
+    if type(f) is str:
+        f = file(f, "r")
+    i = 0
+    line = f.readline()
+    while line != "":
+        sent = process_sgml_line(line, i)
+        mark = sent.getmark()
+        if mark is None:
+            sent.mark('seg', [('id',str(i))])
+        else:
+            (tag, attrs) = mark
+            if tag == "seg" and not attrs_to_dict(attrs).has_key('id'):
+                x = ('id',str(i))
+                attrs.append(x)
+                sent.mark('seg', attrs)
+            if tag != "seg":
+                sent.mark('seg', [('id',str(i))])
+        yield sent
+        i += 1
+        line = f.readline()
+
+def process_sgml_line(line, id=None):
+    p = DatasetParser(None)
+    p.pos = 0
+    p.words = []
+    p.meta = []
+    p.feed(line)
+    p.close()
+    sent = Sentence(p.words, p.meta)
+    return sent
+
+class DatasetParser(sgmllib.SGMLParser):
+    def __init__(self, set):
+        sgmllib.SGMLParser.__init__(self)
+	self.words = None
+        self.mystack = []
+	self.string = None
+	self.convref = d = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"}
+    def close(self):
+        self.convert()
+        sgmllib.SGMLParser.close(self)
+
+    def handle_starttag(self, tag, method, attrs):
+        thing = method(attrs)
+        self.mystack.append(thing)
+
+    def handle_endtag(self, tag, method):
+        thing = self.mystack.pop()
+        method(thing)
+
+    def unknown_starttag(self, tag, attrs):
+        thing = self.start(tag, attrs)
+        self.mystack.append(thing)
+
+    def unknown_endtag(self, tag):
+        thing = self.mystack.pop()
+        self.end(tag, thing)
+
+    def start(self, tag, attrs):
+        self.convert()
+        if self.words is not None:
+            return (tag, attrs, self.pos, None)
+        else:
+            return None
+
+    def convert(self):
+        if self.words is not None and self.string is not None:
+            words = self.string.split()
+            self.pos += len(words)
+	    self.words.extend(words)
+	    self.string = None
+	
+    def end(self, tag, thing):
+        self.convert()
+        if self.words is not None:
+            (tag, attrs, i, j) = thing
+            self.meta.append((tag, attrs, i, self.pos))
+
+    def handle_data(self, s):
+        if self.words is not None:
+	    if (self.string is None):
+	       self.string = s
+	    else:
+	       self.string += s
+
+    def handle_entityref(self, ref):
+	# s=self.convert_entityref(ref)  # if python 2.5
+	s=self.convref[ref]
+        if self.words is not None:
+	    if (self.string is None):
+	       self.string = s
+	    else:
+	       self.string += s
+
author	Chris Dyer <prguest11@taipan.cs>	2012-02-02 06:29:50 +0000
committer	Chris Dyer <prguest11@taipan.cs>	2012-02-02 06:29:50 +0000
commit	8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce (patch)
tree	9c812b3f267aa1975cdf8b7af928c4b20eb36f93 /sa-extract/sgml.py
parent	ff496d3089e84846c8562c574155d8df1e4d911c (diff)