summaryrefslogtreecommitdiff
path: root/sa-extract/extractor.py
blob: 9d66ebf001f4edb044571f76df748906878f73dd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python

# vim:expandtab:shiftwidth=4

import sys, gc, monitor, sgml
import optparse
import model
import log
import cn

# Module-level registry of models, kept in registration order.
models = []


def add_model(m, w=0.0):
    """Register model ``m`` by appending it to the module-level ``models`` list.

    ``w`` is accepted for call compatibility but is not used by this function.
    Returns ``None``.
    """
    models.extend([m])

def extract_grammar(input):
    """Pass one input sentence to every registered model.

    A ``cn.ConfusionNet`` is built from ``input``; its ``columns`` attribute
    and ``input.meta`` are then handed to the ``input`` method of each entry
    in the module-level ``models`` list, in registration order.

    ``input`` must expose a ``meta`` attribute (AttributeError otherwise).
    Returns ``None``.
    """
    net = cn.ConfusionNet(input)
    sent_meta = input.meta
    for registered in models:
        registered.input(net.columns, sent_meta)

if __name__ == "__main__":
    # Command-line driver.
    #
    # Usage: extractor.py -c CONFIG [-x EXTRA] [INPUT [OUTPUT]]
    #
    # Executes the configuration file, opens the input/output streams, then
    # calls extract_grammar() on every sentence read from the input.
    optparser = optparse.OptionParser()
    optparser.add_option("-c", "--config", dest="config", help="configuration module")
    optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override")
    (opts, args) = optparser.parse_args()

    if opts.config is None:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("You must specify a configuration file.")

    if log.level >= 1:
        log.write("Reading configuration from %s\n" % opts.config)
    # The configuration file is executed as Python code. No namespace is
    # passed, so it runs in this module's globals and can see names defined
    # above (e.g. add_model, opts). Presumably this is how models get
    # registered -- confirm against a real config file.
    execfile(opts.config)

    # Positional arguments: a missing argument or "-" selects the standard
    # stream. open() is used rather than the Python-2-only file() constructor.
    if len(args) >= 1 and args[0] != "-":
        input_file = open(args[0], "r")
    else:
        input_file = sys.stdin

    if len(args) >= 2 and args[1] != "-":
        output_file = open(args[1], "w")
    else:
        output_file = sys.stdout

    try:
        gc.collect()
        if log.level >= 1:
            log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
            log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))

        sents = sgml.read_raw(input_file)
        for sent in sents:
            mark = sent.getmark()
            if mark is not None:
                (tag, attrs) = mark
                if tag == "seg":
                    sent.unmark()
                    # NOTE(review): dattrs is never read in this file; the
                    # raw attrs value (not the dict) is what gets stored as
                    # the sentence metadata. The call is kept in case
                    # attrs_to_dict validates its argument -- confirm.
                    dattrs = sgml.attrs_to_dict(attrs)
                    sent.meta = attrs
            extract_grammar(sent)
    finally:
        # Close only the handles opened here; leave the process-wide
        # standard streams alone.
        if input_file is not sys.stdin:
            input_file.close()
        if output_file is not sys.stdout:
            output_file.close()