diff options
author | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2012-03-13 09:24:47 +0100 |
commit | ef6085e558e26c8819f1735425761103021b6470 (patch) | |
tree | 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /sa-extract/extractor.py | |
parent | 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff) | |
parent | dfbc278c1057555fda9312291c8024049e00b7d8 (diff) |
merge with upstream
Diffstat (limited to 'sa-extract/extractor.py')
-rwxr-xr-x | sa-extract/extractor.py | 60 |
1 files changed, 60 insertions, 0 deletions
#!/usr/bin/env python

# vim:expandtab:shiftwidth=4

"""Command-line driver that feeds SGML-marked input sentences to models.

The configuration module given with -c/--config is executed inside this
module's namespace, so it can call add_model() to register the models that
extract_grammar() will later feed.
"""

import sys, gc, monitor, sgml
import optparse
import model
import log
import cn

# Models registered through add_model(), in registration order.
models = []


def add_model(m, w=0.0):
    """Register model `m` so that extract_grammar() passes input to it.

    `w` is accepted for interface compatibility but is not used here.
    """
    models.append(m)


def extract_grammar(input):
    """Hand one input sentence to every registered model.

    A cn.ConfusionNet is built from `input`; each model then receives the
    net's `columns` together with `input.meta` through its `input` method.
    """
    confnet = cn.ConfusionNet(input)
    meta = input.meta
    for m in models:
        m.input(confnet.columns, meta)


if __name__ == "__main__":
    optparser = optparse.OptionParser()
    optparser.add_option("-c", "--config", dest="config", help="configuration module")
    # NOTE(review): --extra is parsed but never read in this script; the
    # executed configuration may read `opts.extra` -- confirm before removing.
    optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override")
    (opts, args) = optparser.parse_args()

    if opts.config is None:
        # Call form of raise: valid on Python 2 and on Python 3.
        raise ValueError("You must specify a configuration file.")

    if log.level >= 1:
        log.write("Reading configuration from %s\n" % opts.config)
    # SECURITY: the configuration file is executed as Python code in this
    # module's namespace (that is how it reaches add_model); only point
    # --config at trusted files.
    execfile(opts.config)

    # Only handles opened here are closed at the end; the standard streams
    # are left alone.
    opened_files = []
    try:
        if len(args) >= 1 and args[0] != "-":
            input_file = open(args[0], "r")
            opened_files.append(input_file)
        else:
            input_file = sys.stdin

        # NOTE(review): output_file is opened (and therefore created or
        # truncated) but nothing below writes to it -- confirm whether it is
        # still needed.
        if len(args) >= 2 and args[1] != "-":
            output_file = open(args[1], "w")
            opened_files.append(output_file)
        else:
            output_file = sys.stdout

        gc.collect()
        if log.level >= 1:
            log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
            log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))

        sents = sgml.read_raw(input_file)
        for sent in sents:
            mark = sent.getmark()
            if mark is not None:
                (tag, attrs) = mark
                if tag == "seg":
                    sent.unmark()
                    # NOTE(review): dattrs is never used and sent.meta gets
                    # the raw attrs; the call is kept in case it validates
                    # the attributes -- confirm which one was intended.
                    dattrs = sgml.attrs_to_dict(attrs)
                    sent.meta = attrs
            # NOTE(review): nesting was reconstructed from a source whose
            # indentation was lost; this assumes every sentence is
            # processed, not only <seg>-marked ones -- confirm.
            extract_grammar(sent)
    finally:
        for handle in opened_files:
            handle.close()