summaryrefslogtreecommitdiff
path: root/sa-extract/extractor.py
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2012-03-13 09:24:47 +0100
committerPatrick Simianer <p@simianer.de>2012-03-13 09:24:47 +0100
commitef6085e558e26c8819f1735425761103021b6470 (patch)
tree5cf70e4c48c64d838e1326b5a505c8c4061bff4a /sa-extract/extractor.py
parent10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
parentdfbc278c1057555fda9312291c8024049e00b7d8 (diff)
merge with upstream
Diffstat (limited to 'sa-extract/extractor.py')
-rwxr-xr-xsa-extract/extractor.py60
1 files changed, 60 insertions, 0 deletions
diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py
new file mode 100755
index 00000000..9d66ebf0
--- /dev/null
+++ b/sa-extract/extractor.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+# vim:expandtab:shiftwidth=4
+
+import sys, gc, monitor, sgml
+import optparse
+import model
+import log
+import cn
+
+models = []
+
+def add_model(m,w=0.0):
+ models.append(m)
+
+def extract_grammar(input):
+ confnet = cn.ConfusionNet(input)
+ meta = input.meta
+ for m in models:
+ m.input(confnet.columns, meta)
+
+if __name__ == "__main__":
+ optparser = optparse.OptionParser()
+ optparser.add_option("-c", "--config", dest="config", help="configuration module")
+ optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override")
+ (opts,args) = optparser.parse_args()
+
+ if opts.config is None:
+ raise ValueError, "You must specify a configuration file."
+ else:
+ if log.level >= 1:
+ log.write("Reading configuration from %s\n" % opts.config)
+ execfile(opts.config)
+
+ if len(args) >= 1 and args[0] != "-":
+ input_file = file(args[0], "r")
+ else:
+ input_file = sys.stdin
+
+ if len(args) >= 2 and args[1] != "-":
+ output_file = file(args[1], "w")
+ else:
+ output_file = sys.stdout
+
+ gc.collect()
+ if log.level >= 1:
+ log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
+ log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))
+
+ sents = sgml.read_raw(input_file)
+ for sent in sents:
+ mark = sent.getmark()
+ if mark is not None:
+ (tag, attrs) = mark
+ if tag == "seg":
+ sent.unmark()
+ dattrs = sgml.attrs_to_dict(attrs)
+ sent.meta = attrs
+ extract_grammar(sent)
+