summary | refs | log | tree | commit | diff
path: root/sa-extract/extractor.py
diff options
context:
space:
mode:
author: Chris Dyer <prguest11@taipan.cs> 2012-02-02 06:29:50 +0000
committer: Chris Dyer <prguest11@taipan.cs> 2012-02-02 06:29:50 +0000
commit: 7d37102fde8d8b46ed5218245a11496e870fd09f (patch)
tree: 97bb3de03d634ffb754a6381c961b6d5cf8d3a7d /sa-extract/extractor.py
parent: a95fe83969d15b074892c8c06fc2a948b75910ff (diff)
lopez suffix array extractor with copyrighted david chiang code excised
Diffstat (limited to 'sa-extract/extractor.py')
-rwxr-xr-x sa-extract/extractor.py 60
1 files changed, 60 insertions, 0 deletions
diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py
new file mode 100755
index 00000000..9d66ebf0
--- /dev/null
+++ b/sa-extract/extractor.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+# vim:expandtab:shiftwidth=4
+
+import sys, gc, monitor, sgml
+import optparse
+import model
+import log
+import cn
+
+models = []
+
+def add_model(m,w=0.0):
+ models.append(m)
+
+def extract_grammar(input):
+ confnet = cn.ConfusionNet(input)
+ meta = input.meta
+ for m in models:
+ m.input(confnet.columns, meta)
+
+if __name__ == "__main__":
+ optparser = optparse.OptionParser()
+ optparser.add_option("-c", "--config", dest="config", help="configuration module")
+ optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override")
+ (opts,args) = optparser.parse_args()
+
+ if opts.config is None:
+ raise ValueError, "You must specify a configuration file."
+ else:
+ if log.level >= 1:
+ log.write("Reading configuration from %s\n" % opts.config)
+ execfile(opts.config)
+
+ if len(args) >= 1 and args[0] != "-":
+ input_file = file(args[0], "r")
+ else:
+ input_file = sys.stdin
+
+ if len(args) >= 2 and args[1] != "-":
+ output_file = file(args[1], "w")
+ else:
+ output_file = sys.stdout
+
+ gc.collect()
+ if log.level >= 1:
+ log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
+ log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))
+
+ sents = sgml.read_raw(input_file)
+ for sent in sents:
+ mark = sent.getmark()
+ if mark is not None:
+ (tag, attrs) = mark
+ if tag == "seg":
+ sent.unmark()
+ dattrs = sgml.attrs_to_dict(attrs)
+ sent.meta = attrs
+ extract_grammar(sent)
+