diff options
| author | Chris Dyer <prguest11@taipan.cs> | 2012-02-02 06:29:50 +0000 | 
|---|---|---|
| committer | Chris Dyer <prguest11@taipan.cs> | 2012-02-02 06:29:50 +0000 | 
| commit | 8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce (patch) | |
| tree | 9c812b3f267aa1975cdf8b7af928c4b20eb36f93 /sa-extract/compile_bin.py | |
| parent | ff496d3089e84846c8562c574155d8df1e4d911c (diff) | |
lopez suffix array extractor with copyrighted david chiang code excised
Diffstat (limited to 'sa-extract/compile_bin.py')
| -rwxr-xr-x | sa-extract/compile_bin.py | 148 | 
1 files changed, 148 insertions, 0 deletions
#!/usr/bin/env python

'''This program compiles/decompiles binary data objects used
by the decoder'''

import sys
import cdat
import calignment
import csuf
import clex
import precomputation
#import parse
import monitor
import optparse

# Keys accepted as KEY=VALUE words after -r; all but "sa" take integer values.
_PRECOMP_KEYS = ("max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa")


def _parse_precomp_args(pairs):
    '''Convert the KEY=VALUE words given to -r into a dict.

    pairs -- iterable of strings of the form KEY=VALUE, where KEY is one
             of _PRECOMP_KEYS.

    Returns a dict mapping each key to its value (an int for every key
    except "sa", which stays a string), or None after writing a
    diagnostic to stderr when a word is malformed, a key is unknown or
    repeated, an integer value does not parse, or a key is missing.'''
    remaining = set(_PRECOMP_KEYS)
    opts = {}
    for pair in pairs:
        # Split on the first "=" only, so an sa= path may itself contain "=".
        key, sep, val = pair.partition("=")
        if not sep:
            sys.stderr.write("Malformed arg %s for -r (expected KEY=VALUE)\n" % pair)
            return None
        if key in opts:
            sys.stderr.write("Duplicate keyword arg %s for -r\n" % key)
            return None
        if key not in remaining:
            sys.stderr.write("Unknown keyword arg %s for -r (must be one of: %s)\n"
                             % (key, ", ".join(_PRECOMP_KEYS)))
            return None
        remaining.remove(key)
        if key != "sa":
            try:
                val = int(val)
            except ValueError:
                sys.stderr.write("Value for %s in -r must be an integer (got %s)\n" % (key, val))
                return None
        opts[key] = val
    if remaining:
        # Defensive: reachable only when fewer words than keys are supplied.
        sys.stderr.write("Missing keyword arg(s) for -r: %s\n" % ", ".join(sorted(remaining)))
        return None
    return opts


def _format_memory(num_bytes):
    '''Render a byte count as a string with one decimal place and a unit.

    The value is divided by 1000 for each step from B up to at most GB,
    stopping as soon as it no longer exceeds 1000.'''
    amount = float(num_bytes)
    unit = "B"
    for bigger in ("KB", "MB", "GB"):
        if amount / 1000 <= 1:
            break
        amount /= 1000
        unit = bigger
    return "%.1f%s" % (amount, unit)


def main(argv=None):
    '''Call this from the command-line to create a
    pre-computed binary data array for later use.

    argv -- full argument vector including the program name at index 0;
            defaults to sys.argv.

    Reads <input file> as the object type selected by the options (data
    array when none is given), then writes it to <output file> as binary,
    text (-t) or enhanced text (-e).  Timing and memory statistics are
    reported on stderr.

    Returns 1 when the -r arguments are invalid or -p is requested while
    the parse module is unavailable, and None on success.  Exits through
    sys.exit(1) after printing help when the positional arguments are
    wrong or more than one of -d/-s/-a/-p is given.'''
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+
                                "\n\nNote: -d,-s,-a, and -p are mutually exclusive")
    parser.add_option("-d", "--data-array",
                    action="store_true", default=False,
                    dest="da", help="Compile file into data array (default)")
    parser.add_option("-s", "--suffix-array",
                    action="store_true", default=False,
                    dest="sa", help="Compile file into suffix array")
    parser.add_option("-a", "--alignment",
                    action="store_true", default=False,
                    dest="a", help="Compile file into alignment")
    parser.add_option("-l", "--lexical",
                    action="store_true", default=False,
                    dest="l", help="Compile file into lex file")
    parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
                    dest="lex_args", help="Compute lex file from data",
                    metavar="<f file> <e file>")
    parser.add_option("-p", "--parse",
                    action="store_true", default=False,
                    dest="p", help="Compile file into parse")
    parser.add_option("-b", "--binary-infile",
                    action="store_true", default=False,
                    dest="bin", help="Input file is binary (default: text)")
    parser.add_option("-t", "--text-outfile",
                    action="store_true", default=False,
                    dest="text", help="Output file is text (default: binary)")
    parser.add_option("-e", "--enhanced-outfile",
                    action="store_true", default=False,
                    dest="enhanced", help="Output file is enhanced text (default: binary)")
    parser.add_option("-r", action="store", nargs=7,
                    dest="precomp_args", help="Precompute collocations (Hiero only)",
                    metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")
    # Parse the vector we were handed (minus the program name) rather than
    # letting optparse fall back to sys.argv behind the caller's back.
    (options, args) = parser.parse_args(argv[1:])

    # NOTE(review): -l, -x and -r are not part of this exclusivity check;
    # when combined with the others, the branch order below decides.
    filetype_opts = [options.da, options.sa, options.a, options.p]

    if sum(1 for opt in filetype_opts if opt) > 1 or len(args) != 2:
        parser.print_help()
        sys.exit(1)

    (infilename, outfilename) = args
    # Inserted into the "Reading ..." messages when the input is binary.
    if options.bin:
        bin_label = " binary"
    else:
        bin_label = ""

    start_time = monitor.cpu()
    if options.precomp_args:
        if options.bin:
            obj = precomputation.Precomputation(infilename, from_binary=True)
        else:
            sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
            precomp_opts = _parse_precomp_args(options.precomp_args)
            if precomp_opts is None:
                return 1
            sa = csuf.SuffixArray(precomp_opts["sa"], True)
            obj = precomputation.Precomputation(infilename, sa,
                precompute_rank=precomp_opts["rank1"],
                precompute_secondary_rank=precomp_opts["rank2"],
                max_length=precomp_opts["max-len"],
                max_nonterminals=precomp_opts["max-nt"],
                train_max_initial_size=precomp_opts["max-size"],
                train_min_gap_size=precomp_opts["min-gap"])
    elif options.sa:
        sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin_label))
        obj = csuf.SuffixArray(infilename, options.bin)
    elif options.a:
        sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin_label))
        obj = calignment.Alignment(infilename, options.bin)
    elif options.p:
        # The module-level import of parse is commented out, so load it
        # here and fail with a message instead of a NameError.
        try:
            import parse
        except ImportError:
            sys.stderr.write("Cannot read parse arrays: the parse module is not available\n")
            return 1
        sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin_label))
        obj = parse.ParseArray(infilename, options.bin)
    elif options.l:
        sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin_label))
        obj = clex.CLex(infilename, options.bin)
    elif options.lex_args:
        ffile = options.lex_args[0]
        efile = options.lex_args[1]
        sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
        fsarray = csuf.SuffixArray(ffile, True)
        earray = cdat.DataArray(efile, True)
        aarray = calignment.Alignment(infilename, True)
        obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
    else:
        sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin_label))
        obj = cdat.DataArray(infilename, options.bin)

    sys.stderr.write("  Total time for read: %f\n" % (monitor.cpu() - start_time))
    start_time = monitor.cpu()
    if options.text:
        sys.stderr.write("Writing text file %s...\n" % outfilename)
        obj.write_text(outfilename)
    elif options.enhanced:
        sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
        obj.write_enhanced(outfilename)
    else:
        sys.stderr.write("Writing binary file %s...\n" % outfilename)
        obj.write_binary(outfilename)
    sys.stderr.write("Finished.\n")
    sys.stderr.write("  Total time for write: %f\n" % (monitor.cpu() - start_time))

    sys.stderr.write("  Memory usage: %s\n" % _format_memory(monitor.memory()))


if __name__ == "__main__":
    sys.exit(main(sys.argv))
