#!/usr/bin/env python
'''This program compiles/decompiles binary data objects used by the decoder'''

import sys
import cdat
import calignment
import csuf
import clex
import precomputation
# NOTE: the ``parse`` module is deliberately not imported here; it is
# imported lazily by main() only when -p/--parse is requested, so the
# other modes keep working when that module is not available.
import monitor
import optparse

# Keys accepted (exactly once each) by the -r option, as key=value pairs.
_PRECOMP_KEYS = ("max-len", "max-nt", "max-size", "min-gap",
                 "rank1", "rank2", "sa")


def _build_parser():
    '''Return the optparse parser describing the command-line interface.'''
    parser = optparse.OptionParser(
        usage="Usage: %prog [-s|-d|-a|-p] " +
              "\n\nNote: -d,-s,-a, and -p are mutually exclusive")
    parser.add_option("-d", "--data-array",
                      action="store_true", default=False, dest="da",
                      help="Compile file into data array (default)")
    parser.add_option("-s", "--suffix-array",
                      action="store_true", default=False, dest="sa",
                      help="Compile file into suffix array")
    parser.add_option("-a", "--alignment",
                      action="store_true", default=False, dest="a",
                      help="Compile file into alignment")
    parser.add_option("-l", "--lexical",
                      action="store_true", default=False, dest="l",
                      help="Compile file into lex file")
    parser.add_option("-x", "--compute_lexical",
                      action="store", nargs=2, dest="lex_args",
                      help="Compute lex file from data", metavar=" ")
    parser.add_option("-p", "--parse",
                      action="store_true", default=False, dest="p",
                      help="Compile file into parse")
    parser.add_option("-b", "--binary-infile",
                      action="store_true", default=False, dest="bin",
                      help="Input file is binary (default: text)")
    parser.add_option("-t", "--text-outfile",
                      action="store_true", default=False, dest="text",
                      help="Output file is text (default: binary)")
    parser.add_option("-e", "--enhanced-outfile",
                      action="store_true", default=False, dest="enhanced",
                      help="Output file is enhanced text (default: binary)")
    parser.add_option("-r", action="store", nargs=7, dest="precomp_args",
                      help="Precompute collocations (Hiero only)",
                      metavar="max-len= max-nt= max-size= min-gap= "
                              "rank1= rank2= sa=")
    return parser


def _parse_precomp_opts(pairs):
    '''Turn the seven key=value strings given to -r into a dict.

    Every value except the one for "sa" is converted to int.  Returns
    None, after writing a diagnostic to stderr, when an argument is
    malformed, uses an unknown or repeated key, or has a non-integer
    value.
    '''
    remaining = set(_PRECOMP_KEYS)
    opts = {}
    for pair in pairs:
        # Split on the first "=" only, so an "sa" path may itself
        # contain "=" characters.
        key, sep, val = pair.partition("=")
        if key not in remaining:
            # Also reached for a repeated key: each key is removed from
            # `remaining` once it has been seen.
            sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2, sa)\n" % key)
            return None
        if not sep:
            sys.stderr.write("Missing value for -r arg %s (expected %s=<value>)\n" % (key, key))
            return None
        remaining.remove(key)
        if key != "sa":
            try:
                val = int(val)
            except ValueError:
                sys.stderr.write("Value for -r arg %s must be an integer, got %r\n" % (key, val))
                return None
        opts[key] = val
    return opts


def _scale_memory(num_bytes):
    '''Scale a byte count by factors of 1000; return (value, unit).

    The unit is one of "B", "KB", "MB" or "GB".
    '''
    value = float(num_bytes)
    metric = "B"
    for unit in ("KB", "MB", "GB"):
        if value / 1000 > 1:
            value /= 1000
            metric = unit
    return value, metric


def main(argv=None):
    '''Call this from the command-line to create a pre-computed binary
    data array for later use

    argv -- full argument vector, program name first; defaults to
            sys.argv when None.

    Reads the input file as the object type selected by the options,
    writes it out as binary, text or enhanced text, and reports timing
    and memory usage on stderr.  Returns 1 on an invalid -r argument or
    when the parse module is unavailable, None on success.  Exits with
    status 1 (after printing help) when more than one of -d/-s/-a/-p is
    given or when there are not exactly two positional arguments.
    '''
    if argv is None:
        argv = sys.argv

    parser = _build_parser()
    # Honour the argv passed by the caller (previously ignored); argv[0]
    # is the program name and is not an option.
    (options, args) = parser.parse_args(list(argv[1:]))

    filetype_opts = [options.da, options.sa, options.a, options.p]
    # Count with sum() rather than len(filter(...)), which fails where
    # filter() returns an iterator.
    if sum(1 for opt in filetype_opts if opt) > 1 or len(args) != 2:
        parser.print_help()
        sys.exit(1)

    (infilename, outfilename) = args
    binary_label = " binary" if options.bin else ""

    start_time = monitor.cpu()
    if options.precomp_args:
        if options.bin:
            obj = precomputation.Precomputation(infilename, from_binary=True)
        else:
            sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
            precomp_opts = _parse_precomp_opts(options.precomp_args)
            if precomp_opts is None:
                return 1
            sa = csuf.SuffixArray(precomp_opts["sa"], True)
            obj = precomputation.Precomputation(
                infilename, sa,
                precompute_rank=precomp_opts["rank1"],
                precompute_secondary_rank=precomp_opts["rank2"],
                max_length=precomp_opts["max-len"],
                max_nonterminals=precomp_opts["max-nt"],
                train_max_initial_size=precomp_opts["max-size"],
                train_min_gap_size=precomp_opts["min-gap"])
    elif options.sa:
        sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, binary_label))
        obj = csuf.SuffixArray(infilename, options.bin)
    elif options.a:
        sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, binary_label))
        obj = calignment.Alignment(infilename, options.bin)
    elif options.p:
        sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, binary_label))
        # Imported here because the module-level import is disabled;
        # without this the branch would fail with a NameError.
        try:
            import parse
        except ImportError:
            sys.stderr.write("Cannot read parse array: the parse module could not be imported\n")
            return 1
        obj = parse.ParseArray(infilename, options.bin)
    elif options.l:
        sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, binary_label))
        obj = clex.CLex(infilename, options.bin)
    elif options.lex_args:
        ffile = options.lex_args[0]
        efile = options.lex_args[1]
        sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
        fsarray = csuf.SuffixArray(ffile, True)
        earray = cdat.DataArray(efile, True)
        aarray = calignment.Alignment(infilename, True)
        obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
    else:
        sys.stderr.write("Reading %s as%s data array...\n" % (infilename, binary_label))
        obj = cdat.DataArray(infilename, options.bin)
    sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time))

    start_time = monitor.cpu()
    if options.text:
        sys.stderr.write("Writing text file %s...\n" % outfilename)
        obj.write_text(outfilename)
    elif options.enhanced:
        sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
        obj.write_enhanced(outfilename)
    else:
        sys.stderr.write("Writing binary file %s...\n" % outfilename)
        obj.write_binary(outfilename)
    sys.stderr.write("Finished.\n")
    sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time))

    mem_use, metric = _scale_memory(monitor.memory())
    sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric))


if __name__ == "__main__":
    sys.exit(main(sys.argv))