diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-28 17:12:40 -0400 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-28 17:12:40 -0400 |
commit | 0b0616c6f7400ce52d07350f7a7054a2513d9813 (patch) | |
tree | 9681e9c5b6e14ee2c697b2e659b0cc2296ff24e5 /python/pkg/cdec/sa | |
parent | b81b2e85bdfd5e9dda98a6e448e6354ca0c6d26b (diff) |
[python] Suffix array compiler can read bitext (-b)
Diffstat (limited to 'python/pkg/cdec/sa')
-rw-r--r-- | python/pkg/cdec/sa/compile.py | 24 |
1 files changed, 18 insertions, 6 deletions
```diff
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
index 30e605a6..2a89243b 100644
--- a/python/pkg/cdec/sa/compile.py
+++ b/python/pkg/cdec/sa/compile.py
@@ -37,16 +37,22 @@ def main():
                         help='Number of pre-computed super-frequent patterns)')
     parser.add_argument('-c', '--config', default='/dev/stdout',
                         help='Output configuration')
-    parser.add_argument('-o', '--output', required=True,
-                        help='Output path')
-    parser.add_argument('-f', '--source', required=True,
+    parser.add_argument('-f', '--source',
                         help='Source language corpus')
-    parser.add_argument('-e', '--target', required=True,
+    parser.add_argument('-e', '--target',
                         help='Target language corpus')
+    parser.add_argument('-b', '--bitext',
+                        help='Parallel text (source ||| target)')
     parser.add_argument('-a', '--alignment', required=True,
                         help='Bitext word alignment')
+    parser.add_argument('-o', '--output', required=True,
+                        help='Output path')
     args = parser.parse_args()
 
+    if not ((args.source and args.target) or args.bitext):
+        parser.error('a parallel corpus is required\n'
+                     '\tuse -f (source) with -e (target) or -b (bitext)')
+
     param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2")
     params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2)
 
@@ -61,11 +67,17 @@ def main():
     lex_bin = os.path.join(args.output, 'lex.bin')
 
     logger.info('Compiling source suffix array')
-    f_sa = cdec.sa.SuffixArray(from_text=args.source)
+    if args.bitext:
+        f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
+    else:
+        f_sa = cdec.sa.SuffixArray(from_text=args.source)
     f_sa.write_binary(f_sa_bin)
 
     logger.info('Compiling target data array')
-    e = cdec.sa.DataArray(from_text=args.target)
+    if args.bitext:
+        e = cdec.sa.DataArray(from_text=args.bitext, side='target')
+    else:
+        e = cdec.sa.DataArray(from_text=args.target)
     e.write_binary(e_bin)
 
     logger.info('Precomputing frequent phrases')
```