summaryrefslogtreecommitdiff
path: root/python/cdec/sa/compile.py
blob: 78ab729db65cbfb92f4ea2a527827c13b9cff304 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
import argparse
import os
import logging
import cdec.configobj
import cdec.sa
from cdec.sa._sa import monitor_cpu
import sys

# Value handed to LCP.compute_stats() in precompute(). It is fixed here and has
# no command-line option. NOTE(review): presumably the longest phrase for which
# frequency statistics are gathered -- confirm against cdec.sa.LCP.
MAX_PHRASE_LENGTH = 4
def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phrases):
    """Build a cdec.sa.Precomputation object for the source suffix array.

    Statistics are obtained from an LCP object built over ``f_sa`` (queried
    with MAX_PHRASE_LENGTH), ordered from largest to smallest, and handed to
    the Precomputation constructor together with the extraction limits.

    Arguments:
        f_sa: source-side suffix array (used for both LCP and Precomputation).
        max_len: forwarded as ``max_length``.
        max_nt: forwarded as ``max_nonterminals``.
        max_size: forwarded as ``train_max_initial_size``.
        min_gap: forwarded as ``train_min_gap_size``.
        rank1: forwarded as ``precompute_rank``.
        rank2: forwarded as ``precompute_secondary_rank``.
        tight_phrases: accepted so the caller can unpack its full parameter
            tuple into this call; it is not used by this function.

    Returns:
        The constructed cdec.sa.Precomputation instance.
    """
    # Descending order, identical to sorted(..., reverse=True).
    pattern_stats = list(cdec.sa.LCP(f_sa).compute_stats(MAX_PHRASE_LENGTH))
    pattern_stats.sort(reverse=True)

    options = {
        'from_stats': pattern_stats,
        'fsarray': f_sa,
        'precompute_rank': rank1,
        'precompute_secondary_rank': rank2,
        'max_length': max_len,
        'max_nonterminals': max_nt,
        'train_max_initial_size': max_size,
        'train_min_gap_size': min_gap,
    }
    return cdec.sa.Precomputation(**options)

def main():
    """Compile a parallel corpus into the binary files listed in the config.

    Parses the command line, then writes into the output directory, in order:
    the source suffix array, the target data array, the precomputed pattern
    table, the word alignment and the bilexical dictionary (plus an online
    bilexical dictionary when ``--online`` is given).  The absolute paths of
    those files and the extraction parameters are then written to the
    configuration file (``--config``, default ``/dev/stdout``).  The time
    spent in each stage, as measured by ``monitor_cpu``, is logged at INFO
    level.

    Exits through ``parser.error`` when neither ``-b`` nor both ``-f`` and
    ``-e`` are supplied.  Raises OSError if the output directory cannot be
    created.
    """
    preprocess_start_time = monitor_cpu()
    sys.setrecursionlimit(sys.getrecursionlimit() * 100)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('cdec.sa.compile')
    parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.')
    parser.add_argument('--maxnt', '-n', type=int, default=2,
                        help='Maximum number of non-terminal symbols')
    parser.add_argument('--maxlen', '-l', type=int, default=5,
                        help='Maximum number of terminals')
    parser.add_argument('--maxsize', '-s', type=int, default=15,
                        help='Maximum rule span')
    parser.add_argument('--mingap', '-g', type=int, default=1,
                        help='Minimum gap size')
    parser.add_argument('--rank1', '-r1', type=int, default=100,
                        help='Number of pre-computed frequent patterns')
    parser.add_argument('--rank2', '-r2', type=int, default=10,
                        help='Number of pre-computed super-frequent patterns')
    parser.add_argument('--loose', action='store_true',
                        help='Enable loose phrase extraction (default: tight)')
    parser.add_argument('-c', '--config', default='/dev/stdout',
                        help='Output configuration')
    parser.add_argument('-f', '--source',
                        help='Source language corpus')
    parser.add_argument('-e', '--target',
                        help='Target language corpus')
    parser.add_argument('-b', '--bitext',
                        help='Parallel text (source ||| target)')
    parser.add_argument('-a', '--alignment', required=True,
                        help='Bitext word alignment')
    parser.add_argument('-o', '--output', required=True,
                        help='Output path')
    parser.add_argument('--online', action='store_true',
                        help='Compile data for online grammar extraction')
    args = parser.parse_args()

    # The corpus comes either as two separate files or as one bitext file.
    if not ((args.source and args.target) or args.bitext):
        parser.error('a parallel corpus is required\n'
        '\tuse -f (source) with -e (target) or -b (bitext)')

    # Names and values are kept in matching order: they are zipped together
    # when the configuration is written, and the values are unpacked
    # positionally into precompute().
    param_names = ('max_len', 'max_nt', 'max_size', 'min_gap',
            'rank1', 'rank2', 'tight_phrases')
    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap,
            args.rank1, args.rank2, not args.loose)

    # makedirs (not mkdir) so that an output path with missing parent
    # directories works.  An already-existing directory is fine, including
    # one that appears between a check and the creation; anything else
    # (e.g. the path is a regular file) is a real error.
    try:
        os.makedirs(args.output)
    except OSError:
        if not os.path.isdir(args.output):
            raise

    f_sa_bin = os.path.join(args.output, 'f.sa.bin')
    e_bin = os.path.join(args.output, 'e.bin')
    # Only the first six parameters appear in the file name; tight_phrases
    # (the seventh) is not part of it.
    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params)
    precomp_bin = os.path.join(args.output, precomp_file)
    a_bin = os.path.join(args.output, 'a.bin')
    lex_bin = os.path.join(args.output, 'lex.bin')
    # online only
    bilex_file = os.path.join(args.output, 'bilex.gz')

    config = cdec.configobj.ConfigObj(args.config, unrepr=True)
    config['f_sa_file'] = os.path.abspath(f_sa_bin)
    config['e_file'] = os.path.abspath(e_bin)
    config['a_file'] = os.path.abspath(a_bin)
    config['lex_file'] = os.path.abspath(lex_bin)
    config['precompute_file'] = os.path.abspath(precomp_bin)
    if args.online:
        config['bilex_file'] = os.path.abspath(bilex_file)

    start_time = monitor_cpu()
    logger.info('Compiling source suffix array')
    if args.bitext:
        f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
    else:
        f_sa = cdec.sa.SuffixArray(from_text=args.source)
    f_sa.write_binary(f_sa_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling source suffix array took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling target data array')
    if args.bitext:
        e = cdec.sa.DataArray(from_text=args.bitext, side='target')
    else:
        e = cdec.sa.DataArray(from_text=args.target)
    e.write_binary(e_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling target data array took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Precomputing frequent phrases')
    precompute(f_sa, *params).write_binary(precomp_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling precomputations took %f seconds', stop_time - start_time)

    start_time = monitor_cpu()
    logger.info('Compiling alignment')
    a = cdec.sa.Alignment(from_text=args.alignment)
    a.write_binary(a_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling alignment took %f seconds', stop_time - start_time)

    # The bilexical dictionary is built from the three structures compiled
    # above, so it has to come after them.
    start_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary')
    lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
    lex.write_binary(lex_bin)
    stop_time = monitor_cpu()
    logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time)

    if args.online:
        start_time = monitor_cpu()
        logger.info('Compiling online bilexical dictionary')
        bilex = cdec.sa.online.Bilex()
        if args.bitext:
            bilex.add_bitext(args.alignment, args.bitext)
        else:
            bilex.add_bitext(args.alignment, args.source, args.target)
        bilex.write(bilex_file)
        stop_time = monitor_cpu()
        logger.info('Compiling online bilexical dictionary took %f seconds', stop_time - start_time)

    # Write configuration
    for name, value in zip(param_names, params):
        config[name] = value
    config.write()
    preprocess_stop_time = monitor_cpu()
    logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)

# Run the compiler only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()