summaryrefslogtreecommitdiff
path: root/tmx-to-plain.py
blob: 07cac6f3464d5e2e605cfca2903b8a649e455622 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import argparse
import datetime
import sys

from translate.storage.tmx import tmxfile


def extract_from_tmx(tmx_file_path,
                     src_out_path,
                     tgt_out_path,
                     begin_date,
                     date,
                     src_out_after,
                     tgt_out_after):
    with open(tmx_file_path, 'rb') as in_fp:
        tmx_file = tmxfile(in_fp)
    
    if src_out_after is not None and tgt_out_after is not None:
        src_out_after_fp = open(src_out_after, "w")
        tgt_out_after_fp = open(tgt_out_after, "w")
    
        
    with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp:
        for index, node in enumerate(tmx_file.unit_iter()):
            src_out_fp_ = src_out_fp
            tgt_out_fp_ = tgt_out_fp
            
            if begin_date is not None:
                date_string = node.get_target_dom().get('lastusagedate')[:8]
                date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date()
                if date_obj < begin_date:
                    continue
            
            if date is not None:
                date_string = node.get_target_dom().get('changedate')[:8]
                date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date()
                if date_obj > date:
                    src_out_fp_ = src_out_after_fp
                    tgt_out_fp_ = tgt_out_after_fp
        
            src_string = f"{node.source}"
            tgt_string = f"{node.target}"
            src_string = src_string.replace('\n', ' ').replace('\r', '')
            tgt_string = tgt_string.replace('\n', ' ').replace('\r', '')
        
            src_out_fp_.write(f"{src_string}\n")
            tgt_out_fp_.write(f"{tgt_string}\n")
            if (index + 1) % 1000 == 0:
                sys.stdout.write(f"Processed {index + 1} lines\r")
                sys.stdout.flush()

    if src_out_after is not None and tgt_out_after is not None:
        src_out_after_fp.close()
        tgt_out_after_fp.close()


def main():

    usage = "Usage: python tmx_to_plain.py [options]"
    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument("-i", "--input", help="input tmx file")
    parser.add_argument("-d", "--date", help="date for splitting the output")
    parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data")

    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit(1)
    
    args.input
    
    src_out = args.input + ".src"
    tgt_out = args.input + ".tgt"
    
    
    if args.date is not None:
        date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date()
        src_out_after = src_out + ".after." + args.date
        tgt_out_after = tgt_out + ".after." + args.date
    else:
        date = None
        src_out_after = None
        tgt_out_after = None
        
    if args.begin_date is not None:
        begin_date = datetime.datetime.strptime(args.begin_date, '%Y-%m-%d').date()
    else:
        begin_date = None
        
    extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after)    
    

if __name__ == '__main__':
    main()