From a539de0bdb07b1ba0f69aab289c3ca0aa1a10031 Mon Sep 17 00:00:00 2001 From: pks Date: Thu, 22 Dec 2022 09:37:20 +0100 Subject: tmx-to-plain.py --- tmx-to-plain.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tmx-to-plain.py diff --git a/tmx-to-plain.py b/tmx-to-plain.py new file mode 100644 index 0000000..07cac6f --- /dev/null +++ b/tmx-to-plain.py @@ -0,0 +1,95 @@ +import argparse +import datetime +import sys + +from translate.storage.tmx import tmxfile + + +def extract_from_tmx(tmx_file_path, + src_out_path, + tgt_out_path, + begin_date, + date, + src_out_after, + tgt_out_after): + with open(tmx_file_path, 'rb') as in_fp: + tmx_file = tmxfile(in_fp) + + if src_out_after is not None and tgt_out_after is not None: + src_out_after_fp = open(src_out_after, "w") + tgt_out_after_fp = open(tgt_out_after, "w") + + + with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp: + for index, node in enumerate(tmx_file.unit_iter()): + src_out_fp_ = src_out_fp + tgt_out_fp_ = tgt_out_fp + + if begin_date is not None: + date_string = node.get_target_dom().get('lastusagedate')[:8] + date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date() + if date_obj < begin_date: + continue + + if date is not None: + date_string = node.get_target_dom().get('changedate')[:8] + date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date() + if date_obj > date: + src_out_fp_ = src_out_after_fp + tgt_out_fp_ = tgt_out_after_fp + + src_string = f"{node.source}" + tgt_string = f"{node.target}" + src_string = src_string.replace('\n', ' ').replace('\r', '') + tgt_string = tgt_string.replace('\n', ' ').replace('\r', '') + + src_out_fp_.write(f"{src_string}\n") + tgt_out_fp_.write(f"{tgt_string}\n") + if (index + 1) % 1000 == 0: + sys.stdout.write(f"Processed {index + 1} lines\r") + sys.stdout.flush() + + if src_out_after is not None and tgt_out_after is not None: + src_out_after_fp.close() + tgt_out_after_fp.close() + + +def main(): + + usage = "Usage: python tmx_to_plain.py [options]" + parser = argparse.ArgumentParser(usage=usage) + parser.add_argument("-i", "--input", help="input tmx file") + parser.add_argument("-d", "--date", help="date for splitting the output") + parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data") + + args = parser.parse_args() + + if args.input is None: + parser.print_help() + sys.exit(1) + + args.input + + src_out = args.input + ".src" + tgt_out = args.input + ".tgt" + + + if args.date is not None: + date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date() + src_out_after = src_out + ".after." + args.date + tgt_out_after = tgt_out + ".after." + args.date + else: + date = None + src_out_after = None + tgt_out_after = None + + if args.begin_date is not None: + begin_date = datetime.datetime.strptime(args.begin_date, '%Y-%m-%d').date() + else: + begin_date = None + + extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after) + + +if __name__ == '__main__': + main() -- cgit v1.2.3