#!/usr/bin/env python3
"""Extract parallel source/target text files from a TMX translation memory.

Every translation unit of the input TMX file is written as one line to
``<input>.src`` and one line to ``<input>.tgt``.  Units can optionally be
filtered by their ``lastusagedate`` attribute and split into separate
"after" files by their ``changedate`` attribute.
"""

import argparse
import contextlib
import datetime
import sys

from translate.storage.tmx import tmxfile


def _unit_date(node, attribute, index):
    """Return the date stored in *attribute* of the unit's target element.

    Only the first eight characters of the attribute value are parsed, as
    ``YYYYMMDD``; anything after them is ignored.

    Raises:
        ValueError: if the unit has no target element, the attribute is
            missing, or its value does not start with a valid date.
    """
    target_dom = node.get_target_dom()
    raw_value = target_dom.get(attribute) if target_dom is not None else None
    if raw_value is None:
        # Fail with a message that identifies the unit instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            f"translation unit {index + 1} has no '{attribute}' attribute "
            "on its target element"
        )
    return datetime.datetime.strptime(raw_value[:8], "%Y%m%d").date()


def _single_line(text):
    """Return *text* as a string with newlines turned into spaces and CRs removed."""
    return f"{text}".replace("\n", " ").replace("\r", "")


def extract_from_tmx(
    tmx_file_path,
    src_out_path,
    tgt_out_path,
    begin_date,
    date,
    src_out_after,
    tgt_out_after,
):
    """Write the source and target text of a TMX file to line-aligned files.

    Args:
        tmx_file_path: path of the TMX file to read.
        src_out_path: output path for source segments, one per line.
        tgt_out_path: output path for target segments, one per line.
        begin_date: ``datetime.date`` or ``None``.  When set, units whose
            ``lastusagedate`` is earlier than this date are skipped.
        date: ``datetime.date`` or ``None``.  When set (and both "after"
            paths are given), units whose ``changedate`` is later than this
            date are written to the "after" files instead of the main ones.
        src_out_after: output path for source segments changed after *date*,
            or ``None``.
        tgt_out_after: output path for target segments changed after *date*,
            or ``None``.

    The "after" files are created whenever both paths are given, even if no
    unit ends up in them.  All output files are written as UTF-8.

    Raises:
        ValueError: if a date attribute needed for filtering or splitting is
            missing or malformed on some unit.
    """
    has_after_files = src_out_after is not None and tgt_out_after is not None

    # One ExitStack owns every file handle, so all of them are closed even
    # if parsing or writing fails part-way through.
    with contextlib.ExitStack() as stack:
        in_fp = stack.enter_context(open(tmx_file_path, "rb"))
        tmx_file = tmxfile(in_fp)

        src_out_after_fp = None
        tgt_out_after_fp = None
        if has_after_files:
            src_out_after_fp = stack.enter_context(
                open(src_out_after, "w", encoding="utf-8")
            )
            tgt_out_after_fp = stack.enter_context(
                open(tgt_out_after, "w", encoding="utf-8")
            )
        src_out_fp = stack.enter_context(open(src_out_path, "w", encoding="utf-8"))
        tgt_out_fp = stack.enter_context(open(tgt_out_path, "w", encoding="utf-8"))

        # Splitting needs both a cut-off date and somewhere to put the later
        # units; without the "after" files everything stays in the main ones.
        split_by_date = date is not None and has_after_files

        for index, node in enumerate(tmx_file.unit_iter()):
            if begin_date is not None:
                if _unit_date(node, "lastusagedate", index) < begin_date:
                    continue

            src_fp, tgt_fp = src_out_fp, tgt_out_fp
            if split_by_date and _unit_date(node, "changedate", index) > date:
                src_fp, tgt_fp = src_out_after_fp, tgt_out_after_fp

            # Each unit must occupy exactly one line so that the source and
            # target files stay line-aligned.
            src_fp.write(f"{_single_line(node.source)}\n")
            tgt_fp.write(f"{_single_line(node.target)}\n")

            if (index + 1) % 1000 == 0:
                # "\r" rewrites the same terminal line on every update.
                sys.stdout.write(f"Processed {index + 1} lines\r")
                sys.stdout.flush()


def main():
    """Parse the command line and run the extraction.

    Output files are named after the input file: ``<input>.src`` and
    ``<input>.tgt``, plus ``<input>.src.after.<date>`` and
    ``<input>.tgt.after.<date>`` when ``--date`` is given.  Dates on the
    command line use the ``YYYY-MM-DD`` format.
    """
    usage = f"Usage: {sys.argv[0]} [options]"
    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument("-i", "--input", help="input tmx file")
    parser.add_argument("-d", "--date", help="date for splitting the output")
    parser.add_argument(
        "-b", "--begin_date", help="earliest date (lastusage) to retain data"
    )
    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit(1)

    src_out = f"{args.input}.src"
    tgt_out = f"{args.input}.tgt"

    if args.date is not None:
        date = datetime.datetime.strptime(args.date, "%Y-%m-%d").date()
        src_out_after = f"{src_out}.after.{args.date}"
        tgt_out_after = f"{tgt_out}.after.{args.date}"
    else:
        date = None
        src_out_after = None
        tgt_out_after = None

    if args.begin_date is not None:
        begin_date = datetime.datetime.strptime(args.begin_date, "%Y-%m-%d").date()
    else:
        begin_date = None

    extract_from_tmx(
        args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after
    )


if __name__ == "__main__":
    main()