summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author pks <pks@pks.rocks> 2022-12-22 09:37:20 +0100
committer pks <pks@pks.rocks> 2022-12-22 09:37:20 +0100
commit a539de0bdb07b1ba0f69aab289c3ca0aa1a10031 (patch)
tree 050f0c6568f350d6d80a8dc5025a00637ae4b297
parent ecdba524d96ced5eca86cb8e8085c04f177c524d (diff)
tmx-to-plain.py
-rw-r--r-- tmx-to-plain.py 95
1 files changed, 95 insertions, 0 deletions
diff --git a/tmx-to-plain.py b/tmx-to-plain.py
new file mode 100644
index 0000000..07cac6f
--- /dev/null
+++ b/tmx-to-plain.py
@@ -0,0 +1,95 @@
import argparse
import datetime
import sys
from contextlib import ExitStack

from translate.storage.tmx import tmxfile
+
+
def _unit_date(node, attribute):
    """Return the date held in *attribute* of ``node.get_target_dom()``.

    Only the first eight characters of the attribute value are parsed, using
    the ``%Y%m%d`` format; anything after them is ignored.

    Raises:
        ValueError: if the attribute is absent or its first eight characters
            are not a valid ``YYYYMMDD`` date.
    """
    raw = node.get_target_dom().get(attribute)
    if raw is None:
        # Without this check the slice below fails with an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            f"translation unit has no '{attribute}' attribute on its target")
    return datetime.datetime.strptime(raw[:8], '%Y%m%d').date()


def _single_line(value):
    """Format *value* as text on one line.

    Newlines are replaced by a space and carriage returns are removed, so the
    result can be written as exactly one line of a line-aligned output file.
    """
    return f"{value}".replace('\n', ' ').replace('\r', '')


def extract_from_tmx(tmx_file_path,
                     src_out_path,
                     tgt_out_path,
                     begin_date,
                     date,
                     src_out_after,
                     tgt_out_after):
    """Write the source and target text of a TMX file to line-aligned files.

    For every unit returned by ``tmxfile.unit_iter()`` one line is written to
    a source file and one line to a target file, so line N of both files
    belongs to the same unit. All output files are written as UTF-8.

    Args:
        tmx_file_path: path of the TMX file to read (opened in binary mode).
        src_out_path: path of the output file for source text.
        tgt_out_path: path of the output file for target text.
        begin_date: ``datetime.date`` or ``None``. When set, units whose
            ``lastusagedate`` is earlier than this date are skipped.
        date: ``datetime.date`` or ``None``. When set, units whose
            ``changedate`` is later than this date are written to the
            "after" files instead of the main output files.
        src_out_after: path of the "after" file for source text, or ``None``.
        tgt_out_after: path of the "after" file for target text, or ``None``.

    The "after" files are created only when both ``src_out_after`` and
    ``tgt_out_after`` are given. If ``date`` is set but one of those paths is
    missing, no split is performed and every retained unit goes to the main
    output files.

    Raises:
        ValueError: if a date filter is active and a unit lacks the
            corresponding attribute or carries a malformed date.
    """
    with open(tmx_file_path, 'rb') as in_fp:
        tmx_file = tmxfile(in_fp)

    have_after_files = src_out_after is not None and tgt_out_after is not None
    split_by_date = date is not None and have_after_files

    # ExitStack closes every file that was opened, including the optional
    # "after" files, even when processing a unit raises.
    with ExitStack() as stack:
        def open_out(path):
            # Explicit encoding: the platform default may be unable to encode
            # the translated text.
            return stack.enter_context(open(path, "w", encoding="utf-8"))

        if have_after_files:
            src_out_after_fp = open_out(src_out_after)
            tgt_out_after_fp = open_out(tgt_out_after)
        src_out_fp = open_out(src_out_path)
        tgt_out_fp = open_out(tgt_out_path)

        for index, node in enumerate(tmx_file.unit_iter()):
            if (begin_date is not None
                    and _unit_date(node, 'lastusagedate') < begin_date):
                continue

            if split_by_date and _unit_date(node, 'changedate') > date:
                src_fp, tgt_fp = src_out_after_fp, tgt_out_after_fp
            else:
                src_fp, tgt_fp = src_out_fp, tgt_out_fp

            src_fp.write(f"{_single_line(node.source)}\n")
            tgt_fp.write(f"{_single_line(node.target)}\n")

            # Progress indicator; "\r" makes each update overwrite the last.
            if (index + 1) % 1000 == 0:
                sys.stdout.write(f"Processed {index + 1} lines\r")
                sys.stdout.flush()
+
+
def _parse_date_option(parser, option, value):
    """Convert a ``YYYY-MM-DD`` option value to a ``datetime.date``.

    Returns ``None`` when the option was not given. A malformed value is
    reported through ``parser.error``, which prints the message and exits
    with status 2 instead of surfacing a traceback.
    """
    if value is None:
        return None
    try:
        return datetime.datetime.strptime(value, '%Y-%m-%d').date()
    except ValueError:
        parser.error(f"{option}: invalid date '{value}', expected YYYY-MM-DD")


def main():
    """Command-line entry point: parse the options and run the extraction.

    Options:
        -i/--input       input TMX file; when missing, the help text is
                         printed and the process exits with status 1.
        -d/--date        YYYY-MM-DD date used to split the output.
        -b/--begin_date  YYYY-MM-DD earliest date to retain data.

    Output goes to ``<input>.src`` and ``<input>.tgt``. When ``--date`` is
    given, ``<input>.src.after.<date>`` and ``<input>.tgt.after.<date>`` are
    passed on as the "after" files, with ``<date>`` spelled exactly as typed
    on the command line.
    """
    # argparse adds the "usage: " prefix itself, and %(prog)s expands to the
    # name the script was invoked with.
    parser = argparse.ArgumentParser(usage="%(prog)s [options]")
    parser.add_argument("-i", "--input", help="input tmx file")
    parser.add_argument("-d", "--date", help="date for splitting the output")
    parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data")

    args = parser.parse_args()

    if args.input is None:
        parser.print_help()
        sys.exit(1)

    src_out = args.input + ".src"
    tgt_out = args.input + ".tgt"

    date = _parse_date_option(parser, "-d/--date", args.date)
    begin_date = _parse_date_option(parser, "-b/--begin_date", args.begin_date)

    if date is not None:
        # Keep the user's own spelling of the date in the file names.
        src_out_after = src_out + ".after." + args.date
        tgt_out_after = tgt_out + ".after." + args.date
    else:
        src_out_after = None
        tgt_out_after = None

    extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after)
+
+
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()