#!/usr/bin/env python3
#
# Adapted from Apertium
# http://wiki.apertium.org/wiki/Tools_for_TMX
#
# Originally added in commit b31ace79ea5f6b3f279c544cd3a443d6fbf2a24d
# ("overhaul").
#
"""Extract one language pair from a TMX file into two line-aligned files.

Usage: tmx-extract <tmx-file> <source-lang> <target-lang>

For every translation unit (``<tu>``) that contains a variant (``<tuv>``)
for both requested languages, the segment text of each language is written
as a single line to ``<tmx-file>.<source-lang>`` and
``<tmx-file>.<target-lang>`` respectively, so that line N of one file
corresponds to line N of the other.
"""

from xml.sax import make_parser
from xml.sax.handler import ContentHandler

import sys
import codecs


class TMXHandler(ContentHandler):
    """SAX content handler that writes the segments of one language pair.

    Args:
        slang: Source language code, compared verbatim against the
            ``xml:lang`` attribute of each ``<tuv>``.
        tlang: Target language code, compared the same way.
        sfile: Writable text stream receiving the source-language lines.
        tfile: Writable text stream receiving the target-language lines.

    Attributes:
        pair: The set ``{slang, tlang}``.
        files: Maps each language code to its output stream.
        inTag: Name of the element whose text is currently being collected
            (``"seg"`` or ``"note"``), ``"tu"``/``"tuv"`` right after one of
            those opened, or ``""`` otherwise.
        note: Text of the most recent ``<note>`` (collected, never written).
        tuid: ``tuid`` attribute of the current ``<tu>`` (never written).
        type: ``datatype`` attribute of the current ``<tu>`` (never written).
        cur_pair: Languages seen so far in the current ``<tu>``.
        cur_lang: Language of the most recently opened ``<tuv>``.
        seg: Maps each language of ``pair`` to its segment text for the
            current ``<tu>``.
    """

    def __init__(self, slang, tlang, sfile, tfile):
        super().__init__()
        self.pair = {slang, tlang}
        self.files = {slang: sfile, tlang: tfile}
        self.inTag = ""
        self.note = ""
        self.tuid = ""
        self.type = ""
        self.cur_pair = set()
        self.cur_lang = ""
        self.seg = {slang: "", tlang: ""}

    def startElement(self, name, attrs):
        """Track which TMX element was opened and reset per-unit state."""
        if name == "tu":
            self.inTag = "tu"
            self.cur_pair = set()
            # Start every unit with empty segments so that a <tuv> lacking a
            # <seg> cannot re-emit text left over from the previous unit.
            self.seg = dict.fromkeys(self.pair, "")
            self.tuid = attrs.get("tuid", "")
            self.type = attrs.get("datatype", "")
        elif name == "note":
            self.inTag = "note"
            self.note = ""
        elif name == "tuv":
            self.inTag = "tuv"
            self.cur_lang = attrs.get("xml:lang", "")
            self.cur_pair.add(self.cur_lang)
        elif name == "seg":
            self.inTag = "seg"
            if self.cur_lang in self.pair:
                self.seg[self.cur_lang] = ""

    def characters(self, c):
        """Append character data to the note or segment being collected."""
        if self.inTag == "note":
            self.note += c
        elif self.inTag == "seg" and self.cur_lang in self.pair:
            self.seg[self.cur_lang] += c

    def endElement(self, name):
        """Stop collecting at ``</seg>``/``</note>``; emit lines at ``</tu>``."""
        if name in ("seg", "note"):
            # Without this reset, text of elements that follow the closing
            # tag (for example a <prop> after <seg>) would leak into the
            # segment or note.
            self.inTag = ""
        elif name == "tu" and self.pair <= self.cur_pair:
            # Subset test rather than equality: units that carry additional
            # languages still contribute the requested pair.
            for lang in self.pair:
                # One segment per output line keeps both files aligned.
                line = self.seg[lang].replace("\n", " ").strip()
                self.files[lang].write(f"{line}\n")


def main(argv=None):
    """Run the extractor.

    Args:
        argv: Argument vector laid out like ``sys.argv`` (program name,
            TMX path, source language, target language). Defaults to
            ``sys.argv``.

    Returns:
        ``0`` on success, ``-1`` when too few arguments were given (the
        status the script has always used for usage errors).

    Raises:
        OSError: If the TMX file or one of the output files cannot be opened.
        xml.sax.SAXParseException: If the TMX file is not well-formed XML.
    """
    argv = sys.argv if argv is None else list(argv)

    # Three positional arguments are required; argv[3] is read below.
    if len(argv) < 4:
        prog = argv[0] if argv else "tmx-extract"
        print(
            f"Usage: {prog} <tmx-file> <source-lang> <target-lang>",
            file=sys.stderr,
        )
        print(file=sys.stderr)
        print(
            "Writes <tmx-file>.<source-lang> and <tmx-file>.<target-lang>.",
            file=sys.stderr,
        )
        return -1

    tmx_path, slang, tlang = argv[1:4]
    sfile_path = f"{tmx_path}.{slang}"
    tfile_path = f"{tmx_path}.{tlang}"

    parser = make_parser()

    # The input is opened first so that a missing TMX file does not leave
    # behind (or truncate) output files. It is read as bytes so the XML
    # parser can honour the encoding named in the XML declaration; output is
    # always UTF-8, independent of the locale.
    with open(tmx_path, "rb") as tmx_file, \
            open(sfile_path, "w", encoding="utf-8") as sfile, \
            open(tfile_path, "w", encoding="utf-8") as tfile:
        parser.setContentHandler(TMXHandler(slang, tlang, sfile, tfile))
        parser.parse(tmx_file)

    return 0


if __name__ == "__main__":
    sys.exit(main())