summaryrefslogtreecommitdiff
path: root/bitext2tmx
diff options
context:
space:
mode:
authorPatrick Simianer <patrick@lilt.com>2026-02-26 10:05:59 +0000
committerPatrick Simianer <patrick@lilt.com>2026-02-26 10:05:59 +0000
commitb31ace79ea5f6b3f279c544cd3a443d6fbf2a24d (patch)
tree31f2b599fa5f6996aeb134390d58deb63eefe04a /bitext2tmx
parent8805e95ae94d798c6441f7e1b72c90e049563f17 (diff)
overhaulHEADmaster
Diffstat (limited to 'bitext2tmx')
-rwxr-xr-xbitext2tmx39
1 files changed, 39 insertions, 0 deletions
diff --git a/bitext2tmx b/bitext2tmx
new file mode 100755
index 0000000..e9c8e23
--- /dev/null
+++ b/bitext2tmx
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Convert a line-aligned bitext into a TMX 1.4 document.

Usage: bitext2tmx SOURCE_FILE TARGET_FILE > out.tmx

SOURCE_FILE and TARGET_FILE are parallel UTF-8 text files in which line N of
one is the translation of line N of the other.  The language code of each
side is taken from the file name's extension (``corpus.en`` -> ``en``).  The
resulting TMX document is written to standard output as UTF-8.
"""

import os
import sys
from itertools import zip_longest
from xml.sax.saxutils import escape, quoteattr


def lang_from_path(path):
    """Return the language code encoded in the extension of *path*.

    Only the final path component is inspected, so dots in directory names
    are ignored.  A file name without any dot is returned unchanged.
    """
    return os.path.basename(path).rsplit(".", 1)[-1]


def build_tmx(src_lines, tgt_lines, src_lang, tgt_lang):
    """Return a TMX 1.4 document with one translation unit per line pair.

    Args:
        src_lines: Iterable of source-side lines (a trailing newline on each
            line is removed).
        tgt_lines: Iterable of target-side lines, aligned with *src_lines*.
        src_lang: Language code for the source side; also used as the
            header's ``srclang``.
        tgt_lang: Language code for the target side.

    Returns:
        The complete TMX document as a string (no trailing newline).

    Raises:
        ValueError: If the two sides do not have the same number of lines.
    """
    # quoteattr() returns the value already wrapped in suitable quotes, so a
    # language code containing quotes or markup cannot break the document.
    src_attr = quoteattr(src_lang)
    tgt_attr = quoteattr(tgt_lang)

    prefix = f"""<tmx version="1.4">
  <header
    creationtool="bitext2tmx" creationtoolversion="1.0"
    datatype="PlainText" segtype="sentence"
    adminlang="en-us" srclang={src_attr}
    o-tmf="ABCTransMem"/>
  <body>"""

    suffix = """  </body>
</tmx>"""

    # A private sentinel distinguishes "side exhausted" from any real line;
    # plain zip() would silently drop the surplus lines of the longer side.
    missing = object()
    pairs = zip_longest(src_lines, tgt_lines, fillvalue=missing)

    tus = []
    for line_no, (src_line, tgt_line) in enumerate(pairs, start=1):
        if src_line is missing or tgt_line is missing:
            short_side = "source" if src_line is missing else "target"
            raise ValueError(
                f"{short_side} side has no line {line_no}: source and "
                "target must have the same number of lines"
            )
        src_seg = escape(src_line.rstrip("\n"))
        tgt_seg = escape(tgt_line.rstrip("\n"))
        # <seg> content stays on one line: whitespace inside it is data.
        tus.append(f"""
    <tu>
      <tuv xml:lang={src_attr}>
        <seg>{src_seg}</seg>
      </tuv>
      <tuv xml:lang={tgt_attr}>
        <seg>{tgt_seg}</seg>
      </tuv>
    </tu>""")

    return "\n".join([prefix, *tus, suffix])


def main(argv=None):
    """Run the command-line tool and return a process exit status.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.  Arguments after the first two are ignored.

    Returns:
        0 on success, 1 if the inputs are not line-aligned, 2 on bad usage.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("usage: bitext2tmx SOURCE_FILE TARGET_FILE", file=sys.stderr)
        return 2
    src_path, tgt_path = args[0], args[1]

    # Explicit UTF-8 keeps the result independent of the user's locale.
    with open(src_path, encoding="utf-8") as src_file, \
            open(tgt_path, encoding="utf-8") as tgt_file:
        try:
            document = build_tmx(
                src_file,
                tgt_file,
                lang_from_path(src_path),
                lang_from_path(tgt_path),
            )
        except ValueError as err:
            print(f"bitext2tmx: {err}", file=sys.stderr)
            return 1

    # The document carries no XML declaration, so readers assume UTF-8.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    print(document)
    return 0


if __name__ == "__main__":
    sys.exit(main())