diff options
| author | pks <pks@pks.rocks> | 2024-04-08 09:08:36 +0000 | 
|---|---|---|
| committer | pks <pks@pks.rocks> | 2024-04-08 09:08:36 +0000 | 
| commit | 8805e95ae94d798c6441f7e1b72c90e049563f17 (patch) | |
| tree | 59c8218499bed1e21b055851e9913f7f3b29c851 | |
| parent | d6bdc618e4fd92ea0be794800b73a78d2fb9991f (diff) | |
| -rwxr-xr-x | bitext2tmx.py | 41 | 
1 files changed, 41 insertions, 0 deletions
#!/usr/bin/env python3
"""Convert a line-aligned bitext (two parallel text files) to TMX 1.4.

Usage: bitext2tmx.py SOURCE_FILE TARGET_FILE

Line N of SOURCE_FILE is paired with line N of TARGET_FILE and emitted as one
<tu> element.  The language code of each side is whatever follows the last "."
in the corresponding file name (e.g. "corpus.en" -> "en").  The document is
written to standard output.
"""

import sys
from itertools import zip_longest
from xml.sax.saxutils import escape, quoteattr


_TMX_FOOTER = """  </body>
</tmx>"""


def _tmx_header(src_lang):
    """Return the opening of the TMX document, up to and including <body>."""
    # srclang must name the same language as the source-side <tuv> elements,
    # so it is derived from the input instead of being fixed to "en".
    return f"""<tmx version="1.4">
  <header
    creationtool="bitext2tmx.py" creationtoolversion="1.0"
    datatype="PlainText" segtype="sentence"
    adminlang="en-us" srclang={quoteattr(src_lang)}
    o-tmf="ABCTransMem"/>
  <body>"""


def _translation_unit(src_text, tgt_text, src_lang, tgt_lang):
    """Return one <tu> element; the leading newline yields a blank separator."""
    # quoteattr() supplies the surrounding quotes and escapes the value, so a
    # stray quote or ampersand in a file extension cannot break the markup.
    return f"""
    <tu>
      <tuv xml:lang={quoteattr(src_lang)}>
        <seg>{escape(src_text)}</seg>
      </tuv>
      <tuv xml:lang={quoteattr(tgt_lang)}>
        <seg>{escape(tgt_text)}</seg>
      </tuv>
    </tu>"""


def tmx_chunks(src_lines, tgt_lines, src_lang, tgt_lang):
    """Yield the pieces of the TMX document in output order.

    Joining the yielded strings with a newline gives the complete document.

    Args:
        src_lines: Iterable of source-language lines (trailing newline allowed).
        tgt_lines: Iterable of target-language lines (trailing newline allowed).
        src_lang: Language code written to ``srclang`` and the source <tuv>.
        tgt_lang: Language code written to the target <tuv>.

    If the inputs differ in length, only the leading pairs are emitted and a
    warning is printed to standard error.
    """
    yield _tmx_header(src_lang)
    pairs = zip_longest(src_lines, tgt_lines)
    for pair_count, (src_line, tgt_line) in enumerate(pairs):
        if src_line is None or tgt_line is None:
            # Lines read from a file are never None, so None can only be the
            # zip_longest fill value: one side ran out before the other.
            shorter = "source" if src_line is None else "target"
            print(
                f"warning: {shorter} input ended after {pair_count} lines; "
                "remaining unpaired lines were ignored",
                file=sys.stderr,
            )
            break
        yield _translation_unit(
            src_line.rstrip("\n"), tgt_line.rstrip("\n"), src_lang, tgt_lang
        )
    yield _TMX_FOOTER


def main(argv=None):
    """Run the converter.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  Arguments after the first two are ignored.

    Returns:
        Process exit status: 0 on success, 2 when arguments are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("usage: bitext2tmx.py SOURCE_FILE TARGET_FILE", file=sys.stderr)
        return 2

    src_path, tgt_path = args[0], args[1]
    src_lang = src_path.split(".")[-1]
    tgt_lang = tgt_path.split(".")[-1]

    # Both files are closed on exit, and lines are streamed rather than read
    # into memory up front.
    with open(src_path, "r", encoding="utf-8") as src_file, \
            open(tgt_path, "r", encoding="utf-8") as tgt_file:
        for chunk in tmx_chunks(src_file, tgt_file, src_lang, tgt_lang):
            # One print per chunk produces the same bytes as printing the
            # newline-joined document in a single call.
            print(chunk)
    return 0


if __name__ == "__main__":
    sys.exit(main())
