summaryrefslogtreecommitdiff
path: root/bitext2tmx.py
blob: 1cdc4b3bdf4dc55056b19c0024da2d25ae000b79 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3

import sys
from xml.sax.saxutils import escape


if __name__ == "__main__":
    prefix = """<tmx version="1.4">
  <header
    creationtool="bitext2tmx.py" creationtoolversion="1.0"
    datatype="PlainText" segtype="sentence"
    adminlang="en-us" srclang="en"
    o-tmf="ABCTransMem"/>
  <body>"""

    src_file = open(sys.argv[1], "r")
    tgt_file = open(sys.argv[2], "r")

    src_lang = sys.argv[1].split(".")[-1]
    tgt_lang = sys.argv[2].split(".")[-1]

    tus = []
    for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()):
        src_line = src_line.rstrip("\n")
        tgt_line = tgt_line.rstrip("\n")
        tus.append(f"""
    <tu>
      <tuv xml:lang="{src_lang}">
        <seg>{escape(src_line)}</seg>
      </tuv>
      <tuv xml:lang="{tgt_lang}">
        <seg>{escape(tgt_line)}</seg>
      </tuv>
    </tu>""")

    suffix = """  </body>
</tmx>"""

    complete = "\n".join([prefix] + tus + [suffix])

    print(complete)