summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <pks@pks.rocks> 2018-01-30 13:15:19 +0100
committer Patrick Simianer <pks@pks.rocks> 2018-01-30 13:15:19 +0100
commit c3caa66919439ff5f92733f5ee7825c4e6783f23 (patch)
tree 0597135a2c64e84c4441628d64436069278b231d
parent 5a53215ed46e12db68cdd321a6e1228956b163e0 (diff)
tmx-extract.py
-rwxr-xr-x tmx-extract.py 76
1 files changed, 76 insertions, 0 deletions
diff --git a/tmx-extract.py b/tmx-extract.py
new file mode 100755
index 0000000..20e4bac
--- /dev/null
+++ b/tmx-extract.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+#
+# Adapted from Apertium
+# http://wiki.apertium.org/wiki/Tools_for_TMX
+#
+
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+
+import sys
+import codecs
+
class TMXHandler(ContentHandler):
    """SAX handler that extracts one language pair from a TMX document.

    For every translation unit (``<tu>``) whose ``<tuv>`` children carry
    exactly the two requested ``xml:lang`` values, the text collected from
    each language's ``<seg>`` is written as one stripped, UTF-8 encoded
    line to that language's output file. Units with a different set of
    languages are skipped.

    Args:
        slang: ``xml:lang`` value identifying the source language.
        tlang: ``xml:lang`` value identifying the target language.
        sfile: Writable file object for source segments. It receives
            UTF-8 encoded byte strings, so it must accept bytes.
        tfile: Writable file object for target segments; same
            requirement as ``sfile``.
    """

    def __init__(self, slang, tlang, sfile, tfile):
        # Called explicitly (not via super()) so this also works where
        # ContentHandler is an old-style class, as on Python 2.
        ContentHandler.__init__(self)
        self.pair = {slang, tlang}
        self.files = {slang: sfile, tlang: tfile}
        self.inTag = ''        # element whose character data is being read
        self.note = ''         # text of the most recent <note>
        self.tuid = ''         # "tuid" attribute of the current <tu>
        self.type = ''         # "datatype" attribute of the current <tu>
        self.cur_pair = set()  # languages seen so far in the current <tu>
        self.cur_lang = ''     # xml:lang of the current <tuv>
        self.seg = {slang: '', tlang: ''}

    def startElement(self, name, attrs):
        """Track the element being entered and reset per-unit state."""
        if name == 'tu':
            self.cur_pair = set()
            # Drop the previous unit's text so a <tuv> that lacks a <seg>
            # cannot emit a stale segment from an earlier unit.
            self.seg = dict.fromkeys(self.pair, '')
            self.inTag = 'tu'
            self.tuid = attrs.get('tuid', '')
            self.type = attrs.get('datatype', '')
        elif name == 'note':
            self.inTag = 'note'
            self.note = ''
        elif name == 'tuv':
            self.inTag = 'tuv'
            self.cur_lang = attrs.get('xml:lang', '')
            self.cur_pair.add(self.cur_lang)
        elif name == 'seg':
            self.inTag = 'seg'
            if self.cur_lang in self.pair:
                self.seg[self.cur_lang] = ''
        # Any other element (e.g. inline markup inside <seg>) leaves
        # inTag untouched, so its text keeps flowing into the segment.

    def characters(self, c):
        """Accumulate character data for the current note or segment."""
        if self.inTag == 'note':
            self.note += c
        elif self.inTag == 'seg' and self.cur_lang in self.pair:
            self.seg[self.cur_lang] += c

    def endElement(self, name):
        """Stop collecting at ``</seg>``/``</note>``; flush at ``</tu>``."""
        if name in ('seg', 'note'):
            # Without this reset, text of later siblings (for example a
            # <prop> following the <seg>) would be appended to the segment.
            self.inTag = ''
        elif name == 'tu' and self.pair == self.cur_pair:
            for lang in self.cur_pair:
                # Bytes literal keeps the concatenation valid on Python 3
                # while remaining a plain str on Python 2.
                line = self.seg[lang].encode('utf-8').strip() + b"\n"
                self.files[lang].write(line)
+
def main(argv=None):
    """Extract a language pair from a TMX file into two line-aligned files.

    Reads ``<file>`` and writes the source and target segments to
    ``<file>.<slang>`` and ``<file>.<tlang>`` respectively.

    Args:
        argv: Argument list in ``sys.argv`` layout
            (``[prog, file, slang, tlang]``). Defaults to ``sys.argv``.

    Raises:
        SystemExit: With status -1 when fewer than three arguments are
            given.
    """
    if argv is None:
        argv = sys.argv

    # Three positional arguments are required, i.e. argv[1] through argv[3].
    if len(argv) < 4:
        sys.stderr.write('Usage: tmx-extract.py <file> <slang> <tlang>\n\n')
        sys.exit(-1)

    tmx_path, slang, tlang = argv[1:4]

    # The input is opened first so a missing file does not leave empty
    # output files behind, and in binary mode so the XML parser can apply
    # the encoding declared in the document. Outputs are binary because
    # the handler writes UTF-8 encoded bytes.
    with open(tmx_path, 'rb') as tmx, \
            open(tmx_path + "." + slang, 'wb') as sfile, \
            open(tmx_path + "." + tlang, 'wb') as tfile:
        parser = make_parser()
        parser.setContentHandler(TMXHandler(slang, tlang, sfile, tfile))
        parser.parse(tmx)


if __name__ == '__main__':
    main()
+