summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-12 22:06:35 -0400
committer mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-12 22:06:35 -0400
commit b2b25c9e71a7ae1b20b7350a236419aa37179e7c (patch)
tree 4eb60454c9b6cbc3ed6f8b5b08880bbc97513cca
parent 70ef91b22ee4abc5e50c15c4eb08121739af2bfd (diff)
XML file tokenization for all your WMT needs.
-rwxr-xr-x corpus/support/fix-eos.pl 2
-rwxr-xr-x corpus/xml-tok.py 47
2 files changed, 49 insertions, 0 deletions
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
index 584f8b46..fe03727b 100755
--- a/corpus/support/fix-eos.pl
+++ b/corpus/support/fix-eos.pl
@@ -1,4 +1,6 @@
#!/usr/bin/perl -w
+$|++;
+
use strict;
use utf8;
diff --git a/corpus/xml-tok.py b/corpus/xml-tok.py
new file mode 100755
index 00000000..4357ced6
--- /dev/null
+++ b/corpus/xml-tok.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import os
+import re
+import subprocess
+import sys
+
+# Tokenize XML files with tokenize-anything.sh
+# in: <seg id="963"> The earnings on its 10-year bonds are 28.45%. </seg>
+# out: <seg id="963"> The earnings on its 10 - year bonds are 28.45 % . </seg>
+
def escape(s):
    """Return *s* with the five XML special characters replaced by entities.

    The ampersand is substituted first so that the '&' introduced by the
    other entities is not escaped a second time.
    """
    entities = (
        ('&', '&amp;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
        ('\'', '&apos;'),
    )
    escaped = s
    for char, entity in entities:
        escaped = escaped.replace(char, entity)
    return escaped
+
def unescape(s):
    """Return *s* with the five predefined XML entities turned back into characters.

    '&amp;' is resolved last so that text such as '&amp;lt;' comes out as the
    literal '&lt;' instead of being unescaped twice into '<'.
    """
    entities = (
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&quot;', '"'),
        ('&apos;', '\''),
        ('&amp;', '&'),
    )
    plain = s
    for entity, char in entities:
        plain = plain.replace(entity, char)
    return plain
+
def _tokenize_text(tok, text):
    """Send one text segment through the tokenizer and return its escaped reply.

    Args:
        tok: the running tokenizer ``subprocess.Popen`` with text-mode pipes.
        text: a tag-free slice of an input line, still XML-escaped.

    Returns:
        The tokenizer's output line, stripped and XML-escaped again.

    Raises:
        RuntimeError: if the tokenizer closed its output before answering.
    """
    if not text.strip():
        # Nothing to tokenize (e.g. the empty slice in front of a leading tag).
        # Skipping the round trip assumes the tokenizer would have answered a
        # blank line with a blank line, so the joined output is unchanged.
        return ''
    tok.stdin.write('{}\n'.format(unescape(text)))
    # Without an explicit flush the request can sit in the pipe buffer while
    # we block on readline() below, deadlocking both processes.
    tok.stdin.flush()
    reply = tok.stdout.readline()
    if not reply:
        # readline() returns '' only at EOF: the tokenizer has gone away.
        raise RuntimeError('tokenizer exited before returning a line')
    return escape(reply.strip())


def _tokenize_line(tok, line):
    """Tokenize the text segments of *line*, leaving ``<...>`` markup untouched.

    Returns the text and tag pieces joined with single spaces and stripped.
    """
    pieces = []
    end = len(line)
    pos = 0
    while pos < end:
        tag_start = line.find('<', pos)
        if tag_start == -1:
            tag_start = end
        pieces.append(_tokenize_text(tok, line[pos:tag_start]))
        if tag_start == end:
            break
        tag_end = line.find('>', tag_start + 1)
        # An unterminated tag swallows the rest of the line verbatim.
        pos = end if tag_end == -1 else tag_end + 1
        pieces.append(line[tag_start:pos])
    return ' '.join(pieces).strip()


def main():
    """Tokenize the text between XML tags on stdin and write it to stdout.

    Each input line is split into markup (``<...>``) and text segments.  Tags
    are copied through untouched; text segments are unescaped, sent one per
    line to ``tokenize-anything.sh -u`` (looked up next to this script), and
    the reply is escaped again.  One output line is written per input line.

    Raises:
        RuntimeError: if the tokenizer stops answering before input is done.
    """
    tokenizer = os.path.join(os.path.dirname(__file__), 'tokenize-anything.sh')
    # universal_newlines=True opens the pipes in text mode, so str can be
    # written and read on Python 3 as well as Python 2.
    tok = subprocess.Popen([tokenizer, '-u'], stdin=subprocess.PIPE,
                           stdout=subprocess.PIPE, universal_newlines=True)
    try:
        # readline() is called until it returns '' (EOF), one line at a time.
        for line in iter(sys.stdin.readline, ''):
            sys.stdout.write('{}\n'.format(_tokenize_line(tok, line.strip())))
    finally:
        # Always release the child, even if tokenization failed part-way.
        tok.stdin.close()
        tok.wait()
+
# Run the tokenizer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()