From de8ffd4598d6c1e45273b50642870a661b4bcad4 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 10 Mar 2014 18:40:13 -0400
Subject: few tokenization bugs

---
 corpus/support/quote-norm.pl | 1 +
 corpus/support/token_list    | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'corpus/support')

diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 33604027..0366fad5 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -39,6 +39,7 @@ while(<STDIN>) {
   s/&\#([0-9]+);/pack("U", $1)/ge;
 
   # Regularlize spaces:
+  s/\x{ad}//g; # soft hyphen
   s/\x{a0}/ /g; # non-breaking space
   s/\x{2009}/ /g; # thin space
   s/\x{2028}/ /g; # "line separator"
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 228663f6..d38638cf 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -156,8 +156,9 @@ Mass.
 Md.
 Mfg.
 Mgr.
-Mexican-U.S.
-Mich.
+Mio.
+Mrd.
+Bio.
 Minn.
 Mo.
 Mon.
@@ -187,6 +188,7 @@ Rd.
 Rev.
 R.J.
 C.L
+Rs.
 Rte.
 Sat.
 W.T
--
cgit v1.2.3


From 0afd9d510cde40c340cf2c389b1aa22b5a9379c5 Mon Sep 17 00:00:00 2001
From: mjdenkowski
Date: Wed, 12 Mar 2014 22:06:35 -0400
Subject: XML file tokenization for all your WMT needs.

---
 corpus/support/fix-eos.pl |  2 ++
 corpus/xml-tok.py         | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100755 corpus/xml-tok.py

(limited to 'corpus/support')

diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
index 584f8b46..fe03727b 100755
--- a/corpus/support/fix-eos.pl
+++ b/corpus/support/fix-eos.pl
@@ -1,4 +1,6 @@
 #!/usr/bin/perl -w
+$|++;
+
 use strict;
 use utf8;
 
diff --git a/corpus/xml-tok.py b/corpus/xml-tok.py
new file mode 100755
index 00000000..4357ced6
--- /dev/null
+++ b/corpus/xml-tok.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import os
+import re
+import subprocess
+import sys
+
+# Tokenize XML files with tokenize-anything.sh
+# in: The earnings on its 10-year bonds are 28.45%.
+# out: The earnings on its 10 - year bonds are 28.45 % .
+
+def escape(s):
+    return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;').replace('\'', '&#39;')
+
+def unescape(s):
+    return s.replace('&gt;', '>').replace('&lt;', '<').replace('&quot;', '"').replace('&#39;', '\'').replace('&amp;', '&')
+
+def main():
+    tok = subprocess.Popen([os.path.join(os.path.dirname(__file__), 'tokenize-anything.sh'), '-u'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    while True:
+        line = sys.stdin.readline()
+        if not line:
+            break
+        line = line.strip()
+        pieces = []
+        eol = len(line)
+        pos = 0
+        while pos < eol:
+            next = line.find('<', pos)
+            if next == -1:
+                next = eol
+            tok.stdin.write('{}\n'.format(unescape(line[pos:next])))
+            pieces.append(escape(tok.stdout.readline().strip()))
+            if next == eol:
+                break
+            pos = line.find('>', next + 1)
+            if pos == -1:
+                pos = eol
+            else:
+                pos += 1
+            pieces.append(line[next:pos])
+        sys.stdout.write('{}\n'.format(' '.join(pieces).strip()))
+    tok.stdin.close()
+    tok.wait()
+
+if __name__ == '__main__':
+    main()
--
cgit v1.2.3
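
Illustration only, not part of either commit above: a minimal Python sketch of what the normalization lines touched in quote-norm.pl do, for readers who don't work in Perl. The function name and sample string are made up for this note.

# -*- coding: utf-8 -*-
# Rough Python equivalent of the quote-norm.pl space-regularization lines shown
# in the first diff; the new patch line deletes soft hyphens instead of mapping
# them to spaces.
def normalize_spaces(s):
    s = s.replace(u'\u00ad', u'')   # soft hyphen: drop it entirely (the added line)
    s = s.replace(u'\u00a0', u' ')  # non-breaking space -> plain space
    s = s.replace(u'\u2009', u' ')  # thin space -> plain space
    s = s.replace(u'\u2028', u' ')  # "line separator" -> plain space
    return s

if __name__ == '__main__':
    print(normalize_spaces(u'co\u00adoperation costs 28.45\u00a0%'))
    # -> cooperation costs 28.45 %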
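
Also illustration only: the tag/text splitting strategy that corpus/xml-tok.py uses, with a trivial stand-in tokenizer in place of tokenize-anything.sh (and without the entity escape/unescape round trip) so the sketch runs on its own. The function names and the sample <seg> line are assumptions for this note, not code from the commit.

import re

def toy_tokenize(text):
    # Crude stand-in for the external tokenizer: just split off '-' and '%'.
    return ' '.join(re.sub(r'([%\-])', r' \1 ', text).split())

def tokenize_xml_line(line):
    # Walk the line, tokenizing the text between tags and copying tags verbatim.
    pieces = []
    pos, eol = 0, len(line)
    while pos < eol:
        nxt = line.find('<', pos)      # text runs up to the next tag (or end of line)
        if nxt == -1:
            nxt = eol
        pieces.append(toy_tokenize(line[pos:nxt]))
        if nxt == eol:
            break
        pos = line.find('>', nxt + 1)  # the tag itself passes through untouched
        pos = eol if pos == -1 else pos + 1
        pieces.append(line[nxt:pos])
    return ' '.join(p for p in pieces if p)

if __name__ == '__main__':
    print(tokenize_xml_line('<seg> The earnings on its 10-year bonds are 28.45%. </seg>'))
    # -> <seg> The earnings on its 10 - year bonds are 28.45 % . </seg>

In the real script, the text between tags is piped through tokenize-anything.sh -u via subprocess, and entities are unescaped before tokenization and re-escaped afterwards. A typical invocation would look something like corpus/xml-tok.py < input.sgm > output.tok.sgm (file names hypothetical).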