diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/support/fix-eos.pl | 2 | ||||
| -rwxr-xr-x | corpus/support/quote-norm.pl | 1 | ||||
| -rw-r--r-- | corpus/support/token_list | 6 | ||||
| -rwxr-xr-x | corpus/xml-tok.py | 47 | 
4 files changed, 54 insertions, 2 deletions
| diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl index 584f8b46..fe03727b 100755 --- a/corpus/support/fix-eos.pl +++ b/corpus/support/fix-eos.pl @@ -1,4 +1,6 @@  #!/usr/bin/perl -w +$|++; +  use strict;  use utf8; diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 33604027..0366fad5 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -39,6 +39,7 @@ while(<STDIN>) {    s/&\#([0-9]+);/pack("U", $1)/ge;    # Regularlize spaces: +  s/\x{ad}//g;        # soft hyphen    s/\x{a0}/ /g;       # non-breaking space    s/\x{2009}/ /g;     # thin space    s/\x{2028}/ /g;     # "line separator" diff --git a/corpus/support/token_list b/corpus/support/token_list index 228663f6..d38638cf 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -156,8 +156,9 @@ Mass.  Md.  Mfg.  Mgr. -Mexican-U.S. -Mich. +Mio. +Mrd. +Bio.  Minn.  Mo.  Mon. @@ -187,6 +188,7 @@ Rd.  Rev.  R.J.  C.L +Rs.  Rte.  Sat.  W.T diff --git a/corpus/xml-tok.py b/corpus/xml-tok.py new file mode 100755 index 00000000..4357ced6 --- /dev/null +++ b/corpus/xml-tok.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +import os +import re +import subprocess +import sys + +# Tokenize XML files with tokenize-anything.sh +# in:  <seg id="963"> The earnings on its 10-year bonds are 28.45%. </seg> +# out: <seg id="963"> The earnings on its 10 - year bonds are 28.45 % . 
</seg> + +def escape(s): +    return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;').replace('\'', '&apos;') + +def unescape(s): +    return s.replace('&gt;', '>').replace('&lt;', '<').replace('&quot;', '"').replace('&apos;', '\'').replace('&amp;', '&') + +def main(): +    tok = subprocess.Popen([os.path.join(os.path.dirname(__file__), 'tokenize-anything.sh'), '-u'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) +    while True: +        line = sys.stdin.readline() +        if not line: +            break +        line = line.strip() +        pieces = [] +        eol = len(line) +        pos = 0 +        while pos < eol: +            next = line.find('<', pos) +            if next == -1: +                next = eol +            tok.stdin.write('{}\n'.format(unescape(line[pos:next]))) +            pieces.append(escape(tok.stdout.readline().strip())) +            if next == eol: +                break +            pos = line.find('>', next + 1) +            if pos == -1: +                pos = eol +            else: +                pos += 1 +            pieces.append(line[next:pos]) +        sys.stdout.write('{}\n'.format(' '.join(pieces).strip())) +    tok.stdin.close() +    tok.wait() + +if __name__ == '__main__': +    main() | 
