diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/support/fix-eos.pl | 2 | ||||
| -rwxr-xr-x | corpus/xml-tok.py | 47 | 
2 files changed, 49 insertions, 0 deletions
#!/usr/bin/env python

import os
import re
import subprocess
import sys

# Tokenize XML files with tokenize-anything.sh
# in:  <seg id="963"> The earnings on its 10-year bonds are 28.45%. </seg>
# out: <seg id="963"> The earnings on its 10 - year bonds are 28.45 % . </seg>
#
# The change set that introduced this script also added `$|++;` (Perl output
# autoflush) near the top of corpus/support/fix-eos.pl.
# NOTE(review): presumably fix-eos.pl is a stage of tokenize-anything.sh and
# must not buffer its output for the line-at-a-time exchange in main() to
# work -- confirm against tokenize-anything.sh.


def escape(s):
    """Return *s* with the five XML special characters replaced by entities.

    '&' is replaced first so that the ampersands introduced by the other
    replacements are not escaped a second time.
    """
    return (s.replace('&', '&amp;')
             .replace('>', '&gt;')
             .replace('<', '&lt;')
             .replace('"', '&quot;')
             .replace('\'', '&apos;'))


def unescape(s):
    """Return *s* with the five predefined XML entities decoded.

    '&amp;' is decoded last so that an escaped entity such as '&amp;lt;'
    becomes the literal text '&lt;' rather than '<'.
    """
    return (s.replace('&gt;', '>')
             .replace('&lt;', '<')
             .replace('&quot;', '"')
             .replace('&apos;', '\'')
             .replace('&amp;', '&'))


def tokenize_line(line, tokenize_text):
    """Tokenize the text between the tags of one line, keeping tags verbatim.

    The line is scanned left to right and split into alternating text
    segments and '<...>' tags. Each text segment is unescaped, passed to
    *tokenize_text*, stripped, re-escaped and collected; each tag is
    collected unchanged. The pieces are joined with single spaces.

    Args:
        line: one input line, already stripped of surrounding whitespace.
        tokenize_text: callable taking a plain-text string and returning its
            tokenized form (a trailing newline is tolerated).

    Returns:
        The reassembled line, stripped of leading/trailing whitespace.
    """
    pieces = []
    end = len(line)
    pos = 0
    while pos < end:
        tag_start = line.find('<', pos)
        if tag_start == -1:
            # No more markup: the rest of the line is text.
            tag_start = end
        text = unescape(line[pos:tag_start])
        if text.strip():
            pieces.append(escape(tokenize_text(text).strip()))
        else:
            # Nothing to tokenize; keep the empty placeholder so the spacing
            # of the joined output is unchanged, but skip the round trip.
            pieces.append('')
        if tag_start == end:
            break
        tag_end = line.find('>', tag_start + 1)
        # An unterminated tag swallows the remainder of the line.
        pos = end if tag_end == -1 else tag_end + 1
        pieces.append(line[tag_start:pos])
    return ' '.join(pieces).strip()


def main():
    """Filter stdin to stdout, tokenizing text outside of XML tags.

    Starts 'tokenize-anything.sh -u' from this script's directory and
    exchanges one line at a time with it for every text segment.
    """
    tok = subprocess.Popen(
        [os.path.join(os.path.dirname(__file__), 'tokenize-anything.sh'), '-u'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        # Text-mode pipes so that str can be written and read on Python 3;
        # the encoding then follows the locale, like sys.stdin/sys.stdout.
        universal_newlines=True)

    def tokenize_text(text):
        tok.stdin.write('{}\n'.format(text))
        # Without a flush the request may sit in our buffer while we block
        # on the reply below, deadlocking both processes.
        tok.stdin.flush()
        return tok.stdout.readline()

    try:
        # readline() rather than iterating sys.stdin, so each line is handled
        # as soon as it arrives.
        for line in iter(sys.stdin.readline, ''):
            sys.stdout.write(
                '{}\n'.format(tokenize_line(line.strip(), tokenize_text)))
    finally:
        tok.stdin.close()
        tok.wait()


if __name__ == '__main__':
    main()
