diff options
| author | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 | 
| commit | 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch) | |
| tree | 5a06ee7de98640a39244b57bb369697176b44ebf /normalize-punctuation | |
| parent | 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff) | |
mv
Diffstat (limited to 'normalize-punctuation')
| -rwxr-xr-x | normalize-punctuation | 46 | 
1 files changed, 46 insertions, 0 deletions
#!/usr/bin/perl -w
# adapted from the moses scripts
#
# Reads text from STDIN line by line, rewrites typographic punctuation
# (curly and low quotes, dashes, acute accents used as apostrophes, the
# ellipsis character, guillemets, pseudo-spaces) to plain ASCII forms and
# prints every normalized line to STDOUT.
#
# No encoding layer is set on the handles and `use utf8` is not enabled, so
# all matching happens on raw bytes: the non-ASCII literals in the patterns
# below stand for the UTF-8 byte sequences stored in this file.

use strict;
use warnings;

# Normalize the punctuation of a single line.
#
# Takes one string (normally a line including its trailing newline) and
# returns the rewritten copy; the argument itself is not modified.
# The rules are order-dependent and are applied top to bottom.
sub normalize_line {
    my ($line) = @_;

    # UTF-8 bytes of U+00A0 NO-BREAK SPACE, written as an escape so the
    # character cannot silently degrade into an ordinary space.
    my $nbsp = qr/\xC2\xA0/;
    # A removable gap in front of punctuation: either an ASCII space (what
    # the rules match as they read in the original text) or a no-break space.
    my $gap = qr/(?: |\xC2\xA0)/;

    for ($line) {
        s/\r//g;    # drop carriage returns

        # normalize unicode punctuation
        s/(?:„|“|”)/"/g;
        s/–/-/g;
        s/—/ - /g;
        s/ +/ /g;    # the em-dash rule may have produced double spaces
        s/´/'/g;
        # A single curly quote between two letters is an apostrophe. The two
        # passes stay separate: merging them would change which quotes match
        # when both kinds occur around the same letter.
        s/([a-z])‘([a-z])/${1}'${2}/gi;
        s/([a-z])’([a-z])/${1}'${2}/gi;
        # Any remaining single curly/low quote is treated as a double quote.
        s/(?:‘|‚|’)/"/g;
        # Also covers a doubled acute accent, which is '' by this point.
        s/''/"/g;
        s/…/.../g;

        # French quotes
        # NOTE(review): the spaces in these six rules are kept exactly as
        # they read in the original; confirm whether some of them were meant
        # to be no-break spaces.
        s/ « / "/g;
        s/« /"/g;
        s/«/"/g;
        s/ » /" /g;
        s/ »/"/g;
        s/»/"/g;

        # handle pseudo-spaces
        s/${gap}%/%/g;
        s/nº${nbsp}/nº /g;
        s/${gap}:/:/g;
        s/${nbsp}ºC/ ºC/g;
        s/${nbsp}cm/ cm/g;
        s/${gap}\?/?/g;
        s/${gap}!/!/g;
        s/${gap};/;/g;
        s/,${nbsp}/, /g;
        s/ +/ /g;
    }

    return $line;
}

# Filter STDIN to STDOUT, one line at a time.
sub main {
    # The first command-line argument is read into $language as before, but
    # none of the rules in this script depends on it.
    my ($language) = @ARGV;

    # The original loop also ran `print STDERR $_ if //;`. An empty pattern
    # makes Perl reuse the last successfully matched regex, so that statement
    # echoed lines to STDERR depending on whichever substitution matched
    # last. It is left out here.
    # NOTE(review): the pattern may once have held an invisible character;
    # if that diagnostic is wanted, reinstate it with an explicit \x escape.
    while (my $line = <STDIN>) {
        print normalize_line($line);
    }

    return;
}

# Run the filter only when executed as a script, not when loaded by a test.
main() unless caller;
