diff options
| author | Patrick Simianer <p@simianer.de> | 2013-12-05 07:56:38 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2013-12-05 07:56:38 +0100 | 
| commit | db6a6ecfa350cae29739c59df1210d8f76a479c9 (patch) | |
| tree | f137a001f57f170455c28ce97b5abb2726006cf6 /normalize_punctuation | |
init
Diffstat (limited to 'normalize_punctuation')
| -rwxr-xr-x | normalize_punctuation | 46 | 
1 files changed, 46 insertions, 0 deletions
#!/usr/bin/perl
# adapted from the moses scripts
#
# Reads text from STDIN line by line, rewrites Unicode punctuation to plain
# ASCII equivalents, tidies spacing around punctuation, and writes each
# normalized line to STDOUT.
#
# Usage: normalize_punctuation [LANGUAGE] < input > output
#
# All matching is done on raw bytes (no `use utf8`, no I/O layers), so the
# non-ASCII literals in the patterns below match the UTF-8 encoding of those
# characters in the input. Invisible characters are written as explicit byte
# escapes so they cannot be silently lost when the file is copied around.

use strict;
use warnings;

# Accepted for command-line compatibility; no rule below depends on it.
my ($language) = @ARGV;

# normalize_line($line)
#
# Returns a normalized copy of $line; the argument itself is not modified.
# The substitutions are order-dependent (e.g. apostrophes between letters are
# handled before the remaining single quotes become double quotes), so do not
# reorder them.
sub normalize_line {
    my ($line) = @_;

    # Either a regular space or a UTF-8 encoded U+00A0 NO-BREAK SPACE.
    my $any_space = qr/(?: |\xC2\xA0)/;

    $line =~ s/\r//g;

    # normalize unicode punctuation
    $line =~ s/„/"/g;
    $line =~ s/“/"/g;
    $line =~ s/”/"/g;
    $line =~ s/–/-/g;
    $line =~ s/—/ - /g;
    $line =~ s/ +/ /g;    # collapse the spaces the em-dash rule may have added
    $line =~ s/´/'/g;
    # A single quote between two letters is an apostrophe, not a quotation mark.
    $line =~ s/([a-z])‘([a-z])/$1'$2/gi;
    $line =~ s/([a-z])’([a-z])/$1'$2/gi;
    $line =~ s/‘/"/g;
    $line =~ s/‚/"/g;
    $line =~ s/’/"/g;
    $line =~ s/''/"/g;
    # Cannot match any more (every ´ was already rewritten above); kept so the
    # rule list stays in step with the script this was adapted from.
    $line =~ s/´´/"/g;
    $line =~ s/…/.../g;

    # French quotes
    $line =~ s/ « / "/g;
    $line =~ s/« /"/g;
    $line =~ s/«/"/g;
    $line =~ s/ » /" /g;
    $line =~ s/ »/"/g;
    $line =~ s/»/"/g;

    # handle pseudo-spaces
    # NOTE(review): in the copy this was restored from, every space in this
    # section was a regular space, which made four of the rules no-ops
    # (pattern identical to replacement). They are assumed to have targeted
    # U+00A0 NO-BREAK SPACE -- confirm against the Moses original. Rules that
    # did have an effect on regular spaces still do.
    $line =~ s/${any_space}\%/\%/g;
    $line =~ s/nº\xC2\xA0/nº /g;
    $line =~ s/${any_space}:/:/g;
    $line =~ s/\xC2\xA0ºC/ ºC/g;
    $line =~ s/\xC2\xA0cm/ cm/g;
    $line =~ s/${any_space}\?/\?/g;
    $line =~ s/${any_space}\!/\!/g;
    $line =~ s/${any_space};/;/g;
    $line =~ s/,\xC2\xA0/, /g;
    $line =~ s/ +/ /g;

    return $line;
}

# main()
#
# Filters STDIN to STDOUT through normalize_line, one line at a time.
sub main {
    while ( my $line = <STDIN> ) {
        my $normalized = normalize_line($line);

        # NOTE(review): the original test here was an empty pattern (`//`).
        # Perl re-uses the last successfully matched regex for an empty
        # pattern, so the diagnostic fired unpredictably. An invisible
        # character was presumably lost; U+FEFF (zero-width no-break space /
        # byte-order mark) is assumed -- confirm against the Moses original.
        print STDERR $normalized if $normalized =~ /\xEF\xBB\xBF/;

        print $normalized;
    }
    return;
}

# Run the filter when executed as a script; stay quiet when loaded as a library.
main() unless caller;
