diff options
Diffstat (limited to 'normalize_punctuation')
-rwxr-xr-x | normalize_punctuation | 46 |
1 files changed, 46 insertions, 0 deletions
#!/usr/bin/perl -w
# adapted from the moses scripts
#
# Filter: reads text from STDIN line by line, rewrites typographic
# punctuation (curly quotes, dashes, ellipsis, guillemets, no-break spaces
# next to punctuation) into plain ASCII equivalents, and prints every
# normalized line to STDOUT.
#
# The script works on raw bytes: there is no `use utf8` and no I/O layer, so
# the non-ASCII literals in the patterns below are the UTF-8 byte sequences
# stored in this file, and the input is expected to be UTF-8 encoded as well.

use strict;
use warnings;

# Optional first command-line argument. It is accepted so that callers may
# pass a language code, but no rule in this script reads it.
my ($language) = @ARGV;

# normalize_line($text)
#
# Returns a normalized copy of $text (one input line, with or without its
# trailing newline). The argument itself is not modified.
#
# The substitutions are order-dependent: each rule sees the output of the
# rules before it.
sub normalize_line {
    my ($text) = @_;

    # U+00A0 NO-BREAK SPACE as its two UTF-8 bytes. It is spelled as an
    # escape because a literal no-break space is indistinguishable from an
    # ordinary space in most editors and does not survive copy/paste.
    my $nbsp = qr/\xC2\xA0/;

    for ($text) {    # alias $_ to $text for the substitution chain
        s/\r//g;

        # normalize unicode punctuation
        s/„/\"/g;
        s/“/\"/g;
        s/”/\"/g;
        s/–/-/g;
        s/—/ - /g;
        s/ +/ /g;    # the em-dash rule can leave doubled spaces behind
        s/´/\'/g;
        # a single curly quote between two letters is an apostrophe ...
        s/([a-z])‘([a-z])/$1\'$2/gi;
        s/([a-z])’([a-z])/$1\'$2/gi;
        # ... anywhere else it is treated as a quotation mark
        s/‘/\"/g;
        s/‚/\"/g;
        s/’/\"/g;
        # Two apostrophes become one double quote. This also covers a doubled
        # acute accent, which the rule above has already turned into two
        # apostrophes.
        s/''/\"/g;
        s/…/.../g;

        # French quotes
        # NOTE(review): the spaces in these patterns are ordinary spaces, as
        # written. French typography often puts no-break spaces next to
        # guillemets -- confirm whether those should be matched here too.
        s/ « / \"/g;
        s/« /\"/g;
        s/«/\"/g;
        s/ » /\" /g;
        s/ »/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces
        # A no-break space before punctuation is dropped; next to a unit or
        # number sign, or after a comma, it becomes an ordinary space.
        s/${nbsp}\%/\%/g;
        s/nº${nbsp}/nº /g;
        s/${nbsp}:/:/g;
        s/${nbsp}ºC/ ºC/g;
        s/${nbsp}cm/ cm/g;
        s/${nbsp}\?/\?/g;
        s/${nbsp}\!/\!/g;
        s/${nbsp};/;/g;
        s/,${nbsp}/, /g;
        s/ +/ /g;
    }

    return $text;
}

# Only STDOUT is written. There is no per-line debug echo to STDERR: a
# diagnostic guarded by an empty match pattern (`//`) is not deterministic,
# because Perl treats an empty pattern as "re-run the last pattern that
# matched successfully".
while (my $line = <STDIN>) {
    print normalize_line($line);
}