#!/usr/bin/perl -w
# normalize-punctuation
#
# Reads text from STDIN one line at a time, rewrites typographic (Unicode)
# punctuation to plain ASCII equivalents, tidies the spacing around a few
# punctuation marks and units, and writes the result to STDOUT.
#
# adapted from the moses scripts
#
# Provenance: added by commit 2b1d7f881c19c4d4b5afae194e02d3300c7675d0
# (Patrick Simianer, Tue, 5 Jul 2016 11:01:46 +0200, subject "mv").
#
# NOTE: there is deliberately no `use utf8` and no I/O layer here. Every
# pattern below is matched byte-wise, so the non-ASCII literals only match
# when both this file and the input are UTF-8 encoded -- assumed, confirm
# for your data.

use strict;
use warnings;

# UTF-8 byte sequence of U+00A0 NO-BREAK SPACE. Written as escapes because
# the literal character is invisible and easily lost when the file is copied.
my $NBSP = "\xC2\xA0";

# UTF-8 byte sequence of U+FEFF (byte order mark / zero width no-break space).
my $BOM = "\xEF\xBB\xBF";

# normalize_line($line) -> $normalized
#
# Returns a copy of $line with the punctuation rewrites applied. The rules
# run strictly in the order written; several of them depend on the output of
# an earlier one, so do not reorder them. The line terminator is kept as is,
# except that every "\r" is removed.
sub normalize_line {
    my ($line) = @_;

    for ($line) {    # alias $_ so the rule list reads like a table
        s/\r//g;

        # normalize unicode punctuation
        s/„/\"/g;                            # U+201E
        s/“/\"/g;                            # U+201C
        s/”/\"/g;                            # U+201D
        s/–/-/g;                             # U+2013 en dash
        s/—/ - /g; s/ +/ /g;                 # U+2014 em dash, then squeeze spaces
        s/´/\'/g;                            # U+00B4 acute accent
        # a curly single quote between two letters is an apostrophe ...
        s/([a-z])‘([a-z])/$1\'$2/gi;
        s/([a-z])’([a-z])/$1\'$2/gi;
        # ... any curly single quote still left is treated as a quotation mark
        s/‘/\"/g;                            # U+2018
        s/‚/\"/g;                            # U+201A low single quote, not a comma
        s/’/\"/g;                            # U+2019
        s/''/\"/g;
        # Kept for parity with the original rule list: every U+00B4 was
        # already turned into ' above, so the '' rule does the actual work.
        s/´´/\"/g;
        s/…/.../g;                           # U+2026

        # French quotes
        s/ « / \"/g;
        s/« /\"/g;
        s/«/\"/g;
        s/ » /\" /g;
        s/ »/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces
        # A plain space in front of % : ? ! ; is dropped, as before; a
        # no-break space in the same position is now dropped too. Where a
        # separator has to stay (after "nº", before "ºC"/"cm", after a
        # comma), a no-break space is turned into a plain space.
        s/(?: |${NBSP})\%/\%/g;
        s/nº${NBSP}/nº /g;
        s/(?: |${NBSP}):/:/g;
        s/${NBSP}ºC/ ºC/g;
        s/${NBSP}cm/ cm/g;
        s/(?: |${NBSP})\?/\?/g;
        s/(?: |${NBSP})\!/\!/g;
        s/(?: |${NBSP});/;/g;
        s/,${NBSP}/, /g; s/ +/ /g;
    }

    return $line;
}

# main()
#
# Filters STDIN to STDOUT through normalize_line().
sub main {
    # The first command-line argument is accepted as a language code, but no
    # rule in this script reads it.
    my ($language) = @ARGV;

    while (my $line = <STDIN>) {
        $line = normalize_line($line);

        # NOTE(review): this diagnostic used an empty pattern, which Perl
        # resolves to "the last successfully matched regex" and therefore
        # fired unpredictably. The intended character was presumably an
        # invisible one; U+FEFF is assumed here -- confirm against the
        # upstream moses script.
        print STDERR $line if index($line, $BOM) >= 0;

        print $line;
    }

    return;
}

# Run as a filter when executed directly; stay silent when loaded with
# require/do so that normalize_line() can be exercised on its own.
main() unless caller;