summaryrefslogtreecommitdiff
path: root/normalize_punctuation
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2013-12-05 07:56:38 +0100
committer Patrick Simianer <p@simianer.de> 2013-12-05 07:56:38 +0100
commit db6a6ecfa350cae29739c59df1210d8f76a479c9 (patch)
tree f137a001f57f170455c28ce97b5abb2726006cf6 /normalize_punctuation
init
Diffstat (limited to 'normalize_punctuation')
-rwxr-xr-x normalize_punctuation 46
1 files changed, 46 insertions, 0 deletions
diff --git a/normalize_punctuation b/normalize_punctuation
new file mode 100755
index 0000000..108de44
--- /dev/null
+++ b/normalize_punctuation
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+# adapted from the moses scripts
+#
+# Filter: reads text from STDIN line by line, rewrites typographic
+# punctuation (curly quotes, dashes, guillemets, ellipsis, no-break spaces)
+# to plain ASCII forms, and prints each rewritten line to STDOUT.
+#
+# There is deliberately no "use utf8" and no I/O layer: the non-ASCII
+# literals below are raw UTF-8 byte sequences matched against undecoded
+# input.  Adding "use utf8" without also decoding STDIN would stop the
+# rules from matching.
+
+use strict;
+use warnings;
+
+# First command-line argument; stored but not consulted by any rule below.
+my ($language) = @ARGV;
+
+# Invisible characters are written as explicit UTF-8 byte escapes so they
+# cannot be silently turned into ordinary spaces (or dropped) when this file
+# is copied, rendered or re-encoded.
+my $NBSP = "\xC2\xA0";        # U+00A0 NO-BREAK SPACE
+my $BOM  = "\xEF\xBB\xBF";    # U+FEFF ZERO WIDTH NO-BREAK SPACE / BOM
+
+while (<STDIN>) {
+    s/\r//g;
+
+    # normalize unicode punctuation
+    s/„/\"/g;
+    s/“/\"/g;
+    s/”/\"/g;
+    s/–/-/g;
+    s/—/ - /g;
+    s/ +/ /g;    # collapse the extra spaces the em-dash rule may introduce
+    s/´/\'/g;
+    # A curly single quote between two letters is an apostrophe ...
+    s/([a-z])‘([a-z])/$1\'$2/gi;
+    s/([a-z])’([a-z])/$1\'$2/gi;
+    # ... any remaining single quotation mark becomes a double quote.
+    s/‘/\"/g;
+    s/‚/\"/g;    # U+201A single low-9 quotation mark, not an ASCII comma
+    s/’/\"/g;
+    s/''/\"/g;
+    # Every acute accent was already turned into ' above, so this rule can
+    # no longer match; it is kept only to mirror the original rule list.
+    s/´´/\"/g;
+    s/…/.../g;
+
+    # French quotes
+    s/ « / \"/g;
+    s/« /\"/g;
+    s/«/\"/g;
+    s/ » /\" /g;
+    s/ »/\"/g;
+    s/»/\"/g;
+
+    # handle pseudo-spaces
+    # NOTE(review): with an ordinary space on the pattern side, four of
+    # these rules replace text with itself, so the pattern-side space is
+    # taken to be a no-break space and is spelled out via $NBSP -- confirm
+    # against the committed bytes.
+    s/${NBSP}\%/\%/g;
+    s/nº${NBSP}/nº /g;
+    s/${NBSP}:/:/g;
+    s/${NBSP}ºC/ ºC/g;
+    s/${NBSP}cm/ cm/g;
+    s/${NBSP}\?/\?/g;
+    s/${NBSP}\!/\!/g;
+    s/${NBSP};/;/g;
+    s/,${NBSP}/, /g;
+    s/ +/ /g;
+
+    # Echo lines that still contain a byte order mark to STDERR so they can
+    # be inspected.  The previous test was the empty pattern "//", which
+    # does not look for anything specific: Perl re-runs the last
+    # successfully matched pattern instead.
+    # NOTE(review): U+FEFF is presumed to be the invisible character that
+    # was lost from that pattern -- confirm against the upstream script.
+    print STDERR $_ if /$BOM/;
+
+    print $_;
+}