summary | refs | log | tree | commit | diff
path: root/normalize-punctuation
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
committer: Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
commit: 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch)
tree: 5a06ee7de98640a39244b57bb369697176b44ebf /normalize-punctuation
parent: 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff)
mv
Diffstat (limited to 'normalize-punctuation')
-rwxr-xr-x normalize-punctuation 46
1 files changed, 46 insertions, 0 deletions
diff --git a/normalize-punctuation b/normalize-punctuation
new file mode 100755
index 0000000..108de44
--- /dev/null
+++ b/normalize-punctuation
@@ -0,0 +1,46 @@
+#!/usr/bin/perl -w
+# Normalize typographic punctuation to plain ASCII, reading lines from STDIN
+# and writing them to STDOUT. Adapted from the moses scripts.
+# No "use utf8" or I/O layer is in effect, so the non-ASCII literals below
+# are matched as raw UTF-8 byte sequences.
+
+use strict;
+
+my ($language) = @ARGV;      # optional argument; accepted but not used here
+my $nbsp = "\xC2\xA0";       # UTF-8 bytes of U+00A0 NO-BREAK SPACE
+my $bom  = "\xEF\xBB\xBF";   # UTF-8 bytes of U+FEFF (byte-order mark)
+
+while (<STDIN>) {
+  s/\r//g;
+  # normalize unicode punctuation
+  s/„|“|”/\"/g;
+  s/–/-/g;
+  s/—/ - /g; s/ +/ /g;
+  s/´/\'/g;
+  # A curly quote between two letters is an apostrophe. Lookarounds leave the
+  # letters unconsumed, so back-to-back cases (rock’n’roll) are both converted.
+  s/(?<=[a-z])(?:‘|’)(?=[a-z])/\'/gi;
+  # Remaining single quotes; \xE2\x80\x9A is U+201A, the comma-like low quote.
+  s/‘|\xE2\x80\x9A|’/\"/g;
+  # Doubled acute accents are already '' here, so this covers them as well.
+  s/''/\"/g;
+  s/…/.../g;
+  # French quotes: also drop one plain space on the inner side of the mark
+  s/« ?/\"/g;
+  s/ ?»/\"/g;
+  # handle pseudo-spaces: no-break spaces are spelled out as byte escapes so
+  # they cannot be mistaken for, or silently converted to, ordinary spaces.
+  # A plain space before these marks is removed too, as it was before.
+  s/(?: |$nbsp)([%:?!;])/$1/g;
+  # Around these tokens a no-break space becomes an ordinary space.
+  s/(nº|,)$nbsp/$1 /g;
+  s/$nbsp(ºC|cm)/ $1/g;
+  s/ +/ /g;
+
+  # An empty pattern (//) re-runs the last successful regex, so the old test
+  # echoed arbitrary lines. NOTE(review): the lost character was presumably
+  # U+FEFF, as in the upstream Moses script - confirm.
+  print STDERR $_ if /$bom/;
+
+  print $_;
+}