summary | refs | log | tree | commit | diff
path: root/normalize_punctuation
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
committer: Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
commit: 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch)
tree: 5a06ee7de98640a39244b57bb369697176b44ebf /normalize_punctuation
parent: 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff)
mv
Diffstat (limited to 'normalize_punctuation')
-rwxr-xr-x normalize_punctuation 46
1 files changed, 0 insertions, 46 deletions
diff --git a/normalize_punctuation b/normalize_punctuation
deleted file mode 100755
index 108de44..0000000
--- a/normalize_punctuation
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/perl -w
-# Normalize punctuation: filter STDIN to STDOUT line by line (adapted from the moses scripts).
-# No "use utf8" and no I/O layers set here: patterns and input are raw bytes, assumed UTF-8.
-use strict;
-
-my ($language) = @ARGV; # first command-line argument; read but not consulted anywhere below
-
-while(<STDIN>) {
- s/\r//g; # drop carriage returns
- # normalize unicode punctuation: typographic quotes and dashes become ASCII equivalents
- s/„/\"/g;
- s/“/\"/g;
- s/”/\"/g;
- s/–/-/g;
- s/—/ - /g; s/ +/ /g; # em dash gets surrounding spaces, then runs of spaces collapse
- s/´/\'/g;
- s/([a-z])‘([a-z])/$1\'$2/gi; # a single quote between two letters is an apostrophe
- s/([a-z])’([a-z])/$1\'$2/gi;
- s/‘/\"/g; # every remaining single typographic quote becomes a double quote
- s/‚/\"/g;
- s/’/\"/g;
- s/''/\"/g;
- s/´´/\"/g;
- s/…/.../g;
- # French quotes: guillemets become ", dropping the space between quote mark and quoted text
- s/ « / \"/g;
- s/« /\"/g;
- s/«/\"/g;
- s/ » /\" /g;
- s/ »/\"/g;
- s/»/\"/g;
- # handle pseudo-spaces: U+00A0 is spelled as its UTF-8 bytes \xC2\xA0 so it stays visible
- s/(?: |\xC2\xA0)\%/\%/g; # an ordinary space is still accepted here, as before
- s/nº\xC2\xA0/nº /g; # no-break space after "nº" becomes an ordinary space
- s/(?: |\xC2\xA0):/:/g;
- s/\xC2\xA0ºC/ ºC/g; # no-break space before a unit becomes an ordinary space
- s/\xC2\xA0cm/ cm/g;
- s/(?: |\xC2\xA0)\?/\?/g;
- s/(?: |\xC2\xA0)\!/\!/g;
- s/(?: |\xC2\xA0);/;/g;
- s/,\xC2\xA0/, /g; s/ +/ /g; # collapse any runs of spaces created above
- # NOTE(review): empty pattern = last successfully matched regex; intended pattern unknown, TODO confirm
- print STDERR $_ if //;
-
- print $_;
-}