blob: 108de44495342376059b03b4915958d474bfa6c1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
#!/usr/bin/perl -w
# adapted from the moses scripts
#
# Punctuation normalizer: reads text from STDIN line by line, rewrites
# typographic punctuation (curly quotes, dashes, guillemets, ellipsis,
# non-breaking spaces) as plain ASCII equivalents and prints each line to
# STDOUT.  Lines that still contain a non-breaking space after
# normalization are additionally echoed to STDERR.
#
# NOTE: there is no "use utf8" and no I/O layer, so the literal characters
# in the patterns below and the input are both handled as raw bytes.  The
# rules therefore expect UTF-8 encoded input.  The non-breaking space
# (U+00A0) is spelled as its UTF-8 byte sequence \xC2\xA0 so that it stays
# visible and cannot be silently turned into an ordinary space by an editor.
# Do not add "use utf8" without also decoding STDIN / encoding STDOUT.
use strict;

# First command-line argument (presumably a language code).  It is read so
# existing invocations keep working, but no rule below depends on it.
my ($language) = @ARGV;

# _normalize_line($line) -> normalized copy of $line.
# Applies every substitution rule in order; the order matters (apostrophes
# between letters must be handled before the remaining single quotes are
# turned into double quotes).
sub _normalize_line {
    my ($line) = @_;

    # An ordinary space or a UTF-8 encoded non-breaking space.
    my $gap = qr/(?: |\xC2\xA0)/;

    for ($line) {
        s/\r//g;

        # normalize unicode punctuation
        s/„/\"/g;
        s/“/\"/g;
        s/”/\"/g;
        s/–/-/g;
        s/—/ - /g;
        s/ +/ /g;
        s/´/\'/g;
        # Curly single quote between two letters is an apostrophe.
        # Lookarounds keep the letters out of the match, so adjacent
        # apostrophes that share a letter (rock’n’roll) are both converted.
        s/(?<=[a-z])‘(?=[a-z])/\'/gi;
        s/(?<=[a-z])’(?=[a-z])/\'/gi;
        s/‘/\"/g;
        s/‚/\"/g;
        s/’/\"/g;
        # Also covers doubled acute accents: every ´ is already ' by now.
        s/''/\"/g;
        s/…/.../g;

        # French quotes; the space next to a guillemet may be an ordinary
        # or a non-breaking one.
        s/${gap}«${gap}/ \"/g;
        s/«${gap}/\"/g;
        s/«/\"/g;
        s/${gap}»${gap}/\" /g;
        s/${gap}»/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces (non-breaking spaces): drop them before
        # punctuation, turn them into ordinary spaces next to units/numbers
        s/\xC2\xA0\%/\%/g;
        s/nº\xC2\xA0/nº /g;
        s/\xC2\xA0:/:/g;
        s/\xC2\xA0ºC/ ºC/g;
        s/\xC2\xA0cm/ cm/g;
        s/\xC2\xA0\?/\?/g;
        s/\xC2\xA0\!/\!/g;
        s/\xC2\xA0;/;/g;
        s/,\xC2\xA0/, /g;
        s/ +/ /g;
    }

    return $line;
}

while (my $line = <STDIN>) {
    my $normalized = _normalize_line($line);

    # Diagnostic: report lines in which a non-breaking space survived the
    # rules above.  An explicit pattern is required here; an empty pattern
    # (//) would make Perl reuse whichever pattern last matched.
    print STDERR $normalized if $normalized =~ /\xC2\xA0/;

    print $normalized;
}
|