summaryrefslogtreecommitdiff
path: root/corpus/support/quote-norm.pl
blob: 0c5b9c260875475dcf9936b3bec867ef991481a3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/perl -w
# Normalizes quotes, dashes, currency symbols, a few HTML entities and
# assorted punctuation in UTF-8 text.  Reads lines from STDIN and prints
# one normalized line to STDOUT for every input line.
use strict;
use utf8;
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");

# Returns a normalized copy of one line of text.
#   $text - the line, already decoded and without its trailing newline.
# The rules are applied strictly in the order written; several later rules
# rely on the output of earlier ones (e.g. '' is turned into " before the
# single-quote look-alikes are rewritten to ').
sub normalize_line {
  my ($text) = @_;
  for ($text) {   # alias $_ to $text so the bare s/// and tr/// act on it
    # Pad with spaces so rules anchored on a leading space or a preceding
    # non-word character (\W) also fire at the very start of the line.
    $_ = " $_ ";

    # HTML entities, tolerating stray whitespace inside the entity.
    # &amp; is decoded last so "&amp;lt;" does not get decoded twice.
    s/&\s*lt\s*;/</gi;
    s/&\s*gt\s*;/>/gi;
    s/&\s*squot\s*;/'/gi;
    s/&\s*quot\s*;/"/gi;
    s/&\s*amp\s*;/&/gi;

    # Re-join two-digit pairs split after the colon: " 10: 30" -> " 10:30".
    s/ (\d\d): (\d\d)/ $1:$2/g;

    # Currency symbols -> codes.  U+20A0 and U+20AC are each replaced;
    # the previous pattern had a misplaced ']' and only matched the literal
    # sequence U+20A0 U+20AC ']', so a lone euro sign was never converted.
    s/[\x{20a0}\x{20ac}]/ EUR /g;
    s/[\x{00A3}]/ GBP /g;
    # Separate an upper-case prefix (optionally ending in '$') from the
    # number glued to it: "USD5" -> "USD 5", "US$5" -> "US$ 5".
    s/(\W)([A-Z]+\$?)(\d*\.\d+|\d+)/$1$2 $3/g;
    # "eur5" / "euro5" (any case) -> "EUR 5".
    s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;

    # &#45; is the numeric entity for '-'.  A pair becomes "--", and a
    # lone one is also mapped to "--".
    s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
    s/&\s*#45\s*;/--/g;

    # Double-quote look-alikes -> ".
    s/ ,,/ "/g;
    s/``/"/g;
    s/''/"/g;
    s/〃/"/g;
    s/¨/"/g;
    # Set inverted Spanish punctuation off with spaces.
    s/¡/ ¡ /g;
    s/¿/ ¿ /g;
    # Single-quote look-alikes -> '.
    s/ˇ/'/g;
    s/´/'/g;
    s/`/'/g;
    s/’/'/g;
    s/ ́/'/g;
    s/‘/'/g;
    s/ˉ/'/g;
    s/β/ß/g; # WMT 2010 error
    s/“/"/g;
    s/”/"/g;
    s/«/"/g;
    s/»/"/g;

    # Fullwidth forms U+FF01..U+FF5E -> their ASCII counterparts '!'..'~'
    # (both ranges hold 94 characters, in the same order).  The rule used
    # to read tr/!-~/!-~/, which maps ASCII onto itself and does nothing.
    # NOTE(review): fullwidth source range is the presumed intent - confirm.
    tr/\x{FF01}-\x{FF5E}/!-~/;

    # CJK punctuation, ellipsis, dash variants and bullets.
    s/、/,/g;
    s/。/./g;
    s/…/.../g;
    s/―/--/g;
    s/–/--/g;
    s/─/--/g;
    s/—/--/g;
    s/•/ * /g;
    s/\*/ * /g;
    # Arabic comma, question mark and tatweel.
    s/،/,/g;
    s/؟/?/g;
    s/ـ/ /g;
    # NOTE(review): the next three look like repairs for mis-decoded
    # (mojibake) sequences; the second repeats an earlier rule as written.
    s/Ã ̄/i/g;
    s/’/'/g;
    s/â€"/"/g;
    # Arabic semicolon.
    s/؛/;/g;

    # Whitespace and control-character clean-up.  Whitespace (including
    # tabs) is collapsed to a single space first so a tab still separates
    # tokens; the remaining control characters are then deleted, and the
    # spaces are collapsed again because deleting a control character that
    # sat between two spaces would otherwise leave a double space.
    s/\s+/ /g;
    s/[\x{00}-\x{1f}]//g;
    s/ {2,}/ /g;
    s/^\s+//;
    s/\s+$//;
  }
  return $text;
}

while(<STDIN>) {
  chomp;
  print normalize_line($_), "\n";
}