diff options
author | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 |
---|---|---|
committer | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 |
commit | 778a4cec55f82bcc66b3f52de7cc871e8daaeb92 (patch) | |
tree | 2a5bccaa85965855104c4e8ac3738b2e1c77f164 /corpus/support/quote-norm.pl | |
parent | 57fff9eea5ba0e71fb958fdb4f32d17f2fe31108 (diff) | |
parent | d21491daa5e50b4456c7c5f9c2e51d25afd2a757 (diff) |
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'corpus/support/quote-norm.pl')
-rwxr-xr-x | corpus/support/quote-norm.pl | 7 |
1 file changed, 6 insertions, 1 deletion
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0c5b9c26..72b0064d 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -18,13 +18,18 @@ while(<STDIN>) {
  s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
  s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
  s/&\s*#45\s*;/--/g;
+ s/�c/--/g;
  s/ ,,/ "/g;
  s/``/"/g;
  s/''/"/g;
+ s/[「」]/"/g;
  s/〃/"/g;
  s/¨/"/g;
  s/¡/ ¡ /g;
  s/¿/ ¿ /g;
+ # â<U+0080><U+0099>
+ s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
+ s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
  s/ˇ/'/g;
  s/´/'/g;
  s/`/'/g;
@@ -39,7 +44,7 @@ while(<STDIN>) {
  s/»/"/g;
  tr/!-~/!-~/;
  s/、/,/g;
- s/。/./g;
+ # s/。/./g;
  s/…/.../g;
  s/―/--/g;
  s/–/--/g; |