summary | refs | log | tree | commit | diff
path: root/corpus/support
diff options
context:
space:
mode:
author: Chris Dyer <redpony@gmail.com> 2014-02-27 19:45:08 -0500
committer: Chris Dyer <redpony@gmail.com> 2014-02-27 19:45:08 -0500
commit: ed56625e5edeadbe9297680b07e269c42b7ea420 (patch)
tree: 144a66df31d0ff2c8b8929315fc57c9f6c8c37da /corpus/support
parent: d843587027d815f3a1c9b8dd5394f3fe04ac85fa (diff)
ptb to normal
Diffstat (limited to 'corpus/support')
-rwxr-xr-x corpus/support/quote-norm.pl | 9
1 files changed, 9 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 1d9bb96f..33604027 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -11,6 +11,15 @@ while(<STDIN>) {
# Delete control characters:
s/[\x{00}-\x{1f}]//g;
+ # PTB --> normal
+ s/-LRB-/(/g;
+ s/-RRB-/)/g;
+ s/-LSB-/[/g;
+ s/-RSB-/]/g;
+ s/-LCB-/{/g;
+ s/-RCB-/}/g;
+ s/ gon na / gonna /g;
+
# Regularize named HTML/XML escapes:
s/&\s*lt\s*;/</gi; # HTML opening angle bracket
s/&\s*gt\s*;/>/gi; # HTML closing angle bracket