diff options
author | Chris Dyer <redpony@gmail.com> | 2014-02-27 19:45:08 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2014-02-27 19:45:08 -0500 |
commit | 6da95d3dac00ffc4cfba0c17b50abb071c5fb7b3 (patch) | |
tree | 8ddce03952f5eeda42cf1c97d7c2d657e6c2ab1a /corpus | |
parent | 3524a33c3f2612c643a020437356846895504407 (diff) |
ptb to normal
Diffstat (limited to 'corpus')
-rwxr-xr-x | corpus/support/quote-norm.pl | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 1d9bb96f..33604027 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -11,6 +11,15 @@ while(<STDIN>) {
   # Delete control characters:
   s/[\x{00}-\x{1f}]//g;
 
+  # PTB --> normal
+  s/-LRB-/(/g;
+  s/-RRB-/)/g;
+  s/-LSB-/[/g;
+  s/-RSB-/]/g;
+  s/-LCB-/{/g;
+  s/-RCB-/}/g;
+  s/ gon na / gonna /g;
+
   # Regularize named HTML/XML escapes:
   s/&\s*lt\s*;/</gi; # HTML opening angle bracket
   s/&\s*gt\s*;/>/gi; # HTML closing angle bracket