summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
author: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-04 15:14:16 -0700
committer: Michael Denkowski <mdenkows@cs.cmu.edu> 2013-09-04 15:14:16 -0700
commit: 7525905ca57bbd425f96a33768e3f6777ba86f7a (patch)
tree: bd8b69d7212c6d7d0a8ef950303e4d072d94241c /corpus
parent: 40eac315f63b018eec10da4124b801869cd788f5 (diff)
Detokenizer
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/untok.pl 63
1 files changed, 63 insertions, 0 deletions
diff --git a/corpus/untok.pl b/corpus/untok.pl
new file mode 100755
index 00000000..723e78cb
--- /dev/null
+++ b/corpus/untok.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl -w
+
+# Detokenizer: reads whitespace-tokenized text from the files named on the
+# command line (or STDIN) and prints each line with punctuation re-attached.
+# Spacing is decided per token from the token itself and its two neighbours.
+use strict;
+use warnings;
+use IO::Handle;
+STDOUT->autoflush(1);    # unbuffered: each line is written as soon as it is printed
+
+while (my $line = <>) {
+    my @tokens = split ' ', $line;
+    my $output = "";
+    my $lspace = 0;    # put a space before the current token? (never before the first)
+    my $qflag  = 0;    # true while a double quote is open on this line
+    for my $i (0 .. $#tokens) {
+        my $token  = $tokens[$i];
+        my $prev   = $i > 0        ? $tokens[$i-1] : "";
+        my $next   = $i < $#tokens ? $tokens[$i+1] : "";
+        my $rspace = 1;    # put a space after the current token?
+
+        # contractions and possessives join to the left
+        if ($token =~ /^(?:n't|'(?:s|m|re|ll|ve|d))$/) {
+            $lspace = 0;
+        # a bare apostrophe after a token ending in "s" also joins to the left
+        } elsif ($token eq "'" && $prev =~ /s$/) {
+            $lspace = 0;
+
+        # hyphen only when a hyphen, not a dash
+        } elsif ($token eq "-" && $prev =~ /[A-Za-z0-9]$/ && $next =~ /^[A-Za-z0-9]/) {
+            $lspace = $rspace = 0;
+
+        # quote marks alternate: an opening quote joins right, a closing one joins left
+        } elsif ($token eq '"') {
+            if ($qflag) {
+                $lspace = 0;
+            } else {
+                $rspace = 0;
+            }
+            $qflag = !$qflag;
+
+        # period joins on both sides when a decimal point, i.e. the previous
+        # token ends in a digit and the next token begins with one
+        } elsif ($token eq "." && $prev =~ /\d$/ && $next =~ /^\d/) {
+            $lspace = $rspace = 0;
+
+        # Left joiners
+        } elsif ($token =~ /^[.,:;?!%)\]]$/) {
+            $lspace = 0;
+        # Right joiners (the "$" is escaped so it reads unmistakably as a literal)
+        } elsif ($token =~ /^[\$(\[]$/) {
+            $rspace = 0;
+        # Joiners on both sides
+        } elsif ($token eq "/") {
+            $lspace = $rspace = 0;
+        }
+
+        $output .= " " if $lspace;
+        $output .= $token;
+        $lspace = $rspace;    # this token's right spacing becomes the next token's left
+    }
+    print "$output\n";
+}