diff options
author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-09-04 15:14:16 -0700 |
---|---|---|
committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2013-09-04 15:14:16 -0700 |
commit | cf7f814acfebf03b577e105a9295ef67cfdb74cd (patch) | |
tree | d0495af533da96f01d5b9566786766ebd74d967e | |
parent | ed6325760c6cbd7681330bd9591f66ce5efe60ae (diff) |
Detokenizer
-rwxr-xr-x | corpus/untok.pl | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/corpus/untok.pl b/corpus/untok.pl new file mode 100755 index 00000000..723e78cb --- /dev/null +++ b/corpus/untok.pl @@ -0,0 +1,63 @@ +#!/usr/bin/perl -w + +use IO::Handle; +STDOUT->autoflush(1); + +while (<>) { + $output = ""; + @tokens = split; + $lspace = 0; + $qflag = 0; + for ($i=0; $i<=$#tokens; $i++) { + $token = $tokens[$i]; + $prev = $next = ""; + $rspace = 1; + if ($i > 0) { + $prev = $tokens[$i-1]; + } + if ($i < $#tokens) { + $next = $tokens[$i+1]; + } + + # possessives join to the left + if ($token =~ /^(n't|'(s|m|re|ll|ve|d))$/) { + $lspace = 0; + } elsif ($token eq "'" && $prev =~ /s$/) { + $lspace = 0; + + # hyphen only when a hyphen, not a dash + } elsif ($token eq "-" && $prev =~ /[A-Za-z0-9]$/ && $next =~ /^[A-Za-z0-9]/) { + $lspace = $rspace = 0; + + # quote marks alternate + } elsif ($token eq '"') { + if ($qflag) { + $lspace = 0; + } else { + $rspace = 0; + } + $qflag = !$qflag; + + # period joins on both sides when a decimal point + } elsif ($token eq "." && $prev =~ /\d$/ && $next =~ /\d$/) { + $lspace = $rspace = 0; + + # Left joiners + } elsif ($token =~ /^[.,:;?!%)\]]$/) { + $lspace = 0; + # Right joiners + } elsif ($token =~ /^[$(\[]$/) { + $rspace = 0; + # Joiners on both sides + } elsif ($token =~ /^[\/]$/) { + $lspace = $rspace = 0; + } + + if ($lspace) { + $output .= " "; + } + $output .= $token; + $lspace = $rspace; + } + print "$output\n"; +} |