#!/usr/bin/perl
#
# Detokenizer (corpus/untok.pl)
# Original author: Michael Denkowski, 2013-09-04 (commit 7525905c, "Detokenizer").
#
# Reads whitespace-tokenized text line by line (from the files named on the
# command line, or from STDIN) and prints each line with the tokens glued back
# together: contractions and closing punctuation attach to the left, opening
# brackets and "$" attach to the right, and so on.  Output is unbuffered so the
# script can sit in the middle of a pipeline.

use strict;
use warnings;

use IO::Handle;

# detokenize($line)
#
# Split $line on whitespace and rejoin the tokens, deciding for every token
# whether a space goes before it.  Returns the detokenized string without a
# trailing newline (an empty or all-whitespace line yields "").
#
# The double-quote open/close state is local to one call, i.e. it is reset at
# the start of every line.
sub detokenize {
    my ($line) = @_;

    my @tokens   = split ' ', $line;
    my $output   = '';
    my $lspace   = 0;    # put a space before the current token?
    my $in_quote = 0;    # true while inside an open double quote

    for my $i (0 .. $#tokens) {
        my $token  = $tokens[$i];
        my $prev   = $i > 0        ? $tokens[$i - 1] : '';
        my $next   = $i < $#tokens ? $tokens[$i + 1] : '';
        my $rspace = 1;  # put a space after the current token?

        # contractions and possessives join to the left
        if ($token =~ /^(?:n't|'(?:s|m|re|ll|ve|d))$/) {
            $lspace = 0;
        } elsif ($token eq "'" && $prev =~ /s$/) {
            # plural possessive: "dogs '" -> "dogs'"
            $lspace = 0;

        # hyphen only when a hyphen, not a dash
        } elsif ($token eq "-" && $prev =~ /[A-Za-z0-9]$/ && $next =~ /^[A-Za-z0-9]/) {
            $lspace = $rspace = 0;

        # quote marks alternate: an opening quote joins right, a closing one left
        } elsif ($token eq '"') {
            if ($in_quote) {
                $lspace = 0;
            } else {
                $rspace = 0;
            }
            $in_quote = !$in_quote;

        # period joins on both sides when a decimal point, i.e. the previous
        # token ends in a digit and the next token starts with one
        } elsif ($token eq "." && $prev =~ /\d$/ && $next =~ /^\d/) {
            $lspace = $rspace = 0;

        # Left joiners
        } elsif ($token =~ /^[.,:;?!%)\]]$/) {
            $lspace = 0;

        # Right joiners ("$" is escaped so it is unmistakably a literal)
        } elsif ($token =~ /^[\$(\[]$/) {
            $rspace = 0;

        # Joiners on both sides
        } elsif ($token eq "/") {
            $lspace = $rspace = 0;
        }

        $output .= " " if $lspace;
        $output .= $token;

        # This token's right-hand preference becomes the default left-hand
        # spacing of the next token (which may still override it).
        $lspace = $rspace;
    }

    return $output;
}

# main()
#
# Filter loop: detokenize every input line and print it followed by a newline.
sub main {
    STDOUT->autoflush(1);
    while (my $line = <>) {
        print detokenize($line), "\n";
    }
    return;
}

# Run the filter only when executed as a script, so that detokenize() can be
# loaded and exercised (e.g. by tests) without consuming STDIN.
main() unless caller;

# True value so the file can also be loaded with require.
1;