summaryrefslogtreecommitdiff
path: root/corpus/untok.pl
blob: 723e78cbe6e00c561cb0bc0e806361f80acd9f0c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/perl -w

use IO::Handle;
STDOUT->autoflush(1);

use strict;
use warnings;

# Undo whitespace tokenization of a single line of text.
#
# Argument: one line whose tokens are separated by whitespace (a trailing
#           newline, if present, is discarded by the split).
# Returns:  the tokens re-joined into a string, without a trailing newline.
#
# Every token is normally preceded by a single space.  The rules below
# suppress the space to the left and/or right of particular tokens
# (contractions, punctuation, brackets, quotes, ...).  The first matching
# rule wins.  Quote state is tracked per line only.
sub untokenize_line {
  my ($line) = @_;

  my @tokens = split ' ', $line;
  my $output = "";
  my $lspace = 0;   # true when a space goes before the current token
  my $qflag  = 0;   # true while a double quote is open on this line

  for my $i (0 .. $#tokens) {
    my $token  = $tokens[$i];
    my $prev   = $i > 0        ? $tokens[$i-1] : "";
    my $next   = $i < $#tokens ? $tokens[$i+1] : "";
    my $rspace = 1; # true when a space goes after the current token

    # contractions and possessives join to the left
    if ($token =~ /\A(?:n't|'(?:s|m|re|ll|ve|d))\z/) {
      $lspace = 0;
    } elsif ($token eq "'" && $prev =~ /s$/) {
      $lspace = 0;

    # hyphen only when a hyphen, not a dash
    } elsif ($token eq "-" && $prev =~ /[A-Za-z0-9]$/ && $next =~ /^[A-Za-z0-9]/) {
      $lspace = $rspace = 0;

    # quote marks alternate: an opening quote joins to the right,
    # a closing quote joins to the left
    } elsif ($token eq '"') {
      if ($qflag) {
        $lspace = 0;
      } else {
        $rspace = 0;
      }
      $qflag = !$qflag;

    # period joins on both sides when a decimal point, i.e. the previous
    # token ends with a digit and the next token *starts* with one
    } elsif ($token eq "." && $prev =~ /\d$/ && $next =~ /^\d/) {
      $lspace = $rspace = 0;

    # Left joiners
    } elsif ($token =~ /^[.,:;?!%)\]]$/) {
      $lspace = 0;
    # Right joiners
    } elsif ($token =~ /^[\$(\[]$/) {
      $rspace = 0;
    # Joiners on both sides
    } elsif ($token =~ /^[\/]$/) {
      $lspace = $rspace = 0;
    }

    $output .= " " if $lspace;
    $output .= $token;

    # this token's right-hand spacing is the default left-hand spacing
    # of the token that follows
    $lspace = $rspace;
  }

  return $output;
}

# Filter mode: detokenize each line from the files named on the command
# line (or STDIN) and print the result.  The caller check skips this loop
# when the file is loaded with do/require, so untokenize_line can be
# exercised on its own.
unless (caller) {
  while (my $line = <>) {
    print untokenize_line($line), "\n";
  }
}