summaryrefslogtreecommitdiff
path: root/word-aligner/stemmers/rw.pl
blob: 6d873b40de7db3e769734cdfdd6e096ed874fec1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/perl -w

use strict;
use utf8;

binmode(STDIN, ":utf8");
binmode(STDOUT,":utf8");

# Command-line handling. The script accepts either no arguments at all or
# exactly one argument, `--vocab`; anything else is a fatal error.
# $vocab stays undef by default and is set to 1 when --vocab is given.
my $vocab = undef;
if (@ARGV) {
  unless (@ARGV == 1 && $ARGV[0] eq '--vocab') {
    die "Only allow --vocab";
  }
  $vocab = 1;
}

# Cache from each token already seen to its stem, so a repeated token is
# only stemmed once.
my %dict;

# Returns the stem of a single token: its first 5 characters, or its first 6
# when the token contains one of the digraphs ny, jy, nk, nt, sh or cy
# anywhere in it. Tokens shorter than that limit are returned unchanged.
# The result is memoized in %dict.
sub _stem_word {
  my ($w) = @_;
  my $tw = $dict{$w};
  if (!defined $tw) {
    my $el = ($w =~ /(?:ny|jy|nk|nt|sh|cy)/) ? 6 : 5;
    # substr clamps a length that runs past the end of the string, so no
    # explicit length check is needed for short tokens.
    $tw = substr $w, 0, $el;
    $dict{$w} = $tw;
  }
  return $tw;
}

# Splits a line into whitespace-separated tokens and returns the list of
# their stems, in order. An empty or all-whitespace line yields an empty list.
sub _stem_line {
  my ($line) = @_;
  # split ' ' (awk mode) skips leading whitespace; split /\s+/ would instead
  # produce an empty first field for a line that starts with whitespace.
  return map { _stem_word($_) } split ' ', $line;
}

# Main loop: read tokenized text from STDIN line by line.
#   default mode: print the stems of the line's tokens, space-separated.
#   --vocab mode: each line must hold exactly one token; print "token stem".
while (my $line = <STDIN>) {
  chomp $line;
  my @out = _stem_line($line);
  if ($vocab) {
    die "Expected exactly one word per line with --vocab: $line" unless scalar @out == 1;
    # Echo the parsed token rather than the raw line so that surrounding
    # whitespace (or a stray carriage return) does not leak into the output.
    my ($word) = split ' ', $line;
    print "$word @out\n";
  } else {
    print "@out\n";
  }
}