blob: 2f79a94e7343ef8cc51d5179bbe2fd0a44349434 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
#!/usr/bin/perl
#
# Prefix-truncation "stemmer" filter.
#
# Reads whitespace-tokenized UTF-8 text from STDIN and writes one output
# line per input line to STDOUT, with every token replaced by a prefix of
# itself (see stem_word for the length rule).
#
# Usage:
#   script < text            each line becomes the space-joined stems
#   script --vocab < words   each line must hold exactly one word; the
#                            output is the input line, a space, then the stem
#
# Any argument other than a single '--vocab' is a fatal error.
use strict;
use warnings;
use utf8;

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

my $vocab = 0;
if (@ARGV) {
    die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1);
    $vocab = 1;
}

# Cache of word => stem so each distinct word is computed only once.
my %stem_of;

# Return the stem of $word: its first 5 characters, plus one extra character
# if the word contains ndz/ndr/nts/ntr anywhere, plus one more if the word
# starts with mp/mb/nd.  Words shorter than that are returned unchanged.
sub stem_word {
    my ($word) = @_;

    my $keep = 5;
    $keep++ if $word =~ /(?:ndz|ndr|nts|ntr)/;
    $keep++ if $word =~ /^(?:mp|mb|nd)/;

    # substr stops at the end of the string, so no explicit clamp is needed.
    return substr $word, 0, $keep;
}

while (my $line = <STDIN>) {
    chomp $line;

    # split ' ' (awk mode) skips leading whitespace.  split /\s+/ would
    # produce an empty first token for an indented line, which emitted a
    # spurious leading space and made --vocab reject a valid one-word line.
    my @words = split ' ', $line;

    my @out;
    for my $word (@words) {
        $stem_of{$word} = stem_word($word) unless defined $stem_of{$word};
        push @out, $stem_of{$word};
    }

    if ($vocab) {
        die "Expected exactly one word per line with --vocab: $line" unless scalar @out == 1;
        print "$line @out\n";
    } else {
        print "@out\n";
    }
}
|