1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
#!/usr/bin/perl -w
#
# Split a translation grammar into a "permanent" part and per-sentence parts.
#
# Usage: <script> f.voc corpus.f-e grammar.f-e.gz
#
#   f.voc          one source word per line; only the first $MAX_INMEM lines
#                  are read and treated as the frequent vocabulary
#   corpus.f-e     one sentence pair per line, "source ||| target"
#   grammar.f-e.gz gzip-compressed rules, "source ||| target ||| features"
#
# Output:
#   STDOUT      every grammar rule whose source side is in the frequent
#               vocabulary, copied through unchanged
#   ps.grammar  for each processed corpus line, the remaining rules for each
#               distinct non-frequent source word, followed by a line
#               containing "###EOS###"
#   STDERR      progress counters, plus "id=N POS=P" per corpus line where P
#               is the position in ps.grammar at which that sentence's rules
#               start
#
# Dies if a corpus word is neither in the frequent vocabulary nor in the
# grammar.
use strict;
use warnings;
use utf8;

die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
my ($vocab_path, $corpus_path, $grammar_path) = @ARGV;

# Number of vocabulary lines whose rules are passed straight to STDOUT.
my $MAX_INMEM = 2500;
# NOTE(review): the corpus loop has always stopped after this many sentences;
# it looks like a debugging cap -- confirm before raising or removing it.
my $MAX_SENTENCES = 10;
my $OUT_PATH = 'ps.grammar';

# Three-argument opens with lexical handles: the mode can no longer be
# injected through the file name.
open my $vocab_fh, '<', $vocab_path
    or die "Can't read $vocab_path: $!";
open my $corpus_fh, '<', $corpus_path
    or die "Can't read $corpus_path: $!";
# List-form pipe open: the path is handed to gunzip as a single argument and
# never passes through a shell.
open my $grammar_fh, '-|', 'gunzip', '-c', $grammar_path
    or die "Can't read $grammar_path: $!";

binmode STDOUT, ":utf8";
binmode $vocab_fh, ":utf8";
binmode $corpus_fh, ":utf8";
binmode $grammar_fh, ":utf8";

# --- Load the frequent vocabulary -----------------------------------------
my $vc = 0;
my %most_freq = ("<eps>" => 1);
while (my $word = <$vocab_fh>) {
    chomp $word;
    $most_freq{$word} = 1;
    $vc++;
    last if $vc == $MAX_INMEM;
}
close $vocab_fh;
print STDERR "Loaded $vc vocabulary items for permanent translation cache\n";

# --- Partition the grammar --------------------------------------------------
my %grammar;       # source side => array ref of "target ||| features" strings
my $memrc = 0;     # rules copied to STDOUT
my $loadrc = 0;    # rules held back for per-sentence output
while (my $rule = <$grammar_fh>) {
    chomp $rule;
    # Limit of 3 keeps any trailing fields attached to $feats, so rules
    # written to ps.grammar carry the same fields as those sent to STDOUT.
    my ($f, $e, $feats) = split / \|\|\| /, $rule, 3;
    if ($most_freq{$f}) {
        print "$rule\n";
        $memrc++;
    }
    else {
        $loadrc++;
        push @{ $grammar{$f} }, "$e ||| $feats";
    }
}
# close() on a pipe reports the child's exit status; without this check a
# failed or truncated decompression would go unnoticed.
unless (close $grammar_fh) {
    die "Error closing gunzip pipe for $grammar_path: $!" if $!;
    die "gunzip failed on $grammar_path (wait status $?)\n";
}
print STDERR " mem rc: $memrc\n";
print STDERR " load rc: $loadrc\n";

# --- Emit per-sentence grammars ---------------------------------------------
open my $out_fh, '>', $OUT_PATH or die "Can't write $OUT_PATH: $!";
binmode($out_fh, ":utf8");
my $id = 0;
while (my $line = <$corpus_fh>) {
    # Only the source side (first field) of the sentence pair is needed.
    my ($src) = split / \|\|\| /, $line, 2;
    my %used;
    my $fpos = tell($out_fh);
    # split ' ' skips leading whitespace, so a line that starts with blanks
    # does not produce an empty "word" that would abort the run below.
    for my $f (split ' ', $src) {
        next if $most_freq{$f} || $used{$f};
        my $rules = $grammar{$f} or die "No translations for: $f";
        print {$out_fh} "$f ||| $_\n" for @$rules;
        $used{$f} = 1;
    }
    print {$out_fh} "###EOS###\n";
    print STDERR "id=$id POS=$fpos\n";
    $id++;
    last if $id == $MAX_SENTENCES;
}
close $corpus_fh;
# Buffered write errors only surface at close time.
close $out_fh or die "Error writing $OUT_PATH: $!";
|