#!/usr/bin/perl -w
#
# Splits a gzipped translation grammar into two parts:
#
#   * rules whose source side is one of the first $MAX_INMEM entries of the
#     vocabulary file are printed to STDOUT (the "permanent translation
#     cache" mentioned in the progress message);
#   * all other rules are held in memory and, for every corpus line, the
#     rules for each distinct non-cached source word of that line are
#     written to ps.grammar, followed by a "###EOS###" marker line.
#
# Usage: script f.voc corpus.f-e grammar.f-e.gz
#
# Input lines in the corpus and grammar use " ||| " as the field separator.
use strict;
use warnings;
use utf8;

die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
my ($voc_file, $corpus_file, $grammar_file) = @ARGV;

# Number of vocabulary entries (taken from the top of f.voc) whose rules go
# straight to STDOUT instead of the per-sentence grammar.
my $MAX_INMEM = 2500;

# Hard cap on the number of corpus lines processed.
# NOTE(review): this limit is carried over unchanged from the original code
# and looks like a debugging leftover -- confirm before relying on the
# output covering the whole corpus.
my $MAX_SENTENCES = 10;

my $OUT_FILE = 'ps.grammar';

# Three-argument / list-form opens: file names are never interpreted as open
# modes, and the grammar file name is never passed through a shell.
open my $voc_fh, '<', $voc_file
    or die "Can't read $voc_file: $!";
open my $corpus_fh, '<', $corpus_file
    or die "Can't read $corpus_file: $!";
open my $grammar_fh, '-|', 'gunzip', '-c', $grammar_file
    or die "Can't read $grammar_file: $!";

binmode STDOUT, ":utf8";
binmode $voc_fh, ":utf8";
binmode $corpus_fh, ":utf8";
binmode $grammar_fh, ":utf8";

# --- Load the most frequent vocabulary items -------------------------------
my $vc = 0;
my %most_freq;
# The empty string counts as "cached" so that the empty leading field that
# split /\s+/ yields for a line starting with whitespace is skipped later.
$most_freq{""} = 1;
while (my $word = <$voc_fh>) {
    chomp $word;
    $most_freq{$word} = 1;
    $vc++;
    last if $vc == $MAX_INMEM;
}
close $voc_fh;
print STDERR "Loaded $vc vocabulary items for permanent translation cache\n";

# --- Partition the grammar -------------------------------------------------
# %grammar maps a source phrase to an array ref of "target ||| features"
# strings for every rule that was not sent to STDOUT.
my %grammar;
my $memrc  = 0;    # rules printed to STDOUT
my $loadrc = 0;    # rules kept in %grammar
while (my $line = <$grammar_fh>) {
    chomp $line;
    my ($f, $e, $feats) = split / \|\|\| /, $line;
    if ($most_freq{$f}) {
        print "$line\n";
        $memrc++;
    }
    else {
        $loadrc++;
        push @{ $grammar{$f} }, "$e ||| $feats";
    }
}
# close() on a pipe reports the child's exit status; a failed gunzip means
# the grammar read above may be truncated.
close $grammar_fh
    or warn "gunzip -c $grammar_file did not exit cleanly (status $?); "
          . "grammar may be incomplete\n";
print STDERR "  mem rc: $memrc\n";
print STDERR " load rc: $loadrc\n";

# --- Emit the per-sentence grammar -----------------------------------------
my $id = 0;
open my $out_fh, '>', $OUT_FILE
    or die "Can't write $OUT_FILE: $!";
binmode($out_fh, ":utf8");
while (my $line = <$corpus_fh>) {
    chomp $line;
    # Only the source side (first field) of the corpus line is needed.
    my ($source) = split / \|\|\| /, $line;
    my @fwords = split /\s+/, $source;
    my %used;
    my $fpos = tell($out_fh);    # only referenced by the debug line below
    for my $word (@fwords) {
        next if $most_freq{$word};    # already emitted on STDOUT
        next if $used{$word};         # emit each word once per sentence
        my $rules = $grammar{$word};
        die "No translations for: $word" unless $rules;
        for my $rule (@$rules) {
            print {$out_fh} "$word ||| $rule\n";
        }
        $used{$word} = 1;
    }
    print {$out_fh} "###EOS###\n";
    print STDERR " $line \n";
    #print STDERR "id=$id POS=$fpos\n";
    $id++;
    last if $id == $MAX_SENTENCES;
}
close $corpus_fh;
# Buffered write errors only surface at close time, so check it.
close $out_fh or die "Can't close $OUT_FILE: $!";