diff options
Diffstat (limited to 'word-aligner/support/generate-per-sentence-grammars.pl')
-rw-r--r-- | word-aligner/support/generate-per-sentence-grammars.pl | 48 |
1 files changed, 0 insertions, 48 deletions
#!/usr/bin/perl -w
use strict;
use utf8;

# generate-per-sentence-grammars.pl
#
# Usage: generate-per-sentence-grammars.pl f.voc corpus.f-e grammar.f-e.gz
#
# Reads up to $MAX_INMEM entries from the vocabulary file into an in-memory
# set, then streams the gzip-compressed grammar and splits its rules into two
# groups by source side:
#   * rules whose source side is in the vocabulary set are only counted;
#   * all other rules are stored in %grammar, keyed by source side (a later
#     rule with the same source side overwrites an earlier one).
# The two counts are reported on STDERR.
#
# NOTE(review): the corpus file is opened but never read, and %grammar is
# filled but never consumed -- the script looks unfinished.  Both are kept so
# that the command line and the failure behaviour stay as they were.

die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;

my ($voc_path, $corpus_path, $grammar_path) = @ARGV;

# Maximum number of vocabulary lines that are loaded into the in-memory set.
my $MAX_INMEM = 3000;

# Three-argument open with lexical handles: the path is used verbatim, with no
# mode characters or surrounding whitespace interpreted out of the file name.
open my $voc_fh, '<', $voc_path
    or die "Can't read $voc_path: $!";
open my $corpus_fh, '<', $corpus_path
    or die "Can't read $corpus_path: $!";

# List-form pipe open runs gunzip without a shell, so spaces or shell
# metacharacters in the grammar path cannot alter the command.
open my $grammar_fh, '-|', 'gunzip', '-c', $grammar_path
    or die "Can't read $grammar_path: $!";

binmode $voc_fh,     ":utf8";
binmode $corpus_fh,  ":utf8";
binmode $grammar_fh, ":utf8";

# Load the vocabulary set; "<eps>" is always a member and is not counted.
my $vc = 0;
my %most_freq;
$most_freq{"<eps>"} = 1;
while (my $word = <$voc_fh>) {
    chomp $word;
    $most_freq{$word} = 1;
    $vc++;
    last if $vc == $MAX_INMEM;
}
close $voc_fh;

print STDERR "Loaded $vc vocabulary items for permanent translation cache\n";

my %grammar;
my $memrc   = 0;    # rules whose source side is in %most_freq
my $loadrc  = 0;    # rules stored in %grammar
my $skipped = 0;    # lines without the three " ||| "-separated fields
while (my $rule = <$grammar_fh>) {
    chomp $rule;
    my ($src, $trg, $feats) = split / \|\|\| /, $rule;

    # A line lacking any of the three fields would otherwise trigger
    # undefined-value warnings and store a meaningless " ||| " entry.
    unless (defined $feats) {
        $skipped++;
        next;
    }

    if ($most_freq{$src}) {
        #print "$rule\n";
        $memrc++;
    }
    else {
        $loadrc++;
        $grammar{$src} = "$trg ||| $feats";
    }
}

# close() on a pipe handle waits for the child and returns false if gunzip
# exited non-zero; without this check a failed decompression would silently
# produce zero (or partial) counts.
unless (close $grammar_fh) {
    die "Can't read $grammar_path: $!" if $!;
    if (($? >> 8) == 2 && ($? & 127) == 0) {
        # gzip uses exit status 2 for warnings; the data was still produced.
        warn "gunzip reported a warning while reading $grammar_path\n";
    }
    else {
        die "gunzip failed while reading $grammar_path (wait status $?)\n";
    }
}
close $corpus_fh;

print STDERR "  mem rc: $memrc\n";
print STDERR " load rc: $loadrc\n";
print STDERR " skipped: $skipped malformed grammar line(s)\n" if $skipped;