diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-28 17:55:19 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-28 17:55:19 +0000 |
commit | 6548dbd9e6421b79899384a748bd356ff126cff3 (patch) | |
tree | 8330b076c809bda8fd9b7c35d4b2a34889a0aabc /word-aligner | |
parent | 22e05bf807ad59bfad38fcdf35bb51524034e23b (diff) |
try this
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@694 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rw-r--r-- | word-aligner/support/generate-per-sentence-grammars.pl | 47 |
1 files changed, 47 insertions, 0 deletions
#!/usr/bin/perl -w
# word-aligner/support/generate-per-sentence-grammars.pl
# (added as a new file, mode 100644, blob 695cfc17)
#
# Usage: generate-per-sentence-grammars.pl f.voc corpus.f-e grammar.f-e.gz
#
# Reads at most $MAX_INMEM entries from the vocabulary file into a lookup
# set (the "permanent translation cache"), then streams the gzipped grammar
# and counts how many rules have a source side in that set ("mem rc") and
# how many do not ("load rc").  Rules of the second kind are stored in
# %grammar, keyed by source side.
#
# All output is diagnostic and goes to STDERR.  The corpus file is opened,
# so an unreadable path is reported, but it is not read by the code below.
use strict;
use warnings;
use utf8;

die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;

my ($vocab_path, $corpus_path, $grammar_path) = @ARGV;

# Upper bound on the number of vocabulary lines loaded into %most_freq.
my $MAX_INMEM = 1000;

# Three-argument open with lexical handles: the mode can never be taken
# from the file name, and the handles are scoped to this file.
open my $vocab_fh, '<', $vocab_path
    or die "Can't read $vocab_path: $!";
open my $corpus_fh, '<', $corpus_path
    or die "Can't read $corpus_path: $!";

# List-form pipe open: gunzip receives the path as one argument and no shell
# is involved, so spaces or metacharacters in the path are harmless.
open my $grammar_fh, '-|', 'gunzip', '-c', $grammar_path
    or die "Can't read $grammar_path: $!";

binmode $vocab_fh,   ":utf8";
binmode $corpus_fh,  ":utf8";
binmode $grammar_fh, ":utf8";

# Set of source words whose rules count as "in memory".  "<eps>" is always
# a member and is not included in $vc.
my $vc = 0;
my %most_freq;
$most_freq{"<eps>"} = 1;
while (my $f = <$vocab_fh>) {
    chomp $f;
    $most_freq{$f} = 1;    # scalar element store ($), not a %-slice
    $vc++;
    last if $vc == $MAX_INMEM;
}
close $vocab_fh;

print STDERR "Loaded $vc vocabulary items for permanent translation cache\n";

# Rules whose source side is outside %most_freq, stored as "e ||| feats".
# NOTE(review): a source side that occurs on several grammar lines keeps only
# the last rule seen -- confirm whether every translation should be retained.
my %grammar;
my $memrc  = 0;    # rules whose source side is in %most_freq
my $loadrc = 0;    # rules whose source side is not
while (my $rule = <$grammar_fh>) {
    chomp $rule;
    my ($f, $e, $feats) = split / \|\|\| /, $rule;

    # A blank line yields no fields at all; it carries no rule, so skip it
    # instead of counting it under an undefined key.
    next unless defined $f;

    if ($most_freq{$f}) {
        $memrc++;
    } else {
        $loadrc++;
        # Missing fields become empty strings, which is what interpolating
        # undef would produce anyway, minus the uninitialized-value warning.
        $e     = '' unless defined $e;
        $feats = '' unless defined $feats;
        $grammar{$f} = "$e ||| $feats";
    }
}

# Closing a pipe waits for the child.  A false return with $! unset means
# gunzip itself exited non-zero (missing or corrupt archive), which would
# otherwise look like an empty grammar.
unless (close $grammar_fh) {
    die $!
        ? "Error closing gunzip pipe for $grammar_path: $!\n"
        : "gunzip failed on $grammar_path (exit status " . ($? >> 8) . ")\n";
}

print STDERR " mem rc: $memrc\n";
print STDERR " load rc: $loadrc\n";