summaryrefslogtreecommitdiff
path: root/word-aligner/support
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 17:55:19 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 17:55:19 +0000
commit 17f53fcb8b6a18a48444f233eaeff609087035a3 (patch)
tree dd1ff089d984b1fab8bc1ba71d064149b4d183a2 /word-aligner/support
parent ad5a1a959648483f6d0d049af7ce54346c28728f (diff)
try this
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@694 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner/support')
-rw-r--r-- word-aligner/support/generate-per-sentence-grammars.pl 47
1 files changed, 47 insertions, 0 deletions
diff --git a/word-aligner/support/generate-per-sentence-grammars.pl b/word-aligner/support/generate-per-sentence-grammars.pl
new file mode 100644
index 00000000..695cfc17
--- /dev/null
+++ b/word-aligner/support/generate-per-sentence-grammars.pl
@@ -0,0 +1,47 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+# Expects exactly three arguments; see the usage string for their order.
+die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
+# Maximum number of vocabulary lines read from FV into the in-memory set.
+my $MAX_INMEM = 1000;
+# Three-arg / list-form opens: file names never reach a shell or act as modes.
+open FV, '<', $ARGV[0] or die "Can't read $ARGV[0]: $!";
+open C, '<', $ARGV[1] or die "Can't read $ARGV[1]: $!";  # NOTE(review): C is never read in this version
+open G, '-|', 'gunzip', '-c', $ARGV[2] or die "Can't read $ARGV[2]: $!";
+binmode FV, ":utf8";
+binmode C, ":utf8";
+binmode G, ":utf8";
+
+# Read up to $MAX_INMEM lines from FV into %most_freq, used as a set
+# (key present => cached). "<eps>" is always a member and is not counted in $vc.
+my $vc = 0;
+my %most_freq = ("<eps>" => 1);
+while (my $f = <FV>) {
+  chomp $f;
+  $most_freq{$f} = 1;  # single-element store needs the $ sigil, not %
+  $vc++;
+  last if $vc == $MAX_INMEM;
+}
+close FV;
+print STDERR "Loaded $vc vocabulary items for permanent translation cache\n";
+
+# Scan G line by line: rules whose first field ($f) is in %most_freq are only
+# counted; the rest go into %grammar keyed by $f (a repeated $f overwrites).
+my ($memrc, $loadrc) = (0, 0);
+my %grammar;  # must be declared: `use strict` is in effect
+while (<G>) {
+  chomp;
+  my ($f, $e, $feats) = split / \|\|\| /;
+  if ($most_freq{$f}) {
+    $memrc++;
+  } else {
+    $loadrc++;
+    $grammar{$f} = "$e ||| $feats";
+  }
+}
+print STDERR " mem rc: $memrc\n";
+print STDERR " load rc: $loadrc\n";
+
+