From 6368b781d5aad9a70a929b324e48f6673e87a93c Mon Sep 17 00:00:00 2001 From: redpony Date: Thu, 28 Oct 2010 18:02:35 +0000 Subject: fix, rename git-svn-id: https://ws10smt.googlecode.com/svn/trunk@696 ec762483-ff6d-05da-a07a-a48fb63a330f --- .../support/generate-per-sentence-grammars.pl | 48 ---------------- .../support/generate_per_sentence_grammars.pl | 64 ++++++++++++++++++++++ 2 files changed, 64 insertions(+), 48 deletions(-) delete mode 100644 word-aligner/support/generate-per-sentence-grammars.pl create mode 100755 word-aligner/support/generate_per_sentence_grammars.pl (limited to 'word-aligner') diff --git a/word-aligner/support/generate-per-sentence-grammars.pl b/word-aligner/support/generate-per-sentence-grammars.pl deleted file mode 100644 index d621213e..00000000 --- a/word-aligner/support/generate-per-sentence-grammars.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use utf8; - -die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3; - -my $MAX_INMEM = 3000; - -open FV,"<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; -open C,"<$ARGV[1]" or die "Can't read $ARGV[1]: $!"; -open G,"gunzip -c $ARGV[2]|" or die "Can't read $ARGV[2]: $!"; - -binmode FV, ":utf8"; -binmode C, ":utf8"; -binmode G, ":utf8"; - -my $vc = 0; -my %most_freq; -$most_freq{""} = 1; -while(my $f = <FV>) { - chomp $f; - $most_freq{$f}=1; - $vc++; - last if $vc == $MAX_INMEM; -} -close FV; - -print STDERR "Loaded $vc vocabulary items for permanent translation cache\n"; - -my %grammar; -my $memrc = 0; -my $loadrc = 0; -while(<G>) { - chomp; - my ($f, $e, $feats) = split / \|\|\| /; - if ($most_freq{$f}) { - #print "$_\n"; - $memrc++; - } else { - $loadrc++; - $grammar{$f}="$e ||| $feats"; - } -} - -print STDERR " mem rc: $memrc\n"; -print STDERR " load rc: $loadrc\n"; - - diff --git a/word-aligner/support/generate_per_sentence_grammars.pl b/word-aligner/support/generate_per_sentence_grammars.pl new file mode 100755 index 00000000..c644ec6d --- /dev/null +++ 
#!/usr/bin/perl -w
# word-aligner/support/generate_per_sentence_grammars.pl
#
# For every line of a parallel corpus, report how many grammar rules would
# have to be loaded for the source-side words of that line.
#
# Usage: generate_per_sentence_grammars.pl f.voc corpus.f-e grammar.f-e.gz
#
#   f.voc           one source word per line; only the first $MAX_INMEM
#                   lines are read and they form the "permanent translation
#                   cache" (presumably the file is sorted by frequency --
#                   TODO confirm against whatever produces it)
#   corpus.f-e      one "source ||| target" pair per line
#   grammar.f-e.gz  gzip-compressed rules, one "f ||| e ||| feats" per line
#
# Output: one "RULES: <n>" line on STDOUT per corpus line; load statistics
# go to STDERR.  Dies on unreadable input, on a failing gunzip, and on a
# corpus word that is neither cached nor present in the grammar.
use strict;
use warnings;
use utf8;

die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
my ($voc_file, $corpus_file, $grammar_file) = @ARGV;

# Maximum number of vocabulary entries kept in the permanent cache.
my $MAX_INMEM = 3000;

# Three-argument open with lexical handles: file names are never parsed for
# mode characters, and the handles are scoped to this script.
open my $voc_fh, '<:utf8', $voc_file
    or die "Can't read $voc_file: $!";
open my $corpus_fh, '<:utf8', $corpus_file
    or die "Can't read $corpus_file: $!";

# List-form pipe open runs gunzip directly, so the grammar file name is
# passed as a single argument and never interpreted by a shell.
open my $grammar_fh, '-|', 'gunzip', '-c', $grammar_file
    or die "Can't read $grammar_file: $!";
binmode $grammar_fh, ':utf8';

# Diagnostics may quote non-ASCII corpus words.
binmode STDERR, ':utf8';

# --- Load the permanent cache vocabulary -----------------------------------
my $vc = 0;
my %most_freq = ('' => 1);    # the empty source side always counts as cached
while (my $word = <$voc_fh>) {
    chomp $word;
    $most_freq{$word} = 1;
    $vc++;
    last if $vc == $MAX_INMEM;
}
close $voc_fh;

print STDERR "Loaded $vc vocabulary items for permanent translation cache\n";

# --- Index the grammar by source side --------------------------------------
# %grammar maps a source side to an array ref of "e ||| feats" strings.
# Rules whose source side is cached are only counted, not stored.
my %grammar;
my $memrc  = 0;
my $loadrc = 0;
while (my $rule = <$grammar_fh>) {
    chomp $rule;
    my ($f, $e, $feats) = split / \|\|\| /, $rule;

    # split drops trailing empty fields; treat missing fields as empty
    # strings so short lines behave as before without undef warnings.
    for my $field ($f, $e, $feats) {
        $field = '' unless defined $field;
    }

    if ($most_freq{$f}) {
        $memrc++;
        next;
    }
    $loadrc++;
    push @{ $grammar{$f} }, "$e ||| $feats";
}

# Closing a pipe reports the child's exit status; without this check a
# failed gunzip (missing or corrupt file) would look like an empty grammar.
unless (close $grammar_fh) {
    die $!
        ? "Error closing gunzip pipe for $grammar_file: $!\n"
        : "gunzip exited with status " . ($? >> 8) . " for $grammar_file\n";
}

print STDERR " mem rc: $memrc\n";
print STDERR " load rc: $loadrc\n";

# --- Count loadable rules per corpus line ----------------------------------
while (my $pair = <$corpus_fh>) {
    chomp $pair;
    my ($source) = split / \|\|\| /, $pair;
    $source = '' unless defined $source;

    my $tot = 0;

    # split ' ' ignores leading whitespace, so no empty pseudo-word is
    # produced for lines that start with a space.
    for my $word (split ' ', $source) {

        # Cached words were deliberately never stored in %grammar; looking
        # them up would always fail, so they contribute no loadable rules.
        next if $most_freq{$word};

        my $rules = $grammar{$word}
            or die "No translations for: $word";
        $tot += scalar @$rules;
    }
    print "RULES: $tot\n";
}
close $corpus_fh;