From 2609926eaeb4afb16a787f2abc0de61915ff4ecf Mon Sep 17 00:00:00 2001
From: redpony
Date: Thu, 28 Oct 2010 18:37:31 +0000
Subject: generate psgs

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@697 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 .../support/generate_per_sentence_grammars.pl | 23 +++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'word-aligner/support')

diff --git a/word-aligner/support/generate_per_sentence_grammars.pl b/word-aligner/support/generate_per_sentence_grammars.pl
index c644ec6d..730035d8 100755
--- a/word-aligner/support/generate_per_sentence_grammars.pl
+++ b/word-aligner/support/generate_per_sentence_grammars.pl
@@ -4,12 +4,13 @@ use utf8;
 
 die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
 
-my $MAX_INMEM = 3000;
+my $MAX_INMEM = 2500;
 
 open FV,"<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
 open C,"<$ARGV[1]" or die "Can't read $ARGV[1]: $!";
 open G,"gunzip -c $ARGV[2]|" or die "Can't read $ARGV[2]: $!";
 
+binmode STDOUT, ":utf8";
 binmode FV, ":utf8";
 binmode C, ":utf8";
 binmode G, ":utf8";
@@ -34,7 +35,7 @@ while(<G>) {
   chomp;
   my ($f, $e, $feats) = split / \|\|\| /;
   if ($most_freq{$f}) {
-    #print "$_\n";
+    print "$_\n";
     $memrc++;
   } else {
     $loadrc++;
@@ -50,15 +51,31 @@ close G;
 
 print STDERR " mem rc: $memrc\n";
 print STDERR " load rc: $loadrc\n";
+my $id = 0;
+open O, ">ps.grammar" or die;
+binmode(O,":utf8");
 while(<C>) {
   my ($f,$e) = split / \|\|\| /;
   my @fwords = split /\s+/, $f;
   my $tot = 0;
+  my %used;
+  my $fpos = tell(O);
   for my $f (@fwords) {
+    next if $most_freq{$f};
+    next if $used{$f};
     my $r = $grammar{$f};
     die "No translations for: $f" unless $r;
     my $num = scalar @$r;
     $tot += $num;
+    for my $rule (@$r) {
+      print O "$f ||| $rule\n";
+    }
+    $used{$f} = 1;
   }
-  print "RULES: $tot\n";
+  print O "###EOS###\n";
+  print STDERR "id=$id POS=$fpos\n";
+  $id++;
+  last if $id == 10;
 }
+
+close O;
-- 
cgit v1.2.3