psg (per-sentence grammar) for lex trans

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@699 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 21:10:12 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 21:10:12 +0000
commit: 14b4d7dff699259bc5e606fa0d5beb77001e32fb (patch)
tree: 098afdaf5f635361eca60ceec3259849958809fe /word-aligner/support
parent: 0b528dff0ff3dec0dd7a65b6cda84483092dacfb (diff)
1 file changed, 17 insertions, 11 deletions
diff --git a/word-aligner/support/generate_per_sentence_grammars.pl b/word-aligner/support/generate_per_sentence_grammars.pl
index 80243419..8779ac9c 100755
--- a/word-aligner/support/generate_per_sentence_grammars.pl
+++ b/word-aligner/support/generate_per_sentence_grammars.pl
@@ -2,7 +2,7 @@
 use strict;
 use utf8;
 
-die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
+die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz [OUT]filtered.f-e.gz [OUT]per_sentence_grammar.f-e [OUT]train.f-e.sgml\n" unless scalar @ARGV == 6;
 
 my $MAX_INMEM = 2500;
 
@@ -10,7 +10,14 @@ open FV,"<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
 open C,"<$ARGV[1]" or die "Can't read $ARGV[1]: $!";
 open G,"gunzip -c $ARGV[2]|" or die "Can't read $ARGV[2]: $!";
 
+open FILT,"|gzip -c > $ARGV[3]" or die "Can't write $ARGV[3]: $!";
+open PSG,">$ARGV[4]" or die "Can't write $ARGV[4]: $!";
+open OTRAIN,">$ARGV[5]" or die "Can't write $ARGV[5]: $!";
+
+binmode FILT, ":utf8";
+binmode PSG, ":utf8";
 binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
 binmode FV, ":utf8";
 binmode C, ":utf8";
 binmode G, ":utf8";
@@ -35,7 +42,7 @@ while(<G>) {
   chomp;
   my ($f, $e, $feats) = split / \|\|\| /;
   if ($most_freq{$f}) {
-    print "$_\n";
+    print FILT "$_\n";
     $memrc++;
   } else {
     $loadrc++;
@@ -47,20 +54,19 @@ while(<G>) {
     push @$r, "$e ||| $feats";
   }
 }
+close FILT;
 close G;
 print STDERR "  mem rc: $memrc\n";
 print STDERR " load rc: $loadrc\n";
 
 my $id = 0;
-open O, ">ps.grammar" or die;
-binmode(O,":utf8");
 while(<C>) {
   chomp;
   my ($f,$e) = split / \|\|\| /;
   my @fwords = split /\s+/, $f;
   my $tot = 0;
   my %used;
-  my $fpos = tell(O);
+  my $fpos = tell(PSG);
   for my $f (@fwords) {
     next if $most_freq{$f};
     next if $used{$f};
@@ -69,15 +75,15 @@ while(<C>) {
     my $num = scalar @$r;
     $tot += $num;
     for my $rule (@$r) {
-      print O "$f ||| $rule\n";
+      print PSG "$f ||| $rule\n";
     }
     $used{$f} = 1;
   }
-  print O "###EOS###\n";
-  print STDERR "<seg id=\"$id\" grammar=\"\@$fpos\"> $_ </seg>\n";
-  #print STDERR "id=$id POS=$fpos\n";
+  print PSG "###EOS###\n";
+  print OTRAIN "<seg id=\"$id\" psg=\"\@$fpos\"> $_ </seg>\n";
   $id++;
-  last if $id == 10;
 }
+close PSG;
+close OTRAIN;
+print STDERR "Done.\n";
 
-close O;
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-28 21:10:12 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-28 21:10:12 +0000
commit	14b4d7dff699259bc5e606fa0d5beb77001e32fb (patch)
tree	098afdaf5f635361eca60ceec3259849958809fe /word-aligner/support
parent	0b528dff0ff3dec0dd7a65b6cda84483092dacfb (diff)