diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-28 21:10:12 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-28 21:10:12 +0000 |
commit | 14b4d7dff699259bc5e606fa0d5beb77001e32fb (patch) | |
tree | 098afdaf5f635361eca60ceec3259849958809fe /word-aligner | |
parent | 0b528dff0ff3dec0dd7a65b6cda84483092dacfb (diff) |
psg for lex trans
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@699 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rwxr-xr-x | word-aligner/support/generate_per_sentence_grammars.pl | 28 |
1 files changed, 17 insertions, 11 deletions
diff --git a/word-aligner/support/generate_per_sentence_grammars.pl b/word-aligner/support/generate_per_sentence_grammars.pl
index 80243419..8779ac9c 100755
--- a/word-aligner/support/generate_per_sentence_grammars.pl
+++ b/word-aligner/support/generate_per_sentence_grammars.pl
@@ -2,7 +2,7 @@
 use strict;
 use utf8;
 
-die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
+die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz [OUT]filtered.f-e.gz [OUT]per_sentence_grammar.f-e [OUT]train.f-e.sgml\n" unless scalar @ARGV == 6;
 
 my $MAX_INMEM = 2500;
 
@@ -10,7 +10,14 @@ open FV,"<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
 open C,"<$ARGV[1]" or die "Can't read $ARGV[1]: $!";
 open G,"gunzip -c $ARGV[2]|" or die "Can't read $ARGV[2]: $!";
 
+open FILT,"|gzip -c > $ARGV[3]" or die "Can't write $ARGV[3]: $!";
+open PSG,">$ARGV[4]" or die "Can't write $ARGV[4]: $!";
+open OTRAIN,">$ARGV[5]" or die "Can't write $ARGV[5]: $!";
+
+binmode FILT, ":utf8";
+binmode PSG, ":utf8";
 binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
 binmode FV, ":utf8";
 binmode C, ":utf8";
 binmode G, ":utf8";
@@ -35,7 +42,7 @@ while(<G>) {
   chomp;
   my ($f, $e, $feats) = split / \|\|\| /;
   if ($most_freq{$f}) {
-    print "$_\n";
+    print FILT "$_\n";
     $memrc++;
   } else {
     $loadrc++;
@@ -47,20 +54,19 @@ while(<G>) {
     push @$r, "$e ||| $feats";
   }
 }
+close FILT;
 close G;
 print STDERR " mem rc: $memrc\n";
 print STDERR " load rc: $loadrc\n";
 
 my $id = 0;
-open O, ">ps.grammar" or die;
-binmode(O,":utf8");
 while(<C>) {
   chomp;
   my ($f,$e) = split / \|\|\| /;
   my @fwords = split /\s+/, $f;
   my $tot = 0;
   my %used;
-  my $fpos = tell(O);
+  my $fpos = tell(PSG);
   for my $f (@fwords) {
     next if $most_freq{$f};
     next if $used{$f};
@@ -69,15 +75,15 @@ while(<C>) {
     my $num = scalar @$r;
     $tot += $num;
     for my $rule (@$r) {
-      print O "$f ||| $rule\n";
+      print PSG "$f ||| $rule\n";
     }
     $used{$f} = 1;
   }
-  print O "###EOS###\n";
-  print STDERR "<seg id=\"$id\" grammar=\"\@$fpos\"> $_ </seg>\n";
-  #print STDERR "id=$id POS=$fpos\n";
+  print PSG "###EOS###\n";
+  print OTRAIN "<seg id=\"$id\" psg=\"\@$fpos\"> $_ </seg>\n";
   $id++;
-  last if $id == 10;
 }
+close PSG;
+close OTRAIN;
+print STDERR "Done.\n";
 
-close O;