summary | refs | log | tree | commit | diff
path: root/word-aligner
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 18:37:31 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 18:37:31 +0000
commit 6f21de07db8631992be2ed01fef3c839ec5aedae (patch)
tree 52100510320895241c67d2f0db3096b4e13db5e5 /word-aligner
parent c1e9b0d59290843498dec2b4821532d0c36eab71 (diff)
generate psgs
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@697 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rwxr-xr-x word-aligner/support/generate_per_sentence_grammars.pl 23
1 files changed, 20 insertions, 3 deletions
diff --git a/word-aligner/support/generate_per_sentence_grammars.pl b/word-aligner/support/generate_per_sentence_grammars.pl
index c644ec6d..730035d8 100755
--- a/word-aligner/support/generate_per_sentence_grammars.pl
+++ b/word-aligner/support/generate_per_sentence_grammars.pl
@@ -4,12 +4,13 @@ use utf8;
die "Usage: $0 f.voc corpus.f-e grammar.f-e.gz\n" unless scalar @ARGV == 3;
-my $MAX_INMEM = 3000;
+my $MAX_INMEM = 2500;
open FV,"<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
open C,"<$ARGV[1]" or die "Can't read $ARGV[1]: $!";
open G,"gunzip -c $ARGV[2]|" or die "Can't read $ARGV[2]: $!";
+binmode STDOUT, ":utf8";
binmode FV, ":utf8";
binmode C, ":utf8";
binmode G, ":utf8";
@@ -34,7 +35,7 @@ while(<G>) {
chomp;
my ($f, $e, $feats) = split / \|\|\| /;
if ($most_freq{$f}) {
- #print "$_\n";
+ print "$_\n";
$memrc++;
} else {
$loadrc++;
@@ -50,15 +51,31 @@ close G;
print STDERR " mem rc: $memrc\n";
print STDERR " load rc: $loadrc\n";
+my $id = 0;
+open O, ">ps.grammar" or die;
+binmode(O,":utf8");
while(<C>) {
my ($f,$e) = split / \|\|\| /;
my @fwords = split /\s+/, $f;
my $tot = 0;
+ my %used;
+ my $fpos = tell(O);
for my $f (@fwords) {
+ next if $most_freq{$f};
+ next if $used{$f};
my $r = $grammar{$f};
die "No translations for: $f" unless $r;
my $num = scalar @$r;
$tot += $num;
+ for my $rule (@$r) {
+ print O "$f ||| $rule\n";
+ }
+ $used{$f} = 1;
}
- print "RULES: $tot\n";
+ print O "###EOS###\n";
+ print STDERR "id=$id POS=$fpos\n";
+ $id++;
+ last if $id == 10;
}
+
+close O;