summaryrefslogtreecommitdiff
path: root/word-aligner/support/make_lex_grammar.pl
blob: 47d4d945828d9a9418062d971328a5db20a7775a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/perl -w
use utf8;
use strict;

# Number of forward-table entries a source word may have before pairs from
# the inverse model stop being merged in for that word.
my $LIMIT_SIZE=30;

# Command line: the parallel corpus followed by the two model1 files. All
# three must name existing regular files.
my ($effile, $model1, $imodel1) = @ARGV;
die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1;

# When true, the pseudo-word '<eps>' is appended to the source side of every
# sentence pair read from the corpus.
my $ADD_NULL = 1;

# Three-argument open: characters such as '>' or '|', or surrounding
# whitespace, in a file name are taken literally instead of being read as
# part of the open mode. Failures report which file and why.
open(EF, '<', $effile) or die "Cannot open corpus '$effile': $!";
open(M1, '<', $model1) or die "Cannot open model1 '$model1': $!";
open(IM1, '<', $imodel1) or die "Cannot open inverse model1 '$imodel1': $!";

# Inputs are decoded from, and the generated grammar is encoded to, UTF-8.
binmode(EF,":utf8");
binmode(M1,":utf8");
binmode(IM1,":utf8");
binmode(STDOUT,":utf8");
# Forward table: $model1{source}{target} = 1 for every pair listed in the
# model1 file. %sizes counts the lines read per source word.
my (%model1, %sizes);
print STDERR "Reading model1...\n";
while (defined(my $line = <M1>)) {
  chomp $line;
  # Columns are source word, target word, and a third value that is not used.
  my ($src, $tgt) = split /\s+/, $line;
  $model1{$src}{$tgt} = 1;
  $sizes{$src} += 1;
}
close M1;

# Inverse table: $invm1{target}{source} = 1 for every pair in the inverse
# model1 file, with %esizes counting lines per target word. A pair is also
# copied into the forward table when its source word currently has fewer than
# $LIMIT_SIZE entries and the pair is not there yet; $inv_add counts those
# copies.
my $inv_add = 0;
my (%invm1, %esizes);
print STDERR "Reading inverse model1...\n";
while (defined(my $line = <IM1>)) {
  chomp $line;
  my ($tgt, $src) = split /\s+/, $line;
  $invm1{$tgt}{$src} = 1;
  $esizes{$tgt} += 1;

  # This source word already has enough forward entries.
  next if ($sizes{$src} || 0) >= $LIMIT_SIZE;
  # The forward table already contains this pair.
  next if defined $model1{$src}{$tgt};

  $model1{$src}{$tgt} = 1;
  $sizes{$src} += 1;
  $inv_add += 1;
}
close IM1;
print STDERR "Added $inv_add from inverse model1\n";

print STDERR "Generating grammars...\n";

# Co-occurrence counts over the corpus: $fdict{source}{target} is incremented
# once for every (source token, target token) pairing within a sentence pair.
# Each corpus line has the form "source words ||| target words".
my %fdict;
while (my $line = <EF>) {
  chomp $line;
  my ($f, $e) = split /\s*\|\|\|\s*/, $line;
  # A blank line, or a line without '|||', leaves one or both sides undefined.
  # Treat such a side as empty so that the splits below do not raise
  # uninitialized-value warnings under -w; the resulting word lists are the
  # same as before.
  $f = '' unless defined $f;
  $e = '' unless defined $e;
  my @es = split /\s+/, $e;
  my @fs = split /\s+/, $f;
  for my $ew (@es){
    die "E: Empty word" if $ew eq '';
  }
  push @fs, '<eps>' if $ADD_NULL;
  # Validate the whole source side first so the error message can show the
  # 1-based position of the empty token together with the full token list.
  my $i = 0;
  for my $fw (@fs){
    $i++;
    die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
  }
  for my $fw (@fs){
    for my $ew (@es){
      $fdict{$fw}->{$ew}++;
    }
  }
}
# Release the corpus handle, as is done for the two model files.
close EF;

# Additional evidence table, indexed as $model4{source}{target}. Nothing in
# this script fills it: the only code that would is the commented-out reader
# below, so every lookup in %model4 currently yields undef.
my %model4;
# Disabled reader, kept for reference. It consumes records of three lines from
# a handle named M4 (which this script never opens), requires the third line
# to start with 'NULL ({', and adds one count to $model4{word}{aligned_word}
# for each aligned position, mapping the word 'NULL' to '<eps>'.
#while(<M4>) {
#  my $en = <M4>; chomp $en;
#  my $zh = <M4>; chomp $zh;
#  die unless $zh =~ /^NULL \({/;
#  my @ewords = split /\s+/, $en;
#  my @chunks = split /\}\) ?/, $zh;
#
#  for my $c (@chunks) {
#    my ($zh, $taps) = split / \(\{ /, $c;
#    if ($zh eq 'NULL') { $zh = '<eps>'; }
#    my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
#    #print "$zh -> @aps\n";
#    for my $ap (@aps) {
#      $model4{$zh}->{$ap} += 1;
#    }
#  }
#}
#close M4;

# Emit one rule "source ||| target ||| X=0" for every admissible pair that
# co-occurs in the corpus. A pair is admissible when it is present in %model1
# (which at this point also holds the pairs merged from the inverse model) or
# in %model4, or when source and target are the same string.
for my $f (sort keys %fdict) {
  my $count_of = $fdict{$f};

  # Most frequent target first. Ties are broken by the target word itself so
  # that the output order is reproducible and does not depend on Perl's
  # randomized hash ordering.
  my @targets = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
                keys %$count_of;

  for my $e (@targets) {
    # Test the outer key first so that looking up an unknown source word does
    # not create an empty entry in the table.
    my $in_m1 = exists $model1{$f} && defined $model1{$f}{$e};
    my $in_m4 = exists $model4{$f} && defined $model4{$f}{$e};
    next unless $in_m1 || $in_m4 || $e eq $f;
    print "$f ||| $e ||| X=0\n";
  }
}