blob: ab7371211b898f4083ed693b70797d1b9c32379c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#!/usr/bin/perl -w
use strict;
use utf8;
# Extract character-split word pairs from word-aligned parallel text.
#
# Input (STDIN, UTF-8), one sentence pair per line:
#     foreign sentence ||| english sentence ||| 0-0 1-1 ...
# Output (STDOUT, UTF-8), one line per retained word pair, with each word
# split into space-separated characters:
#     f o r e i g n ||| e n g l i s h
#
# Only one-to-one alignment links between words of at least
# $MIN_WORD_LEN characters are counted.

# Pairs whose association score is below this value are not printed.
my $MIN_PMI = -3;
# Words shorter than this many characters are ignored on either side.
my $MIN_WORD_LEN = 4;

my %fs;    # foreign word -> number of counted links it took part in
my %es;    # english word -> number of counted links it took part in
my %ef;    # foreign word -> { english word -> joint link count }

die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0;
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Parse an alignment string such as "0-0 1-2 3-2" and return the links in
# which both endpoints are aligned exactly once, as a list of
# [foreign_index, english_index] array refs sorted by foreign index.
# Tokens that are not of the form "<digits>-<digits>" are reported on STDERR
# and skipped, so they can never be misread as index 0.
sub one_to_one_links {
    my ($alsent) = @_;
    my %f2e;
    my %e2f;
    # split ' ' ignores leading whitespace, so no empty first token appears.
    for my $point (split ' ', $alsent) {
        unless ($point =~ /\A([0-9]+)-([0-9]+)\z/) {
            warn "Skipping malformed alignment point '$point'";
            next;
        }
        # "+ 0" normalises e.g. "01" and "1" to the same hash key.
        my ($fi, $ei) = ($1 + 0, $2 + 0);
        $f2e{$fi}{$ei} = 1;
        $e2f{$ei}{$fi} = 1;
    }
    my @links;
    for my $fi (sort { $a <=> $b } keys %f2e) {
        my @eis = keys %{ $f2e{$fi} };
        next unless scalar(@eis) == 1;
        my $ei = $eis[0];
        next unless scalar(keys %{ $e2f{$ei} }) == 1;
        push @links, [ $fi, $ei ];
    }
    return @links;
}

# Association score compared against $MIN_PMI:
#     log c(e,f) - (log c(e) + log c(f))
# Unlike textbook PMI, no log(total) term is added, so the value is never
# positive (c(e,f) cannot exceed c(e), and c(f) >= 1). The $MIN_PMI
# threshold is applied to this quantity as-is.
sub association_score {
    my ($joint, $f_count, $e_count) = @_;
    return log($joint) - (log($e_count) + log($f_count));
}

my $tot = 0;    # number of counted links over the whole input
print STDERR "Reading alignments from STDIN ...\n";
while (my $line = <STDIN>) {
    chomp $line;
    my ($fsent, $esent, $alsent) = split / \|\|\| /, $line;
    die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent;
    # split ' ' keeps word indices in step with the alignment even when a
    # segment starts with whitespace.
    my @fws = split ' ', $fsent;
    my @ews = split ' ', $esent;
    for my $link (one_to_one_links($alsent)) {
        my ($fi, $ei) = @$link;
        my $f = $fws[$fi];
        my $e = $ews[$ei];
        # An index past the end of either sentence yields undef: skip it.
        next unless defined $f && defined $e;
        next unless length($f) >= $MIN_WORD_LEN;
        next unless length($e) >= $MIN_WORD_LEN;
        $ef{$f}{$e}++;
        $es{$e}++;
        $fs{$f}++;
        $tot++;
    }
}
# Empty input (or input with no usable links) is not an error: report it and
# fall through to print zero pairs instead of dying on log(0).
print STDERR "Warning: no usable one-to-one alignment links were found.\n" if $tot == 0;

my $num = 0;
print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n";
for my $f (keys %fs) {
    my $e_counts = $ef{$f};
    for my $e (keys %$e_counts) {
        my $score = association_score($e_counts->{$e}, $fs{$f}, $es{$e});
        next if $score < $MIN_PMI;
        # Emit each word as space-separated characters.
        my @flets = split //, $f;
        my @elets = split //, $e;
        print "@flets ||| @elets\n";
        $num++;
    }
}
print STDERR "Extracted $num pairs.\n";
print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n";
|