summaryrefslogtreecommitdiff
path: root/gi/pf/guess-translits.pl
blob: aafec13a3fa750341702b6e0d015b43774b2c68f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/perl -w
use strict;
use utf8;

my $MIN_PMI = -3;

my %fs;
my %es;
my %ef;

die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0;

binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");

my $tot = 0;
print STDERR "Reading alignments from STDIN ...\n";
while(<STDIN>) {
  chomp;
  my ($fsent, $esent, $alsent) = split / \|\|\| /;
  die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent;

  my @fws = split /\s+/, $fsent;  
  my @ews = split /\s+/, $esent;
  my @as = split /\s+/, $alsent;
  my %a2b;
  my %b2a;
  for my $ap (@as) {
    my ($a,$b) = split /-/, $ap;
    die "BAD INPUT: $_\n" unless defined $a && defined $b;
    $a2b{$a}->{$b} = 1;
    $b2a{$b}->{$a} = 1;
  }
  for my $a (keys %a2b) {
    my $bref = $a2b{$a};
    next unless scalar keys %$bref < 2;
    my $b = (keys %$bref)[0];
    next unless scalar keys %{$b2a{$b}} < 2;
    my $f = $fws[$a];
    next unless defined $f;
    next unless length($f) > 3;
    my $e = $ews[$b];
    next unless defined $e;
    next unless length($e) > 3;

    $ef{$f}->{$e}++;
    $es{$e}++;
    $fs{$f}++;
    $tot++;
  }  
}
my $ltot = log($tot);
my $num = 0;
print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n";
for my $f (keys %fs) {
  my $logf = log($fs{$f});
  my $esref = $ef{$f};
  for my $e (keys %$esref) {
    my $loge = log($es{$e});
    my $ef = $esref->{$e};
    my $logef = log($ef);
    my $pmi = $logef - ($loge + $logf);
    next if $pmi < $MIN_PMI;
    my @flets = split //, $f;
    my @elets = split //, $e;
    print "@flets ||| @elets\n";
    $num++;
  }
}
print STDERR "Extracted $num pairs.\n";
print STDERR "Recommend running:\n   ../../training/model1 -t -99999 output.txt\n";