diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-01-11 01:22:54 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-01-11 01:22:54 -0500 |
commit | 72b0ebee7d3398dfb657f2949b9e5dac82342198 (patch) | |
tree | 465c731143cfaf088a0ee1581fa9fb4265482eb9 /gi | |
parent | e069f9ad0c7aa54eb38bb6c68f5d624193c0bef7 (diff) |
script to pull out candidate transliterations from a word-aligned parallel corpus
Diffstat (limited to 'gi')
-rwxr-xr-x | gi/pf/guess-translits.pl | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl new file mode 100755 index 00000000..ab737121 --- /dev/null +++ b/gi/pf/guess-translits.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +my $MIN_PMI = -3; + +my %fs; +my %es; +my %ef; + +die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $tot = 0; +print STDERR "Reading alignments from STDIN ...\n"; +while(<STDIN>) { + chomp; + my ($fsent, $esent, $alsent) = split / \|\|\| /; + die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; + + my @fws = split /\s+/, $fsent; + my @ews = split /\s+/, $esent; + my @as = split /\s+/, $alsent; + my %a2b; + my %b2a; + for my $ap (@as) { + my ($a,$b) = split /-/, $ap; + $a2b{$a}->{$b} = 1; + $b2a{$b}->{$a} = 1; + } + for my $a (keys %a2b) { + my $bref = $a2b{$a}; + next unless scalar keys %$bref < 2; + my $b = (keys %$bref)[0]; + next unless scalar keys %{$b2a{$b}} < 2; + my $f = $fws[$a]; + next unless defined $f; + next unless length($f) > 3; + my $e = $ews[$b]; + next unless defined $e; + next unless length($e) > 3; + + $ef{$f}->{$e}++; + $es{$e}++; + $fs{$f}++; + $tot++; + } +} +my $ltot = log($tot); +my $num = 0; +print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; +for my $f (keys %fs) { + my $logf = log($fs{$f}); + my $esref = $ef{$f}; + for my $e (keys %$esref) { + my $loge = log($es{$e}); + my $ef = $esref->{$e}; + my $logef = log($ef); + my $pmi = $logef - ($loge + $logf); + next if $pmi < $MIN_PMI; + my @flets = split //, $f; + my @elets = split //, $e; + print "@flets ||| @elets\n"; + $num++; + } +} +print STDERR "Extracted $num pairs.\n"; +print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n"; |