diff options
Diffstat (limited to 'word-aligner/ortho-norm')
-rw-r--r-- | word-aligner/ortho-norm/README | 2 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/ar.pl | 32 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/en.pl | 11 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/fr.pl | 22 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/ur.pl | 34 |
5 files changed, 101 insertions, 0 deletions
diff --git a/word-aligner/ortho-norm/README b/word-aligner/ortho-norm/README
new file mode 100644
index 00000000..7071798a
--- /dev/null
+++ b/word-aligner/ortho-norm/README
@@ -0,0 +1,2 @@
+The normalizations in this directory are supposed to create forms
+that are roughly comparable by string comparison between languages.
diff --git a/word-aligner/ortho-norm/ar.pl b/word-aligner/ortho-norm/ar.pl
new file mode 100755
index 00000000..e8bd521a
--- /dev/null
+++ b/word-aligner/ortho-norm/ar.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+while(<STDIN>) {
+  chomp;
+  my $len = length($_);
+  if ($len > 1 && !($_ =~ /\d/)) {
+    s/\$/sh/g;
+  }
+  s/([a-z])\~/$1$1/g;
+  s/E/'/g;
+  s/^Aw/o/g;
+  s/\|/a/g;
+  s/@/h/g;
+  s/c/ch/g;
+  s/x/kh/g;
+  s/\*/dh/g;
+  s/w/o/g;
+  s/v/th/g;
+  if ($len > 1) { s/}/'/g; }
+  s/Z/dh/g;
+  s/y/i/g;
+  s/Y/a/g;
+  if ($len > 1) { s/p$//; }
+  $_ = lc $_;
+  print "$_\n";
+}
+
diff --git a/word-aligner/ortho-norm/en.pl b/word-aligner/ortho-norm/en.pl
new file mode 100755
index 00000000..b167803e
--- /dev/null
+++ b/word-aligner/ortho-norm/en.pl
@@ -0,0 +1,11 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+while(<STDIN>) {
+  $_ = lc $_;
+  s/ al-/ al/g;
+  s/^al-/al/;
+  print;
+}
+
diff --git a/word-aligner/ortho-norm/fr.pl b/word-aligner/ortho-norm/fr.pl
new file mode 100755
index 00000000..5592ab05
--- /dev/null
+++ b/word-aligner/ortho-norm/fr.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+while(<STDIN>) {
+  $_ = lc $_;
+  # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
+  s/â/as/g;
+  s/ê/es/g;
+  s/î/is/g;
+  s/ô/os/g;
+  s/û/us/g;
+
+  s/ç/c/g;
+  s/é|è/e/g;
+  s/á/a/g;
+  print;
+}
+
diff --git a/word-aligner/ortho-norm/ur.pl b/word-aligner/ortho-norm/ur.pl
new file mode 100755
index 00000000..d125b744
--- /dev/null
+++ b/word-aligner/ortho-norm/ur.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Rewrites every whitespace-separated token of each input line; prints one output line per input line.
+while(<STDIN>) {
+  chomp;
+  my @out = ();  # must be per-line: a file-level @out would re-print all earlier lines' tokens
+  my @words = split /\s+/;
+  for my $of (@words) {
+    if (length($of) > 1 && !($of =~ /\d/)) {  # leave single-character and digit-bearing tokens' '$' alone
+      $of =~ s/\$/sh/g;
+    }
+    $of =~ s/([a-z])\~/$1$1/g;  # a lowercase letter followed by '~' is doubled
+    $of =~ s/E/'/g;
+    $of =~ s/^Aw/o/g;
+    $of =~ s/\|/a/g;
+    $of =~ s/@/h/g;
+    $of =~ s/c/ch/g;
+    $of =~ s/x/kh/g;
+    $of =~ s/\*/dh/g;
+    $of =~ s/w/o/g;
+    $of =~ s/Z/dh/g;
+    $of =~ s/y/i/g;
+    $of =~ s/Y/a/g;
+    $of = lc $of;  # lowercase last: the rules above are case-sensitive (E, Z, Y, Aw)
+    push @out, $of;
+  }
+  print "@out\n";
+}
+