summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-26 21:42:51 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-26 21:42:51 +0000
commit580c488f2074377026de7312a20faf933e751287 (patch)
treeb426f54561e49996712614565eeb72ce71d8fc89
parent089c60cae00edefe3e45baae04e3582f5819849c (diff)
buck 2 utf converter
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@428 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-xgi/scripts/buck2utf8.pl87
1 files changed, 87 insertions, 0 deletions
diff --git a/gi/scripts/buck2utf8.pl b/gi/scripts/buck2utf8.pl
new file mode 100755
index 00000000..1acfae8d
--- /dev/null
+++ b/gi/scripts/buck2utf8.pl
@@ -0,0 +1,87 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+binmode(STDOUT, ":utf8");
+while(<>) {
+ chomp;
+ my @words = split /\s+/;
+ for my $w (@words) {
+ $_ = $w;
+ if ($w =~ /^__NTK__/o) {
+ s/__NTK__//go;
+ next if /^$/;
+ print STDOUT "$_ ";
+ next;
+ }
+s/tR/\x{0679}/g; # retroflex t
+s/dR/\x{0688}/g; # retroflex d
+s/rR/\x{0691}/g; # retroflex r
+s/p/\x{067E}/g; # peh
+s/c/\x{0686}/g; # tcheh
+s/g/\x{06AF}/g; # geh (G=ghain)
+s/@/\x{06BE}/g; # heh doachashmee
+s/h'/\x{06c2}/g; # heh goal + hamza
+s/h/\x{06c1}/g; # heh goal
+s/J/\x{0698}/g; # zheh (rare, usually persian loan words)
+s/k/\x{06A9}/g; # k
+s/Y'/\x{06d3}/g; # yeh barree + hamza above (ligature)
+s/y/\x{06cc}/g; # same as ya' in arabic
+s/Y/\x{06d2}/g; # yeh barree
+s/N/\x{06BA}/g; # Ghunna
+
+ s/\'/\x{0621}/g;
+ s/\|/\x{0622}/g;
+ s/\>/\x{0623}/g;
+ s/\&/\x{0624}/g;
+ s/\</\x{0625}/g;
+ s/\}/\x{0626}/g;
+ s/A/\x{0627}/g;
+ s/b/\x{0628}/g;
+ s/t/\x{062A}/g;
+ s/v/\x{062B}/g;
+ s/j/\x{062C}/g;
+ s/H/\x{062D}/g;
+ s/x/\x{062E}/g;
+ s/d/\x{062F}/g;
+ s/\*/\x{0630}/g;
+ s/r/\x{0631}/g;
+ s/z/\x{0632}/g;
+ s/s/\x{0633}/g;
+ s/\$/\x{0634}/g;
+ s/S/\x{0635}/g;
+ s/D/\x{0636}/g;
+ s/T/\x{0637}/g;
+ s/Z/\x{0638}/g;
+ s/E/\x{0639}/g;
+ s/g/\x{063A}/g;
+ s/_/\x{0640}/g;
+ s/f/\x{0641}/g;
+ s/q/\x{0642}/g;
+ s/k/\x{0643}/g;
+ s/l/\x{0644}/g;
+ s/m/\x{0645}/g;
+ s/n/\x{0646}/g;
+ s/h/\x{0647}/g;
+ s/w/\x{0648}/g;
+ s/Y/\x{0649}/g;
+ s/y/\x{064A}/g;
+ s/F/\x{064B}/g;
+ s/N/\x{064C}/g;
+ s/K/\x{064D}/g;
+ s/a/\x{064E}/g;
+ s/u/\x{064F}/g;
+ s/i/\x{0650}/g;
+ s/\~/\x{0651}/g;
+ s/o/\x{0652}/g;
+ s/\`/\x{0670}/g;
+ s/\{/\x{0671}/g;
+ s/P/\x{067E}/g;
+ s/J/\x{0686}/g;
+ s/V/\x{06A4}/g;
+ s/G/\x{06AF}/g;
+
+
+print STDOUT "$_ ";
+ }
+ print STDOUT "\n";
+}