From 580c488f2074377026de7312a20faf933e751287 Mon Sep 17 00:00:00 2001 From: redpony Date: Mon, 26 Jul 2010 21:42:51 +0000 Subject: buck 2 utf converter git-svn-id: https://ws10smt.googlecode.com/svn/trunk@428 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/scripts/buck2utf8.pl | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100755 gi/scripts/buck2utf8.pl (limited to 'gi/scripts') diff --git a/gi/scripts/buck2utf8.pl b/gi/scripts/buck2utf8.pl new file mode 100755 index 00000000..1acfae8d --- /dev/null +++ b/gi/scripts/buck2utf8.pl @@ -0,0 +1,87 @@ +#!/usr/bin/perl -w +use strict; +use utf8; +binmode(STDOUT, ":utf8"); +while(<>) { + chomp; + my @words = split /\s+/; + for my $w (@words) { + $_ = $w; + if ($w =~ /^__NTK__/o) { + s/__NTK__//go; + next if /^$/; + print STDOUT "$_ "; + next; + } +s/tR/\x{0679}/g; # retroflex t +s/dR/\x{0688}/g; # retroflex d +s/rR/\x{0691}/g; # retroflex r +s/p/\x{067E}/g; # peh +s/c/\x{0686}/g; # tcheh +s/g/\x{06AF}/g; # geh (G=ghain) +s/@/\x{06BE}/g; # heh doachashmee +s/h'/\x{06c2}/g; # heh goal + hamza +s/h/\x{06c1}/g; # heh goal +s/J/\x{0698}/g; # zheh (rare, usually persian loan words) +s/k/\x{06A9}/g; # k +s/Y'/\x{06d3}/g; # yeh barree + hamza above (ligature) +s/y/\x{06cc}/g; # same as ya' in arabic +s/Y/\x{06d2}/g; # yeh barree +s/N/\x{06BA}/g; # Ghunna + + s/\'/\x{0621}/g; + s/\|/\x{0622}/g; + s/\>/\x{0623}/g; + s/\&/\x{0624}/g; + s/\