From 1915f029aa02f3528dafcd7b80a62a9c890b462b Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 20 Jan 2014 00:11:27 -0500
Subject: hindi edits

---
 corpus/support/fix-eos.pl   | 10 +++++++++
 corpus/support/token_list   | 51 +++++++++++++++++++++++++++++++++++++++++++++
 corpus/support/tokenizer.pl |  1 +
 corpus/tokenize-anything.sh |  1 +
 4 files changed, 63 insertions(+)
 create mode 100755 corpus/support/fix-eos.pl

(limited to 'corpus')

diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
new file mode 100755
index 00000000..584f8b46
--- /dev/null
+++ b/corpus/support/fix-eos.pl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {
+  s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s;
+  print;
+}
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 43dd80d9..0de72821 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -1,6 +1,57 @@
 ##################### hyphenated words added by Fei since 3/7/05
 ##X-ray
 
+# hindi abbreviation patterns
+डी.एल.
+डी.टी.ओ.
+डी.ए.
+ए.एस.आई.
+डी.टी.ओ.
+एम.एस.आर.टी.सी.
+बी.बी.एम.बी.
+डी.एस.पी.
+सी.आर.पी.
+एस.डी.एम.
+सी.डी.पी.ओ.
+बी.डी.ओ.
+एस.डी.ओ.
+एम.पी.पी.
+पी.एच.ई.
+एस.एच.ओ.
+ए.सी.पी.
+यू.पी.
+पी.एम.
+आर.बी.डी.
+वी.पी.
+सी.ए.डी.पी.
+ए.
+बी.
+सी.
+डी.
+ई.
+एफ.
+जी.
+एच.
+आई.
+जे.
+के.
+एल.
+एम.
+एन.
+ओ.
+पी.
+क़यू.
+आर.
+एस.
+टी.
+यू.
+वी.
+डबल्यू.
+एक्स.
+वाई.
+ज़ेड.
+ज़ी.
+
 ##################### words made of punct only
 :-
 :-)
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 079f45b6..475b9a1c 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -112,6 +112,7 @@ my $new_token_total = 0;
 
 while(<STDIN>){
     chomp();
+    s/\x{0970}/./g; # dev abbreviation character
 
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^