diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-01-20 00:11:27 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-01-20 00:11:27 -0500 |
commit | 02a764cc553f238cdd0be4c6dee4ea144cda84ba (patch) | |
tree | cc1b42844831c2bd190903fe29f6a68cfc41c219 | |
parent | 54449ce0eb266ac8e1617f6db7f5390b0a3e26ce (diff) |
hindi edits
-rwxr-xr-x | corpus/support/fix-eos.pl | 10 | ||||
-rw-r--r-- | corpus/support/token_list | 51 | ||||
-rwxr-xr-x | corpus/support/tokenizer.pl | 1 | ||||
-rwxr-xr-x | corpus/tokenize-anything.sh | 1 |
4 files changed, 63 insertions, 0 deletions
```diff
diff --git a/corpus/support/fix-eos.pl b/corpus/support/fix-eos.pl
new file mode 100755
index 00000000..584f8b46
--- /dev/null
+++ b/corpus/support/fix-eos.pl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {
+ s/(\p{Devanagari}{2}[A-Za-z0-9! ,.\@\p{Devanagari}]+?)\s+(\.)(\s*$|\s+\|\|\|)/$1 \x{0964}$3/s;
+ print;
+}
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 43dd80d9..0de72821 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -1,6 +1,57 @@
 ##################### hyphenated words added by Fei since 3/7/05
 ##X-ray
 
+# hindi abbreviation patterns
+डी.एल.
+डी.टी.ओ.
+डी.ए.
+ए.एस.आई.
+डी.टी.ओ.
+एम.एस.आर.टी.सी.
+बी.बी.एम.बी.
+डी.एस.पी.
+सी.आर.पी.
+एस.डी.एम.
+सी.डी.पी.ओ.
+बी.डी.ओ.
+एस.डी.ओ.
+एम.पी.पी.
+पी.एच.ई.
+एस.एच.ओ.
+ए.सी.पी.
+यू.पी.
+पी.एम.
+आर.बी.डी.
+वी.पी.
+सी.ए.डी.पी.
+ए.
+बी.
+सी.
+डी.
+ई.
+एफ.
+जी.
+एच.
+आई.
+जे.
+के.
+एल.
+एम.
+एन.
+ओ.
+पी.
+क़यू.
+आर.
+एस.
+टी.
+यू.
+वी.
+डबल्यू.
+एक्स.
+वाई.
+ज़ेड.
+ज़ी.
+
 ##################### words made of punct only
 :-
 :-)
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 079f45b6..475b9a1c 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -112,6 +112,7 @@ my $new_token_total = 0;
 
 while(<STDIN>){
 chomp();
+ s/\x{0970}/./g; # dev abbreviation character
 if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
 ## markup
 print STDOUT "$_\n";
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index 5b7933d8..bca954d1 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -14,6 +14,7 @@ fi
 $SUPPORT/utf8-normalize.sh $NORMARGS |
 $SUPPORT/quote-norm.pl |
 $SUPPORT/tokenizer.pl |
+ $SUPPORT/fix-eos.pl |
 sed $SEDFLAGS -e 's/ al - / al-/g' |
 $SUPPORT/fix-contract.pl |
 sed $SEDFLAGS -e 's/^ //' | sed $SEDFLAGS -e 's/ $//' |
```