summaryrefslogtreecommitdiff
path: root/corpus/support/tokenizer.pl
diff options
context:
space:
mode:
Diffstat (limited to 'corpus/support/tokenizer.pl')
-rwxr-xr-xcorpus/support/tokenizer.pl1
1 files changed, 1 insertions, 0 deletions
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 079f45b6..475b9a1c 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -112,6 +112,7 @@ my $new_token_total = 0;
while(<STDIN>){
chomp();
+ s/\x{0970}/./g; # dev abbreviation character
if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
## markup
print STDOUT "$_\n";