diff options
Diffstat (limited to 'corpus/support/tokenizer.pl')
-rwxr-xr-x | corpus/support/tokenizer.pl | 1 |
1 file changed, 1 insertion, 0 deletions
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 079f45b6..475b9a1c 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -112,6 +112,7 @@ my $new_token_total = 0;
 while(<STDIN>){
     chomp();
 
+    s/\x{0970}/./g; # dev abbreviation character
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
         ## markup
         print STDOUT "$_\n";