From 2a9ee1febae6a63173f74ae24e2bfe439e409525 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Tue, 18 Mar 2014 02:05:25 -0400
Subject: chris edits

---
 corpus/support/tokenizer.pl | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'corpus')

diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 7771201f..f57bc87a 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -240,6 +240,10 @@ sub proc_token {
         return $token;
     }
 
+    if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){
+        return $token;
+    }
+
     ## 1,234,345.34
     if($token =~ /^\d+(\.\d{3})*,\d+$/){
         ## number
-- 
cgit v1.2.3