diff options
author | Patrick Simianer <p@simianer.de> | 2014-06-12 13:56:42 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-06-12 13:56:42 +0200 |
commit | a39aa79b18347e22ef36ebc0da5a7eb220bcb23f (patch) | |
tree | 2c0f3009f8e381002bfeb82c0ea3bd0c41125761 /corpus/support | |
parent | 62bd9a4bdcea606d6ff2031fa4b207ef20caac31 (diff) | |
parent | 0e2f8d3d049f06afb08b4639c6a28aa5461cdc78 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus/support')
-rwxr-xr-x | corpus/support/quote-norm.pl | 1 | ||||
-rwxr-xr-x | corpus/support/tokenizer.pl | 4 |
2 files changed, 5 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 0366fad5..3eee0666 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -40,6 +40,7 @@ while(<STDIN>) { # Regularlize spaces: s/\x{ad}//g; # soft hyphen + s/\x{200C}//g; # zero-width non-joiner s/\x{a0}/ /g; # non-breaking space s/\x{2009}/ /g; # thin space s/\x{2028}/ /g; # "line separator" diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 7771201f..f57bc87a 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -240,6 +240,10 @@ sub proc_token { return $token; } + if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){ + return $token; + } + ## 1,234,345.34 if($token =~ /^\d+(\.\d{3})*,\d+$/){ ## number |