summaryrefslogtreecommitdiff
path: root/corpus/support
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
committer Patrick Simianer <p@simianer.de> 2014-06-12 13:56:42 +0200
commit 244971287003d079e46193b8a209c28955f90134 (patch)
tree 8beaae6b12b913acb213fc7f2415fd63886192f9 /corpus/support
parent 5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (diff)
parent b66e838ed52decc0be1eb5817b2a77c3840db2c5 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus/support')
-rwxr-xr-x corpus/support/quote-norm.pl 1
-rwxr-xr-x corpus/support/tokenizer.pl 4
2 files changed, 5 insertions, 0 deletions
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0366fad5..3eee0666 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -40,6 +40,7 @@ while(<STDIN>) {
# Regularize spaces:
s/\x{ad}//g; # soft hyphen
+ s/\x{200C}//g; # zero-width non-joiner
s/\x{a0}/ /g; # non-breaking space
s/\x{2009}/ /g; # thin space
s/\x{2028}/ /g; # "line separator"
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 7771201f..f57bc87a 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -240,6 +240,10 @@ sub proc_token {
return $token;
}
+ if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){
+ return $token;
+ }
+
## 1,234,345.34
if($token =~ /^\d+(\.\d{3})*,\d+$/){
## number