summary | refs | log | tree | commit | diff
path: root/normchr
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2015-11-12 13:48:47 +0100
committer: Patrick Simianer <p@simianer.de> 2015-11-12 13:48:47 +0100
commit: 159eac6e781d228bce720b9afd6a2934b8d909d5 (patch)
tree: d728658fea6a7d8012bea7607a64ef7792acd43d /normchr
parent: da8d0a3e1764e62f1ec321e5d64ab4e746d704a8 (diff)
normalize on char level
Diffstat (limited to 'normchr')
-rwxr-xr-x normchr 35
1 files changed, 35 insertions, 0 deletions
diff --git a/normchr b/normchr
new file mode 100755
index 0000000..f8e5798
--- /dev/null
+++ b/normchr
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+
+# http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
+# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
+
+require 'htmlentities'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+coder = HTMLEntities.new
+
+while line = STDIN.gets
+ line.sub! "\xef\xbb\xbf", "" # BOM
+ line.strip! # superfluous
+ line.lstrip! # whitespace
+ line.gsub! /[[:cntrl:]]+/, " " # control characters
+ line.gsub! /\u{00a0}/, " " # misc whitespace
+ line.gsub! /\u{1680}/, " " # ^
+ line.gsub! /\u{180e}/, " " # ^
+ line.gsub! /\u{3000}/, " " # ^
+ line.gsub! /\u{feff}/, " " # ^
+ line = line.scan(/[[:print:]]/).join # only printable characters
+ line.gsub! /[\u{e000}-\u{f8ff}]/, " " # UTF-8 PUA
+ line.gsub! /[\u{f0000}-\u{ffffd}]/, " "
+ line.gsub! /[\u{100000}-\u{10fffd}]/, " "
+ line.gsub! "\r", " " # carriage return
+ line.gsub! /[\u{2000}-\u{200f}]/, " " # EN QUAD -- RIGHT-TO-LEFT MARK
+ line.gsub! /[\u{2028}-\u{202f}]/, " " # LINE SEPARATOR -- NARROW NO-BREAK SPACE
+ line.gsub! /[\u{205f}-\u{206f}]/, " " # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
+ line.gsub! /\s*\xc2\xad\s*/, "" # remove hyphens
+ line.gsub! /[[:space:]]+/, " " # collapse space
+ puts coder.decode(line)
+end
+