From 159eac6e781d228bce720b9afd6a2934b8d909d5 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Thu, 12 Nov 2015 13:48:47 +0100
Subject: normalize on char level

---
 normchr | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 normchr

diff --git a/normchr b/normchr
new file mode 100755
index 0000000..f8e5798
--- /dev/null
+++ b/normchr
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+
+# Normalizes text on the character level. Reads lines from STDIN, decodes HTML
+# entities, turns control, private-use and unusual whitespace characters into
+# plain spaces, removes soft hyphens, collapses whitespace and writes exactly
+# one output line per input line to STDOUT.
+#
+# http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
+# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
+
+require 'htmlentities'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+coder = HTMLEntities.new
+
+STDIN.each_line do |raw|
+  # Decode entities first: the characters they expand to (&nbsp;, &#10;, &shy;)
+  # must be normalized as well and must not introduce extra line breaks.
+  line = coder.decode(raw.sub("\u{feff}", "")) # drop first BOM, then decode
+  line.strip! # superfluous ASCII whitespace, including the newline
+  line.gsub!(/[[:cntrl:]]+/, " ") # control characters (this covers "\r")
+  line.gsub!(/[\u{00a0}\u{1680}\u{180e}\u{3000}\u{feff}]/, " ") # misc whitespace
+  line = line.scan(/[[:print:]]/).join # only printable characters
+  line.gsub!(/[\u{e000}-\u{f8ff}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}]/, " ") # private use areas
+  line.gsub!(/[\u{2000}-\u{200f}]/, " ") # EN QUAD -- RIGHT-TO-LEFT MARK
+  line.gsub!(/[\u{2028}-\u{202f}]/, " ") # LINE SEPARATOR -- NARROW NO-BREAK SPACE
+  line.gsub!(/[\u{205f}-\u{206f}]/, " ") # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
+  line.gsub!(/\s*\u{00ad}\s*/, "") # remove soft hyphens
+  line.gsub!(/[[:space:]]+/, " ") # collapse space
+  line.strip! # String#strip! is ASCII-only: trim spaces created above
+  puts line
+end
+
--
cgit v1.2.3