diff options
| author | Patrick Simianer <p@simianer.de> | 2015-11-12 13:48:47 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2015-11-12 13:48:47 +0100 | 
| commit | 159eac6e781d228bce720b9afd6a2934b8d909d5 (patch) | |
| tree | d728658fea6a7d8012bea7607a64ef7792acd43d | |
| parent | da8d0a3e1764e62f1ec321e5d64ab4e746d704a8 (diff) | |
normalize on char level
| -rwxr-xr-x | normchr | 35 | 
1 files changed, 35 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# normchr -- character-level text normalizer.
#
# Reads UTF-8 text from STDIN line by line and writes exactly one cleaned-up
# line to STDOUT per input line: control characters, exotic Unicode blanks,
# private-use code points and soft hyphens are removed or turned into plain
# spaces, runs of whitespace are collapsed, and HTML entities are decoded.
#
# Code point references:
#   http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
#   https://www.cs.tut.fi/~jkorpela/chars/spaces.html

require 'htmlentities'

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

# U+FEFF, i.e. the byte sequence EF BB BF in UTF-8 (byte-order mark).
BOM = "\u{feff}"

# One or more consecutive control characters.
CONTROL_RUN = /[[:cntrl:]]+/

# NO-BREAK SPACE, OGHAM SPACE MARK, MONGOLIAN VOWEL SEPARATOR,
# IDEOGRAPHIC SPACE, ZERO WIDTH NO-BREAK SPACE.
MISC_WHITESPACE = /[\u{00a0}\u{1680}\u{180e}\u{3000}\u{feff}]/

# A single printable character; everything else is dropped.
PRINTABLE = /[[:print:]]/

# Private Use Area (BMP) and the supplementary private-use ranges.
PRIVATE_USE = /[\u{e000}-\u{f8ff}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}]/

# U+2000-U+200F: EN QUAD -- RIGHT-TO-LEFT MARK
# U+2028-U+202F: LINE SEPARATOR -- NARROW NO-BREAK SPACE
# U+205F-U+206F: MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
SPACES_AND_FORMAT_MARKS = /[\u{2000}-\u{200f}\u{2028}-\u{202f}\u{205f}-\u{206f}]/

# SOFT HYPHEN (U+00AD) together with any ASCII whitespace around it.
SOFT_HYPHEN = /\s*\u{00ad}\s*/

# One or more consecutive whitespace characters.
SPACE_RUN = /[[:space:]]+/

# Normalizes a single line on the character level.
#
# The steps run in a fixed order; later steps rely on earlier ones (for
# example, the final collapse cleans up the spaces inserted before it).
#
# @param line [String] one input line, with or without its line terminator
# @return [String] a new, normalized string without a line terminator and
#   without leading or trailing spaces; the argument is not modified
def normalize_chars(line)
  text = line.sub(BOM, '')                       # drop the first BOM in the line
  text = text.strip                              # surrounding ASCII whitespace, incl. newline
  text = text.gsub(CONTROL_RUN, ' ')             # control characters (covers "\r" and "\t")
  text = text.gsub(MISC_WHITESPACE, ' ')         # misc whitespace
  text = text.scan(PRINTABLE).join               # only printable characters
  text = text.gsub(PRIVATE_USE, ' ')             # private-use code points
  text = text.gsub(SPACES_AND_FORMAT_MARKS, ' ') # typographic spaces, direction/format marks
  text = text.gsub(SOFT_HYPHEN, '')              # re-join words split by a soft hyphen
  text = text.gsub(SPACE_RUN, ' ')               # collapse space
  # The substitutions above can leave a blank at either end (e.g. when the
  # line started with U+00A0, which the first strip does not treat as
  # whitespace), so trim once more after collapsing.
  text.strip
end

coder = HTMLEntities.new

# NOTE(review): entities are decoded after normalization, so a character that
# only appears through decoding (presumably e.g. U+00A0 from "&nbsp;") reaches
# the output un-normalized -- confirm whether that is intended.
STDIN.each_line do |line|
  puts coder.decode(normalize_chars(line))
end
