1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
#!/usr/bin/env ruby
# http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
require 'htmlentities'
# Read and write both standard streams as UTF-8 text.
[STDIN, STDOUT].each { |stream| stream.set_encoding('utf-8') }
# Entity decoder applied to every line just before it is printed.
coder = HTMLEntities.new
# Patterns whose matches are replaced by a plain ASCII space. They are applied
# BEFORE the printable-character filter: Onigmo's Unicode [[:print:]] is
# documented as graph characters plus Space_Separator, which leaves out
# U+2028/U+2029, so filtering first would delete those separators and glue the
# neighbouring words together instead of leaving a word boundary.
SPACE_LIKE_PATTERNS = [
  /[[:cntrl:]]+/,                               # control characters (tab, CR, LF, NUL, ...)
  /[\u{00a0}\u{1680}\u{180e}\u{3000}\u{feff}]/, # misc whitespace / zero-width no-break space
  /[\u{e000}-\u{f8ff}]/,                        # Unicode private use area (BMP)
  /[\u{f0000}-\u{ffffd}]/,                      # supplementary private use area A
  /[\u{100000}-\u{10fffd}]/,                    # supplementary private use area B
  /[\u{2000}-\u{200f}]/,                        # EN QUAD -- RIGHT-TO-LEFT MARK
  /[\u{2028}-\u{202f}]/,                        # LINE SEPARATOR -- NARROW NO-BREAK SPACE
  /[\u{205f}-\u{206f}]/                         # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
].freeze

# Soft hyphen (U+00AD) together with any ASCII whitespace around it.
SOFT_HYPHEN = /\s*\u{00ad}\s*/.freeze

# Normalizes a single line of text:
#
# 1. invalid UTF-8 bytes are replaced with U+FFFD so the regexes cannot raise,
# 2. the first byte order mark (U+FEFF) is removed,
# 3. control, exotic-whitespace, format and private-use characters become " ",
# 4. anything still not printable is dropped,
# 5. soft hyphens are removed together with the whitespace around them,
# 6. whitespace runs are collapsed to one space and the edges are trimmed.
#
# Trimming is done last on purpose: characters such as U+00A0 only turn into
# ordinary spaces in step 3, so trimming earlier would leave them at the edges.
#
# @param line [String] raw input line, with or without its line terminator
# @return [String] a new, cleaned string (the argument is not modified)
def clean_line(line)
  text = line.scrub.sub("\u{feff}", '')
  text = SPACE_LIKE_PATTERNS.reduce(text) { |acc, pattern| acc.gsub(pattern, ' ') }
  text = text.gsub(/[^[:print:]]/, '') # only printable characters
  text = text.gsub(SOFT_HYPHEN, '')
  text.gsub(/[[:space:]]+/, ' ').strip
end

# Filter STDIN line by line.
# NOTE(review): entities are decoded AFTER cleanup, so an entity such as
# "&nbsp;" presumably reaches the output as the raw character it encodes --
# confirm whether decoding should happen before clean_line instead.
STDIN.each_line do |raw|
  puts coder.decode(clean_line(raw))
end
|