#!/usr/bin/env ruby

# http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html

require 'htmlentities'

# Read and write UTF-8 on both standard streams, regardless of the locale.
[STDIN, STDOUT].each { |stream| stream.set_encoding('utf-8') }

# Decoder used to turn HTML entities in the cleaned text into characters.
coder = HTMLEntities.new

# Patterns whose matches are rewritten to a plain space. They are applied
# *before* the printable-character filter so that separators the filter would
# otherwise discard (e.g. U+2028 LINE SEPARATOR) still leave a word break.
SPACE_LIKE = [
  /[[:cntrl:]]/,                                # control characters (incl. \t, \r, \n)
  /[\u{00a0}\u{1680}\u{180e}\u{3000}\u{feff}]/, # misc whitespace
  /[\u{2000}-\u{200f}]/,                        # EN QUAD -- RIGHT-TO-LEFT MARK
  /[\u{2028}-\u{202f}]/,                        # LINE SEPARATOR -- NARROW NO-BREAK SPACE
  /[\u{205f}-\u{206f}]/                         # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
].freeze

# Unicode Private Use Areas: the BMP block plus supplementary planes 15 and 16.
PRIVATE_USE = /[\u{e000}-\u{f8ff}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}]/

# A soft hyphen (U+00AD) together with any ASCII whitespace around it.
SOFT_HYPHEN = /\s*\u{00ad}\s*/

# Cleans one line of text: drops the BOM, turns control characters, exotic
# whitespace and private-use code points into spaces, discards anything that
# is not printable, removes soft hyphens (joining the word halves around
# them), collapses whitespace runs and trims both ends.
#
# @param line [String] raw input line, line terminator included or not
# @return [String] a new, cleaned string; the argument is left untouched
def normalize_line(line)
  # Malformed bytes would make the regexp substitutions below raise
  # ArgumentError, so they are blanked like any other junk.
  text = line.scrub(" ")
  text = text.sub("\u{feff}", "") # BOM (first U+FEFF in the line)
  text = SPACE_LIKE.reduce(text) { |acc, pattern| acc.gsub(pattern, " ") }
  text = text.scan(/[[:print:]]/).join # only printable characters

  text.gsub(PRIVATE_USE, " ")
      .gsub(SOFT_HYPHEN, "")           # remove soft hyphens
      .gsub(/[[:space:]]+/, " ")       # collapse space
      .strip                           # trim last: earlier steps can create edge spaces
end

# NOTE(review): entities are decoded *after* normalization, so characters
# produced by decoding (for example from `&nbsp;`) are printed as-is --
# confirm that this ordering is intended.
while (line = STDIN.gets)
  puts coder.decode(normalize_line(line))
end