#!/usr/bin/env ruby

# collapse_tags.rb -- works with gigaword en v5
#
# Provenance: commit 22b81757a97f89e6b3415782c0556ffaf4625baa,
# "collapse_tags.rb", Patrick Simianer, Tue, 22 Jul 2014 16:22:56 +0200.
#
# Filter that reads marked-up text on STDIN and writes to STDOUT:
#   * everything from a dateline-open line through the matching
#     dateline-close line is dropped;
#   * the lines between a paragraph-open line and a paragraph-close line
#     are joined with single spaces and emitted as one line;
#   * every other line is echoed (whitespace-stripped) unchanged.
#
# NOTE(review): the four tag literals below were reconstructed. The extracted
# copy of this patch had lost them (the dateline comparisons had degraded to
# "" and the paragraph comparisons to bare newlines, which can never equal a
# stripped line). They are restored here from the variable names and the
# Gigaword markup convention -- confirm against the upstream commit.

DATELINE_OPEN   = '<dateline>'.freeze
DATELINE_CLOSE  = '</dateline>'.freeze
PARAGRAPH_OPEN  = '<p>'.freeze
PARAGRAPH_CLOSE = '</p>'.freeze

# Streams +input+ line by line and writes the collapsed result to +output+.
#
# @param input  [#each_line] source of raw lines (e.g. STDIN, StringIO)
# @param output [#puts] sink for the filtered lines (e.g. STDOUT, StringIO)
# @return [void]
def collapse_tags(input, output)
  in_paragraph = false
  in_dateline = false
  collected = []

  input.each_line do |raw|
    line = raw.strip
    # Tags are matched case-insensitively and must occupy the whole line.
    tag = line.downcase

    if tag == DATELINE_OPEN
      in_dateline = true
    elsif tag == DATELINE_CLOSE
      in_dateline = false
    elsif in_dateline
      # Dateline content is discarded.
      next
    elsif tag == PARAGRAPH_OPEN && !in_paragraph
      in_paragraph = true
      collected = []
    elsif tag == PARAGRAPH_CLOSE && in_paragraph
      # Empty paragraphs produce no output line at all.
      output.puts collected.join(' ').strip unless collected.empty?
      in_paragraph = false
    elsif in_paragraph
      # Includes a nested paragraph-open line, which is kept as plain text.
      collected.push line
    else
      # Outside any paragraph (including a stray paragraph-close): echo.
      output.puts line
    end
  end
  # Lines collected for a paragraph that is never closed are not emitted.
end

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

collapse_tags(STDIN, STDOUT) if __FILE__ == $PROGRAM_NAME