diff options
-rwxr-xr-x | collapse_tags.rb | 40 |
1 files changed, 40 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# works with gigaword en v5
#
# Line-oriented filter, STDIN -> STDOUT:
#   * everything from a <dateline> line through the matching </dateline>
#     line is dropped (tag lines included);
#   * the lines between a <p> line and a </p> line are joined with single
#     spaces and printed as one line (the tag lines are not printed);
#   * every other line is printed after stripping surrounding whitespace.
# Tags are only recognised when they are alone on a line; matching is
# case-insensitive.

# Runs the filter described above.
#
# @param input  [#each_line] source of raw lines
# @param output [#puts] destination for the collapsed text
# @return [void]
def collapse_tags(input = STDIN, output = $stdout)
  in_dateline = false
  paragraph = nil # nil outside <p>; Array of collected lines inside one

  input.each_line do |raw|
    line = raw.strip
    tag = line.downcase # computed once, compared against every tag below

    if tag == '<dateline>'
      in_dateline = true
    elsif tag == '</dateline>'
      in_dateline = false
    elsif in_dateline
      # dateline content is discarded
    elsif tag == '<p>' && paragraph.nil?
      paragraph = []
    elsif tag == '</p>' && paragraph
      output.puts paragraph.join(' ').strip unless paragraph.empty?
      paragraph = nil
    elsif paragraph
      # includes a nested "<p>" line, which is kept as ordinary text
      paragraph << line
    else
      output.puts line
    end
  end

  # Input ended inside an unclosed <p>: emit what was collected instead of
  # silently losing it.
  output.puts paragraph.join(' ').strip if paragraph && !paragraph.empty?
end

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

collapse_tags(STDIN, $stdout)