diff options
author | Patrick Simianer <p@simianer.de> | 2014-07-22 16:22:56 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-07-22 16:22:56 +0200 |
commit | 22b81757a97f89e6b3415782c0556ffaf4625baa (patch) | |
tree | ed5ff3a66136a4d369b6a12850baeb0d6ceb253f | |
parent | dce92ce8e0c5dbc12ff5a948af8ca129315b6eac (diff) |
collapse_tags.rb
-rwxr-xr-x | collapse_tags.rb | 40 |
1 files changed, 40 insertions, 0 deletions
#!/usr/bin/env ruby

# Collapses each multi-line <p> ... </p> paragraph into a single output line
# and drops <dateline> ... </dateline> sections entirely. All other lines are
# echoed (whitespace-stripped) unchanged. Tags are recognized only when they
# stand alone on a line, matched case-insensitively.
#
# works with gigaword en v5

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

# Streams +input+ line by line and writes the collapsed text to +output+.
#
# @param input [#gets] source of lines (e.g. STDIN or a StringIO)
# @param output [#puts] sink for the collapsed text
# @return [void]
def collapse_tags(input, output)
  in_p = false
  in_dateline = false
  collect = []

  while (line = input.gets)
    line.strip!
    tag = line.downcase # computed once; every tag comparison is case-insensitive

    if tag == '<dateline>'
      in_dateline = true
    elsif tag == '</dateline>'
      in_dateline = false
    elsif in_dateline
      # Everything between the dateline tags is discarded.
      next
    elsif tag == '<p>' && !in_p
      in_p = true
      collect = []
    elsif tag == '</p>' && in_p
      output.puts collect.join(' ').strip unless collect.empty?
      in_p = false
    elsif in_p
      # Inside a paragraph: buffer the line (a nested "<p>" lands here too).
      collect.push line
    else
      output.puts line
    end
  end

  # A paragraph still open at end of input would otherwise be lost silently;
  # emit whatever was collected so truncated input does not drop text.
  output.puts collect.join(' ').strip if in_p && !collect.empty?
end

collapse_tags(STDIN, $stdout)