summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-07-22 16:22:56 +0200
committer: Patrick Simianer <p@simianer.de> 2014-07-22 16:22:56 +0200
commit: 22b81757a97f89e6b3415782c0556ffaf4625baa (patch)
tree: ed5ff3a66136a4d369b6a12850baeb0d6ceb253f
parent: dce92ce8e0c5dbc12ff5a948af8ca129315b6eac (diff)
collapse_tags.rb
-rwxr-xr-x collapse_tags.rb 40
1 files changed, 40 insertions, 0 deletions
diff --git a/collapse_tags.rb b/collapse_tags.rb
new file mode 100755
index 0000000..75fcaf5
--- /dev/null
+++ b/collapse_tags.rb
@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+
+# works with gigaword en v5
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+
+in_p = false
+in_dateline = false
+collect = []
+
+while line = STDIN.gets
+ line.strip!
+ if line.downcase == "<dateline>"
+ in_dateline = true
+ next
+ elsif line.downcase == "</dateline>"
+ in_dateline = false
+ next
+ elsif in_dateline
+ next
+ elsif line.downcase == "<p>" and not in_p
+ in_p = true
+ collect = []
+ next
+ elsif line.downcase == "</p>" and in_p
+ if collect.size > 0
+ puts collect.join(" ").strip
+ end
+ in_p = false
+ next
+ elsif in_p
+ collect.push line
+ next
+ else
+ puts line
+ end
+end
+