summaryrefslogtreecommitdiff
path: root/gigaword-collapse-tags
blob: cbaf7d79261cf920f917a6c6f934e0d917fe2683 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env ruby

# works with gigaword en v5

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

# Writes the lines collected for one paragraph to +output+ as a single
# space-joined, stripped line. Writes nothing when no lines were collected.
def emit_paragraph(output, lines)
  output.puts lines.join(' ').strip unless lines.empty?
end

# Filters line-oriented markup from +input+ to +output+.
#
# Every line is stripped of surrounding whitespace, and tag lines are
# matched case-insensitively against the whole line:
#
# * everything from a <dateline> line through the matching </dateline>
#   line is dropped, tag lines included;
# * the lines between a <p> line and the next </p> line are joined with
#   single spaces and written as one line (the tag lines themselves are
#   not written; a paragraph with no lines in between writes nothing);
# * every other line is written through unchanged (after stripping).
#
# A paragraph that is still open when +input+ ends is written out as well,
# so truncated input does not silently lose text.
#
# @param input  [#gets] source of lines (e.g. STDIN or a StringIO)
# @param output [#puts] destination for the filtered lines
# @return [void]
def collapse_tags(input, output)
  in_p = false
  in_dateline = false
  collect = []

  while (line = input.gets)
    line.strip!
    tag = line.downcase # compare tags case-insensitively, once per line

    if tag == '<dateline>'
      in_dateline = true
    elsif tag == '</dateline>'
      in_dateline = false
    elsif in_dateline
      next # dateline content is discarded
    elsif tag == '<p>' && !in_p
      in_p = true
      collect = []
    elsif tag == '</p>' && in_p
      emit_paragraph(output, collect)
      in_p = false
    elsif in_p
      # Includes a nested "<p>" line, which is kept as ordinary text.
      collect.push line
    else
      output.puts line
    end
  end

  # Flush a paragraph left open at end of input instead of dropping it.
  emit_paragraph(output, collect) if in_p
end

collapse_tags(STDIN, STDOUT)