blob: cbaf7d79261cf920f917a6c6f934e0d917fe2683 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
#!/usr/bin/env ruby
# works with gigaword en v5
#
# Filter: reads marked-up text from STDIN and writes it to STDOUT with each
# <p>...</p> block joined onto a single line and every <dateline> block
# removed. All other lines are passed through with surrounding whitespace
# stripped.

# Copies +input+ to +output+ line by line, applying these rules to each
# whitespace-stripped line (tag lines are matched case-insensitively and must
# contain nothing but the tag):
#
# * "<dateline>" .. "</dateline>": the tags and everything between them are
#   discarded.
# * "<p>" .. "</p>": the lines in between are joined with single spaces and
#   written as one line; the tags themselves are not written. A paragraph
#   that collected no lines produces no output.
# * any other line outside a paragraph is written unchanged (after stripping).
#
# A paragraph that is still open when the input ends is written out as well,
# so truncated input does not silently lose text.
#
# @param input [#each_line] source of lines (e.g. STDIN or a StringIO)
# @param output [#puts] destination for the result (e.g. STDOUT or a StringIO)
# @return [void]
def flatten_paragraphs(input, output)
  in_paragraph = false
  in_dateline = false
  buffer = []

  input.each_line do |raw|
    line = raw.strip
    tag = line.downcase # computed once; tag comparisons ignore case

    if tag == '<dateline>'
      in_dateline = true
    elsif tag == '</dateline>'
      in_dateline = false
    elsif in_dateline
      # Dateline content is dropped, even when it sits inside a paragraph.
      next
    elsif tag == '<p>' && !in_paragraph
      in_paragraph = true
      buffer = []
    elsif tag == '</p>' && in_paragraph
      output.puts buffer.join(' ').strip unless buffer.empty?
      in_paragraph = false
    elsif in_paragraph
      # Note: a nested "<p>" line ends up here and is kept as literal text.
      buffer.push line
    else
      output.puts line
    end
  end

  # Flush a paragraph that was never closed before the end of the input.
  output.puts buffer.join(' ').strip if in_paragraph && !buffer.empty?
end

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
flatten_paragraphs(STDIN, STDOUT)
|