1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
#!/usr/bin/env ruby
require "thread"
require "optimist"
# Read and write UTF-8 regardless of the locale's default encoding.
STDIN.set_encoding("utf-8")
STDOUT.set_encoding("utf-8")

# Command-line options, parsed by Optimist into a Hash keyed by option name.
# NOTE(review): --threads takes the short flag "-h", which is conventionally
# the help flag -- confirm this is intended.
conf = Optimist.options do
  banner "norm_german < <file w/ lowercased tokens>"
  opt :upper, "uppercase", type: :bool, default: false
  opt :threads, "#threads", type: :int, default: 1, short: "-h"
  opt :shard_size, "shard size", type: :int, default: 1000
  opt :train, "train", type: :bool
  opt :apply, "apply", type: :bool
end
# Spelling variants treated as equivalent. get_key substitutes in both
# directions, so the order inside each pair does not matter.
pairs_lower = [["ß", "ss"], ["ue", "ü"], ["ae", "ä"], ["oe", "ö"]]
pairs_upper = [["Ä", "Ae"], ["Ö", "Oe"], ["Ü", "Ue"]]

# With --upper only the lower-case pairs are active; without it both sets are.
# NOTE(review): the capital-letter pairs are enabled when --upper is NOT set --
# confirm this polarity is intended.
PAIRS = conf[:upper] ? pairs_lower : pairs_lower + pairs_upper
# Decide whether +new+ is a spelling variant of +old+.
#
# For each entry of PAIRS, every occurrence of one side in +new+ is replaced
# by the other side (both directions are tried) and the result is compared
# with +old+. Pairs are applied one at a time, never combined. An identical
# token normally matches too, because a substitution that finds nothing
# leaves the string unchanged.
#
# @param old [String] candidate key
# @param new [String] token to test against the key
# @return [String, nil] +old+ if some single pair maps +new+ onto it, else nil
def get_key(old, new)
  variant = PAIRS.any? do |pair|
    left = pair[0]
    right = pair[1]
    new.gsub(left, right) == old || new.gsub(right, left) == old
  end
  variant ? old : nil
end
# Group tokens by spelling variant.
#
# Each token is attached to the first existing key (in insertion order) that
# get_key accepts for it; if no key accepts it, the token becomes a new key
# whose list starts with itself. Repeated tokens are appended again, so lists
# may contain duplicates.
#
# @param tokens [Array<String>] tokens to group
# @return [Hash{String => Array<String>}] key token => tokens filed under it
def build_partial(tokens)
  tokens.each_with_object({}) do |token, groups|
    owner = groups.each_key.find { |key| get_key(key, token) }
    if owner
      groups[owner] << token
    else
      groups[token] = [token]
    end
  end
end
# Read tokens from STDIN one per line, collect them into shards of
# conf[:shard_size] tokens, and group each full shard with build_partial on
# its own thread. After conf[:threads] shards have been started, wait for all
# of them and reuse the shard buffers. The combined grouping ends up in h.
h = {}
threads = []
thread_n = 0
counter = 0
token_stock = []
mutex = Mutex.new
while tok = STDIN.gets
  token_stock << [] if !token_stock[thread_n]
  # Non-destructive strip: String#strip! returns nil when there is nothing to
  # remove (e.g. a final line without a newline), which would store nil.
  token_stock[thread_n] << tok.strip
  counter += 1
  # Keep filling the current shard until it is full.
  next unless token_stock[thread_n].size % conf[:shard_size] == 0

  STDERR.write "Starting thread ##{thread_n}\n"
  threads << Thread.new(token_stock[thread_n]) { |tokens|
    th = build_partial tokens
    mutex.synchronize do
      # Shards can produce the same key; concatenate the token lists instead
      # of letting the later shard replace the earlier one.
      h.merge!(th) { |_key, seen, incoming| seen + incoming }
    end
  }
  threads.last.abort_on_exception = true
  thread_n += 1

  if thread_n == conf[:threads]
    threads.each { |i| i.join }
    # Finished threads are dropped so they are not joined again next round.
    threads.clear
    token_stock.each { |i| i.clear }
    thread_n = 0
  end
  STDERR.write "#keys #{h.keys.size}\n"
end

# Wait for workers started in an incomplete round before touching h again.
threads.each { |i| i.join }
threads.clear
# The first thread_n shards were already processed by those workers; empty
# them so only the trailing, partially filled shard is handled below.
token_stock.first(thread_n).each { |i| i.clear }
token_stock.each { |i|
  if i.size != 0
    h.merge!(build_partial(i)) { |_key, seen, incoming| seen + incoming }
  end
}
|