1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
#!/usr/bin/env ruby
require 'thread'
require 'optimist'
# Treat both standard streams as UTF-8 so umlauts and ß survive the round trip.
STDIN.set_encoding('utf-8')
STDOUT.set_encoding('utf-8')

# Command-line options, parsed into the `conf` hash.
# NOTE(review): --threads takes the short flag -h, which is conventionally the
# help flag — confirm this is intended.
conf = Optimist.options do
  banner "norm_german < <file w/ lowercased tokens>"
  opt :upper,      "uppercase",  type: :bool, default: false
  opt :threads,    "#threads",   type: :int,  default: 1, short: '-h'
  opt :shard_size, "shard size", type: :int,  default: 1000
  opt :train,      "train",      type: :bool
  opt :apply,      "apply",      type: :bool
end
# Spelling alternatives that get_key treats as interchangeable. Each entry is
# a two-element pair; get_key tries both directions, so the order inside a
# pair does not matter.
pairs_lower = [%w[ß ss], %w[ue ü], %w[ae ä], %w[oe ö]]
pairs_upper = [%w[Ä Ae], %w[Ö Oe], %w[Ü Ue]]

# With --upper only the lowercase pairs are active; without it the capitalised
# umlaut pairs are added as well.
# NOTE(review): that reads inverted relative to the flag name — confirm intent.
PAIRS = conf[:upper] ? pairs_lower : pairs_lower + pairs_upper
# Decide whether +new+ is a spelling variant of +old+.
#
# For each pair in PAIRS, every occurrence of one member in +new+ is replaced
# by the other member (both directions are tried). Only one pair is applied
# at a time, so variants that need two different substitutions do not match.
#
# @param old [String] candidate representative token
# @param new [String] token to compare against +old+
# @return [String, nil] +old+ if some single substitution turns +new+ into
#   +old+, otherwise nil
def get_key(old, new)
  matches = PAIRS.any? do |left, right|
    new.gsub(left, right) == old || new.gsub(right, left) == old
  end
  matches ? old : nil
end
# Group a list of tokens by spelling variant.
#
# Tokens are processed in order. A token is appended to the group of the first
# existing key (in insertion order) that get_key accepts for it; if no key
# accepts it, the token starts a new group under its own spelling. Repeated
# tokens are appended again, so groups may contain duplicates.
#
# @param tokens [Array<String>] tokens to group
# @return [Hash{String => Array<String>}] representative token => its variants
def build_partial(tokens)
  tokens.each_with_object({}) do |tok, groups|
    representative = groups.each_key.find { |key| get_key(key, tok) }
    if representative
      groups[representative] << tok
    else
      groups[tok] = [tok]
    end
  end
end
# Driver: read one token per line from STDIN, cut the stream into shards of
# conf[:shard_size] tokens, group each full shard in its own worker thread
# (at most conf[:threads] at a time) and collect all groups in h
# (representative token => list of variants).
h = {}
threads = []
thread_n = 0
counter = 0
token_stock = []
mutex = Mutex.new

# Fold one shard's groups into h. When two shards produced the same key their
# variant lists are concatenated; a plain merge! would silently replace the
# earlier list and lose its variants.
merge_partial = lambda { |partial|
  h.merge!(partial) { |_key, known, fresh| known + fresh }
}

while tok = STDIN.gets
  token_stock << [] if !token_stock[thread_n]
  # Non-destructive strip: strip! returns nil when there is nothing to strip
  # (e.g. a final line without a trailing newline), which would put nil into
  # the shard.
  token_stock[thread_n] << tok.strip
  counter += 1

  # Keep filling the current shard until it is full.
  next if token_stock[thread_n].size % conf[:shard_size] != 0

  STDERR.write "Starting thread ##{thread_n}\n"
  threads << Thread.new(token_stock[thread_n]) { |tokens|
    th = build_partial tokens
    mutex.synchronize { merge_partial.call(th) }
  }
  threads.last.abort_on_exception = true
  thread_n += 1

  # All worker slots used: wait for the round to finish, then recycle the
  # shard buffers.
  if thread_n == conf[:threads]
    threads.each { |i| i.join }
    threads.clear
    token_stock.each { |i| i.clear }
    thread_n = 0
  end

  # Workers of the current round may still be merging, so read h under the lock.
  key_count = mutex.synchronize { h.size }
  STDERR.write "#keys #{key_count}\n"
end

# End of input: wait for workers still in flight before touching h again.
threads.each { |i| i.join }
threads.clear

# Shards below thread_n were already handed to a worker; only the trailing,
# not yet started shard (if any) still has to be grouped.
token_stock.drop(thread_n).each { |i|
  merge_partial.call(build_partial(i)) unless i.empty?
}
|