summary refs log tree commit diff
path: root/norm-german
blob: 85a39da374cd6cc53529b20373f1f6e6b7a62386 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env ruby

require 'thread'
require 'optimist'

# Treat both standard streams as UTF-8; the substitution pairs used by this
# script contain non-ASCII characters (umlauts and sharp s).
[STDIN, STDOUT].each { |io| io.set_encoding 'utf-8' }


# Command-line options, parsed with Optimist into the hash-like `conf`.
#   :upper      - boolean flag, off by default
#   :threads    - integer, default 1; explicitly bound to the short flag -h
#                 (NOTE(review): -h is conventionally the help flag -- confirm
#                 that taking it over is intended)
#   :shard_size - integer, default 1000
#   :train      - boolean flag, not read anywhere in this script
#   :apply      - boolean flag, not read anywhere in this script
conf = Optimist.options do
  banner "norm_german < <file w/ lowercased tokens>"
  opt :upper, "uppercase", type: :bool, default: false
  opt :threads, "#threads", type: :int, default: 1, short: '-h'
  opt :shard_size, "shard size", type: :int, default: 1000
  opt :train, "train", type: :bool
  opt :apply, "apply", type: :bool
end

# Spelling alternatives that get_key treats as interchangeable. The order
# inside a pair does not matter because get_key substitutes in both directions.
pairs_lower = [%w[ß ss], %w[ue ü], %w[ae ä], %w[oe ö]]
pairs_upper = [%w[Ä Ae], %w[Ö Oe], %w[Ü Ue]]

# With --upper only the lower-case pairs are active; without it both lists are.
# NOTE(review): this reads as if the branches might be swapped -- confirm the
# intended meaning of --upper before relying on it.
PAIRS = conf[:upper] ? pairs_lower : pairs_lower + pairs_upper

# Decide whether +new+ is a spelling variant of +old+.
#
# For every pair in PAIRS, all occurrences of one side are replaced by the
# other side in +new+ (tried in both directions); the first substitution that
# yields exactly +old+ counts as a match. Only one pair is applied at a time,
# so a token that needs two different substitutions does not match.
#
# Returns +old+ on a match, nil otherwise.
def get_key(old, new)
  PAIRS.each do |left, right|
    return old if new.gsub(left, right) == old || new.gsub(right, left) == old
  end
  nil
end

# Group +tokens+ into lists of spelling variants.
#
# Each token is compared (via get_key) against the keys collected so far, in
# insertion order. It is appended to the list of the first key it matches;
# if no key matches, the token becomes a new key whose list starts with the
# token itself. Repeated tokens are appended again, not de-duplicated.
#
# Returns a Hash mapping the first-seen spelling to the list of its variants.
def build_partial(tokens)
  tokens.each_with_object({}) do |tok, groups|
    key = groups.keys.find { |candidate| get_key(candidate, tok) }
    if key
      groups[key] << tok
    else
      groups[tok] = [tok]
    end
  end
end

# Read tokens from STDIN (one per line) and group them into variant lists.
#
# Tokens are buffered in shards of conf[:shard_size] lines. Each full shard is
# handed to its own thread, which groups it with build_partial and merges the
# result into the shared hash +h+ under +mutex+. Once conf[:threads] shards
# are in flight, all threads are joined and the shard buffers are recycled.
#
# NOTE(review): Hash#merge! replaces the list of a key that already exists in
# +h+, so variants gathered for the same key in an earlier shard are dropped.
# Confirm whether the lists should be concatenated instead.
h = {}
threads = []
thread_n = 0
counter = 0 # number of tokens read; not consumed anywhere in this script
token_stock = []
mutex = Mutex.new
while tok = STDIN.gets
  token_stock << [] if !token_stock[thread_n]
  # Use the non-destructive strip: strip! returns nil when there is nothing
  # to strip (e.g. a final line without a newline), which would put nil into
  # the shard and make get_key fail later.
  token_stock[thread_n] << tok.strip
  counter += 1
  # Keep filling the current shard until it is complete.
  next if token_stock[thread_n].size%conf[:shard_size]!=0

  STDERR.write "Starting thread ##{thread_n}\n"
  threads << Thread.new(token_stock[thread_n]) { |tokens|
    th = build_partial tokens
    mutex.synchronize do
      h.merge! th
    end
  }
  threads.last.abort_on_exception = true
  thread_n += 1
  if thread_n==conf[:threads]
    # All slots are busy: wait for every worker before reusing the buffers.
    threads.each { |i| i.join }
    threads.clear
    token_stock.each { |i| i.clear }
    thread_n = 0
  end
  # Workers may still be merging, so read the size under the same lock.
  STDERR.write "#keys #{mutex.synchronize { h.size }}\n"
end

# Wait for workers that were still running when the input ended, so the final
# merge below does not race with them.
threads.each { |i| i.join }

# Shards below thread_n were already merged by their threads; only the
# incomplete shard that never reached shard_size is left to process.
leftover = token_stock[thread_n]
h.merge! build_partial(leftover) if leftover && !leftover.empty?