diff options
author | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
commit | 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch) | |
tree | 5a06ee7de98640a39244b57bb369697176b44ebf /norm-german | |
parent | 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff) |
mv
Diffstat (limited to 'norm-german')
-rwxr-xr-x | norm-german | 87 |
1 files changed, 87 insertions, 0 deletions
#!/usr/bin/env ruby

# norm-german: reads one token per line from STDIN and groups tokens that are
# spelling variants of each other under German umlaut/eszett transliteration
# (e.g. 'ue' <-> 'ü', 'ß' <-> 'ss').
#
# Tokens are collected into shards of --shard-size tokens; each full shard is
# grouped in its own thread, and up to --threads shards are processed per
# batch. The per-shard groupings are merged into one hash `h` mapping a
# representative token to the list of tokens grouped under it.
#
# NOTE(review): the visible script only builds `h`; it never prints it, and
# the :train / :apply options are parsed but not used here.

require 'thread'
require 'trollop'

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

conf = Trollop::options do
  banner "norm_german < <file w/ lowercased tokens>"
  opt :upper, "uppercase", :type => :bool, :default => false
  # NOTE(review): '-h' is conventionally the short flag for help; confirm that
  # using it for --threads is intended.
  opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
  opt :shard_size, "shard size", :type => :int, :default => 1000
  opt :train, "train", :type => :bool
  opt :apply, "apply", :type => :bool
end

# A non-positive shard size would make the modulo check below raise
# ZeroDivisionError (or never dispatch a shard), so reject it up front.
Trollop::die :shard_size, "must be positive" if conf[:shard_size] < 1
Trollop::die :threads, "must be positive" if conf[:threads] < 1

pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
# NOTE(review): with --upper only the lowercase pairs are used, and without it
# both sets are used. This looks inverted relative to the option name; kept as
# in the original -- confirm the intended behaviour.
if conf[:upper]
  PAIRS = pairs_lower
else
  PAIRS = pairs_lower+pairs_upper
end

# Decides whether `new` is a spelling variant of `old`.
#
# For every pair in PAIRS, `new` is rewritten in both directions (first
# element -> second and second -> first); if either rewrite equals `old`,
# the tokens are considered equivalent. Identical tokens always match.
#
# Returns `old` on a match, nil otherwise.
def get_key(old, new)
  matched = PAIRS.any? { |a, b|
    new.gsub(a, b)==old || new.gsub(b, a)==old
  }
  matched ? old : nil
end

# Groups `tokens` into a hash { representative => [tokens...] }.
#
# Each token is appended to the first existing group whose key matches it
# according to get_key; if none matches, the token starts a new group keyed
# by itself. Repeated tokens are kept (appended again).
def build_partial(tokens)
  tokens.each_with_object({}) { |tok, groups|
    key = groups.each_key.find { |k| get_key k, tok }
    if key
      groups[key] << tok
    else
      groups[tok] = [tok]
    end
  }
end

h = {}
threads = []
thread_n = 0
token_stock = []
mutex = Mutex.new

# Merges a per-shard grouping into the global one. When the same key occurs
# in several shards the token lists are concatenated; a plain merge! would
# replace the earlier list and silently drop its tokens.
merge_partial = lambda { |partial|
  h.merge!(partial) { |_key, have, add| have + add }
}

while tok = STDIN.gets
  token_stock[thread_n] ||= []
  # Non-destructive strip: strip! returns nil when nothing was stripped
  # (e.g. a last line without a trailing newline), which would put nil into
  # the shard and crash later in get_key.
  token_stock[thread_n] << tok.strip
  next unless token_stock[thread_n].size%conf[:shard_size]==0

  # The current shard is full: hand it to a worker thread.
  STDERR.write "Starting thread ##{thread_n}\n"
  threads << Thread.new(token_stock[thread_n]) { |tokens|
    th = build_partial tokens
    mutex.synchronize do
      merge_partial.call th
    end
  }
  threads.last.abort_on_exception = true
  thread_n += 1

  if thread_n==conf[:threads]
    # Batch complete: wait for all workers, then recycle the shard buffers.
    threads.each { |i| i.join }
    threads.clear
    token_stock.each { |i| i.clear }
    thread_n = 0
  end
  # Workers of an unfinished batch may still be merging; read under the lock.
  n_keys = mutex.synchronize { h.keys.size }
  STDERR.write "#keys #{n_keys}\n"
end

# Wait for workers of a final, incomplete batch before touching `h` or the
# shard buffers from the main thread.
threads.each { |i| i.join }
threads.clear

# Only the shard at thread_n has not been dispatched; shards before it in
# the current batch were already handled by the workers joined above.
rest = token_stock[thread_n]
if rest && rest.size!=0
  merge_partial.call build_partial(rest)
end