diff options
author | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
commit | 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch) | |
tree | 5a06ee7de98640a39244b57bb369697176b44ebf /norm_german | |
parent | 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff) |
mv
Diffstat (limited to 'norm_german')
-rwxr-xr-x | norm_german | 87 |
1 files changed, 0 insertions, 87 deletions
#!/usr/bin/env ruby
#
# norm_german -- groups German tokens that differ only in how umlauts and
# eszett are spelled (e.g. "ue" vs. "ü", "ss" vs. "ß").
#
# Recovered from the deleted-file diff of norm_german (blob cf9c060,
# mode 100755).
#
# Usage: norm_german < <file w/ lowercased tokens>
#
# Tokens are read from STDIN, one per line, collected into shards of
# --shard_size tokens and grouped shard-by-shard in worker threads. The
# result is a hash mapping the first-seen spelling of a token to the list of
# all tokens judged equivalent to it. This version of the script only reports
# progress on STDERR; the hash itself is never written out, and the :train
# and :apply options are parsed but never read.

require 'thread'
require 'trollop'

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

conf = Trollop::options do
  banner "norm_german < <file w/ lowercased tokens>"
  opt :upper, "uppercase", :type => :bool, :default => false
  # Note that the short flag "-h" is bound to --threads here.
  opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
  opt :shard_size, "shard size", :type => :int, :default => 1000
  opt :train, "train", :type => :bool
  opt :apply, "apply", :type => :bool
end

pairs_lower = [['ß', 'ss'], ['ue', 'ü'], ['ae', 'ä'], ['oe', 'ö']]
pairs_upper = [['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue']]
# NOTE(review): with --upper only the lowercase pairs are used, without it
# both sets are. That looks inverted relative to the option name; behaviour
# is kept as it was -- confirm the intent before changing it.
PAIRS = (conf[:upper] ? pairs_lower : pairs_lower + pairs_upper).freeze

# Returns +old+ if +new+ becomes equal to +old+ after replacing one side of
# any single pair in PAIRS by the other side (both directions are tried);
# returns nil otherwise.
def get_key(old, new)
  PAIRS.each do |left, right|
    return old if new.gsub(left, right) == old || new.gsub(right, left) == old
  end
  nil
end

# Groups +tokens+ (an Array of Strings): each token is appended to the list
# of the first existing key that get_key accepts for it, or becomes a new key
# with itself as the only member. Returns the resulting Hash.
def build_partial(tokens)
  groups = {}
  tokens.each do |tok|
    key = groups.each_key.find { |candidate| get_key(candidate, tok) }
    if key
      groups[key] << tok
    else
      groups[tok] = [tok]
    end
  end
  groups
end

# Merges +partial+ into +target+ in place. When both hashes contain the same
# key the token lists are concatenated; a plain merge! would silently drop the
# tokens collected by earlier shards.
def merge_groups!(target, partial)
  target.merge!(partial) { |_key, existing, added| existing + added }
end

h = {}
threads = []
thread_n = 0
token_stock = []
mutex = Mutex.new

STDIN.each_line do |line|
  shard = (token_stock[thread_n] ||= [])
  # Non-destructive strip: String#strip! returns nil when there is nothing to
  # strip (e.g. a last line without trailing newline), which would put nil
  # into the shard and break get_key later on.
  shard << line.strip
  next unless shard.size % conf[:shard_size] == 0

  STDERR.write "Starting thread ##{thread_n}\n"
  worker = Thread.new(shard) do |tokens|
    partial = build_partial(tokens)
    mutex.synchronize { merge_groups!(h, partial) }
  end
  worker.abort_on_exception = true
  threads << worker
  thread_n += 1

  if thread_n == conf[:threads]
    # All worker slots are in use: wait for them, then recycle the shards.
    threads.each(&:join)
    threads.clear
    token_stock.each(&:clear)
    thread_n = 0
  end
  STDERR.write "#keys #{h.keys.size}\n"
end

# Wait for workers that are still running before touching h from the main
# thread; their shards (indices below thread_n) are already accounted for.
threads.each(&:join)

# Only the shard currently being filled has not been handed to a worker.
leftover = token_stock[thread_n]
merge_groups!(h, build_partial(leftover)) if leftover && !leftover.empty?