summary refs log tree commit diff
path: root/norm_german
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
committer Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
commit 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch)
tree 5a06ee7de98640a39244b57bb369697176b44ebf /norm_german
parent 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff)
mv
Diffstat (limited to 'norm_german')
-rwxr-xr-x norm_german 87
1 files changed, 0 insertions, 87 deletions
diff --git a/norm_german b/norm_german
deleted file mode 100755
index cf9c060..0000000
--- a/norm_german
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'thread'
-require 'trollop'
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-
-conf = Trollop::options do  # parse command-line flags into conf; read below as conf[:upper], conf[:threads], conf[:shard_size]
- banner "norm_german < <file w/ lowercased tokens>"
- opt :upper, "uppercase", :type => :bool, :default => false  # decides which replacement pairs end up in PAIRS
- opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
- opt :shard_size, "shard size", :type => :int, :default => 1000  # number of tokens collected before a worker thread is started
- opt :train, "train", :type => :bool  # declared but never read in this script
- opt :apply, "apply", :type => :bool  # declared but never read in this script
-end
-
-pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]  # spelling variants; get_key substitutes in both directions, so the order inside a pair does not matter
-pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]  # capitalized umlauts and their two-letter spellings
-if conf[:upper]
- PAIRS = pairs_lower  # NOTE(review): --upper leaves the uppercase pairs OUT; looks inverted relative to the flag name - confirm intent
-else
- PAIRS = pairs_lower+pairs_upper  # default (no --upper): both pair lists are active
-end
-
-def get_key(old, new)  # returns old if applying a single PAIRS replacement (in either direction) to new yields old, otherwise nil
- PAIRS.each { |i|
- return old if new.gsub(i[0], i[1])==old  # every i[0] in new replaced by i[1]; an unchanged new that already equals old matches too
- return old if new.gsub(i[1], i[0])==old  # the reverse replacement
- }
- return nil  # no single pair maps new onto old
-end
-
-def build_partial(tokens)  # groups tokens into a hash: first-seen spelling => array of all tokens that get_key matches to it
- h = {}
- tokens.each { |tok|
- found = false
- h.keys.each { |i|  # linear scan over every key collected so far
- if get_key i, tok
- h[i] << tok  # tok is a variant (or a repeat) of key i
- found = true
- break  # a token joins only the first key it matches
- end
- }
- h[tok] = [tok] if !found  # unmatched token starts its own group
- }
- return h
-end
-
-h = {}  # merged result of all partial hashes from build_partial
-threads = []
-thread_n = 0  # index of the shard slot currently being filled
-counter = 0  # incremented once per input line; not read anywhere else in this script
-token_stock = []  # one token array per shard slot
-mutex = Mutex.new  # serializes h.merge! across worker threads
-while tok = STDIN.gets
- token_stock << [] if !token_stock[thread_n]
- token_stock[thread_n] << tok.strip!  # NOTE(review): strip! returns nil when nothing is stripped (e.g. a last line without newline), so nil would be stored - confirm
- counter += 1
- if token_stock[thread_n].size%conf[:shard_size]==0  # current shard is full
- STDERR.write "Starting thread ##{thread_n}\n"
- threads << Thread.new(token_stock[thread_n]) { |tokens|
- th = build_partial tokens
- mutex.synchronize do
- h.merge! th  # for a key present in both hashes, merge! keeps th's array
- end
- }
- threads.last.abort_on_exception = true
- thread_n += 1
- else
- next  # shard not full yet: read the next token
- end
- if thread_n==conf[:threads]  # every shard slot has had a thread started
- threads.each { |i| i.join }
- token_stock.each { |i| i.clear }  # empty the shard arrays so they can be refilled
- thread_n = 0
- end
- STDERR.write "#keys #{h.keys.size}\n"
-end
-
-token_stock.each { |i|  # process whatever is still in the shard arrays; NOTE(review): shards handed to a not-yet-joined thread are not cleared and would be processed again here - confirm
- if i.size!=0
- h.merge! build_partial i
- end
-}
-