summaryrefslogtreecommitdiff
path: root/norm_german
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2013-12-05 07:56:38 +0100
committerPatrick Simianer <p@simianer.de>2013-12-05 07:56:38 +0100
commitdb6a6ecfa350cae29739c59df1210d8f76a479c9 (patch)
treef137a001f57f170455c28ce97b5abb2726006cf6 /norm_german
init
Diffstat (limited to 'norm_german')
-rwxr-xr-xnorm_german93
1 files changed, 93 insertions, 0 deletions
diff --git a/norm_german b/norm_german
new file mode 100755
index 0000000..57a37bb
--- /dev/null
+++ b/norm_german
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+
+require 'thread'
+require 'trollop'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+def usage
+ STDERR.write "./avg [-r <d>] < <one number per line>\n"
+ exit 1
+end
+usage if not [0,2,4].include? ARGV.size
+
+opts = Trollop::options do
+ opt :upper, "uppercase", :type => :bool, :default => false
+ opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
+ opt :shard_size, "shard size", :type => :int, :default => 1000
+ opt :train, "train", :type => :bool
+ opt :apply, "apply", :type => :bool
+end
+
+
+pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
+pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
+if opts[:upper]
+ PAIRS = pairs_lower
+else
+ PAIRS = pairs_lower+pairs_upper
+end
+
+def get_key(old, new)
+ PAIRS.each { |i|
+ return old if new.gsub(i[0], i[1])==old
+ return old if new.gsub(i[1], i[0])==old
+ }
+ return nil
+end
+
+def build_partial(tokens)
+ h = {}
+ tokens.each { |tok|
+ found = false
+ h.keys.each { |i|
+ if get_key i, tok
+ h[i] << tok
+ found = true
+ break
+ end
+ }
+ h[tok] = [tok] if !found
+ }
+ return h
+end
+
+h = {}
+threads = []
+thread_n = 0
+counter = 0
+token_stock = []
+mutex = Mutex.new
+while tok = STDIN.gets # expects stream of (lowercased) tokens
+ token_stock << [] if !token_stock[thread_n]
+ token_stock[thread_n] << tok.strip!
+ counter += 1
+ if token_stock[thread_n].size%opts[:shard_size]==0
+ STDERR.write "Starting thread ##{thread_n}\n"
+ threads << Thread.new(token_stock[thread_n]) { |tokens|
+ th = build_partial tokens
+ mutex.synchronize do
+ h.merge! th
+ end
+ }
+ threads.last.abort_on_exception = true
+ thread_n += 1
+ else
+ next
+ end
+ if thread_n==opts[:threads]
+ threads.each { |i| i.join }
+ token_stock.each { |i| i.clear }
+ thread_n = 0
+ end
+ STDERR.write "#keys #{h.keys.size}\n"
+end
+
+token_stock.each { |i|
+ if i.size!=0
+ h.merge! build_partial i
+ end
+}
+