summary | refs | log | tree | commit | diff
path: root/norm_german
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-01-29 19:14:08 +0100
committer: Patrick Simianer <p@simianer.de> 2014-01-29 19:14:08 +0100
commit: 68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 (patch)
tree: 3b445131dcb203e94473ae1d8aa82a1798585276 /norm_german
parent: 49158e721bfaf6423dca9fc633873218f691c83a (diff)
make use of nlp_ruby, LICENSE
Diffstat (limited to 'norm_german')
-rwxr-xr-x norm_german 26
1 files changed, 10 insertions, 16 deletions
diff --git a/norm_german b/norm_german
index 57a37bb..ef0408e 100755
--- a/norm_german
+++ b/norm_german
@@ -3,17 +3,12 @@
require 'thread'
require 'trollop'
-
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-def usage
- STDERR.write "./avg [-r <d>] < <one number per line>\n"
- exit 1
-end
-usage if not [0,2,4].include? ARGV.size
-opts = Trollop::options do
+cfg = Trollop::options do
+ banner "norm_german < <file w/ lowercased tokens>"
opt :upper, "uppercase", :type => :bool, :default => false
opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
opt :shard_size, "shard size", :type => :int, :default => 1000
@@ -21,10 +16,9 @@ opts = Trollop::options do
opt :apply, "apply", :type => :bool
end
-
pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
-if opts[:upper]
+if cfg[:upper]
PAIRS = pairs_lower
else
PAIRS = pairs_lower+pairs_upper
@@ -46,7 +40,7 @@ def build_partial(tokens)
if get_key i, tok
h[i] << tok
found = true
- break
+ break
end
}
h[tok] = [tok] if !found
@@ -60,24 +54,24 @@ thread_n = 0
counter = 0
token_stock = []
mutex = Mutex.new
-while tok = STDIN.gets # expects stream of (lowercased) tokens
+while tok = STDIN.gets
token_stock << [] if !token_stock[thread_n]
token_stock[thread_n] << tok.strip!
counter += 1
- if token_stock[thread_n].size%opts[:shard_size]==0
+ if token_stock[thread_n].size%cfg[:shard_size]==0
STDERR.write "Starting thread ##{thread_n}\n"
threads << Thread.new(token_stock[thread_n]) { |tokens|
th = build_partial tokens
mutex.synchronize do
- h.merge! th
+ h.merge! th
end
}
threads.last.abort_on_exception = true
thread_n += 1
- else
+ else
next
end
- if thread_n==opts[:threads]
+ if thread_n==cfg[:threads]
threads.each { |i| i.join }
token_stock.each { |i| i.clear }
thread_n = 0
@@ -86,7 +80,7 @@ while tok = STDIN.gets # expects stream of (lowercased) tokens
end
token_stock.each { |i|
- if i.size!=0
+ if i.size!=0
h.merge! build_partial i
end
}