41 files changed, 311 insertions, 433 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0d5dab3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,7 @@
+Copyright (C) 2014 Patrick Simianer <p ät simianer.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README b/README
deleted file mode 100644
index 8ce273f..0000000
--- a/README
+++ /dev/null
@@ -1,2 +0,0 @@
-misc. nlp related scripts
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8b5b4ad
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+scripts
+=======
+
+A number of NLP related scripts.
+Some scripts require my nlp_ruby gem,
+see https://github.com/pks/nlp_ruby .
+
diff --git a/add_seg b/add_seg
index e661b40..684a236 100755
--- a/add_seg
+++ b/add_seg
@@ -2,30 +2,24 @@
 
 require 'trollop'
 
-
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-def usage
-  puts "addseg [--nogz] [--loo] [--grammar] <path to grammars dir>\n"
-  exit 1
-end
 
-opts = Trollop::options do
-  opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true
+cfg = Trollop::options do
+  opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :required => true
   opt :loo, "leave one out", :type => :bool, :default => false
   opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i'
   opt :nogz, "grammar files not gzipped", :type => :bool, :default => false
 end
 
-
-i = opts[:start_id]
+i = cfg[:start_id]
 while line = STDIN.gets
   ext = '.gz'
-  ext = '' if opts[:nogz]
+  ext = '' if cfg[:nogz]
   s = "<seg"
-  if opts[:loo] then s += " exclude=\"#{i}\"" end
-  if opts[:grammar] then s += " grammar=\"#{opts[:grammar]}/grammar.#{i}#{ext}\"" end
+  if cfg[:loo] then s += " exclude=\"#{i}\"" end
+  if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end
   puts s + " id=\"#{i}\"> #{line.strip} </seg>"
   i+=1
 end
diff --git a/add_start_end b/add_start_end
index a14a65e..30deaec 100755
--- a/add_start_end
+++ b/add_start_end
@@ -3,6 +3,7 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
+
 while line = STDIN.gets
   puts "<s> #{line.strip} </s>"
 end
diff --git a/avg b/avg
index cc4c0e6..ed31465 100755
--- a/avg
+++ b/avg
@@ -3,28 +3,22 @@
 require 'trollop'
 
 
-def usage
-  STDERR.write "./avg [-r <d>] < <one number per line>\n"
-  exit 1
-end
-usage if not [0,2].include? ARGV.size
-
-opts = Trollop::options do
+cfg = Trollop::options do # CLI options; the banner is the usage line printed by --help
+  banner "avg [-r <d>] < <one number per line>"
   opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
 end
 
-
 sum = 0.0
 i = 0
-while line=STDIN.gets
-  sum += line.strip.to_f
+while line = STDIN.gets
+  sum += line.to_f
   i +=1
 end
 
 avg = sum/i.to_f
 
-if opts[:round] >= 0
-  puts avg.round opts[:round]
+if cfg[:round] >= 0
+  puts avg.round cfg[:round]
 else
   puts avg
 end
diff --git a/avg_weights b/avg_weights
index 2b72747..71ffdd9 100755
--- a/avg_weights
+++ b/avg_weights
@@ -1,46 +1,34 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 require 'zlib'
 
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-opts = Trollop::options do
+cfg = Trollop::options do
+  opt :weights_files, "a number of weights files: name value", :required => true
   opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
 end
 
-def usage
-  puts "avg_weights_filter [--filter] <filename>+"
-  exit 1
-end
-usage if ARGV.size==0
-
-
 h = {}
 ARGV.each { |fn|
-if File.extname(fn)=='.gz'
-  f = Zlib::GzipReader.new(File.new(fn, 'rb'))
-else
-  f = File.new fn, 'r'
-end
-while line = f.gets
-  k, v = line.split
-  v = v.to_f
-  if h.has_key? k
-    h[k] << v
-  else
-    h[k] = [v]
+  f = ReadFile.new fn
+  while line = f.gets
+    k, v = line.split
+    v = v.to_f
+    if h.has_key? k
+      h[k] << v
+    else
+      h[k] = [v]
+    end
   end
-end
-f.close
+  f.close
 }
 
 n = ARGV.size.to_f
 
-h.each_pair { |k,a|
-  next if opts[:filter] and a.size < n
-  puts "#{k} #{a.inject(:+)/n}"
+h.each_pair { |k,w|
+  next if cfg[:filter] and w.size < n
+  puts "#{k} #{w.inject(:+)/n}"
 }
 
diff --git a/firstisupper b/firstisupper
index 4278334..516dd8a 100755
--- a/firstisupper
+++ b/firstisupper
@@ -1,8 +1,7 @@
 #!/usr/bin/env ruby
 
-def downcase?(string)
-  string[/[[:lower:]]/]
-end
+require 'nlp_ruby'
+
 
 while line = STDIN.gets
   puts line.strip if downcase? line[0]
diff --git a/htmlentities b/htmlentities
index ecbee3f..f3c2d34 100755
--- a/htmlentities
+++ b/htmlentities
@@ -2,10 +2,10 @@
 
 require 'htmlentities'
 
-
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
+
 coder = HTMLEntities.new
 
 while line = STDIN.gets
diff --git a/keycount b/keycount
index 15b4095..deaa522 100755
--- a/keycount
+++ b/keycount
@@ -1,11 +1,14 @@
 #!/usr/bin/env ruby
 
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
 h = {}
-h.default = 0 
+h.default = 0
 while line = STDIN.gets
-  line.strip! 
+  line.strip!
   h[line] += 1
 end
 
-h.each_pair {|k,v| puts "#{k} #{v}"}
+h.each_pair { |k,v| puts "#{k} #{v}" }
 
diff --git a/kmeans b/kmeans
index 89cc329..5c49d9a 100755
--- a/kmeans
+++ b/kmeans
@@ -1,141 +1,97 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-# {s:f} {s:f} => f
-def dot(x,y)
-  sum = 0.0
-  x.each_pair { |k,v| sum += v * y[k] }
-  return sum
-end
-
-# {s:f} => f
-def mag(x)
-  return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
-end
-
-# {s:f} {s:f} => f
-def cos_sim(x,y)
-  return dot(x,y)/(mag(x)*mag(y))
-end
-
-# {s:f} {s:f} => f
-def euclidian_dist(x,y)
-  dims = [x.keys, y.keys].flatten.uniq  
-  sum = 0.0
-  dims.each { |i| sum += (x[i] - y[i])**2 }
-  return Math.sqrt(sum)
-end
-
-# str => {s:{s:f}}
-def read(fn)
-  h = {}
-  f = File.new fn, 'r'
-  while line = f.gets
-    g = eval line 
-    h[g[0]] = g[1]
-    h[g[0]].default = 0.0
-  end
-  return h
+def read_data fn
+  data = {}
+  ReadFile.new(fn).readlines_strip.map{ |i|
+    a = i.split ' ', 2
+    data[a.first] = read_feature_string a.last
+  }
+  return data
 end
 
-# {s:{s:f}} i => [{s:f}]
-def rand_init(docs, k)
-  prng = Random.new 
-  return docs.keys.sample k, random:prng
+def rand_init data, k
+  prng = Random.new
+  return data.keys.sample k, random:prng
 end
 
-def rand_init2(docs, k)
-  prng = Random.new 
+def rand_means_init data, k
+  prng = Random.new
   a = []
   0.upto(k-1) do
-    a << mean(docs.values.sample k, random:prng)
+    a << mean_sparse_vector(data.values.sample k, random:prng)
   end
   return a
 end
 
-# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
-def assign(docs, centroids)
+def assign centroids, data
   assignment = {}
-  docs.each_pair { |name,feature_vector|
+  data.each_pair { |name,feature_vector|
       min = 1.0/0
       min_index = nil
-      centroids.each_with_index { |c,j|
-        dist = euclidian_dist(c, feature_vector)
-        if dist < min 
-          min = dist 
-          min_index = j
+      centroids.each_with_index { |c,i|
+        dist = c.euclidian_dist(feature_vector)
+        if dist < min
+          min = dist
+          min_index = i
         end
       }
       if assignment.has_key? min_index
-        assignment[min_index] << [name, feature_vector]
+        assignment[min_index] << name
       else
-        assignment[min_index] = [[name, feature_vector]]
+        assignment[min_index] = [name]
       end
   }
   return assignment
 end
 
-# [{s:f}] => {s:f}
-def mean(a)
-  res = {}
-  res.default = 0.0
-  a.each { |i|
-    i.each_pair { |k,v|
-      res[k] += v
-    }
-  }
-  n = a.size.to_f
-  res.each_pair { |k,v|
-    res[k] = v/n 
-  }
-end
-
-# {i:[{s:f}]} => [{s:f}]
-def update(assignment)
+def update assignment, data
   new_centroids = []
-  assignment.each_pair { |centroid,docs|
-    new_centroids << mean(docs.map{|i |i[1]}) 
+  assignment.each_pair { |centroid_index,a|
+    new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
   }
   return new_centroids
 end
 
 def main
-  opts = Trollop::options do
+  cfg = Trollop::options do
     opt :k, "k", :type => :int, :required => true
     opt :input, "input: one feature vector per line", :type => :string, :required => true
     opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
-    opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
+    opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
     opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
   end
-  docs = read opts[:input]
-  k = opts[:k]
+  # data is 'ID f1=v1 f2=v2'
+  data = read_data cfg[:input]
+  k = cfg[:k]
   centroids = nil
-  if opts[:init] == 1
-    centroids = rand_init(docs, k)
+  if cfg[:init] == 1
+    centroids = rand_init(data, k)
   else
-    centroids = rand_init2(docs, k)
+    centroids = rand_means_init(data, k)
   end
   STDERR.write "\n         k #{k}\n"
-  STDERR.write "     input #{opts[:input]}\n"
-  STDERR.write "iterations #{opts[:max_iterations]}\n"
-  STDERR.write "max no ch. #{opts[:max_no_change]}\n"
-  STDERR.write "      init #{opts[:init]}\n\n"
+  STDERR.write "     input #{cfg[:input]}\n"
+  STDERR.write "iterations #{cfg[:max_iterations]}\n"
+  STDERR.write "max no ch. #{cfg[:max_no_change]}\n"
+  STDERR.write "      init #{cfg[:init]}\n\n"
   assignment = nil
   prev_stats = []
   stats = []
   no_change = 0
   max_no_change = 5
-  STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
-  0.upto(opts[:max_iterations]) do |i|
+  STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
+  0.upto(cfg[:max_iterations]) do |i|
     s = "iteration #{i}"
     STDERR.write "#{s}\n#{'-'*s.size}\n"
-    assignment = assign(docs, centroids)
+    assignment = assign centroids, data
     sizes = []
-    assignment.each_pair { |centroid_index,docs|
-      sizes << docs.size
-    } 
+    assignment.each_pair { |centroid_index, a|
+      sizes << a.size
+    }
     median = sizes.sort[k/2]
     max = sizes.max
     min = sizes.min
@@ -148,12 +104,12 @@ def main
     STDERR.write "    min cluster sz=#{min}\n\n"
     if no_change == max_no_change
       STDERR.write "\nmax no change hit!\n\n"
-      assignment.each_pair { |centroid_index,docs| 
-        puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
+      assignment.each_pair { |centroid_index,a|
+        puts "#{centroid_index} #{a.to_s}"
       }
       break
     end
-    centroids = update(assignment)
+    centroids = update assignment, data
   end
 end
 
diff --git a/max b/max
index 506bd03..87f3c73 100755
--- a/max
+++ b/max
@@ -1,9 +1,11 @@
 #!/usr/bin/env ruby
 
+
 max = -1.0/0
 while line = STDIN.gets
-  v = line.strip.to_f 
+  v = line.to_f
   max = v if v > max
 end
+
 puts max
 
diff --git a/merge_files b/merge_files
index db9d5da..051ad6d 100755
--- a/merge_files
+++ b/merge_files
@@ -1,31 +1,31 @@
 #!/usr/bin/env ruby
 
-STDOUT.set_encoding 'utf-8'
+require 'nlp_ruby'
+
 
 def usage
-  STDERR.write "merge_files [file]+\n"
+  STDERR.write "merge_files <file>+\n"
   exit 1
 end
 usage if ARGV.size==0
 
-
 files = ARGV
-dicts = []
+hashes = []
 
 files.each { |i|
-  dicts.push Hash.new
-  dicts.last.default = 0
-  File.open i, "r:UTF-8" do |f|
-    while line = f.gets
-      dicts.last[line.strip] += 1
-    end
+  hashes.push Hash.new
+  hashes.last.default = 0
+  f = ReadFile.new i
+  while line = f.gets
+    hashes.last[line.strip] += 1
   end
+  f.close
 }
 
-dicts.each { |h|
+hashes.each { |h|
   h.each { |k,v|
     counts = []
-    dicts.each { |j| counts.push j[k]; j.delete k }
+    hashes.each { |j| counts.push j[k]; j.delete k }
     counts.max.times { puts k }
   }
 }
diff --git a/min b/min
index c2f85b9..398b0fb 100755
--- a/min
+++ b/min
@@ -1,9 +1,11 @@
 #!/usr/bin/env ruby
 
+
 min = 1.0/0
 while line = STDIN.gets
-  v = line.strip.to_f 
-  min = v if v < min
+  v = line.to_f
+  min = v if v<min
 end
+
 puts min
 
diff --git a/min_max b/min_max
index f27de88..653cde3 100755
--- a/min_max
+++ b/min_max
@@ -1,33 +1,26 @@
 #!/usr/bin/ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-def usage
-  puts "filter-min-max.rb --min <min> --max <max> --in_f <in f> --in_e <in e> --out_f <out f> --out_e <out e> --out_id <out ids>" 
-end
-usage if ARGV.size!=14
-
-opts = Trollop::options do
+cfg = Trollop::options do
   opt :min, "minimum #tokens", :type => :int, :default => 1
-  opt :max, "maximum #tokens", :type => :int, :default => 80
-  opt :in_f "input 'French' file", :type => string
-  opt :in_e "input 'English' file", :type => string
-  opt :out_f "output 'French' file", :type => string
-  opt :out_e "output 'English' file", :type => string
-  opt :out_id "output line Nos", :type => string
+  opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
+  opt :in_f, "input 'French' file", :type => :string, :required => true
+  opt :in_e, "input 'English' file", :type => :string, :required => true
+  opt :out_f, "output 'French' file", :type => :string, :required => true
+  opt :out_e, "output 'English' file", :type => :string, :required => true
+  opt :out_id, "output line Nos", :type => :string, :required => true
 end
 
 
 files = {}
-files[:f_file] = File.new opts[:in_f], 'r:UTF-8'
-files[:e_file] = File.new opts[:in_e], 'r:UTF-8'
-files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8'
-files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8'
-files[:id_out_file] = File.new opts[:out_id], 'w'
+files[:f_file] = ReadFile.new cfg[:in_f]
+files[:e_file] = ReadFile.new cfg[:in_e]
+files[:f_out_file] = WriteFile.new cfg[:out_f]
+files[:e_out_file] = WriteFile.new cfg[:out_e]
+files[:id_out_file] = WriteFile.new cfg[:out_id]
 i = 0
 while f_line = files[:f_file].gets
   e_line = files[:e_file].gets
@@ -35,13 +28,14 @@ while f_line = files[:f_file].gets
   e_line.strip!
   a = f_line.split
   b = e_line.split
-  if a.size >= opts[:min] and a.size <= opts[:max] and \
-      b.size >= opts[:min] and b.size <= opts[:max]
+  if a.size >= cfg[:min] and a.size <= cfg[:max] and \
+      b.size >= cfg[:min] and b.size <= cfg[:max]
     files[:f_out_file].write "#{f_line}\n"
     files[:e_out_file].write "#{e_line}\n"
     files[:id_out_file].write "#{i}\n"
-  end  
+  end
   i+=1
 end
-files.values.each{|f|f.close}
+
+files.values.each{ |f| f.close }
 
diff --git a/moses_1best b/moses_1best
index 5c6bf9d..1a0805d 100755
--- a/moses_1best
+++ b/moses_1best
@@ -1,13 +1,13 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+require 'nlp_ruby'
+
 
 prev_idx = nil
 while line = STDIN.gets
   line.strip!
-  idx = line.split('|||')[0].to_i
-  if idx != prev_idx 
+  idx = splitpipe(line)[0].to_i
+  if idx != prev_idx
     puts line
     prev_idx = idx
   end
diff --git a/mult b/mult
index eaead89..2ef0149 100755
--- a/mult
+++ b/mult
@@ -1,4 +1,8 @@
 #!/usr/bin/env ruby
 
-puts STDIN.gets.to_f * ARGV[0].to_f
+
+factor = ARGV[0].to_f
+while line = STDIN.gets
+  puts line.to_f * factor
+end
 
diff --git a/ng b/ng
index de314b8..dbc59eb 100755
--- a/ng
+++ b/ng
@@ -1,39 +1,19 @@
 #!/usr/bin/env ruby
 
-def ngrams_it(s, n, fix=false)
-  a = s.strip.split
-  a.each_with_index { |tok, i|
-    tok.strip!
-    0.upto([n-1, a.size-i-1].min) { |m|
-      yield a[i..i+m] if !(fix||(a[i..i+m].size>n))
-    }
-  }
-end
-
-def main(n, fix, sep)
-  STDIN.set_encoding 'utf-8'
-  STDOUT.set_encoding 'utf-8'
-  while line = STDIN.gets
-    a = []
-    ngrams_it(line, n, fix) {|ng| a << ng.join(' ')}
-    a.reject! {|i| i.strip.size==0 }
-    puts a.join sep if a.size > 0
-  end
-end
+require 'nlp_ruby'
+require 'trollop'
 
-def usage
-  STDERR.write "./ng [-n <n>] [--fix] [--separator <s>] < <one number per line>\n"
-  exit 1
+cfg = Trollop::options do # CLI options; the banner is the usage line printed by --help
+  banner "ng < <input>"
+  opt :n, "n for Ngrams", :type => :int, :default => 4
+  opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false
+  opt :separator, "separate ngrams of a line by this string", :type => :string, :default => "\n"
 end
 
-if __FILE__ == $0
-  require 'trollop'
-  opts = Trollop::options do
-    opt :n, "Ngrams", :type => :int, :default => 4
-    opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false
-    opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n"
-  end
-  usage if not [0,2,4,6].include? ARGV.size
-  main(opts[:n], opts[:fix], opts[:separator])
+while line = STDIN.gets
+  a = []
+  ngrams(line, cfg[:n], cfg[:fix]) { |ng| a << ng.join(' ') }
+  a.reject! { |i| i.strip.size==0 }
+  puts a.join cfg[:separator] if a.size>0
 end
 
diff --git a/no_empty b/no_empty
index ecdbcdf..cd825c0 100755
--- a/no_empty
+++ b/no_empty
@@ -1,12 +1,14 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
+
+
 files = []
-(0..1).each { |i| files << File.new(ARGV[i], 'r') }
-(2..3).each { |i| files << File.new(ARGV[i], 'w') }
-files.each { |f| f.set_encoding('utf-8') }
+(0..1).each { |i| files << ReadFile.new(ARGV[i]) }
+(2..3).each { |i| files << WriteFile.new(ARGV[i]) }
 
 while line_f = files[0].gets
-  line_e = files[1].gets 
+  line_e = files[1].gets
   line_f.strip!; line_e.strip!
   next if line_f=='' || line_e==''
   files[2].write line_f+"\n"
diff --git a/no_non_printables b/no_non_printables
index fda1e40..20d1e3d 100755
--- a/no_non_printables
+++ b/no_non_printables
@@ -1 +1,4 @@
+#!/bin/sh
+
 sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g'
+
diff --git a/norm_german b/norm_german
index 57a37bb..ef0408e 100755
--- a/norm_german
+++ b/norm_german
@@ -3,17 +3,12 @@
 require 'thread'
 require 'trollop'
 
-
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-def usage
-  STDERR.write "./avg [-r <d>] < <one number per line>\n"
-  exit 1
-end
-usage if not [0,2,4].include? ARGV.size
 
-opts = Trollop::options do
+cfg = Trollop::options do
+  banner "norm_german < <file w/ lowercased tokens>"
   opt :upper, "uppercase", :type => :bool, :default => false
   opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
   opt :shard_size, "shard size", :type => :int, :default => 1000
@@ -21,10 +16,9 @@ opts = Trollop::options do
   opt :apply, "apply", :type => :bool
 end
 
-
 pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
 pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
-if opts[:upper]
+if cfg[:upper]
   PAIRS = pairs_lower
 else
   PAIRS = pairs_lower+pairs_upper
@@ -46,7 +40,7 @@ def build_partial(tokens)
       if get_key i, tok
         h[i] << tok
         found = true
-        break 
+        break
       end
     }
     h[tok] = [tok] if !found
@@ -60,24 +54,24 @@ thread_n = 0
 counter = 0
 token_stock = []
 mutex = Mutex.new
-while tok = STDIN.gets # expects stream of (lowercased) tokens
+while tok = STDIN.gets
   token_stock << [] if !token_stock[thread_n]
   token_stock[thread_n] << tok.strip!
   counter += 1
-  if token_stock[thread_n].size%opts[:shard_size]==0
+  if token_stock[thread_n].size%cfg[:shard_size]==0
     STDERR.write "Starting thread ##{thread_n}\n"
     threads << Thread.new(token_stock[thread_n]) { |tokens|
       th = build_partial tokens
       mutex.synchronize do
-        h.merge! th 
+        h.merge! th
       end
     }
     threads.last.abort_on_exception = true
     thread_n += 1
-  else 
+  else
     next
   end
-  if thread_n==opts[:threads]
+  if thread_n==cfg[:threads]
     threads.each { |i|  i.join }
     token_stock.each { |i| i.clear }
     thread_n = 0
@@ -86,7 +80,7 @@ while tok = STDIN.gets # expects stream of (lowercased) tokens
 end
 
 token_stock.each { |i|
-  if i.size!=0  
+  if i.size!=0
     h.merge! build_partial i
   end
 }
diff --git a/num_tok b/num_tok
index a11b0d7..53b99a0 100755
--- a/num_tok
+++ b/num_tok
@@ -1,8 +1,10 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding('utf-8')
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
 
 while line = STDIN.gets
-  puts line.split.length
+  puts line.strip.split.length
 end
 
diff --git a/odd b/odd
index 0bd9336..93aaa80 100755
--- a/odd
+++ b/odd
@@ -3,6 +3,7 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
+
 i = 1
 while line = STDIN.gets
   puts line if i%2!=0
diff --git a/paste_pairs b/paste_pairs
index 6ede8f6..07c1f22 100755
--- a/paste_pairs
+++ b/paste_pairs
@@ -8,4 +8,4 @@ for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.a
   print linenr, (src_line.strip())
   print linenr, (tgt_line.strip())
   print
-  
+
diff --git a/per_sentence_bleu b/per_sentence_bleu
index c7c0b0e..724b1e1 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -1,29 +1,21 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-def ngrams_it(s, n, fix=false)
-  a = s.strip.split
-  a.each_with_index { |tok, i|
-    tok.strip!
-    0.upto([n-1, a.size-i-1].min) { |m|
-      yield a[i..i+m] if !(fix||(a[i..i+m].size>n))
-    }
-  }
-end
-
-def brevity_penalty hypothesis, reference
-  a = hypothesis.split; b = reference.split
-  return 1.0 if a.size>b.size
-  return Math.exp(1.0 - ((b.size.to_f+1)/a.size));
+# reference-length hack as in (Nakov et al., 2012)
+def brevity_penalty hypothesis, reference, hack=0
+  a = tokenize hypothesis; b = tokenize reference
+  return 1.0 if a.size>=b.size
+  return Math.exp(1.0 - ((b.size.to_f+hack)/a.size));
 end
 
-def per_sentence_bleu hypothesis, reference, n=4
+def per_sentence_bleu hypothesis, reference, n=4, hack=0
   h_ng = {}; r_ng = {}
   (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
-  ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i}
-  ngrams_it(reference, n) {|i| r_ng[i.size] << i}
+  ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
+  ngrams(reference, n) {|i| r_ng[i.size] << i}
   m = [n, reference.split.size].min
   weight = 1.0/m
   add = 0.0
@@ -35,31 +27,29 @@ def per_sentence_bleu hypothesis, reference, n=4
     add = 1.0 if i >= 2
     sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
   }
-  return brevity_penalty(hypothesis, reference) * Math.exp(sum)
+  return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum)
 end
 
 def main
-  opts = Trollop::options do
+  cfg = Trollop::options do
     opt :input, "input", :type => :string, :default => '-'
     opt :references, "references", :type => :string, :required => true
+    opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0
+    opt :n, "N", :default => 4
   end
- 
-  refs = File.new(opts[:references], 'r').readlines.map{|i|i.strip} 
+
+  refs = ReadFile.new(cfg[:references]).readlines_strip
   i = -1
-  if opts[:input] == '-'
-    input = STDIN 
-  else
-    input = File.new opts[:input], 'r'
-  end
+  input = ReadFile.new cfg[:input]
   while line = input.gets
     i += 1
     if line.strip == ''
       puts 0.0
       next
     end
-    puts per_sentence_bleu line.strip, refs[i]
+    puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack]
   end
-  input.close if opts[:input]!='-'
+  input.close
 end
 
 
diff --git a/preprocess b/preprocess
index bc6b5d2..4bf782a 100755
--- a/preprocess
+++ b/preprocess
@@ -1,4 +1,4 @@
-#!/bin/zsh
+#!/bin/bash
 
 LANG=$1
 /toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err
diff --git a/round b/round
index 52cd013..3dfbb6f 100755
--- a/round
+++ b/round
@@ -1,4 +1,8 @@
 #!/usr/bin/env ruby
 
-puts STDIN.gets.to_f.round ARGV[0].to_i
+
+r = ARGV[0].to_i
+while line = STDIN.gets
+  puts line.to_f.round r
+end
 
diff --git a/ruby_eval b/ruby_eval
index fe0d181..96b2ecb 100755
--- a/ruby_eval
+++ b/ruby_eval
@@ -1,5 +1,6 @@
 #!/usr/bin/env ruby
 
+
 while line = STDIN.gets
   puts "#{eval line}"
 end
diff --git a/rule_shapes b/rule_shapes
index 039b0dc..fd42249 100755
--- a/rule_shapes
+++ b/rule_shapes
@@ -3,11 +3,12 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
+
 def shape s
   res  = []
   in_t = false
   s.split.each { |i|
-    if i.match /\A\[X,\d\]\z/
+    if i.match(/\A\[X,\d\]\z/)
       if in_t
         in_t = false
       end
@@ -22,7 +23,7 @@ def shape s
 end
 
 while line = STDIN.gets
-  f,e = line.split "\t"
+  f, e = line.split(/\t/)
   f.strip!; e.strip!
   puts shape(f).join('_')+"-"+shape(e).join('_')
 end
diff --git a/sample b/sample
index b4706c6..e693d5c 100755
--- a/sample
+++ b/sample
@@ -2,23 +2,16 @@
 
 require 'trollop'
 
-
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-def usage
-  STDERR.write "./sample --size <n> < <line separated data>\n"
-  exit 1
-end
-usage if ARGV.size!=4
 
 opts = Trollop::options do
+  banner "sample --size <n> < <line separated data>"
   opt :size, "Sample n% (percentage).", :type => :int
 end
 
-
 prng = Random.new(Random.new_seed)
-
 while line = STDIN.gets
   STDOUT.write line if prng.rand(1..opts[:size])==0
 end
diff --git a/sample_n b/sample_n
index 2115407..286646b 100755
--- a/sample_n
+++ b/sample_n
@@ -3,20 +3,13 @@
 require 'trollop'
 
 
-def usage
-  STDERR.write "./sample --size <n> --population <n>\n"
-  exit 1
-end
-usage if ARGV.size!=4
-
 opts = Trollop::options do
+  banner "sample_n --size <n> --population <n>" # usage line printed by --help; names this script, not its sibling `sample`
   opt :size, "Sample size (percentage).", :type => :int
   opt :population, "'Population' (number \in N)", :type => :int
 end
 
-
 prng = Random.new(Random.new_seed)
-
 1.upto(opts[:population]) { |i|
   puts i if prng.rand(1..opts[:size])==0
 }
diff --git a/shard b/shard
index 7729699..f952104 100755
--- a/shard
+++ b/shard
@@ -12,11 +12,11 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
   index.shuffle! if rand
   shard_sz = lc / num_shards
   leftover = lc % num_shards
-  in_f = File.new input, 'r'
+  in_f = ReadFile.new input
   in_lines = in_f.readlines
-  refs_f = File.new refs, 'r'
+  refs_f = ReadFile.new refs
   refs_lines = refs_f.readlines
-  a_f = File.new alignments, 'r'
+  a_f = ReadFile.new alignments
   a_lines = a_f.readlines
   shard_in_files = []
   shard_refs_files = []
@@ -26,13 +26,13 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
   a_fns = []
   0.upto(num_shards-1) { |shard|
     in_fn = "#{output_prefix}.#{shard}.#{input_ext}"
-    shard_in = File.new in_fn, 'w+'
+    shard_in = WriteFile.new in_fn
     in_fns << in_fn
     refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}"
-    shard_refs = File.new refs_fn, 'w+'
+    shard_refs = WriteFile.new refs_fn
     refs_fns << refs_fn
     a_fn = "#{output_prefix}.#{shard}.a"
-    shard_a = File.new a_fn, 'w+'
+    shard_a = WriteFile.new a_fn
     a_fns << a_fn
     0.upto(shard_sz-1) { |i|
       j = index.pop
@@ -69,12 +69,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
 end
 
 opts = Trollop::options do
-  opt :input, 'input', :type => :string
-  opt :references, 'references', :type => :string
-  opt :alignments, 'alignments', :type => :string
-  opt :output_prefix, 'output prefix', :type => :string
+  opt :input, 'input', :type => :string, :required => true
+  opt :references, 'references', :type => :string, :required => true
+  opt :alignments, 'alignments', :type => :string, :required => true
+  opt :output_prefix, 'output prefix', :type => :string, :required => true
   opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
-  opt :num_shards, 'number of shards', :type => :int
+  opt :num_shards, 'number of shards', :type => :int, :required => true
 end
 
 make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
diff --git a/splitpipes b/splitpipes
index b0c3c9c..35ee176 100755
--- a/splitpipes
+++ b/splitpipes
@@ -2,24 +2,19 @@
 
 require 'trollop'
 
-
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-def usage
-  STDERR.write "splitpipes -f <n> < <input>\n"
-  exit 1
-end
-usage if ARGV.size!=2
 
-opts = Trollop::options do
+cfg = Trollop::options do
+  banner "splitpipes -f <n> < <input>"
   opt :field, "field", :type => :int
 end
 
 while line = STDIN.gets
   j = 1
   line.strip.split(' ||| ').each { |i|
-    if j == opts[:field]
+    if j == cfg[:field]
       puts i.strip
       break
     end
diff --git a/stddev b/stddev
index 3bf0270..891c4c9 100755
--- a/stddev
+++ b/stddev
@@ -3,22 +3,16 @@
 require 'trollop'
 
 
-def usage
-  STDERR.write "./stddev [-r <d>] < <one number per line>\n"
-  exit 1
-end
-usage if not [0,2].include? ARGV.size
-
-opts = Trollop::options do
+cfg = Trollop::options do
+  banner "stddev [-r <d>] < <one number per line>"
   opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
 end
 
-
 sum = 0.0
 i = 0
 cached = []
 while line=STDIN.gets
-  v = line.strip.to_f
+  v = line.to_f
   sum += v
   cached << v
   i +=1
@@ -33,8 +27,8 @@ cached.each { |v|
 
 stddev = Math.sqrt(var)
 
-if opts[:round] >= 0
-  puts stddev.round opts[:round]
+if cfg[:round] >= 0
+  puts stddev.round cfg[:round]
 else
   puts stddev
 end
diff --git a/strip_whitespace b/strips
index 37c02e5..11c00b4 100755
--- a/strip_whitespace
+++ b/strips
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
 while line = STDIN.gets
-  puts line.lstrip.strip
+  puts line.strip
 end
 
diff --git a/sum b/sum
index 3fca95e..dac72d3 100755
--- a/sum
+++ b/sum
@@ -1,8 +1,10 @@
 #!/usr/bin/env ruby
 
+
 sum = 0.0
 while line = STDIN.gets
-  sum += line.strip.to_f
+  sum += line.to_f
 end
+
 puts sum
 
diff --git a/test/kmeans/data b/test/kmeans/data
new file mode 100644
index 0000000..b5b3db2
--- /dev/null
+++ b/test/kmeans/data
@@ -0,0 +1,9 @@
+d00 feature_0=1.0 feature_1=0.5
+d01 feature_0=1.5 feature_1=0.4
+d02 feature_0=1.8 feature_1=0.3
+d10 feature_1=0.5 feature_2=1.0
+d11 feature_1=0.4 feature_2=2.0
+d12 feature_1=0.6 feature_2=1.5
+d20 feature_2=0.2 feature_3=1.0
+d21 feature_2=0.5 feature_3=2.0
+d22 feature_2=0.6 feature_3=3.0
diff --git a/tf-idf b/tf-idf
index 3edaaf8..e1502b3 100755
--- a/tf-idf
+++ b/tf-idf
@@ -1,68 +1,41 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-# returns word='raw frequency' for a single document
-def tf(d, stopwords=[])
-  v = {}; v.default = 0
-  d.uniq.each { |i|
-   next if stopwords.include? i
-   v[i] = d.count(i).to_f
-  }
-  return v
-end
-
-# smoothes raw frequencies
-def ntf(w, a=0.4)
-  max = w.values.max.to_f
-  w.each_pair { |k,v|
-    w[k] = a + (1-a)*(v/max)
-  }
-end
-
-# returns idf value for each word in vocab
-def idf(collection)
-  vocab = collection.values.flatten.uniq
-  n = collection.size.to_f
-  idf = {}
-  vocab.each { |i|
-    df = collection.values.flatten.count i
-    idf[i] = Math.log(n/df)
-  }
-  return idf
-end
-
 def main
-  opts = Trollop::options do
-    opt :docs, "input files (documents)", :type => :strings, :required => true
-    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
-    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
+  cfg = Trollop::options do
+    opt :documents, "input files (documents)", :type => :strings, :required => true
+    opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
+    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
     opt :ntf, "length-normalize tf values", :type => :bool
     opt :idf, "weight tf by idf", :type => :bool
   end
 
   stopwords = []
-  if opts[:filter_stopwords]
-    stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''}
+  if cfg[:filter_stopwords]
+    stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i|
+      i.split('|').first.strip
+    }.reject{ |i| i=='' }
   end
 
-  docs = {} # fn => [words...]
-  opts[:docs].each { |i|
-    if opts[:one_item_per_line]
-      docs[i] = File.new(i, 'r').readlines.map{|i| i.strip}
+  docs = {}
+  cfg[:documents].each { |i|
+    if cfg[:one_item_per_line]
+      docs[i] = ReadFile.new(i).readlines_strip
     else
-     docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip}
+     docs[i] = ReadFile.new(i).read.split # split on whitespace runs so repeated blanks/newlines yield no "" tokens
     end
   }
 
   idf_values = idf docs
 
   docs.each_pair { |name, words|
-    just_tf = tf(words)
-    just_tf = ntf(just_tf) if opts[:ntf]
+    just_tf = tf words, stopwords
+    just_tf = ntf(just_tf) if cfg[:ntf]
     tf_idf = {}; tf_idf.default = 0.0
-    if opts[:idf]
+    if cfg[:idf]
       just_tf.each_pair { |word,f|
         tf_idf[word] = idf_values[word] * f
       }
diff --git a/traintestsplit b/traintestsplit
index 7ec52ae..7cc5bcf 100755
--- a/traintestsplit
+++ b/traintestsplit
@@ -1,55 +1,51 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-def main
-  opts = Trollop::options do
-    opt :foreign, "foreign file", :type => :string, :required => true
-    opt :english, "english file", :type => :string, :required => true
-    opt :size, "one size", :type => :int, :required => true
-    opt :repeat, "number of repetitions", :type => :int, :default => 1
-    opt :prefix, "prefix for output files", :type => :string
-  end
-  fn = opts[:foreign]
-  fn_ext = fn.split('.').last
-  f = File.new(fn, 'r').readlines
-  en = opts[:english]
-  en_ext = en.split('.').last
-  e = File.new(en, 'r').readlines
-  size = opts[:size]
-  nlines_f = `wc -l #{fn}`.split()[0].to_i
-  nlines_e = `wc -l #{en}`.split()[0].to_i
-  if nlines_f!=nlines_e 
-    STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
-    exit 1
-  end
-
-  prefix = opts[:prefix]
-  a = (0..nlines_e-1).to_a
-  i = 0
-  opts[:repeat].times {
-    b = a.sample(size)
-    ax = a.reject{|j| b.include? j}
-    `mkdir split_#{i}`
-    new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+'
-    new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+'
-    ax.each { |j|
-      new_f.write f[j]
-      new_e.write e[j]
-    }
-    new_f.close; new_e.close
-    new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+'
-    new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+'
-    b.each { |j|
-      new_f.write f[j]
-      new_e.write e[j]
-    }
-    new_f.close; new_e.close
-    i += 1
-  }
+cfg = Trollop::options do
+  opt :foreign, "foreign file", :type => :string, :required => true
+  opt :english, "english file", :type => :string, :required => true
+  opt :size, "one size", :type => :int, :required => true
+  opt :repeat, "number of repetitions", :type => :int, :default => 1
+  opt :prefix, "prefix for output files", :type => :string
+end
+fn = cfg[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.new(fn).readlines
+en = cfg[:english]
+en_ext = en.split('.').last
+e = ReadFile.new(en).readlines
+size = cfg[:size]
+nlines_f = f.size # count the lines actually read; no shell-out, safe for any file name
+nlines_e = e.size # (wc -l would also miss a final line that lacks a trailing newline)
+if nlines_f != nlines_e
+  STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+  exit 1
 end
 
-
-main
+prefix = cfg[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+cfg[:repeat].times {
+  b = a.sample(size)
+  ax = a.reject{|j| b.include? j}
+  `mkdir split_#{i}`
+  new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}")
+  new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}")
+  ax.each { |j|
+    new_f.write f[j]
+    new_e.write e[j]
+  }
+  new_f.close; new_e.close
+  new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}")
+  new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}")
+  b.each { |j|
+    new_f.write f[j]
+    new_e.write e[j]
+  }
+  new_f.close; new_e.close
+  i += 1
+}
 
diff --git a/var b/var
index 08b75b6..30c638a 100755
--- a/var
+++ b/var
@@ -3,13 +3,8 @@
 require 'trollop'
 
 
-def usage
-  STDERR.write "./stddev [-r <d>] < <one number per line>\n"
-  exit 1
-end
-usage if not [0,2].include? ARGV.size
-
-opts = Trollop::options do
+cfg = Trollop::options do
+  banner "var [-r <d>] < <one number per line>"
   opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
 end
 
@@ -18,10 +13,10 @@ sum = 0.0
 i = 0
 cached = []
 while line=STDIN.gets
-  v = line.strip.to_f
+  v = line.to_f
   sum += v
   cached << v
-  i +=1 
+  i +=1
 end
 
 avg = sum/i.to_f
@@ -31,8 +26,8 @@ cached.each { |v|
   var += (avg - v)**2
 }
 
-if opts[:round] >= 0 
-  puts var.round opts[:round]
+if cfg[:round] >= 0
+  puts var.round cfg[:round]
 else
   puts var
 end
diff --git a/wrap-xml.perl b/wrap-xml.perl
index d29065a..06303b7 100755
--- a/wrap-xml.perl
+++ b/wrap-xml.perl
@@ -1,5 +1,6 @@
 #!/usr/bin/perl -w
 # original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl
+# (licensed under LGPL)
 
 use strict;