summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--LICENSE7
-rw-r--r--README2
-rw-r--r--README.md7
-rwxr-xr-xadd_seg18
-rwxr-xr-xadd_start_end1
-rwxr-xr-xavg18
-rwxr-xr-xavg_weights44
-rwxr-xr-xfirstisupper5
-rwxr-xr-xhtmlentities2
-rwxr-xr-xkeycount9
-rwxr-xr-xkmeans138
-rwxr-xr-xmax4
-rwxr-xr-xmerge_files24
-rwxr-xr-xmin6
-rwxr-xr-xmin_max42
-rwxr-xr-xmoses_1best8
-rwxr-xr-xmult6
-rwxr-xr-xng44
-rwxr-xr-xno_empty10
-rwxr-xr-xno_non_printables3
-rwxr-xr-xnorm_german26
-rwxr-xr-xnum_tok6
-rwxr-xr-xodd1
-rwxr-xr-xpaste_pairs2
-rwxr-xr-xper_sentence_bleu46
-rwxr-xr-xpreprocess2
-rwxr-xr-xround6
-rwxr-xr-xruby_eval1
-rwxr-xr-xrule_shapes5
-rwxr-xr-xsample9
-rwxr-xr-xsample_n9
-rwxr-xr-xshard22
-rwxr-xr-xsplitpipes11
-rwxr-xr-xstddev16
-rwxr-xr-xstrips (renamed from strip_whitespace)2
-rwxr-xr-xsum4
-rw-r--r--test/kmeans/data9
-rwxr-xr-xtf-idf61
-rwxr-xr-xtraintestsplit90
-rwxr-xr-xvar17
-rwxr-xr-xwrap-xml.perl1
41 files changed, 311 insertions, 433 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0d5dab3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,7 @@
+Copyright (C) 2014 Patrick Simianer <p ät simianer.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README b/README
deleted file mode 100644
index 8ce273f..0000000
--- a/README
+++ /dev/null
@@ -1,2 +0,0 @@
-misc. nlp related scripts
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8b5b4ad
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+scripts
+=======
+
+A number of NLP related scripts.
+Some scripts require my rubynlp gem,
+see https://github.com/pks/nlp_ruby .
+
diff --git a/add_seg b/add_seg
index e661b40..684a236 100755
--- a/add_seg
+++ b/add_seg
@@ -2,30 +2,24 @@
require 'trollop'
-
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-def usage
- puts "addseg [--nogz] [--loo] [--grammar] <path to grammars dir>\n"
- exit 1
-end
-opts = Trollop::options do
- opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true
+cfg = Trollop::options do
+ opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :required => true
opt :loo, "leave one out", :type => :bool, :default => false
opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i'
opt :nogz, "grammar files not gzipped", :type => :bool, :default => false
end
-
-i = opts[:start_id]
+i = cfg[:start_id]
while line = STDIN.gets
ext = '.gz'
- ext = '' if opts[:nogz]
+ ext = '' if cfg[:nogz]
s = "<seg"
- if opts[:loo] then s += " exclude=\"#{i}\"" end
- if opts[:grammar] then s += " grammar=\"#{opts[:grammar]}/grammar.#{i}#{ext}\"" end
+ if cfg[:loo] then s += " exclude=\"#{i}\"" end
+ if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end
puts s + " id=\"#{i}\"> #{line.strip} </seg>"
i+=1
end
diff --git a/add_start_end b/add_start_end
index a14a65e..30deaec 100755
--- a/add_start_end
+++ b/add_start_end
@@ -3,6 +3,7 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
+
while line = STDIN.gets
puts "<s> #{line.strip} </s>"
end
diff --git a/avg b/avg
index cc4c0e6..ed31465 100755
--- a/avg
+++ b/avg
@@ -3,28 +3,22 @@
require 'trollop'
-def usage
- STDERR.write "./avg [-r <d>] < <one number per line>\n"
- exit 1
-end
-usage if not [0,2].include? ARGV.size
-
-opts = Trollop::options do
+cfg = Trollop::options do
+ banner "avg < <one number per line>"
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
end
-
sum = 0.0
i = 0
-while line=STDIN.gets
- sum += line.strip.to_f
+while line = STDIN.gets
+ sum += line.to_f
i +=1
end
avg = sum/i.to_f
-if opts[:round] >= 0
- puts avg.round opts[:round]
+if cfg[:round] >= 0
+ puts avg.round cfg[:round]
else
puts avg
end
diff --git a/avg_weights b/avg_weights
index 2b72747..71ffdd9 100755
--- a/avg_weights
+++ b/avg_weights
@@ -1,46 +1,34 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
require 'zlib'
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-opts = Trollop::options do
+cfg = Trollop::options do
+ opt :weights_files, "a number of weights files: name value", :required => true
opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
end
-def usage
- puts "avg_weights_filter [--filter] <filename>+"
- exit 1
-end
-usage if ARGV.size==0
-
-
h = {}
ARGV.each { |fn|
-if File.extname(fn)=='.gz'
- f = Zlib::GzipReader.new(File.new(fn, 'rb'))
-else
- f = File.new fn, 'r'
-end
-while line = f.gets
- k, v = line.split
- v = v.to_f
- if h.has_key? k
- h[k] << v
- else
- h[k] = [v]
+ f = ReadFile.new fn
+ while line = f.gets
+ k, v = line.split
+ v = v.to_f
+ if h.has_key? k
+ h[k] << v
+ else
+ h[k] = [v]
+ end
end
-end
-f.close
+ f.close
}
n = ARGV.size.to_f
-h.each_pair { |k,a|
- next if opts[:filter] and a.size < n
- puts "#{k} #{a.inject(:+)/n}"
+h.each_pair { |k,w|
+ next if cfg[:filter] and w.size < n
+ puts "#{k} #{w.inject(:+)/n}"
}
diff --git a/firstisupper b/firstisupper
index 4278334..516dd8a 100755
--- a/firstisupper
+++ b/firstisupper
@@ -1,8 +1,7 @@
#!/usr/bin/env ruby
-def downcase?(string)
- string[/[[:lower:]]/]
-end
+require 'nlp_ruby'
+
while line = STDIN.gets
puts line.strip if downcase? line[0]
diff --git a/htmlentities b/htmlentities
index ecbee3f..f3c2d34 100755
--- a/htmlentities
+++ b/htmlentities
@@ -2,10 +2,10 @@
require 'htmlentities'
-
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
+
coder = HTMLEntities.new
while line = STDIN.gets
diff --git a/keycount b/keycount
index 15b4095..deaa522 100755
--- a/keycount
+++ b/keycount
@@ -1,11 +1,14 @@
#!/usr/bin/env ruby
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
h = {}
-h.default = 0
+h.default = 0
while line = STDIN.gets
- line.strip!
+ line.strip!
h[line] += 1
end
-h.each_pair {|k,v| puts "#{k} #{v}"}
+h.each_pair { |k,v| puts "#{k} #{v}" }
diff --git a/kmeans b/kmeans
index 89cc329..5c49d9a 100755
--- a/kmeans
+++ b/kmeans
@@ -1,141 +1,97 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
-# {s:f} {s:f} => f
-def dot(x,y)
- sum = 0.0
- x.each_pair { |k,v| sum += v * y[k] }
- return sum
-end
-
-# {s:f} => f
-def mag(x)
- return Math.sqrt x.values.inject { |sum,i| sum+i**2 }
-end
-
-# {s:f} {s:f} => f
-def cos_sim(x,y)
- return dot(x,y)/(mag(x)*mag(y))
-end
-
-# {s:f} {s:f} => f
-def euclidian_dist(x,y)
- dims = [x.keys, y.keys].flatten.uniq
- sum = 0.0
- dims.each { |i| sum += (x[i] - y[i])**2 }
- return Math.sqrt(sum)
-end
-
-# str => {s:{s:f}}
-def read(fn)
- h = {}
- f = File.new fn, 'r'
- while line = f.gets
- g = eval line
- h[g[0]] = g[1]
- h[g[0]].default = 0.0
- end
- return h
+def read_data fn
+ data = {}
+ ReadFile.new(fn).readlines_strip.map{ |i|
+ a = i.split ' ', 2
+ data[a.first] = read_feature_string a.last
+ }
+ return data
end
-# {s:{s:f}} i => [{s:f}]
-def rand_init(docs, k)
- prng = Random.new
- return docs.keys.sample k, random:prng
+def rand_init data, k
+ prng = Random.new
+ return data.keys.sample k, random:prng
end
-def rand_init2(docs, k)
- prng = Random.new
+def rand_means_init data, k
+ prng = Random.new
a = []
0.upto(k-1) do
- a << mean(docs.values.sample k, random:prng)
+ a << mean_sparse_vector(data.values.sample k, random:prng)
end
return a
end
-# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
-def assign(docs, centroids)
+def assign centroids, data
assignment = {}
- docs.each_pair { |name,feature_vector|
+ data.each_pair { |name,feature_vector|
min = 1.0/0
min_index = nil
- centroids.each_with_index { |c,j|
- dist = euclidian_dist(c, feature_vector)
- if dist < min
- min = dist
- min_index = j
+ centroids.each_with_index { |c,i|
+ dist = c.euclidian_dist(feature_vector)
+ if dist < min
+ min = dist
+ min_index = i
end
}
if assignment.has_key? min_index
- assignment[min_index] << [name, feature_vector]
+ assignment[min_index] << name
else
- assignment[min_index] = [[name, feature_vector]]
+ assignment[min_index] = [name]
end
}
return assignment
end
-# [{s:f}] => {s:f}
-def mean(a)
- res = {}
- res.default = 0.0
- a.each { |i|
- i.each_pair { |k,v|
- res[k] += v
- }
- }
- n = a.size.to_f
- res.each_pair { |k,v|
- res[k] = v/n
- }
-end
-
-# {i:[{s:f}]} => [{s:f}]
-def update(assignment)
+def update assignment, data
new_centroids = []
- assignment.each_pair { |centroid,docs|
- new_centroids << mean(docs.map{|i |i[1]})
+ assignment.each_pair { |centroid_index,a|
+ new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
}
return new_centroids
end
def main
- opts = Trollop::options do
+ cfg = Trollop::options do
opt :k, "k", :type => :int, :required => true
opt :input, "input: one feature vector per line", :type => :string, :required => true
opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
- opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
+ opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
end
- docs = read opts[:input]
- k = opts[:k]
+ # data is 'ID f1=v1 f2=v2'
+ data = read_data cfg[:input]
+ k = cfg[:k]
centroids = nil
- if opts[:init] == 1
- centroids = rand_init(docs, k)
+ if cfg[:init] == 1
+ centroids = rand_init(data, k)
else
- centroids = rand_init2(docs, k)
+ centroids = rand_means_init(data, k)
end
STDERR.write "\n k #{k}\n"
- STDERR.write " input #{opts[:input]}\n"
- STDERR.write "iterations #{opts[:max_iterations]}\n"
- STDERR.write "max no ch. #{opts[:max_no_change]}\n"
- STDERR.write " init #{opts[:init]}\n\n"
+ STDERR.write " input #{cfg[:input]}\n"
+ STDERR.write "iterations #{cfg[:max_iterations]}\n"
+ STDERR.write "max no ch. #{cfg[:max_no_change]}\n"
+ STDERR.write " init #{cfg[:init]}\n\n"
assignment = nil
prev_stats = []
stats = []
no_change = 0
max_no_change = 5
- STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
- 0.upto(opts[:max_iterations]) do |i|
+ STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
+ 0.upto(cfg[:max_iterations]) do |i|
s = "iteration #{i}"
STDERR.write "#{s}\n#{'-'*s.size}\n"
- assignment = assign(docs, centroids)
+ assignment = assign centroids, data
sizes = []
- assignment.each_pair { |centroid_index,docs|
- sizes << docs.size
- }
+ assignment.each_pair { |centroid_index, a|
+ sizes << a.size
+ }
median = sizes.sort[k/2]
max = sizes.max
min = sizes.min
@@ -148,12 +104,12 @@ def main
STDERR.write " min cluster sz=#{min}\n\n"
if no_change == max_no_change
STDERR.write "\nmax no change hit!\n\n"
- assignment.each_pair { |centroid_index,docs|
- puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}"
+ assignment.each_pair { |centroid_index,a|
+ puts "#{centroid_index} #{a.to_s}"
}
break
end
- centroids = update(assignment)
+ centroids = update assignment, data
end
end
diff --git a/max b/max
index 506bd03..87f3c73 100755
--- a/max
+++ b/max
@@ -1,9 +1,11 @@
#!/usr/bin/env ruby
+
max = -1.0/0
while line = STDIN.gets
- v = line.strip.to_f
+ v = line.to_f
max = v if v > max
end
+
puts max
diff --git a/merge_files b/merge_files
index db9d5da..051ad6d 100755
--- a/merge_files
+++ b/merge_files
@@ -1,31 +1,31 @@
#!/usr/bin/env ruby
-STDOUT.set_encoding 'utf-8'
+require 'nlp_ruby'
+
def usage
- STDERR.write "merge_files [file]+\n"
+ STDERR.write "merge_files <file>+\n"
exit 1
end
usage if ARGV.size==0
-
files = ARGV
-dicts = []
+hashes = []
files.each { |i|
- dicts.push Hash.new
- dicts.last.default = 0
- File.open i, "r:UTF-8" do |f|
- while line = f.gets
- dicts.last[line.strip] += 1
- end
+ hashes.push Hash.new
+ hashes.last.default = 0
+ f = ReadFile.new i
+ while line = f.gets
+ hashes.last[line.strip] += 1
end
+ f.close
}
-dicts.each { |h|
+hashes.each { |h|
h.each { |k,v|
counts = []
- dicts.each { |j| counts.push j[k]; j.delete k }
+ hashes.each { |j| counts.push j[k]; j.delete k }
counts.max.times { puts k }
}
}
diff --git a/min b/min
index c2f85b9..398b0fb 100755
--- a/min
+++ b/min
@@ -1,9 +1,11 @@
#!/usr/bin/env ruby
+
min = 1.0/0
while line = STDIN.gets
- v = line.strip.to_f
- min = v if v < min
+ v = line.to_f
+ min = v if v<min
end
+
puts min
diff --git a/min_max b/min_max
index f27de88..653cde3 100755
--- a/min_max
+++ b/min_max
@@ -1,33 +1,26 @@
#!/usr/bin/ruby
+require 'nlp_ruby'
require 'trollop'
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-def usage
- puts "filter-min-max.rb --min <min> --max <max> --in_f <in f> --in_e <in e> --out_f <out f> --out_e <out e> --out_id <out ids>"
-end
-usage if ARGV.size!=14
-
-opts = Trollop::options do
+cfg = Trollop::options do
opt :min, "minimum #tokens", :type => :int, :default => 1
- opt :max, "maximum #tokens", :type => :int, :default => 80
- opt :in_f "input 'French' file", :type => string
- opt :in_e "input 'English' file", :type => string
- opt :out_f "output 'French' file", :type => string
- opt :out_e "output 'English' file", :type => string
- opt :out_id "output line Nos", :type => string
+ opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
+ opt :in_f, "input 'French' file", :type => :string, :required => true
+ opt :in_e, "input 'English' file", :type => :string, :required => true
+ opt :out_f, "output 'French' file", :type => :string, :required => true
+ opt :out_e, "output 'English' file", :type => :string, :required => true
+ opt :out_id, "output line Nos", :type => :string, :required => true
end
files = {}
-files[:f_file] = File.new opts[:in_f], 'r:UTF-8'
-files[:e_file] = File.new opts[:in_e], 'r:UTF-8'
-files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8'
-files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8'
-files[:id_out_file] = File.new opts[:out_id], 'w'
+files[:f_file] = ReadFile.new cfg[:in_f]
+files[:e_file] = ReadFile.new cfg[:in_e]
+files[:f_out_file] = WriteFile.new cfg[:out_f]
+files[:e_out_file] = WriteFile.new cfg[:out_e]
+files[:id_out_file] = WriteFile.new cfg[:out_id]
i = 0
while f_line = files[:f_file].gets
e_line = files[:e_file].gets
@@ -35,13 +28,14 @@ while f_line = files[:f_file].gets
e_line.strip!
a = f_line.split
b = e_line.split
- if a.size >= opts[:min] and a.size <= opts[:max] and \
- b.size >= opts[:min] and b.size <= opts[:max]
+ if a.size >= cfg[:min] and a.size <= cfg[:max] and \
+ b.size >= cfg[:min] and b.size <= cfg[:max]
files[:f_out_file].write "#{f_line}\n"
files[:e_out_file].write "#{e_line}\n"
files[:id_out_file].write "#{i}\n"
- end
+ end
i+=1
end
-files.values.each{|f|f.close}
+
+files.values.each{ |f| f.close }
diff --git a/moses_1best b/moses_1best
index 5c6bf9d..1a0805d 100755
--- a/moses_1best
+++ b/moses_1best
@@ -1,13 +1,13 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+require 'nlp_ruby'
+
prev_idx = nil
while line = STDIN.gets
line.strip!
- idx = line.split('|||')[0].to_i
- if idx != prev_idx
+ idx = splitpipe(line)[0].to_i
+ if idx != prev_idx
puts line
prev_idx = idx
end
diff --git a/mult b/mult
index eaead89..2ef0149 100755
--- a/mult
+++ b/mult
@@ -1,4 +1,8 @@
#!/usr/bin/env ruby
-puts STDIN.gets.to_f * ARGV[0].to_f
+
+factor = ARGV[0].to_f
+while line = STDIN.gets
+ puts line.to_f * factor
+end
diff --git a/ng b/ng
index de314b8..dbc59eb 100755
--- a/ng
+++ b/ng
@@ -1,39 +1,19 @@
#!/usr/bin/env ruby
-def ngrams_it(s, n, fix=false)
- a = s.strip.split
- a.each_with_index { |tok, i|
- tok.strip!
- 0.upto([n-1, a.size-i-1].min) { |m|
- yield a[i..i+m] if !(fix||(a[i..i+m].size>n))
- }
- }
-end
-
-def main(n, fix, sep)
- STDIN.set_encoding 'utf-8'
- STDOUT.set_encoding 'utf-8'
- while line = STDIN.gets
- a = []
- ngrams_it(line, n, fix) {|ng| a << ng.join(' ')}
- a.reject! {|i| i.strip.size==0 }
- puts a.join sep if a.size > 0
- end
-end
+require 'nlp_ruby'
+require 'trollop'
-def usage
- STDERR.write "./ng [-n <n>] [--fix] [--separator <s>] < <one number per line>\n"
- exit 1
+cfg = Trollop::options do
+ banner "ng < <input>"
+ opt :n, "n for Ngrams", :type => :int, :default => 4
+ opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false
+ opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n"
end
-if __FILE__ == $0
- require 'trollop'
- opts = Trollop::options do
- opt :n, "Ngrams", :type => :int, :default => 4
- opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false
- opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n"
- end
- usage if not [0,2,4,6].include? ARGV.size
- main(opts[:n], opts[:fix], opts[:separator])
+while line = STDIN.gets
+ a = []
+ ngrams(line, cfg[:n], cfg[:fix]) { |ng| a << ng.join(' ') }
+ a.reject! { |i| i.strip.size==0 }
+ puts a.join cfg[:separator] if a.size>0
end
diff --git a/no_empty b/no_empty
index ecdbcdf..cd825c0 100755
--- a/no_empty
+++ b/no_empty
@@ -1,12 +1,14 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
+
+
files = []
-(0..1).each { |i| files << File.new(ARGV[i], 'r') }
-(2..3).each { |i| files << File.new(ARGV[i], 'w') }
-files.each { |f| f.set_encoding('utf-8') }
+(0..1).each { |i| files << ReadFile.new(ARGV[i]) }
+(2..3).each { |i| files << WriteFile.new(ARGV[i]) }
while line_f = files[0].gets
- line_e = files[1].gets
+ line_e = files[1].gets
line_f.strip!; line_e.strip!
next if line_f=='' || line_e==''
files[2].write line_f+"\n"
diff --git a/no_non_printables b/no_non_printables
index fda1e40..20d1e3d 100755
--- a/no_non_printables
+++ b/no_non_printables
@@ -1 +1,4 @@
+#!/bin/sh
+
sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g'
+
diff --git a/norm_german b/norm_german
index 57a37bb..ef0408e 100755
--- a/norm_german
+++ b/norm_german
@@ -3,17 +3,12 @@
require 'thread'
require 'trollop'
-
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-def usage
- STDERR.write "./avg [-r <d>] < <one number per line>\n"
- exit 1
-end
-usage if not [0,2,4].include? ARGV.size
-opts = Trollop::options do
+cfg = Trollop::options do
+ banner "norm_german < <file w/ lowercased tokens>"
opt :upper, "uppercase", :type => :bool, :default => false
opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
opt :shard_size, "shard size", :type => :int, :default => 1000
@@ -21,10 +16,9 @@ opts = Trollop::options do
opt :apply, "apply", :type => :bool
end
-
pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
-if opts[:upper]
+if cfg[:upper]
PAIRS = pairs_lower
else
PAIRS = pairs_lower+pairs_upper
@@ -46,7 +40,7 @@ def build_partial(tokens)
if get_key i, tok
h[i] << tok
found = true
- break
+ break
end
}
h[tok] = [tok] if !found
@@ -60,24 +54,24 @@ thread_n = 0
counter = 0
token_stock = []
mutex = Mutex.new
-while tok = STDIN.gets # expects stream of (lowercased) tokens
+while tok = STDIN.gets
token_stock << [] if !token_stock[thread_n]
token_stock[thread_n] << tok.strip!
counter += 1
- if token_stock[thread_n].size%opts[:shard_size]==0
+ if token_stock[thread_n].size%cfg[:shard_size]==0
STDERR.write "Starting thread ##{thread_n}\n"
threads << Thread.new(token_stock[thread_n]) { |tokens|
th = build_partial tokens
mutex.synchronize do
- h.merge! th
+ h.merge! th
end
}
threads.last.abort_on_exception = true
thread_n += 1
- else
+ else
next
end
- if thread_n==opts[:threads]
+ if thread_n==cfg[:threads]
threads.each { |i| i.join }
token_stock.each { |i| i.clear }
thread_n = 0
@@ -86,7 +80,7 @@ while tok = STDIN.gets # expects stream of (lowercased) tokens
end
token_stock.each { |i|
- if i.size!=0
+ if i.size!=0
h.merge! build_partial i
end
}
diff --git a/num_tok b/num_tok
index a11b0d7..53b99a0 100755
--- a/num_tok
+++ b/num_tok
@@ -1,8 +1,10 @@
#!/usr/bin/env ruby
-STDIN.set_encoding('utf-8')
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
while line = STDIN.gets
- puts line.split.length
+ puts line.strip.split.length
end
diff --git a/odd b/odd
index 0bd9336..93aaa80 100755
--- a/odd
+++ b/odd
@@ -3,6 +3,7 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
+
i = 1
while line = STDIN.gets
puts line if i%2!=0
diff --git a/paste_pairs b/paste_pairs
index 6ede8f6..07c1f22 100755
--- a/paste_pairs
+++ b/paste_pairs
@@ -8,4 +8,4 @@ for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.a
print linenr, (src_line.strip())
print linenr, (tgt_line.strip())
print
-
+
diff --git a/per_sentence_bleu b/per_sentence_bleu
index c7c0b0e..724b1e1 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -1,29 +1,21 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
-def ngrams_it(s, n, fix=false)
- a = s.strip.split
- a.each_with_index { |tok, i|
- tok.strip!
- 0.upto([n-1, a.size-i-1].min) { |m|
- yield a[i..i+m] if !(fix||(a[i..i+m].size>n))
- }
- }
-end
-
-def brevity_penalty hypothesis, reference
- a = hypothesis.split; b = reference.split
- return 1.0 if a.size>b.size
- return Math.exp(1.0 - ((b.size.to_f+1)/a.size));
+# reference-length hack as in (Nakov et al., 2012)
+def brevity_penalty hypothesis, reference, hack=0
+ a = tokenize hypothesis; b = tokenize reference
+ return 1.0 if a.size>=b.size
+ return Math.exp(1.0 - ((b.size.to_f+hack)/a.size));
end
-def per_sentence_bleu hypothesis, reference, n=4
+def per_sentence_bleu hypothesis, reference, n=4, hack=0
h_ng = {}; r_ng = {}
(1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
- ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i}
- ngrams_it(reference, n) {|i| r_ng[i.size] << i}
+ ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
+ ngrams(reference, n) {|i| r_ng[i.size] << i}
m = [n, reference.split.size].min
weight = 1.0/m
add = 0.0
@@ -35,31 +27,29 @@ def per_sentence_bleu hypothesis, reference, n=4
add = 1.0 if i >= 2
sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
}
- return brevity_penalty(hypothesis, reference) * Math.exp(sum)
+ return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum)
end
def main
- opts = Trollop::options do
+ cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
opt :references, "references", :type => :string, :required => true
+ opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0
+ opt :n, "N", :default => 4
end
-
- refs = File.new(opts[:references], 'r').readlines.map{|i|i.strip}
+
+ refs = ReadFile.new(cfg[:references]).readlines_strip
i = -1
- if opts[:input] == '-'
- input = STDIN
- else
- input = File.new opts[:input], 'r'
- end
+ input = ReadFile.new cfg[:input]
while line = input.gets
i += 1
if line.strip == ''
puts 0.0
next
end
- puts per_sentence_bleu line.strip, refs[i]
+ puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack]
end
- input.close if opts[:input]!='-'
+ input.close
end
diff --git a/preprocess b/preprocess
index bc6b5d2..4bf782a 100755
--- a/preprocess
+++ b/preprocess
@@ -1,4 +1,4 @@
-#!/bin/zsh
+#!/bin/bash
LANG=$1
/toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err
diff --git a/round b/round
index 52cd013..3dfbb6f 100755
--- a/round
+++ b/round
@@ -1,4 +1,8 @@
#!/usr/bin/env ruby
-puts STDIN.gets.to_f.round ARGV[0].to_i
+
+r = ARGV[0].to_i
+while line = STDIN.gets
+ puts line.to_f.round r
+end
diff --git a/ruby_eval b/ruby_eval
index fe0d181..96b2ecb 100755
--- a/ruby_eval
+++ b/ruby_eval
@@ -1,5 +1,6 @@
#!/usr/bin/env ruby
+
while line = STDIN.gets
puts "#{eval line}"
end
diff --git a/rule_shapes b/rule_shapes
index 039b0dc..fd42249 100755
--- a/rule_shapes
+++ b/rule_shapes
@@ -3,11 +3,12 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
+
def shape s
res = []
in_t = false
s.split.each { |i|
- if i.match /\A\[X,\d\]\z/
+ if i.match(/\A\[X,\d\]\z/)
if in_t
in_t = false
end
@@ -22,7 +23,7 @@ def shape s
end
while line = STDIN.gets
- f,e = line.split "\t"
+ f, e = line.split(/\t/)
f.strip!; e.strip!
puts shape(f).join('_')+"-"+shape(e).join('_')
end
diff --git a/sample b/sample
index b4706c6..e693d5c 100755
--- a/sample
+++ b/sample
@@ -2,23 +2,16 @@
require 'trollop'
-
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-def usage
- STDERR.write "./sample --size <n> < <line separated data>\n"
- exit 1
-end
-usage if ARGV.size!=4
opts = Trollop::options do
+ banner "sample --size <n> < <line separated data>"
opt :size, "Sample n% (percentage).", :type => :int
end
-
prng = Random.new(Random.new_seed)
-
while line = STDIN.gets
STDOUT.write line if prng.rand(1..opts[:size])==0
end
diff --git a/sample_n b/sample_n
index 2115407..286646b 100755
--- a/sample_n
+++ b/sample_n
@@ -3,20 +3,13 @@
require 'trollop'
-def usage
- STDERR.write "./sample --size <n> --population <n>\n"
- exit 1
-end
-usage if ARGV.size!=4
-
opts = Trollop::options do
+ banner "sample --size <n> --population <n>"
opt :size, "Sample size (percentage).", :type => :int
opt :population, "'Population' (number \in N)", :type => :int
end
-
prng = Random.new(Random.new_seed)
-
1.upto(opts[:population]) { |i|
puts i if prng.rand(1..opts[:size])==0
}
diff --git a/shard b/shard
index 7729699..f952104 100755
--- a/shard
+++ b/shard
@@ -12,11 +12,11 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
index.shuffle! if rand
shard_sz = lc / num_shards
leftover = lc % num_shards
- in_f = File.new input, 'r'
+ in_f = ReadFile.new input
in_lines = in_f.readlines
- refs_f = File.new refs, 'r'
+ refs_f = ReadFile.new refs
refs_lines = refs_f.readlines
- a_f = File.new alignments, 'r'
+ a_f = ReadFile.new alignments
a_lines = a_f.readlines
shard_in_files = []
shard_refs_files = []
@@ -26,13 +26,13 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
a_fns = []
0.upto(num_shards-1) { |shard|
in_fn = "#{output_prefix}.#{shard}.#{input_ext}"
- shard_in = File.new in_fn, 'w+'
+ shard_in = WriteFile.new in_fn
in_fns << in_fn
refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}"
- shard_refs = File.new refs_fn, 'w+'
+ shard_refs = WriteFile.new refs_fn
refs_fns << refs_fn
a_fn = "#{output_prefix}.#{shard}.a"
- shard_a = File.new a_fn, 'w+'
+ shard_a = WriteFile.new a_fn
a_fns << a_fn
0.upto(shard_sz-1) { |i|
j = index.pop
@@ -69,12 +69,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
end
opts = Trollop::options do
- opt :input, 'input', :type => :string
- opt :references, 'references', :type => :string
- opt :alignments, 'alignments', :type => :string
- opt :output_prefix, 'output prefix', :type => :string
+ opt :input, 'input', :type => :string, :required => true
+ opt :references, 'references', :type => :string, :required => true
+ opt :alignments, 'alignments', :type => :string, :required => true
+ opt :output_prefix, 'output prefix', :type => :string, :required => true
opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
- opt :num_shards, 'number of shards', :type => :int
+ opt :num_shards, 'number of shards', :type => :int, :required => true
end
make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
diff --git a/splitpipes b/splitpipes
index b0c3c9c..35ee176 100755
--- a/splitpipes
+++ b/splitpipes
@@ -2,24 +2,19 @@
require 'trollop'
-
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-def usage
- STDERR.write "splitpipes -f <n> < <input>\n"
- exit 1
-end
-usage if ARGV.size!=2
-opts = Trollop::options do
+cfg = Trollop::options do
+ banner "splitpipes -f <n> < <input>"
opt :field, "field", :type => :int
end
while line = STDIN.gets
j = 1
line.strip.split(' ||| ').each { |i|
- if j == opts[:field]
+ if j == cfg[:field]
puts i.strip
break
end
diff --git a/stddev b/stddev
index 3bf0270..891c4c9 100755
--- a/stddev
+++ b/stddev
@@ -3,22 +3,16 @@
require 'trollop'
-def usage
- STDERR.write "./stddev [-r <d>] < <one number per line>\n"
- exit 1
-end
-usage if not [0,2].include? ARGV.size
-
-opts = Trollop::options do
+cfg = Trollop::options do
+ banner "stddev [-r <d>] < <one number per line>"
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
end
-
sum = 0.0
i = 0
cached = []
while line=STDIN.gets
- v = line.strip.to_f
+ v = line.to_f
sum += v
cached << v
i +=1
@@ -33,8 +27,8 @@ cached.each { |v|
stddev = Math.sqrt(var)
-if opts[:round] >= 0
- puts stddev.round opts[:round]
+if cfg[:round] >= 0
+ puts stddev.round cfg[:round]
else
puts stddev
end
diff --git a/strip_whitespace b/strips
index 37c02e5..11c00b4 100755
--- a/strip_whitespace
+++ b/strips
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
while line = STDIN.gets
- puts line.lstrip.strip
+ puts line.strip
end
diff --git a/sum b/sum
index 3fca95e..dac72d3 100755
--- a/sum
+++ b/sum
@@ -1,8 +1,10 @@
#!/usr/bin/env ruby
+
sum = 0.0
while line = STDIN.gets
- sum += line.strip.to_f
+ sum += line.to_f
end
+
puts sum
diff --git a/test/kmeans/data b/test/kmeans/data
new file mode 100644
index 0000000..b5b3db2
--- /dev/null
+++ b/test/kmeans/data
@@ -0,0 +1,9 @@
+d00 feature_0=1.0 feature_1=0.5
+d01 feature_0=1.5 feature_1=0.4
+d02 feature_0=1.8 feature_1=0.3
+d10 feature_1=0.5 feature_2=1.0
+d11 feature_1=0.4 feature_2=2.0
+d12 feature_1=0.6 feature_2=1.5
+d20 feature_2=0.2 feature_3=1.0
+d21 feature_2=0.5 feature_3=2.0
+d22 feature_2=0.6 feature_3=3.0
diff --git a/tf-idf b/tf-idf
index 3edaaf8..e1502b3 100755
--- a/tf-idf
+++ b/tf-idf
@@ -1,68 +1,41 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
-# returns word='raw frequency' for a single document
-def tf(d, stopwords=[])
- v = {}; v.default = 0
- d.uniq.each { |i|
- next if stopwords.include? i
- v[i] = d.count(i).to_f
- }
- return v
-end
-
-# smoothes raw frequencies
-def ntf(w, a=0.4)
- max = w.values.max.to_f
- w.each_pair { |k,v|
- w[k] = a + (1-a)*(v/max)
- }
-end
-
-# returns idf value for each word in vocab
-def idf(collection)
- vocab = collection.values.flatten.uniq
- n = collection.size.to_f
- idf = {}
- vocab.each { |i|
- df = collection.values.flatten.count i
- idf[i] = Math.log(n/df)
- }
- return idf
-end
-
def main
- opts = Trollop::options do
- opt :docs, "input files (documents)", :type => :strings, :required => true
- opt :filter_stopwords, "filter stopwords (give file)", :type => :string
- opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
+ cfg = Trollop::options do
+ opt :documents, "input files (documents)", :type => :strings, :required => true
+ opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
+ opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
opt :ntf, "length-normalize tf values", :type => :bool
opt :idf, "weight tf by idf", :type => :bool
end
stopwords = []
- if opts[:filter_stopwords]
- stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''}
+ if cfg[:filter_stopwords]
+ stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i|
+ i.split('|').first.strip
+ }.reject{ |i| i=='' }
end
- docs = {} # fn => [words...]
- opts[:docs].each { |i|
- if opts[:one_item_per_line]
- docs[i] = File.new(i, 'r').readlines.map{|i| i.strip}
+ docs = {}
+ cfg[:documents].each { |i|
+ if cfg[:one_item_per_line]
+ docs[i] = ReadFile.new(i).readlines_strip
else
- docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip}
+ docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip }
end
}
idf_values = idf docs
docs.each_pair { |name, words|
- just_tf = tf(words)
- just_tf = ntf(just_tf) if opts[:ntf]
+ just_tf = tf words, stopwords
+ just_tf = ntf(just_tf) if cfg[:ntf]
tf_idf = {}; tf_idf.default = 0.0
- if opts[:idf]
+ if cfg[:idf]
just_tf.each_pair { |word,f|
tf_idf[word] = idf_values[word] * f
}
diff --git a/traintestsplit b/traintestsplit
index 7ec52ae..7cc5bcf 100755
--- a/traintestsplit
+++ b/traintestsplit
@@ -1,55 +1,51 @@
#!/usr/bin/env ruby
+require 'nlp_ruby'
require 'trollop'
-def main
- opts = Trollop::options do
- opt :foreign, "foreign file", :type => :string, :required => true
- opt :english, "english file", :type => :string, :required => true
- opt :size, "one size", :type => :int, :required => true
- opt :repeat, "number of repetitions", :type => :int, :default => 1
- opt :prefix, "prefix for output files", :type => :string
- end
- fn = opts[:foreign]
- fn_ext = fn.split('.').last
- f = File.new(fn, 'r').readlines
- en = opts[:english]
- en_ext = en.split('.').last
- e = File.new(en, 'r').readlines
- size = opts[:size]
- nlines_f = `wc -l #{fn}`.split()[0].to_i
- nlines_e = `wc -l #{en}`.split()[0].to_i
- if nlines_f!=nlines_e
- STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
- exit 1
- end
-
- prefix = opts[:prefix]
- a = (0..nlines_e-1).to_a
- i = 0
- opts[:repeat].times {
- b = a.sample(size)
- ax = a.reject{|j| b.include? j}
- `mkdir split_#{i}`
- new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+'
- new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+'
- ax.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+'
- new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+'
- b.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- i += 1
- }
+cfg = Trollop::options do
+ opt :foreign, "foreign file", :type => :string, :required => true
+ opt :english, "english file", :type => :string, :required => true
+ opt :size, "one size", :type => :int, :required => true
+ opt :repeat, "number of repetitions", :type => :int, :default => 1
+ opt :prefix, "prefix for output files", :type => :string
+end
+fn = cfg[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.new(fn).readlines
+en = cfg[:english]
+en_ext = en.split('.').last
+e = ReadFile.new(en).readlines
+size = cfg[:size]
+nlines_f = `wc -l #{fn}`.split()[0].to_i
+nlines_e = `wc -l #{en}`.split()[0].to_i
+if nlines_f != nlines_e
+ STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+ exit 1
end
-
-main
+prefix = cfg[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+cfg[:repeat].times {
+ b = a.sample(size)
+ ax = a.reject{|j| b.include? j}
+ `mkdir split_#{i}`
+ new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}")
+ new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}")
+ ax.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}")
+ new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}")
+ b.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ i += 1
+}
diff --git a/var b/var
index 08b75b6..30c638a 100755
--- a/var
+++ b/var
@@ -3,13 +3,8 @@
require 'trollop'
-def usage
- STDERR.write "./stddev [-r <d>] < <one number per line>\n"
- exit 1
-end
-usage if not [0,2].include? ARGV.size
-
-opts = Trollop::options do
+cfg = Trollop::options do
+ banner "var [-r <d>] < <one number per line>"
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
end
@@ -18,10 +13,10 @@ sum = 0.0
i = 0
cached = []
while line=STDIN.gets
- v = line.strip.to_f
+ v = line.to_f
sum += v
cached << v
- i +=1
+ i +=1
end
avg = sum/i.to_f
@@ -31,8 +26,8 @@ cached.each { |v|
var += (avg - v)**2
}
-if opts[:round] >= 0
- puts var.round opts[:round]
+if cfg[:round] >= 0
+ puts var.round cfg[:round]
else
puts var
end
diff --git a/wrap-xml.perl b/wrap-xml.perl
index d29065a..06303b7 100755
--- a/wrap-xml.perl
+++ b/wrap-xml.perl
@@ -1,5 +1,6 @@
#!/usr/bin/perl -w
# original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl
+# (licensed under LGPL)
use strict;