diff options
-rw-r--r-- | LICENSE | 7 | ||||
-rw-r--r-- | README | 2 | ||||
-rw-r--r-- | README.md | 7 | ||||
-rwxr-xr-x | add_seg | 18 | ||||
-rwxr-xr-x | add_start_end | 1 | ||||
-rwxr-xr-x | avg | 18 | ||||
-rwxr-xr-x | avg_weights | 44 | ||||
-rwxr-xr-x | firstisupper | 5 | ||||
-rwxr-xr-x | htmlentities | 2 | ||||
-rwxr-xr-x | keycount | 9 | ||||
-rwxr-xr-x | kmeans | 138 | ||||
-rwxr-xr-x | max | 4 | ||||
-rwxr-xr-x | merge_files | 24 | ||||
-rwxr-xr-x | min | 6 | ||||
-rwxr-xr-x | min_max | 42 | ||||
-rwxr-xr-x | moses_1best | 8 | ||||
-rwxr-xr-x | mult | 6 | ||||
-rwxr-xr-x | ng | 44 | ||||
-rwxr-xr-x | no_empty | 10 | ||||
-rwxr-xr-x | no_non_printables | 3 | ||||
-rwxr-xr-x | norm_german | 26 | ||||
-rwxr-xr-x | num_tok | 6 | ||||
-rwxr-xr-x | odd | 1 | ||||
-rwxr-xr-x | paste_pairs | 2 | ||||
-rwxr-xr-x | per_sentence_bleu | 46 | ||||
-rwxr-xr-x | preprocess | 2 | ||||
-rwxr-xr-x | round | 6 | ||||
-rwxr-xr-x | ruby_eval | 1 | ||||
-rwxr-xr-x | rule_shapes | 5 | ||||
-rwxr-xr-x | sample | 9 | ||||
-rwxr-xr-x | sample_n | 9 | ||||
-rwxr-xr-x | shard | 22 | ||||
-rwxr-xr-x | splitpipes | 11 | ||||
-rwxr-xr-x | stddev | 16 | ||||
-rwxr-xr-x | strips (renamed from strip_whitespace) | 2 | ||||
-rwxr-xr-x | sum | 4 | ||||
-rw-r--r-- | test/kmeans/data | 9 | ||||
-rwxr-xr-x | tf-idf | 61 | ||||
-rwxr-xr-x | traintestsplit | 90 | ||||
-rwxr-xr-x | var | 17 | ||||
-rwxr-xr-x | wrap-xml.perl | 1 |
41 files changed, 311 insertions, 433 deletions
@@ -0,0 +1,7 @@ +Copyright (C) 2014 Patrick Simianer <p ät simianer.de> + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. @@ -1,2 +0,0 @@ -misc. nlp related scripts - diff --git a/README.md b/README.md new file mode 100644 index 0000000..8b5b4ad --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +scripts +======= + +A number of NLP related scripts. +Some scripts require my rubynlp gem, +see https://github.com/pks/nlp_ruby . + @@ -2,30 +2,24 @@ require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - puts "addseg [--nogz] [--loo] [--grammar] <path to grammars dir>\n" - exit 1 -end -opts = Trollop::options do - opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true +cfg = Trollop::options do + opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :required => true opt :loo, "leave one out", :type => :bool, :default => false opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i' opt :nogz, "grammar files not gzipped", :type => :bool, :default => false end - -i = opts[:start_id] +i = cfg[:start_id] while line = STDIN.gets ext = '.gz' - ext = '' if opts[:nogz] + ext = '' if cfg[:nogz] s = "<seg" - if opts[:loo] then s += " exclude=\"#{i}\"" end - if opts[:grammar] then s += " grammar=\"#{opts[:grammar]}/grammar.#{i}#{ext}\"" end + if cfg[:loo] then s += " exclude=\"#{i}\"" end + if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end puts s + " id=\"#{i}\"> #{line.strip} </seg>" i+=1 end diff --git a/add_start_end b/add_start_end index a14a65e..30deaec 100755 --- a/add_start_end +++ b/add_start_end @@ -3,6 +3,7 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + while line = STDIN.gets puts "<s> #{line.strip} </s>" end @@ -3,28 +3,22 @@ require 'trollop' -def usage - STDERR.write "./avg [-r <d>] < <one number per line>\n" - exit 1 -end -usage if not [0,2].include? ARGV.size - -opts = Trollop::options do +cfg = Trollop::options do + banner "avg < <one number per line>" opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 end - sum = 0.0 i = 0 -while line=STDIN.gets - sum += line.strip.to_f +while line = STDIN.gets + sum += line.to_f i +=1 end avg = sum/i.to_f -if opts[:round] >= 0 - puts avg.round opts[:round] +if cfg[:round] >= 0 + puts avg.round cfg[:round] else puts avg end diff --git a/avg_weights b/avg_weights index 2b72747..71ffdd9 100755 --- a/avg_weights +++ b/avg_weights @@ -1,46 +1,34 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' require 'zlib' -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -opts = Trollop::options do +cfg = Trollop::options do + opt :weights_files, "a number of weights files: name value", :required => true opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false end -def usage - puts "avg_weights_filter [--filter] <filename>+" - exit 1 -end -usage if ARGV.size==0 - - h = {} ARGV.each { |fn| -if File.extname(fn)=='.gz' - f = Zlib::GzipReader.new(File.new(fn, 'rb')) -else - f = File.new fn, 'r' -end -while line = f.gets - k, v = line.split - v = v.to_f - if h.has_key? k - h[k] << v - else - h[k] = [v] + f = ReadFile.new fn + while line = f.gets + k, v = line.split + v = v.to_f + if h.has_key? k + h[k] << v + else + h[k] = [v] + end end -end -f.close + f.close } n = ARGV.size.to_f -h.each_pair { |k,a| - next if opts[:filter] and a.size < n - puts "#{k} #{a.inject(:+)/n}" +h.each_pair { |k,w| + next if cfg[:filter] and w.size < n + puts "#{k} #{w.inject(:+)/n}" } diff --git a/firstisupper b/firstisupper index 4278334..516dd8a 100755 --- a/firstisupper +++ b/firstisupper @@ -1,8 +1,7 @@ #!/usr/bin/env ruby -def downcase?(string) - string[/[[:lower:]]/] -end +require 'nlp_ruby' + while line = STDIN.gets puts line.strip if downcase? line[0] diff --git a/htmlentities b/htmlentities index ecbee3f..f3c2d34 100755 --- a/htmlentities +++ b/htmlentities @@ -2,10 +2,10 @@ require 'htmlentities' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + coder = HTMLEntities.new while line = STDIN.gets @@ -1,11 +1,14 @@ #!/usr/bin/env ruby +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + h = {} -h.default = 0 +h.default = 0 while line = STDIN.gets - line.strip! + line.strip! h[line] += 1 end -h.each_pair {|k,v| puts "#{k} #{v}"} +h.each_pair { |k,v| puts "#{k} #{v}" } @@ -1,141 +1,97 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -# {s:f} {s:f} => f -def dot(x,y) - sum = 0.0 - x.each_pair { |k,v| sum += v * y[k] } - return sum -end - -# {s:f} => f -def mag(x) - return Math.sqrt x.values.inject { |sum,i| sum+i**2 } -end - -# {s:f} {s:f} => f -def cos_sim(x,y) - return dot(x,y)/(mag(x)*mag(y)) -end - -# {s:f} {s:f} => f -def euclidian_dist(x,y) - dims = [x.keys, y.keys].flatten.uniq - sum = 0.0 - dims.each { |i| sum += (x[i] - y[i])**2 } - return Math.sqrt(sum) -end - -# str => {s:{s:f}} -def read(fn) - h = {} - f = File.new fn, 'r' - while line = f.gets - g = eval line - h[g[0]] = g[1] - h[g[0]].default = 0.0 - end - return h +def read_data fn + data = {} + ReadFile.new(fn).readlines_strip.map{ |i| + a = i.split ' ', 2 + data[a.first] = read_feature_string a.last + } + return data end -# {s:{s:f}} i => [{s:f}] -def rand_init(docs, k) - prng = Random.new - return docs.keys.sample k, random:prng +def rand_init data, k + prng = Random.new + return data.keys.sample k, random:prng end -def rand_init2(docs, k) - prng = Random.new +def rand_means_init data, k + prng = Random.new a = [] 0.upto(k-1) do - a << mean(docs.values.sample k, random:prng) + a << mean_sparse_vector(data.values.sample k, random:prng) end return a end -# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]} -def assign(docs, centroids) +def assign centroids, data assignment = {} - docs.each_pair { |name,feature_vector| + data.each_pair { |name,feature_vector| min = 1.0/0 min_index = nil - centroids.each_with_index { |c,j| - dist = euclidian_dist(c, feature_vector) - if dist < min - min = dist - min_index = j + centroids.each_with_index { |c,i| + dist = c.euclidian_dist(feature_vector) + if dist < min + min = dist + min_index = i end } if assignment.has_key? min_index - assignment[min_index] << [name, feature_vector] + assignment[min_index] << name else - assignment[min_index] = [[name, feature_vector]] + assignment[min_index] = [name] end } return assignment end -# [{s:f}] => {s:f} -def mean(a) - res = {} - res.default = 0.0 - a.each { |i| - i.each_pair { |k,v| - res[k] += v - } - } - n = a.size.to_f - res.each_pair { |k,v| - res[k] = v/n - } -end - -# {i:[{s:f}]} => [{s:f}] -def update(assignment) +def update assignment, data new_centroids = [] - assignment.each_pair { |centroid,docs| - new_centroids << mean(docs.map{|i |i[1]}) + assignment.each_pair { |centroid_index,a| + new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] }) } return new_centroids end def main - opts = Trollop::options do + cfg = Trollop::options do opt :k, "k", :type => :int, :required => true opt :input, "input: one feature vector per line", :type => :string, :required => true opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 - opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3 + opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3 opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 end - docs = read opts[:input] - k = opts[:k] + # data is 'ID f1=v1 f2=v2' + data = read_data cfg[:input] + k = cfg[:k] centroids = nil - if opts[:init] == 1 - centroids = rand_init(docs, k) + if cfg[:init] == 1 + centroids = rand_init(data, k) else - centroids = rand_init2(docs, k) + centroids = rand_means_init(data, k) end STDERR.write "\n k #{k}\n" - STDERR.write " input #{opts[:input]}\n" - STDERR.write "iterations #{opts[:max_iterations]}\n" - STDERR.write "max no ch. #{opts[:max_no_change]}\n" - STDERR.write " init #{opts[:init]}\n\n" + STDERR.write " input #{cfg[:input]}\n" + STDERR.write "iterations #{cfg[:max_iterations]}\n" + STDERR.write "max no ch. #{cfg[:max_no_change]}\n" + STDERR.write " init #{cfg[:init]}\n\n" assignment = nil prev_stats = [] stats = [] no_change = 0 max_no_change = 5 - STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n" - 0.upto(opts[:max_iterations]) do |i| + STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n" + 0.upto(cfg[:max_iterations]) do |i| s = "iteration #{i}" STDERR.write "#{s}\n#{'-'*s.size}\n" - assignment = assign(docs, centroids) + assignment = assign centroids, data sizes = [] - assignment.each_pair { |centroid_index,docs| - sizes << docs.size - } + assignment.each_pair { |centroid_index, a| + sizes << a.size + } median = sizes.sort[k/2] max = sizes.max min = sizes.min @@ -148,12 +104,12 @@ def main STDERR.write " min cluster sz=#{min}\n\n" if no_change == max_no_change STDERR.write "\nmax no change hit!\n\n" - assignment.each_pair { |centroid_index,docs| - puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}" + assignment.each_pair { |centroid_index,a| + puts "#{centroid_index} #{a.to_s}" } break end - centroids = update(assignment) + centroids = update assignment, data end end @@ -1,9 +1,11 @@ #!/usr/bin/env ruby + max = -1.0/0 while line = STDIN.gets - v = line.strip.to_f + v = line.to_f max = v if v > max end + puts max diff --git a/merge_files b/merge_files index db9d5da..051ad6d 100755 --- a/merge_files +++ b/merge_files @@ -1,31 +1,31 @@ #!/usr/bin/env ruby -STDOUT.set_encoding 'utf-8' +require 'nlp_ruby' + def usage - STDERR.write "merge_files [file]+\n" + STDERR.write "merge_files <file>+\n" exit 1 end usage if ARGV.size==0 - files = ARGV -dicts = [] +hashes = [] files.each { |i| - dicts.push Hash.new - dicts.last.default = 0 - File.open i, "r:UTF-8" do |f| - while line = f.gets - dicts.last[line.strip] += 1 - end + hashes.push Hash.new + hashes.last.default = 0 + f = ReadFile.new i + while line = f.gets + hashes.last[line.strip] += 1 end + f.close } -dicts.each { |h| +hashes.each { |h| h.each { |k,v| counts = [] - dicts.each { |j| counts.push j[k]; j.delete k } + hashes.each { |j| counts.push j[k]; j.delete k } counts.max.times { puts k } } } @@ -1,9 +1,11 @@ #!/usr/bin/env ruby + min = 1.0/0 while line = STDIN.gets - v = line.strip.to_f - min = v if v < min + v = line.to_f + min = v if v<min end + puts min @@ -1,33 +1,26 @@ #!/usr/bin/ruby +require 'nlp_ruby' require 'trollop' -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -def usage - puts "filter-min-max.rb --min <min> --max <max> --in_f <in f> --in_e <in e> --out_f <out f> --out_e <out e> --out_id <out ids>" -end -usage if ARGV.size!=14 - -opts = Trollop::options do +cfg = Trollop::options do opt :min, "minimum #tokens", :type => :int, :default => 1 - opt :max, "maximum #tokens", :type => :int, :default => 80 - opt :in_f "input 'French' file", :type => string - opt :in_e "input 'English' file", :type => string - opt :out_f "output 'French' file", :type => string - opt :out_e "output 'English' file", :type => string - opt :out_id "output line Nos", :type => string + opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n' + opt :in_f, "input 'French' file", :type => :string, :required => true + opt :in_e, "input 'English' file", :type => :string, :required => true + opt :out_f, "output 'French' file", :type => :string, :required => true + opt :out_e, "output 'English' file", :type => :string, :required => true + opt :out_id, "output line Nos", :type => :string, :required => true end files = {} -files[:f_file] = File.new opts[:in_f], 'r:UTF-8' -files[:e_file] = File.new opts[:in_e], 'r:UTF-8' -files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8' -files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8' -files[:id_out_file] = File.new opts[:out_id], 'w' +files[:f_file] = ReadFile.new cfg[:in_f] +files[:e_file] = ReadFile.new cfg[:in_e] +files[:f_out_file] = WriteFile.new cfg[:out_f] +files[:e_out_file] = WriteFile.new cfg[:out_e] +files[:id_out_file] = WriteFile.new cfg[:out_id] i = 0 while f_line = files[:f_file].gets e_line = files[:e_file].gets @@ -35,13 +28,14 @@ while f_line = files[:f_file].gets e_line.strip! a = f_line.split b = e_line.split - if a.size >= opts[:min] and a.size <= opts[:max] and \ - b.size >= opts[:min] and b.size <= opts[:max] + if a.size >= cfg[:min] and a.size <= cfg[:max] and \ + b.size >= cfg[:min] and b.size <= cfg[:max] files[:f_out_file].write "#{f_line}\n" files[:e_out_file].write "#{e_line}\n" files[:id_out_file].write "#{i}\n" - end + end i+=1 end -files.values.each{|f|f.close} + +files.values.each{ |f| f.close } diff --git a/moses_1best b/moses_1best index 5c6bf9d..1a0805d 100755 --- a/moses_1best +++ b/moses_1best @@ -1,13 +1,13 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +require 'nlp_ruby' + prev_idx = nil while line = STDIN.gets line.strip! - idx = line.split('|||')[0].to_i - if idx != prev_idx + idx = splitpipe(line)[0].to_i + if idx != prev_idx puts line prev_idx = idx end @@ -1,4 +1,8 @@ #!/usr/bin/env ruby -puts STDIN.gets.to_f * ARGV[0].to_f + +factor = ARGV[0].to_f +while line = STDIN.gets + puts line.to_f * factor +end @@ -1,39 +1,19 @@ #!/usr/bin/env ruby -def ngrams_it(s, n, fix=false) - a = s.strip.split - a.each_with_index { |tok, i| - tok.strip! - 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !(fix||(a[i..i+m].size>n)) - } - } -end - -def main(n, fix, sep) - STDIN.set_encoding 'utf-8' - STDOUT.set_encoding 'utf-8' - while line = STDIN.gets - a = [] - ngrams_it(line, n, fix) {|ng| a << ng.join(' ')} - a.reject! {|i| i.strip.size==0 } - puts a.join sep if a.size > 0 - end -end +require 'nlp_ruby' +require 'trollop' -def usage - STDERR.write "./ng [-n <n>] [--fix] [--separator <s>] < <one number per line>\n" - exit 1 +cfg = Trollop::options do + banner "ng < <input>" + opt :n, "n for Ngrams", :type => :int, :default => 4 + opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false + opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" end -if __FILE__ == $0 - require 'trollop' - opts = Trollop::options do - opt :n, "Ngrams", :type => :int, :default => 4 - opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false - opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" - end - usage if not [0,2,4,6].include? ARGV.size - main(opts[:n], opts[:fix], opts[:separator]) +while line = STDIN.gets + a = [] + ngrams(line, cfg[:n], cfg[:fix]) { |ng| a << ng.join(' ') } + a.reject! { |i| i.strip.size==0 } + puts a.join cfg[:separator] if a.size>0 end @@ -1,12 +1,14 @@ #!/usr/bin/env ruby +require 'nlp_ruby' + + files = [] -(0..1).each { |i| files << File.new(ARGV[i], 'r') } -(2..3).each { |i| files << File.new(ARGV[i], 'w') } -files.each { |f| f.set_encoding('utf-8') } +(0..1).each { |i| files << ReadFile.new(ARGV[i]) } +(2..3).each { |i| files << WriteFile.new(ARGV[i]) } while line_f = files[0].gets - line_e = files[1].gets + line_e = files[1].gets line_f.strip!; line_e.strip! next if line_f=='' || line_e=='' files[2].write line_f+"\n" diff --git a/no_non_printables b/no_non_printables index fda1e40..20d1e3d 100755 --- a/no_non_printables +++ b/no_non_printables @@ -1 +1,4 @@ +#!/bin/sh + sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' + diff --git a/norm_german b/norm_german index 57a37bb..ef0408e 100755 --- a/norm_german +++ b/norm_german @@ -3,17 +3,12 @@ require 'thread' require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - STDERR.write "./avg [-r <d>] < <one number per line>\n" - exit 1 -end -usage if not [0,2,4].include? ARGV.size -opts = Trollop::options do +cfg = Trollop::options do + banner "norm_german < <file w/ lowercased tokens>" opt :upper, "uppercase", :type => :bool, :default => false opt :threads, "#threads", :type => :int, :default => 1, :short => '-h' opt :shard_size, "shard size", :type => :int, :default => 1000 @@ -21,10 +16,9 @@ opts = Trollop::options do opt :apply, "apply", :type => :bool end - pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ] pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ] -if opts[:upper] +if cfg[:upper] PAIRS = pairs_lower else PAIRS = pairs_lower+pairs_upper @@ -46,7 +40,7 @@ def build_partial(tokens) if get_key i, tok h[i] << tok found = true - break + break end } h[tok] = [tok] if !found @@ -60,24 +54,24 @@ thread_n = 0 counter = 0 token_stock = [] mutex = Mutex.new -while tok = STDIN.gets # expects stream of (lowercased) tokens +while tok = STDIN.gets token_stock << [] if !token_stock[thread_n] token_stock[thread_n] << tok.strip! counter += 1 - if token_stock[thread_n].size%opts[:shard_size]==0 + if token_stock[thread_n].size%cfg[:shard_size]==0 STDERR.write "Starting thread ##{thread_n}\n" threads << Thread.new(token_stock[thread_n]) { |tokens| th = build_partial tokens mutex.synchronize do - h.merge! th + h.merge! th end } threads.last.abort_on_exception = true thread_n += 1 - else + else next end - if thread_n==opts[:threads] + if thread_n==cfg[:threads] threads.each { |i| i.join } token_stock.each { |i| i.clear } thread_n = 0 @@ -86,7 +80,7 @@ while tok = STDIN.gets # expects stream of (lowercased) tokens end token_stock.each { |i| - if i.size!=0 + if i.size!=0 h.merge! build_partial i end } @@ -1,8 +1,10 @@ #!/usr/bin/env ruby -STDIN.set_encoding('utf-8') +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + while line = STDIN.gets - puts line.split.length + puts line.strip.split.length end @@ -3,6 +3,7 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + i = 1 while line = STDIN.gets puts line if i%2!=0 diff --git a/paste_pairs b/paste_pairs index 6ede8f6..07c1f22 100755 --- a/paste_pairs +++ b/paste_pairs @@ -8,4 +8,4 @@ for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.a print linenr, (src_line.strip()) print linenr, (tgt_line.strip()) print - + diff --git a/per_sentence_bleu b/per_sentence_bleu index c7c0b0e..724b1e1 100755 --- a/per_sentence_bleu +++ b/per_sentence_bleu @@ -1,29 +1,21 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -def ngrams_it(s, n, fix=false) - a = s.strip.split - a.each_with_index { |tok, i| - tok.strip! - 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !(fix||(a[i..i+m].size>n)) - } - } -end - -def brevity_penalty hypothesis, reference - a = hypothesis.split; b = reference.split - return 1.0 if a.size>b.size - return Math.exp(1.0 - ((b.size.to_f+1)/a.size)); +# reference-length hack as in (Nakov et al., 2012) +def brevity_penalty hypothesis, reference, hack=0 + a = tokenize hypothesis; b = tokenize reference + return 1.0 if a.size>=b.size + return Math.exp(1.0 - ((b.size.to_f+hack)/a.size)); end -def per_sentence_bleu hypothesis, reference, n=4 +def per_sentence_bleu hypothesis, reference, n=4, hack=0 h_ng = {}; r_ng = {} (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} - ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i} - ngrams_it(reference, n) {|i| r_ng[i.size] << i} + ngrams(hypothesis, n) {|i| h_ng[i.size] << i} + ngrams(reference, n) {|i| r_ng[i.size] << i} m = [n, reference.split.size].min weight = 1.0/m add = 0.0 @@ -35,31 +27,29 @@ def per_sentence_bleu hypothesis, reference, n=4 add = 1.0 if i >= 2 sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); } - return brevity_penalty(hypothesis, reference) * Math.exp(sum) + return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum) end def main - opts = Trollop::options do + cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' opt :references, "references", :type => :string, :required => true + opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0 + opt :n, "N", :default => 4 end - - refs = File.new(opts[:references], 'r').readlines.map{|i|i.strip} + + refs = ReadFile.new(cfg[:references]).readlines_strip i = -1 - if opts[:input] == '-' - input = STDIN - else - input = File.new opts[:input], 'r' - end + input = ReadFile.new cfg[:input] while line = input.gets i += 1 if line.strip == '' puts 0.0 next end - puts per_sentence_bleu line.strip, refs[i] + puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack] end - input.close if opts[:input]!='-' + input.close end @@ -1,4 +1,4 @@ -#!/bin/zsh +#!/bin/bash LANG=$1 /toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err @@ -1,4 +1,8 @@ #!/usr/bin/env ruby -puts STDIN.gets.to_f.round ARGV[0].to_i + +r = ARGV[0].to_i +while line = STDIN.gets + puts line.to_f.round r +end @@ -1,5 +1,6 @@ #!/usr/bin/env ruby + while line = STDIN.gets puts "#{eval line}" end diff --git a/rule_shapes b/rule_shapes index 039b0dc..fd42249 100755 --- a/rule_shapes +++ b/rule_shapes @@ -3,11 +3,12 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + def shape s res = [] in_t = false s.split.each { |i| - if i.match /\A\[X,\d\]\z/ + if i.match(/\A\[X,\d\]\z/) if in_t in_t = false end @@ -22,7 +23,7 @@ def shape s end while line = STDIN.gets - f,e = line.split "\t" + f, e = line.split(/\t/) f.strip!; e.strip! puts shape(f).join('_')+"-"+shape(e).join('_') end @@ -2,23 +2,16 @@ require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - STDERR.write "./sample --size <n> < <line separated data>\n" - exit 1 -end -usage if ARGV.size!=4 opts = Trollop::options do + banner "sample --size <n> < <line separated data>" opt :size, "Sample n% (percentage).", :type => :int end - prng = Random.new(Random.new_seed) - while line = STDIN.gets STDOUT.write line if prng.rand(1..opts[:size])==0 end @@ -3,20 +3,13 @@ require 'trollop' -def usage - STDERR.write "./sample --size <n> --population <n>\n" - exit 1 -end -usage if ARGV.size!=4 - opts = Trollop::options do + banner "sample --size <n> --population <n>" opt :size, "Sample size (percentage).", :type => :int opt :population, "'Population' (number \in N)", :type => :int end - prng = Random.new(Random.new_seed) - 1.upto(opts[:population]) { |i| puts i if prng.rand(1..opts[:size])==0 } @@ -12,11 +12,11 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false index.shuffle! if rand shard_sz = lc / num_shards leftover = lc % num_shards - in_f = File.new input, 'r' + in_f = ReadFile.new input in_lines = in_f.readlines - refs_f = File.new refs, 'r' + refs_f = ReadFile.new refs refs_lines = refs_f.readlines - a_f = File.new alignments, 'r' + a_f = ReadFile.new alignments a_lines = a_f.readlines shard_in_files = [] shard_refs_files = [] @@ -26,13 +26,13 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false a_fns = [] 0.upto(num_shards-1) { |shard| in_fn = "#{output_prefix}.#{shard}.#{input_ext}" - shard_in = File.new in_fn, 'w+' + shard_in = WriteFile.new in_fn in_fns << in_fn refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}" - shard_refs = File.new refs_fn, 'w+' + shard_refs = WriteFile.new refs_fn refs_fns << refs_fn a_fn = "#{output_prefix}.#{shard}.a" - shard_a = File.new a_fn, 'w+' + shard_a = WriteFile.new a_fn a_fns << a_fn 0.upto(shard_sz-1) { |i| j = index.pop @@ -69,12 +69,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false end opts = Trollop::options do - opt :input, 'input', :type => :string - opt :references, 'references', :type => :string - opt :alignments, 'alignments', :type => :string - opt :output_prefix, 'output prefix', :type => :string + opt :input, 'input', :type => :string, :required => true + opt :references, 'references', :type => :string, :required => true + opt :alignments, 'alignments', :type => :string, :required => true + opt :output_prefix, 'output prefix', :type => :string, :required => true opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z' - opt :num_shards, 'number of shards', :type => :int + opt :num_shards, 'number of shards', :type => :int, :required => true end make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize]) @@ -2,24 +2,19 @@ require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - STDERR.write "splitpipes -f <n> < <input>\n" - exit 1 -end -usage if ARGV.size!=2 -opts = Trollop::options do +cfg = Trollop::options do + banner "splitpipes -f <n> < <input>" opt :field, "field", :type => :int end while line = STDIN.gets j = 1 line.strip.split(' ||| ').each { |i| - if j == opts[:field] + if j == cfg[:field] puts i.strip break end @@ -3,22 +3,16 @@ require 'trollop' -def usage - STDERR.write "./stddev [-r <d>] < <one number per line>\n" - exit 1 -end -usage if not [0,2].include? ARGV.size - -opts = Trollop::options do +cfg = Trollop::options do + banner "stddev [-r <d>] < <one number per line>" opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 end - sum = 0.0 i = 0 cached = [] while line=STDIN.gets - v = line.strip.to_f + v = line.to_f sum += v cached << v i +=1 @@ -33,8 +27,8 @@ cached.each { |v| stddev = Math.sqrt(var) -if opts[:round] >= 0 - puts stddev.round opts[:round] +if cfg[:round] >= 0 + puts stddev.round cfg[:round] else puts stddev end diff --git a/strip_whitespace b/strips index 37c02e5..11c00b4 100755 --- a/strip_whitespace +++ b/strips @@ -1,6 +1,6 @@ #!/usr/bin/env ruby while line = STDIN.gets - puts line.lstrip.strip + puts line.strip end @@ -1,8 +1,10 @@ #!/usr/bin/env ruby + sum = 0.0 while line = STDIN.gets - sum += line.strip.to_f + sum += line.to_f end + puts sum diff --git a/test/kmeans/data b/test/kmeans/data new file mode 100644 index 0000000..b5b3db2 --- /dev/null +++ b/test/kmeans/data @@ -0,0 +1,9 @@ +d00 feature_0=1.0 feature_1=0.5 +d01 feature_0=1.5 feature_1=0.4 +d02 feature_0=1.8 feature_1=0.3 +d10 feature_1=0.5 feature_2=1.0 +d11 feature_1=0.4 feature_2=2.0 +d12 feature_1=0.6 feature_2=1.5 +d20 feature_2=0.2 feature_3=1.0 +d21 feature_2=0.5 feature_3=2.0 +d22 feature_2=0.6 feature_3=3.0 @@ -1,68 +1,41 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -# returns word='raw frequency' for a single document -def tf(d, stopwords=[]) - v = {}; v.default = 0 - d.uniq.each { |i| - next if stopwords.include? i - v[i] = d.count(i).to_f - } - return v -end - -# smoothes raw frequencies -def ntf(w, a=0.4) - max = w.values.max.to_f - w.each_pair { |k,v| - w[k] = a + (1-a)*(v/max) - } -end - -# returns idf value for each word in vocab -def idf(collection) - vocab = collection.values.flatten.uniq - n = collection.size.to_f - idf = {} - vocab.each { |i| - df = collection.values.flatten.count i - idf[i] = Math.log(n/df) - } - return idf -end - def main - opts = Trollop::options do - opt :docs, "input files (documents)", :type => :strings, :required => true - opt :filter_stopwords, "filter stopwords (give file)", :type => :string - opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool + cfg = Trollop::options do + opt :documents, "input files (documents)", :type => :strings, :required => true + opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil + opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false opt :ntf, "length-normalize tf values", :type => :bool opt :idf, "weight tf by idf", :type => :bool end stopwords = [] - if opts[:filter_stopwords] - stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''} + if cfg[:filter_stopwords] + stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i| + i.split('|').first.strip + }.reject{ |i| i=='' } end - docs = {} # fn => [words...] - opts[:docs].each { |i| - if opts[:one_item_per_line] - docs[i] = File.new(i, 'r').readlines.map{|i| i.strip} + docs = {} + cfg[:documents].each { |i| + if cfg[:one_item_per_line] + docs[i] = ReadFile.new(i).readlines_strip else - docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip} + docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip } end } idf_values = idf docs docs.each_pair { |name, words| - just_tf = tf(words) - just_tf = ntf(just_tf) if opts[:ntf] + just_tf = tf words, stopwords + just_tf = ntf(just_tf) if cfg[:ntf] tf_idf = {}; tf_idf.default = 0.0 - if opts[:idf] + if cfg[:idf] just_tf.each_pair { |word,f| tf_idf[word] = idf_values[word] * f } diff --git a/traintestsplit b/traintestsplit index 7ec52ae..7cc5bcf 100755 --- a/traintestsplit +++ b/traintestsplit @@ -1,55 +1,51 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -def main - opts = Trollop::options do - opt :foreign, "foreign file", :type => :string, :required => true - opt :english, "english file", :type => :string, :required => true - opt :size, "one size", :type => :int, :required => true - opt :repeat, "number of repetitions", :type => :int, :default => 1 - opt :prefix, "prefix for output files", :type => :string - end - fn = opts[:foreign] - fn_ext = fn.split('.').last - f = File.new(fn, 'r').readlines - en = opts[:english] - en_ext = en.split('.').last - e = File.new(en, 'r').readlines - size = opts[:size] - nlines_f = `wc -l #{fn}`.split()[0].to_i - nlines_e = `wc -l #{en}`.split()[0].to_i - if nlines_f!=nlines_e - STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n" - exit 1 - end - - prefix = opts[:prefix] - a = (0..nlines_e-1).to_a - i = 0 - opts[:repeat].times { - b = a.sample(size) - ax = a.reject{|j| b.include? j} - `mkdir split_#{i}` - new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+' - new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+' - ax.each { |j| - new_f.write f[j] - new_e.write e[j] - } - new_f.close; new_e.close - new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+' - new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+' - b.each { |j| - new_f.write f[j] - new_e.write e[j] - } - new_f.close; new_e.close - i += 1 - } +cfg = Trollop::options do + opt :foreign, "foreign file", :type => :string, :required => true + opt :english, "english file", :type => :string, :required => true + opt :size, "one size", :type => :int, :required => true + opt :repeat, "number of repetitions", :type => :int, :default => 1 + opt :prefix, "prefix for output files", :type => :string +end +fn = cfg[:foreign] +fn_ext = fn.split('.').last +f = ReadFile.new(fn).readlines +en = cfg[:english] +en_ext = en.split('.').last +e = ReadFile(en).readlines +size = cfg[:size] +nlines_f = `wc -l #{fn}`.split()[0].to_i +nlines_e = `wc -l #{en}`.split()[0].to_i +if nlines_f != nlines_e + STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n" + exit 1 end - -main +prefix = cfg[:prefix] +a = (0..nlines_e-1).to_a +i = 0 +cfg[:repeat].times { + b = a.sample(size) + ax = a.reject{|j| b.include? j} + `mkdir split_#{i}` + new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}") + new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}") + ax.each { |j| + new_f.write f[j] + new_e.write e[j] + } + new_f.close; new_e.close + new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}") + new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}") + b.each { |j| + new_f.write f[j] + new_e.write e[j] + } + new_f.close; new_e.close + i += 1 +} @@ -3,13 +3,8 @@ require 'trollop' -def usage - STDERR.write "./stddev [-r <d>] < <one number per line>\n" - exit 1 -end -usage if not [0,2].include? ARGV.size - -opts = Trollop::options do +cfg = Trollop::options do + banner "stddev [-r <d>] < <one number per line>" opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 end @@ -18,10 +13,10 @@ sum = 0.0 i = 0 cached = [] while line=STDIN.gets - v = line.strip.to_f + v = line.to_f sum += v cached << v - i +=1 + i +=1 end avg = sum/i.to_f @@ -31,8 +26,8 @@ cached.each { |v| var += (avg - v)**2 } -if opts[:round] >= 0 - puts var.round opts[:round] +if cfg[:round] >= 0 + puts var.round cfg[:round] else puts var end diff --git a/wrap-xml.perl b/wrap-xml.perl index d29065a..06303b7 100755 --- a/wrap-xml.perl +++ b/wrap-xml.perl @@ -1,5 +1,6 @@ #!/usr/bin/perl -w # original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl +# (licensed under LGPL) use strict; |