From 68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 29 Jan 2014 19:14:08 +0100 Subject: make use of nlp_ruby, LICENSE --- LICENSE | 7 +++ README | 2 - README.md | 7 +++ add_seg | 18 +++---- add_start_end | 1 + avg | 18 +++---- avg_weights | 44 +++++++---------- firstisupper | 5 +- htmlentities | 2 +- keycount | 9 ++-- kmeans | 138 +++++++++++++++++++----------------------------------- max | 4 +- merge_files | 24 +++++----- min | 6 ++- min_max | 42 +++++++---------- moses_1best | 8 ++-- mult | 6 ++- ng | 44 +++++------------ no_empty | 10 ++-- no_non_printables | 3 ++ norm_german | 26 ++++------ num_tok | 6 ++- odd | 1 + paste_pairs | 2 +- per_sentence_bleu | 46 +++++++----------- preprocess | 2 +- round | 6 ++- ruby_eval | 1 + rule_shapes | 5 +- sample | 9 +--- sample_n | 9 +--- shard | 22 ++++----- splitpipes | 11 ++--- stddev | 16 ++----- strip_whitespace | 6 --- strips | 6 +++ sum | 4 +- test/kmeans/data | 9 ++++ tf-idf | 61 +++++++----------------- traintestsplit | 90 +++++++++++++++++------------------ var | 17 +++---- wrap-xml.perl | 1 + 42 files changed, 316 insertions(+), 438 deletions(-) create mode 100644 LICENSE delete mode 100644 README create mode 100644 README.md delete mode 100755 strip_whitespace create mode 100755 strips create mode 100644 test/kmeans/data diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0d5dab3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright (C) 2014 Patrick Simianer

+ +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README b/README deleted file mode 100644 index 8ce273f..0000000 --- a/README +++ /dev/null @@ -1,2 +0,0 @@ -misc. nlp related scripts - diff --git a/README.md b/README.md new file mode 100644 index 0000000..8b5b4ad --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +scripts +======= + +A number of NLP-related scripts. +Some scripts require my nlp_ruby gem, +see https://github.com/pks/nlp_ruby . 
+ diff --git a/add_seg b/add_seg index e661b40..684a236 100755 --- a/add_seg +++ b/add_seg @@ -2,30 +2,24 @@ require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - puts "addseg [--nogz] [--loo] [--grammar] \n" - exit 1 -end -opts = Trollop::options do - opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true +cfg = Trollop::options do + opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :required => true opt :loo, "leave one out", :type => :bool, :default => false opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i' opt :nogz, "grammar files not gzipped", :type => :bool, :default => false end - -i = opts[:start_id] +i = cfg[:start_id] while line = STDIN.gets ext = '.gz' - ext = '' if opts[:nogz] + ext = '' if cfg[:nogz] s = " #{line.strip} " i+=1 end diff --git a/add_start_end b/add_start_end index a14a65e..30deaec 100755 --- a/add_start_end +++ b/add_start_end @@ -3,6 +3,7 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + while line = STDIN.gets puts " #{line.strip} " end diff --git a/avg b/avg index cc4c0e6..ed31465 100755 --- a/avg +++ b/avg @@ -3,28 +3,22 @@ require 'trollop' -def usage - STDERR.write "./avg [-r ] < \n" - exit 1 -end -usage if not [0,2].include? 
ARGV.size - -opts = Trollop::options do +cfg = Trollop::options do + banner "avg < " opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 end - sum = 0.0 i = 0 -while line=STDIN.gets - sum += line.strip.to_f +while line = STDIN.gets + sum += line.to_f i +=1 end avg = sum/i.to_f -if opts[:round] >= 0 - puts avg.round opts[:round] +if cfg[:round] >= 0 + puts avg.round cfg[:round] else puts avg end diff --git a/avg_weights b/avg_weights index 2b72747..71ffdd9 100755 --- a/avg_weights +++ b/avg_weights @@ -1,46 +1,34 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' require 'zlib' -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -opts = Trollop::options do +cfg = Trollop::options do + opt :weights_files, "a number of weights files: name value", :required => true opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false end -def usage - puts "avg_weights_filter [--filter] +" - exit 1 -end -usage if ARGV.size==0 - - h = {} ARGV.each { |fn| -if File.extname(fn)=='.gz' - f = Zlib::GzipReader.new(File.new(fn, 'rb')) -else - f = File.new fn, 'r' -end -while line = f.gets - k, v = line.split - v = v.to_f - if h.has_key? k - h[k] << v - else - h[k] = [v] + f = ReadFile.new fn + while line = f.gets + k, v = line.split + v = v.to_f + if h.has_key? k + h[k] << v + else + h[k] = [v] + end end -end -f.close + f.close } n = ARGV.size.to_f -h.each_pair { |k,a| - next if opts[:filter] and a.size < n - puts "#{k} #{a.inject(:+)/n}" +h.each_pair { |k,w| + next if cfg[:filter] and w.size < n + puts "#{k} #{w.inject(:+)/n}" } diff --git a/firstisupper b/firstisupper index 4278334..516dd8a 100755 --- a/firstisupper +++ b/firstisupper @@ -1,8 +1,7 @@ #!/usr/bin/env ruby -def downcase?(string) - string[/[[:lower:]]/] -end +require 'nlp_ruby' + while line = STDIN.gets puts line.strip if downcase? 
line[0] diff --git a/htmlentities b/htmlentities index ecbee3f..f3c2d34 100755 --- a/htmlentities +++ b/htmlentities @@ -2,10 +2,10 @@ require 'htmlentities' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + coder = HTMLEntities.new while line = STDIN.gets diff --git a/keycount b/keycount index 15b4095..deaa522 100755 --- a/keycount +++ b/keycount @@ -1,11 +1,14 @@ #!/usr/bin/env ruby +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + h = {} -h.default = 0 +h.default = 0 while line = STDIN.gets - line.strip! + line.strip! h[line] += 1 end -h.each_pair {|k,v| puts "#{k} #{v}"} +h.each_pair { |k,v| puts "#{k} #{v}" } diff --git a/kmeans b/kmeans index 89cc329..5c49d9a 100755 --- a/kmeans +++ b/kmeans @@ -1,141 +1,97 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -# {s:f} {s:f} => f -def dot(x,y) - sum = 0.0 - x.each_pair { |k,v| sum += v * y[k] } - return sum -end - -# {s:f} => f -def mag(x) - return Math.sqrt x.values.inject { |sum,i| sum+i**2 } -end - -# {s:f} {s:f} => f -def cos_sim(x,y) - return dot(x,y)/(mag(x)*mag(y)) -end - -# {s:f} {s:f} => f -def euclidian_dist(x,y) - dims = [x.keys, y.keys].flatten.uniq - sum = 0.0 - dims.each { |i| sum += (x[i] - y[i])**2 } - return Math.sqrt(sum) -end - -# str => {s:{s:f}} -def read(fn) - h = {} - f = File.new fn, 'r' - while line = f.gets - g = eval line - h[g[0]] = g[1] - h[g[0]].default = 0.0 - end - return h +def read_data fn + data = {} + ReadFile.new(fn).readlines_strip.map{ |i| + a = i.split ' ', 2 + data[a.first] = read_feature_string a.last + } + return data end -# {s:{s:f}} i => [{s:f}] -def rand_init(docs, k) - prng = Random.new - return docs.keys.sample k, random:prng +def rand_init data, k + prng = Random.new + return data.keys.sample k, random:prng end -def rand_init2(docs, k) - prng = Random.new +def rand_means_init data, k + prng = Random.new a = [] 0.upto(k-1) do - a << mean(docs.values.sample k, random:prng) + a << mean_sparse_vector(data.values.sample k, random:prng) end 
return a end -# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]} -def assign(docs, centroids) +def assign centroids, data assignment = {} - docs.each_pair { |name,feature_vector| + data.each_pair { |name,feature_vector| min = 1.0/0 min_index = nil - centroids.each_with_index { |c,j| - dist = euclidian_dist(c, feature_vector) - if dist < min - min = dist - min_index = j + centroids.each_with_index { |c,i| + dist = c.euclidian_dist(feature_vector) + if dist < min + min = dist + min_index = i end } if assignment.has_key? min_index - assignment[min_index] << [name, feature_vector] + assignment[min_index] << name else - assignment[min_index] = [[name, feature_vector]] + assignment[min_index] = [name] end } return assignment end -# [{s:f}] => {s:f} -def mean(a) - res = {} - res.default = 0.0 - a.each { |i| - i.each_pair { |k,v| - res[k] += v - } - } - n = a.size.to_f - res.each_pair { |k,v| - res[k] = v/n - } -end - -# {i:[{s:f}]} => [{s:f}] -def update(assignment) +def update assignment, data new_centroids = [] - assignment.each_pair { |centroid,docs| - new_centroids << mean(docs.map{|i |i[1]}) + assignment.each_pair { |centroid_index,a| + new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] }) } return new_centroids end def main - opts = Trollop::options do + cfg = Trollop::options do opt :k, "k", :type => :int, :required => true opt :input, "input: one feature vector per line", :type => :string, :required => true opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 - opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3 + opt :max_no_change, "max. 
No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3 opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 end - docs = read opts[:input] - k = opts[:k] + # data is 'ID f1=v1 f2=v2' + data = read_data cfg[:input] + k = cfg[:k] centroids = nil - if opts[:init] == 1 - centroids = rand_init(docs, k) + if cfg[:init] == 1 + centroids = rand_init(data, k) else - centroids = rand_init2(docs, k) + centroids = rand_means_init(data, k) end STDERR.write "\n k #{k}\n" - STDERR.write " input #{opts[:input]}\n" - STDERR.write "iterations #{opts[:max_iterations]}\n" - STDERR.write "max no ch. #{opts[:max_no_change]}\n" - STDERR.write " init #{opts[:init]}\n\n" + STDERR.write " input #{cfg[:input]}\n" + STDERR.write "iterations #{cfg[:max_iterations]}\n" + STDERR.write "max no ch. #{cfg[:max_no_change]}\n" + STDERR.write " init #{cfg[:init]}\n\n" assignment = nil prev_stats = [] stats = [] no_change = 0 max_no_change = 5 - STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n" - 0.upto(opts[:max_iterations]) do |i| + STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n" + 0.upto(cfg[:max_iterations]) do |i| s = "iteration #{i}" STDERR.write "#{s}\n#{'-'*s.size}\n" - assignment = assign(docs, centroids) + assignment = assign centroids, data sizes = [] - assignment.each_pair { |centroid_index,docs| - sizes << docs.size - } + assignment.each_pair { |centroid_index, a| + sizes << a.size + } median = sizes.sort[k/2] max = sizes.max min = sizes.min @@ -148,12 +104,12 @@ def main STDERR.write " min cluster sz=#{min}\n\n" if no_change == max_no_change STDERR.write "\nmax no change hit!\n\n" - assignment.each_pair { |centroid_index,docs| - puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}" + assignment.each_pair { |centroid_index,a| + puts "#{centroid_index} #{a.to_s}" } break end - centroids = update(assignment) + centroids = update 
assignment, data end end diff --git a/max b/max index 506bd03..87f3c73 100755 --- a/max +++ b/max @@ -1,9 +1,11 @@ #!/usr/bin/env ruby + max = -1.0/0 while line = STDIN.gets - v = line.strip.to_f + v = line.to_f max = v if v > max end + puts max diff --git a/merge_files b/merge_files index db9d5da..051ad6d 100755 --- a/merge_files +++ b/merge_files @@ -1,31 +1,31 @@ #!/usr/bin/env ruby -STDOUT.set_encoding 'utf-8' +require 'nlp_ruby' + def usage - STDERR.write "merge_files [file]+\n" + STDERR.write "merge_files +\n" exit 1 end usage if ARGV.size==0 - files = ARGV -dicts = [] +hashes = [] files.each { |i| - dicts.push Hash.new - dicts.last.default = 0 - File.open i, "r:UTF-8" do |f| - while line = f.gets - dicts.last[line.strip] += 1 - end + hashes.push Hash.new + hashes.last.default = 0 + f = ReadFile.new i + while line = f.gets + hashes.last[line.strip] += 1 end + f.close } -dicts.each { |h| +hashes.each { |h| h.each { |k,v| counts = [] - dicts.each { |j| counts.push j[k]; j.delete k } + hashes.each { |j| counts.push j[k]; j.delete k } counts.max.times { puts k } } } diff --git a/min b/min index c2f85b9..398b0fb 100755 --- a/min +++ b/min @@ -1,9 +1,11 @@ #!/usr/bin/env ruby + min = 1.0/0 while line = STDIN.gets - v = line.strip.to_f - min = v if v < min + v = line.to_f + min = v if v --max --in_f --in_e --out_f --out_e --out_id " -end -usage if ARGV.size!=14 - -opts = Trollop::options do +cfg = Trollop::options do opt :min, "minimum #tokens", :type => :int, :default => 1 - opt :max, "maximum #tokens", :type => :int, :default => 80 - opt :in_f "input 'French' file", :type => string - opt :in_e "input 'English' file", :type => string - opt :out_f "output 'French' file", :type => string - opt :out_e "output 'English' file", :type => string - opt :out_id "output line Nos", :type => string + opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n' + opt :in_f, "input 'French' file", :type => :string, :required => true + opt :in_e, "input 'English' 
file", :type => :string, :required => true + opt :out_f, "output 'French' file", :type => :string, :required => true + opt :out_e, "output 'English' file", :type => :string, :required => true + opt :out_id, "output line Nos", :type => :string, :required => true end files = {} -files[:f_file] = File.new opts[:in_f], 'r:UTF-8' -files[:e_file] = File.new opts[:in_e], 'r:UTF-8' -files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8' -files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8' -files[:id_out_file] = File.new opts[:out_id], 'w' +files[:f_file] = ReadFile.new cfg[:in_f] +files[:e_file] = ReadFile.new cfg[:in_e] +files[:f_out_file] = WriteFile.new cfg[:out_f] +files[:e_out_file] = WriteFile.new cfg[:out_e] +files[:id_out_file] = WriteFile.new cfg[:out_id] i = 0 while f_line = files[:f_file].gets e_line = files[:e_file].gets @@ -35,13 +28,14 @@ while f_line = files[:f_file].gets e_line.strip! a = f_line.split b = e_line.split - if a.size >= opts[:min] and a.size <= opts[:max] and \ - b.size >= opts[:min] and b.size <= opts[:max] + if a.size >= cfg[:min] and a.size <= cfg[:max] and \ + b.size >= cfg[:min] and b.size <= cfg[:max] files[:f_out_file].write "#{f_line}\n" files[:e_out_file].write "#{e_line}\n" files[:id_out_file].write "#{i}\n" - end + end i+=1 end -files.values.each{|f|f.close} + +files.values.each{ |f| f.close } diff --git a/moses_1best b/moses_1best index 5c6bf9d..1a0805d 100755 --- a/moses_1best +++ b/moses_1best @@ -1,13 +1,13 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +require 'nlp_ruby' + prev_idx = nil while line = STDIN.gets line.strip! 
- idx = line.split('|||')[0].to_i - if idx != prev_idx + idx = splitpipe(line)[0].to_i + if idx != prev_idx puts line prev_idx = idx end diff --git a/mult b/mult index eaead89..2ef0149 100755 --- a/mult +++ b/mult @@ -1,4 +1,8 @@ #!/usr/bin/env ruby -puts STDIN.gets.to_f * ARGV[0].to_f + +factor = ARGV[0].to_f +while line = STDIN.gets + puts line.to_f * factor +end diff --git a/ng b/ng index de314b8..dbc59eb 100755 --- a/ng +++ b/ng @@ -1,39 +1,19 @@ #!/usr/bin/env ruby -def ngrams_it(s, n, fix=false) - a = s.strip.split - a.each_with_index { |tok, i| - tok.strip! - 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !(fix||(a[i..i+m].size>n)) - } - } -end - -def main(n, fix, sep) - STDIN.set_encoding 'utf-8' - STDOUT.set_encoding 'utf-8' - while line = STDIN.gets - a = [] - ngrams_it(line, n, fix) {|ng| a << ng.join(' ')} - a.reject! {|i| i.strip.size==0 } - puts a.join sep if a.size > 0 - end -end +require 'nlp_ruby' +require 'trollop' -def usage - STDERR.write "./ng [-n ] [--fix] [--separator ] < \n" - exit 1 +cfg = Trollop::options do + banner "ng < " + opt :n, "n for Ngrams", :type => :int, :default => 4 + opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false + opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" end -if __FILE__ == $0 - require 'trollop' - opts = Trollop::options do - opt :n, "Ngrams", :type => :int, :default => 4 - opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false - opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" - end - usage if not [0,2,4,6].include? ARGV.size - main(opts[:n], opts[:fix], opts[:separator]) +while line = STDIN.gets + a = [] + ngrams(line, cfg[:n], cfg[:fix]) { |ng| a << ng.join(' ') } + a.reject! 
{ |i| i.strip.size==0 } + puts a.join cfg[:separator] if a.size>0 end diff --git a/no_empty b/no_empty index ecdbcdf..cd825c0 100755 --- a/no_empty +++ b/no_empty @@ -1,12 +1,14 @@ #!/usr/bin/env ruby +require 'nlp_ruby' + + files = [] -(0..1).each { |i| files << File.new(ARGV[i], 'r') } -(2..3).each { |i| files << File.new(ARGV[i], 'w') } -files.each { |f| f.set_encoding('utf-8') } +(0..1).each { |i| files << ReadFile.new(ARGV[i]) } +(2..3).each { |i| files << WriteFile.new(ARGV[i]) } while line_f = files[0].gets - line_e = files[1].gets + line_e = files[1].gets line_f.strip!; line_e.strip! next if line_f=='' || line_e=='' files[2].write line_f+"\n" diff --git a/no_non_printables b/no_non_printables index fda1e40..20d1e3d 100755 --- a/no_non_printables +++ b/no_non_printables @@ -1 +1,4 @@ +#!/bin/sh + sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' + diff --git a/norm_german b/norm_german index 57a37bb..ef0408e 100755 --- a/norm_german +++ b/norm_german @@ -3,17 +3,12 @@ require 'thread' require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - STDERR.write "./avg [-r ] < \n" - exit 1 -end -usage if not [0,2,4].include? 
ARGV.size -opts = Trollop::options do +cfg = Trollop::options do + banner "norm_german < " opt :upper, "uppercase", :type => :bool, :default => false opt :threads, "#threads", :type => :int, :default => 1, :short => '-h' opt :shard_size, "shard size", :type => :int, :default => 1000 @@ -21,10 +16,9 @@ opts = Trollop::options do opt :apply, "apply", :type => :bool end - pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ] pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ] -if opts[:upper] +if cfg[:upper] PAIRS = pairs_lower else PAIRS = pairs_lower+pairs_upper @@ -46,7 +40,7 @@ def build_partial(tokens) if get_key i, tok h[i] << tok found = true - break + break end } h[tok] = [tok] if !found @@ -60,24 +54,24 @@ thread_n = 0 counter = 0 token_stock = [] mutex = Mutex.new -while tok = STDIN.gets # expects stream of (lowercased) tokens +while tok = STDIN.gets token_stock << [] if !token_stock[thread_n] token_stock[thread_n] << tok.strip! counter += 1 - if token_stock[thread_n].size%opts[:shard_size]==0 + if token_stock[thread_n].size%cfg[:shard_size]==0 STDERR.write "Starting thread ##{thread_n}\n" threads << Thread.new(token_stock[thread_n]) { |tokens| th = build_partial tokens mutex.synchronize do - h.merge! th + h.merge! th end } threads.last.abort_on_exception = true thread_n += 1 - else + else next end - if thread_n==opts[:threads] + if thread_n==cfg[:threads] threads.each { |i| i.join } token_stock.each { |i| i.clear } thread_n = 0 @@ -86,7 +80,7 @@ while tok = STDIN.gets # expects stream of (lowercased) tokens end token_stock.each { |i| - if i.size!=0 + if i.size!=0 h.merge! 
build_partial i end } diff --git a/num_tok b/num_tok index a11b0d7..53b99a0 100755 --- a/num_tok +++ b/num_tok @@ -1,8 +1,10 @@ #!/usr/bin/env ruby -STDIN.set_encoding('utf-8') +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + while line = STDIN.gets - puts line.split.length + puts line.strip.split.length end diff --git a/odd b/odd index 0bd9336..93aaa80 100755 --- a/odd +++ b/odd @@ -3,6 +3,7 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + i = 1 while line = STDIN.gets puts line if i%2!=0 diff --git a/paste_pairs b/paste_pairs index 6ede8f6..07c1f22 100755 --- a/paste_pairs +++ b/paste_pairs @@ -8,4 +8,4 @@ for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.a print linenr, (src_line.strip()) print linenr, (tgt_line.strip()) print - + diff --git a/per_sentence_bleu b/per_sentence_bleu index c7c0b0e..724b1e1 100755 --- a/per_sentence_bleu +++ b/per_sentence_bleu @@ -1,29 +1,21 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -def ngrams_it(s, n, fix=false) - a = s.strip.split - a.each_with_index { |tok, i| - tok.strip! 
- 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !(fix||(a[i..i+m].size>n)) - } - } -end - -def brevity_penalty hypothesis, reference - a = hypothesis.split; b = reference.split - return 1.0 if a.size>b.size - return Math.exp(1.0 - ((b.size.to_f+1)/a.size)); +# reference-length hack as in (Nakov et al., 2012) +def brevity_penalty hypothesis, reference, hack=0 + a = tokenize hypothesis; b = tokenize reference + return 1.0 if a.size>=b.size + return Math.exp(1.0 - ((b.size.to_f+hack)/a.size)); end -def per_sentence_bleu hypothesis, reference, n=4 +def per_sentence_bleu hypothesis, reference, n=4, hack=0 h_ng = {}; r_ng = {} (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} - ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i} - ngrams_it(reference, n) {|i| r_ng[i.size] << i} + ngrams(hypothesis, n) {|i| h_ng[i.size] << i} + ngrams(reference, n) {|i| r_ng[i.size] << i} m = [n, reference.split.size].min weight = 1.0/m add = 0.0 @@ -35,31 +27,29 @@ def per_sentence_bleu hypothesis, reference, n=4 add = 1.0 if i >= 2 sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); } - return brevity_penalty(hypothesis, reference) * Math.exp(sum) + return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum) end def main - opts = Trollop::options do + cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' opt :references, "references", :type => :string, :required => true + opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0 + opt :n, "N", :default => 4 end - - refs = File.new(opts[:references], 'r').readlines.map{|i|i.strip} + + refs = ReadFile.new(cfg[:references]).readlines_strip i = -1 - if opts[:input] == '-' - input = STDIN - else - input = File.new opts[:input], 'r' - end + input = ReadFile.new cfg[:input] while line = input.gets i += 1 if line.strip == '' puts 0.0 next end - puts per_sentence_bleu line.strip, refs[i] + puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack] end - input.close if 
opts[:input]!='-' + input.close end diff --git a/preprocess b/preprocess index bc6b5d2..4bf782a 100755 --- a/preprocess +++ b/preprocess @@ -1,4 +1,4 @@ -#!/bin/zsh +#!/bin/bash LANG=$1 /toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err diff --git a/round b/round index 52cd013..3dfbb6f 100755 --- a/round +++ b/round @@ -1,4 +1,8 @@ #!/usr/bin/env ruby -puts STDIN.gets.to_f.round ARGV[0].to_i + +r = ARGV[0].to_i +while line = STDIN.gets + puts line.to_f.round r +end diff --git a/ruby_eval b/ruby_eval index fe0d181..96b2ecb 100755 --- a/ruby_eval +++ b/ruby_eval @@ -1,5 +1,6 @@ #!/usr/bin/env ruby + while line = STDIN.gets puts "#{eval line}" end diff --git a/rule_shapes b/rule_shapes index 039b0dc..fd42249 100755 --- a/rule_shapes +++ b/rule_shapes @@ -3,11 +3,12 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' + def shape s res = [] in_t = false s.split.each { |i| - if i.match /\A\[X,\d\]\z/ + if i.match(/\A\[X,\d\]\z/) if in_t in_t = false end @@ -22,7 +23,7 @@ def shape s end while line = STDIN.gets - f,e = line.split "\t" + f, e = line.split(/\t/) f.strip!; e.strip! 
puts shape(f).join('_')+"-"+shape(e).join('_') end diff --git a/sample b/sample index b4706c6..e693d5c 100755 --- a/sample +++ b/sample @@ -2,23 +2,16 @@ require 'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - STDERR.write "./sample --size < \n" - exit 1 -end -usage if ARGV.size!=4 opts = Trollop::options do + banner "sample --size < " opt :size, "Sample n% (percentage).", :type => :int end - prng = Random.new(Random.new_seed) - while line = STDIN.gets STDOUT.write line if prng.rand(1..opts[:size])==0 end diff --git a/sample_n b/sample_n index 2115407..286646b 100755 --- a/sample_n +++ b/sample_n @@ -3,20 +3,13 @@ require 'trollop' -def usage - STDERR.write "./sample --size --population \n" - exit 1 -end -usage if ARGV.size!=4 - opts = Trollop::options do + banner "sample --size --population " opt :size, "Sample size (percentage).", :type => :int opt :population, "'Population' (number \in N)", :type => :int end - prng = Random.new(Random.new_seed) - 1.upto(opts[:population]) { |i| puts i if prng.rand(1..opts[:size])==0 } diff --git a/shard b/shard index 7729699..f952104 100755 --- a/shard +++ b/shard @@ -12,11 +12,11 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false index.shuffle! 
if rand shard_sz = lc / num_shards leftover = lc % num_shards - in_f = File.new input, 'r' + in_f = ReadFile.new input in_lines = in_f.readlines - refs_f = File.new refs, 'r' + refs_f = ReadFile.new refs refs_lines = refs_f.readlines - a_f = File.new alignments, 'r' + a_f = ReadFile.new alignments a_lines = a_f.readlines shard_in_files = [] shard_refs_files = [] @@ -26,13 +26,13 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false a_fns = [] 0.upto(num_shards-1) { |shard| in_fn = "#{output_prefix}.#{shard}.#{input_ext}" - shard_in = File.new in_fn, 'w+' + shard_in = WriteFile.new in_fn in_fns << in_fn refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}" - shard_refs = File.new refs_fn, 'w+' + shard_refs = WriteFile.new refs_fn refs_fns << refs_fn a_fn = "#{output_prefix}.#{shard}.a" - shard_a = File.new a_fn, 'w+' + shard_a = WriteFile.new a_fn a_fns << a_fn 0.upto(shard_sz-1) { |i| j = index.pop @@ -69,12 +69,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false end opts = Trollop::options do - opt :input, 'input', :type => :string - opt :references, 'references', :type => :string - opt :alignments, 'alignments', :type => :string - opt :output_prefix, 'output prefix', :type => :string + opt :input, 'input', :type => :string, :required => true + opt :references, 'references', :type => :string, :required => true + opt :alignments, 'alignments', :type => :string, :required => true + opt :output_prefix, 'output prefix', :type => :string, :required => true opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z' - opt :num_shards, 'number of shards', :type => :int + opt :num_shards, 'number of shards', :type => :int, :required => true end make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize]) diff --git a/splitpipes b/splitpipes index b0c3c9c..35ee176 100755 --- a/splitpipes +++ b/splitpipes @@ -2,24 +2,19 @@ require 
'trollop' - STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' -def usage - STDERR.write "splitpipes -f < \n" - exit 1 -end -usage if ARGV.size!=2 -opts = Trollop::options do +cfg = Trollop::options do + banner "splitpipes -f < " opt :field, "field", :type => :int end while line = STDIN.gets j = 1 line.strip.split(' ||| ').each { |i| - if j == opts[:field] + if j == cfg[:field] puts i.strip break end diff --git a/stddev b/stddev index 3bf0270..891c4c9 100755 --- a/stddev +++ b/stddev @@ -3,22 +3,16 @@ require 'trollop' -def usage - STDERR.write "./stddev [-r ] < \n" - exit 1 -end -usage if not [0,2].include? ARGV.size - -opts = Trollop::options do +cfg = Trollop::options do + banner "stddev [-r ] < " opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 end - sum = 0.0 i = 0 cached = [] while line=STDIN.gets - v = line.strip.to_f + v = line.to_f sum += v cached << v i +=1 @@ -33,8 +27,8 @@ cached.each { |v| stddev = Math.sqrt(var) -if opts[:round] >= 0 - puts stddev.round opts[:round] +if cfg[:round] >= 0 + puts stddev.round cfg[:round] else puts stddev end diff --git a/strip_whitespace b/strip_whitespace deleted file mode 100755 index 37c02e5..0000000 --- a/strip_whitespace +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env ruby - -while line = STDIN.gets - puts line.lstrip.strip -end - diff --git a/strips b/strips new file mode 100755 index 0000000..11c00b4 --- /dev/null +++ b/strips @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby + +while line = STDIN.gets + puts line.strip +end + diff --git a/sum b/sum index 3fca95e..dac72d3 100755 --- a/sum +++ b/sum @@ -1,8 +1,10 @@ #!/usr/bin/env ruby + sum = 0.0 while line = STDIN.gets - sum += line.strip.to_f + sum += line.to_f end + puts sum diff --git a/test/kmeans/data b/test/kmeans/data new file mode 100644 index 0000000..b5b3db2 --- /dev/null +++ b/test/kmeans/data @@ -0,0 +1,9 @@ +d00 feature_0=1.0 feature_1=0.5 +d01 feature_0=1.5 feature_1=0.4 +d02 feature_0=1.8 feature_1=0.3 +d10 feature_1=0.5 
feature_2=1.0 +d11 feature_1=0.4 feature_2=2.0 +d12 feature_1=0.6 feature_2=1.5 +d20 feature_2=0.2 feature_3=1.0 +d21 feature_2=0.5 feature_3=2.0 +d22 feature_2=0.6 feature_3=3.0 diff --git a/tf-idf b/tf-idf index 3edaaf8..e1502b3 100755 --- a/tf-idf +++ b/tf-idf @@ -1,68 +1,41 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -# returns word='raw frequency' for a single document -def tf(d, stopwords=[]) - v = {}; v.default = 0 - d.uniq.each { |i| - next if stopwords.include? i - v[i] = d.count(i).to_f - } - return v -end - -# smoothes raw frequencies -def ntf(w, a=0.4) - max = w.values.max.to_f - w.each_pair { |k,v| - w[k] = a + (1-a)*(v/max) - } -end - -# returns idf value for each word in vocab -def idf(collection) - vocab = collection.values.flatten.uniq - n = collection.size.to_f - idf = {} - vocab.each { |i| - df = collection.values.flatten.count i - idf[i] = Math.log(n/df) - } - return idf -end - def main - opts = Trollop::options do - opt :docs, "input files (documents)", :type => :strings, :required => true - opt :filter_stopwords, "filter stopwords (give file)", :type => :string - opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool + cfg = Trollop::options do + opt :documents, "input files (documents)", :type => :strings, :required => true + opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil + opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false opt :ntf, "length-normalize tf values", :type => :bool opt :idf, "weight tf by idf", :type => :bool end stopwords = [] - if opts[:filter_stopwords] - stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''} + if cfg[:filter_stopwords] + stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i| + i.split('|').first.strip + }.reject{ |i| i=='' } end - docs = {} # fn => [words...] 
- opts[:docs].each { |i| - if opts[:one_item_per_line] - docs[i] = File.new(i, 'r').readlines.map{|i| i.strip} + docs = {} + cfg[:documents].each { |i| + if cfg[:one_item_per_line] + docs[i] = ReadFile.new(i).readlines_strip else - docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip} + docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip } end } idf_values = idf docs docs.each_pair { |name, words| - just_tf = tf(words) - just_tf = ntf(just_tf) if opts[:ntf] + just_tf = tf words, stopwords + just_tf = ntf(just_tf) if cfg[:ntf] tf_idf = {}; tf_idf.default = 0.0 - if opts[:idf] + if cfg[:idf] just_tf.each_pair { |word,f| tf_idf[word] = idf_values[word] * f } diff --git a/traintestsplit b/traintestsplit index 7ec52ae..7cc5bcf 100755 --- a/traintestsplit +++ b/traintestsplit @@ -1,55 +1,51 @@ #!/usr/bin/env ruby +require 'nlp_ruby' require 'trollop' -def main - opts = Trollop::options do - opt :foreign, "foreign file", :type => :string, :required => true - opt :english, "english file", :type => :string, :required => true - opt :size, "one size", :type => :int, :required => true - opt :repeat, "number of repetitions", :type => :int, :default => 1 - opt :prefix, "prefix for output files", :type => :string - end - fn = opts[:foreign] - fn_ext = fn.split('.').last - f = File.new(fn, 'r').readlines - en = opts[:english] - en_ext = en.split('.').last - e = File.new(en, 'r').readlines - size = opts[:size] - nlines_f = `wc -l #{fn}`.split()[0].to_i - nlines_e = `wc -l #{en}`.split()[0].to_i - if nlines_f!=nlines_e - STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n" - exit 1 - end - - prefix = opts[:prefix] - a = (0..nlines_e-1).to_a - i = 0 - opts[:repeat].times { - b = a.sample(size) - ax = a.reject{|j| b.include? 
j} - `mkdir split_#{i}` - new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+' - new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+' - ax.each { |j| - new_f.write f[j] - new_e.write e[j] - } - new_f.close; new_e.close - new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+' - new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+' - b.each { |j| - new_f.write f[j] - new_e.write e[j] - } - new_f.close; new_e.close - i += 1 - } +cfg = Trollop::options do + opt :foreign, "foreign file", :type => :string, :required => true + opt :english, "english file", :type => :string, :required => true + opt :size, "one size", :type => :int, :required => true + opt :repeat, "number of repetitions", :type => :int, :default => 1 + opt :prefix, "prefix for output files", :type => :string +end +fn = cfg[:foreign] +fn_ext = fn.split('.').last +f = ReadFile.new(fn).readlines +en = cfg[:english] +en_ext = en.split('.').last +e = ReadFile(en).readlines +size = cfg[:size] +nlines_f = `wc -l #{fn}`.split()[0].to_i +nlines_e = `wc -l #{en}`.split()[0].to_i +if nlines_f != nlines_e + STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n" + exit 1 end - -main +prefix = cfg[:prefix] +a = (0..nlines_e-1).to_a +i = 0 +cfg[:repeat].times { + b = a.sample(size) + ax = a.reject{|j| b.include? 
j} + `mkdir split_#{i}` + new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}") + new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}") + ax.each { |j| + new_f.write f[j] + new_e.write e[j] + } + new_f.close; new_e.close + new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}") + new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}") + b.each { |j| + new_f.write f[j] + new_e.write e[j] + } + new_f.close; new_e.close + i += 1 +} diff --git a/var b/var index 08b75b6..30c638a 100755 --- a/var +++ b/var @@ -3,13 +3,8 @@ require 'trollop' -def usage - STDERR.write "./stddev [-r ] < \n" - exit 1 -end -usage if not [0,2].include? ARGV.size - -opts = Trollop::options do +cfg = Trollop::options do + banner "stddev [-r ] < " opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 end @@ -18,10 +13,10 @@ sum = 0.0 i = 0 cached = [] while line=STDIN.gets - v = line.strip.to_f + v = line.to_f sum += v cached << v - i +=1 + i +=1 end avg = sum/i.to_f @@ -31,8 +26,8 @@ cached.each { |v| var += (avg - v)**2 } -if opts[:round] >= 0 - puts var.round opts[:round] +if cfg[:round] >= 0 + puts var.round cfg[:round] else puts var end diff --git a/wrap-xml.perl b/wrap-xml.perl index d29065a..06303b7 100755 --- a/wrap-xml.perl +++ b/wrap-xml.perl @@ -1,5 +1,6 @@ #!/usr/bin/perl -w # original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl +# (licensed under LGPL) use strict; -- cgit v1.2.3