From db6a6ecfa350cae29739c59df1210d8f76a479c9 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 5 Dec 2013 07:56:38 +0100 Subject: init --- README | 2 + add_seg | 32 ++++++++++ add_start_end | 9 +++ avg | 31 ++++++++++ avg_weights | 46 ++++++++++++++ de-sgm | 4 ++ even | 11 ++++ firstisupper | 10 ++++ htmlentities | 14 +++++ keycount | 11 ++++ kmeans | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++ max | 9 +++ merge_files | 32 ++++++++++ min | 9 +++ min_max | 47 +++++++++++++++ moses_1best | 15 +++++ mult | 4 ++ ng | 39 ++++++++++++ nn | 4 ++ no_empty | 17 ++++++ no_non_printables | 1 + norm_german | 93 +++++++++++++++++++++++++++++ normalize_punctuation | 46 ++++++++++++++ num_tok | 8 +++ odd | 11 ++++ paste_pairs | 11 ++++ preprocess | 5 ++ preprocess_nolow | 5 ++ round | 4 ++ ruby_eval | 6 ++ rule_shapes | 29 +++++++++ sample | 25 ++++++++ sample_n | 23 +++++++ shard | 81 +++++++++++++++++++++++++ splitpipes | 29 +++++++++ stddev | 41 +++++++++++++ strip_whitespace | 6 ++ sum | 8 +++ tf-idf | 80 +++++++++++++++++++++++++ toks | 10 ++++ var | 39 ++++++++++++ vocab | 8 +++ wrap-xml.perl | 40 +++++++++++++ 43 files changed, 1117 insertions(+) create mode 100644 README create mode 100755 add_seg create mode 100755 add_start_end create mode 100755 avg create mode 100755 avg_weights create mode 100755 de-sgm create mode 100755 even create mode 100755 firstisupper create mode 100755 htmlentities create mode 100755 keycount create mode 100755 kmeans create mode 100755 max create mode 100755 merge_files create mode 100755 min create mode 100755 min_max create mode 100755 moses_1best create mode 100755 mult create mode 100755 ng create mode 100755 nn create mode 100755 no_empty create mode 100755 no_non_printables create mode 100755 norm_german create mode 100755 normalize_punctuation create mode 100755 num_tok create mode 100755 odd create mode 100755 paste_pairs create mode 100755 preprocess create mode 100755 preprocess_nolow create mode 100755 round create mode 100755 ruby_eval create mode 100755 rule_shapes create mode 100755 sample create mode 100755 sample_n create mode 100755 shard create mode 100755 splitpipes create mode 100755 stddev create mode 100755 strip_whitespace create mode 100755 sum create mode 100755 tf-idf create mode 100755 toks create mode 100755 var create mode 100755 vocab create mode 100755 wrap-xml.perl diff --git a/README b/README new file mode 100644 index 0000000..8ce273f --- /dev/null +++ b/README @@ -0,0 +1,2 @@ +misc. nlp related scripts + diff --git a/add_seg b/add_seg new file mode 100755 index 0000000..e661b40 --- /dev/null +++ b/add_seg @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + puts "addseg [--nogz] [--loo] [--grammar] \n" + exit 1 +end + +opts = Trollop::options do + opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true + opt :loo, "leave one out", :type => :bool, :default => false + opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i' + opt :nogz, "grammar files not gzipped", :type => :bool, :default => false +end + + +i = opts[:start_id] +while line = STDIN.gets + ext = '.gz' + ext = '' if opts[:nogz] + s = " #{line.strip} " + i+=1 +end + diff --git a/add_start_end b/add_start_end new file mode 100755 index 0000000..a14a65e --- /dev/null +++ b/add_start_end @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +while line = STDIN.gets + puts " #{line.strip} " +end + diff --git a/avg b/avg new file mode 100755 index 0000000..cc4c0e6 --- /dev/null +++ b/avg @@ -0,0 +1,31 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./avg [-r ] < \n" + exit 1 +end +usage if not [0,2].include? ARGV.size + +opts = Trollop::options do + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + + +sum = 0.0 +i = 0 +while line=STDIN.gets + sum += line.strip.to_f + i +=1 +end + +avg = sum/i.to_f + +if opts[:round] >= 0 + puts avg.round opts[:round] +else + puts avg +end + diff --git a/avg_weights b/avg_weights new file mode 100755 index 0000000..2b72747 --- /dev/null +++ b/avg_weights @@ -0,0 +1,46 @@ +#!/usr/bin/env ruby + +require 'trollop' +require 'zlib' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +opts = Trollop::options do + opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false +end + +def usage + puts "avg_weights_filter [--filter] +" + exit 1 +end +usage if ARGV.size==0 + + +h = {} +ARGV.each { |fn| +if File.extname(fn)=='.gz' + f = Zlib::GzipReader.new(File.new(fn, 'rb')) +else + f = File.new fn, 'r' +end +while line = f.gets + k, v = line.split + v = v.to_f + if h.has_key? k + h[k] << v + else + h[k] = [v] + end +end +f.close +} + +n = ARGV.size.to_f + +h.each_pair { |k,a| + next if opts[:filter] and a.size < n + puts "#{k} #{a.inject(:+)/n}" +} + diff --git a/de-sgm b/de-sgm new file mode 100755 index 0000000..fa28301 --- /dev/null +++ b/de-sgm @@ -0,0 +1,4 @@ +#!/bin/sh + +grep -v -P "^[[:space:]]*(<\?xml.*\?>|]*>)[[:space:]]*$" | grep -v -P "^[[:space:]]*<(url|description|keywords|talkid|title)>.*[[:space:]]*$" | sed "s|]*>\s*||" | sed "s|\s*$||" + diff --git a/even b/even new file mode 100755 index 0000000..dcee3d9 --- /dev/null +++ b/even @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +i = 1 +while line = STDIN.gets + puts line if i%2==0 + i+=1 +end + diff --git a/firstisupper b/firstisupper new file mode 100755 index 0000000..4278334 --- /dev/null +++ b/firstisupper @@ -0,0 +1,10 @@ +#!/usr/bin/env ruby + +def downcase?(string) + string[/[[:lower:]]/] +end + +while line = STDIN.gets + puts line.strip if downcase? line[0] +end + diff --git a/htmlentities b/htmlentities new file mode 100755 index 0000000..ecbee3f --- /dev/null +++ b/htmlentities @@ -0,0 +1,14 @@ +#!/usr/bin/ruby + +require 'htmlentities' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +coder = HTMLEntities.new + +while line = STDIN.gets + puts coder.decode(line.strip) +end + diff --git a/keycount b/keycount new file mode 100755 index 0000000..15b4095 --- /dev/null +++ b/keycount @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +h = {} +h.default = 0 +while line = STDIN.gets + line.strip! + h[line] += 1 +end + +h.each_pair {|k,v| puts "#{k} #{v}"} + diff --git a/kmeans b/kmeans new file mode 100755 index 0000000..89cc329 --- /dev/null +++ b/kmeans @@ -0,0 +1,162 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +# {s:f} {s:f} => f +def dot(x,y) + sum = 0.0 + x.each_pair { |k,v| sum += v * y[k] } + return sum +end + +# {s:f} => f +def mag(x) + return Math.sqrt x.values.inject { |sum,i| sum+i**2 } +end + +# {s:f} {s:f} => f +def cos_sim(x,y) + return dot(x,y)/(mag(x)*mag(y)) +end + +# {s:f} {s:f} => f +def euclidian_dist(x,y) + dims = [x.keys, y.keys].flatten.uniq + sum = 0.0 + dims.each { |i| sum += (x[i] - y[i])**2 } + return Math.sqrt(sum) +end + +# str => {s:{s:f}} +def read(fn) + h = {} + f = File.new fn, 'r' + while line = f.gets + g = eval line + h[g[0]] = g[1] + h[g[0]].default = 0.0 + end + return h +end + +# {s:{s:f}} i => [{s:f}] +def rand_init(docs, k) + prng = Random.new + return docs.keys.sample k, random:prng +end + +def rand_init2(docs, k) + prng = Random.new + a = [] + 0.upto(k-1) do + a << mean(docs.values.sample k, random:prng) + end + return a +end + +# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]} +def assign(docs, centroids) + assignment = {} + docs.each_pair { |name,feature_vector| + min = 1.0/0 + min_index = nil + centroids.each_with_index { |c,j| + dist = euclidian_dist(c, feature_vector) + if dist < min + min = dist + min_index = j + end + } + if assignment.has_key? min_index + assignment[min_index] << [name, feature_vector] + else + assignment[min_index] = [[name, feature_vector]] + end + } + return assignment +end + +# [{s:f}] => {s:f} +def mean(a) + res = {} + res.default = 0.0 + a.each { |i| + i.each_pair { |k,v| + res[k] += v + } + } + n = a.size.to_f + res.each_pair { |k,v| + res[k] = v/n + } +end + +# {i:[{s:f}]} => [{s:f}] +def update(assignment) + new_centroids = [] + assignment.each_pair { |centroid,docs| + new_centroids << mean(docs.map{|i |i[1]}) + } + return new_centroids +end + +def main + opts = Trollop::options do + opt :k, "k", :type => :int, :required => true + opt :input, "input: one feature vector per line", :type => :string, :required => true + opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 + opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3 + opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 + end + docs = read opts[:input] + k = opts[:k] + centroids = nil + if opts[:init] == 1 + centroids = rand_init(docs, k) + else + centroids = rand_init2(docs, k) + end + STDERR.write "\n k #{k}\n" + STDERR.write " input #{opts[:input]}\n" + STDERR.write "iterations #{opts[:max_iterations]}\n" + STDERR.write "max no ch. #{opts[:max_no_change]}\n" + STDERR.write " init #{opts[:init]}\n\n" + assignment = nil + prev_stats = [] + stats = [] + no_change = 0 + max_no_change = 5 + STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n" + 0.upto(opts[:max_iterations]) do |i| + s = "iteration #{i}" + STDERR.write "#{s}\n#{'-'*s.size}\n" + assignment = assign(docs, centroids) + sizes = [] + assignment.each_pair { |centroid_index,docs| + sizes << docs.size + } + median = sizes.sort[k/2] + max = sizes.max + min = sizes.min + stats = [median, max, min] + no_change += 1 if stats==prev_stats + prev_stats = stats + STDERR.write sizes.to_s + "\n" + STDERR.write " median cluster sz=#{median}\n" + STDERR.write " max cluster sz=#{max}\n" + STDERR.write " min cluster sz=#{min}\n\n" + if no_change == max_no_change + STDERR.write "\nmax no change hit!\n\n" + assignment.each_pair { |centroid_index,docs| + puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}" + } + break + end + centroids = update(assignment) + end +end + + +main + diff --git a/max b/max new file mode 100755 index 0000000..506bd03 --- /dev/null +++ b/max @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +max = -1.0/0 +while line = STDIN.gets + v = line.strip.to_f + max = v if v > max +end +puts max + diff --git a/merge_files b/merge_files new file mode 100755 index 0000000..db9d5da --- /dev/null +++ b/merge_files @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "merge_files [file]+\n" + exit 1 +end +usage if ARGV.size==0 + + +files = ARGV +dicts = [] + +files.each { |i| + dicts.push Hash.new + dicts.last.default = 0 + File.open i, "r:UTF-8" do |f| + while line = f.gets + dicts.last[line.strip] += 1 + end + end +} + +dicts.each { |h| + h.each { |k,v| + counts = [] + dicts.each { |j| counts.push j[k]; j.delete k } + counts.max.times { puts k } + } +} + diff --git a/min b/min new file mode 100755 index 0000000..c2f85b9 --- /dev/null +++ b/min @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +min = 1.0/0 +while line = STDIN.gets + v = line.strip.to_f + min = v if v < min +end +puts min + diff --git a/min_max b/min_max new file mode 100755 index 0000000..f27de88 --- /dev/null +++ b/min_max @@ -0,0 +1,47 @@ +#!/usr/bin/ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + puts "filter-min-max.rb --min --max --in_f --in_e --out_f --out_e --out_id " +end +usage if ARGV.size!=14 + +opts = Trollop::options do + opt :min, "minimum #tokens", :type => :int, :default => 1 + opt :max, "maximum #tokens", :type => :int, :default => 80 + opt :in_f "input 'French' file", :type => string + opt :in_e "input 'English' file", :type => string + opt :out_f "output 'French' file", :type => string + opt :out_e "output 'English' file", :type => string + opt :out_id "output line Nos", :type => string +end + + +files = {} +files[:f_file] = File.new opts[:in_f], 'r:UTF-8' +files[:e_file] = File.new opts[:in_e], 'r:UTF-8' +files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8' +files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8' +files[:id_out_file] = File.new opts[:out_id], 'w' +i = 0 +while f_line = files[:f_file].gets + e_line = files[:e_file].gets + f_line.strip! + e_line.strip! + a = f_line.split + b = e_line.split + if a.size >= opts[:min] and a.size <= opts[:max] and \ + b.size >= opts[:min] and b.size <= opts[:max] + files[:f_out_file].write "#{f_line}\n" + files[:e_out_file].write "#{e_line}\n" + files[:id_out_file].write "#{i}\n" + end + i+=1 +end +files.values.each{|f|f.close} + diff --git a/moses_1best b/moses_1best new file mode 100755 index 0000000..5c6bf9d --- /dev/null +++ b/moses_1best @@ -0,0 +1,15 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +prev_idx = nil +while line = STDIN.gets + line.strip! + idx = line.split('|||')[0].to_i + if idx != prev_idx + puts line + prev_idx = idx + end +end + diff --git a/mult b/mult new file mode 100755 index 0000000..eaead89 --- /dev/null +++ b/mult @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby + +puts STDIN.gets.to_f * ARGV[0].to_f + diff --git a/ng b/ng new file mode 100755 index 0000000..d8b01ae --- /dev/null +++ b/ng @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby + +def ngrams_it(s, n, fix=false) + a = s.strip.split + a.each_with_index { |tok, i| + tok.strip! + 0.upto([n-1, a.size-i-1].min) { |m| + yield a[i..i+m] if !(fix^(a[i..i+m].size==n)) + } + } +end + +def main(n, fix, sep) + STDIN.set_encoding 'utf-8' + STDOUT.set_encoding 'utf-8' + while line = STDIN.gets + a = [] + ngrams_it(line, n, fix) {|ng| a << ng.join(' ')} + a.reject! {|i| i.strip.size==0 } + puts a.join sep if a.size > 0 + end +end + +def usage + STDERR.write "./ng [-n ] [--fix] [--separator ] < \n" + exit 1 +end + +if __FILE__ == $0 + require 'trollop' + opts = Trollop::options do + opt :n, "Ngrams", :type => :int, :default => 4 + opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => true + opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" + end + usage if not [0,2,4,6].include? ARGV.size + main(opts[:n], opts[:fix], opts[:separator]) +end + diff --git a/nn b/nn new file mode 100755 index 0000000..4d1dab7 --- /dev/null +++ b/nn @@ -0,0 +1,4 @@ +#!/bin/sh + +tr '[:digit:]' $1 < $2 > $(basename $2 ${2##*.})nn.${2##*.} + diff --git a/no_empty b/no_empty new file mode 100755 index 0000000..ecdbcdf --- /dev/null +++ b/no_empty @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby + +files = [] +(0..1).each { |i| files << File.new(ARGV[i], 'r') } +(2..3).each { |i| files << File.new(ARGV[i], 'w') } +files.each { |f| f.set_encoding('utf-8') } + +while line_f = files[0].gets + line_e = files[1].gets + line_f.strip!; line_e.strip! + next if line_f=='' || line_e=='' + files[2].write line_f+"\n" + files[3].write line_e+"\n" +end + +files.each { |f| f.close } + diff --git a/no_non_printables b/no_non_printables new file mode 100755 index 0000000..fda1e40 --- /dev/null +++ b/no_non_printables @@ -0,0 +1 @@ +sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' diff --git a/norm_german b/norm_german new file mode 100755 index 0000000..57a37bb --- /dev/null +++ b/norm_german @@ -0,0 +1,93 @@ +#!/usr/bin/env ruby + +require 'thread' +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "./avg [-r ] < \n" + exit 1 +end +usage if not [0,2,4].include? ARGV.size + +opts = Trollop::options do + opt :upper, "uppercase", :type => :bool, :default => false + opt :threads, "#threads", :type => :int, :default => 1, :short => '-h' + opt :shard_size, "shard size", :type => :int, :default => 1000 + opt :train, "train", :type => :bool + opt :apply, "apply", :type => :bool +end + + +pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ] +pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ] +if opts[:upper] + PAIRS = pairs_lower +else + PAIRS = pairs_lower+pairs_upper +end + +def get_key(old, new) + PAIRS.each { |i| + return old if new.gsub(i[0], i[1])==old + return old if new.gsub(i[1], i[0])==old + } + return nil +end + +def build_partial(tokens) + h = {} + tokens.each { |tok| + found = false + h.keys.each { |i| + if get_key i, tok + h[i] << tok + found = true + break + end + } + h[tok] = [tok] if !found + } + return h +end + +h = {} +threads = [] +thread_n = 0 +counter = 0 +token_stock = [] +mutex = Mutex.new +while tok = STDIN.gets # expects stream of (lowercased) tokens + token_stock << [] if !token_stock[thread_n] + token_stock[thread_n] << tok.strip! + counter += 1 + if token_stock[thread_n].size%opts[:shard_size]==0 + STDERR.write "Starting thread ##{thread_n}\n" + threads << Thread.new(token_stock[thread_n]) { |tokens| + th = build_partial tokens + mutex.synchronize do + h.merge! th + end + } + threads.last.abort_on_exception = true + thread_n += 1 + else + next + end + if thread_n==opts[:threads] + threads.each { |i| i.join } + token_stock.each { |i| i.clear } + thread_n = 0 + end + STDERR.write "#keys #{h.keys.size}\n" +end + +token_stock.each { |i| + if i.size!=0 + h.merge! build_partial i + end +} + diff --git a/normalize_punctuation b/normalize_punctuation new file mode 100755 index 0000000..108de44 --- /dev/null +++ b/normalize_punctuation @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w +# adapted from the moses scripts + +use strict; + +my ($language) = @ARGV; + +while() { + s/\r//g; + # normalize unicode punctuation + s/„/\"/g; + s/“/\"/g; + s/”/\"/g; + s/–/-/g; + s/—/ - /g; s/ +/ /g; + s/´/\'/g; + s/([a-z])‘([a-z])/$1\'$2/gi; + s/([a-z])’([a-z])/$1\'$2/gi; + s/‘/\"/g; + s/‚/\"/g; + s/’/\"/g; + s/''/\"/g; + s/´´/\"/g; + s/…/.../g; + # French quotes + s/ « / \"/g; + s/« /\"/g; + s/«/\"/g; + s/ » /\" /g; + s/ »/\"/g; + s/»/\"/g; + # handle pseudo-spaces + s/ \%/\%/g; + s/nº /nº /g; + s/ :/:/g; + s/ ºC/ ºC/g; + s/ cm/ cm/g; + s/ \?/\?/g; + s/ \!/\!/g; + s/ ;/;/g; + s/, /, /g; s/ +/ /g; + + print STDERR $_ if //; + + print $_; +} diff --git a/num_tok b/num_tok new file mode 100755 index 0000000..7cc500c --- /dev/null +++ b/num_tok @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby1.9.1 + +STDIN.set_encoding('utf-8') + +while line = STDIN.gets + puts line.split.length +end + diff --git a/odd b/odd new file mode 100755 index 0000000..0bd9336 --- /dev/null +++ b/odd @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +i = 1 +while line = STDIN.gets + puts line if i%2!=0 + i+=1 +end + diff --git a/paste_pairs b/paste_pairs new file mode 100755 index 0000000..6ede8f6 --- /dev/null +++ b/paste_pairs @@ -0,0 +1,11 @@ +#!/usr/bin/python + +import sys +from itertools import izip + + +for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))): + print linenr, (src_line.strip()) + print linenr, (tgt_line.strip()) + print + diff --git a/preprocess b/preprocess new file mode 100755 index 0000000..716255d --- /dev/null +++ b/preprocess @@ -0,0 +1,5 @@ +#!/bin/zsh + +LANG=$1 +~/scripts/htmlentities 2>htmlentities.$LANG.err | ~/scripts/normalize-punctuation 2>normalize-punctuation.$LANG.err | ~/moses/scripts/tokenizer/tokenizer.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | ~/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err + diff --git a/preprocess_nolow b/preprocess_nolow new file mode 100755 index 0000000..fc466b6 --- /dev/null +++ b/preprocess_nolow @@ -0,0 +1,5 @@ +#!/bin/zsh + +LANG=$1 +~/scripts/htmlentities 2>htmlentities.$LANG.err | ~/scripts/normalize-punctuation 2>normalize-punctuation.$LANG.err | ~/moses/scripts/tokenizer/tokenizer.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err + diff --git a/round b/round new file mode 100755 index 0000000..52cd013 --- /dev/null +++ b/round @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby + +puts STDIN.gets.to_f.round ARGV[0].to_i + diff --git a/ruby_eval b/ruby_eval new file mode 100755 index 0000000..fe0d181 --- /dev/null +++ b/ruby_eval @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby + +while line = STDIN.gets + puts "#{eval line}" +end + diff --git a/rule_shapes b/rule_shapes new file mode 100755 index 0000000..039b0dc --- /dev/null +++ b/rule_shapes @@ -0,0 +1,29 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def shape s + res = [] + in_t = false + s.split.each { |i| + if i.match /\A\[X,\d\]\z/ + if in_t + in_t = false + end + res << "NT" + next + else + res << "T" if not in_t + in_t = true + end + } + return res +end + +while line = STDIN.gets + f,e = line.split "\t" + f.strip!; e.strip! + puts shape(f).join('_')+"-"+shape(e).join('_') +end + diff --git a/sample b/sample new file mode 100755 index 0000000..b4706c6 --- /dev/null +++ b/sample @@ -0,0 +1,25 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "./sample --size < \n" + exit 1 +end +usage if ARGV.size!=4 + +opts = Trollop::options do + opt :size, "Sample n% (percentage).", :type => :int +end + + +prng = Random.new(Random.new_seed) + +while line = STDIN.gets + STDOUT.write line if prng.rand(1..opts[:size])==0 +end + diff --git a/sample_n b/sample_n new file mode 100755 index 0000000..2115407 --- /dev/null +++ b/sample_n @@ -0,0 +1,23 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./sample --size --population \n" + exit 1 +end +usage if ARGV.size!=4 + +opts = Trollop::options do + opt :size, "Sample size (percentage).", :type => :int + opt :population, "'Population' (number \in N)", :type => :int +end + + +prng = Random.new(Random.new_seed) + +1.upto(opts[:population]) { |i| + puts i if prng.rand(1..opts[:size])==0 +} + diff --git a/shard b/shard new file mode 100755 index 0000000..7729699 --- /dev/null +++ b/shard @@ -0,0 +1,81 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false) + lc = `wc -l #{input}`.split.first.to_i + input_ext = input.split('.').last + refs_ext = refs.split('.').last + index = (0..lc-1).to_a + index.reverse! + index.shuffle! if rand + shard_sz = lc / num_shards + leftover = lc % num_shards + in_f = File.new input, 'r' + in_lines = in_f.readlines + refs_f = File.new refs, 'r' + refs_lines = refs_f.readlines + a_f = File.new alignments, 'r' + a_lines = a_f.readlines + shard_in_files = [] + shard_refs_files = [] + shard_a_files = [] + in_fns = [] + refs_fns = [] + a_fns = [] + 0.upto(num_shards-1) { |shard| + in_fn = "#{output_prefix}.#{shard}.#{input_ext}" + shard_in = File.new in_fn, 'w+' + in_fns << in_fn + refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}" + shard_refs = File.new refs_fn, 'w+' + refs_fns << refs_fn + a_fn = "#{output_prefix}.#{shard}.a" + shard_a = File.new a_fn, 'w+' + a_fns << a_fn + 0.upto(shard_sz-1) { |i| + j = index.pop + shard_in.write in_lines[j] + shard_refs.write refs_lines[j] + shard_a.write a_lines[j] + } + shard_in_files << shard_in + shard_refs_files << shard_refs + shard_a_files << shard_a + } + if !rand + while leftover > 0 + j = index.pop + shard_in_files[-1].write in_lines[j] + shard_refs_files[-1].write refs_lines[j] + shard_a_files[-1].write a_lines[j] + leftover -= 1 + end + else + 0.upto(num_shards-1) { |shard| + break if leftover <= 0 + j = index.pop + shard_in_files[shard].write in_lines[j] + shard_refs_files[shard].write refs_lines[j] + shard_a_files[shard].write a_lines[j] + leftover -= 1 + } + end + (shard_in_files + shard_refs_files).each do |f| f.close end + in_f.close + refs_f.close + return [in_fns, refs_fns] +end + +opts = Trollop::options do + opt :input, 'input', :type => :string + opt :references, 'references', :type => :string + opt :alignments, 'alignments', :type => :string + opt :output_prefix, 'output prefix', :type => :string + opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z' + opt :num_shards, 'number of shards', :type => :int +end + +make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize]) + diff --git a/splitpipes b/splitpipes new file mode 100755 index 0000000..b0c3c9c --- /dev/null +++ b/splitpipes @@ -0,0 +1,29 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "splitpipes -f < \n" + exit 1 +end +usage if ARGV.size!=2 + +opts = Trollop::options do + opt :field, "field", :type => :int +end + +while line = STDIN.gets + j = 1 + line.strip.split(' ||| ').each { |i| + if j == opts[:field] + puts i.strip + break + end + j += 1 + } +end + diff --git a/stddev b/stddev new file mode 100755 index 0000000..3bf0270 --- /dev/null +++ b/stddev @@ -0,0 +1,41 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./stddev [-r ] < \n" + exit 1 +end +usage if not [0,2].include? ARGV.size + +opts = Trollop::options do + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + + +sum = 0.0 +i = 0 +cached = [] +while line=STDIN.gets + v = line.strip.to_f + sum += v + cached << v + i +=1 +end + +avg = sum/i.to_f + +var = 0 +cached.each { |v| + var += (avg - v)**2 +} + +stddev = Math.sqrt(var) + +if opts[:round] >= 0 + puts stddev.round opts[:round] +else + puts stddev +end + diff --git a/strip_whitespace b/strip_whitespace new file mode 100755 index 0000000..37c02e5 --- /dev/null +++ b/strip_whitespace @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby + +while line = STDIN.gets + puts line.lstrip.strip +end + diff --git a/sum b/sum new file mode 100755 index 0000000..3fca95e --- /dev/null +++ b/sum @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby + +sum = 0.0 +while line = STDIN.gets + sum += line.strip.to_f +end +puts sum + diff --git a/tf-idf b/tf-idf new file mode 100755 index 0000000..3edaaf8 --- /dev/null +++ b/tf-idf @@ -0,0 +1,80 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +# returns word='raw frequency' for a single document +def tf(d, stopwords=[]) + v = {}; v.default = 0 + d.uniq.each { |i| + next if stopwords.include? i + v[i] = d.count(i).to_f + } + return v +end + +# smoothes raw frequencies +def ntf(w, a=0.4) + max = w.values.max.to_f + w.each_pair { |k,v| + w[k] = a + (1-a)*(v/max) + } +end + +# returns idf value for each word in vocab +def idf(collection) + vocab = collection.values.flatten.uniq + n = collection.size.to_f + idf = {} + vocab.each { |i| + df = collection.values.flatten.count i + idf[i] = Math.log(n/df) + } + return idf +end + +def main + opts = Trollop::options do + opt :docs, "input files (documents)", :type => :strings, :required => true + opt :filter_stopwords, "filter stopwords (give file)", :type => :string + opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool + opt :ntf, "length-normalize tf values", :type => :bool + opt :idf, "weight tf by idf", :type => :bool + end + + stopwords = [] + if opts[:filter_stopwords] + stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''} + end + + docs = {} # fn => [words...] + opts[:docs].each { |i| + if opts[:one_item_per_line] + docs[i] = File.new(i, 'r').readlines.map{|i| i.strip} + else + docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip} + end + } + + idf_values = idf docs + + docs.each_pair { |name, words| + just_tf = tf(words) + just_tf = ntf(just_tf) if opts[:ntf] + tf_idf = {}; tf_idf.default = 0.0 + if opts[:idf] + just_tf.each_pair { |word,f| + tf_idf[word] = idf_values[word] * f + } + else + tf_idf = just_tf + end + docs[name] = tf_idf + } + + docs.each { |i| puts i.to_s } +end + + +main + diff --git a/toks b/toks new file mode 100755 index 0000000..ed40dbb --- /dev/null +++ b/toks @@ -0,0 +1,10 @@ +#!/usr/bin/ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + + +while line = STDIN.gets + line.strip.split(/\s/).each { |i| puts i } +end + diff --git a/var b/var new file mode 100755 index 0000000..08b75b6 --- /dev/null +++ b/var @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./stddev [-r ] < \n" + exit 1 +end +usage if not [0,2].include? ARGV.size + +opts = Trollop::options do + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + + +sum = 0.0 +i = 0 +cached = [] +while line=STDIN.gets + v = line.strip.to_f + sum += v + cached << v + i +=1 +end + +avg = sum/i.to_f + +var = 0 +cached.each { |v| + var += (avg - v)**2 +} + +if opts[:round] >= 0 + puts var.round opts[:round] +else + puts var +end + diff --git a/vocab b/vocab new file mode 100755 index 0000000..e7b03fe --- /dev/null +++ b/vocab @@ -0,0 +1,8 @@ +#!/bin/sh + +pushd `dirname $0` > /dev/null +SCRIPTPATH=`pwd -P` +popd > /dev/null + +$SCRIPTPATH/toks ${1+"$@"} | sort | uniq -u + diff --git a/wrap-xml.perl b/wrap-xml.perl new file mode 100755 index 0000000..d29065a --- /dev/null +++ b/wrap-xml.perl @@ -0,0 +1,40 @@ +#!/usr/bin/perl -w +# original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl + +use strict; + +my $src = $ARGV[0]; +my $language = $ARGV[1]; +die("syntax: wrap-xml.perl xml-frame language [system-name]") + unless $src && $language && -e $src; +my $system = "my-system"; +$system = $ARGV[2] if defined($ARGV[2]); + +open(SRC,$src); +my @OUT = ; +chomp(@OUT); +#my @OUT = `cat $decoder_output`; +while() { + chomp; + if (/^/) { + s/(]+> *).+(<\/seg>)/$1$line$2/; + } + else { + s/(]+> *)[^<]+/$1$line/; + } + } + print $_."\n"; +} + -- cgit v1.2.3