diff options
author | Patrick Simianer <p@simianer.de> | 2013-12-05 07:56:38 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-12-05 07:56:38 +0100 |
commit | db6a6ecfa350cae29739c59df1210d8f76a479c9 (patch) | |
tree | f137a001f57f170455c28ce97b5abb2726006cf6 |
init
-rw-r--r-- | README | 2 | ||||
-rwxr-xr-x | add_seg | 32 | ||||
-rwxr-xr-x | add_start_end | 9 | ||||
-rwxr-xr-x | avg | 31 | ||||
-rwxr-xr-x | avg_weights | 46 | ||||
-rwxr-xr-x | de-sgm | 4 | ||||
-rwxr-xr-x | even | 11 | ||||
-rwxr-xr-x | firstisupper | 10 | ||||
-rwxr-xr-x | htmlentities | 14 | ||||
-rwxr-xr-x | keycount | 11 | ||||
-rwxr-xr-x | kmeans | 162 | ||||
-rwxr-xr-x | max | 9 | ||||
-rwxr-xr-x | merge_files | 32 | ||||
-rwxr-xr-x | min | 9 | ||||
-rwxr-xr-x | min_max | 47 | ||||
-rwxr-xr-x | moses_1best | 15 | ||||
-rwxr-xr-x | mult | 4 | ||||
-rwxr-xr-x | ng | 39 | ||||
-rwxr-xr-x | nn | 4 | ||||
-rwxr-xr-x | no_empty | 17 | ||||
-rwxr-xr-x | no_non_printables | 1 | ||||
-rwxr-xr-x | norm_german | 93 | ||||
-rwxr-xr-x | normalize_punctuation | 46 | ||||
-rwxr-xr-x | num_tok | 8 | ||||
-rwxr-xr-x | odd | 11 | ||||
-rwxr-xr-x | paste_pairs | 11 | ||||
-rwxr-xr-x | preprocess | 5 | ||||
-rwxr-xr-x | preprocess_nolow | 5 | ||||
-rwxr-xr-x | round | 4 | ||||
-rwxr-xr-x | ruby_eval | 6 | ||||
-rwxr-xr-x | rule_shapes | 29 | ||||
-rwxr-xr-x | sample | 25 | ||||
-rwxr-xr-x | sample_n | 23 | ||||
-rwxr-xr-x | shard | 81 | ||||
-rwxr-xr-x | splitpipes | 29 | ||||
-rwxr-xr-x | stddev | 41 | ||||
-rwxr-xr-x | strip_whitespace | 6 | ||||
-rwxr-xr-x | sum | 8 | ||||
-rwxr-xr-x | tf-idf | 80 | ||||
-rwxr-xr-x | toks | 10 | ||||
-rwxr-xr-x | var | 39 | ||||
-rwxr-xr-x | vocab | 8 | ||||
-rwxr-xr-x | wrap-xml.perl | 40 |
43 files changed, 1117 insertions, 0 deletions
@@ -0,0 +1,2 @@ +misc. nlp related scripts + @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + puts "addseg [--nogz] [--loo] [--grammar] <path to grammars dir>\n" + exit 1 +end + +opts = Trollop::options do + opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true + opt :loo, "leave one out", :type => :bool, :default => false + opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i' + opt :nogz, "grammar files not gzipped", :type => :bool, :default => false +end + + +i = opts[:start_id] +while line = STDIN.gets + ext = '.gz' + ext = '' if opts[:nogz] + s = "<seg" + if opts[:loo] then s += " exclude=\"#{i}\"" end + if opts[:grammar] then s += " grammar=\"#{opts[:grammar]}/grammar.#{i}#{ext}\"" end + puts s + " id=\"#{i}\"> #{line.strip} </seg>" + i+=1 +end + diff --git a/add_start_end b/add_start_end new file mode 100755 index 0000000..a14a65e --- /dev/null +++ b/add_start_end @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +while line = STDIN.gets + puts "<s> #{line.strip} </s>" +end + @@ -0,0 +1,31 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./avg [-r <d>] < <one number per line>\n" + exit 1 +end +usage if not [0,2].include? ARGV.size + +opts = Trollop::options do + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + + +sum = 0.0 +i = 0 +while line=STDIN.gets + sum += line.strip.to_f + i +=1 +end + +avg = sum/i.to_f + +if opts[:round] >= 0 + puts avg.round opts[:round] +else + puts avg +end + diff --git a/avg_weights b/avg_weights new file mode 100755 index 0000000..2b72747 --- /dev/null +++ b/avg_weights @@ -0,0 +1,46 @@ +#!/usr/bin/env ruby + +require 'trollop' +require 'zlib' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +opts = Trollop::options do + opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false +end + +def usage + puts "avg_weights_filter [--filter] <filename>+" + exit 1 +end +usage if ARGV.size==0 + + +h = {} +ARGV.each { |fn| +if File.extname(fn)=='.gz' + f = Zlib::GzipReader.new(File.new(fn, 'rb')) +else + f = File.new fn, 'r' +end +while line = f.gets + k, v = line.split + v = v.to_f + if h.has_key? k + h[k] << v + else + h[k] = [v] + end +end +f.close +} + +n = ARGV.size.to_f + +h.each_pair { |k,a| + next if opts[:filter] and a.size < n + puts "#{k} #{a.inject(:+)/n}" +} + @@ -0,0 +1,4 @@ +#!/bin/sh + +grep -v -P "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset)[^>]*>)[[:space:]]*$" | grep -v -P "^[[:space:]]*<(url|description|keywords|talkid|title)>.*</(url|description|keywords|talkid|title)>[[:space:]]*$" | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||" + @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +i = 1 +while line = STDIN.gets + puts line if i%2==0 + i+=1 +end + diff --git a/firstisupper b/firstisupper new file mode 100755 index 0000000..4278334 --- /dev/null +++ b/firstisupper @@ -0,0 +1,10 @@ +#!/usr/bin/env ruby + +def downcase?(string) + string[/[[:lower:]]/] +end + +while line = STDIN.gets + puts line.strip if downcase? line[0] +end + diff --git a/htmlentities b/htmlentities new file mode 100755 index 0000000..ecbee3f --- /dev/null +++ b/htmlentities @@ -0,0 +1,14 @@ +#!/usr/bin/ruby + +require 'htmlentities' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +coder = HTMLEntities.new + +while line = STDIN.gets + puts coder.decode(line.strip) +end + diff --git a/keycount b/keycount new file mode 100755 index 0000000..15b4095 --- /dev/null +++ b/keycount @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +h = {} +h.default = 0 +while line = STDIN.gets + line.strip! + h[line] += 1 +end + +h.each_pair {|k,v| puts "#{k} #{v}"} + @@ -0,0 +1,162 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +# {s:f} {s:f} => f +def dot(x,y) + sum = 0.0 + x.each_pair { |k,v| sum += v * y[k] } + return sum +end + +# {s:f} => f +def mag(x) + return Math.sqrt x.values.inject { |sum,i| sum+i**2 } +end + +# {s:f} {s:f} => f +def cos_sim(x,y) + return dot(x,y)/(mag(x)*mag(y)) +end + +# {s:f} {s:f} => f +def euclidian_dist(x,y) + dims = [x.keys, y.keys].flatten.uniq + sum = 0.0 + dims.each { |i| sum += (x[i] - y[i])**2 } + return Math.sqrt(sum) +end + +# str => {s:{s:f}} +def read(fn) + h = {} + f = File.new fn, 'r' + while line = f.gets + g = eval line + h[g[0]] = g[1] + h[g[0]].default = 0.0 + end + return h +end + +# {s:{s:f}} i => [{s:f}] +def rand_init(docs, k) + prng = Random.new + return docs.keys.sample k, random:prng +end + +def rand_init2(docs, k) + prng = Random.new + a = [] + 0.upto(k-1) do + a << mean(docs.values.sample k, random:prng) + end + return a +end + +# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]} +def assign(docs, centroids) + assignment = {} + docs.each_pair { |name,feature_vector| + min = 1.0/0 + min_index = nil + centroids.each_with_index { |c,j| + dist = euclidian_dist(c, feature_vector) + if dist < min + min = dist + min_index = j + end + } + if assignment.has_key? min_index + assignment[min_index] << [name, feature_vector] + else + assignment[min_index] = [[name, feature_vector]] + end + } + return assignment +end + +# [{s:f}] => {s:f} +def mean(a) + res = {} + res.default = 0.0 + a.each { |i| + i.each_pair { |k,v| + res[k] += v + } + } + n = a.size.to_f + res.each_pair { |k,v| + res[k] = v/n + } +end + +# {i:[{s:f}]} => [{s:f}] +def update(assignment) + new_centroids = [] + assignment.each_pair { |centroid,docs| + new_centroids << mean(docs.map{|i |i[1]}) + } + return new_centroids +end + +def main + opts = Trollop::options do + opt :k, "k", :type => :int, :required => true + opt :input, "input: one feature vector per line", :type => :string, :required => true + opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 + opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3 + opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 + end + docs = read opts[:input] + k = opts[:k] + centroids = nil + if opts[:init] == 1 + centroids = rand_init(docs, k) + else + centroids = rand_init2(docs, k) + end + STDERR.write "\n k #{k}\n" + STDERR.write " input #{opts[:input]}\n" + STDERR.write "iterations #{opts[:max_iterations]}\n" + STDERR.write "max no ch. #{opts[:max_no_change]}\n" + STDERR.write " init #{opts[:init]}\n\n" + assignment = nil + prev_stats = [] + stats = [] + no_change = 0 + max_no_change = 5 + STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n" + 0.upto(opts[:max_iterations]) do |i| + s = "iteration #{i}" + STDERR.write "#{s}\n#{'-'*s.size}\n" + assignment = assign(docs, centroids) + sizes = [] + assignment.each_pair { |centroid_index,docs| + sizes << docs.size + } + median = sizes.sort[k/2] + max = sizes.max + min = sizes.min + stats = [median, max, min] + no_change += 1 if stats==prev_stats + prev_stats = stats + STDERR.write sizes.to_s + "\n" + STDERR.write " median cluster sz=#{median}\n" + STDERR.write " max cluster sz=#{max}\n" + STDERR.write " min cluster sz=#{min}\n\n" + if no_change == max_no_change + STDERR.write "\nmax no change hit!\n\n" + assignment.each_pair { |centroid_index,docs| + puts "#{centroid_index} #{docs.map{|i| i[0]}.to_s}" + } + break + end + centroids = update(assignment) + end +end + + +main + @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +max = -1.0/0 +while line = STDIN.gets + v = line.strip.to_f + max = v if v > max +end +puts max + diff --git a/merge_files b/merge_files new file mode 100755 index 0000000..db9d5da --- /dev/null +++ b/merge_files @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "merge_files [file]+\n" + exit 1 +end +usage if ARGV.size==0 + + +files = ARGV +dicts = [] + +files.each { |i| + dicts.push Hash.new + dicts.last.default = 0 + File.open i, "r:UTF-8" do |f| + while line = f.gets + dicts.last[line.strip] += 1 + end + end +} + +dicts.each { |h| + h.each { |k,v| + counts = [] + dicts.each { |j| counts.push j[k]; j.delete k } + counts.max.times { puts k } + } +} + @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +min = 1.0/0 +while line = STDIN.gets + v = line.strip.to_f + min = v if v < min +end +puts min + @@ -0,0 +1,47 @@ +#!/usr/bin/ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + puts "filter-min-max.rb --min <min> --max <max> --in_f <in f> --in_e <in e> --out_f <out f> --out_e <out e> --out_id <out ids>" +end +usage if ARGV.size!=14 + +opts = Trollop::options do + opt :min, "minimum #tokens", :type => :int, :default => 1 + opt :max, "maximum #tokens", :type => :int, :default => 80 + opt :in_f "input 'French' file", :type => string + opt :in_e "input 'English' file", :type => string + opt :out_f "output 'French' file", :type => string + opt :out_e "output 'English' file", :type => string + opt :out_id "output line Nos", :type => string +end + + +files = {} +files[:f_file] = File.new opts[:in_f], 'r:UTF-8' +files[:e_file] = File.new opts[:in_e], 'r:UTF-8' +files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8' +files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8' +files[:id_out_file] = File.new opts[:out_id], 'w' +i = 0 +while f_line = files[:f_file].gets + e_line = files[:e_file].gets + f_line.strip! + e_line.strip! + a = f_line.split + b = e_line.split + if a.size >= opts[:min] and a.size <= opts[:max] and \ + b.size >= opts[:min] and b.size <= opts[:max] + files[:f_out_file].write "#{f_line}\n" + files[:e_out_file].write "#{e_line}\n" + files[:id_out_file].write "#{i}\n" + end + i+=1 +end +files.values.each{|f|f.close} + diff --git a/moses_1best b/moses_1best new file mode 100755 index 0000000..5c6bf9d --- /dev/null +++ b/moses_1best @@ -0,0 +1,15 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +prev_idx = nil +while line = STDIN.gets + line.strip! + idx = line.split('|||')[0].to_i + if idx != prev_idx + puts line + prev_idx = idx + end +end + @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby + +puts STDIN.gets.to_f * ARGV[0].to_f + @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby + +def ngrams_it(s, n, fix=false) + a = s.strip.split + a.each_with_index { |tok, i| + tok.strip! + 0.upto([n-1, a.size-i-1].min) { |m| + yield a[i..i+m] if !(fix^(a[i..i+m].size==n)) + } + } +end + +def main(n, fix, sep) + STDIN.set_encoding 'utf-8' + STDOUT.set_encoding 'utf-8' + while line = STDIN.gets + a = [] + ngrams_it(line, n, fix) {|ng| a << ng.join(' ')} + a.reject! {|i| i.strip.size==0 } + puts a.join sep if a.size > 0 + end +end + +def usage + STDERR.write "./ng [-n <n>] [--fix] [--separator <s>] < <one number per line>\n" + exit 1 +end + +if __FILE__ == $0 + require 'trollop' + opts = Trollop::options do + opt :n, "Ngrams", :type => :int, :default => 4 + opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => true + opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" + end + usage if not [0,2,4,6].include? ARGV.size + main(opts[:n], opts[:fix], opts[:separator]) +end + @@ -0,0 +1,4 @@ +#!/bin/sh + +tr '[:digit:]' $1 < $2 > $(basename $2 ${2##*.})nn.${2##*.} + diff --git a/no_empty b/no_empty new file mode 100755 index 0000000..ecdbcdf --- /dev/null +++ b/no_empty @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby + +files = [] +(0..1).each { |i| files << File.new(ARGV[i], 'r') } +(2..3).each { |i| files << File.new(ARGV[i], 'w') } +files.each { |f| f.set_encoding('utf-8') } + +while line_f = files[0].gets + line_e = files[1].gets + line_f.strip!; line_e.strip! + next if line_f=='' || line_e=='' + files[2].write line_f+"\n" + files[3].write line_e+"\n" +end + +files.each { |f| f.close } + diff --git a/no_non_printables b/no_non_printables new file mode 100755 index 0000000..fda1e40 --- /dev/null +++ b/no_non_printables @@ -0,0 +1 @@ +sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' diff --git a/norm_german b/norm_german new file mode 100755 index 0000000..57a37bb --- /dev/null +++ b/norm_german @@ -0,0 +1,93 @@ +#!/usr/bin/env ruby + +require 'thread' +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "./avg [-r <d>] < <one number per line>\n" + exit 1 +end +usage if not [0,2,4].include? ARGV.size + +opts = Trollop::options do + opt :upper, "uppercase", :type => :bool, :default => false + opt :threads, "#threads", :type => :int, :default => 1, :short => '-h' + opt :shard_size, "shard size", :type => :int, :default => 1000 + opt :train, "train", :type => :bool + opt :apply, "apply", :type => :bool +end + + +pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ] +pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ] +if opts[:upper] + PAIRS = pairs_lower +else + PAIRS = pairs_lower+pairs_upper +end + +def get_key(old, new) + PAIRS.each { |i| + return old if new.gsub(i[0], i[1])==old + return old if new.gsub(i[1], i[0])==old + } + return nil +end + +def build_partial(tokens) + h = {} + tokens.each { |tok| + found = false + h.keys.each { |i| + if get_key i, tok + h[i] << tok + found = true + break + end + } + h[tok] = [tok] if !found + } + return h +end + +h = {} +threads = [] +thread_n = 0 +counter = 0 +token_stock = [] +mutex = Mutex.new +while tok = STDIN.gets # expects stream of (lowercased) tokens + token_stock << [] if !token_stock[thread_n] + token_stock[thread_n] << tok.strip! + counter += 1 + if token_stock[thread_n].size%opts[:shard_size]==0 + STDERR.write "Starting thread ##{thread_n}\n" + threads << Thread.new(token_stock[thread_n]) { |tokens| + th = build_partial tokens + mutex.synchronize do + h.merge! th + end + } + threads.last.abort_on_exception = true + thread_n += 1 + else + next + end + if thread_n==opts[:threads] + threads.each { |i| i.join } + token_stock.each { |i| i.clear } + thread_n = 0 + end + STDERR.write "#keys #{h.keys.size}\n" +end + +token_stock.each { |i| + if i.size!=0 + h.merge! build_partial i + end +} + diff --git a/normalize_punctuation b/normalize_punctuation new file mode 100755 index 0000000..108de44 --- /dev/null +++ b/normalize_punctuation @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w +# adapted from the moses scripts + +use strict; + +my ($language) = @ARGV; + +while(<STDIN>) { + s/\r//g; + # normalize unicode punctuation + s/„/\"/g; + s/“/\"/g; + s/”/\"/g; + s/–/-/g; + s/—/ - /g; s/ +/ /g; + s/´/\'/g; + s/([a-z])‘([a-z])/$1\'$2/gi; + s/([a-z])’([a-z])/$1\'$2/gi; + s/‘/\"/g; + s/‚/\"/g; + s/’/\"/g; + s/''/\"/g; + s/´´/\"/g; + s/…/.../g; + # French quotes + s/ « / \"/g; + s/« /\"/g; + s/«/\"/g; + s/ » /\" /g; + s/ »/\"/g; + s/»/\"/g; + # handle pseudo-spaces + s/ \%/\%/g; + s/nº /nº /g; + s/ :/:/g; + s/ ºC/ ºC/g; + s/ cm/ cm/g; + s/ \?/\?/g; + s/ \!/\!/g; + s/ ;/;/g; + s/, /, /g; s/ +/ /g; + + print STDERR $_ if //; + + print $_; +} @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby1.9.1 + +STDIN.set_encoding('utf-8') + +while line = STDIN.gets + puts line.split.length +end + @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +i = 1 +while line = STDIN.gets + puts line if i%2!=0 + i+=1 +end + diff --git a/paste_pairs b/paste_pairs new file mode 100755 index 0000000..6ede8f6 --- /dev/null +++ b/paste_pairs @@ -0,0 +1,11 @@ +#!/usr/bin/python + +import sys +from itertools import izip + + +for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))): + print linenr, (src_line.strip()) + print linenr, (tgt_line.strip()) + print + diff --git a/preprocess b/preprocess new file mode 100755 index 0000000..716255d --- /dev/null +++ b/preprocess @@ -0,0 +1,5 @@ +#!/bin/zsh + +LANG=$1 +~/scripts/htmlentities 2>htmlentities.$LANG.err | ~/scripts/normalize-punctuation 2>normalize-punctuation.$LANG.err | ~/moses/scripts/tokenizer/tokenizer.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | ~/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err + diff --git a/preprocess_nolow b/preprocess_nolow new file mode 100755 index 0000000..fc466b6 --- /dev/null +++ b/preprocess_nolow @@ -0,0 +1,5 @@ +#!/bin/zsh + +LANG=$1 +~/scripts/htmlentities 2>htmlentities.$LANG.err | ~/scripts/normalize-punctuation 2>normalize-punctuation.$LANG.err | ~/moses/scripts/tokenizer/tokenizer.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err + @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby + +puts STDIN.gets.to_f.round ARGV[0].to_i + diff --git a/ruby_eval b/ruby_eval new file mode 100755 index 0000000..fe0d181 --- /dev/null +++ b/ruby_eval @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby + +while line = STDIN.gets + puts "#{eval line}" +end + diff --git a/rule_shapes b/rule_shapes new file mode 100755 index 0000000..039b0dc --- /dev/null +++ b/rule_shapes @@ -0,0 +1,29 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def shape s + res = [] + in_t = false + s.split.each { |i| + if i.match /\A\[X,\d\]\z/ + if in_t + in_t = false + end + res << "NT" + next + else + res << "T" if not in_t + in_t = true + end + } + return res +end + +while line = STDIN.gets + f,e = line.split "\t" + f.strip!; e.strip! + puts shape(f).join('_')+"-"+shape(e).join('_') +end + @@ -0,0 +1,25 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "./sample --size <n> < <line separated data>\n" + exit 1 +end +usage if ARGV.size!=4 + +opts = Trollop::options do + opt :size, "Sample n% (percentage).", :type => :int +end + + +prng = Random.new(Random.new_seed) + +while line = STDIN.gets + STDOUT.write line if prng.rand(1..opts[:size])==0 +end + diff --git a/sample_n b/sample_n new file mode 100755 index 0000000..2115407 --- /dev/null +++ b/sample_n @@ -0,0 +1,23 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./sample --size <n> --population <n>\n" + exit 1 +end +usage if ARGV.size!=4 + +opts = Trollop::options do + opt :size, "Sample size (percentage).", :type => :int + opt :population, "'Population' (number \in N)", :type => :int +end + + +prng = Random.new(Random.new_seed) + +1.upto(opts[:population]) { |i| + puts i if prng.rand(1..opts[:size])==0 +} + @@ -0,0 +1,81 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false) + lc = `wc -l #{input}`.split.first.to_i + input_ext = input.split('.').last + refs_ext = refs.split('.').last + index = (0..lc-1).to_a + index.reverse! + index.shuffle! if rand + shard_sz = lc / num_shards + leftover = lc % num_shards + in_f = File.new input, 'r' + in_lines = in_f.readlines + refs_f = File.new refs, 'r' + refs_lines = refs_f.readlines + a_f = File.new alignments, 'r' + a_lines = a_f.readlines + shard_in_files = [] + shard_refs_files = [] + shard_a_files = [] + in_fns = [] + refs_fns = [] + a_fns = [] + 0.upto(num_shards-1) { |shard| + in_fn = "#{output_prefix}.#{shard}.#{input_ext}" + shard_in = File.new in_fn, 'w+' + in_fns << in_fn + refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}" + shard_refs = File.new refs_fn, 'w+' + refs_fns << refs_fn + a_fn = "#{output_prefix}.#{shard}.a" + shard_a = File.new a_fn, 'w+' + a_fns << a_fn + 0.upto(shard_sz-1) { |i| + j = index.pop + shard_in.write in_lines[j] + shard_refs.write refs_lines[j] + shard_a.write a_lines[j] + } + shard_in_files << shard_in + shard_refs_files << shard_refs + shard_a_files << shard_a + } + if !rand + while leftover > 0 + j = index.pop + shard_in_files[-1].write in_lines[j] + shard_refs_files[-1].write refs_lines[j] + shard_a_files[-1].write a_lines[j] + leftover -= 1 + end + else + 0.upto(num_shards-1) { |shard| + break if leftover <= 0 + j = index.pop + shard_in_files[shard].write in_lines[j] + shard_refs_files[shard].write refs_lines[j] + shard_a_files[shard].write a_lines[j] + leftover -= 1 + } + end + (shard_in_files + shard_refs_files).each do |f| f.close end + in_f.close + refs_f.close + return [in_fns, refs_fns] +end + +opts = Trollop::options do + opt :input, 'input', :type => :string + opt :references, 'references', :type => :string + opt :alignments, 'alignments', :type => :string + opt :output_prefix, 'output prefix', :type => :string + opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z' + opt :num_shards, 'number of shards', :type => :int +end + +make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize]) + diff --git a/splitpipes b/splitpipes new file mode 100755 index 0000000..b0c3c9c --- /dev/null +++ b/splitpipes @@ -0,0 +1,29 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +def usage + STDERR.write "splitpipes -f <n> < <input>\n" + exit 1 +end +usage if ARGV.size!=2 + +opts = Trollop::options do + opt :field, "field", :type => :int +end + +while line = STDIN.gets + j = 1 + line.strip.split(' ||| ').each { |i| + if j == opts[:field] + puts i.strip + break + end + j += 1 + } +end + @@ -0,0 +1,41 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./stddev [-r <d>] < <one number per line>\n" + exit 1 +end +usage if not [0,2].include? ARGV.size + +opts = Trollop::options do + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + + +sum = 0.0 +i = 0 +cached = [] +while line=STDIN.gets + v = line.strip.to_f + sum += v + cached << v + i +=1 +end + +avg = sum/i.to_f + +var = 0 +cached.each { |v| + var += (avg - v)**2 +} + +stddev = Math.sqrt(var) + +if opts[:round] >= 0 + puts stddev.round opts[:round] +else + puts stddev +end + diff --git a/strip_whitespace b/strip_whitespace new file mode 100755 index 0000000..37c02e5 --- /dev/null +++ b/strip_whitespace @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby + +while line = STDIN.gets + puts line.lstrip.strip +end + @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby + +sum = 0.0 +while line = STDIN.gets + sum += line.strip.to_f +end +puts sum + @@ -0,0 +1,80 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +# returns word='raw frequency' for a single document +def tf(d, stopwords=[]) + v = {}; v.default = 0 + d.uniq.each { |i| + next if stopwords.include? i + v[i] = d.count(i).to_f + } + return v +end + +# smoothes raw frequencies +def ntf(w, a=0.4) + max = w.values.max.to_f + w.each_pair { |k,v| + w[k] = a + (1-a)*(v/max) + } +end + +# returns idf value for each word in vocab +def idf(collection) + vocab = collection.values.flatten.uniq + n = collection.size.to_f + idf = {} + vocab.each { |i| + df = collection.values.flatten.count i + idf[i] = Math.log(n/df) + } + return idf +end + +def main + opts = Trollop::options do + opt :docs, "input files (documents)", :type => :strings, :required => true + opt :filter_stopwords, "filter stopwords (give file)", :type => :string + opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool + opt :ntf, "length-normalize tf values", :type => :bool + opt :idf, "weight tf by idf", :type => :bool + end + + stopwords = [] + if opts[:filter_stopwords] + stopwords = File.new('stop.txt.utf8', 'r').readlines.map{|i| i.split('|').first.strip}.reject{|i|i==''} + end + + docs = {} # fn => [words...] + opts[:docs].each { |i| + if opts[:one_item_per_line] + docs[i] = File.new(i, 'r').readlines.map{|i| i.strip} + else + docs[i] = File.new(i, 'r').read.split(/\s/).map{|i| i.strip} + end + } + + idf_values = idf docs + + docs.each_pair { |name, words| + just_tf = tf(words) + just_tf = ntf(just_tf) if opts[:ntf] + tf_idf = {}; tf_idf.default = 0.0 + if opts[:idf] + just_tf.each_pair { |word,f| + tf_idf[word] = idf_values[word] * f + } + else + tf_idf = just_tf + end + docs[name] = tf_idf + } + + docs.each { |i| puts i.to_s } +end + + +main + @@ -0,0 +1,10 @@ +#!/usr/bin/ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + + +while line = STDIN.gets + line.strip.split(/\s/).each { |i| puts i } +end + @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby + +require 'trollop' + + +def usage + STDERR.write "./stddev [-r <d>] < <one number per line>\n" + exit 1 +end +usage if not [0,2].include? ARGV.size + +opts = Trollop::options do + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + + +sum = 0.0 +i = 0 +cached = [] +while line=STDIN.gets + v = line.strip.to_f + sum += v + cached << v + i +=1 +end + +avg = sum/i.to_f + +var = 0 +cached.each { |v| + var += (avg - v)**2 +} + +if opts[:round] >= 0 + puts var.round opts[:round] +else + puts var +end + @@ -0,0 +1,8 @@ +#!/bin/sh + +pushd `dirname $0` > /dev/null +SCRIPTPATH=`pwd -P` +popd > /dev/null + +$SCRIPTPATH/toks ${1+"$@"} | sort | uniq -u + diff --git a/wrap-xml.perl b/wrap-xml.perl new file mode 100755 index 0000000..d29065a --- /dev/null +++ b/wrap-xml.perl @@ -0,0 +1,40 @@ +#!/usr/bin/perl -w +# original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl + +use strict; + +my $src = $ARGV[0]; +my $language = $ARGV[1]; +die("syntax: wrap-xml.perl xml-frame language [system-name]") + unless $src && $language && -e $src; +my $system = "my-system"; +$system = $ARGV[2] if defined($ARGV[2]); + +open(SRC,$src); +my @OUT = <STDIN>; +chomp(@OUT); +#my @OUT = `cat $decoder_output`; +while(<SRC>) { + chomp; + if (/^<srcset/) { + s/<srcset/<tstset trglang="$language" sysid="$system"/; + } + elsif (/^<\/srcset/) { + s/<\/srcset/<\/tstset/; + } + elsif (/^<DOC/) { + s/<DOC/<DOC sysid="$system"/; + } + elsif (/<seg/) { + my $line = shift(@OUT); + $line = "" if $line =~ /NO BEST TRANSLATION/; + if (/<\/seg>/) { + s/(<seg[^>]+> *).+(<\/seg>)/$1$line$2/; + } + else { + s/(<seg[^>]+> *)[^<]+/$1$line/; + } + } + print $_."\n"; +} + |