From db6a6ecfa350cae29739c59df1210d8f76a479c9 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Thu, 5 Dec 2013 07:56:38 +0100
Subject: init
---
README | 2 +
add_seg | 32 ++++++++++
add_start_end | 9 +++
avg | 31 ++++++++++
avg_weights | 46 ++++++++++++++
de-sgm | 4 ++
even | 11 ++++
firstisupper | 10 ++++
htmlentities | 14 +++++
keycount | 11 ++++
kmeans | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++
max | 9 +++
merge_files | 32 ++++++++++
min | 9 +++
min_max | 47 +++++++++++++++
moses_1best | 15 +++++
mult | 4 ++
ng | 39 ++++++++++++
nn | 4 ++
no_empty | 17 ++++++
no_non_printables | 1 +
norm_german | 93 +++++++++++++++++++++++++++++
normalize_punctuation | 46 ++++++++++++++
num_tok | 8 +++
odd | 11 ++++
paste_pairs | 11 ++++
preprocess | 5 ++
preprocess_nolow | 5 ++
round | 4 ++
ruby_eval | 6 ++
rule_shapes | 29 +++++++++
sample | 25 ++++++++
sample_n | 23 +++++++
shard | 81 +++++++++++++++++++++++++
splitpipes | 29 +++++++++
stddev | 41 +++++++++++++
strip_whitespace | 6 ++
sum | 8 +++
tf-idf | 80 +++++++++++++++++++++++++
toks | 10 ++++
var | 39 ++++++++++++
vocab | 8 +++
wrap-xml.perl | 40 +++++++++++++
43 files changed, 1117 insertions(+)
create mode 100644 README
create mode 100755 add_seg
create mode 100755 add_start_end
create mode 100755 avg
create mode 100755 avg_weights
create mode 100755 de-sgm
create mode 100755 even
create mode 100755 firstisupper
create mode 100755 htmlentities
create mode 100755 keycount
create mode 100755 kmeans
create mode 100755 max
create mode 100755 merge_files
create mode 100755 min
create mode 100755 min_max
create mode 100755 moses_1best
create mode 100755 mult
create mode 100755 ng
create mode 100755 nn
create mode 100755 no_empty
create mode 100755 no_non_printables
create mode 100755 norm_german
create mode 100755 normalize_punctuation
create mode 100755 num_tok
create mode 100755 odd
create mode 100755 paste_pairs
create mode 100755 preprocess
create mode 100755 preprocess_nolow
create mode 100755 round
create mode 100755 ruby_eval
create mode 100755 rule_shapes
create mode 100755 sample
create mode 100755 sample_n
create mode 100755 shard
create mode 100755 splitpipes
create mode 100755 stddev
create mode 100755 strip_whitespace
create mode 100755 sum
create mode 100755 tf-idf
create mode 100755 toks
create mode 100755 var
create mode 100755 vocab
create mode 100755 wrap-xml.perl
diff --git a/README b/README
new file mode 100644
index 0000000..8ce273f
--- /dev/null
+++ b/README
@@ -0,0 +1,2 @@
+Miscellaneous NLP-related scripts.
+
diff --git a/add_seg b/add_seg
new file mode 100755
index 0000000..e661b40
--- /dev/null
+++ b/add_seg
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+# Print the command-line synopsis and exit with status 1 (not called anywhere in the visible code).
+def usage
+ puts "addseg [--nogz] [--loo] [--grammar] \n"
+ exit 1
+end
+
+opts = Trollop::options do
+ opt :grammar, "(Abs) path of folder containing grammar.", :type => :string, :short => '-g', :required => true
+ opt :loo, "leave one out", :type => :bool, :default => false
+ opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i'
+ opt :nogz, "grammar files not gzipped", :type => :bool, :default => false
+end
+
+
+# Per STDIN line: choose the grammar file extension, build `s`, advance the running id.
+i = opts[:start_id]
+while line = STDIN.gets
+ ext = '.gz'
+ ext = '' if opts[:nogz]
+ # NOTE(review): `s` is never printed and `ext`/`i` are otherwise unused here; the hunk header declares 32 lines, so markup (presumably a <seg ...> wrapper) seems to have been lost -- confirm against the original.
+ s = " #{line.strip} "
+ i+=1
+end
+
diff --git a/add_start_end b/add_start_end
new file mode 100755
index 0000000..a14a65e
--- /dev/null
+++ b/add_start_end
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Wrap every STDIN line in sentence boundary markers (the markers had been stripped from the string literal).
+STDIN.each_line do |line|
+  puts "<s> #{line.strip} </s>"
+end
+
diff --git a/avg b/avg
new file mode 100755
index 0000000..cc4c0e6
--- /dev/null
+++ b/avg
@@ -0,0 +1,31 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# Print the synopsis to STDERR and exit with status 1.
+def usage
+  STDERR.write "./avg [-r <n>] < <one number per line>\n"
+  exit 1
+end
+# Only "no arguments" or "one option plus its value" are accepted.
+usage unless [0, 2].include? ARGV.size
+
+opts = Trollop::options do
+  opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
+end
+
+# Accumulate sum and count of the values read from STDIN (parsed with to_f).
+total = 0.0
+count = 0
+STDIN.each_line do |row|
+  total += row.strip.to_f
+  count += 1
+end
+
+# Empty input gives 0.0/0, i.e. NaN, exactly as before.
+mean = total / count
+
+# A negative --round (the default, -1) prints the unrounded mean.
+puts(opts[:round] >= 0 ? mean.round(opts[:round]) : mean)
+
diff --git a/avg_weights b/avg_weights
new file mode 100755
index 0000000..2b72747
--- /dev/null
+++ b/avg_weights
@@ -0,0 +1,46 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+require 'zlib'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+opts = Trollop::options do
+  opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
+end
+
+# Print the synopsis and exit with status 1.
+def usage
+  puts "avg_weights_filter [--filter] <weights file>+"
+  exit 1
+end
+usage if ARGV.empty?
+
+# Open +fn+ for reading; a '.gz' extension selects a gzip reader.
+def open_weights(fn)
+  return Zlib::GzipReader.new(File.new(fn, 'rb')) if File.extname(fn) == '.gz'
+  File.new fn, 'r'
+end
+
+# key => list of the values seen for it (one entry per occurrence in any file)
+h = Hash.new { |hash, key| hash[key] = [] }
+ARGV.each do |fn|
+  f = open_weights(fn)
+  begin
+    f.each_line do |line|
+      k, v = line.split
+      h[k] << v.to_f
+    end
+  ensure
+    f.close # release the handle even if reading fails half-way
+  end
+end
+n = ARGV.size.to_f
+# The divisor is the number of files, so a key missing from a file counts as 0 there.
+h.each_pair do |k, a|
+  next if opts[:filter] && a.size < n
+  puts "#{k} #{a.inject(:+) / n}"
+end
+
diff --git a/de-sgm b/de-sgm
new file mode 100755
index 0000000..fa28301
--- /dev/null
+++ b/de-sgm
@@ -0,0 +1,4 @@
+#!/bin/sh
+# Drop XML/SGML wrapper and metadata lines, then strip the <seg ...> / </seg> tags (the "<...>" fragments had been lost from the patterns).
+grep -v -P "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset)[^>]*>)[[:space:]]*$" | grep -v -P "^[[:space:]]*<(url|description|keywords|talkid|title)>.*</(url|description|keywords|talkid|title)>[[:space:]]*$" | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||"
+
diff --git a/even b/even
new file mode 100755
index 0000000..dcee3d9
--- /dev/null
+++ b/even
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Echo every second line of STDIN: the 2nd, 4th, 6th, ...
+STDIN.each_line.with_index(1) do |row, lineno|
+  next if lineno.odd?
+
+  puts row
+end
+
diff --git a/firstisupper b/firstisupper
new file mode 100755
index 0000000..4278334
--- /dev/null
+++ b/firstisupper
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+# True-ish (the matched character) when +string+ contains a lowercase letter, nil otherwise.
+def downcase?(string)
+  string.slice(/[[:lower:]]/)
+end
+# NOTE(review): this emits lines whose first character is LOWERcase, despite the script name -- confirm intent.
+STDIN.each_line do |row|
+  puts row.strip if downcase? row[0]
+end
+
diff --git a/htmlentities b/htmlentities
new file mode 100755
index 0000000..ecbee3f
--- /dev/null
+++ b/htmlentities
@@ -0,0 +1,14 @@
+#!/usr/bin/ruby
+
+require 'htmlentities'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Decode HTML entities on every line of STDIN; surrounding whitespace is stripped first.
+decoder = HTMLEntities.new
+
+STDIN.each_line do |row|
+  puts decoder.decode(row.strip)
+end
+
diff --git a/keycount b/keycount
new file mode 100755
index 0000000..15b4095
--- /dev/null
+++ b/keycount
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+# Count how often each (whitespace-stripped) line occurs on STDIN.
+tally = Hash.new(0)
+STDIN.each_line do |row|
+  key = row.strip
+  tally[key] += 1
+end
+
+# One "<line> <count>" pair per distinct line, in first-seen order.
+tally.each_pair { |key, count| puts "#{key} #{count}" }
+
diff --git a/kmeans b/kmeans
new file mode 100755
index 0000000..89cc329
--- /dev/null
+++ b/kmeans
@@ -0,0 +1,162 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# {s:f} {s:f} => f
+# Dot product over the keys of x; y[k] has to be numeric for each of them
+# (e.g. because y carries a numeric default value).
+def dot(x,y)
+  x.inject(0.0) { |acc, (key, val)| acc + val * y[key] }
+end
+
+# {s:f} => f; Euclidean norm. The 0.0 seed makes inject square the first value too (it was used unsquared) and maps {} to 0.0.
+def mag(x)
+  Math.sqrt(x.values.inject(0.0) { |sum, i| sum + i**2 })
+end
+
+# {s:f} {s:f} => f; dot product divided by the product of both magnitudes
+def cos_sim(x,y)
+  dot(x, y) / [x, y].map { |vec| mag(vec) }.inject(:*)
+end
+
+# {s:f} {s:f} => f
+# Euclidean distance over the union of both key sets; a key missing on one
+# side is read through that hash's default value.
+def euclidian_dist(x,y)
+  dims = [x.keys, y.keys].flatten.uniq
+  Math.sqrt(dims.inject(0.0) { |acc, dim| acc + (x[dim] - y[dim])**2 })
+end
+
+# str => {s:{s:f}}
+# Every line of +fn+ is eval'ed and must yield [name, {feature => value}]. SECURITY: eval runs arbitrary code -- trusted files only.
+def read(fn)
+  h = {}
+  File.foreach(fn) do |line| # foreach closes the file again; it used to stay open
+    name, features = eval(line)
+    features.default = 0.0 # absent features read as 0.0
+    h[name] = features
+  end
+  h
+end
+
+# {s:{s:f}} i => [{s:f}]; samples k feature vectors (the values -- not the document names, as before) as centroids
+def rand_init(docs, k)
+  prng = Random.new
+  docs.values.sample k, random: prng
+end
+
+# {s:{s:f}} i => [{s:f}]
+# Builds each of the k centroids as the mean of k randomly sampled vectors.
+def rand_init2(docs, k)
+  prng = Random.new
+  k.times.map do
+    mean(docs.values.sample(k, random: prng))
+  end
+end
+
+# {s:{s:f}} [{s:f}] => {i:[[s:{s:f}]]}
+# Groups every document under the index of its closest centroid
+# (Euclidean distance; the first centroid wins ties).
+def assign(docs, centroids)
+  clusters = {}
+  docs.each_pair do |name, vec|
+    best_dist = 1.0/0
+    best = nil
+    centroids.each_with_index do |centroid, idx|
+      d = euclidian_dist(centroid, vec)
+      next unless d < best_dist
+
+      best_dist = d
+      best = idx
+    end
+
+    # `best` stays nil when no distance is below Infinity (e.g. no centroids).
+    (clusters[best] ||= []) << [name, vec]
+  end
+  clusters
+end
+
+# [{s:f}] => {s:f}
+# Component-wise arithmetic mean; the result defaults to 0.0 for unseen keys.
+def mean(a)
+  totals = Hash.new(0.0)
+  a.each do |vec|
+    vec.each_pair { |key, val| totals[key] += val }
+  end
+  count = a.size.to_f
+  # Overwrite each sum with its average in place, then hand the hash back.
+  totals.each_key do |key|
+    totals[key] = totals[key] / count
+  end
+  totals
+end
+
+# {i:[{s:f}]} => [{s:f}]
+# One new centroid per cluster: the mean of its members' feature vectors,
+# in the iteration order of +assignment+.
+def update(assignment)
+  assignment.values.map do |members|
+    mean(members.map { |_name, vec| vec })
+  end
+end
+
+# Parse options, run k-means, log cluster-size statistics to STDERR and
+# print one "<cluster index> <document names>" line per cluster to STDOUT.
+def main
+  opts = Trollop::options do
+    opt :k, "k", :type => :int, :required => true
+    opt :input, "input: one feature vector per line", :type => :string, :required => true
+    opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
+    opt :max_no_change, "max. no stalled iteration before stopping ", :type => :int, :short => '-n', :default => 3
+    opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
+  end
+  docs = read opts[:input]
+  k = opts[:k]
+  if opts[:init] == 1
+    centroids = rand_init(docs, k)
+  else
+    centroids = rand_init2(docs, k)
+  end
+  STDERR.write "\n k #{k}\n"
+  STDERR.write " input #{opts[:input]}\n"
+  STDERR.write "iterations #{opts[:max_iterations]}\n"
+  STDERR.write "max no ch. #{opts[:max_no_change]}\n"
+  STDERR.write " init #{opts[:init]}\n\n"
+  assignment = nil
+  prev_stats = []
+  no_change = 0
+  # Was hard-coded to 5, which silently ignored --max_no_change.
+  max_no_change = opts[:max_no_change]
+  STDERR.write "expected cluster sz=#{docs.size/k.to_f}\n\n"
+  opts[:max_iterations].times do |i|
+    s = "iteration #{i}"
+    STDERR.write "#{s}\n#{'-'*s.size}\n"
+    assignment = assign(docs, centroids)
+    sizes = assignment.values.map { |members| members.size }
+    # Index by the number of non-empty clusters: it can be smaller than k.
+    median = sizes.sort[sizes.size/2]
+    max = sizes.max
+    min = sizes.min
+    stats = [median, max, min]
+    no_change += 1 if stats==prev_stats
+    prev_stats = stats
+    STDERR.write sizes.to_s + "\n"
+    STDERR.write " median cluster sz=#{median}\n"
+    STDERR.write " max cluster sz=#{max}\n"
+    STDERR.write " min cluster sz=#{min}\n\n"
+    if no_change == max_no_change
+      STDERR.write "\nmax no change hit!\n\n"
+      break
+    end
+    centroids = update(assignment)
+  end
+  # Emit the final clustering even when the iteration limit ended the loop.
+  (assignment || {}).each_pair { |centroid_index, members|
+    puts "#{centroid_index} #{members.map{|i| i[0]}.to_s}"
+  }
+end
+
+
+main
+
diff --git a/max b/max
new file mode 100755
index 0000000..506bd03
--- /dev/null
+++ b/max
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+# Largest number on STDIN (one per line, parsed with to_f); -Infinity for empty input.
+largest = -1.0/0
+STDIN.each_line do |row|
+  value = row.strip.to_f
+  largest = value if value > largest
+end
+puts largest
+
diff --git a/merge_files b/merge_files
new file mode 100755
index 0000000..db9d5da
--- /dev/null
+++ b/merge_files
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+STDOUT.set_encoding 'utf-8'
+# Synopsis to STDERR, then exit status 1.
+def usage
+  STDERR.write "merge_files [file]+\n"
+  exit 1
+end
+usage if ARGV.empty?
+
+# One table per input file: stripped line => number of occurrences.
+dicts = ARGV.map do |path|
+  counts = Hash.new(0)
+  File.open path, "r:UTF-8" do |f|
+    f.each_line { |line| counts[line.strip] += 1 }
+  end
+  counts
+end
+
+# Print each distinct line as often as it occurs in the file where it is most
+# frequent. Lines come out in first-seen order (file by file). `done` replaces
+# the former deletion of keys from the tables while they were being iterated.
+done = {}
+dicts.each do |h|
+  h.each_key do |k|
+    next if done.key? k
+
+    done[k] = true
+    dicts.map { |d| d[k] }.max.times { puts k }
+  end
+end
+
diff --git a/min b/min
new file mode 100755
index 0000000..c2f85b9
--- /dev/null
+++ b/min
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+# Smallest number on STDIN (one per line, parsed with to_f); Infinity for empty input.
+smallest = 1.0/0
+STDIN.each_line do |row|
+  value = row.strip.to_f
+  smallest = value if value < smallest
+end
+puts smallest
+
diff --git a/min_max b/min_max
new file mode 100755
index 0000000..f27de88
--- /dev/null
+++ b/min_max
@@ -0,0 +1,47 @@
+#!/usr/bin/ruby
+
+require 'trollop'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+# Synopsis, then exit status 1 (the exit was missing). --min/--max have defaults, hence 10, 12 or 14 arguments.
+def usage
+  puts "filter-min-max.rb --min <n> --max <n> --in_f <file> --in_e <file> --out_f <file> --out_e <file> --out_id <file>"
+  exit 1
+end
+usage unless [10, 12, 14].include? ARGV.size
+
+opts = Trollop::options do
+  opt :min, "minimum #tokens", :type => :int, :default => 1
+  opt :max, "maximum #tokens", :type => :int, :default => 80
+  opt :in_f, "input 'French' file", :type => :string
+  opt :in_e, "input 'English' file", :type => :string
+  opt :out_f, "output 'French' file", :type => :string
+  opt :out_e, "output 'English' file", :type => :string
+  opt :out_id, "output line Nos", :type => :string
+end
+files = {}
+files[:f_file] = File.new opts[:in_f], 'r:UTF-8'
+files[:e_file] = File.new opts[:in_e], 'r:UTF-8'
+files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8'
+files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8'
+files[:id_out_file] = File.new opts[:out_id], 'w'
+# Keep a sentence pair only if BOTH sides have between --min and --max tokens.
+wanted = opts[:min]..opts[:max]
+i = 0
+while f_line = files[:f_file].gets
+  e_line = files[:e_file].gets
+  break if e_line.nil? # the 'English' file is shorter: stop instead of failing on nil
+  f_line.strip!
+  e_line.strip!
+  if wanted.cover?(f_line.split.size) && wanted.cover?(e_line.split.size)
+    files[:f_out_file].write "#{f_line}\n"
+    files[:e_out_file].write "#{e_line}\n"
+    files[:id_out_file].write "#{i}\n" # 0-based index of the kept input pair
+  end
+  i+=1
+end
+files.values.each{|f|f.close}
+
diff --git a/moses_1best b/moses_1best
new file mode 100755
index 0000000..5c6bf9d
--- /dev/null
+++ b/moses_1best
@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Print only the first line of each run of consecutive lines that share the
+# same leading index (the integer before the first '|||').
+last_idx = nil
+STDIN.each_line do |row|
+  row.strip!
+  idx = row.split('|||').first.to_i
+  next if idx == last_idx
+  puts row
+  last_idx = idx
+end
+
diff --git a/mult b/mult
new file mode 100755
index 0000000..eaead89
--- /dev/null
+++ b/mult
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+# Multiply the first number read from STDIN by the factor given as the first argument.
+puts STDIN.gets.to_f * ARGV.first.to_f
+
diff --git a/ng b/ng
new file mode 100755
index 0000000..d8b01ae
--- /dev/null
+++ b/ng
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+# Yield the n-grams (token arrays) of +s+: only those of length exactly +n+
+# when +fix+ is true, otherwise every order from 1 up to +n+.
+def ngrams_it(s, n, fix=false)
+  a = s.strip.split
+  a.each_index { |i|
+    0.upto([n-1, a.size-i-1].min) { |m|
+      yield a[i..i+m] if !fix || m+1==n # the former !(fix^...) dropped the full n-grams when fix was false
+    }
+  }
+end
+# Print, for every STDIN line, its n-grams joined by +sep+; lines that produce no n-gram print nothing.
+def main(n, fix, sep)
+  STDIN.set_encoding 'utf-8'
+  STDOUT.set_encoding 'utf-8'
+  STDIN.each_line do |row|
+    grams = []
+    ngrams_it(row, n, fix) { |toks| grams << toks.join(' ') }
+    grams.reject! { |g| g.strip.empty? }
+    puts grams.join(sep) unless grams.empty?
+  end
+end
+# Synopsis to STDERR, then exit status 1.
+def usage
+  STDERR.write "./ng [-n <n>] [--fix] [--separator <string>] < <one sentence per line>\n"
+  exit 1
+end
+# Act as a command-line tool only when executed directly, not when loaded from another file.
+if __FILE__ == $0
+  require 'trollop'
+  opts = Trollop::options do
+    opt :n, "Ngrams", :type => :int, :default => 4
+    opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => true
+    opt :separator, "separate ngrams of a line by this string", :type => :string, :default => "\n"
+  end
+  usage unless [0,2,4,6].include? ARGV.size
+  main(opts[:n], opts[:fix], opts[:separator])
+end
+
diff --git a/nn b/nn
new file mode 100755
index 0000000..4d1dab7
--- /dev/null
+++ b/nn
@@ -0,0 +1,4 @@
+#!/bin/sh
+# Translate every digit of file $2 with tr set $1 into "<name of $2 up to its extension>nn.<extension>"; expansions are quoted so names with blanks work.
+tr '[:digit:]' "$1" < "$2" > "$(basename "$2" "${2##*.}")nn.${2##*.}"
+
diff --git a/no_empty b/no_empty
new file mode 100755
index 0000000..ecdbcdf
--- /dev/null
+++ b/no_empty
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+# ARGV: <in 1> <in 2> <out 1> <out 2>. Copies line pairs, dropping every pair in which either side is blank.
+sources = (0..1).map { |i| File.new(ARGV[i], 'r') }
+sinks = (2..3).map { |i| File.new(ARGV[i], 'w') }
+handles = sources + sinks
+handles.each { |f| f.set_encoding('utf-8') }
+
+while (line_f = sources[0].gets)
+  line_e = sources[1].gets
+  line_f.strip!
+  line_e.strip!
+  next if line_f.empty? || line_e.empty?
+  sinks[0].write line_f+"\n"
+  sinks[1].write line_e+"\n"
+end
+handles.each { |f| f.close }
+
diff --git a/no_non_printables b/no_non_printables
new file mode 100755
index 0000000..fda1e40
--- /dev/null
+++ b/no_non_printables
@@ -0,0 +1 @@
+sed -e 's/\xEF\xBB\xBF//g' -e 's/\xEF\xB7\x93//g' # delete the byte sequences EF BB BF and EF B7 93
diff --git a/norm_german b/norm_german
new file mode 100755
index 0000000..57a37bb
--- /dev/null
+++ b/norm_german
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+
+require 'thread'
+require 'trollop'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Synopsis to STDERR, then exit status 1 (the text used to be a copy of the one in ./avg).
+def usage
+  STDERR.write "./norm_german [--upper] [--threads <n>] [--shard_size <n>] [--train] [--apply] < <one token per line>\n"
+  exit 1
+end
+usage if ARGV.size > 7 # flags take no value, so odd counts are fine; the five options occupy at most 7 arguments
+
+opts = Trollop::options do
+  opt :upper, "uppercase", :type => :bool, :default => false
+  opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
+  opt :shard_size, "shard size", :type => :int, :default => 1000
+  opt :train, "train", :type => :bool
+  opt :apply, "apply", :type => :bool
+end
+
+
+# Spelling variants treated as equivalent.
+pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
+pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
+
+# With --upper only the lowercase pairs are used; otherwise both lists.
+# NOTE(review): that looks inverted relative to the option name -- confirm intent.
+PAIRS = opts[:upper] ? pairs_lower : pairs_lower+pairs_upper
+
+# Returns +old+ if substituting one side of some pair by the other (either direction) turns +new+ into +old+, else nil.
+def get_key(old, new)
+  PAIRS.each do |from, to|
+    return old if new.gsub(from, to)==old || new.gsub(to, from)==old
+  end
+  nil
+end
+
+# Groups +tokens+ into { representative => [variants...] }: a token joins the first
+# existing key that get_key accepts, otherwise it becomes a new key of its own.
+def build_partial(tokens)
+  groups = {}
+  tokens.each do |tok|
+    # Keys are scanned in insertion order, so the earliest matching key wins.
+    key = groups.keys.find { |candidate| get_key candidate, tok }
+    if key
+      groups[key] << tok
+    else
+      groups[tok] = [tok]
+    end
+  end
+  groups
+end
+
+h = {}
+threads = []
+thread_n = 0
+counter = 0
+token_stock = []
+mutex = Mutex.new
+while tok = STDIN.gets # expects stream of (lowercased) tokens
+  token_stock << [] if !token_stock[thread_n]
+  # strip, not strip!: the bang form returns nil when there is nothing to strip (e.g. a last token without newline).
+  token_stock[thread_n] << tok.strip
+  counter += 1
+  # A shard is handed to a worker thread each time it reaches a multiple of --shard_size tokens.
+  next unless token_stock[thread_n].size%opts[:shard_size]==0
+  STDERR.write "Starting thread ##{thread_n}\n"
+  threads << Thread.new(token_stock[thread_n]) { |tokens|
+    th = build_partial tokens
+    mutex.synchronize do
+      h.merge! th
+    end
+  }
+  threads.last.abort_on_exception = true
+  thread_n += 1
+  if thread_n==opts[:threads]
+    threads.each { |i| i.join }
+    token_stock.each { |i| i.clear }
+    thread_n = 0
+  end
+  STDERR.write "#keys #{h.keys.size}\n"
+end
+
+# Whatever is still held in the shards is merged on the main thread.
+token_stock.each { |i|
+  if i.size!=0
+    h.merge! build_partial i
+  end
+}
+
diff --git a/normalize_punctuation b/normalize_punctuation
new file mode 100755
index 0000000..108de44
--- /dev/null
+++ b/normalize_punctuation
@@ -0,0 +1,46 @@
+#!/usr/bin/perl -w
+# adapted from the moses scripts
+
+use strict;
+
+my ($language) = @ARGV;
+# Filter: read STDIN line by line (the <STDIN> read had been lost, leaving an endless `while()`), normalize punctuation, print to STDOUT.
+while(<STDIN>) {
+ s/\r//g;
+ # normalize unicode punctuation
+ s/„/\"/g;
+ s/“/\"/g;
+ s/”/\"/g;
+ s/–/-/g;
+ s/—/ - /g; s/ +/ /g;
+ s/´/\'/g;
+ s/([a-z])‘([a-z])/$1\'$2/gi;
+ s/([a-z])’([a-z])/$1\'$2/gi;
+ s/‘/\"/g;
+ s/‚/\"/g;
+ s/’/\"/g;
+ s/''/\"/g;
+ s/´´/\"/g;
+ s/…/.../g;
+ # French quotes
+ s/ « / \"/g;
+ s/« /\"/g;
+ s/«/\"/g;
+ s/ » /\" /g;
+ s/ »/\"/g;
+ s/»/\"/g;
+ # handle pseudo-spaces
+ s/ \%/\%/g;
+ s/nº /nº /g;
+ s/ :/:/g;
+ s/ ºC/ ºC/g;
+ s/ cm/ cm/g;
+ s/ \?/\?/g;
+ s/ \!/\!/g;
+ s/ ;/;/g;
+ s/, /, /g; s/ +/ /g;
+ # NOTE(review): several pseudo-space rules above read as no-ops; presumably they matched non-breaking spaces -- verify the bytes.
+ print STDERR $_ if //; # NOTE(review): empty pattern; the character tested here seems to have been lost -- confirm
+
+ print $_;
+}
diff --git a/num_tok b/num_tok
new file mode 100755
index 0000000..7cc500c
--- /dev/null
+++ b/num_tok
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby1.9.1
+
+STDIN.set_encoding('utf-8')
+# Print the number of whitespace-separated tokens of every line on STDIN.
+STDIN.each_line do |row|
+  puts row.split.size
+end
+
diff --git a/odd b/odd
new file mode 100755
index 0000000..0bd9336
--- /dev/null
+++ b/odd
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Echo every other line of STDIN, starting with the first: the 1st, 3rd, 5th, ...
+STDIN.each_line.with_index(1) do |row, lineno|
+  next if lineno.even?
+
+  puts row
+end
+
diff --git a/paste_pairs b/paste_pairs
new file mode 100755
index 0000000..6ede8f6
--- /dev/null
+++ b/paste_pairs
@@ -0,0 +1,11 @@
+#!/usr/bin/python
+
+import sys
+from itertools import izip
+
+# Python 2: interleave the two files given as arguments; each stripped line is prefixed with its 0-based pair number, a blank line follows each pair.
+for linenr, pair in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
+    for side in pair:
+        print linenr, (side.strip())
+    print
+
diff --git a/preprocess b/preprocess
new file mode 100755
index 0000000..716255d
--- /dev/null
+++ b/preprocess
@@ -0,0 +1,5 @@
+#!/bin/zsh
+# $1: language code. Stored in `lang` instead of LANG so that the locale variable seen by the child processes is not overwritten.
+lang=$1
+~/scripts/htmlentities 2>htmlentities.$lang.err | ~/scripts/normalize-punctuation 2>normalize-punctuation.$lang.err | ~/moses/scripts/tokenizer/tokenizer.perl -a -b -threads 1 -l $lang 2>tokenizer.$lang.err | ~/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$lang.err
+
diff --git a/preprocess_nolow b/preprocess_nolow
new file mode 100755
index 0000000..fc466b6
--- /dev/null
+++ b/preprocess_nolow
@@ -0,0 +1,5 @@
+#!/bin/zsh
+# $1: language code. Stored in `lang` instead of LANG so that the locale variable seen by the child processes is not overwritten.
+lang=$1
+~/scripts/htmlentities 2>htmlentities.$lang.err | ~/scripts/normalize-punctuation 2>normalize-punctuation.$lang.err | ~/moses/scripts/tokenizer/tokenizer.perl -a -b -threads 1 -l $lang 2>tokenizer.$lang.err
+
diff --git a/round b/round
new file mode 100755
index 0000000..52cd013
--- /dev/null
+++ b/round
@@ -0,0 +1,4 @@
+#!/usr/bin/env ruby
+# Round the first number read from STDIN to the number of digits given as the first argument.
+puts STDIN.gets.to_f.round(ARGV.first.to_i)
+
diff --git a/ruby_eval b/ruby_eval
new file mode 100755
index 0000000..fe0d181
--- /dev/null
+++ b/ruby_eval
@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+# Evaluates every line read from STDIN as Ruby code and prints the result; eval runs arbitrary code, so feed it trusted input only.
+while line = STDIN.gets
+ puts "#{eval line}"
+end
+
diff --git a/rule_shapes b/rule_shapes
new file mode 100755
index 0000000..039b0dc
--- /dev/null
+++ b/rule_shapes
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+# Collapse a rule side into its shape: "NT" for every [X,<digit>] token and a
+# single "T" for every maximal run of other tokens.
+def shape s
+  in_terminals = false
+  s.split.each_with_object([]) do |tok, out|
+    if tok =~ /\A\[X,\d\]\z/
+      in_terminals = false
+      out << "NT"
+    else
+      out << "T" unless in_terminals
+      in_terminals = true
+    end
+  end
+end
+
+# Each input line holds two tab-separated sides; the output is
+# "<shape of the first>-<shape of the second>", shape parts joined by '_'.
+STDIN.each_line do |row|
+  f, e = row.split "\t"
+  f.strip!
+  e.strip!
+  puts shape(f).join('_')+"-"+shape(e).join('_')
+end
+
diff --git a/sample b/sample
new file mode 100755
index 0000000..b4706c6
--- /dev/null
+++ b/sample
@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Synopsis to STDERR, then exit status 1.
+def usage
+  STDERR.write "./sample --size <percentage> < <input>\n"
+  exit 1
+end
+usage if ARGV.size!=2 # one option plus its value; the former 4 rejected every valid call
+
+opts = Trollop::options do
+  opt :size, "Sample n% (percentage).", :type => :int
+end
+
+prng = Random.new(Random.new_seed)
+
+# Keep each line with probability size/100; the former rand(1..size)==0 could never be true, so nothing was printed.
+STDIN.each_line do |line|
+  STDOUT.write line if prng.rand(100) < opts[:size]
+end
+
diff --git a/sample_n b/sample_n
new file mode 100755
index 0000000..2115407
--- /dev/null
+++ b/sample_n
@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+# Synopsis to STDERR, then exit status 1.
+def usage
+  STDERR.write "./sample_n --size <percentage> --population <n>\n"
+  exit 1
+end
+usage if ARGV.size!=4
+
+opts = Trollop::options do
+  opt :size, "Sample size (percentage).", :type => :int
+  opt :population, "'Population' (number \in N)", :type => :int
+end
+
+prng = Random.new(Random.new_seed)
+
+# Print each number of 1..population with probability size/100; the former rand(1..size)==0 could never be true.
+1.upto(opts[:population]) { |i|
+  puts i if prng.rand(100) < opts[:size]
+}
+
diff --git a/shard b/shard
new file mode 100755
index 0000000..7729699
--- /dev/null
+++ b/shard
@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# Split the parallel files +input+, +refs+ and +alignments+ line-wise into
+# +num_shards+ shards named "<output_prefix>.<shard>.<ext>" (".a" for the
+# alignments). Every shard gets lc/num_shards lines; the remainder goes to
+# the last shard, or one line each to the first shards when +rand+ is set
+# (which also shuffles the line order). Returns [input shard names, refs shard names].
+def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
+  # Read everything up front; File.readlines closes each input again.
+  in_lines = File.readlines input
+  refs_lines = File.readlines refs
+  a_lines = File.readlines alignments
+
+  # Count in Ruby instead of `wc -l #{input}`: the file name no longer passes
+  # through a shell, and a last line without trailing newline is counted too.
+  lc = in_lines.size
+  input_ext = input.split('.').last
+  refs_ext = refs.split('.').last
+  index = (0..lc-1).to_a
+  index.reverse! # so that pop yields 0, 1, 2, ... when not shuffled
+  index.shuffle! if rand
+  shard_sz = lc / num_shards
+  leftover = lc % num_shards
+  shard_in_files = []
+  shard_refs_files = []
+  shard_a_files = []
+  in_fns = []
+  refs_fns = []
+  a_fns = []
+
+  # Write line j of all three inputs to the files of shard number +shard+.
+  emit = lambda { |shard, j|
+    shard_in_files[shard].write in_lines[j]
+    shard_refs_files[shard].write refs_lines[j]
+    shard_a_files[shard].write a_lines[j]
+  }
+  0.upto(num_shards-1) { |shard|
+    in_fn = "#{output_prefix}.#{shard}.#{input_ext}"
+    shard_in_files << File.new(in_fn, 'w+')
+    in_fns << in_fn
+    refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}"
+    shard_refs_files << File.new(refs_fn, 'w+')
+    refs_fns << refs_fn
+    a_fn = "#{output_prefix}.#{shard}.a"
+    shard_a_files << File.new(a_fn, 'w+')
+    a_fns << a_fn
+    shard_sz.times { emit.call(shard, index.pop) }
+  }
+  if !rand
+    # In-order sharding: the remainder is appended to the last shard.
+    while leftover > 0
+      emit.call(-1, index.pop)
+      leftover -= 1
+    end
+  else
+    # Random sharding: the first `leftover` shards get one extra line each.
+    0.upto(num_shards-1) { |shard|
+      break if leftover <= 0
+      emit.call(shard, index.pop)
+      leftover -= 1
+    }
+  end
+  # The alignment shards used to be left open; close every output file.
+  (shard_in_files + shard_refs_files + shard_a_files).each do |f| f.close end
+  return [in_fns, refs_fns]
+end
+# --num_shards now defaults to 2: without a default the option parser hands over nil, which overrode make_shards' own default.
+opts = Trollop::options do
+  opt :input, 'input', :type => :string
+  opt :references, 'references', :type => :string
+  opt :alignments, 'alignments', :type => :string
+  opt :output_prefix, 'output prefix', :type => :string
+  opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
+  opt :num_shards, 'number of shards', :type => :int, :default => 2
+end
+
+make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
+
diff --git a/splitpipes b/splitpipes
new file mode 100755
index 0000000..b0c3c9c
--- /dev/null
+++ b/splitpipes
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+# Synopsis to STDERR, then exit status 1.
+def usage
+  STDERR.write "splitpipes -f <field> < <input>\n"
+  exit 1
+end
+usage if ARGV.size!=2
+
+opts = Trollop::options do
+  opt :field, "field", :type => :int
+end
+
+# Print the requested (1-based) ' ||| '-separated field of every line;
+# lines with fewer fields print nothing.
+STDIN.each_line do |line|
+  fields = line.strip.split(' ||| ')
+  wanted = opts[:field]
+  # Guard the index: 0 or negative values must not wrap around to the end.
+  next unless wanted.between?(1, fields.size)
+
+  puts fields[wanted-1].strip
+end
+
diff --git a/stddev b/stddev
new file mode 100755
index 0000000..3bf0270
--- /dev/null
+++ b/stddev
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+# Synopsis to STDERR, then exit status 1.
+def usage
+  STDERR.write "./stddev [-r <n>] < <one number per line>\n"
+  exit 1
+end
+usage if not [0,2].include? ARGV.size
+
+opts = Trollop::options do
+  opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
+end
+
+sum = 0.0
+i = 0
+cached = []
+while line=STDIN.gets
+  v = line.strip.to_f
+  sum += v
+  cached << v
+  i +=1
+end
+
+avg = sum/i.to_f
+var = 0.0
+cached.each { |v|
+  var += (avg - v)**2
+}
+
+# Population standard deviation: the summed squared deviations have to be
+# divided by the number of values before taking the root -- that division was missing.
+stddev = Math.sqrt(var/i)
+
+if opts[:round] >= 0
+  puts stddev.round opts[:round]
+else
+  puts stddev
+end
+
diff --git a/strip_whitespace b/strip_whitespace
new file mode 100755
index 0000000..37c02e5
--- /dev/null
+++ b/strip_whitespace
@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+# Remove leading and trailing whitespace from every line of STDIN.
+STDIN.each_line do |row|
+  puts row.strip
+end
+
diff --git a/sum b/sum
new file mode 100755
index 0000000..3fca95e
--- /dev/null
+++ b/sum
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+# Sum of the numbers on STDIN (one per line, parsed with to_f); 0.0 for empty input.
+total = STDIN.each_line.inject(0.0) do |acc, row|
+  acc + row.strip.to_f
+end
+
+puts total
+
diff --git a/tf-idf b/tf-idf
new file mode 100755
index 0000000..3edaaf8
--- /dev/null
+++ b/tf-idf
@@ -0,0 +1,80 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# returns word='raw frequency' (as a float) for a single document, skipping stopwords
+def tf(d, stopwords=[])
+  v = Hash.new(0)
+  d.each { |i|
+    next if stopwords.include? i
+    v[i] += 1.0
+  }
+  v
+end
+
+# smoothes raw frequencies: a + (1-a) * f/max_f, rewriting +w+ in place and returning it
+def ntf(w, a=0.4)
+  peak = w.values.max.to_f
+  w.each_key do |term|
+    w[term] = a + (1-a)*(w[term]/peak)
+  end
+end
+
+# returns idf value log(N/df) for each word in vocab, where N is the number of
+# documents and df the number of documents containing the word
+def idf(collection)
+  n = collection.size.to_f
+  # Count each word once per document; df used to be the total occurrence count.
+  df = Hash.new(0)
+  collection.each_value { |words| words.uniq.each { |w| df[w] += 1 } }
+  idf = {}
+  df.each_pair { |w, count| idf[w] = Math.log(n/count) }
+  return idf
+end
+# Parse options, compute (optionally normalized and/or idf-weighted) term
+# frequencies per document and print one [file name, {term => weight}] pair per line.
+def main
+  opts = Trollop::options do
+    opt :docs, "input files (documents)", :type => :strings, :required => true
+    opt :filter_stopwords, "filter stopwords (give file)", :type => :string
+    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool
+    opt :ntf, "length-normalize tf values", :type => :bool
+    opt :idf, "weight tf by idf", :type => :bool
+  end
+
+  stopwords = []
+  if opts[:filter_stopwords]
+    stopwords = File.readlines(opts[:filter_stopwords]).map{|i| i.split('|').first.to_s.strip}.reject{|i|i==''} # the given file; 'stop.txt.utf8' was hard-coded
+  end
+
+  docs = {} # fn => [words...]
+  opts[:docs].each { |i|
+    if opts[:one_item_per_line]
+      docs[i] = File.readlines(i).map{|j| j.strip}
+    else
+      docs[i] = File.read(i).split(/\s/).map{|j| j.strip}
+    end
+  }
+
+  idf_values = idf docs
+  docs.each_pair { |name, words|
+    just_tf = tf(words, stopwords) # the stop-word list used to be built but never applied
+    just_tf = ntf(just_tf) if opts[:ntf]
+    tf_idf = {}; tf_idf.default = 0.0
+    if opts[:idf]
+      just_tf.each_pair { |word,f|
+        tf_idf[word] = idf_values[word] * f
+      }
+    else
+      tf_idf = just_tf
+    end
+    docs[name] = tf_idf
+  }
+
+  docs.each { |i| puts i.to_s }
+end
+
+
+main
+
diff --git a/toks b/toks
new file mode 100755
index 0000000..ed40dbb
--- /dev/null
+++ b/toks
@@ -0,0 +1,10 @@
+#!/usr/bin/ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+# One token per output line; splitting on whitespace RUNS means repeated blanks no longer produce empty "tokens".
+STDIN.each_line do |line|
+  line.split.each { |i| puts i }
+end
+
diff --git a/var b/var
new file mode 100755
index 0000000..08b75b6
--- /dev/null
+++ b/var
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+# Synopsis to STDERR, then exit status 1 (the text used to name ./stddev).
+def usage
+  STDERR.write "./var [-r <n>] < <one number per line>\n"
+  exit 1
+end
+usage if not [0,2].include? ARGV.size
+
+opts = Trollop::options do
+  opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
+end
+
+sum = 0.0
+i = 0
+cached = []
+while line=STDIN.gets
+  v = line.strip.to_f
+  sum += v
+  cached << v
+  i +=1
+end
+
+avg = sum/i.to_f
+
+# Population variance: the MEAN of the squared deviations from the average;
+# the division by the number of values was missing.
+var = 0.0
+cached.each { |v| var += (avg - v)**2 }
+var /= i
+
+if opts[:round] >= 0
+  puts var.round opts[:round]
+else
+  puts var
+end
+
diff --git a/vocab b/vocab
new file mode 100755
index 0000000..e7b03fe
--- /dev/null
+++ b/vocab
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Print the sorted set of distinct tokens produced by the sibling `toks` script.
+# `cd` inside a command substitution replaces pushd/popd, which a POSIX /bin/sh need not provide.
+SCRIPTPATH=$(cd "$(dirname "$0")" && pwd -P)
+
+# sort -u keeps one copy of EVERY token; `uniq -u` kept only the tokens occurring exactly once.
+"$SCRIPTPATH"/toks ${1+"$@"} | sort -u
+
diff --git a/wrap-xml.perl b/wrap-xml.perl
new file mode 100755
index 0000000..d29065a
--- /dev/null
+++ b/wrap-xml.perl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl -w
+# original: https://smt.googlecode.com/svn/trunk/moses64/tools/scripts/wrap-xml.perl
+
+use strict;
+
+my $src = $ARGV[0];
+my $language = $ARGV[1];
+die("syntax: wrap-xml.perl xml-frame language [system-name]")
+ unless $src && $language && -e $src;
+my $system = "my-system";
+$system = $ARGV[2] if defined($ARGV[2]);
+# Copy the XML frame $src to STDOUT, replacing the text of each <seg> by the next line read from STDIN (everything between '<' and '>' had been lost from this loop).
+open(SRC,$src) or die("cannot open $src: $!");
+my @OUT = <STDIN>;
+chomp(@OUT);
+#my @OUT = `cat $decoder_output`;
+while(<SRC>) {
+  chomp;
+  if (/^<srcset/) {
+    s/<srcset/<tstset trglang="$language"/i;
+  }
+  elsif (/^<\/srcset/) {
+    s/<\/srcset/<\/tstset/i;
+  }
+  elsif (/^<doc/i) {
+    s/<doc/<doc sysid="$system"/i;
+  }
+  elsif (/<seg/) {
+    my $line = shift(@OUT); # next translation line
+    $line = "" if $line =~ /NO BEST TRANSLATION/;
+    if (/<\/seg>/) {
+      s/(<seg[^>]+> *).+(<\/seg>)/$1$line$2/;
+    }
+    else {
+      s/(<seg[^>]+> *)[^<]+/$1$line/;
+    }
+  }
+  print $_."\n";
+}
+
--
cgit v1.2.3