summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md14
-rwxr-xr-xadd_seg2
-rwxr-xr-xavg1
-rwxr-xr-xavg_weights1
-rwxr-xr-xcdec_hg_to_json (renamed from hg2json.py)1
-rwxr-xr-xdot9
-rwxr-xr-xfirst_lower (renamed from firstlower)1
-rwxr-xr-xgigaword_collapse_tags (renamed from collapse_tags.rb)1
-rwxr-xr-xkbest_bleu_oracles2
-rwxr-xr-xkey_count (renamed from keycount)0
-rwxr-xr-xkmeans2
-rwxr-xr-xlin_reg2
-rwxr-xr-xlog_reg2
-rwxr-xr-xmax1
-rwxr-xr-xmedian1
-rwxr-xr-xmem_usage (renamed from memusg)1
-rwxr-xr-xmerge_files1
-rwxr-xr-xmerge_ttable2
-rwxr-xr-xmin1
-rwxr-xr-xmin_max1
-rwxr-xr-xmoses_1best1
-rwxr-xr-xmult1
-rwxr-xr-xno_empty1
-rwxr-xr-xnum_tok1
-rwxr-xr-xodd1
-rwxr-xr-xpaste_pairs1
-rwxr-xr-xper_sentence_bleu2
-rwxr-xr-xper_sentence_bleu_kbest2
-rwxr-xr-xper_sentence_ter2
-rwxr-xr-xpot1
-rwxr-xr-xround1
-rwxr-xr-xruby_eval1
-rwxr-xr-xrule_shapes1
-rwxr-xr-xshard1
-rwxr-xr-xsplit_pipes (renamed from splitpipes)1
-rwxr-xr-xstanford_parser_run (renamed from parse-stanford.sh)0
-rwxr-xr-xstddev1
-rwxr-xr-xsum1
-rwxr-xr-xtc1
-rw-r--r--test/cdec_hg_to_json/cdec.ini (renamed from test/hg2json/cdec.ini)0
-rw-r--r--test/cdec_hg_to_json/grammar.gz (renamed from test/hg2json/grammar.gz)bin1399915 -> 1399915 bytes
-rw-r--r--test/cdec_hg_to_json/hg.json.gz (renamed from test/hg2json/hg.json.gz)bin318029 -> 318029 bytes
-rw-r--r--test/cdec_hg_to_json/hg.meta (renamed from test/hg2json/hg.meta)0
-rw-r--r--test/cdec_hg_to_json/in (renamed from test/hg2json/in)0
-rw-r--r--test/cdec_hg_to_json/toy.cdec.ini (renamed from test/hg2json/toy.cdec.ini)0
-rw-r--r--test/cdec_hg_to_json/toy.grammar (renamed from test/hg2json/toy.grammar)0
-rw-r--r--test/cdec_hg_to_json/toy.in (renamed from test/hg2json/toy.in)0
-rw-r--r--test/cdec_hg_to_json/toy.weights (renamed from test/hg2json/toy.weights)0
-rw-r--r--test/cdec_hg_to_json/weights (renamed from test/hg2json/weights)0
-rwxr-xr-xtf-idf2
-rwxr-xr-xto_ascii1
-rwxr-xr-xtokenizer-no-escape.perl (renamed from tokenizer.no-escape.perl)0
-rwxr-xr-xtoks1
-rwxr-xr-xtrain_test_split (renamed from traintestsplit)1
-rwxr-xr-xvar1
55 files changed, 16 insertions, 57 deletions
diff --git a/README.md b/README.md
index 3a6b1b7..fd42922 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,11 @@
-scripts
-=======
+a number of NLP related scripts. Some scripts require my zipf gem, see [1]
-A number of NLP related scripts.
-Some scripts require my zipf gem,
-see https://github.com/pks/zipf
+\*.perl taken from the moses [2] toolkit
-compound-splitter.perl and tokenizer.no-escape.perl
-taken from the moses [1] toolkit.
+mem\_usage taken from [3]
-[1] https://github.com/moses-smt/mosesdecoder
+[1] https://github.com/pks/zipf
+[2] https://github.com/moses-smt/mosesdecoder
+[3] https://gist.github.com/netj/526585
diff --git a/add_seg b/add_seg
index e4fe22d..7a4ca7a 100755
--- a/add_seg
+++ b/add_seg
@@ -24,8 +24,8 @@ while line = STDIN.gets
s = "<seg"
if cfg[:loo] then s += " exclude=\"#{i}\"" end
if index.size > 0
- puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>"
if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{index[j]}#{ext}\"" end
+ puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>"
else
if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end
puts s + " id=\"#{i}\"> #{line.strip} </seg>"
diff --git a/avg b/avg
index ed31465..07e3de9 100755
--- a/avg
+++ b/avg
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "avg < <one number per line>"
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/avg_weights b/avg_weights
index 1f9053f..2e23440 100755
--- a/avg_weights
+++ b/avg_weights
@@ -4,7 +4,6 @@ require 'zipf'
require 'trollop'
require 'zlib'
-
cfg = Trollop::options do
opt :weights_files, "a number of weights files: name value", :required => true
opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
diff --git a/hg2json.py b/cdec_hg_to_json
index 5bd5c2c..5a26cf7 100755
--- a/hg2json.py
+++ b/cdec_hg_to_json
@@ -75,7 +75,6 @@ def main():
print hg2json(hg, decoder.weights)
-
if __name__=="__main__":
main()
diff --git a/dot b/dot
new file mode 100755
index 0000000..da0dc58
--- /dev/null
+++ b/dot
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+a = SparseVector.from_file 'w', ' '
+b = SparseVector.from_file 'f', ' '
+puts a.to_s
+puts a.dot b
+
diff --git a/firstlower b/first_lower
index 682a9b7..1cddb8e 100755
--- a/firstlower
+++ b/first_lower
@@ -2,7 +2,6 @@
require 'zipf'
-
while line = STDIN.gets
line.strip!
if line && line!='' && line[0].downcase?
diff --git a/collapse_tags.rb b/gigaword_collapse_tags
index 75fcaf5..cbaf7d7 100755
--- a/collapse_tags.rb
+++ b/gigaword_collapse_tags
@@ -5,7 +5,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
in_p = false
in_dateline = false
collect = []
diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles
index 2ac344b..7db1c7e 100755
--- a/kbest_bleu_oracles
+++ b/kbest_bleu_oracles
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def get_context kbest_lists, references, n
a = []
kbest_lists.each_index { |i|
@@ -48,6 +47,5 @@ def main
}
end
-
main
diff --git a/keycount b/key_count
index deaa522..deaa522 100755
--- a/keycount
+++ b/key_count
diff --git a/kmeans b/kmeans
index ec28897..201864b 100755
--- a/kmeans
+++ b/kmeans
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
@@ -114,6 +113,5 @@ def main
end
end
-
main
diff --git a/lin_reg b/lin_reg
index 168e7df..4a7c3b2 100755
--- a/lin_reg
+++ b/lin_reg
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def read_data fn, scale
f = ReadFile.new fn
data = []
@@ -67,6 +66,5 @@ def main
puts model.to_s
end
-
main
diff --git a/log_reg b/log_reg
index e6f47eb..3916d0c 100755
--- a/log_reg
+++ b/log_reg
@@ -4,7 +4,6 @@ require 'zipf'
require 'matrix'
require 'trollop'
-
def read_data fn
f = ReadFile.new fn
data = []
@@ -68,6 +67,5 @@ def main
puts model.to_s
end
-
main
diff --git a/max b/max
index 87f3c73..b2c1cae 100755
--- a/max
+++ b/max
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
max = -1.0/0
while line = STDIN.gets
v = line.to_f
diff --git a/median b/median
index 9499c95..0b1950b 100755
--- a/median
+++ b/median
@@ -2,7 +2,6 @@
require 'zipf'
-
a = []
while line = STDIN.gets
a << line.to_f
diff --git a/memusg b/mem_usage
index a69daaa..5c2104f 100755
--- a/memusg
+++ b/mem_usage
@@ -1,6 +1,5 @@
#!/bin/bash
-
"$@" &
pid=$! peak=0
while true; do
diff --git a/merge_files b/merge_files
index 0b4941e..714b57d 100755
--- a/merge_files
+++ b/merge_files
@@ -2,7 +2,6 @@
require 'zipf'
-
def usage
STDERR.write "merge_files <file>+\n"
exit 1
diff --git a/merge_ttable b/merge_ttable
index 20d86d3..e4621f5 100755
--- a/merge_ttable
+++ b/merge_ttable
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :f, "f files", :type => :string, :required => true
@@ -31,6 +30,5 @@ def main
}
end
-
main
diff --git a/min b/min
index 398b0fb..f8a7e42 100755
--- a/min
+++ b/min
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
min = 1.0/0
while line = STDIN.gets
v = line.to_f
diff --git a/min_max b/min_max
index 17dc566..b79a743 100755
--- a/min_max
+++ b/min_max
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
cfg = Trollop::options do
opt :min, "minimum #tokens", :type => :int, :default => 1
opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
diff --git a/moses_1best b/moses_1best
index 849ebf1..fd35cf8 100755
--- a/moses_1best
+++ b/moses_1best
@@ -2,7 +2,6 @@
require 'zipf'
-
prev_idx = nil
while line = STDIN.gets
line.strip!
diff --git a/mult b/mult
index 2ef0149..478ec5e 100755
--- a/mult
+++ b/mult
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
factor = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f * factor
diff --git a/no_empty b/no_empty
index 96c9ce4..da57e23 100755
--- a/no_empty
+++ b/no_empty
@@ -2,7 +2,6 @@
require 'zipf'
-
files = []
(0..1).each { |i| files << ReadFile.new(ARGV[i]) }
(2..3).each { |i| files << WriteFile.new(ARGV[i]) }
diff --git a/num_tok b/num_tok
index 53b99a0..56cbae9 100755
--- a/num_tok
+++ b/num_tok
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
while line = STDIN.gets
puts line.strip.split.length
end
diff --git a/odd b/odd
index 93aaa80..0bd9336 100755
--- a/odd
+++ b/odd
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
i = 1
while line = STDIN.gets
puts line if i%2!=0
diff --git a/paste_pairs b/paste_pairs
index 07c1f22..f6b8b31 100755
--- a/paste_pairs
+++ b/paste_pairs
@@ -3,7 +3,6 @@
import sys
from itertools import izip
-
for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
print linenr, (src_line.strip())
print linenr, (tgt_line.strip())
diff --git a/per_sentence_bleu b/per_sentence_bleu
index 76fcf38..5bacd1a 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -26,6 +25,5 @@ def main
input.close
end
-
main
diff --git a/per_sentence_bleu_kbest b/per_sentence_bleu_kbest
index 4d821b3..e6a31cb 100755
--- a/per_sentence_bleu_kbest
+++ b/per_sentence_bleu_kbest
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :kbests, "kbests", :type => :string, :default => '-'
@@ -29,6 +28,5 @@ def main
}
end
-
main
diff --git a/per_sentence_ter b/per_sentence_ter
index 8b04be5..343708e 100755
--- a/per_sentence_ter
+++ b/per_sentence_ter
@@ -4,7 +4,6 @@ require 'zipf'
require 'trollop'
require 'tempfile'
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -30,6 +29,5 @@ def main
input.close
end
-
main
diff --git a/pot b/pot
index ec199ea..24acabe 100755
--- a/pot
+++ b/pot
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
pow = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f**pow
diff --git a/round b/round
index 3dfbb6f..dfef800 100755
--- a/round
+++ b/round
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
r = ARGV[0].to_i
while line = STDIN.gets
puts line.to_f.round r
diff --git a/ruby_eval b/ruby_eval
index 96b2ecb..fe0d181 100755
--- a/ruby_eval
+++ b/ruby_eval
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
while line = STDIN.gets
puts "#{eval line}"
end
diff --git a/rule_shapes b/rule_shapes
index fd42249..589a670 100755
--- a/rule_shapes
+++ b/rule_shapes
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
def shape s
res = []
in_t = false
diff --git a/shard b/shard
index f952104..6155123 100755
--- a/shard
+++ b/shard
@@ -2,7 +2,6 @@
require 'trollop'
-
def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
lc = `wc -l #{input}`.split.first.to_i
input_ext = input.split('.').last
diff --git a/splitpipes b/split_pipes
index 35ee176..eeba69b 100755
--- a/splitpipes
+++ b/split_pipes
@@ -5,7 +5,6 @@ require 'trollop'
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
cfg = Trollop::options do
banner "splitpipes -f <n> < <input>"
opt :field, "field", :type => :int
diff --git a/parse-stanford.sh b/stanford_parser_run
index f8d4210..f8d4210 100755
--- a/parse-stanford.sh
+++ b/stanford_parser_run
diff --git a/stddev b/stddev
index 5cda0e0..a7397b2 100755
--- a/stddev
+++ b/stddev
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "stddev [-r <d>] < <one number per line>"
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/sum b/sum
index dac72d3..acfa563 100755
--- a/sum
+++ b/sum
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
sum = 0.0
while line = STDIN.gets
sum += line.to_f
diff --git a/tc b/tc
index 993086a..7eefdd5 100755
--- a/tc
+++ b/tc
@@ -2,7 +2,6 @@
require 'zipf'
-
while line = STDIN.gets
puts tokenize(line.strip).size
end
diff --git a/test/hg2json/cdec.ini b/test/cdec_hg_to_json/cdec.ini
index 1ad25b5..1ad25b5 100644
--- a/test/hg2json/cdec.ini
+++ b/test/cdec_hg_to_json/cdec.ini
diff --git a/test/hg2json/grammar.gz b/test/cdec_hg_to_json/grammar.gz
index 78dda98..78dda98 100644
--- a/test/hg2json/grammar.gz
+++ b/test/cdec_hg_to_json/grammar.gz
Binary files differ
diff --git a/test/hg2json/hg.json.gz b/test/cdec_hg_to_json/hg.json.gz
index ed178c6..ed178c6 100644
--- a/test/hg2json/hg.json.gz
+++ b/test/cdec_hg_to_json/hg.json.gz
Binary files differ
diff --git a/test/hg2json/hg.meta b/test/cdec_hg_to_json/hg.meta
index d33a54c..d33a54c 100644
--- a/test/hg2json/hg.meta
+++ b/test/cdec_hg_to_json/hg.meta
diff --git a/test/hg2json/in b/test/cdec_hg_to_json/in
index 7dc411d..7dc411d 100644
--- a/test/hg2json/in
+++ b/test/cdec_hg_to_json/in
diff --git a/test/hg2json/toy.cdec.ini b/test/cdec_hg_to_json/toy.cdec.ini
index d4a2896..d4a2896 100644
--- a/test/hg2json/toy.cdec.ini
+++ b/test/cdec_hg_to_json/toy.cdec.ini
diff --git a/test/hg2json/toy.grammar b/test/cdec_hg_to_json/toy.grammar
index 382c94f..382c94f 100644
--- a/test/hg2json/toy.grammar
+++ b/test/cdec_hg_to_json/toy.grammar
diff --git a/test/hg2json/toy.in b/test/cdec_hg_to_json/toy.in
index e6df927..e6df927 100644
--- a/test/hg2json/toy.in
+++ b/test/cdec_hg_to_json/toy.in
diff --git a/test/hg2json/toy.weights b/test/cdec_hg_to_json/toy.weights
index 70075b7..70075b7 100644
--- a/test/hg2json/toy.weights
+++ b/test/cdec_hg_to_json/toy.weights
diff --git a/test/hg2json/weights b/test/cdec_hg_to_json/weights
index 7f96f1d..7f96f1d 100644
--- a/test/hg2json/weights
+++ b/test/cdec_hg_to_json/weights
diff --git a/tf-idf b/tf-idf
index fc6c2ec..450de6b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :documents, "input files (documents)", :type => :string, :required => true
@@ -48,6 +47,5 @@ def main
docs.each { |i| puts i.to_s }
end
-
main
diff --git a/to_ascii b/to_ascii
index 6c1d23e..10fd1c2 100755
--- a/to_ascii
+++ b/to_ascii
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
while line = STDIN.gets
encoding_options = {
:invalid => :replace,
diff --git a/tokenizer.no-escape.perl b/tokenizer-no-escape.perl
index 4397360..4397360 100755
--- a/tokenizer.no-escape.perl
+++ b/tokenizer-no-escape.perl
diff --git a/toks b/toks
index ed40dbb..8bee29f 100755
--- a/toks
+++ b/toks
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
while line = STDIN.gets
line.strip.split(/\s/).each { |i| puts i }
end
diff --git a/traintestsplit b/train_test_split
index ec88df1..db56de9 100755
--- a/traintestsplit
+++ b/train_test_split
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
cfg = Trollop::options do
opt :foreign, "foreign file", :type => :string, :required => true
opt :english, "english file", :type => :string, :required => true
diff --git a/var b/var
index fe4aa22..faccefa 100755
--- a/var
+++ b/var
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "stddev [-r <d>] < <one number per line>"
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1