diff options
-rw-r--r-- | README.md | 14 | ||||
-rwxr-xr-x | add_seg | 2 | ||||
-rwxr-xr-x | avg | 1 | ||||
-rwxr-xr-x | avg_weights | 1 | ||||
-rwxr-xr-x | cdec_hg_to_json (renamed from hg2json.py) | 1 | ||||
-rwxr-xr-x | dot | 9 | ||||
-rwxr-xr-x | first_lower (renamed from firstlower) | 1 | ||||
-rwxr-xr-x | gigaword_collapse_tags (renamed from collapse_tags.rb) | 1 | ||||
-rwxr-xr-x | kbest_bleu_oracles | 2 | ||||
-rwxr-xr-x | key_count (renamed from keycount) | 0 | ||||
-rwxr-xr-x | kmeans | 2 | ||||
-rwxr-xr-x | lin_reg | 2 | ||||
-rwxr-xr-x | log_reg | 2 | ||||
-rwxr-xr-x | max | 1 | ||||
-rwxr-xr-x | median | 1 | ||||
-rwxr-xr-x | mem_usage (renamed from memusg) | 1 | ||||
-rwxr-xr-x | merge_files | 1 | ||||
-rwxr-xr-x | merge_ttable | 2 | ||||
-rwxr-xr-x | min | 1 | ||||
-rwxr-xr-x | min_max | 1 | ||||
-rwxr-xr-x | moses_1best | 1 | ||||
-rwxr-xr-x | mult | 1 | ||||
-rwxr-xr-x | no_empty | 1 | ||||
-rwxr-xr-x | num_tok | 1 | ||||
-rwxr-xr-x | odd | 1 | ||||
-rwxr-xr-x | paste_pairs | 1 | ||||
-rwxr-xr-x | per_sentence_bleu | 2 | ||||
-rwxr-xr-x | per_sentence_bleu_kbest | 2 | ||||
-rwxr-xr-x | per_sentence_ter | 2 | ||||
-rwxr-xr-x | pot | 1 | ||||
-rwxr-xr-x | round | 1 | ||||
-rwxr-xr-x | ruby_eval | 1 | ||||
-rwxr-xr-x | rule_shapes | 1 | ||||
-rwxr-xr-x | shard | 1 | ||||
-rwxr-xr-x | split_pipes (renamed from splitpipes) | 1 | ||||
-rwxr-xr-x | stanford_parser_run (renamed from parse-stanford.sh) | 0 | ||||
-rwxr-xr-x | stddev | 1 | ||||
-rwxr-xr-x | sum | 1 | ||||
-rwxr-xr-x | tc | 1 | ||||
-rw-r--r-- | test/cdec_hg_to_json/cdec.ini (renamed from test/hg2json/cdec.ini) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/grammar.gz (renamed from test/hg2json/grammar.gz) | bin | 1399915 -> 1399915 bytes | |||
-rw-r--r-- | test/cdec_hg_to_json/hg.json.gz (renamed from test/hg2json/hg.json.gz) | bin | 318029 -> 318029 bytes | |||
-rw-r--r-- | test/cdec_hg_to_json/hg.meta (renamed from test/hg2json/hg.meta) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/in (renamed from test/hg2json/in) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/toy.cdec.ini (renamed from test/hg2json/toy.cdec.ini) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/toy.grammar (renamed from test/hg2json/toy.grammar) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/toy.in (renamed from test/hg2json/toy.in) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/toy.weights (renamed from test/hg2json/toy.weights) | 0 | ||||
-rw-r--r-- | test/cdec_hg_to_json/weights (renamed from test/hg2json/weights) | 0 | ||||
-rwxr-xr-x | tf-idf | 2 | ||||
-rwxr-xr-x | to_ascii | 1 | ||||
-rwxr-xr-x | tokenizer-no-escape.perl (renamed from tokenizer.no-escape.perl) | 0 | ||||
-rwxr-xr-x | toks | 1 | ||||
-rwxr-xr-x | train_test_split (renamed from traintestsplit) | 1 | ||||
-rwxr-xr-x | var | 1 |
55 files changed, 16 insertions, 57 deletions
@@ -1,13 +1,11 @@ -scripts -======= +a number of NLP related scripts. Some scripts require my zipf gem, see [1] -A number of NLP related scripts. -Some scripts require my zipf gem, -see https://github.com/pks/zipf +\*.perl taken from the moses [2] toolkit -compound-splitter.perl and tokenizer.no-escape.perl -taken from the moses [1] toolkit. +mem\_usage taken from [3] -[1] https://github.com/moses-smt/mosesdecoder +[1] https://github.com/pks/zipf +[2] https://github.com/moses-smt/mosesdecoder +[3] https://gist.github.com/netj/526585 @@ -24,8 +24,8 @@ while line = STDIN.gets s = "<seg" if cfg[:loo] then s += " exclude=\"#{i}\"" end if index.size > 0 - puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>" if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{index[j]}#{ext}\"" end + puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>" else if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end puts s + " id=\"#{i}\"> #{line.strip} </seg>" @@ -2,7 +2,6 @@ require 'trollop' - cfg = Trollop::options do banner "avg < <one number per line>" opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 diff --git a/avg_weights b/avg_weights index 1f9053f..2e23440 100755 --- a/avg_weights +++ b/avg_weights @@ -4,7 +4,6 @@ require 'zipf' require 'trollop' require 'zlib' - cfg = Trollop::options do opt :weights_files, "a number of weights files: name value", :required => true opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false diff --git a/hg2json.py b/cdec_hg_to_json index 5bd5c2c..5a26cf7 100755 --- a/hg2json.py +++ b/cdec_hg_to_json @@ -75,7 +75,6 @@ def main(): print hg2json(hg, decoder.weights) - if __name__=="__main__": main() @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +require 'zipf' + +a = SparseVector.from_file 'w', ' ' +b = SparseVector.from_file 'f', ' ' +puts a.to_s +puts a.dot b + @@ -2,7 +2,6 @@ require 'zipf' - while line = STDIN.gets line.strip! if line && line!='' && line[0].downcase? diff --git a/collapse_tags.rb b/gigaword_collapse_tags index 75fcaf5..cbaf7d7 100755 --- a/collapse_tags.rb +++ b/gigaword_collapse_tags @@ -5,7 +5,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - in_p = false in_dateline = false collect = [] diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles index 2ac344b..7db1c7e 100755 --- a/kbest_bleu_oracles +++ b/kbest_bleu_oracles @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def get_context kbest_lists, references, n a = [] kbest_lists.each_index { |i| @@ -48,6 +47,5 @@ def main } end - main @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def read_data fn data = {} ReadFile.new(fn).readlines_strip.map{ |i| @@ -114,6 +113,5 @@ def main end end - main @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def read_data fn, scale f = ReadFile.new fn data = [] @@ -67,6 +66,5 @@ def main puts model.to_s end - main @@ -4,7 +4,6 @@ require 'zipf' require 'matrix' require 'trollop' - def read_data fn f = ReadFile.new fn data = [] @@ -68,6 +67,5 @@ def main puts model.to_s end - main @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - max = -1.0/0 while line = STDIN.gets v = line.to_f @@ -2,7 +2,6 @@ require 'zipf' - a = [] while line = STDIN.gets a << line.to_f @@ -1,6 +1,5 @@ #!/bin/bash - "$@" & pid=$! peak=0 while true; do diff --git a/merge_files b/merge_files index 0b4941e..714b57d 100755 --- a/merge_files +++ b/merge_files @@ -2,7 +2,6 @@ require 'zipf' - def usage STDERR.write "merge_files <file>+\n" exit 1 diff --git a/merge_ttable b/merge_ttable index 20d86d3..e4621f5 100755 --- a/merge_ttable +++ b/merge_ttable @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :f, "f files", :type => :string, :required => true @@ -31,6 +30,5 @@ def main } end - main @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - min = 1.0/0 while line = STDIN.gets v = line.to_f @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - cfg = Trollop::options do opt :min, "minimum #tokens", :type => :int, :default => 1 opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n' diff --git a/moses_1best b/moses_1best index 849ebf1..fd35cf8 100755 --- a/moses_1best +++ b/moses_1best @@ -2,7 +2,6 @@ require 'zipf' - prev_idx = nil while line = STDIN.gets line.strip! @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - factor = ARGV[0].to_f while line = STDIN.gets puts line.to_f * factor @@ -2,7 +2,6 @@ require 'zipf' - files = [] (0..1).each { |i| files << ReadFile.new(ARGV[i]) } (2..3).each { |i| files << WriteFile.new(ARGV[i]) } @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - while line = STDIN.gets puts line.strip.split.length end @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - i = 1 while line = STDIN.gets puts line if i%2!=0 diff --git a/paste_pairs b/paste_pairs index 07c1f22..f6b8b31 100755 --- a/paste_pairs +++ b/paste_pairs @@ -3,7 +3,6 @@ import sys from itertools import izip - for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))): print linenr, (src_line.strip()) print linenr, (tgt_line.strip()) diff --git a/per_sentence_bleu b/per_sentence_bleu index 76fcf38..5bacd1a 100755 --- a/per_sentence_bleu +++ b/per_sentence_bleu @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' @@ -26,6 +25,5 @@ def main input.close end - main diff --git a/per_sentence_bleu_kbest b/per_sentence_bleu_kbest index 4d821b3..e6a31cb 100755 --- a/per_sentence_bleu_kbest +++ b/per_sentence_bleu_kbest @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :kbests, "kbests", :type => :string, :default => '-' @@ -29,6 +28,5 @@ def main } end - main diff --git a/per_sentence_ter b/per_sentence_ter index 8b04be5..343708e 100755 --- a/per_sentence_ter +++ b/per_sentence_ter @@ -4,7 +4,6 @@ require 'zipf' require 'trollop' require 'tempfile' - def main cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' @@ -30,6 +29,5 @@ def main input.close end - main @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - pow = ARGV[0].to_f while line = STDIN.gets puts line.to_f**pow @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - r = ARGV[0].to_i while line = STDIN.gets puts line.to_f.round r @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - while line = STDIN.gets puts "#{eval line}" end diff --git a/rule_shapes b/rule_shapes index fd42249..589a670 100755 --- a/rule_shapes +++ b/rule_shapes @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - def shape s res = [] in_t = false @@ -2,7 +2,6 @@ require 'trollop' - def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false) lc = `wc -l #{input}`.split.first.to_i input_ext = input.split('.').last @@ -5,7 +5,6 @@ require 'trollop' STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - cfg = Trollop::options do banner "splitpipes -f <n> < <input>" opt :field, "field", :type => :int diff --git a/parse-stanford.sh b/stanford_parser_run index f8d4210..f8d4210 100755 --- a/parse-stanford.sh +++ b/stanford_parser_run @@ -2,7 +2,6 @@ require 'trollop' - cfg = Trollop::options do banner "stddev [-r <d>] < <one number per line>" opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - sum = 0.0 while line = STDIN.gets sum += line.to_f @@ -2,7 +2,6 @@ require 'zipf' - while line = STDIN.gets puts tokenize(line.strip).size end diff --git a/test/hg2json/cdec.ini b/test/cdec_hg_to_json/cdec.ini index 1ad25b5..1ad25b5 100644 --- a/test/hg2json/cdec.ini +++ b/test/cdec_hg_to_json/cdec.ini diff --git a/test/hg2json/grammar.gz b/test/cdec_hg_to_json/grammar.gz Binary files differindex 78dda98..78dda98 100644 --- a/test/hg2json/grammar.gz +++ b/test/cdec_hg_to_json/grammar.gz diff --git a/test/hg2json/hg.json.gz b/test/cdec_hg_to_json/hg.json.gz Binary files differindex ed178c6..ed178c6 100644 --- a/test/hg2json/hg.json.gz +++ b/test/cdec_hg_to_json/hg.json.gz diff --git a/test/hg2json/hg.meta b/test/cdec_hg_to_json/hg.meta index d33a54c..d33a54c 100644 --- a/test/hg2json/hg.meta +++ b/test/cdec_hg_to_json/hg.meta diff --git a/test/hg2json/in b/test/cdec_hg_to_json/in index 7dc411d..7dc411d 100644 --- a/test/hg2json/in +++ b/test/cdec_hg_to_json/in diff --git a/test/hg2json/toy.cdec.ini b/test/cdec_hg_to_json/toy.cdec.ini index d4a2896..d4a2896 100644 --- a/test/hg2json/toy.cdec.ini +++ b/test/cdec_hg_to_json/toy.cdec.ini diff --git a/test/hg2json/toy.grammar b/test/cdec_hg_to_json/toy.grammar index 382c94f..382c94f 100644 --- a/test/hg2json/toy.grammar +++ b/test/cdec_hg_to_json/toy.grammar diff --git a/test/hg2json/toy.in b/test/cdec_hg_to_json/toy.in index e6df927..e6df927 100644 --- a/test/hg2json/toy.in +++ b/test/cdec_hg_to_json/toy.in diff --git a/test/hg2json/toy.weights b/test/cdec_hg_to_json/toy.weights index 70075b7..70075b7 100644 --- a/test/hg2json/toy.weights +++ b/test/cdec_hg_to_json/toy.weights diff --git a/test/hg2json/weights b/test/cdec_hg_to_json/weights index 7f96f1d..7f96f1d 100644 --- a/test/hg2json/weights +++ b/test/cdec_hg_to_json/weights @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :documents, "input files (documents)", :type => :string, :required => true @@ -48,6 +47,5 @@ def main docs.each { |i| puts i.to_s } end - main @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - while line = STDIN.gets encoding_options = { :invalid => :replace, diff --git a/tokenizer.no-escape.perl b/tokenizer-no-escape.perl index 4397360..4397360 100755 --- a/tokenizer.no-escape.perl +++ b/tokenizer-no-escape.perl @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - while line = STDIN.gets line.strip.split(/\s/).each { |i| puts i } end diff --git a/traintestsplit b/train_test_split index ec88df1..db56de9 100755 --- a/traintestsplit +++ b/train_test_split @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - cfg = Trollop::options do opt :foreign, "foreign file", :type => :string, :required => true opt :english, "english file", :type => :string, :required => true @@ -2,7 +2,6 @@ require 'trollop' - cfg = Trollop::options do banner "stddev [-r <d>] < <one number per line>" opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 |