alles neu macht der mai

author: Patrick Simianer <p@simianer.de> 2014-10-09 20:47:23 +0100
committer: Patrick Simianer <p@simianer.de> 2014-10-09 20:47:23 +0100
commit: e0b634754d1bef33dc8e72509c6990cccc32745a (patch)
tree: 95d77abef518a333830881dbbd661f14f94868c3
parent: 254c27ed4af938f0b9c4a21cb99b75f8cc1cd1b2 (diff)
55 files changed, 16 insertions, 57 deletions
diff --git a/README.md b/README.md
index 3a6b1b7..fd42922 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,11 @@
-scripts
-=======
+a number of NLP related scripts. Some scripts require my zipf gem, see [1]
 
-A number of NLP related scripts.
-Some scripts require my zipf gem,
-see https://github.com/pks/zipf
+\*.perl taken from the moses [2] toolkit
 
-compound-splitter.perl and tokenizer.no-escape.perl
-taken from the moses [1] toolkit.
+mem\_usage taken from [3]
 
 
-[1] https://github.com/moses-smt/mosesdecoder
+[1] https://github.com/pks/zipf
+[2] https://github.com/moses-smt/mosesdecoder
+[3] https://gist.github.com/netj/526585
 
diff --git a/add_seg b/add_seg
index e4fe22d..7a4ca7a 100755
--- a/add_seg
+++ b/add_seg
@@ -24,8 +24,8 @@ while line = STDIN.gets
   s = "<seg"
   if cfg[:loo] then s += " exclude=\"#{i}\"" end
   if index.size > 0
-    puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>"
     if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{index[j]}#{ext}\"" end
+    puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>"
   else
     if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end
     puts s + " id=\"#{i}\"> #{line.strip} </seg>"
diff --git a/avg b/avg
index ed31465..07e3de9 100755
--- a/avg
+++ b/avg
@@ -2,7 +2,6 @@
 
 require 'trollop'
 
-
 cfg = Trollop::options do
   banner "avg < <one number per line>"
   opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/avg_weights b/avg_weights
index 1f9053f..2e23440 100755
--- a/avg_weights
+++ b/avg_weights
@@ -4,7 +4,6 @@ require 'zipf'
 require 'trollop'
 require 'zlib'
 
-
 cfg = Trollop::options do
   opt :weights_files, "a number of weights files: name value", :required => true
   opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
diff --git a/hg2json.py b/cdec_hg_to_json
index 5bd5c2c..5a26cf7 100755
--- a/hg2json.py
+++ b/cdec_hg_to_json
@@ -75,7 +75,6 @@ def main():
 
   print hg2json(hg, decoder.weights)
 
-
 if __name__=="__main__":
   main()
 
diff --git a/dot b/dot
new file mode 100755
index 0000000..da0dc58
--- /dev/null
+++ b/dot
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+a = SparseVector.from_file 'w', ' '
+b = SparseVector.from_file 'f', ' '
+puts a.to_s
+puts a.dot b
+
diff --git a/firstlower b/first_lower
index 682a9b7..1cddb8e 100755
--- a/firstlower
+++ b/first_lower
@@ -2,7 +2,6 @@
 
 require 'zipf'
 
-
 while line = STDIN.gets
   line.strip!
   if line && line!='' && line[0].downcase?
diff --git a/collapse_tags.rb b/gigaword_collapse_tags
index 75fcaf5..cbaf7d7 100755
--- a/collapse_tags.rb
+++ b/gigaword_collapse_tags
@@ -5,7 +5,6 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-
 in_p = false
 in_dateline = false
 collect = []
diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles
index 2ac344b..7db1c7e 100755
--- a/kbest_bleu_oracles
+++ b/kbest_bleu_oracles
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def get_context kbest_lists, references, n
   a = []
   kbest_lists.each_index { |i|
@@ -48,6 +47,5 @@ def main
   }
 end
 
-
 main
 
diff --git a/keycount b/key_count
index deaa522..deaa522 100755
--- a/keycount
+++ b/key_count
diff --git a/kmeans b/kmeans
index ec28897..201864b 100755
--- a/kmeans
+++ b/kmeans
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def read_data fn
   data = {}
   ReadFile.new(fn).readlines_strip.map{ |i|
@@ -114,6 +113,5 @@ def main
   end
 end
 
-
 main
 
diff --git a/lin_reg b/lin_reg
index 168e7df..4a7c3b2 100755
--- a/lin_reg
+++ b/lin_reg
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def read_data fn, scale
   f = ReadFile.new fn
   data = []
@@ -67,6 +66,5 @@ def main
   puts model.to_s
 end
 
-
 main
 
diff --git a/log_reg b/log_reg
index e6f47eb..3916d0c 100755
--- a/log_reg
+++ b/log_reg
@@ -4,7 +4,6 @@ require 'zipf'
 require 'matrix'
 require 'trollop'
 
-
 def read_data fn
   f = ReadFile.new fn
   data = []
@@ -68,6 +67,5 @@ def main
   puts model.to_s
 end
 
-
 main
 
diff --git a/max b/max
index 87f3c73..b2c1cae 100755
--- a/max
+++ b/max
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 max = -1.0/0
 while line = STDIN.gets
   v = line.to_f
diff --git a/median b/median
index 9499c95..0b1950b 100755
--- a/median
+++ b/median
@@ -2,7 +2,6 @@
 
 require 'zipf'
 
-
 a = []
 while line = STDIN.gets
   a << line.to_f
diff --git a/memusg b/mem_usage
index a69daaa..5c2104f 100755
--- a/memusg
+++ b/mem_usage
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-
 "$@" &
 pid=$! peak=0
 while true; do
diff --git a/merge_files b/merge_files
index 0b4941e..714b57d 100755
--- a/merge_files
+++ b/merge_files
@@ -2,7 +2,6 @@
 
 require 'zipf'
 
-
 def usage
   STDERR.write "merge_files <file>+\n"
   exit 1
diff --git a/merge_ttable b/merge_ttable
index 20d86d3..e4621f5 100755
--- a/merge_ttable
+++ b/merge_ttable
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def main
   cfg = Trollop::options do
     opt :f, "f files", :type => :string, :required => true
@@ -31,6 +30,5 @@ def main
   } 
 end
 
-
 main
 
diff --git a/min b/min
index 398b0fb..f8a7e42 100755
--- a/min
+++ b/min
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 min = 1.0/0
 while line = STDIN.gets
   v = line.to_f
diff --git a/min_max b/min_max
index 17dc566..b79a743 100755
--- a/min_max
+++ b/min_max
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 cfg = Trollop::options do
   opt :min, "minimum #tokens", :type => :int, :default => 1
   opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
diff --git a/moses_1best b/moses_1best
index 849ebf1..fd35cf8 100755
--- a/moses_1best
+++ b/moses_1best
@@ -2,7 +2,6 @@
 
 require 'zipf'
 
-
 prev_idx = nil
 while line = STDIN.gets
   line.strip!
diff --git a/mult b/mult
index 2ef0149..478ec5e 100755
--- a/mult
+++ b/mult
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 factor = ARGV[0].to_f
 while line = STDIN.gets
   puts line.to_f * factor
diff --git a/no_empty b/no_empty
index 96c9ce4..da57e23 100755
--- a/no_empty
+++ b/no_empty
@@ -2,7 +2,6 @@
 
 require 'zipf'
 
-
 files = []
 (0..1).each { |i| files << ReadFile.new(ARGV[i]) }
 (2..3).each { |i| files << WriteFile.new(ARGV[i]) }
diff --git a/num_tok b/num_tok
index 53b99a0..56cbae9 100755
--- a/num_tok
+++ b/num_tok
@@ -3,7 +3,6 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-
 while line = STDIN.gets
   puts line.strip.split.length
 end
diff --git a/odd b/odd
index 93aaa80..0bd9336 100755
--- a/odd
+++ b/odd
@@ -3,7 +3,6 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-
 i = 1
 while line = STDIN.gets
   puts line if i%2!=0
diff --git a/paste_pairs b/paste_pairs
index 07c1f22..f6b8b31 100755
--- a/paste_pairs
+++ b/paste_pairs
@@ -3,7 +3,6 @@
 import sys
 from itertools import izip
 
-
 for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
   print linenr, (src_line.strip())
   print linenr, (tgt_line.strip())
diff --git a/per_sentence_bleu b/per_sentence_bleu
index 76fcf38..5bacd1a 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def main
   cfg = Trollop::options do
     opt :input, "input", :type => :string, :default => '-'
@@ -26,6 +25,5 @@ def main
   input.close
 end
 
-
 main
 
diff --git a/per_sentence_bleu_kbest b/per_sentence_bleu_kbest
index 4d821b3..e6a31cb 100755
--- a/per_sentence_bleu_kbest
+++ b/per_sentence_bleu_kbest
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def main
   cfg = Trollop::options do
     opt :kbests, "kbests", :type => :string, :default => '-'
@@ -29,6 +28,5 @@ def main
   }
 end
 
-
 main
 
diff --git a/per_sentence_ter b/per_sentence_ter
index 8b04be5..343708e 100755
--- a/per_sentence_ter
+++ b/per_sentence_ter
@@ -4,7 +4,6 @@ require 'zipf'
 require 'trollop'
 require 'tempfile'
 
-
 def main
   cfg = Trollop::options do
     opt :input, "input", :type => :string, :default => '-'
@@ -30,6 +29,5 @@ def main
   input.close
 end
 
-
 main
 
diff --git a/pot b/pot
index ec199ea..24acabe 100755
--- a/pot
+++ b/pot
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 pow = ARGV[0].to_f
 while line = STDIN.gets
   puts line.to_f**pow
diff --git a/round b/round
index 3dfbb6f..dfef800 100755
--- a/round
+++ b/round
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 r = ARGV[0].to_i
 while line = STDIN.gets
   puts line.to_f.round r
diff --git a/ruby_eval b/ruby_eval
index 96b2ecb..fe0d181 100755
--- a/ruby_eval
+++ b/ruby_eval
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 while line = STDIN.gets
   puts "#{eval line}"
 end
diff --git a/rule_shapes b/rule_shapes
index fd42249..589a670 100755
--- a/rule_shapes
+++ b/rule_shapes
@@ -3,7 +3,6 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-
 def shape s
   res  = []
   in_t = false
diff --git a/shard b/shard
index f952104..6155123 100755
--- a/shard
+++ b/shard
@@ -2,7 +2,6 @@
 
 require 'trollop'
 
-
 def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
   lc = `wc -l #{input}`.split.first.to_i
   input_ext = input.split('.').last
diff --git a/splitpipes b/split_pipes
index 35ee176..eeba69b 100755
--- a/splitpipes
+++ b/split_pipes
@@ -5,7 +5,6 @@ require 'trollop'
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-
 cfg = Trollop::options do
   banner "splitpipes -f <n> < <input>"
   opt :field, "field", :type => :int
diff --git a/parse-stanford.sh b/stanford_parser_run
index f8d4210..f8d4210 100755
--- a/parse-stanford.sh
+++ b/stanford_parser_run
diff --git a/stddev b/stddev
index 5cda0e0..a7397b2 100755
--- a/stddev
+++ b/stddev
@@ -2,7 +2,6 @@
 
 require 'trollop'
 
-
 cfg = Trollop::options do
   banner "stddev [-r <d>] < <one number per line>"
   opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/sum b/sum
index dac72d3..acfa563 100755
--- a/sum
+++ b/sum
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 sum = 0.0
 while line = STDIN.gets
   sum += line.to_f
diff --git a/tc b/tc
index 993086a..7eefdd5 100755
--- a/tc
+++ b/tc
@@ -2,7 +2,6 @@
 
 require 'zipf'
 
-
 while line = STDIN.gets
   puts tokenize(line.strip).size
 end
diff --git a/test/hg2json/cdec.ini b/test/cdec_hg_to_json/cdec.ini
index 1ad25b5..1ad25b5 100644
--- a/test/hg2json/cdec.ini
+++ b/test/cdec_hg_to_json/cdec.ini
diff --git a/test/hg2json/grammar.gz b/test/cdec_hg_to_json/grammar.gz
index 78dda98..78dda98 100644
--- a/test/hg2json/grammar.gz
+++ b/test/cdec_hg_to_json/grammar.gz
diff --git a/test/hg2json/hg.json.gz b/test/cdec_hg_to_json/hg.json.gz
index ed178c6..ed178c6 100644
--- a/test/hg2json/hg.json.gz
+++ b/test/cdec_hg_to_json/hg.json.gz
diff --git a/test/hg2json/hg.meta b/test/cdec_hg_to_json/hg.meta
index d33a54c..d33a54c 100644
--- a/test/hg2json/hg.meta
+++ b/test/cdec_hg_to_json/hg.meta
diff --git a/test/hg2json/in b/test/cdec_hg_to_json/in
index 7dc411d..7dc411d 100644
--- a/test/hg2json/in
+++ b/test/cdec_hg_to_json/in
diff --git a/test/hg2json/toy.cdec.ini b/test/cdec_hg_to_json/toy.cdec.ini
index d4a2896..d4a2896 100644
--- a/test/hg2json/toy.cdec.ini
+++ b/test/cdec_hg_to_json/toy.cdec.ini
diff --git a/test/hg2json/toy.grammar b/test/cdec_hg_to_json/toy.grammar
index 382c94f..382c94f 100644
--- a/test/hg2json/toy.grammar
+++ b/test/cdec_hg_to_json/toy.grammar
diff --git a/test/hg2json/toy.in b/test/cdec_hg_to_json/toy.in
index e6df927..e6df927 100644
--- a/test/hg2json/toy.in
+++ b/test/cdec_hg_to_json/toy.in
diff --git a/test/hg2json/toy.weights b/test/cdec_hg_to_json/toy.weights
index 70075b7..70075b7 100644
--- a/test/hg2json/toy.weights
+++ b/test/cdec_hg_to_json/toy.weights
diff --git a/test/hg2json/weights b/test/cdec_hg_to_json/weights
index 7f96f1d..7f96f1d 100644
--- a/test/hg2json/weights
+++ b/test/cdec_hg_to_json/weights
diff --git a/tf-idf b/tf-idf
index fc6c2ec..450de6b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 def main
   cfg = Trollop::options do
     opt :documents, "input files (documents)", :type => :string, :required => true
@@ -48,6 +47,5 @@ def main
   docs.each { |i| puts i.to_s }
 end
 
-
 main
 
diff --git a/to_ascii b/to_ascii
index 6c1d23e..10fd1c2 100755
--- a/to_ascii
+++ b/to_ascii
@@ -1,6 +1,5 @@
 #!/usr/bin/env ruby
 
-
 while line = STDIN.gets
   encoding_options = {
     :invalid           => :replace,
diff --git a/tokenizer.no-escape.perl b/tokenizer-no-escape.perl
index 4397360..4397360 100755
--- a/tokenizer.no-escape.perl
+++ b/tokenizer-no-escape.perl
diff --git a/toks b/toks
index ed40dbb..8bee29f 100755
--- a/toks
+++ b/toks
@@ -3,7 +3,6 @@
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
 
-
 while line = STDIN.gets
   line.strip.split(/\s/).each { |i| puts i }
 end
diff --git a/traintestsplit b/train_test_split
index ec88df1..db56de9 100755
--- a/traintestsplit
+++ b/train_test_split
@@ -3,7 +3,6 @@
 require 'zipf'
 require 'trollop'
 
-
 cfg = Trollop::options do
   opt :foreign, "foreign file", :type => :string, :required => true
   opt :english, "english file", :type => :string, :required => true
diff --git a/var b/var
index fe4aa22..faccefa 100755
--- a/var
+++ b/var
@@ -2,7 +2,6 @@
 
 require 'trollop'
 
-
 cfg = Trollop::options do
   banner "stddev [-r <d>] < <one number per line>"
   opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
author	Patrick Simianer <p@simianer.de>	2014-10-09 20:47:23 +0100
committer	Patrick Simianer <p@simianer.de>	2014-10-09 20:47:23 +0100
commit	e0b634754d1bef33dc8e72509c6990cccc32745a (patch)
tree	95d77abef518a333830881dbbd661f14f94868c3
parent	254c27ed4af938f0b9c4a21cb99b75f8cc1cd1b2 (diff)