diff options
| -rwxr-xr-x | add-index | 1 | ||||
| -rwxr-xr-x | add-ln | 3 | ||||
| -rwxr-xr-x | add-seg | 13 | ||||
| -rwxr-xr-x | add-start-end | 5 | ||||
| -rwxr-xr-x | avg | 2 | ||||
| -rwxr-xr-x | avg-seg-len | 1 | ||||
| -rwxr-xr-x | avg-weights | 7 | ||||
| -rwxr-xr-x | bishuf | 1 | ||||
| -rwxr-xr-x | bitext-filter-length | 9 | ||||
| -rwxr-xr-x | bitext2tmx (renamed from bitext2tmx.py) | 20 | ||||
| -rwxr-xr-x | biuniq | 5 | ||||
| -rwxr-xr-x | bleu-cmp | 3 | ||||
| -rwxr-xr-x | cdec-hg-to-json | 1 | ||||
| -rwxr-xr-x | chars | 3 | ||||
| -rwxr-xr-x | cma | 3 | ||||
| -rwxr-xr-x | cumul | 17 | ||||
| -rwxr-xr-x | de-sgm | 1 | ||||
| -rwxr-xr-x | div | 1 | ||||
| -rwxr-xr-x | dot | 7 | ||||
| -rwxr-xr-x | even | 5 | ||||
| -rwxr-xr-x | exclude | 5 | ||||
| -rwxr-xr-x | feature-dict | 5 | ||||
| -rwxr-xr-x | filter-illegal | 5 | ||||
| -rwxr-xr-x | filter-len | 3 | ||||
| -rwxr-xr-x | filter-tokens | 5 | ||||
| -rwxr-xr-x | first-upper | 3 | ||||
| -rwxr-xr-x | fix-utf-8-pua | 1 | ||||
| -rwxr-xr-x | gigaword-collapse-tags | 5 | ||||
| -rwxr-xr-x | hadoop-uniq | 1 | ||||
| -rwxr-xr-x | hist-tok | 1 | ||||
| -rwxr-xr-x | htmlentities | 9 | ||||
| -rwxr-xr-x | inv | 3 | ||||
| -rwxr-xr-x | is-first-lower | 5 | ||||
| -rwxr-xr-x | joint-set | 5 | ||||
| -rwxr-xr-x | kbest-bleu-oracles | 5 | ||||
| -rwxr-xr-x | kendalls-tau | 11 | ||||
| -rwxr-xr-x | key-count | 5 | ||||
| -rwxr-xr-x | kmeans | 17 | ||||
| -rwxr-xr-x | lang | 11 | ||||
| -rwxr-xr-x | langid-polyglot | 3 | ||||
| -rwxr-xr-x | length-ratio | 3 | ||||
| -rwxr-xr-x | lin-reg | 7 | ||||
| -rwxr-xr-x | log-reg | 11 | ||||
| -rwxr-xr-x | ltok | 7 | ||||
| -rwxr-xr-x | make-rule-features | 7 | ||||
| -rwxr-xr-x | max | 3 | ||||
| -rwxr-xr-x | max-len | 5 | ||||
| -rwxr-xr-x | median | 3 | ||||
| -rwxr-xr-x | merge-files | 3 | ||||
| -rwxr-xr-x | merge-ttable | 17 | ||||
| -rwxr-xr-x | min | 3 | ||||
| -rwxr-xr-x | min-max | 17 | ||||
| -rwxr-xr-x | mkidx | 3 | ||||
| -rwxr-xr-x | moses-1best | 3 | ||||
| -rwxr-xr-x | moving-sum | 3 | ||||
| -rwxr-xr-x | mult | 1 | ||||
| -rwxr-xr-x | nfc (renamed from NFC) | 5 | ||||
| -rwxr-xr-x | ng | 9 | ||||
| -rwxr-xr-x | nn | 1 | ||||
| -rwxr-xr-x | no-empty | 5 | ||||
| -rwxr-xr-x | no-non-printables | 3 | ||||
| -rw-r--r-- | nonbreaking-prefixes/README.txt (renamed from nonbreaking_prefixes/README.txt) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.ca (renamed from nonbreaking_prefixes/nonbreaking_prefix.ca) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.cs (renamed from nonbreaking_prefixes/nonbreaking_prefix.cs) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.de (renamed from nonbreaking_prefixes/nonbreaking_prefix.de) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.el (renamed from nonbreaking_prefixes/nonbreaking_prefix.el) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.en (renamed from nonbreaking_prefixes/nonbreaking_prefix.en) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.es (renamed from nonbreaking_prefixes/nonbreaking_prefix.es) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.fr (renamed from nonbreaking_prefixes/nonbreaking_prefix.fr) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.is (renamed from nonbreaking_prefixes/nonbreaking_prefix.is) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.it (renamed from nonbreaking_prefixes/nonbreaking_prefix.it) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.nl (renamed from nonbreaking_prefixes/nonbreaking_prefix.nl) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.pl (renamed from nonbreaking_prefixes/nonbreaking_prefix.pl) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.pt (renamed from nonbreaking_prefixes/nonbreaking_prefix.pt) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.ro (renamed from nonbreaking_prefixes/nonbreaking_prefix.ro) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.ru (renamed from nonbreaking_prefixes/nonbreaking_prefix.ru) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.sk (renamed from nonbreaking_prefixes/nonbreaking_prefix.sk) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.sl (renamed from nonbreaking_prefixes/nonbreaking_prefix.sl) | 0 | ||||
| -rw-r--r-- | nonbreaking-prefixes/nonbreaking_prefix.sv (renamed from nonbreaking_prefixes/nonbreaking_prefix.sv) | 0 | ||||
| -rwxr-xr-x | norm | 1 | ||||
| -rwxr-xr-x | norm-german | 15 | ||||
| -rwxr-xr-x | norm-hyphens | 3 | ||||
| -rwxr-xr-x | normchr | 9 | ||||
| -rwxr-xr-x | num-tok | 5 | ||||
| -rwxr-xr-x | odd | 5 | ||||
| -rwxr-xr-x | overlap | 7 | ||||
| -rwxr-xr-x | paste-pairs | 12 | ||||
| -rwxr-xr-x | per-sentence-bleu | 9 | ||||
| -rwxr-xr-x | per-sentence-bleu-kbest | 9 | ||||
| -rwxr-xr-x | per-sentence-ter | 15 | ||||
| -rwxr-xr-x | percentile | 3 | ||||
| -rwxr-xr-x | pot | 1 | ||||
| -rwxr-xr-x | preprocess | 5 | ||||
| -rwxr-xr-x | preprocess-no-lower | 5 | ||||
| -rwxr-xr-x | pt-bloom | 11 | ||||
| -rwxr-xr-x | push-rules | 3 | ||||
| -rwxr-xr-x | remove-devtest | 4 | ||||
| -rwxr-xr-x | remove-test-from-bitext | 4 | ||||
| -rwxr-xr-x | repetition-rate | 7 | ||||
| -rwxr-xr-x | round | 1 | ||||
| -rwxr-xr-x | rule-shapes | 7 | ||||
| -rwxr-xr-x | sample | 14 | ||||
| -rwxr-xr-x | select | 7 | ||||
| -rwxr-xr-x | select-from | 11 | ||||
| -rwxr-xr-x | sentencepiece-decode | 3 | ||||
| -rwxr-xr-x | shard | 19 | ||||
| -rwxr-xr-x | sort-features | 1 | ||||
| -rwxr-xr-x | source-sides | 3 | ||||
| -rwxr-xr-x | split-kbest | 3 | ||||
| -rwxr-xr-x | split-lines | 5 | ||||
| -rwxr-xr-x | split-pipes | 13 | ||||
| -rwxr-xr-x | sqrt | 1 | ||||
| -rwxr-xr-x | stanford-parser-run | 3 | ||||
| -rwxr-xr-x | stddev | 3 | ||||
| -rwxr-xr-x | strips | 1 | ||||
| -rwxr-xr-x | subtract (renamed from substract) | 3 | ||||
| -rwxr-xr-x | sum | 1 | ||||
| -rwxr-xr-x | tc | 3 | ||||
| -rwxr-xr-x | tf-idf | 9 | ||||
| -rwxr-xr-x | tmx-extract (renamed from tmx-extract.py) | 59 | ||||
| -rwxr-xr-x | tmx-extract-original-py2 (renamed from tmx-extract-original-py2.py) | 1 | ||||
| -rwxr-xr-x[-rw-r--r--] | tmx-to-plain (renamed from tmx-to-plain.py) | 62 | ||||
| -rwxr-xr-x | to-ascii | 5 | ||||
| -rwxr-xr-x | toks | 7 | ||||
| -rwxr-xr-x | toks-per-line | 1 | ||||
| -rwxr-xr-x | train-test-split | 8 | ||||
| -rwxr-xr-x | tsv-exclude | 4 | ||||
| -rwxr-xr-x | tsv-joint-set | 7 | ||||
| -rwxr-xr-x | tsv-uniq | 7 | ||||
| -rwxr-xr-x | var | 3 | ||||
| -rwxr-xr-x | vocab | 1 | ||||
| -rwxr-xr-x | vocab-2 (renamed from vocab2) | 3 | ||||
| -rwxr-xr-x | zh-ko-or-ja | 5 |
133 files changed, 320 insertions, 432 deletions
@@ -9,4 +9,3 @@ while line = STDIN.gets puts "#{i}\t#{line}" i += 1 end - @@ -3,6 +3,5 @@ i = 0 while line = STDIN.gets puts "#{i}\t#{line}" - i += 1 + i += 1 end - @@ -1,12 +1,12 @@ #!/usr/bin/env ruby -require 'optimist' -require 'zipf' +require "optimist" +require "zipf" o = Optimist::options do - opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :default => nil + opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => "-g", :default => nil opt :loo, "leave one out", :type => :bool, :default => false - opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i' + opt :start_id, "start with this id", :type => :int, :default => 0, :short => "-i" opt :nogz, "grammar files not gzipped", :type => :bool, :default => false opt :index, "number according to index", :type => :string, :default => nil end @@ -19,8 +19,8 @@ end i = o[:start_id] j = 0 while line = STDIN.gets - ext = '.gz' - ext = '' if o[:nogz] + ext = ".gz" + ext = "" if o[:nogz] s = "<seg" if o[:loo] then s += " exclude=\"#{i}\"" end if index.size > 0 @@ -33,4 +33,3 @@ while line = STDIN.gets i += 1 j += 1 end - diff --git a/add-start-end b/add-start-end index 30deaec..1e1061d 100755 --- a/add-start-end +++ b/add-start-end @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets puts "<s> #{line.strip} </s>" end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "avg < <one number per line>" diff --git a/avg-seg-len b/avg-seg-len index ee68827..bfd4f6c 100755 --- a/avg-seg-len +++ b/avg-seg-len @@ -6,4 +6,3 @@ while line = STDIN.gets end puts lens.inject(:+)/lens.size.to_f - diff --git a/avg-weights b/avg-weights index f090da9..bc734e8 100755 --- a/avg-weights +++ b/avg-weights @@ -1,8 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' -require 'zlib' +require "zipf" +require "optimist" +require "zlib" conf = Optimist::options do opt :weights_files, "a number of weights files: name value", :required => true @@ -30,4 +30,3 @@ h.each_pair { |k,w| next if conf[:filter] and w.size < n puts "#{k} #{w.inject(:+)/n}" } - @@ -15,4 +15,3 @@ get_random() { seed="$1"; openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt < echo "shuffling ..." $SHUF --random-source=<(get_random 42) $1 > $1.shuf $SHUF --random-source=<(get_random 42) $2 > $2.shuf - diff --git a/bitext-filter-length b/bitext-filter-length index d1dc973..a77f10e 100755 --- a/bitext-filter-length +++ b/bitext-filter-length @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do @@ -17,8 +17,8 @@ def main opt :reverse, "length ratios alway > 1", :type => :bool, :default => false, :short => "-r" end - fna,fnb = conf[:inputs].split ',' - a = ReadFile.new fna + fna,fnb = conf[:inputs].split "," + a = ReadFile.new fna b = ReadFile.new fnb if not conf[:output_index] @@ -62,4 +62,3 @@ def main end main - diff --git a/bitext2tmx.py b/bitext2tmx index 1cdc4b3..e9c8e23 100755 --- a/bitext2tmx.py +++ b/bitext2tmx @@ -7,23 +7,21 @@ from xml.sax.saxutils import escape if __name__ == "__main__": prefix = """<tmx version="1.4"> <header - creationtool="bitext2tmx.py" creationtoolversion="1.0" + creationtool="bitext2tmx" creationtoolversion="1.0" datatype="PlainText" segtype="sentence" adminlang="en-us" srclang="en" o-tmf="ABCTransMem"/> <body>""" - src_file = open(sys.argv[1], "r") - tgt_file = open(sys.argv[2], "r") + with open(sys.argv[1], "r") as src_file, open(sys.argv[2], "r") as tgt_file: + src_lang = sys.argv[1].split(".")[-1] + tgt_lang = sys.argv[2].split(".")[-1] - src_lang = sys.argv[1].split(".")[-1] - tgt_lang = sys.argv[2].split(".")[-1] - - tus = [] - for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()): - src_line = src_line.rstrip("\n") - tgt_line = tgt_line.rstrip("\n") - tus.append(f""" + tus = [] + for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()): + src_line = src_line.rstrip("\n") + tgt_line = tgt_line.rstrip("\n") + tus.append(f""" <tu> <tuv xml:lang="{src_lang}"> <seg>{escape(src_line)}</seg> @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" f1 = ReadFile.new ARGV[0] f2 = ReadFile.new ARGV[1] @@ -16,7 +16,7 @@ while line1 = f1.gets line2 = f2.gets if line2 == nil then line2 = "" end line2.strip! - + if !d1.include? line1 and !d2.include? line2 a1 << line1 a2 << line2 @@ -33,4 +33,3 @@ a1.each_with_index { |line1,i| o1.write line1 + "\n" o2.write a2[i] + "\n" } - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" f = ReadFile.new ARGV[0] g = ReadFile.new ARGV[1] @@ -20,4 +20,3 @@ while line = f.gets puts i += 1 end - diff --git a/cdec-hg-to-json b/cdec-hg-to-json index 5a26cf7..955cd6d 100755 --- a/cdec-hg-to-json +++ b/cdec-hg-to-json @@ -77,4 +77,3 @@ def main(): if __name__=="__main__": main() - @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line.strip.each_char { |c| puts c } end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "cma < <one number per line>" @@ -20,4 +20,3 @@ while line = STDIN.gets end STDOUT.flush end - @@ -1,6 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" +require "tempfile" f = ReadFile.new ARGV[0] g = ReadFile.new ARGV[1] @@ -17,16 +18,16 @@ while line = f.gets sys1 << line1 sys2 << line2 - ff=File.new("/tmp/refs",'w+');ff.write(refs.join(""));ff.close - ff=File.new("/tmp/sys1",'w+');ff.write(sys1.join(""));ff.close - ff=File.new("/tmp/sys2",'w+');ff.write(sys2.join(""));ff.close + tmp_refs = Tempfile.new("refs"); tmp_refs.write(refs.join("")); tmp_refs.close + tmp_sys1 = Tempfile.new("sys1"); tmp_sys1.write(sys1.join("")); tmp_sys1.close + tmp_sys2 = Tempfile.new("sys2"); tmp_sys2.write(sys2.join("")); tmp_sys2.close - #a = `~/multi-bleu.perl /tmp/refs < /tmp/sys1`.split[2].gsub(',','').to_f - a = BLEU::bleu("/tmp/sys1", "/tmp/refs", 4) - b = BLEU::bleu("/tmp/sys2", "/tmp/refs", 4) + a = BLEU::bleu(tmp_sys1.path, tmp_refs.path, 4) + b = BLEU::bleu(tmp_sys2.path, tmp_refs.path, 4) + + tmp_refs.unlink; tmp_sys1.unlink; tmp_sys2.unlink diffs << b-a #puts ((diffs.inject(:+)/diffs.size)*100).round 2 puts (diffs[-1]*100).round 2 end - @@ -9,4 +9,3 @@ egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|r | sed "s|\s*</speaker>\s*$||" \ | sed "s|\s*<hl>\s*$||" \ | sed "s|\s*</hl>\s*$||" - @@ -5,4 +5,3 @@ exit if factor==0 while line = STDIN.gets puts line.to_f / factor end - @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" -a = SparseVector.from_file 'w', ' ' -b = SparseVector.from_file 'f', ' ' +a = SparseVector.from_file "w", " " +b = SparseVector.from_file "f", " " puts a.to_s puts a.dot b - @@ -1,11 +1,10 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" i = 1 while line = STDIN.gets puts line if i%2==0 i+=1 end - @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'set' +require "zipf" +require "set" to_exclude = {} f = ReadFile.new ARGV[0] @@ -14,4 +14,3 @@ while line = STDIN.gets puts line end end - diff --git a/feature-dict b/feature-dict index 6849769..59ff020 100755 --- a/feature-dict +++ b/feature-dict @@ -7,7 +7,7 @@ l_i = 1 while line = STDIN.gets STDERR.write "#{l_i}\n" if l_i%1000==0&¬_quiet line.split.each { |i| - f, v = i.split('=', 2) + f, v = i.split("=", 2) if !feature_dict.has_key? f feature_dict[f] = n n += 1 @@ -16,9 +16,8 @@ while line = STDIN.gets l_i += 1 end -f = File.new ARGV[0], 'w' +f = File.new ARGV[0], "w" f.write Marshal.dump feature_dict f.close STDERR.write "size = #{feature_dict.size}\n" - diff --git a/filter-illegal b/filter-illegal index 8b29f3e..e44b2ac 100755 --- a/filter-illegal +++ b/filter-illegal @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" illegal = [ "[", "]", "|||" ] @@ -15,11 +15,10 @@ while line0 = in0.gets illegal.each { |k| if line0.index(k) or line1.index(k) then skip = true - skipi << i + skipi << i end } i += 1 end skipi.each { |j| puts j } - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = ReadFile.new ARGV[0] b = ReadFile.new ARGV[1] @@ -24,4 +24,3 @@ a.close b.close a_out.close b_out.close - diff --git a/filter-tokens b/filter-tokens index 00c8f2c..c851bd3 100755 --- a/filter-tokens +++ b/filter-tokens @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" bad_words = {} ReadFile.readlines_strip(ARGV[0]).each { |line| @@ -13,11 +13,10 @@ while line = STDIN.gets tokens = line.split bad_words.keys.each { |w| if tokens.include? w - bad = true + bad = true break end } puts i if bad i += 1 end - diff --git a/first-upper b/first-upper index 610e62c..f9b2ce9 100755 --- a/first-upper +++ b/first-upper @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line.strip! line[0] = line[0].upcase puts line end - diff --git a/fix-utf-8-pua b/fix-utf-8-pua index 674d424..da77850 100755 --- a/fix-utf-8-pua +++ b/fix-utf-8-pua @@ -7,4 +7,3 @@ while line = STDIN.gets line.gsub! /[\u{e000}-\u{f8ff}]/, " " puts line end - diff --git a/gigaword-collapse-tags b/gigaword-collapse-tags index cbaf7d7..f2339c4 100755 --- a/gigaword-collapse-tags +++ b/gigaword-collapse-tags @@ -2,8 +2,8 @@ # works with gigaword en v5 -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" in_p = false in_dateline = false @@ -36,4 +36,3 @@ while line = STDIN.gets puts line end end - diff --git a/hadoop-uniq b/hadoop-uniq index 5052419..5f37fa4 100755 --- a/hadoop-uniq +++ b/hadoop-uniq @@ -8,4 +8,3 @@ $HADOOP_HOME/bin/hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \ -output d.uniq \ -mapper 'cut -d " " -f 1' \ -reducer /usr/bin/uniq - @@ -21,4 +21,3 @@ sorted.sort_by! { |i| sorted.each { |i| puts "#{i[0]}\t#{i[1]}" } - diff --git a/htmlentities b/htmlentities index f3c2d34..c0ccc0a 100755 --- a/htmlentities +++ b/htmlentities @@ -1,9 +1,9 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -require 'htmlentities' +require "htmlentities" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" coder = HTMLEntities.new @@ -11,4 +11,3 @@ coder = HTMLEntities.new while line = STDIN.gets puts coder.decode(line.strip) end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" def main conf = Optimist::options do @@ -30,4 +30,3 @@ def main end main - diff --git a/is-first-lower b/is-first-lower index 1cddb8e..a7e2073 100755 --- a/is-first-lower +++ b/is-first-lower @@ -1,11 +1,10 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line.strip! - if line && line!='' && line[0].downcase? + if line && line!="" && line[0].downcase? puts line end end - @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'set' -require 'zipf' +require "set" +require "zipf" n = ARGV.pop.to_i @@ -27,4 +27,3 @@ all_sets.each { |set| joint_set.each { |i| puts i } - diff --git a/kbest-bleu-oracles b/kbest-bleu-oracles index ea76ab1..03f321d 100755 --- a/kbest-bleu-oracles +++ b/kbest-bleu-oracles @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def get_context kbest_lists, references, n a = [] @@ -48,4 +48,3 @@ def main end main - diff --git a/kendalls-tau b/kendalls-tau index c0c20be..24f0744 100755 --- a/kendalls-tau +++ b/kendalls-tau @@ -2,7 +2,7 @@ ################################################# # reads space delimted pairs of scores as input, -# outputs Kendall's τ +# outputs Kendall"s τ ################################################# def kendall_with_ties l @@ -13,7 +13,7 @@ def kendall_with_ties l l.each_with_index { |k,i| l[i+1,l.size].each_with_index { |m,j| if (k.first < m.first && k[1] < m[1]) || - (k.first > m.first && k[1] > m[1]) + (k.first > m.first && k[1] > m[1]) concordant += 1 elsif (k.first == m.first && k[1] != m[1]) tie_a += 1 @@ -24,7 +24,7 @@ def kendall_with_ties l end } } - + return (concordant-disconcordant)/(Math.sqrt((concordant+disconcordant+tie_a)*(concordant+disconcordant+tie_b))) end @@ -34,7 +34,7 @@ def kendall l l.each_with_index { |k,i| l[i+1,l.size].each_with_index { |m,j| if (k.first <= m.first && k[1] <= m[1]) || - (k.first >= m.first && k[1] >= m[1]) + (k.first >= m.first && k[1] >= m[1]) concordant += 1 else disconcordant += 1 @@ -60,7 +60,7 @@ def main a,b = line.split l << [a.to_f, b.to_f] end - + v = -1 if has_ties? l v = kendall_with_ties l @@ -72,4 +72,3 @@ def main end main - @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" h = {} h.default = 0 @@ -11,4 +11,3 @@ while line = STDIN.gets end h.each_pair { |k,v| puts "#{k} #{v}" } - @@ -1,12 +1,12 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def read_data fn data = {} ReadFile.new(fn).readlines_strip.map{ |i| - a = i.split ' ', 2 + a = i.split " ", 2 v = SparseVector.from_kv a.last data[a.first] = v } @@ -30,7 +30,7 @@ end def assign centroids, data assignment = {} data.each_pair { |name,feature_vector| - min = 1.0/0 + min = Float::INFINITY min_index = nil centroids.each_with_index { |c,i| dist = c.euclidian_dist(feature_vector) @@ -61,10 +61,10 @@ def main opt :k, "k", :type => :int, :required => true opt :input, "input: one feature vector per line", :type => :string, :required => true opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 - opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3 - opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 + opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => "-n", :default => 3 + opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => "-j", :default => 2 end - # data is 'ID f1=v1 f2=v2' + # data is "ID f1=v1 f2=v2" data = read_data conf[:input] k = conf[:k] centroids = nil @@ -86,7 +86,7 @@ def main STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n" 0.upto(conf[:max_iterations]) do |i| s = "iteration #{i}" - STDERR.write "#{s}\n#{'-'*s.size}\n" + STDERR.write "#{s}\n#{"-" * s.size}\n" assignment = assign centroids, data sizes = [] assignment.each_pair { |centroid_index, a| @@ -114,4 +114,3 @@ def main end main - @@ -1,14 +1,14 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import langdetect from_stdin = False -if sys.argv[1] == '-': +if sys.argv[1] == "-": f = sys.stdin from_stdin = True else: - f = open(sys.argv[1], 'r') + f = open(sys.argv[1], "r") try: l = sys.argv[2].strip() @@ -32,7 +32,7 @@ if min_p and not l: if strict and not min_p: strict = False - + factory = langdetect.detector_factory.DetectorFactory() factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY) @@ -71,5 +71,4 @@ for line in f: print("unk") if not from_stdin: - f.close - + f.close() diff --git a/langid-polyglot b/langid-polyglot index 0b0b20c..04f6b3b 100755 --- a/langid-polyglot +++ b/langid-polyglot @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import polyglot from polyglot.detect import Detector @@ -15,4 +15,3 @@ for line in fileinput.input(): except polyglot.detect.base.UnknownLanguage: print("??") pass - diff --git a/length-ratio b/length-ratio index 4b4432d..5b38826 100755 --- a/length-ratio +++ b/length-ratio @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = ReadFile.new ARGV[0] b = ReadFile.new ARGV[1] @@ -9,4 +9,3 @@ while linea = a.gets lineb = b.gets puts linea.strip.split.size.to_f / lineb.strip.split.size.to_f end - @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def read_data fn, scale f = ReadFile.new fn @@ -29,7 +29,7 @@ def main opt :output, "output data", :type => :string, :required => true opt :learning_rate, "learning rate", :type => :float, :default => 0.07 opt :stop, "stopping criterion", :type => :int, :default => 100 - opt :scale_features,"scale features", :type => :bool, :default => false, :short => '-t' + opt :scale_features,"scale features", :type => :bool, :default => false, :short => "-t" opt :show_loss, "show loss per iter", :type => :bool, :default => false end data = read_data conf[:input], conf[:scale_features] @@ -67,4 +67,3 @@ def main end main - @@ -1,8 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' -require 'matrix' -require 'optimist' +require "zipf" +require "matrix" +require "optimist" def read_data fn f = ReadFile.new fn @@ -30,7 +30,7 @@ def approx_eql x, y, eps=10**-10 return false if !x||!y return false if x.size!=y.size x.each_with_index { |_,i| - return false if (x[i]-y[i]).abs>eps + return false if (x[i]-y[i]).abs>eps } return true end @@ -48,7 +48,7 @@ def main prev_model = nil gradient = Vector.elements zeros hessian = Matrix.build(dim,dim) { |i,j| 0.0 } - i = 0 + i = 0 while true i += 1 data.each_with_index { |x,j| @@ -68,4 +68,3 @@ def main end main - @@ -1,9 +1,8 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets puts line.strip.split(/\s/).size end - diff --git a/make-rule-features b/make-rule-features index 7adb6e9..ae2cecc 100755 --- a/make-rule-features +++ b/make-rule-features @@ -1,10 +1,10 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" def mkrf src, tgt s = src.gsub /\[X,[1-9]\]/, "NX" - t = tgt.gsub /\[X,([1-9])\]/,'N\1' + t = tgt.gsub /\[X,([1-9])\]/,"N\1" return "R:X:#{s.gsub(" ","_")}:#{t.gsub(" ","_")}" end @@ -13,7 +13,7 @@ def mkrbf s, t if t == "S" s.gsub! /\[X,[1-9]\]/, "X" else - s.gsub! /\[X,([1-9])\]/, 'X\1' + s.gsub! /\[X,([1-9])\]/, "X\1" end s.reverse! s += " >r<" @@ -41,4 +41,3 @@ while line = STDIN.gets end h.keys.each { |f| puts f } - @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -max = -1.0/0 +max = -Float::INFINITY while line = STDIN.gets v = line.to_f max = v if v > max end puts max - @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" max = ARGV[0].to_i i = 0 -while line = STDIN.gets +while line = STDIN.gets if tokenize(line).size <= max puts i else @@ -13,4 +13,3 @@ while line = STDIN.gets end i += 1 end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = [] while line = STDIN.gets @@ -10,4 +10,3 @@ end a.sort! puts a[a.size/2] - diff --git a/merge-files b/merge-files index 714b57d..78644ef 100755 --- a/merge-files +++ b/merge-files @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" def usage STDERR.write "merge_files <file>+\n" @@ -28,4 +28,3 @@ hashes.each { |h| counts.max.times { puts k } } } - diff --git a/merge-ttable b/merge-ttable index 77eae9f..20e5429 100755 --- a/merge-ttable +++ b/merge-ttable @@ -1,20 +1,20 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do opt :f, "f files", :type => :string, :required => true opt :e, "e files", :type => :string, :required => true end - + f_files = conf[:f].split e_files = conf[:e].split - + h = {} f_files.each_with_index { |fn,i| - fa = ReadFile.readlines_strip fn + fa = ReadFile.readlines_strip fn ea = ReadFile.readlines_strip e_files[i] fa.each_with_index { |fw,j| if h.has_key? fw @@ -24,11 +24,10 @@ def main end } } - + h.each_pair { |f,ea| - puts "#{f}\t#{ea.first}" - } + puts "#{f}\t#{ea.first}" + } end main - @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -min = 1.0/0 +min = Float::INFINITY while line = STDIN.gets v = line.to_f min = v if v<min end puts min - @@ -1,15 +1,15 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" conf = Optimist::options do opt :min, "minimum #tokens", :type => :int, :default => 1 - opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n' - opt :in_f, "input 'French' file", :type => :string, :required => true - opt :in_e, "input 'English' file", :type => :string, :required => true - opt :out_f, "output 'French' file", :type => :string, :required => true - opt :out_e, "output 'English' file", :type => :string, :required => true + opt :max, "maximum #tokens", :type => :int, :default => 80, :short => "-n" + opt :in_f, "input French file", :type => :string, :required => true + opt :in_e, "input English file", :type => :string, :required => true + opt :out_f, "output French file", :type => :string, :required => true + opt :out_e, "output English file", :type => :string, :required => true opt :out_id, "output line Nos", :type => :string, :required => true end @@ -37,4 +37,3 @@ while f_line = files[:f_file].gets end files.values.each{ |f| f.close } - @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" i = ARGV[0].to_i while line = STDIN.gets puts i i += 1 end - diff --git a/moses-1best b/moses-1best index fd35cf8..ffe5e22 100755 --- a/moses-1best +++ b/moses-1best @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" prev_idx = nil while line = STDIN.gets @@ -11,4 +11,3 @@ while line = STDIN.gets prev_idx = idx end end - @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" sum = 0.0 ReadFile.readlines_strip(ARGV[0]).each { |i| sum += i.to_f puts sum } - @@ -4,4 +4,3 @@ factor = ARGV[0].to_f while line = STDIN.gets puts line.to_f * factor end - @@ -1,9 +1,8 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fileinput import unicodedata import sys for line in fileinput.input(): - sys.stdout.write(unicodedata.normalize('NFC', line)) - + sys.stdout.write(unicodedata.normalize("NFC", line)) @@ -1,19 +1,18 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" conf = Optimist::options do banner "ng < <input>" opt :n, "n for Ngrams", :type => :int, :default => 4 - opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false + opt :fix, "Do not output lower order Ngrams.", :type => :bool, :default => false opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" end while line = STDIN.gets a = [] - ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(' ') } + ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(" ") } a.reject! { |i| i.strip.size==0 } puts a.join conf[:separator] if a.size>0 end - @@ -1,4 +1,3 @@ #!/bin/sh tr '[:digit:]' $1 < $2 > $(basename $2 ${2##*.})nn.${2##*.} - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" files = [] (0..1).each { |i| files << ReadFile.new(ARGV[i]) } @@ -9,10 +9,9 @@ files = [] while line_f = files[0].gets line_e = files[1].gets line_f.strip!; line_e.strip! - next if line_f=='' || line_e=='' + next if line_f=="" || line_e=="" files[2].write line_f+"\n" files[3].write line_e+"\n" end files.each { |f| f.close } - diff --git a/no-non-printables b/no-non-printables index 9f9e3f9..2fb6f65 100755 --- a/no-non-printables +++ b/no-non-printables @@ -1,4 +1,3 @@ #!/bin/sh -sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g' - +sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g' diff --git a/nonbreaking_prefixes/README.txt b/nonbreaking-prefixes/README.txt index 02cdfcc..02cdfcc 100644 --- a/nonbreaking_prefixes/README.txt +++ b/nonbreaking-prefixes/README.txt diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ca b/nonbreaking-prefixes/nonbreaking_prefix.ca index 2f4fdfc..2f4fdfc 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.ca +++ b/nonbreaking-prefixes/nonbreaking_prefix.ca diff --git a/nonbreaking_prefixes/nonbreaking_prefix.cs b/nonbreaking-prefixes/nonbreaking_prefix.cs index dce6167..dce6167 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.cs +++ b/nonbreaking-prefixes/nonbreaking_prefix.cs diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking-prefixes/nonbreaking_prefix.de index 35fdf5e..35fdf5e 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.de +++ b/nonbreaking-prefixes/nonbreaking_prefix.de diff --git a/nonbreaking_prefixes/nonbreaking_prefix.el b/nonbreaking-prefixes/nonbreaking_prefix.el index 0470f91..0470f91 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.el +++ b/nonbreaking-prefixes/nonbreaking_prefix.el diff --git a/nonbreaking_prefixes/nonbreaking_prefix.en b/nonbreaking-prefixes/nonbreaking_prefix.en index e1a3733..e1a3733 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.en +++ b/nonbreaking-prefixes/nonbreaking_prefix.en diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking-prefixes/nonbreaking_prefix.es index d8b2755..d8b2755 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.es +++ b/nonbreaking-prefixes/nonbreaking_prefix.es diff --git a/nonbreaking_prefixes/nonbreaking_prefix.fr b/nonbreaking-prefixes/nonbreaking_prefix.fr index 28126fa..28126fa 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.fr +++ b/nonbreaking-prefixes/nonbreaking_prefix.fr diff --git a/nonbreaking_prefixes/nonbreaking_prefix.is b/nonbreaking-prefixes/nonbreaking_prefix.is index 5b8a710..5b8a710 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.is +++ b/nonbreaking-prefixes/nonbreaking_prefix.is diff --git a/nonbreaking_prefixes/nonbreaking_prefix.it b/nonbreaking-prefixes/nonbreaking_prefix.it index 992b9ec..992b9ec 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.it +++ b/nonbreaking-prefixes/nonbreaking_prefix.it diff --git a/nonbreaking_prefixes/nonbreaking_prefix.nl b/nonbreaking-prefixes/nonbreaking_prefix.nl index c80c417..c80c417 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.nl +++ b/nonbreaking-prefixes/nonbreaking_prefix.nl diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pl b/nonbreaking-prefixes/nonbreaking_prefix.pl index 6b7c106..6b7c106 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.pl +++ b/nonbreaking-prefixes/nonbreaking_prefix.pl diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pt b/nonbreaking-prefixes/nonbreaking_prefix.pt index 5d65bf2..5d65bf2 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.pt +++ b/nonbreaking-prefixes/nonbreaking_prefix.pt diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ro b/nonbreaking-prefixes/nonbreaking_prefix.ro index d489f46..d489f46 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.ro +++ b/nonbreaking-prefixes/nonbreaking_prefix.ro diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ru b/nonbreaking-prefixes/nonbreaking_prefix.ru index 444465b..444465b 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.ru +++ b/nonbreaking-prefixes/nonbreaking_prefix.ru diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sk b/nonbreaking-prefixes/nonbreaking_prefix.sk index 1198d48..1198d48 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.sk +++ b/nonbreaking-prefixes/nonbreaking_prefix.sk diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sl b/nonbreaking-prefixes/nonbreaking_prefix.sl index 230062c..230062c 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.sl +++ b/nonbreaking-prefixes/nonbreaking_prefix.sl diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sv b/nonbreaking-prefixes/nonbreaking_prefix.sv index df5ef29..df5ef29 100644 --- a/nonbreaking_prefixes/nonbreaking_prefix.sv +++ b/nonbreaking-prefixes/nonbreaking_prefix.sv @@ -5,4 +5,3 @@ sum=$(cat $1 | sum) for i in `cat $1`; do echo "$i" | div $sum done - diff --git a/norm-german b/norm-german index 85a39da..5c41f98 100755 --- a/norm-german +++ b/norm-german @@ -1,23 +1,23 @@ #!/usr/bin/env ruby -require 'thread' -require 'optimist' +require "thread" +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" conf = Optimist::options do banner "norm_german < <file w/ lowercased tokens>" opt :upper, "uppercase", :type => :bool, :default => false - opt :threads, "#threads", :type => :int, :default => 1, :short => '-h' + opt :threads, "#threads", :type => :int, :default => 1, :short => "-h" opt :shard_size, "shard size", :type => :int, :default => 1000 opt :train, "train", :type => :bool opt :apply, "apply", :type => :bool end -pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ] -pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ] +pairs_lower = [ ["ß","ss"], ["ue", "ü"], ["ae","ä"], ["oe", "ö"] ] +pairs_upper = [ ["Ä", "Ae"], ["Ö", "Oe"], ["Ü", "Ue"] ] if conf[:upper] PAIRS = pairs_lower else @@ -84,4 +84,3 @@ token_stock.each { |i| h.merge! build_partial i end } - diff --git a/norm-hyphens b/norm-hyphens index 4a152a1..6491d13 100755 --- a/norm-hyphens +++ b/norm-hyphens @@ -1,4 +1,3 @@ -#!/bin/zsh -x +#!/bin/zsh sed "s|[ \t]\+\xc2\xad[ \t]\+||g" - @@ -3,10 +3,10 @@ # http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal # https://www.cs.tut.fi/~jkorpela/chars/spaces.html -require 'htmlentities' +require "htmlentities" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" coder = HTMLEntities.new @@ -24,7 +24,7 @@ while line = STDIN.gets line.gsub! /[\u{e000}-\u{f8ff}]/, " " # UTF-8 PUA line.gsub! /[\u{f0000}-\u{ffffd}]/, " " line.gsub! /[\u{100000}-\u{10fffd}]/, " " - line.gsub! "\r", " " # carriage return + line.gsub! "\r", " " # carriage return line.gsub! /[\u{2000}-\u{200f}]/, " " # EN QUAD -- RIGHT-TO-LEFT MARK line.gsub! /[\u{2028}-\u{202f}]/, " " # LINE SEPARATOR -- NARROW NO-BREAK SPACE line.gsub! /[\u{205f}-\u{206f}]/, " " # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES @@ -32,4 +32,3 @@ while line = STDIN.gets line.gsub! /[[:space:]]+/, " " # collapse space puts coder.decode(line) end - @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets puts line.strip.split.length end - @@ -1,11 +1,10 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" i = 1 while line = STDIN.gets puts line if i%2!=0 i+=1 end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = {} a.default = 0 @@ -11,10 +11,9 @@ ReadFile.readlines_strip(ARGV[1]).map { |segment| b[segment] += 1 } overlap = 0 a.each_key { |seg| - puts b[seg] - overlap = overlap+b[seg] + puts b[seg] + overlap = overlap+b[seg] } puts "---" puts overlap - diff --git a/paste-pairs b/paste-pairs index f6b8b31..7e08329 100755 --- a/paste-pairs +++ b/paste-pairs @@ -1,10 +1,8 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import sys -from itertools import izip - -for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))): - print linenr, (src_line.strip()) - print linenr, (tgt_line.strip()) - print +for linenr, (src_line, tgt_line) in enumerate(zip(open(sys.argv[1]), open(sys.argv[2]))): + print(linenr, src_line.strip()) + print(linenr, tgt_line.strip()) + print() diff --git a/per-sentence-bleu b/per-sentence-bleu index 257eb3a..d815dc9 100755 --- a/per-sentence-bleu +++ b/per-sentence-bleu @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do - opt :input, "input", :type => :string, :default => '-' + opt :input, "input", :type => :string, :default => "-" opt :references, "references", :type => :string, :required => true opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0 opt :n, "N", :default => 4 @@ -16,7 +16,7 @@ def main input = ReadFile.new conf[:input] while line = input.gets i += 1 - if line.strip == '' + if line.strip == "" puts 0.0 next end @@ -26,4 +26,3 @@ def main end main - diff --git a/per-sentence-bleu-kbest b/per-sentence-bleu-kbest index dad1607..12a9f6f 100755 --- a/per-sentence-bleu-kbest +++ b/per-sentence-bleu-kbest @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do - opt :kbests, "kbests", :type => :string, :default => '-' + opt :kbests, "kbests", :type => :string, :default => "-" opt :references, "references", :type => :string, :required => true end refs = ReadFile.new conf[:references] @@ -19,7 +19,7 @@ def main scores.each_with_index { |x,j| puts "#{j+1} ||| #{scores[j]} ||| #{list[j]}" if scores[j]==max && !o - puts "^^^ #{j+1} #{max}" + puts "^^^ #{j+1} #{max}" o = true end } @@ -29,4 +29,3 @@ def main end main - diff --git a/per-sentence-ter b/per-sentence-ter index 1a7670e..777d39c 100755 --- a/per-sentence-ter +++ b/per-sentence-ter @@ -1,14 +1,14 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' -require 'tempfile' +require "zipf" +require "optimist" +require "tempfile" def main conf = Optimist::options do - opt :input, "input", :type => :string, :default => '-' + opt :input, "input", :type => :string, :default => "-" opt :references, "references", :type => :string, :required => true - opt :mteval_bin, "cdec's mteval/fast_score", :type => :string, :default => '`/toolbox/cdec-dtrain/mteval/fast_score' + opt :mteval_bin, "cdec mteval/fast_score", :type => :string, :default => "`/toolbox/cdec-dtrain/mteval/fast_score" end refs = ReadFile.readlines_strip conf[:references] @@ -17,8 +17,8 @@ def main while line = input.gets line.strip! i += 1 - a = Tempfile.new 'pster' - b = Tempfile.new 'pster' + a = Tempfile.new "pster" + b = Tempfile.new "pster" a.write line+"\n" b.write refs[i]+"\n" a.close; b.close @@ -30,4 +30,3 @@ def main end main - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" data = [] while line = STDIN.gets @@ -18,4 +18,3 @@ if index.to_i == index else puts (data[index.to_i-1] + data[index.to_i]) / 2.0 end - @@ -4,4 +4,3 @@ pow = ARGV[0].to_f while line = STDIN.gets puts line.to_f**pow end - @@ -1,9 +1,8 @@ #!/bin/bash -pushd `dirname $0` > /dev/null -P=`pwd -P` +pushd "$(dirname "$0")" > /dev/null +P="$(pwd -P)" popd > /dev/null LANG=$1 $P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | $P/lowercase.perl 2>lowercase.$LANG.err - diff --git a/preprocess-no-lower b/preprocess-no-lower index afd87e9..7e3ad91 100755 --- a/preprocess-no-lower +++ b/preprocess-no-lower @@ -1,9 +1,8 @@ #!/bin/bash -pushd `dirname $0` > /dev/null -P=`pwd -P` +pushd "$(dirname "$0")" > /dev/null +P="$(pwd -P)" popd > /dev/null LANG=$1 $P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err - @@ -1,10 +1,10 @@ #!/usr/bin/env ruby -require 'bloom-filter' -require 'optimist' +require "bloom-filter" +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" conf = Optimist::options do opt :size, "number of entries in the filter", :type => :int, :required => true @@ -19,6 +19,5 @@ while line = STDIN.gets f.insert(src+" ||| "+tgt) end -f.dump('pt.bloom') +f.dump("pt.bloom") f.close - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = ReadFile.readlines_strip ARGV[0] h = {} @@ -21,4 +21,3 @@ while line = STDIN.gets puts line end end - diff --git a/remove-devtest b/remove-devtest index 8e026f9..f322a6e 100755 --- a/remove-devtest +++ b/remove-devtest @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" train_src = ReadFile.new ARGV[0] train_tgt = ReadFile.new ARGV[1] @@ -39,7 +39,7 @@ while line_src = train_src.gets line_src_downcase = line_src line_tgt_downcase = line_tgt end - + if not devtest_h_src.has_key? line_src_downcase and not devtest_h_src.has_key? line_tgt_downcase \ and not devtest_h_tgt.has_key? line_src_downcase and not devtest_h_tgt.has_key? line_tgt_downcase train_src_out.write line_src diff --git a/remove-test-from-bitext b/remove-test-from-bitext index 43038d3..911a893 100755 --- a/remove-test-from-bitext +++ b/remove-test-from-bitext @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" test_source = ReadFile.new ARGV[0] test_target = ReadFile.new ARGV[1] @@ -13,7 +13,7 @@ while test_source_line = test_source.gets test_source_line.strip! test_target_line = test_target.gets test_target_line.strip! - + all_test_source_lines[test_source_line] = true all_test_target_lines[test_target_line] = true end diff --git a/repetition-rate b/repetition-rate index 87938ae..12e0fab 100755 --- a/repetition-rate +++ b/repetition-rate @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" windows = [] cur = [] @@ -9,7 +9,7 @@ while line = STDIN.gets if cur_sz >= 1000 windows << cur cur = [] - cur_sz = 0 + cur_sz = 0 end cur << line.strip cur_sz += cur.last.split.size @@ -37,8 +37,7 @@ windows.each { |w| rr = 1.0 enums.each_with_index { |i,j| - rr *= i/denoms[j] + rr *= i/denoms[j] } puts ((rr**0.25)*100).round 2 - @@ -4,4 +4,3 @@ r = ARGV[0].to_i while line = STDIN.gets puts line.to_f.round r end - diff --git a/rule-shapes b/rule-shapes index 589a670..91f8092 100755 --- a/rule-shapes +++ b/rule-shapes @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" def shape s res = [] @@ -24,6 +24,5 @@ end while line = STDIN.gets f, e = line.split(/\t/) f.strip!; e.strip! - puts shape(f).join('_')+"-"+shape(e).join('_') + puts shape(f).join("_")+"-"+shape(e).join("_") end - @@ -1,15 +1,15 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" opts = Optimist::options do banner "sample --size <n> [--shuffle] --file <line separated data>" opt :size, "Sample P % or # lines from file or N.", :type => :float opt :shuffle, "Sample is shuffled.", :type => :bool - opt :file, "Input file.", :type => :string, :default => '-' + opt :file, "Input file.", :type => :string, :default => "-" opt :output_index, "Output index number.", :type => :bool opt :N, "Sample --size from N items.", :type => :int, :default => -1 opt :absolute, "Sample absolute number of items.", :type => :bool @@ -19,10 +19,10 @@ input = [] index = [] i = 0 if opts[:N] == -1 - if opts[:file] == '-' + if opts[:file] == "-" file = STDIN else - file = File.new opts[:file], 'r' + file = File.new opts[:file], "r" end while line = file.gets input << line @@ -36,7 +36,6 @@ end sample = [] if !opts[:absolute] sample = index.sample(index.size*(opts[:size]/100.0)) - sample = index.sample(index.size*(opts[:size]/100.0)) else sample = index.sample(opts[:size]) end @@ -56,4 +55,3 @@ while idx = sample.shift end end end - @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'optimist' -require 'zipf' +require "optimist" +require "zipf" opts = Optimist::options do banner "sample --index <n> [--shuffle] [--file <line separated data>]" - opt :file, "Input file.", :type => :string, :default => '-' + opt :file, "Input file.", :type => :string, :default => "-" opt :index, "Index file.", :type => :string, :required => true end @@ -15,4 +15,3 @@ index = ReadFile.readlines_strip(opts[:index]).map{ |i| i.to_i } index.each { |i| puts input[i] } - diff --git a/select-from b/select-from index 0ccfeac..e9a394d 100755 --- a/select-from +++ b/select-from @@ -1,13 +1,13 @@ #!/usr/bin/env ruby -require 'optimist' -require 'zipf' +require "optimist" +require "zipf" opts = Optimist::options do banner "select_from [--invert] -i <file> < <line separated data>" - opt :index, "Line numbers to output.", :type => :string, :short => '-i', :required => true - opt :invert, "Invert selection.", :type => :bool, :short => '-j', :default => false - opt :from1, "Index starting from 1.", :type => :bool, :short => '-k', :default => false + opt :index, "Line numbers to output.", :type => :string, :short => "-i", :required => true + opt :invert, "Invert selection.", :type => :bool, :short => "-j", :default => false + opt :from1, "Index starting from 1.", :type => :bool, :short => "-k", :default => false end accept = {} @@ -30,4 +30,3 @@ while line = STDIN.gets end i += 1 end - diff --git a/sentencepiece-decode b/sentencepiece-decode index 5e07ffa..e715d09 100755 --- a/sentencepiece-decode +++ b/sentencepiece-decode @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line = line.split.join "" puts line.gsub "▁", " " end - @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false) lc = `wc -l #{input}`.split.first.to_i - input_ext = input.split('.').last - refs_ext = refs.split('.').last + input_ext = input.split(".").last + refs_ext = refs.split(".").last index = (0..lc-1).to_a index.reverse! index.shuffle! if rand @@ -68,13 +68,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false end opts = Optimist::options do - opt :input, 'input', :type => :string, :required => true - opt :references, 'references', :type => :string, :required => true - opt :alignments, 'alignments', :type => :string, :required => true - opt :output_prefix, 'output prefix', :type => :string, :required => true - opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z' - opt :num_shards, 'number of shards', :type => :int, :required => true + opt :input, "input", :type => :string, :required => true + opt :references, "references", :type => :string, :required => true + opt :alignments, "alignments", :type => :string, :required => true + opt :output_prefix, "output prefix", :type => :string, :required => true + opt :randomize, "randomize", :type => :bool, :default => false, :short => "-z" + opt :num_shards, "number of shards", :type => :int, :required => true end make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize]) - diff --git a/sort-features b/sort-features index 88bd779..a91fb00 100755 --- a/sort-features +++ b/sort-features @@ -7,4 +7,3 @@ while line = STDIN.gets end h.sort_by { |name, value| -value }.each { |name, value| puts "#{name}\t#{value}" } - diff --git a/source-sides b/source-sides index b4490c6..9243f17 100755 --- a/source-sides +++ b/source-sides @@ -1,4 +1,3 @@ -#!/bin/zsh -x +#!/bin/zsh split_pipes -f 2 | sort | uniq | sed "s| |_|g" | sed "s|\[X,[12]\]|NX|g" - diff --git a/split-kbest b/split-kbest index ab425b0..52773e8 100755 --- a/split-kbest +++ b/split-kbest @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" def write_kbest l, fn f = WriteFile.new fn @@ -21,4 +21,3 @@ while line = STDIN.gets l << line end write_kbest l, "#{dir}/#{i}.gz" # last one - diff --git a/split-lines b/split-lines index 14b3a0f..0d036c3 100755 --- a/split-lines +++ b/split-lines @@ -1,14 +1,13 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" dir = ARGV[0] i = 0 while line = STDIN.gets src, tgt = line.split " ||| " - f = WriteFile.new "#{dir}/#{i}.src" + f = WriteFile.new "#{dir}/#{i}.src" f.write line f.close i += 1 end - diff --git a/split-pipes b/split-pipes index 862e8be..58dcac4 100755 --- a/split-pipes +++ b/split-pipes @@ -1,9 +1,9 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" conf = Optimist::options do banner "splitpipes -f <n> < <input>" @@ -32,10 +32,10 @@ end while line = STDIN.gets j = 1 - line.strip.split(' ||| ').each { |i| + line.strip.split(" ||| ").each { |i| if range && (conf[:field]..conf[:to]).include?(j) a << i.strip - elsif j == conf[:field] + elsif j == conf[:field] puts i.strip break end @@ -46,6 +46,3 @@ while line = STDIN.gets end a.clear end - - - @@ -3,4 +3,3 @@ while line = STDIN.gets puts Math.sqrt line.to_f end - diff --git a/stanford-parser-run b/stanford-parser-run index f8d4210..37efacd 100755 --- a/stanford-parser-run +++ b/stanford-parser-run @@ -1,7 +1,7 @@ #!/bin/bash if [ $# != 1 ]; then - echo "$0 text-file" + echo "$0 text-file" exit 1 fi @@ -10,4 +10,3 @@ export CLASSPATH=:/toolbox/stanfordparser_3_2_0/* IN=$1 cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "stddev [-r <d>] < <one number per line>" @@ -37,4 +37,3 @@ if conf[:round] >= 0 else puts stddev end - @@ -3,4 +3,3 @@ while line = STDIN.gets puts line.strip end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" f = ReadFile.new ARGV[0] g = ReadFile.new ARGV[1] @@ -10,4 +10,3 @@ while line1 = f.gets d = line1.to_f - line2.to_f puts d end - @@ -6,4 +6,3 @@ while line = STDIN.gets end puts sum - @@ -1,8 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets puts tokenize(line.strip).size end - @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do @@ -15,8 +15,8 @@ def main stopwords = [] if conf[:filter_stopwords] stopwords = ReadFile.readlines(conf[:filter_stopwords]).map{ |i| - i.split('|').first.strip - }.reject{ |i| i=='' } + i.split("|").first.strip + }.reject{ |i| i=="" } end docs = {} @@ -54,4 +54,3 @@ def main end main - diff --git a/tmx-extract.py b/tmx-extract index 00f18f5..7791eb6 100755 --- a/tmx-extract.py +++ b/tmx-extract @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # # Adapted from Apertium # http://wiki.apertium.org/wiki/Tools_for_TMX @@ -16,43 +16,43 @@ class TMXHandler(ContentHandler): self.files = {} self.files[slang] = sfile self.files[tlang] = tfile - self.inTag = '' - self.note = '' - self.tuid = '' - self.type = '' + self.inTag = "" + self.note = "" + self.tuid = "" + self.type = "" self.cur_pair = set() - self.cur_lang = '' + self.cur_lang = "" self.seg = {} - self.seg[slang] = '' - self.seg[tlang] = '' + self.seg[slang] = "" + self.seg[tlang] = "" def startElement(self, name, attrs): - if name == 'tu': + if name == "tu": self.cur_pair = set() - self.inTag = 'tu' - self.tuid = attrs.get('tuid','') - self.type = attrs.get('datatype','') - elif name == 'note': - self.inTag = 'note' + self.inTag = "tu" + self.tuid = attrs.get("tuid", "") + self.type = attrs.get("datatype", "") + elif name == "note": + self.inTag = "note" self.note = "" - elif name == 'tuv': - self.inTag = 'tuv' - self.cur_lang = attrs.get('xml:lang', '') + elif name == "tuv": + self.inTag = "tuv" + self.cur_lang = attrs.get("xml:lang", "") self.cur_pair.add(self.cur_lang) - elif name == 'seg': - self.inTag = 'seg' + elif name == "seg": + self.inTag = "seg" if self.cur_lang in self.pair: - self.seg[self.cur_lang] = '' + self.seg[self.cur_lang] = "" def characters (self, c): - if self.inTag == 'note': + if self.inTag == "note": self.note += c - elif self.inTag == 'seg' and self.cur_lang in self.pair: + elif self.inTag == "seg" and self.cur_lang in self.pair: self.seg[self.cur_lang] += c def endElement(self, name): - if name == 'tu' and self.pair == self.cur_pair: + if name == "tu" and self.pair == self.cur_pair: for lang in self.cur_pair: self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip())) @@ -61,16 +61,15 @@ if __name__ == "__main__": parser = make_parser() if len(sys.argv) < 3: - print('Usage: tmx-extract.py <file> <slang> <tlang>') - print('') + print(f"Usage: {sys.argv[0]} <file> <slang> <tlang>") + print() sys.exit(-1) - sfile_path = sys.argv[1] + "." + sys.argv[2] - tfile_path = sys.argv[1] + "." + sys.argv[3] + sfile_path = f"{sys.argv[1]}.{sys.argv[2]}" + tfile_path = f"{sys.argv[1]}.{sys.argv[3]}" - with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile: + with open(sfile_path, "w+") as sfile, open(tfile_path, "w+") as tfile: curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) parser.setContentHandler(curHandler) - with open(sys.argv[1], 'r') as tmx_file: + with open(sys.argv[1], "r") as tmx_file: parser.parse(tmx_file) - diff --git a/tmx-extract-original-py2.py b/tmx-extract-original-py2 index cbdb491..eb39d1d 100755 --- a/tmx-extract-original-py2.py +++ b/tmx-extract-original-py2 @@ -73,4 +73,3 @@ parser.parse(open(sys.argv[1])) sfile.close() tfile.close() - diff --git a/tmx-to-plain.py b/tmx-to-plain index 07cac6f..025d6e4 100644..100755 --- a/tmx-to-plain.py +++ b/tmx-to-plain @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import argparse import datetime import sys @@ -12,37 +14,37 @@ def extract_from_tmx(tmx_file_path, date, src_out_after, tgt_out_after): - with open(tmx_file_path, 'rb') as in_fp: + with open(tmx_file_path, "rb") as in_fp: tmx_file = tmxfile(in_fp) - + if src_out_after is not None and tgt_out_after is not None: src_out_after_fp = open(src_out_after, "w") tgt_out_after_fp = open(tgt_out_after, "w") - - + + with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp: for index, node in enumerate(tmx_file.unit_iter()): src_out_fp_ = src_out_fp tgt_out_fp_ = tgt_out_fp - + if begin_date is not None: - date_string = node.get_target_dom().get('lastusagedate')[:8] - date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date() + date_string = node.get_target_dom().get("lastusagedate")[:8] + date_obj = datetime.datetime.strptime(date_string, "%Y%m%d").date() if date_obj < begin_date: continue - + if date is not None: - date_string = node.get_target_dom().get('changedate')[:8] - date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date() + date_string = node.get_target_dom().get("changedate")[:8] + date_obj = datetime.datetime.strptime(date_string, "%Y%m%d").date() if date_obj > date: src_out_fp_ = src_out_after_fp tgt_out_fp_ = tgt_out_after_fp - + src_string = f"{node.source}" tgt_string = f"{node.target}" - src_string = src_string.replace('\n', ' ').replace('\r', '') - tgt_string = tgt_string.replace('\n', ' ').replace('\r', '') - + src_string = src_string.replace("\n", " ").replace("\r", "") + tgt_string = tgt_string.replace("\n", " ").replace("\r", "") + src_out_fp_.write(f"{src_string}\n") tgt_out_fp_.write(f"{tgt_string}\n") if (index + 1) % 1000 == 0: @@ -56,7 +58,7 @@ def extract_from_tmx(tmx_file_path, def main(): - usage = "Usage: python tmx_to_plain.py [options]" + usage = f"Usage: {sys.argv[0]} [options]" parser = argparse.ArgumentParser(usage=usage) parser.add_argument("-i", "--input", help="input tmx file") parser.add_argument("-d", "--date", help="date for splitting the output") @@ -67,29 +69,27 @@ def main(): if args.input is None: parser.print_help() sys.exit(1) - - args.input - - src_out = args.input + ".src" - tgt_out = args.input + ".tgt" - - + + src_out = f"{args.input}.src" + tgt_out = f"{args.input}.tgt" + + if args.date is not None: - date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date() - src_out_after = src_out + ".after." + args.date - tgt_out_after = tgt_out + ".after." + args.date + date = datetime.datetime.strptime(args.date, "%Y-%m-%d").date() + src_out_after = f"{src_out}.after.{args.date}" + tgt_out_after = f"{tgt_out}.after.{args.date}" else: date = None src_out_after = None tgt_out_after = None - + if args.begin_date is not None: - begin_date = datetime.datetime.strptime(args.begin_date, '%Y-%m-%d').date() + begin_date = datetime.datetime.strptime(args.begin_date, "%Y-%m-%d").date() else: begin_date = None - - extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after) - -if __name__ == '__main__': + extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after) + + +if __name__ == "__main__": main() @@ -4,9 +4,8 @@ while line = STDIN.gets encoding_options = { :invalid => :replace, :undef => :replace, - :replace => '?', + :replace => "?", :universal_newline => true } - puts line.encode 'ASCII', encoding_options + puts line.encode "ASCII", encoding_options end - @@ -1,9 +1,8 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets line.strip.split(/\s/).each { |i| puts i } end - diff --git a/toks-per-line b/toks-per-line index 8a10cd4..9814f35 100755 --- a/toks-per-line +++ b/toks-per-line @@ -14,4 +14,3 @@ while line = STDIN.gets puts a.size end end - diff --git a/train-test-split b/train-test-split index 6aa4796..db5aad4 100755 --- a/train-test-split +++ b/train-test-split @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" conf = Optimist::options do opt :source, "source file", :type => :string, :required => true @@ -13,11 +13,11 @@ conf = Optimist::options do end source_filename = conf[:source] -source_extension = source_filename.split('.').last +source_extension = source_filename.split(".").last source_lines = ReadFile.readlines source_filename target_filename = conf[:target] -target_extension = target_filename.split('.').last +target_extension = target_filename.split(".").last target_lines = ReadFile.readlines target_filename size = conf[:size] diff --git a/tsv-exclude b/tsv-exclude index e951ea1..cee3923 100755 --- a/tsv-exclude +++ b/tsv-exclude @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'set' +require "zipf" +require "set" to_exclude0 = {} to_exclude1 = {} diff --git a/tsv-joint-set b/tsv-joint-set index c0dbdcf..ce77a9e 100755 --- a/tsv-joint-set +++ b/tsv-joint-set @@ -1,8 +1,8 @@ #!/usr/bin/env ruby -require 'set' -require 'zipf' -require 'optimist' +require "set" +require "zipf" +require "optimist" conf = Optimist::options do opt :n, "Desired number segments in test set.", :type => :int, :required => true @@ -50,4 +50,3 @@ outputs.each_with_index { |o,i| f.write o[0][j] + "\t" + o[1][j] + "\n" } } - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'set' +require "set" strictness = ARGV[0].to_i # 1 one-side # 2 just the pair @@ -21,14 +21,14 @@ if strictness == 1 seen = Set.new segments[side].each_with_index { |segment,i| if not seen.include? segment - puts "#{segments[i][0]}\t#{segments[i][1]}" + puts "#{segments[0][i]}\t#{segments[1][i]}" end seen << segment } elsif strictness == 2 seen = Set.new segments[0].each_index { |i| - segment_pair = [segments[i][0], segments[i][1]] + segment_pair = [segments[0][i], segments[1][i]] if not seen.include? segment_pair puts "#{segment_pair[0]}\t#{segment_pair[1]}" end @@ -46,4 +46,3 @@ elsif strictness == 3 seen_pairs << segment_pair } end - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "stddev [-r <d>] < <one number per line>" @@ -32,4 +32,3 @@ if conf[:round] >= 0 else puts var end - @@ -1,4 +1,3 @@ #!/bin/sh $(dirname $0)/toks ${1+"$@"} | sort | uniq -c - @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" d = {} while line = STDIN.gets @@ -10,4 +10,3 @@ while line = STDIN.gets end puts d.size - diff --git a/zh-ko-or-ja b/zh-ko-or-ja index 0b42386..e049704 100755 --- a/zh-ko-or-ja +++ b/zh-ko-or-ja @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'script_detector' +require "zipf" +require "script_detector" $to_code = {} $to_code["Ambiguous Chinese"] = "??" @@ -15,4 +15,3 @@ while line = STDIN.gets code = $to_code[line.identify_script] puts code end - |
