summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <patrick@lilt.com>2026-02-26 10:05:59 +0000
committerPatrick Simianer <patrick@lilt.com>2026-02-26 10:05:59 +0000
commitb31ace79ea5f6b3f279c544cd3a443d6fbf2a24d (patch)
tree31f2b599fa5f6996aeb134390d58deb63eefe04a
parent8805e95ae94d798c6441f7e1b72c90e049563f17 (diff)
overhaulHEADmaster
-rwxr-xr-xadd-index1
-rwxr-xr-xadd-ln3
-rwxr-xr-xadd-seg13
-rwxr-xr-xadd-start-end5
-rwxr-xr-xavg2
-rwxr-xr-xavg-seg-len1
-rwxr-xr-xavg-weights7
-rwxr-xr-xbishuf1
-rwxr-xr-xbitext-filter-length9
-rwxr-xr-xbitext2tmx (renamed from bitext2tmx.py)20
-rwxr-xr-xbiuniq5
-rwxr-xr-xbleu-cmp3
-rwxr-xr-xcdec-hg-to-json1
-rwxr-xr-xchars3
-rwxr-xr-xcma3
-rwxr-xr-xcumul17
-rwxr-xr-xde-sgm1
-rwxr-xr-xdiv1
-rwxr-xr-xdot7
-rwxr-xr-xeven5
-rwxr-xr-xexclude5
-rwxr-xr-xfeature-dict5
-rwxr-xr-xfilter-illegal5
-rwxr-xr-xfilter-len3
-rwxr-xr-xfilter-tokens5
-rwxr-xr-xfirst-upper3
-rwxr-xr-xfix-utf-8-pua1
-rwxr-xr-xgigaword-collapse-tags5
-rwxr-xr-xhadoop-uniq1
-rwxr-xr-xhist-tok1
-rwxr-xr-xhtmlentities9
-rwxr-xr-xinv3
-rwxr-xr-xis-first-lower5
-rwxr-xr-xjoint-set5
-rwxr-xr-xkbest-bleu-oracles5
-rwxr-xr-xkendalls-tau11
-rwxr-xr-xkey-count5
-rwxr-xr-xkmeans17
-rwxr-xr-xlang11
-rwxr-xr-xlangid-polyglot3
-rwxr-xr-xlength-ratio3
-rwxr-xr-xlin-reg7
-rwxr-xr-xlog-reg11
-rwxr-xr-xltok7
-rwxr-xr-xmake-rule-features7
-rwxr-xr-xmax3
-rwxr-xr-xmax-len5
-rwxr-xr-xmedian3
-rwxr-xr-xmerge-files3
-rwxr-xr-xmerge-ttable17
-rwxr-xr-xmin3
-rwxr-xr-xmin-max17
-rwxr-xr-xmkidx3
-rwxr-xr-xmoses-1best3
-rwxr-xr-xmoving-sum3
-rwxr-xr-xmult1
-rwxr-xr-xnfc (renamed from NFC)5
-rwxr-xr-xng9
-rwxr-xr-xnn1
-rwxr-xr-xno-empty5
-rwxr-xr-xno-non-printables3
-rw-r--r--nonbreaking-prefixes/README.txt (renamed from nonbreaking_prefixes/README.txt)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.ca (renamed from nonbreaking_prefixes/nonbreaking_prefix.ca)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.cs (renamed from nonbreaking_prefixes/nonbreaking_prefix.cs)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.de (renamed from nonbreaking_prefixes/nonbreaking_prefix.de)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.el (renamed from nonbreaking_prefixes/nonbreaking_prefix.el)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.en (renamed from nonbreaking_prefixes/nonbreaking_prefix.en)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.es (renamed from nonbreaking_prefixes/nonbreaking_prefix.es)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.fr (renamed from nonbreaking_prefixes/nonbreaking_prefix.fr)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.is (renamed from nonbreaking_prefixes/nonbreaking_prefix.is)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.it (renamed from nonbreaking_prefixes/nonbreaking_prefix.it)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.nl (renamed from nonbreaking_prefixes/nonbreaking_prefix.nl)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.pl (renamed from nonbreaking_prefixes/nonbreaking_prefix.pl)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.pt (renamed from nonbreaking_prefixes/nonbreaking_prefix.pt)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.ro (renamed from nonbreaking_prefixes/nonbreaking_prefix.ro)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.ru (renamed from nonbreaking_prefixes/nonbreaking_prefix.ru)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.sk (renamed from nonbreaking_prefixes/nonbreaking_prefix.sk)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.sl (renamed from nonbreaking_prefixes/nonbreaking_prefix.sl)0
-rw-r--r--nonbreaking-prefixes/nonbreaking_prefix.sv (renamed from nonbreaking_prefixes/nonbreaking_prefix.sv)0
-rwxr-xr-xnorm1
-rwxr-xr-xnorm-german15
-rwxr-xr-xnorm-hyphens3
-rwxr-xr-xnormchr9
-rwxr-xr-xnum-tok5
-rwxr-xr-xodd5
-rwxr-xr-xoverlap7
-rwxr-xr-xpaste-pairs12
-rwxr-xr-xper-sentence-bleu9
-rwxr-xr-xper-sentence-bleu-kbest9
-rwxr-xr-xper-sentence-ter15
-rwxr-xr-xpercentile3
-rwxr-xr-xpot1
-rwxr-xr-xpreprocess5
-rwxr-xr-xpreprocess-no-lower5
-rwxr-xr-xpt-bloom11
-rwxr-xr-xpush-rules3
-rwxr-xr-xremove-devtest4
-rwxr-xr-xremove-test-from-bitext4
-rwxr-xr-xrepetition-rate7
-rwxr-xr-xround1
-rwxr-xr-xrule-shapes7
-rwxr-xr-xsample14
-rwxr-xr-xselect7
-rwxr-xr-xselect-from11
-rwxr-xr-xsentencepiece-decode3
-rwxr-xr-xshard19
-rwxr-xr-xsort-features1
-rwxr-xr-xsource-sides3
-rwxr-xr-xsplit-kbest3
-rwxr-xr-xsplit-lines5
-rwxr-xr-xsplit-pipes13
-rwxr-xr-xsqrt1
-rwxr-xr-xstanford-parser-run3
-rwxr-xr-xstddev3
-rwxr-xr-xstrips1
-rwxr-xr-xsubtract (renamed from substract)3
-rwxr-xr-xsum1
-rwxr-xr-xtc3
-rwxr-xr-xtf-idf9
-rwxr-xr-xtmx-extract (renamed from tmx-extract.py)59
-rwxr-xr-xtmx-extract-original-py2 (renamed from tmx-extract-original-py2.py)1
-rwxr-xr-x[-rw-r--r--]tmx-to-plain (renamed from tmx-to-plain.py)62
-rwxr-xr-xto-ascii5
-rwxr-xr-xtoks7
-rwxr-xr-xtoks-per-line1
-rwxr-xr-xtrain-test-split8
-rwxr-xr-xtsv-exclude4
-rwxr-xr-xtsv-joint-set7
-rwxr-xr-xtsv-uniq7
-rwxr-xr-xvar3
-rwxr-xr-xvocab1
-rwxr-xr-xvocab-2 (renamed from vocab2)3
-rwxr-xr-xzh-ko-or-ja5
133 files changed, 320 insertions, 432 deletions
diff --git a/add-index b/add-index
index 77a7e8d..b23fefe 100755
--- a/add-index
+++ b/add-index
@@ -9,4 +9,3 @@ while line = STDIN.gets
puts "#{i}\t#{line}"
i += 1
end
-
diff --git a/add-ln b/add-ln
index 35bc44d..c98f0a0 100755
--- a/add-ln
+++ b/add-ln
@@ -3,6 +3,5 @@
i = 0
while line = STDIN.gets
puts "#{i}\t#{line}"
- i += 1
+ i += 1
end
-
diff --git a/add-seg b/add-seg
index 14b8b6b..3825494 100755
--- a/add-seg
+++ b/add-seg
@@ -1,12 +1,12 @@
#!/usr/bin/env ruby
-require 'optimist'
-require 'zipf'
+require "optimist"
+require "zipf"
o = Optimist::options do
- opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :default => nil
+ opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => "-g", :default => nil
opt :loo, "leave one out", :type => :bool, :default => false
- opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i'
+ opt :start_id, "start with this id", :type => :int, :default => 0, :short => "-i"
opt :nogz, "grammar files not gzipped", :type => :bool, :default => false
opt :index, "number according to index", :type => :string, :default => nil
end
@@ -19,8 +19,8 @@ end
i = o[:start_id]
j = 0
while line = STDIN.gets
- ext = '.gz'
- ext = '' if o[:nogz]
+ ext = ".gz"
+ ext = "" if o[:nogz]
s = "<seg"
if o[:loo] then s += " exclude=\"#{i}\"" end
if index.size > 0
@@ -33,4 +33,3 @@ while line = STDIN.gets
i += 1
j += 1
end
-
diff --git a/add-start-end b/add-start-end
index 30deaec..1e1061d 100755
--- a/add-start-end
+++ b/add-start-end
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
while line = STDIN.gets
puts "<s> #{line.strip} </s>"
end
-
diff --git a/avg b/avg
index ac912d6..6d28fa9 100755
--- a/avg
+++ b/avg
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
conf = Optimist::options do
banner "avg < <one number per line>"
diff --git a/avg-seg-len b/avg-seg-len
index ee68827..bfd4f6c 100755
--- a/avg-seg-len
+++ b/avg-seg-len
@@ -6,4 +6,3 @@ while line = STDIN.gets
end
puts lens.inject(:+)/lens.size.to_f
-
diff --git a/avg-weights b/avg-weights
index f090da9..bc734e8 100755
--- a/avg-weights
+++ b/avg-weights
@@ -1,8 +1,8 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
-require 'zlib'
+require "zipf"
+require "optimist"
+require "zlib"
conf = Optimist::options do
opt :weights_files, "a number of weights files: name value", :required => true
@@ -30,4 +30,3 @@ h.each_pair { |k,w|
next if conf[:filter] and w.size < n
puts "#{k} #{w.inject(:+)/n}"
}
-
diff --git a/bishuf b/bishuf
index 62689aa..dd86e23 100755
--- a/bishuf
+++ b/bishuf
@@ -15,4 +15,3 @@ get_random() { seed="$1"; openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt <
echo "shuffling ..."
$SHUF --random-source=<(get_random 42) $1 > $1.shuf
$SHUF --random-source=<(get_random 42) $2 > $2.shuf
-
diff --git a/bitext-filter-length b/bitext-filter-length
index d1dc973..a77f10e 100755
--- a/bitext-filter-length
+++ b/bitext-filter-length
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def main
conf = Optimist::options do
@@ -17,8 +17,8 @@ def main
opt :reverse, "length ratios alway > 1", :type => :bool, :default => false, :short => "-r"
end
- fna,fnb = conf[:inputs].split ','
- a = ReadFile.new fna
+ fna,fnb = conf[:inputs].split ","
+ a = ReadFile.new fna
b = ReadFile.new fnb
if not conf[:output_index]
@@ -62,4 +62,3 @@ def main
end
main
-
diff --git a/bitext2tmx.py b/bitext2tmx
index 1cdc4b3..e9c8e23 100755
--- a/bitext2tmx.py
+++ b/bitext2tmx
@@ -7,23 +7,21 @@ from xml.sax.saxutils import escape
if __name__ == "__main__":
prefix = """<tmx version="1.4">
<header
- creationtool="bitext2tmx.py" creationtoolversion="1.0"
+ creationtool="bitext2tmx" creationtoolversion="1.0"
datatype="PlainText" segtype="sentence"
adminlang="en-us" srclang="en"
o-tmf="ABCTransMem"/>
<body>"""
- src_file = open(sys.argv[1], "r")
- tgt_file = open(sys.argv[2], "r")
+ with open(sys.argv[1], "r") as src_file, open(sys.argv[2], "r") as tgt_file:
+ src_lang = sys.argv[1].split(".")[-1]
+ tgt_lang = sys.argv[2].split(".")[-1]
- src_lang = sys.argv[1].split(".")[-1]
- tgt_lang = sys.argv[2].split(".")[-1]
-
- tus = []
- for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()):
- src_line = src_line.rstrip("\n")
- tgt_line = tgt_line.rstrip("\n")
- tus.append(f"""
+ tus = []
+ for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()):
+ src_line = src_line.rstrip("\n")
+ tgt_line = tgt_line.rstrip("\n")
+ tus.append(f"""
<tu>
<tuv xml:lang="{src_lang}">
<seg>{escape(src_line)}</seg>
diff --git a/biuniq b/biuniq
index b191ab0..9ad2d76 100755
--- a/biuniq
+++ b/biuniq
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
f1 = ReadFile.new ARGV[0]
f2 = ReadFile.new ARGV[1]
@@ -16,7 +16,7 @@ while line1 = f1.gets
line2 = f2.gets
if line2 == nil then line2 = "" end
line2.strip!
-
+
if !d1.include? line1 and !d2.include? line2
a1 << line1
a2 << line2
@@ -33,4 +33,3 @@ a1.each_with_index { |line1,i|
o1.write line1 + "\n"
o2.write a2[i] + "\n"
}
-
diff --git a/bleu-cmp b/bleu-cmp
index ed8460c..fe5370d 100755
--- a/bleu-cmp
+++ b/bleu-cmp
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
f = ReadFile.new ARGV[0]
g = ReadFile.new ARGV[1]
@@ -20,4 +20,3 @@ while line = f.gets
puts
i += 1
end
-
diff --git a/cdec-hg-to-json b/cdec-hg-to-json
index 5a26cf7..955cd6d 100755
--- a/cdec-hg-to-json
+++ b/cdec-hg-to-json
@@ -77,4 +77,3 @@ def main():
if __name__=="__main__":
main()
-
diff --git a/chars b/chars
index 359c2ab..5fed1c7 100755
--- a/chars
+++ b/chars
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
while line = STDIN.gets
line.strip.each_char { |c|
puts c
}
end
-
diff --git a/cma b/cma
index 4647710..9e0f1f0 100755
--- a/cma
+++ b/cma
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
conf = Optimist::options do
banner "cma < <one number per line>"
@@ -20,4 +20,3 @@ while line = STDIN.gets
end
STDOUT.flush
end
-
diff --git a/cumul b/cumul
index 93a7e90..45ff03e 100755
--- a/cumul
+++ b/cumul
@@ -1,6 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
+require "tempfile"
f = ReadFile.new ARGV[0]
g = ReadFile.new ARGV[1]
@@ -17,16 +18,16 @@ while line = f.gets
sys1 << line1
sys2 << line2
- ff=File.new("/tmp/refs",'w+');ff.write(refs.join(""));ff.close
- ff=File.new("/tmp/sys1",'w+');ff.write(sys1.join(""));ff.close
- ff=File.new("/tmp/sys2",'w+');ff.write(sys2.join(""));ff.close
+ tmp_refs = Tempfile.new("refs"); tmp_refs.write(refs.join("")); tmp_refs.close
+ tmp_sys1 = Tempfile.new("sys1"); tmp_sys1.write(sys1.join("")); tmp_sys1.close
+ tmp_sys2 = Tempfile.new("sys2"); tmp_sys2.write(sys2.join("")); tmp_sys2.close
- #a = `~/multi-bleu.perl /tmp/refs < /tmp/sys1`.split[2].gsub(',','').to_f
- a = BLEU::bleu("/tmp/sys1", "/tmp/refs", 4)
- b = BLEU::bleu("/tmp/sys2", "/tmp/refs", 4)
+ a = BLEU::bleu(tmp_sys1.path, tmp_refs.path, 4)
+ b = BLEU::bleu(tmp_sys2.path, tmp_refs.path, 4)
+
+ tmp_refs.unlink; tmp_sys1.unlink; tmp_sys2.unlink
diffs << b-a
#puts ((diffs.inject(:+)/diffs.size)*100).round 2
puts (diffs[-1]*100).round 2
end
-
diff --git a/de-sgm b/de-sgm
index 3b3a8e0..8598aef 100755
--- a/de-sgm
+++ b/de-sgm
@@ -9,4 +9,3 @@ egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|r
| sed "s|\s*</speaker>\s*$||" \
| sed "s|\s*<hl>\s*$||" \
| sed "s|\s*</hl>\s*$||"
-
diff --git a/div b/div
index 93585dc..d0e036e 100755
--- a/div
+++ b/div
@@ -5,4 +5,3 @@ exit if factor==0
while line = STDIN.gets
puts line.to_f / factor
end
-
diff --git a/dot b/dot
index da0dc58..9856069 100755
--- a/dot
+++ b/dot
@@ -1,9 +1,8 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
-a = SparseVector.from_file 'w', ' '
-b = SparseVector.from_file 'f', ' '
+a = SparseVector.from_file "w", " "
+b = SparseVector.from_file "f", " "
puts a.to_s
puts a.dot b
-
diff --git a/even b/even
index dcee3d9..1a9bfd4 100755
--- a/even
+++ b/even
@@ -1,11 +1,10 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
i = 1
while line = STDIN.gets
puts line if i%2==0
i+=1
end
-
diff --git a/exclude b/exclude
index b5fe3cb..ee5a144 100755
--- a/exclude
+++ b/exclude
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'set'
+require "zipf"
+require "set"
to_exclude = {}
f = ReadFile.new ARGV[0]
@@ -14,4 +14,3 @@ while line = STDIN.gets
puts line
end
end
-
diff --git a/feature-dict b/feature-dict
index 6849769..59ff020 100755
--- a/feature-dict
+++ b/feature-dict
@@ -7,7 +7,7 @@ l_i = 1
while line = STDIN.gets
STDERR.write "#{l_i}\n" if l_i%1000==0&&not_quiet
line.split.each { |i|
- f, v = i.split('=', 2)
+ f, v = i.split("=", 2)
if !feature_dict.has_key? f
feature_dict[f] = n
n += 1
@@ -16,9 +16,8 @@ while line = STDIN.gets
l_i += 1
end
-f = File.new ARGV[0], 'w'
+f = File.new ARGV[0], "w"
f.write Marshal.dump feature_dict
f.close
STDERR.write "size = #{feature_dict.size}\n"
-
diff --git a/filter-illegal b/filter-illegal
index 8b29f3e..e44b2ac 100755
--- a/filter-illegal
+++ b/filter-illegal
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
illegal = [ "[", "]", "|||" ]
@@ -15,11 +15,10 @@ while line0 = in0.gets
illegal.each { |k|
if line0.index(k) or line1.index(k) then
skip = true
- skipi << i
+ skipi << i
end
}
i += 1
end
skipi.each { |j| puts j }
-
diff --git a/filter-len b/filter-len
index fe45b57..1756849 100755
--- a/filter-len
+++ b/filter-len
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
a = ReadFile.new ARGV[0]
b = ReadFile.new ARGV[1]
@@ -24,4 +24,3 @@ a.close
b.close
a_out.close
b_out.close
-
diff --git a/filter-tokens b/filter-tokens
index 00c8f2c..c851bd3 100755
--- a/filter-tokens
+++ b/filter-tokens
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
bad_words = {}
ReadFile.readlines_strip(ARGV[0]).each { |line|
@@ -13,11 +13,10 @@ while line = STDIN.gets
tokens = line.split
bad_words.keys.each { |w|
if tokens.include? w
- bad = true
+ bad = true
break
end
}
puts i if bad
i += 1
end
-
diff --git a/first-upper b/first-upper
index 610e62c..f9b2ce9 100755
--- a/first-upper
+++ b/first-upper
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
while line = STDIN.gets
line.strip!
line[0] = line[0].upcase
puts line
end
-
diff --git a/fix-utf-8-pua b/fix-utf-8-pua
index 674d424..da77850 100755
--- a/fix-utf-8-pua
+++ b/fix-utf-8-pua
@@ -7,4 +7,3 @@ while line = STDIN.gets
line.gsub! /[\u{e000}-\u{f8ff}]/, " "
puts line
end
-
diff --git a/gigaword-collapse-tags b/gigaword-collapse-tags
index cbaf7d7..f2339c4 100755
--- a/gigaword-collapse-tags
+++ b/gigaword-collapse-tags
@@ -2,8 +2,8 @@
# works with gigaword en v5
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
in_p = false
in_dateline = false
@@ -36,4 +36,3 @@ while line = STDIN.gets
puts line
end
end
-
diff --git a/hadoop-uniq b/hadoop-uniq
index 5052419..5f37fa4 100755
--- a/hadoop-uniq
+++ b/hadoop-uniq
@@ -8,4 +8,3 @@ $HADOOP_HOME/bin/hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-output d.uniq \
-mapper 'cut -d " " -f 1' \
-reducer /usr/bin/uniq
-
diff --git a/hist-tok b/hist-tok
index b81604f..3e1d453 100755
--- a/hist-tok
+++ b/hist-tok
@@ -21,4 +21,3 @@ sorted.sort_by! { |i|
sorted.each { |i|
puts "#{i[0]}\t#{i[1]}"
}
-
diff --git a/htmlentities b/htmlentities
index f3c2d34..c0ccc0a 100755
--- a/htmlentities
+++ b/htmlentities
@@ -1,9 +1,9 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
-require 'htmlentities'
+require "htmlentities"
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
coder = HTMLEntities.new
@@ -11,4 +11,3 @@ coder = HTMLEntities.new
while line = STDIN.gets
puts coder.decode(line.strip)
end
-
diff --git a/inv b/inv
index b13443f..aaa4783 100755
--- a/inv
+++ b/inv
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
def main
conf = Optimist::options do
@@ -30,4 +30,3 @@ def main
end
main
-
diff --git a/is-first-lower b/is-first-lower
index 1cddb8e..a7e2073 100755
--- a/is-first-lower
+++ b/is-first-lower
@@ -1,11 +1,10 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
while line = STDIN.gets
line.strip!
- if line && line!='' && line[0].downcase?
+ if line && line!="" && line[0].downcase?
puts line
end
end
-
diff --git a/joint-set b/joint-set
index b9b9b22..a295862 100755
--- a/joint-set
+++ b/joint-set
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'set'
-require 'zipf'
+require "set"
+require "zipf"
n = ARGV.pop.to_i
@@ -27,4 +27,3 @@ all_sets.each { |set|
joint_set.each { |i|
puts i
}
-
diff --git a/kbest-bleu-oracles b/kbest-bleu-oracles
index ea76ab1..03f321d 100755
--- a/kbest-bleu-oracles
+++ b/kbest-bleu-oracles
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def get_context kbest_lists, references, n
a = []
@@ -48,4 +48,3 @@ def main
end
main
-
diff --git a/kendalls-tau b/kendalls-tau
index c0c20be..24f0744 100755
--- a/kendalls-tau
+++ b/kendalls-tau
@@ -2,7 +2,7 @@
#################################################
# reads space delimted pairs of scores as input,
-# outputs Kendall's τ
+# outputs Kendall"s τ
#################################################
def kendall_with_ties l
@@ -13,7 +13,7 @@ def kendall_with_ties l
l.each_with_index { |k,i|
l[i+1,l.size].each_with_index { |m,j|
if (k.first < m.first && k[1] < m[1]) ||
- (k.first > m.first && k[1] > m[1])
+ (k.first > m.first && k[1] > m[1])
concordant += 1
elsif (k.first == m.first && k[1] != m[1])
tie_a += 1
@@ -24,7 +24,7 @@ def kendall_with_ties l
end
}
}
-
+
return (concordant-disconcordant)/(Math.sqrt((concordant+disconcordant+tie_a)*(concordant+disconcordant+tie_b)))
end
@@ -34,7 +34,7 @@ def kendall l
l.each_with_index { |k,i|
l[i+1,l.size].each_with_index { |m,j|
if (k.first <= m.first && k[1] <= m[1]) ||
- (k.first >= m.first && k[1] >= m[1])
+ (k.first >= m.first && k[1] >= m[1])
concordant += 1
else
disconcordant += 1
@@ -60,7 +60,7 @@ def main
a,b = line.split
l << [a.to_f, b.to_f]
end
-
+
v = -1
if has_ties? l
v = kendall_with_ties l
@@ -72,4 +72,3 @@ def main
end
main
-
diff --git a/key-count b/key-count
index deaa522..b853362 100755
--- a/key-count
+++ b/key-count
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
h = {}
h.default = 0
@@ -11,4 +11,3 @@ while line = STDIN.gets
end
h.each_pair { |k,v| puts "#{k} #{v}" }
-
diff --git a/kmeans b/kmeans
index dcf7774..f49fc53 100755
--- a/kmeans
+++ b/kmeans
@@ -1,12 +1,12 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
- a = i.split ' ', 2
+ a = i.split " ", 2
v = SparseVector.from_kv a.last
data[a.first] = v
}
@@ -30,7 +30,7 @@ end
def assign centroids, data
assignment = {}
data.each_pair { |name,feature_vector|
- min = 1.0/0
+ min = Float::INFINITY
min_index = nil
centroids.each_with_index { |c,i|
dist = c.euclidian_dist(feature_vector)
@@ -61,10 +61,10 @@ def main
opt :k, "k", :type => :int, :required => true
opt :input, "input: one feature vector per line", :type => :string, :required => true
opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
- opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
- opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
+ opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => "-n", :default => 3
+ opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => "-j", :default => 2
end
- # data is 'ID f1=v1 f2=v2'
+ # data is "ID f1=v1 f2=v2"
data = read_data conf[:input]
k = conf[:k]
centroids = nil
@@ -86,7 +86,7 @@ def main
STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
0.upto(conf[:max_iterations]) do |i|
s = "iteration #{i}"
- STDERR.write "#{s}\n#{'-'*s.size}\n"
+ STDERR.write "#{s}\n#{"-" * s.size}\n"
assignment = assign centroids, data
sizes = []
assignment.each_pair { |centroid_index, a|
@@ -114,4 +114,3 @@ def main
end
main
-
diff --git a/lang b/lang
index 5caebd1..1b498d1 100755
--- a/lang
+++ b/lang
@@ -1,14 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
import sys
import langdetect
from_stdin = False
-if sys.argv[1] == '-':
+if sys.argv[1] == "-":
f = sys.stdin
from_stdin = True
else:
- f = open(sys.argv[1], 'r')
+ f = open(sys.argv[1], "r")
try:
l = sys.argv[2].strip()
@@ -32,7 +32,7 @@ if min_p and not l:
if strict and not min_p:
strict = False
-
+
factory = langdetect.detector_factory.DetectorFactory()
factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY)
@@ -71,5 +71,4 @@ for line in f:
print("unk")
if not from_stdin:
- f.close
-
+ f.close()
diff --git a/langid-polyglot b/langid-polyglot
index 0b0b20c..04f6b3b 100755
--- a/langid-polyglot
+++ b/langid-polyglot
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
import polyglot
from polyglot.detect import Detector
@@ -15,4 +15,3 @@ for line in fileinput.input():
except polyglot.detect.base.UnknownLanguage:
print("??")
pass
-
diff --git a/length-ratio b/length-ratio
index 4b4432d..5b38826 100755
--- a/length-ratio
+++ b/length-ratio
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
a = ReadFile.new ARGV[0]
b = ReadFile.new ARGV[1]
@@ -9,4 +9,3 @@ while linea = a.gets
lineb = b.gets
puts linea.strip.split.size.to_f / lineb.strip.split.size.to_f
end
-
diff --git a/lin-reg b/lin-reg
index 87dded5..eb9193e 100755
--- a/lin-reg
+++ b/lin-reg
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def read_data fn, scale
f = ReadFile.new fn
@@ -29,7 +29,7 @@ def main
opt :output, "output data", :type => :string, :required => true
opt :learning_rate, "learning rate", :type => :float, :default => 0.07
opt :stop, "stopping criterion", :type => :int, :default => 100
- opt :scale_features,"scale features", :type => :bool, :default => false, :short => '-t'
+ opt :scale_features,"scale features", :type => :bool, :default => false, :short => "-t"
opt :show_loss, "show loss per iter", :type => :bool, :default => false
end
data = read_data conf[:input], conf[:scale_features]
@@ -67,4 +67,3 @@ def main
end
main
-
diff --git a/log-reg b/log-reg
index 5e43555..99d9153 100755
--- a/log-reg
+++ b/log-reg
@@ -1,8 +1,8 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'matrix'
-require 'optimist'
+require "zipf"
+require "matrix"
+require "optimist"
def read_data fn
f = ReadFile.new fn
@@ -30,7 +30,7 @@ def approx_eql x, y, eps=10**-10
return false if !x||!y
return false if x.size!=y.size
x.each_with_index { |_,i|
- return false if (x[i]-y[i]).abs>eps
+ return false if (x[i]-y[i]).abs>eps
}
return true
end
@@ -48,7 +48,7 @@ def main
prev_model = nil
gradient = Vector.elements zeros
hessian = Matrix.build(dim,dim) { |i,j| 0.0 }
- i = 0
+ i = 0
while true
i += 1
data.each_with_index { |x,j|
@@ -68,4 +68,3 @@ def main
end
main
-
diff --git a/ltok b/ltok
index c90823e..fc25a65 100755
--- a/ltok
+++ b/ltok
@@ -1,9 +1,8 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
while line = STDIN.gets
puts line.strip.split(/\s/).size
end
-
diff --git a/make-rule-features b/make-rule-features
index 7adb6e9..ae2cecc 100755
--- a/make-rule-features
+++ b/make-rule-features
@@ -1,10 +1,10 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
def mkrf src, tgt
s = src.gsub /\[X,[1-9]\]/, "NX"
- t = tgt.gsub /\[X,([1-9])\]/,'N\1'
+ t = tgt.gsub /\[X,([1-9])\]/,"N\1"
return "R:X:#{s.gsub(" ","_")}:#{t.gsub(" ","_")}"
end
@@ -13,7 +13,7 @@ def mkrbf s, t
if t == "S"
s.gsub! /\[X,[1-9]\]/, "X"
else
- s.gsub! /\[X,([1-9])\]/, 'X\1'
+ s.gsub! /\[X,([1-9])\]/, "X\1"
end
s.reverse!
s += " >r<"
@@ -41,4 +41,3 @@ while line = STDIN.gets
end
h.keys.each { |f| puts f }
-
diff --git a/max b/max
index b2c1cae..15d0003 100755
--- a/max
+++ b/max
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-max = -1.0/0
+max = -Float::INFINITY
while line = STDIN.gets
v = line.to_f
max = v if v > max
end
puts max
-
diff --git a/max-len b/max-len
index 69013b5..dab684f 100755
--- a/max-len
+++ b/max-len
@@ -1,11 +1,11 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
max = ARGV[0].to_i
i = 0
-while line = STDIN.gets
+while line = STDIN.gets
if tokenize(line).size <= max
puts i
else
@@ -13,4 +13,3 @@ while line = STDIN.gets
end
i += 1
end
-
diff --git a/median b/median
index 0b1950b..cc47dcd 100755
--- a/median
+++ b/median
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
a = []
while line = STDIN.gets
@@ -10,4 +10,3 @@ end
a.sort!
puts a[a.size/2]
-
diff --git a/merge-files b/merge-files
index 714b57d..78644ef 100755
--- a/merge-files
+++ b/merge-files
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
def usage
STDERR.write "merge_files <file>+\n"
@@ -28,4 +28,3 @@ hashes.each { |h|
counts.max.times { puts k }
}
}
-
diff --git a/merge-ttable b/merge-ttable
index 77eae9f..20e5429 100755
--- a/merge-ttable
+++ b/merge-ttable
@@ -1,20 +1,20 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def main
conf = Optimist::options do
opt :f, "f files", :type => :string, :required => true
opt :e, "e files", :type => :string, :required => true
end
-
+
f_files = conf[:f].split
e_files = conf[:e].split
-
+
h = {}
f_files.each_with_index { |fn,i|
- fa = ReadFile.readlines_strip fn
+ fa = ReadFile.readlines_strip fn
ea = ReadFile.readlines_strip e_files[i]
fa.each_with_index { |fw,j|
if h.has_key? fw
@@ -24,11 +24,10 @@ def main
end
}
}
-
+
h.each_pair { |f,ea|
- puts "#{f}\t#{ea.first}"
- }
+ puts "#{f}\t#{ea.first}"
+ }
end
main
-
diff --git a/min b/min
index f8a7e42..edfecea 100755
--- a/min
+++ b/min
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-min = 1.0/0
+min = Float::INFINITY
while line = STDIN.gets
v = line.to_f
min = v if v<min
end
puts min
-
diff --git a/min-max b/min-max
index f6c8d00..dee541f 100755
--- a/min-max
+++ b/min-max
@@ -1,15 +1,15 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
conf = Optimist::options do
opt :min, "minimum #tokens", :type => :int, :default => 1
- opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
- opt :in_f, "input 'French' file", :type => :string, :required => true
- opt :in_e, "input 'English' file", :type => :string, :required => true
- opt :out_f, "output 'French' file", :type => :string, :required => true
- opt :out_e, "output 'English' file", :type => :string, :required => true
+ opt :max, "maximum #tokens", :type => :int, :default => 80, :short => "-n"
+ opt :in_f, "input French file", :type => :string, :required => true
+ opt :in_e, "input English file", :type => :string, :required => true
+ opt :out_f, "output French file", :type => :string, :required => true
+ opt :out_e, "output English file", :type => :string, :required => true
opt :out_id, "output line Nos", :type => :string, :required => true
end
@@ -37,4 +37,3 @@ while f_line = files[:f_file].gets
end
files.values.each{ |f| f.close }
-
diff --git a/mkidx b/mkidx
index 046e131..6e67ba9 100755
--- a/mkidx
+++ b/mkidx
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
i = ARGV[0].to_i
while line = STDIN.gets
puts i
i += 1
end
-
diff --git a/moses-1best b/moses-1best
index fd35cf8..ffe5e22 100755
--- a/moses-1best
+++ b/moses-1best
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
prev_idx = nil
while line = STDIN.gets
@@ -11,4 +11,3 @@ while line = STDIN.gets
prev_idx = idx
end
end
-
diff --git a/moving-sum b/moving-sum
index 697f47f..aff3527 100755
--- a/moving-sum
+++ b/moving-sum
@@ -1,10 +1,9 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
sum = 0.0
ReadFile.readlines_strip(ARGV[0]).each { |i|
sum += i.to_f
puts sum
}
-
diff --git a/mult b/mult
index 478ec5e..42dd74c 100755
--- a/mult
+++ b/mult
@@ -4,4 +4,3 @@ factor = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f * factor
end
-
diff --git a/NFC b/nfc
index aec1c58..4af1aef 100755
--- a/NFC
+++ b/nfc
@@ -1,9 +1,8 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
import fileinput
import unicodedata
import sys
for line in fileinput.input():
- sys.stdout.write(unicodedata.normalize('NFC', line))
-
+ sys.stdout.write(unicodedata.normalize("NFC", line))
diff --git a/ng b/ng
index f3a031d..af8015a 100755
--- a/ng
+++ b/ng
@@ -1,19 +1,18 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
conf = Optimist::options do
banner "ng < <input>"
opt :n, "n for Ngrams", :type => :int, :default => 4
- opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false
+ opt :fix, "Do not output lower order Ngrams.", :type => :bool, :default => false
opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n"
end
while line = STDIN.gets
a = []
- ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(' ') }
+ ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(" ") }
a.reject! { |i| i.strip.size==0 }
puts a.join conf[:separator] if a.size>0
end
-
diff --git a/nn b/nn
index 4d1dab7..d43a235 100755
--- a/nn
+++ b/nn
@@ -1,4 +1,3 @@
#!/bin/sh
tr '[:digit:]' $1 < $2 > $(basename $2 ${2##*.})nn.${2##*.}
-
diff --git a/no-empty b/no-empty
index da57e23..5a05fc1 100755
--- a/no-empty
+++ b/no-empty
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
files = []
(0..1).each { |i| files << ReadFile.new(ARGV[i]) }
@@ -9,10 +9,9 @@ files = []
while line_f = files[0].gets
line_e = files[1].gets
line_f.strip!; line_e.strip!
- next if line_f=='' || line_e==''
+ next if line_f=="" || line_e==""
files[2].write line_f+"\n"
files[3].write line_e+"\n"
end
files.each { |f| f.close }
-
diff --git a/no-non-printables b/no-non-printables
index 9f9e3f9..2fb6f65 100755
--- a/no-non-printables
+++ b/no-non-printables
@@ -1,4 +1,3 @@
#!/bin/sh
-sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g'
-
+sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g'
diff --git a/nonbreaking_prefixes/README.txt b/nonbreaking-prefixes/README.txt
index 02cdfcc..02cdfcc 100644
--- a/nonbreaking_prefixes/README.txt
+++ b/nonbreaking-prefixes/README.txt
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ca b/nonbreaking-prefixes/nonbreaking_prefix.ca
index 2f4fdfc..2f4fdfc 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.ca
+++ b/nonbreaking-prefixes/nonbreaking_prefix.ca
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.cs b/nonbreaking-prefixes/nonbreaking_prefix.cs
index dce6167..dce6167 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.cs
+++ b/nonbreaking-prefixes/nonbreaking_prefix.cs
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking-prefixes/nonbreaking_prefix.de
index 35fdf5e..35fdf5e 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.de
+++ b/nonbreaking-prefixes/nonbreaking_prefix.de
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.el b/nonbreaking-prefixes/nonbreaking_prefix.el
index 0470f91..0470f91 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.el
+++ b/nonbreaking-prefixes/nonbreaking_prefix.el
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.en b/nonbreaking-prefixes/nonbreaking_prefix.en
index e1a3733..e1a3733 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.en
+++ b/nonbreaking-prefixes/nonbreaking_prefix.en
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking-prefixes/nonbreaking_prefix.es
index d8b2755..d8b2755 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.es
+++ b/nonbreaking-prefixes/nonbreaking_prefix.es
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.fr b/nonbreaking-prefixes/nonbreaking_prefix.fr
index 28126fa..28126fa 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.fr
+++ b/nonbreaking-prefixes/nonbreaking_prefix.fr
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.is b/nonbreaking-prefixes/nonbreaking_prefix.is
index 5b8a710..5b8a710 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.is
+++ b/nonbreaking-prefixes/nonbreaking_prefix.is
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.it b/nonbreaking-prefixes/nonbreaking_prefix.it
index 992b9ec..992b9ec 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.it
+++ b/nonbreaking-prefixes/nonbreaking_prefix.it
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.nl b/nonbreaking-prefixes/nonbreaking_prefix.nl
index c80c417..c80c417 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.nl
+++ b/nonbreaking-prefixes/nonbreaking_prefix.nl
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pl b/nonbreaking-prefixes/nonbreaking_prefix.pl
index 6b7c106..6b7c106 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.pl
+++ b/nonbreaking-prefixes/nonbreaking_prefix.pl
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pt b/nonbreaking-prefixes/nonbreaking_prefix.pt
index 5d65bf2..5d65bf2 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.pt
+++ b/nonbreaking-prefixes/nonbreaking_prefix.pt
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ro b/nonbreaking-prefixes/nonbreaking_prefix.ro
index d489f46..d489f46 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.ro
+++ b/nonbreaking-prefixes/nonbreaking_prefix.ro
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ru b/nonbreaking-prefixes/nonbreaking_prefix.ru
index 444465b..444465b 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.ru
+++ b/nonbreaking-prefixes/nonbreaking_prefix.ru
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sk b/nonbreaking-prefixes/nonbreaking_prefix.sk
index 1198d48..1198d48 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.sk
+++ b/nonbreaking-prefixes/nonbreaking_prefix.sk
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sl b/nonbreaking-prefixes/nonbreaking_prefix.sl
index 230062c..230062c 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.sl
+++ b/nonbreaking-prefixes/nonbreaking_prefix.sl
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sv b/nonbreaking-prefixes/nonbreaking_prefix.sv
index df5ef29..df5ef29 100644
--- a/nonbreaking_prefixes/nonbreaking_prefix.sv
+++ b/nonbreaking-prefixes/nonbreaking_prefix.sv
diff --git a/norm b/norm
index 5573c37..3313216 100755
--- a/norm
+++ b/norm
@@ -5,4 +5,3 @@ sum=$(cat $1 | sum)
for i in `cat $1`; do
echo "$i" | div $sum
done
-
diff --git a/norm-german b/norm-german
index 85a39da..5c41f98 100755
--- a/norm-german
+++ b/norm-german
@@ -1,23 +1,23 @@
#!/usr/bin/env ruby
-require 'thread'
-require 'optimist'
+require "thread"
+require "optimist"
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
conf = Optimist::options do
banner "norm_german < <file w/ lowercased tokens>"
opt :upper, "uppercase", :type => :bool, :default => false
- opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
+ opt :threads, "#threads", :type => :int, :default => 1, :short => "-h"
opt :shard_size, "shard size", :type => :int, :default => 1000
opt :train, "train", :type => :bool
opt :apply, "apply", :type => :bool
end
-pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
-pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
+pairs_lower = [ ["ß","ss"], ["ue", "ü"], ["ae","ä"], ["oe", "ö"] ]
+pairs_upper = [ ["Ä", "Ae"], ["Ö", "Oe"], ["Ü", "Ue"] ]
if conf[:upper]
PAIRS = pairs_lower
else
@@ -84,4 +84,3 @@ token_stock.each { |i|
h.merge! build_partial i
end
}
-
diff --git a/norm-hyphens b/norm-hyphens
index 4a152a1..6491d13 100755
--- a/norm-hyphens
+++ b/norm-hyphens
@@ -1,4 +1,3 @@
-#!/bin/zsh -x
+#!/bin/zsh
sed "s|[ \t]\+\xc2\xad[ \t]\+||g"
-
diff --git a/normchr b/normchr
index f8e5798..02c6ce8 100755
--- a/normchr
+++ b/normchr
@@ -3,10 +3,10 @@
# http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
-require 'htmlentities'
+require "htmlentities"
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
coder = HTMLEntities.new
@@ -24,7 +24,7 @@ while line = STDIN.gets
line.gsub! /[\u{e000}-\u{f8ff}]/, " " # UTF-8 PUA
line.gsub! /[\u{f0000}-\u{ffffd}]/, " "
line.gsub! /[\u{100000}-\u{10fffd}]/, " "
- line.gsub! "\r", " " # carriage return
+ line.gsub! "\r", " " # carriage return
line.gsub! /[\u{2000}-\u{200f}]/, " " # EN QUAD -- RIGHT-TO-LEFT MARK
line.gsub! /[\u{2028}-\u{202f}]/, " " # LINE SEPARATOR -- NARROW NO-BREAK SPACE
line.gsub! /[\u{205f}-\u{206f}]/, " " # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
@@ -32,4 +32,3 @@ while line = STDIN.gets
line.gsub! /[[:space:]]+/, " " # collapse space
puts coder.decode(line)
end
-
diff --git a/num-tok b/num-tok
index 56cbae9..0c95aa8 100755
--- a/num-tok
+++ b/num-tok
@@ -1,9 +1,8 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
while line = STDIN.gets
puts line.strip.split.length
end
-
diff --git a/odd b/odd
index 0bd9336..ced2861 100755
--- a/odd
+++ b/odd
@@ -1,11 +1,10 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
i = 1
while line = STDIN.gets
puts line if i%2!=0
i+=1
end
-
diff --git a/overlap b/overlap
index 81f9c4b..95d27a3 100755
--- a/overlap
+++ b/overlap
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
a = {}
a.default = 0
@@ -11,10 +11,9 @@ ReadFile.readlines_strip(ARGV[1]).map { |segment| b[segment] += 1 }
overlap = 0
a.each_key { |seg|
- puts b[seg]
- overlap = overlap+b[seg]
+ puts b[seg]
+ overlap = overlap+b[seg]
}
puts "---"
puts overlap
-
diff --git a/paste-pairs b/paste-pairs
index f6b8b31..7e08329 100755
--- a/paste-pairs
+++ b/paste-pairs
@@ -1,10 +1,8 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
import sys
-from itertools import izip
-
-for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
- print linenr, (src_line.strip())
- print linenr, (tgt_line.strip())
- print
+for linenr, (src_line, tgt_line) in enumerate(zip(open(sys.argv[1]), open(sys.argv[2]))):
+ print(linenr, src_line.strip())
+ print(linenr, tgt_line.strip())
+ print()
diff --git a/per-sentence-bleu b/per-sentence-bleu
index 257eb3a..d815dc9 100755
--- a/per-sentence-bleu
+++ b/per-sentence-bleu
@@ -1,11 +1,11 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def main
conf = Optimist::options do
- opt :input, "input", :type => :string, :default => '-'
+ opt :input, "input", :type => :string, :default => "-"
opt :references, "references", :type => :string, :required => true
opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0
opt :n, "N", :default => 4
@@ -16,7 +16,7 @@ def main
input = ReadFile.new conf[:input]
while line = input.gets
i += 1
- if line.strip == ''
+ if line.strip == ""
puts 0.0
next
end
@@ -26,4 +26,3 @@ def main
end
main
-
diff --git a/per-sentence-bleu-kbest b/per-sentence-bleu-kbest
index dad1607..12a9f6f 100755
--- a/per-sentence-bleu-kbest
+++ b/per-sentence-bleu-kbest
@@ -1,11 +1,11 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def main
conf = Optimist::options do
- opt :kbests, "kbests", :type => :string, :default => '-'
+ opt :kbests, "kbests", :type => :string, :default => "-"
opt :references, "references", :type => :string, :required => true
end
refs = ReadFile.new conf[:references]
@@ -19,7 +19,7 @@ def main
scores.each_with_index { |x,j|
puts "#{j+1} ||| #{scores[j]} ||| #{list[j]}"
if scores[j]==max && !o
- puts "^^^ #{j+1} #{max}"
+ puts "^^^ #{j+1} #{max}"
o = true
end
}
@@ -29,4 +29,3 @@ def main
end
main
-
diff --git a/per-sentence-ter b/per-sentence-ter
index 1a7670e..777d39c 100755
--- a/per-sentence-ter
+++ b/per-sentence-ter
@@ -1,14 +1,14 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
-require 'tempfile'
+require "zipf"
+require "optimist"
+require "tempfile"
def main
conf = Optimist::options do
- opt :input, "input", :type => :string, :default => '-'
+ opt :input, "input", :type => :string, :default => "-"
opt :references, "references", :type => :string, :required => true
- opt :mteval_bin, "cdec's mteval/fast_score", :type => :string, :default => '`/toolbox/cdec-dtrain/mteval/fast_score'
+ opt :mteval_bin, "cdec mteval/fast_score", :type => :string, :default => "`/toolbox/cdec-dtrain/mteval/fast_score"
end
refs = ReadFile.readlines_strip conf[:references]
@@ -17,8 +17,8 @@ def main
while line = input.gets
line.strip!
i += 1
- a = Tempfile.new 'pster'
- b = Tempfile.new 'pster'
+ a = Tempfile.new "pster"
+ b = Tempfile.new "pster"
a.write line+"\n"
b.write refs[i]+"\n"
a.close; b.close
@@ -30,4 +30,3 @@ def main
end
main
-
diff --git a/percentile b/percentile
index ba9ceb0..ec42a9a 100755
--- a/percentile
+++ b/percentile
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
data = []
while line = STDIN.gets
@@ -18,4 +18,3 @@ if index.to_i == index
else
puts (data[index.to_i-1] + data[index.to_i]) / 2.0
end
-
diff --git a/pot b/pot
index 24acabe..b703bca 100755
--- a/pot
+++ b/pot
@@ -4,4 +4,3 @@ pow = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f**pow
end
-
diff --git a/preprocess b/preprocess
index a46b0a8..91de3bb 100755
--- a/preprocess
+++ b/preprocess
@@ -1,9 +1,8 @@
#!/bin/bash
-pushd `dirname $0` > /dev/null
-P=`pwd -P`
+pushd "$(dirname "$0")" > /dev/null
+P="$(pwd -P)"
popd > /dev/null
LANG=$1
$P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | $P/lowercase.perl 2>lowercase.$LANG.err
-
diff --git a/preprocess-no-lower b/preprocess-no-lower
index afd87e9..7e3ad91 100755
--- a/preprocess-no-lower
+++ b/preprocess-no-lower
@@ -1,9 +1,8 @@
#!/bin/bash
-pushd `dirname $0` > /dev/null
-P=`pwd -P`
+pushd "$(dirname "$0")" > /dev/null
+P="$(pwd -P)"
popd > /dev/null
LANG=$1
$P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err
-
diff --git a/pt-bloom b/pt-bloom
index 35234f1..b38939d 100755
--- a/pt-bloom
+++ b/pt-bloom
@@ -1,10 +1,10 @@
#!/usr/bin/env ruby
-require 'bloom-filter'
-require 'optimist'
+require "bloom-filter"
+require "optimist"
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
conf = Optimist::options do
opt :size, "number of entries in the filter", :type => :int, :required => true
@@ -19,6 +19,5 @@ while line = STDIN.gets
f.insert(src+" ||| "+tgt)
end
-f.dump('pt.bloom')
+f.dump("pt.bloom")
f.close
-
diff --git a/push-rules b/push-rules
index c97ab80..d0a4de7 100755
--- a/push-rules
+++ b/push-rules
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
a = ReadFile.readlines_strip ARGV[0]
h = {}
@@ -21,4 +21,3 @@ while line = STDIN.gets
puts line
end
end
-
diff --git a/remove-devtest b/remove-devtest
index 8e026f9..f322a6e 100755
--- a/remove-devtest
+++ b/remove-devtest
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
train_src = ReadFile.new ARGV[0]
train_tgt = ReadFile.new ARGV[1]
@@ -39,7 +39,7 @@ while line_src = train_src.gets
line_src_downcase = line_src
line_tgt_downcase = line_tgt
end
-
+
if not devtest_h_src.has_key? line_src_downcase and not devtest_h_src.has_key? line_tgt_downcase \
and not devtest_h_tgt.has_key? line_src_downcase and not devtest_h_tgt.has_key? line_tgt_downcase
train_src_out.write line_src
diff --git a/remove-test-from-bitext b/remove-test-from-bitext
index 43038d3..911a893 100755
--- a/remove-test-from-bitext
+++ b/remove-test-from-bitext
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
test_source = ReadFile.new ARGV[0]
test_target = ReadFile.new ARGV[1]
@@ -13,7 +13,7 @@ while test_source_line = test_source.gets
test_source_line.strip!
test_target_line = test_target.gets
test_target_line.strip!
-
+
all_test_source_lines[test_source_line] = true
all_test_target_lines[test_target_line] = true
end
diff --git a/repetition-rate b/repetition-rate
index 87938ae..12e0fab 100755
--- a/repetition-rate
+++ b/repetition-rate
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
windows = []
cur = []
@@ -9,7 +9,7 @@ while line = STDIN.gets
if cur_sz >= 1000
windows << cur
cur = []
- cur_sz = 0
+ cur_sz = 0
end
cur << line.strip
cur_sz += cur.last.split.size
@@ -37,8 +37,7 @@ windows.each { |w|
rr = 1.0
enums.each_with_index { |i,j|
- rr *= i/denoms[j]
+ rr *= i/denoms[j]
}
puts ((rr**0.25)*100).round 2
-
diff --git a/round b/round
index dfef800..55919d7 100755
--- a/round
+++ b/round
@@ -4,4 +4,3 @@ r = ARGV[0].to_i
while line = STDIN.gets
puts line.to_f.round r
end
-
diff --git a/rule-shapes b/rule-shapes
index 589a670..91f8092 100755
--- a/rule-shapes
+++ b/rule-shapes
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
def shape s
res = []
@@ -24,6 +24,5 @@ end
while line = STDIN.gets
f, e = line.split(/\t/)
f.strip!; e.strip!
- puts shape(f).join('_')+"-"+shape(e).join('_')
+ puts shape(f).join("_")+"-"+shape(e).join("_")
end
-
diff --git a/sample b/sample
index aa46ddb..dcef148 100755
--- a/sample
+++ b/sample
@@ -1,15 +1,15 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
opts = Optimist::options do
banner "sample --size <n> [--shuffle] --file <line separated data>"
opt :size, "Sample P % or # lines from file or N.", :type => :float
opt :shuffle, "Sample is shuffled.", :type => :bool
- opt :file, "Input file.", :type => :string, :default => '-'
+ opt :file, "Input file.", :type => :string, :default => "-"
opt :output_index, "Output index number.", :type => :bool
opt :N, "Sample --size from N items.", :type => :int, :default => -1
opt :absolute, "Sample absolute number of items.", :type => :bool
@@ -19,10 +19,10 @@ input = []
index = []
i = 0
if opts[:N] == -1
- if opts[:file] == '-'
+ if opts[:file] == "-"
file = STDIN
else
- file = File.new opts[:file], 'r'
+ file = File.new opts[:file], "r"
end
while line = file.gets
input << line
@@ -36,7 +36,6 @@ end
sample = []
if !opts[:absolute]
sample = index.sample(index.size*(opts[:size]/100.0))
- sample = index.sample(index.size*(opts[:size]/100.0))
else
sample = index.sample(opts[:size])
end
@@ -56,4 +55,3 @@ while idx = sample.shift
end
end
end
-
diff --git a/select b/select
index 36e4256..2c5616a 100755
--- a/select
+++ b/select
@@ -1,11 +1,11 @@
#!/usr/bin/env ruby
-require 'optimist'
-require 'zipf'
+require "optimist"
+require "zipf"
opts = Optimist::options do
banner "sample --index <n> [--shuffle] [--file <line separated data>]"
- opt :file, "Input file.", :type => :string, :default => '-'
+ opt :file, "Input file.", :type => :string, :default => "-"
opt :index, "Index file.", :type => :string, :required => true
end
@@ -15,4 +15,3 @@ index = ReadFile.readlines_strip(opts[:index]).map{ |i| i.to_i }
index.each { |i|
puts input[i]
}
-
diff --git a/select-from b/select-from
index 0ccfeac..e9a394d 100755
--- a/select-from
+++ b/select-from
@@ -1,13 +1,13 @@
#!/usr/bin/env ruby
-require 'optimist'
-require 'zipf'
+require "optimist"
+require "zipf"
opts = Optimist::options do
banner "select_from [--invert] -i <file> < <line separated data>"
- opt :index, "Line numbers to output.", :type => :string, :short => '-i', :required => true
- opt :invert, "Invert selection.", :type => :bool, :short => '-j', :default => false
- opt :from1, "Index starting from 1.", :type => :bool, :short => '-k', :default => false
+ opt :index, "Line numbers to output.", :type => :string, :short => "-i", :required => true
+ opt :invert, "Invert selection.", :type => :bool, :short => "-j", :default => false
+ opt :from1, "Index starting from 1.", :type => :bool, :short => "-k", :default => false
end
accept = {}
@@ -30,4 +30,3 @@ while line = STDIN.gets
end
i += 1
end
-
diff --git a/sentencepiece-decode b/sentencepiece-decode
index 5e07ffa..e715d09 100755
--- a/sentencepiece-decode
+++ b/sentencepiece-decode
@@ -1,9 +1,8 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
while line = STDIN.gets
line = line.split.join ""
puts line.gsub "▁", " "
end
-
diff --git a/shard b/shard
index 5294afd..4b639c5 100755
--- a/shard
+++ b/shard
@@ -1,11 +1,11 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
lc = `wc -l #{input}`.split.first.to_i
- input_ext = input.split('.').last
- refs_ext = refs.split('.').last
+ input_ext = input.split(".").last
+ refs_ext = refs.split(".").last
index = (0..lc-1).to_a
index.reverse!
index.shuffle! if rand
@@ -68,13 +68,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
end
opts = Optimist::options do
- opt :input, 'input', :type => :string, :required => true
- opt :references, 'references', :type => :string, :required => true
- opt :alignments, 'alignments', :type => :string, :required => true
- opt :output_prefix, 'output prefix', :type => :string, :required => true
- opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
- opt :num_shards, 'number of shards', :type => :int, :required => true
+ opt :input, "input", :type => :string, :required => true
+ opt :references, "references", :type => :string, :required => true
+ opt :alignments, "alignments", :type => :string, :required => true
+ opt :output_prefix, "output prefix", :type => :string, :required => true
+ opt :randomize, "randomize", :type => :bool, :default => false, :short => "-z"
+ opt :num_shards, "number of shards", :type => :int, :required => true
end
make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
-
diff --git a/sort-features b/sort-features
index 88bd779..a91fb00 100755
--- a/sort-features
+++ b/sort-features
@@ -7,4 +7,3 @@ while line = STDIN.gets
end
h.sort_by { |name, value| -value }.each { |name, value| puts "#{name}\t#{value}" }
-
diff --git a/source-sides b/source-sides
index b4490c6..9243f17 100755
--- a/source-sides
+++ b/source-sides
@@ -1,4 +1,3 @@
-#!/bin/zsh -x
+#!/bin/zsh
split_pipes -f 2 | sort | uniq | sed "s| |_|g" | sed "s|\[X,[12]\]|NX|g"
-
diff --git a/split-kbest b/split-kbest
index ab425b0..52773e8 100755
--- a/split-kbest
+++ b/split-kbest
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
def write_kbest l, fn
f = WriteFile.new fn
@@ -21,4 +21,3 @@ while line = STDIN.gets
l << line
end
write_kbest l, "#{dir}/#{i}.gz" # last one
-
diff --git a/split-lines b/split-lines
index 14b3a0f..0d036c3 100755
--- a/split-lines
+++ b/split-lines
@@ -1,14 +1,13 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
dir = ARGV[0]
i = 0
while line = STDIN.gets
src, tgt = line.split " ||| "
- f = WriteFile.new "#{dir}/#{i}.src"
+ f = WriteFile.new "#{dir}/#{i}.src"
f.write line
f.close
i += 1
end
-
diff --git a/split-pipes b/split-pipes
index 862e8be..58dcac4 100755
--- a/split-pipes
+++ b/split-pipes
@@ -1,9 +1,9 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
conf = Optimist::options do
banner "splitpipes -f <n> < <input>"
@@ -32,10 +32,10 @@ end
while line = STDIN.gets
j = 1
- line.strip.split(' ||| ').each { |i|
+ line.strip.split(" ||| ").each { |i|
if range && (conf[:field]..conf[:to]).include?(j)
a << i.strip
- elsif j == conf[:field]
+ elsif j == conf[:field]
puts i.strip
break
end
@@ -46,6 +46,3 @@ while line = STDIN.gets
end
a.clear
end
-
-
-
diff --git a/sqrt b/sqrt
index d0a67b1..39382e6 100755
--- a/sqrt
+++ b/sqrt
@@ -3,4 +3,3 @@
while line = STDIN.gets
puts Math.sqrt line.to_f
end
-
diff --git a/stanford-parser-run b/stanford-parser-run
index f8d4210..37efacd 100755
--- a/stanford-parser-run
+++ b/stanford-parser-run
@@ -1,7 +1,7 @@
#!/bin/bash
if [ $# != 1 ]; then
- echo "$0 text-file"
+ echo "$0 text-file"
exit 1
fi
@@ -10,4 +10,3 @@ export CLASSPATH=:/toolbox/stanfordparser_3_2_0/*
IN=$1
cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp
-
diff --git a/stddev b/stddev
index 15c245e..1b24bb5 100755
--- a/stddev
+++ b/stddev
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
conf = Optimist::options do
banner "stddev [-r <d>] < <one number per line>"
@@ -37,4 +37,3 @@ if conf[:round] >= 0
else
puts stddev
end
-
diff --git a/strips b/strips
index 11c00b4..05d41cb 100755
--- a/strips
+++ b/strips
@@ -3,4 +3,3 @@
while line = STDIN.gets
puts line.strip
end
-
diff --git a/substract b/subtract
index 212b6da..ecd6c11 100755
--- a/substract
+++ b/subtract
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
f = ReadFile.new ARGV[0]
g = ReadFile.new ARGV[1]
@@ -10,4 +10,3 @@ while line1 = f.gets
d = line1.to_f - line2.to_f
puts d
end
-
diff --git a/sum b/sum
index acfa563..a3502e6 100755
--- a/sum
+++ b/sum
@@ -6,4 +6,3 @@ while line = STDIN.gets
end
puts sum
-
diff --git a/tc b/tc
index 7eefdd5..dd16fdf 100755
--- a/tc
+++ b/tc
@@ -1,8 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
while line = STDIN.gets
puts tokenize(line.strip).size
end
-
diff --git a/tf-idf b/tf-idf
index 22c3dac..02f4c7b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
def main
conf = Optimist::options do
@@ -15,8 +15,8 @@ def main
stopwords = []
if conf[:filter_stopwords]
stopwords = ReadFile.readlines(conf[:filter_stopwords]).map{ |i|
- i.split('|').first.strip
- }.reject{ |i| i=='' }
+ i.split("|").first.strip
+ }.reject{ |i| i=="" }
end
docs = {}
@@ -54,4 +54,3 @@ def main
end
main
-
diff --git a/tmx-extract.py b/tmx-extract
index 00f18f5..7791eb6 100755
--- a/tmx-extract.py
+++ b/tmx-extract
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
#
# Adapted from Apertium
# http://wiki.apertium.org/wiki/Tools_for_TMX
@@ -16,43 +16,43 @@ class TMXHandler(ContentHandler):
self.files = {}
self.files[slang] = sfile
self.files[tlang] = tfile
- self.inTag = ''
- self.note = ''
- self.tuid = ''
- self.type = ''
+ self.inTag = ""
+ self.note = ""
+ self.tuid = ""
+ self.type = ""
self.cur_pair = set()
- self.cur_lang = ''
+ self.cur_lang = ""
self.seg = {}
- self.seg[slang] = ''
- self.seg[tlang] = ''
+ self.seg[slang] = ""
+ self.seg[tlang] = ""
def startElement(self, name, attrs):
- if name == 'tu':
+ if name == "tu":
self.cur_pair = set()
- self.inTag = 'tu'
- self.tuid = attrs.get('tuid','')
- self.type = attrs.get('datatype','')
- elif name == 'note':
- self.inTag = 'note'
+ self.inTag = "tu"
+ self.tuid = attrs.get("tuid", "")
+ self.type = attrs.get("datatype", "")
+ elif name == "note":
+ self.inTag = "note"
self.note = ""
- elif name == 'tuv':
- self.inTag = 'tuv'
- self.cur_lang = attrs.get('xml:lang', '')
+ elif name == "tuv":
+ self.inTag = "tuv"
+ self.cur_lang = attrs.get("xml:lang", "")
self.cur_pair.add(self.cur_lang)
- elif name == 'seg':
- self.inTag = 'seg'
+ elif name == "seg":
+ self.inTag = "seg"
if self.cur_lang in self.pair:
- self.seg[self.cur_lang] = ''
+ self.seg[self.cur_lang] = ""
def characters (self, c):
- if self.inTag == 'note':
+ if self.inTag == "note":
self.note += c
- elif self.inTag == 'seg' and self.cur_lang in self.pair:
+ elif self.inTag == "seg" and self.cur_lang in self.pair:
self.seg[self.cur_lang] += c
def endElement(self, name):
- if name == 'tu' and self.pair == self.cur_pair:
+ if name == "tu" and self.pair == self.cur_pair:
for lang in self.cur_pair:
self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip()))
@@ -61,16 +61,15 @@ if __name__ == "__main__":
parser = make_parser()
if len(sys.argv) < 3:
- print('Usage: tmx-extract.py <file> <slang> <tlang>')
- print('')
+ print(f"Usage: {sys.argv[0]} <file> <slang> <tlang>")
+ print()
sys.exit(-1)
- sfile_path = sys.argv[1] + "." + sys.argv[2]
- tfile_path = sys.argv[1] + "." + sys.argv[3]
+ sfile_path = f"{sys.argv[1]}.{sys.argv[2]}"
+ tfile_path = f"{sys.argv[1]}.{sys.argv[3]}"
- with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile:
+ with open(sfile_path, "w+") as sfile, open(tfile_path, "w+") as tfile:
curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
parser.setContentHandler(curHandler)
- with open(sys.argv[1], 'r') as tmx_file:
+ with open(sys.argv[1], "r") as tmx_file:
parser.parse(tmx_file)
-
diff --git a/tmx-extract-original-py2.py b/tmx-extract-original-py2
index cbdb491..eb39d1d 100755
--- a/tmx-extract-original-py2.py
+++ b/tmx-extract-original-py2
@@ -73,4 +73,3 @@ parser.parse(open(sys.argv[1]))
sfile.close()
tfile.close()
-
diff --git a/tmx-to-plain.py b/tmx-to-plain
index 07cac6f..025d6e4 100644..100755
--- a/tmx-to-plain.py
+++ b/tmx-to-plain
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
import argparse
import datetime
import sys
@@ -12,37 +14,37 @@ def extract_from_tmx(tmx_file_path,
date,
src_out_after,
tgt_out_after):
- with open(tmx_file_path, 'rb') as in_fp:
+ with open(tmx_file_path, "rb") as in_fp:
tmx_file = tmxfile(in_fp)
-
+
if src_out_after is not None and tgt_out_after is not None:
src_out_after_fp = open(src_out_after, "w")
tgt_out_after_fp = open(tgt_out_after, "w")
-
-
+
+
with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp:
for index, node in enumerate(tmx_file.unit_iter()):
src_out_fp_ = src_out_fp
tgt_out_fp_ = tgt_out_fp
-
+
if begin_date is not None:
- date_string = node.get_target_dom().get('lastusagedate')[:8]
- date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date()
+ date_string = node.get_target_dom().get("lastusagedate")[:8]
+ date_obj = datetime.datetime.strptime(date_string, "%Y%m%d").date()
if date_obj < begin_date:
continue
-
+
if date is not None:
- date_string = node.get_target_dom().get('changedate')[:8]
- date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date()
+ date_string = node.get_target_dom().get("changedate")[:8]
+ date_obj = datetime.datetime.strptime(date_string, "%Y%m%d").date()
if date_obj > date:
src_out_fp_ = src_out_after_fp
tgt_out_fp_ = tgt_out_after_fp
-
+
src_string = f"{node.source}"
tgt_string = f"{node.target}"
- src_string = src_string.replace('\n', ' ').replace('\r', '')
- tgt_string = tgt_string.replace('\n', ' ').replace('\r', '')
-
+ src_string = src_string.replace("\n", " ").replace("\r", "")
+ tgt_string = tgt_string.replace("\n", " ").replace("\r", "")
+
src_out_fp_.write(f"{src_string}\n")
tgt_out_fp_.write(f"{tgt_string}\n")
if (index + 1) % 1000 == 0:
@@ -56,7 +58,7 @@ def extract_from_tmx(tmx_file_path,
def main():
- usage = "Usage: python tmx_to_plain.py [options]"
+ usage = f"Usage: {sys.argv[0]} [options]"
parser = argparse.ArgumentParser(usage=usage)
parser.add_argument("-i", "--input", help="input tmx file")
parser.add_argument("-d", "--date", help="date for splitting the output")
@@ -67,29 +69,27 @@ def main():
if args.input is None:
parser.print_help()
sys.exit(1)
-
- args.input
-
- src_out = args.input + ".src"
- tgt_out = args.input + ".tgt"
-
-
+
+ src_out = f"{args.input}.src"
+ tgt_out = f"{args.input}.tgt"
+
+
if args.date is not None:
- date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date()
- src_out_after = src_out + ".after." + args.date
- tgt_out_after = tgt_out + ".after." + args.date
+ date = datetime.datetime.strptime(args.date, "%Y-%m-%d").date()
+ src_out_after = f"{src_out}.after.{args.date}"
+ tgt_out_after = f"{tgt_out}.after.{args.date}"
else:
date = None
src_out_after = None
tgt_out_after = None
-
+
if args.begin_date is not None:
- begin_date = datetime.datetime.strptime(args.begin_date, '%Y-%m-%d').date()
+ begin_date = datetime.datetime.strptime(args.begin_date, "%Y-%m-%d").date()
else:
begin_date = None
-
- extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after)
-
-if __name__ == '__main__':
+ extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after)
+
+
+if __name__ == "__main__":
main()
diff --git a/to-ascii b/to-ascii
index 10fd1c2..7e2a842 100755
--- a/to-ascii
+++ b/to-ascii
@@ -4,9 +4,8 @@ while line = STDIN.gets
encoding_options = {
:invalid => :replace,
:undef => :replace,
- :replace => '?',
+ :replace => "?",
:universal_newline => true
}
- puts line.encode 'ASCII', encoding_options
+ puts line.encode "ASCII", encoding_options
end
-
diff --git a/toks b/toks
index 8bee29f..db8076f 100755
--- a/toks
+++ b/toks
@@ -1,9 +1,8 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
while line = STDIN.gets
line.strip.split(/\s/).each { |i| puts i }
end
-
diff --git a/toks-per-line b/toks-per-line
index 8a10cd4..9814f35 100755
--- a/toks-per-line
+++ b/toks-per-line
@@ -14,4 +14,3 @@ while line = STDIN.gets
puts a.size
end
end
-
diff --git a/train-test-split b/train-test-split
index 6aa4796..db5aad4 100755
--- a/train-test-split
+++ b/train-test-split
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
conf = Optimist::options do
opt :source, "source file", :type => :string, :required => true
@@ -13,11 +13,11 @@ conf = Optimist::options do
end
source_filename = conf[:source]
-source_extension = source_filename.split('.').last
+source_extension = source_filename.split(".").last
source_lines = ReadFile.readlines source_filename
target_filename = conf[:target]
-target_extension = target_filename.split('.').last
+target_extension = target_filename.split(".").last
target_lines = ReadFile.readlines target_filename
size = conf[:size]
diff --git a/tsv-exclude b/tsv-exclude
index e951ea1..cee3923 100755
--- a/tsv-exclude
+++ b/tsv-exclude
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'set'
+require "zipf"
+require "set"
to_exclude0 = {}
to_exclude1 = {}
diff --git a/tsv-joint-set b/tsv-joint-set
index c0dbdcf..ce77a9e 100755
--- a/tsv-joint-set
+++ b/tsv-joint-set
@@ -1,8 +1,8 @@
#!/usr/bin/env ruby
-require 'set'
-require 'zipf'
-require 'optimist'
+require "set"
+require "zipf"
+require "optimist"
conf = Optimist::options do
opt :n, "Desired number segments in test set.", :type => :int, :required => true
@@ -50,4 +50,3 @@ outputs.each_with_index { |o,i|
f.write o[0][j] + "\t" + o[1][j] + "\n"
}
}
-
diff --git a/tsv-uniq b/tsv-uniq
index fde79f2..6709e8d 100755
--- a/tsv-uniq
+++ b/tsv-uniq
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'set'
+require "set"
strictness = ARGV[0].to_i # 1 one-side
# 2 just the pair
@@ -21,14 +21,14 @@ if strictness == 1
seen = Set.new
segments[side].each_with_index { |segment,i|
if not seen.include? segment
- puts "#{segments[i][0]}\t#{segments[i][1]}"
+ puts "#{segments[0][i]}\t#{segments[1][i]}"
end
seen << segment
}
elsif strictness == 2
seen = Set.new
segments[0].each_index { |i|
- segment_pair = [segments[i][0], segments[i][1]]
+ segment_pair = [segments[0][i], segments[1][i]]
if not seen.include? segment_pair
puts "#{segment_pair[0]}\t#{segment_pair[1]}"
end
@@ -46,4 +46,3 @@ elsif strictness == 3
seen_pairs << segment_pair
}
end
-
diff --git a/var b/var
index 8ca6082..4e88f1e 100755
--- a/var
+++ b/var
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'optimist'
+require "optimist"
conf = Optimist::options do
banner "stddev [-r <d>] < <one number per line>"
@@ -32,4 +32,3 @@ if conf[:round] >= 0
else
puts var
end
-
diff --git a/vocab b/vocab
index e6bdcd9..b2a2de9 100755
--- a/vocab
+++ b/vocab
@@ -1,4 +1,3 @@
#!/bin/sh
$(dirname $0)/toks ${1+"$@"} | sort | uniq -c
-
diff --git a/vocab2 b/vocab-2
index 1991357..1004faf 100755
--- a/vocab2
+++ b/vocab-2
@@ -1,6 +1,6 @@
#!/usr/bin/env ruby
-require 'zipf'
+require "zipf"
d = {}
while line = STDIN.gets
@@ -10,4 +10,3 @@ while line = STDIN.gets
end
puts d.size
-
diff --git a/zh-ko-or-ja b/zh-ko-or-ja
index 0b42386..e049704 100755
--- a/zh-ko-or-ja
+++ b/zh-ko-or-ja
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
-require 'zipf'
-require 'script_detector'
+require "zipf"
+require "script_detector"
$to_code = {}
$to_code["Ambiguous Chinese"] = "??"
@@ -15,4 +15,3 @@ while line = STDIN.gets
code = $to_code[line.identify_script]
puts code
end
-