From b31ace79ea5f6b3f279c544cd3a443d6fbf2a24d Mon Sep 17 00:00:00 2001
From: Patrick Simianer <patrick@lilt.com>
Date: Thu, 26 Feb 2026 10:05:59 +0000
Subject: overhaul

---
 NFC                                        |   9 -
 add-index                                  |   1 -
 add-ln                                     |   3 +-
 add-seg                                    |  13 +-
 add-start-end                              |   5 +-
 avg                                        |   2 +-
 avg-seg-len                                |   1 -
 avg-weights                                |   7 +-
 bishuf                                     |   1 -
 bitext-filter-length                       |   9 +-
 bitext2tmx                                 |  39 +++
 bitext2tmx.py                              |  41 ---
 biuniq                                     |   5 +-
 bleu-cmp                                   |   3 +-
 cdec-hg-to-json                            |   1 -
 chars                                      |   3 +-
 cma                                        |   3 +-
 cumul                                      |  17 +-
 de-sgm                                     |   1 -
 div                                        |   1 -
 dot                                        |   7 +-
 even                                       |   5 +-
 exclude                                    |   5 +-
 feature-dict                               |   5 +-
 filter-illegal                             |   5 +-
 filter-len                                 |   3 +-
 filter-tokens                              |   5 +-
 first-upper                                |   3 +-
 fix-utf-8-pua                              |   1 -
 gigaword-collapse-tags                     |   5 +-
 hadoop-uniq                                |   1 -
 hist-tok                                   |   1 -
 htmlentities                               |   9 +-
 inv                                        |   3 +-
 is-first-lower                             |   5 +-
 joint-set                                  |   5 +-
 kbest-bleu-oracles                         |   5 +-
 kendalls-tau                               |  11 +-
 key-count                                  |   5 +-
 kmeans                                     |  17 +-
 lang                                       |  11 +-
 langid-polyglot                            |   3 +-
 length-ratio                               |   3 +-
 lin-reg                                    |   7 +-
 log-reg                                    |  11 +-
 ltok                                       |   7 +-
 make-rule-features                         |   7 +-
 max                                        |   3 +-
 max-len                                    |   5 +-
 median                                     |   3 +-
 merge-files                                |   3 +-
 merge-ttable                               |  17 +-
 min                                        |   3 +-
 min-max                                    |  17 +-
 mkidx                                      |   3 +-
 moses-1best                                |   3 +-
 moving-sum                                 |   3 +-
 mult                                       |   1 -
 nfc                                        |   8 +
 ng                                         |   9 +-
 nn                                         |   1 -
 no-empty                                   |   5 +-
 no-non-printables                          |   3 +-
 nonbreaking-prefixes/README.txt            |   5 +
 nonbreaking-prefixes/nonbreaking_prefix.ca |  75 +++++
 nonbreaking-prefixes/nonbreaking_prefix.cs | 390 ++++++++++++++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.de | 325 ++++++++++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.el |   2 +
 nonbreaking-prefixes/nonbreaking_prefix.en | 107 +++++++
 nonbreaking-prefixes/nonbreaking_prefix.es | 118 +++++++
 nonbreaking-prefixes/nonbreaking_prefix.fr | 153 ++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.is | 251 +++++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.it | 180 +++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.nl | 115 +++++++
 nonbreaking-prefixes/nonbreaking_prefix.pl | 283 +++++++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.pt | 210 +++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.ro |  38 +++
 nonbreaking-prefixes/nonbreaking_prefix.ru | 259 ++++++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.sk | 474 +++++++++++++++++++++++++++++
 nonbreaking-prefixes/nonbreaking_prefix.sl |  78 +++++
 nonbreaking-prefixes/nonbreaking_prefix.sv |  46 +++
 nonbreaking_prefixes/README.txt            |   5 -
 nonbreaking_prefixes/nonbreaking_prefix.ca |  75 -----
 nonbreaking_prefixes/nonbreaking_prefix.cs | 390 ------------------------
 nonbreaking_prefixes/nonbreaking_prefix.de | 325 --------------------
 nonbreaking_prefixes/nonbreaking_prefix.el |   2 -
 nonbreaking_prefixes/nonbreaking_prefix.en | 107 -------
 nonbreaking_prefixes/nonbreaking_prefix.es | 118 -------
 nonbreaking_prefixes/nonbreaking_prefix.fr | 153 ----------
 nonbreaking_prefixes/nonbreaking_prefix.is | 251 ---------------
 nonbreaking_prefixes/nonbreaking_prefix.it | 180 -----------
 nonbreaking_prefixes/nonbreaking_prefix.nl | 115 -------
 nonbreaking_prefixes/nonbreaking_prefix.pl | 283 -----------------
 nonbreaking_prefixes/nonbreaking_prefix.pt | 210 -------------
 nonbreaking_prefixes/nonbreaking_prefix.ro |  38 ---
 nonbreaking_prefixes/nonbreaking_prefix.ru | 259 ----------------
 nonbreaking_prefixes/nonbreaking_prefix.sk | 474 -----------------------------
 nonbreaking_prefixes/nonbreaking_prefix.sl |  78 -----
 nonbreaking_prefixes/nonbreaking_prefix.sv |  46 ---
 norm                                       |   1 -
 norm-german                                |  15 +-
 norm-hyphens                               |   3 +-
 normchr                                    |   9 +-
 num-tok                                    |   5 +-
 odd                                        |   5 +-
 overlap                                    |   7 +-
 paste-pairs                                |  12 +-
 per-sentence-bleu                          |   9 +-
 per-sentence-bleu-kbest                    |   9 +-
 per-sentence-ter                           |  15 +-
 percentile                                 |   3 +-
 pot                                        |   1 -
 preprocess                                 |   5 +-
 preprocess-no-lower                        |   5 +-
 pt-bloom                                   |  11 +-
 push-rules                                 |   3 +-
 remove-devtest                             |   4 +-
 remove-test-from-bitext                    |   4 +-
 repetition-rate                            |   7 +-
 round                                      |   1 -
 rule-shapes                                |   7 +-
 sample                                     |  14 +-
 select                                     |   7 +-
 select-from                                |  11 +-
 sentencepiece-decode                       |   3 +-
 shard                                      |  19 +-
 sort-features                              |   1 -
 source-sides                               |   3 +-
 split-kbest                                |   3 +-
 split-lines                                |   5 +-
 split-pipes                                |  13 +-
 sqrt                                       |   1 -
 stanford-parser-run                        |   3 +-
 stddev                                     |   3 +-
 strips                                     |   1 -
 substract                                  |  13 -
 subtract                                   |  12 +
 sum                                        |   1 -
 tc                                         |   3 +-
 tf-idf                                     |   9 +-
 tmx-extract                                |  75 +++++
 tmx-extract-original-py2                   |  75 +++++
 tmx-extract-original-py2.py                |  76 -----
 tmx-extract.py                             |  76 -----
 tmx-to-plain                               |  95 ++++++
 tmx-to-plain.py                            |  95 ------
 to-ascii                                   |   5 +-
 toks                                       |   7 +-
 toks-per-line                              |   1 -
 train-test-split                           |   8 +-
 tsv-exclude                                |   4 +-
 tsv-joint-set                              |   7 +-
 tsv-uniq                                   |   7 +-
 var                                        |   3 +-
 vocab                                      |   1 -
 vocab-2                                    |  12 +
 vocab2                                     |  13 -
 zh-ko-or-ja                                |   5 +-
 158 files changed, 3672 insertions(+), 3784 deletions(-)
 delete mode 100755 NFC
 create mode 100755 bitext2tmx
 delete mode 100755 bitext2tmx.py
 create mode 100755 nfc
 create mode 100644 nonbreaking-prefixes/README.txt
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.ca
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.cs
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.de
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.el
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.en
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.es
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.fr
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.is
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.it
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.nl
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.pl
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.pt
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.ro
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.ru
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.sk
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.sl
 create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.sv
 delete mode 100644 nonbreaking_prefixes/README.txt
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ca
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.cs
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.de
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.el
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.en
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.es
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.fr
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.is
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.it
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.nl
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.pl
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.pt
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ro
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ru
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sk
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sl
 delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sv
 delete mode 100755 substract
 create mode 100755 subtract
 create mode 100755 tmx-extract
 create mode 100755 tmx-extract-original-py2
 delete mode 100755 tmx-extract-original-py2.py
 delete mode 100755 tmx-extract.py
 create mode 100755 tmx-to-plain
 delete mode 100644 tmx-to-plain.py
 create mode 100755 vocab-2
 delete mode 100755 vocab2

diff --git a/NFC b/NFC
deleted file mode 100755
index aec1c58..0000000
--- a/NFC
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python
-
-import fileinput
-import unicodedata
-import sys
-
-for line in fileinput.input():
-    sys.stdout.write(unicodedata.normalize('NFC', line))
-
diff --git a/add-index b/add-index
index 77a7e8d..b23fefe 100755
--- a/add-index
+++ b/add-index
@@ -9,4 +9,3 @@ while line = STDIN.gets
   puts "#{i}\t#{line}"
   i += 1
 end
-
diff --git a/add-ln b/add-ln
index 35bc44d..c98f0a0 100755
--- a/add-ln
+++ b/add-ln
@@ -3,6 +3,5 @@
 i = 0
 while line = STDIN.gets
   puts "#{i}\t#{line}"
-  i += 1  
+  i += 1
 end
-
diff --git a/add-seg b/add-seg
index 14b8b6b..3825494 100755
--- a/add-seg
+++ b/add-seg
@@ -1,12 +1,12 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
-require 'zipf'
+require "optimist"
+require "zipf"
 
 o = Optimist::options do
-  opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :default => nil
+  opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => "-g", :default => nil
   opt :loo, "leave one out", :type => :bool, :default => false
-  opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i'
+  opt :start_id, "start with this id", :type => :int, :default => 0, :short => "-i"
   opt :nogz, "grammar files not gzipped", :type => :bool, :default => false
   opt :index, "number according to index", :type => :string, :default => nil
 end
@@ -19,8 +19,8 @@ end
 i = o[:start_id]
 j = 0
 while line = STDIN.gets
-  ext = '.gz'
-  ext = '' if o[:nogz]
+  ext = ".gz"
+  ext = "" if o[:nogz]
   s = "<seg"
   if o[:loo] then s += " exclude=\"#{i}\"" end
   if index.size > 0
@@ -33,4 +33,3 @@ while line = STDIN.gets
   i += 1
   j += 1
 end
-
diff --git a/add-start-end b/add-start-end
index 30deaec..1e1061d 100755
--- a/add-start-end
+++ b/add-start-end
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 
 while line = STDIN.gets
   puts "<s> #{line.strip} </s>"
 end
-
diff --git a/avg b/avg
index ac912d6..6d28fa9 100755
--- a/avg
+++ b/avg
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
 conf = Optimist::options do
   banner "avg < <one number per line>"
diff --git a/avg-seg-len b/avg-seg-len
index ee68827..bfd4f6c 100755
--- a/avg-seg-len
+++ b/avg-seg-len
@@ -6,4 +6,3 @@ while line = STDIN.gets
 end
 
 puts lens.inject(:+)/lens.size.to_f
-
diff --git a/avg-weights b/avg-weights
index f090da9..bc734e8 100755
--- a/avg-weights
+++ b/avg-weights
@@ -1,8 +1,8 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
-require 'zlib'
+require "zipf"
+require "optimist"
+require "zlib"
 
 conf = Optimist::options do
   opt :weights_files, "a number of weights files: name value", :required => true
@@ -30,4 +30,3 @@ h.each_pair { |k,w|
   next if conf[:filter] and w.size < n
   puts "#{k} #{w.inject(:+)/n}"
 }
-
diff --git a/bishuf b/bishuf
index 62689aa..dd86e23 100755
--- a/bishuf
+++ b/bishuf
@@ -15,4 +15,3 @@ get_random() { seed="$1"; openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt <
 echo "shuffling ..."
 $SHUF --random-source=<(get_random 42) $1 > $1.shuf
 $SHUF --random-source=<(get_random 42) $2 > $2.shuf
-
diff --git a/bitext-filter-length b/bitext-filter-length
index d1dc973..a77f10e 100755
--- a/bitext-filter-length
+++ b/bitext-filter-length
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def main
   conf = Optimist::options do
@@ -17,8 +17,8 @@ def main
     opt :reverse, "length ratios alway > 1", :type => :bool, :default => false, :short => "-r"
   end
 
-  fna,fnb = conf[:inputs].split ','
-  a = ReadFile.new fna 
+  fna,fnb = conf[:inputs].split ","
+  a = ReadFile.new fna
   b = ReadFile.new fnb
 
   if not conf[:output_index]
@@ -62,4 +62,3 @@ def main
 end
 
 main
-
diff --git a/bitext2tmx b/bitext2tmx
new file mode 100755
index 0000000..e9c8e23
--- /dev/null
+++ b/bitext2tmx
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+
+import sys
+from xml.sax.saxutils import escape
+
+
+if __name__ == "__main__":
+    prefix = """<tmx version="1.4">
+  <header
+    creationtool="bitext2tmx" creationtoolversion="1.0"
+    datatype="PlainText" segtype="sentence"
+    adminlang="en-us" srclang="en"
+    o-tmf="ABCTransMem"/>
+  <body>"""
+
+    with open(sys.argv[1], "r") as src_file, open(sys.argv[2], "r") as tgt_file:
+        src_lang = sys.argv[1].split(".")[-1]
+        tgt_lang = sys.argv[2].split(".")[-1]
+
+        tus = []
+        for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()):
+            src_line = src_line.rstrip("\n")
+            tgt_line = tgt_line.rstrip("\n")
+            tus.append(f"""
+    <tu>
+      <tuv xml:lang="{src_lang}">
+        <seg>{escape(src_line)}</seg>
+      </tuv>
+      <tuv xml:lang="{tgt_lang}">
+        <seg>{escape(tgt_line)}</seg>
+      </tuv>
+    </tu>""")
+
+    suffix = """  </body>
+</tmx>"""
+
+    complete = "\n".join([prefix] + tus + [suffix])
+
+    print(complete)
diff --git a/bitext2tmx.py b/bitext2tmx.py
deleted file mode 100755
index 1cdc4b3..0000000
--- a/bitext2tmx.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-from xml.sax.saxutils import escape
-
-
-if __name__ == "__main__":
-    prefix = """<tmx version="1.4">
-  <header
-    creationtool="bitext2tmx.py" creationtoolversion="1.0"
-    datatype="PlainText" segtype="sentence"
-    adminlang="en-us" srclang="en"
-    o-tmf="ABCTransMem"/>
-  <body>"""
-
-    src_file = open(sys.argv[1], "r")
-    tgt_file = open(sys.argv[2], "r")
-
-    src_lang = sys.argv[1].split(".")[-1]
-    tgt_lang = sys.argv[2].split(".")[-1]
-
-    tus = []
-    for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()):
-        src_line = src_line.rstrip("\n")
-        tgt_line = tgt_line.rstrip("\n")
-        tus.append(f"""
-    <tu>
-      <tuv xml:lang="{src_lang}">
-        <seg>{escape(src_line)}</seg>
-      </tuv>
-      <tuv xml:lang="{tgt_lang}">
-        <seg>{escape(tgt_line)}</seg>
-      </tuv>
-    </tu>""")
-
-    suffix = """  </body>
-</tmx>"""
-
-    complete = "\n".join([prefix] + tus + [suffix])
-
-    print(complete)
diff --git a/biuniq b/biuniq
index b191ab0..9ad2d76 100755
--- a/biuniq
+++ b/biuniq
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 f1 = ReadFile.new ARGV[0]
 f2 = ReadFile.new ARGV[1]
@@ -16,7 +16,7 @@ while line1 = f1.gets
   line2 = f2.gets
   if line2 == nil then line2 = "" end
   line2.strip!
-  
+
   if !d1.include? line1 and !d2.include? line2
     a1 << line1
     a2 << line2
@@ -33,4 +33,3 @@ a1.each_with_index { |line1,i|
   o1.write line1 + "\n"
   o2.write a2[i] + "\n"
 }
-
diff --git a/bleu-cmp b/bleu-cmp
index ed8460c..fe5370d 100755
--- a/bleu-cmp
+++ b/bleu-cmp
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 f =  ReadFile.new ARGV[0]
 g =  ReadFile.new ARGV[1]
@@ -20,4 +20,3 @@ while line = f.gets
   puts
   i += 1
 end
-
diff --git a/cdec-hg-to-json b/cdec-hg-to-json
index 5a26cf7..955cd6d 100755
--- a/cdec-hg-to-json
+++ b/cdec-hg-to-json
@@ -77,4 +77,3 @@ def main():
 
 if __name__=="__main__":
   main()
-
diff --git a/chars b/chars
index 359c2ab..5fed1c7 100755
--- a/chars
+++ b/chars
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 while line = STDIN.gets
   line.strip.each_char { |c|
     puts c
   }
 end
-
diff --git a/cma b/cma
index 4647710..9e0f1f0 100755
--- a/cma
+++ b/cma
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
 conf = Optimist::options do
   banner "cma < <one number per line>"
@@ -20,4 +20,3 @@ while line = STDIN.gets
   end
   STDOUT.flush
 end
-
diff --git a/cumul b/cumul
index 93a7e90..45ff03e 100755
--- a/cumul
+++ b/cumul
@@ -1,6 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
+require "tempfile"
 
 f =  ReadFile.new ARGV[0]
 g =  ReadFile.new ARGV[1]
@@ -17,16 +18,16 @@ while line = f.gets
   sys1 << line1
   sys2 << line2
 
-  ff=File.new("/tmp/refs",'w+');ff.write(refs.join(""));ff.close
-  ff=File.new("/tmp/sys1",'w+');ff.write(sys1.join(""));ff.close
-  ff=File.new("/tmp/sys2",'w+');ff.write(sys2.join(""));ff.close
+  tmp_refs = Tempfile.new("refs"); tmp_refs.write(refs.join("")); tmp_refs.close
+  tmp_sys1 = Tempfile.new("sys1"); tmp_sys1.write(sys1.join("")); tmp_sys1.close
+  tmp_sys2 = Tempfile.new("sys2"); tmp_sys2.write(sys2.join("")); tmp_sys2.close
 
-  #a = `~/multi-bleu.perl /tmp/refs < /tmp/sys1`.split[2].gsub(',','').to_f 
-  a = BLEU::bleu("/tmp/sys1", "/tmp/refs", 4)
-  b = BLEU::bleu("/tmp/sys2", "/tmp/refs", 4)
+  a = BLEU::bleu(tmp_sys1.path, tmp_refs.path, 4)
+  b = BLEU::bleu(tmp_sys2.path, tmp_refs.path, 4)
+
+  tmp_refs.unlink; tmp_sys1.unlink; tmp_sys2.unlink
   diffs << b-a
 
   #puts ((diffs.inject(:+)/diffs.size)*100).round 2
   puts (diffs[-1]*100).round 2
 end
-
diff --git a/de-sgm b/de-sgm
index 3b3a8e0..8598aef 100755
--- a/de-sgm
+++ b/de-sgm
@@ -9,4 +9,3 @@ egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|r
   | sed "s|\s*</speaker>\s*$||" \
   | sed "s|\s*<hl>\s*$||" \
   | sed "s|\s*</hl>\s*$||"
-
diff --git a/div b/div
index 93585dc..d0e036e 100755
--- a/div
+++ b/div
@@ -5,4 +5,3 @@ exit if factor==0
 while line = STDIN.gets
   puts line.to_f / factor
 end
-
diff --git a/dot b/dot
index da0dc58..9856069 100755
--- a/dot
+++ b/dot
@@ -1,9 +1,8 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
-a = SparseVector.from_file 'w', ' '
-b = SparseVector.from_file 'f', ' '
+a = SparseVector.from_file "w", " "
+b = SparseVector.from_file "f", " "
 puts a.to_s
 puts a.dot b
-
diff --git a/even b/even
index dcee3d9..1a9bfd4 100755
--- a/even
+++ b/even
@@ -1,11 +1,10 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 i = 1
 while line = STDIN.gets
   puts line if i%2==0
   i+=1
 end
-
diff --git a/exclude b/exclude
index b5fe3cb..ee5a144 100755
--- a/exclude
+++ b/exclude
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'set'
+require "zipf"
+require "set"
 
 to_exclude = {}
 f = ReadFile.new ARGV[0]
@@ -14,4 +14,3 @@ while line = STDIN.gets
     puts line
   end
 end
-
diff --git a/feature-dict b/feature-dict
index 6849769..59ff020 100755
--- a/feature-dict
+++ b/feature-dict
@@ -7,7 +7,7 @@ l_i = 1
 while line = STDIN.gets
   STDERR.write "#{l_i}\n" if l_i%1000==0&&not_quiet
   line.split.each { |i|
-    f, v = i.split('=', 2)
+    f, v = i.split("=", 2)
     if !feature_dict.has_key? f
       feature_dict[f] = n
       n += 1
@@ -16,9 +16,8 @@ while line = STDIN.gets
   l_i += 1
 end
 
-f = File.new ARGV[0], 'w'
+f = File.new ARGV[0], "w"
 f.write Marshal.dump feature_dict
 f.close
 
 STDERR.write "size = #{feature_dict.size}\n"
-
diff --git a/filter-illegal b/filter-illegal
index 8b29f3e..e44b2ac 100755
--- a/filter-illegal
+++ b/filter-illegal
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 illegal = [ "[", "]", "|||" ]
 
@@ -15,11 +15,10 @@ while line0 = in0.gets
   illegal.each { |k|
     if line0.index(k) or line1.index(k) then
       skip = true
-      skipi << i 
+      skipi << i
     end
   }
   i += 1
 end
 
 skipi.each { |j| puts j }
-
diff --git a/filter-len b/filter-len
index fe45b57..1756849 100755
--- a/filter-len
+++ b/filter-len
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 a = ReadFile.new ARGV[0]
 b = ReadFile.new ARGV[1]
@@ -24,4 +24,3 @@ a.close
 b.close
 a_out.close
 b_out.close
-
diff --git a/filter-tokens b/filter-tokens
index 00c8f2c..c851bd3 100755
--- a/filter-tokens
+++ b/filter-tokens
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 bad_words = {}
 ReadFile.readlines_strip(ARGV[0]).each { |line|
@@ -13,11 +13,10 @@ while line = STDIN.gets
   tokens = line.split
   bad_words.keys.each { |w|
     if tokens.include? w
-      bad = true 
+      bad = true
       break
     end
   }
   puts i if bad
   i += 1
 end
-
diff --git a/first-upper b/first-upper
index 610e62c..f9b2ce9 100755
--- a/first-upper
+++ b/first-upper
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 while line = STDIN.gets
   line.strip!
   line[0] = line[0].upcase
   puts line
 end
-
diff --git a/fix-utf-8-pua b/fix-utf-8-pua
index 674d424..da77850 100755
--- a/fix-utf-8-pua
+++ b/fix-utf-8-pua
@@ -7,4 +7,3 @@ while line = STDIN.gets
   line.gsub! /[\u{e000}-\u{f8ff}]/, " "
   puts line
 end
-
diff --git a/gigaword-collapse-tags b/gigaword-collapse-tags
index cbaf7d7..f2339c4 100755
--- a/gigaword-collapse-tags
+++ b/gigaword-collapse-tags
@@ -2,8 +2,8 @@
 
 # works with gigaword en v5
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 in_p = false
 in_dateline = false
@@ -36,4 +36,3 @@ while line = STDIN.gets
    puts line
  end
 end
-
diff --git a/hadoop-uniq b/hadoop-uniq
index 5052419..5f37fa4 100755
--- a/hadoop-uniq
+++ b/hadoop-uniq
@@ -8,4 +8,3 @@ $HADOOP_HOME/bin/hadoop  jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
     -output d.uniq \
     -mapper 'cut -d " " -f 1' \
     -reducer /usr/bin/uniq
-
diff --git a/hist-tok b/hist-tok
index b81604f..3e1d453 100755
--- a/hist-tok
+++ b/hist-tok
@@ -21,4 +21,3 @@ sorted.sort_by! { |i|
 sorted.each { |i|
   puts "#{i[0]}\t#{i[1]}"
 }
-
diff --git a/htmlentities b/htmlentities
index f3c2d34..c0ccc0a 100755
--- a/htmlentities
+++ b/htmlentities
@@ -1,9 +1,9 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 
-require 'htmlentities'
+require "htmlentities"
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 
 coder = HTMLEntities.new
@@ -11,4 +11,3 @@ coder = HTMLEntities.new
 while line = STDIN.gets
   puts coder.decode(line.strip)
 end
-
diff --git a/inv b/inv
index b13443f..aaa4783 100755
--- a/inv
+++ b/inv
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
 def main
   conf = Optimist::options do
@@ -30,4 +30,3 @@ def main
 end
 
 main
-
diff --git a/is-first-lower b/is-first-lower
index 1cddb8e..a7e2073 100755
--- a/is-first-lower
+++ b/is-first-lower
@@ -1,11 +1,10 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 while line = STDIN.gets
   line.strip!
-  if line && line!='' && line[0].downcase?
+  if line && line!="" && line[0].downcase?
     puts line
   end
 end
-
diff --git a/joint-set b/joint-set
index b9b9b22..a295862 100755
--- a/joint-set
+++ b/joint-set
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'set'
-require 'zipf'
+require "set"
+require "zipf"
 
 n = ARGV.pop.to_i
 
@@ -27,4 +27,3 @@ all_sets.each { |set|
 joint_set.each { |i|
   puts i
 }
-
diff --git a/kbest-bleu-oracles b/kbest-bleu-oracles
index ea76ab1..03f321d 100755
--- a/kbest-bleu-oracles
+++ b/kbest-bleu-oracles
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def get_context kbest_lists, references, n
   a = []
@@ -48,4 +48,3 @@ def main
 end
 
 main
-
diff --git a/kendalls-tau b/kendalls-tau
index c0c20be..24f0744 100755
--- a/kendalls-tau
+++ b/kendalls-tau
@@ -2,7 +2,7 @@
 
 #################################################
 # reads space delimted pairs of scores as input,
-# outputs Kendall's τ
+# outputs Kendall's τ
 #################################################
 
 def kendall_with_ties l
@@ -13,7 +13,7 @@ def kendall_with_ties l
   l.each_with_index { |k,i|
     l[i+1,l.size].each_with_index { |m,j|
       if (k.first < m.first && k[1] < m[1]) ||
-         (k.first > m.first && k[1] > m[1]) 
+         (k.first > m.first && k[1] > m[1])
         concordant += 1
       elsif (k.first == m.first && k[1] != m[1])
         tie_a += 1
@@ -24,7 +24,7 @@ def kendall_with_ties l
       end
     }
   }
- 
+
   return (concordant-disconcordant)/(Math.sqrt((concordant+disconcordant+tie_a)*(concordant+disconcordant+tie_b)))
 end
 
@@ -34,7 +34,7 @@ def kendall l
   l.each_with_index { |k,i|
     l[i+1,l.size].each_with_index { |m,j|
       if (k.first <= m.first && k[1] <= m[1]) ||
-         (k.first >= m.first && k[1] >= m[1]) 
+         (k.first >= m.first && k[1] >= m[1])
         concordant += 1
       else
         disconcordant += 1
@@ -60,7 +60,7 @@ def main
     a,b = line.split
     l << [a.to_f, b.to_f]
   end
-  
+
   v = -1
   if has_ties? l
     v = kendall_with_ties l
@@ -72,4 +72,3 @@ def main
 end
 
 main
-
diff --git a/key-count b/key-count
index deaa522..b853362 100755
--- a/key-count
+++ b/key-count
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 h = {}
 h.default = 0
@@ -11,4 +11,3 @@ while line = STDIN.gets
 end
 
 h.each_pair { |k,v| puts "#{k} #{v}" }
-
diff --git a/kmeans b/kmeans
index dcf7774..f49fc53 100755
--- a/kmeans
+++ b/kmeans
@@ -1,12 +1,12 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def read_data fn
   data = {}
   ReadFile.new(fn).readlines_strip.map{ |i|
-    a = i.split ' ', 2
+    a = i.split " ", 2
     v = SparseVector.from_kv a.last
     data[a.first] = v
   }
@@ -30,7 +30,7 @@ end
 def assign centroids, data
   assignment = {}
   data.each_pair { |name,feature_vector|
-      min = 1.0/0
+      min = Float::INFINITY
       min_index = nil
       centroids.each_with_index { |c,i|
         dist = c.euclidian_dist(feature_vector)
@@ -61,10 +61,10 @@ def main
     opt :k, "k", :type => :int, :required => true
     opt :input, "input: one feature vector per line", :type => :string, :required => true
     opt :max_iterations, "max. number of iterations", :type => :int, :default => 100
-    opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3
-    opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2
+    opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => "-n", :default => 3
+    opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => "-j", :default => 2
   end
-  # data is 'ID f1=v1 f2=v2'
+  # data is "ID f1=v1 f2=v2"
   data = read_data conf[:input]
   k = conf[:k]
   centroids = nil
@@ -86,7 +86,7 @@ def main
   STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n"
   0.upto(conf[:max_iterations]) do |i|
     s = "iteration #{i}"
-    STDERR.write "#{s}\n#{'-'*s.size}\n"
+    STDERR.write "#{s}\n#{"-" * s.size}\n"
     assignment = assign centroids, data
     sizes = []
     assignment.each_pair { |centroid_index, a|
@@ -114,4 +114,3 @@ def main
 end
 
 main
-
diff --git a/lang b/lang
index 5caebd1..1b498d1 100755
--- a/lang
+++ b/lang
@@ -1,14 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import sys
 import langdetect
 
 from_stdin = False
-if sys.argv[1] == '-':
+if sys.argv[1] == "-":
     f = sys.stdin
     from_stdin = True
 else:
-    f = open(sys.argv[1], 'r')
+    f = open(sys.argv[1], "r")
 
 try:
     l = sys.argv[2].strip()
@@ -32,7 +32,7 @@ if min_p and not l:
 
 if strict and not min_p:
    strict = False
-     
+
 
 factory = langdetect.detector_factory.DetectorFactory()
 factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY)
@@ -71,5 +71,4 @@ for line in f:
                 print("unk")
 
 if not from_stdin:
-    f.close
-
+    f.close()
diff --git a/langid-polyglot b/langid-polyglot
index 0b0b20c..04f6b3b 100755
--- a/langid-polyglot
+++ b/langid-polyglot
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import polyglot
 from polyglot.detect import Detector
@@ -15,4 +15,3 @@ for line in fileinput.input():
     except polyglot.detect.base.UnknownLanguage:
         print("??")
         pass
-
diff --git a/length-ratio b/length-ratio
index 4b4432d..5b38826 100755
--- a/length-ratio
+++ b/length-ratio
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 a = ReadFile.new ARGV[0]
 b = ReadFile.new ARGV[1]
@@ -9,4 +9,3 @@ while linea = a.gets
   lineb = b.gets
   puts linea.strip.split.size.to_f / lineb.strip.split.size.to_f
 end
-
diff --git a/lin-reg b/lin-reg
index 87dded5..eb9193e 100755
--- a/lin-reg
+++ b/lin-reg
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def read_data fn, scale
   f = ReadFile.new fn
@@ -29,7 +29,7 @@ def main
     opt :output,        "output data",        :type => :string, :required => true
     opt :learning_rate, "learning rate",      :type => :float,  :default => 0.07
     opt :stop,          "stopping criterion", :type => :int,    :default => 100
-    opt :scale_features,"scale features",     :type => :bool,   :default => false, :short => '-t'
+    opt :scale_features,"scale features",     :type => :bool,   :default => false, :short => "-t"
     opt :show_loss,     "show loss per iter", :type => :bool,   :default => false
   end
   data = read_data conf[:input], conf[:scale_features]
@@ -67,4 +67,3 @@ def main
 end
 
 main
-
diff --git a/log-reg b/log-reg
index 5e43555..99d9153 100755
--- a/log-reg
+++ b/log-reg
@@ -1,8 +1,8 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'matrix'
-require 'optimist'
+require "zipf"
+require "matrix"
+require "optimist"
 
 def read_data fn
   f = ReadFile.new fn
@@ -30,7 +30,7 @@ def approx_eql x, y, eps=10**-10
   return false if !x||!y
   return false if x.size!=y.size
   x.each_with_index { |_,i|
-    return false if (x[i]-y[i]).abs>eps 
+    return false if (x[i]-y[i]).abs>eps
   }
   return true
 end
@@ -48,7 +48,7 @@ def main
   prev_model = nil
   gradient = Vector.elements zeros
   hessian = Matrix.build(dim,dim) { |i,j| 0.0 }
-  i = 0 
+  i = 0
   while true
     i += 1
     data.each_with_index { |x,j|
@@ -68,4 +68,3 @@ def main
 end
 
 main
-
diff --git a/ltok b/ltok
index c90823e..fc25a65 100755
--- a/ltok
+++ b/ltok
@@ -1,9 +1,8 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 while line = STDIN.gets
   puts line.strip.split(/\s/).size
 end
-
diff --git a/make-rule-features b/make-rule-features
index 7adb6e9..ae2cecc 100755
--- a/make-rule-features
+++ b/make-rule-features
@@ -1,10 +1,10 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 def mkrf src, tgt
   s = src.gsub /\[X,[1-9]\]/, "NX"
-  t = tgt.gsub /\[X,([1-9])\]/,'N\1'
+  t = tgt.gsub /\[X,([1-9])\]/,"N\1"
   return "R:X:#{s.gsub(" ","_")}:#{t.gsub(" ","_")}"
 end
 
@@ -13,7 +13,7 @@ def mkrbf s, t
   if t == "S"
     s.gsub! /\[X,[1-9]\]/, "X"
   else
-    s.gsub! /\[X,([1-9])\]/, 'X\1' 
+    s.gsub! /\[X,([1-9])\]/, "X\1"
   end
   s.reverse!
   s += " >r<"
@@ -41,4 +41,3 @@ while line = STDIN.gets
 end
 
 h.keys.each { |f| puts f }
-
diff --git a/max b/max
index b2c1cae..15d0003 100755
--- a/max
+++ b/max
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-max = -1.0/0
+max = -Float::INFINITY
 while line = STDIN.gets
   v = line.to_f
   max = v if v > max
 end
 
 puts max
-
diff --git a/max-len b/max-len
index 69013b5..dab684f 100755
--- a/max-len
+++ b/max-len
@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 max = ARGV[0].to_i
 
 i = 0
-while line = STDIN.gets 
+while line = STDIN.gets
   if tokenize(line).size <= max
     puts i
   else
@@ -13,4 +13,3 @@ while line = STDIN.gets
   end
   i += 1
 end
-
diff --git a/median b/median
index 0b1950b..cc47dcd 100755
--- a/median
+++ b/median
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 a = []
 while line = STDIN.gets
@@ -10,4 +10,3 @@ end
 a.sort!
 
 puts a[a.size/2]
-
diff --git a/merge-files b/merge-files
index 714b57d..78644ef 100755
--- a/merge-files
+++ b/merge-files
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 def usage
   STDERR.write "merge_files <file>+\n"
@@ -28,4 +28,3 @@ hashes.each { |h|
     counts.max.times { puts k }
   }
 }
-
diff --git a/merge-ttable b/merge-ttable
index 77eae9f..20e5429 100755
--- a/merge-ttable
+++ b/merge-ttable
@@ -1,20 +1,20 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def main
   conf = Optimist::options do
     opt :f, "f files", :type => :string, :required => true
     opt :e, "e files", :type => :string, :required => true
   end
-  
+
   f_files = conf[:f].split
   e_files = conf[:e].split
-  
+
   h = {}
   f_files.each_with_index { |fn,i|
-    fa = ReadFile.readlines_strip fn 
+    fa = ReadFile.readlines_strip fn
     ea = ReadFile.readlines_strip e_files[i]
     fa.each_with_index { |fw,j|
       if h.has_key? fw
@@ -24,11 +24,10 @@ def main
       end
     }
   }
-  
+
   h.each_pair { |f,ea|
-    puts "#{f}\t#{ea.first}" 
-  } 
+    puts "#{f}\t#{ea.first}"
+  }
 end
 
 main
-
diff --git a/min b/min
index f8a7e42..edfecea 100755
--- a/min
+++ b/min
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-min = 1.0/0
+min = Float::INFINITY
 while line = STDIN.gets
   v = line.to_f
   min = v if v<min
 end
 
 puts min
-
diff --git a/min-max b/min-max
index f6c8d00..dee541f 100755
--- a/min-max
+++ b/min-max
@@ -1,15 +1,15 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 conf = Optimist::options do
   opt :min, "minimum #tokens", :type => :int, :default => 1
-  opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
-  opt :in_f, "input 'French' file", :type => :string, :required => true
-  opt :in_e, "input 'English' file", :type => :string, :required => true
-  opt :out_f, "output 'French' file", :type => :string, :required => true
-  opt :out_e, "output 'English' file", :type => :string, :required => true
+  opt :max, "maximum #tokens", :type => :int, :default => 80, :short => "-n"
+  opt :in_f, "input French file", :type => :string, :required => true
+  opt :in_e, "input English file", :type => :string, :required => true
+  opt :out_f, "output French file", :type => :string, :required => true
+  opt :out_e, "output English file", :type => :string, :required => true
   opt :out_id, "output line Nos", :type => :string, :required => true
 end
 
@@ -37,4 +37,3 @@ while f_line = files[:f_file].gets
 end
 
 files.values.each{ |f| f.close }
-
diff --git a/mkidx b/mkidx
index 046e131..6e67ba9 100755
--- a/mkidx
+++ b/mkidx
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 i = ARGV[0].to_i
 while line = STDIN.gets
   puts i
   i += 1
 end
-
diff --git a/moses-1best b/moses-1best
index fd35cf8..ffe5e22 100755
--- a/moses-1best
+++ b/moses-1best
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 prev_idx = nil
 while line = STDIN.gets
@@ -11,4 +11,3 @@ while line = STDIN.gets
     prev_idx = idx
   end
 end
-
diff --git a/moving-sum b/moving-sum
index 697f47f..aff3527 100755
--- a/moving-sum
+++ b/moving-sum
@@ -1,10 +1,9 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 sum = 0.0
 ReadFile.readlines_strip(ARGV[0]).each { |i|
   sum += i.to_f
   puts sum
 }
-
diff --git a/mult b/mult
index 478ec5e..42dd74c 100755
--- a/mult
+++ b/mult
@@ -4,4 +4,3 @@ factor = ARGV[0].to_f
 while line = STDIN.gets
   puts line.to_f * factor
 end
-
diff --git a/nfc b/nfc
new file mode 100755
index 0000000..4af1aef
--- /dev/null
+++ b/nfc
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+import fileinput
+import unicodedata
+import sys
+
+for line in fileinput.input():
+    sys.stdout.write(unicodedata.normalize("NFC", line))
diff --git a/ng b/ng
index f3a031d..af8015a 100755
--- a/ng
+++ b/ng
@@ -1,19 +1,18 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 conf = Optimist::options do
   banner "ng < <input>"
   opt :n, "n for Ngrams", :type => :int, :default => 4
-  opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false
+  opt :fix, "Do not output lower order Ngrams.", :type => :bool, :default => false
   opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n"
 end
 
 while line = STDIN.gets
   a = []
-  ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(' ') }
+  ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(" ") }
   a.reject! { |i| i.strip.size==0 }
   puts a.join conf[:separator] if a.size>0
 end
-
diff --git a/nn b/nn
index 4d1dab7..d43a235 100755
--- a/nn
+++ b/nn
@@ -1,4 +1,3 @@
 #!/bin/sh
 
 tr '[:digit:]' $1 < $2 > $(basename $2 ${2##*.})nn.${2##*.}
-
diff --git a/no-empty b/no-empty
index da57e23..5a05fc1 100755
--- a/no-empty
+++ b/no-empty
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 files = []
 (0..1).each { |i| files << ReadFile.new(ARGV[i]) }
@@ -9,10 +9,9 @@ files = []
 while line_f = files[0].gets
   line_e = files[1].gets
   line_f.strip!; line_e.strip!
-  next if line_f=='' || line_e==''
+  next if line_f=="" || line_e==""
   files[2].write line_f+"\n"
   files[3].write line_e+"\n"
 end
 
 files.each { |f| f.close }
-
diff --git a/no-non-printables b/no-non-printables
index 9f9e3f9..2fb6f65 100755
--- a/no-non-printables
+++ b/no-non-printables
@@ -1,4 +1,3 @@
 #!/bin/sh
 
-sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g' 
-
+sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g'
diff --git a/nonbreaking-prefixes/README.txt b/nonbreaking-prefixes/README.txt
new file mode 100644
index 0000000..02cdfcc
--- /dev/null
+++ b/nonbreaking-prefixes/README.txt
@@ -0,0 +1,5 @@
+The language suffix of each nonbreaking_prefix.* file is an ISO 639 language code; the code list can be found here:
+
+http://www.loc.gov/standards/iso639-2/php/code_list.php
+
+
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.ca b/nonbreaking-prefixes/nonbreaking_prefix.ca
new file mode 100644
index 0000000..2f4fdfc
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.ca
@@ -0,0 +1,75 @@
+Dr
+Dra
+pàg
+p
+c
+av
+Sr
+Sra
+adm
+esq
+Prof
+S.A
+S.L
+p.e
+ptes
+Sta
+St
+pl
+màx
+cast
+dir
+nre
+fra
+admdora
+Emm
+Excma
+espf
+dc
+admdor
+tel
+angl
+aprox
+ca
+dept
+dj
+dl
+dt
+ds
+dg
+dv
+ed
+entl
+al
+i.e
+maj
+smin
+n
+núm
+pta
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.cs b/nonbreaking-prefixes/nonbreaking_prefix.cs
new file mode 100644
index 0000000..dce6167
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.cs
@@ -0,0 +1,390 @@
+Bc
+BcA
+Ing
+Ing.arch
+MUDr
+MVDr
+MgA
+Mgr
+JUDr
+PhDr
+RNDr
+PharmDr
+ThLic
+ThDr
+Ph.D
+Th.D
+prof
+doc
+CSc
+DrSc
+dr. h. c
+PaedDr
+Dr
+PhMr
+DiS
+abt
+ad
+a.i
+aj
+angl
+anon
+apod
+atd
+atp
+aut
+bd
+biogr
+b.m
+b.p
+b.r
+cca
+cit
+cizojaz
+c.k
+col
+čes
+čín
+čj
+ed
+facs
+fasc
+fol
+fot
+franc
+h.c
+hist
+hl
+hrsg
+ibid
+il
+ind
+inv.č
+jap
+jhdt
+jv
+koed
+kol
+korej
+kl
+krit
+lat
+lit
+m.a
+maď
+mj
+mp
+násl
+např
+nepubl
+něm
+no
+nr
+n.s
+okr
+odd
+odp
+obr
+opr
+orig
+phil
+pl
+pokrač
+pol
+port
+pozn
+př.kr
+př.n.l
+přel
+přeprac
+příl
+pseud
+pt
+red
+repr
+resp
+revid
+rkp
+roč
+roz
+rozš
+samost
+sect
+sest
+seš
+sign
+sl
+srv
+stol
+sv
+šk
+šk.ro
+špan
+tab
+t.č
+tis
+tj
+tř
+tzv
+univ
+uspoř
+vol
+vl.jm
+vs
+vyd
+vyobr
+zal
+zejm
+zkr
+zprac
+zvl
+n.p
+např
+než
+MUDr
+abl
+absol
+adj
+adv
+ak
+ak. sl
+akt
+alch
+amer
+anat
+angl
+anglosas
+arab
+arch
+archit
+arg
+astr
+astrol
+att
+bás
+belg
+bibl
+biol
+boh
+bot
+bulh
+círk
+csl
+č
+čas
+čes
+dat
+děj
+dep
+dět
+dial
+dór
+dopr
+dosl
+ekon
+epic
+etnonym
+eufem
+f
+fam
+fem
+fil
+film
+form
+fot
+fr
+fut
+fyz
+gen
+geogr
+geol
+geom
+germ
+gram
+hebr
+herald
+hist
+hl
+hovor
+hud
+hut
+chcsl
+chem
+ie
+imp
+impf
+ind
+indoevr
+inf
+instr
+interj
+ión
+iron
+it
+kanad
+katalán
+klas
+kniž
+komp
+konj
+ 
+konkr
+kř
+kuch
+lat
+lék
+les
+lid
+lit
+liturg
+lok
+log
+m
+mat
+meteor
+metr
+mod
+ms
+mysl
+n
+náb
+námoř
+neklas
+něm
+nesklon
+nom
+ob
+obch
+obyč
+ojed
+opt
+part
+pas
+pejor
+pers
+pf
+pl
+plpf
+ 
+práv
+prep
+předl
+přivl
+r
+rcsl
+refl
+reg
+rkp
+ř
+řec
+s
+samohl
+sg
+sl
+souhl
+spec
+srov
+stfr
+střv
+stsl
+subj
+subst
+superl
+sv
+sz
+táz
+tech
+telev
+teol
+trans
+typogr
+var
+vedl
+verb
+vl. jm
+voj
+vok
+vůb
+vulg
+výtv
+vztaž
+zahr
+zájm
+zast
+zejm
+ 
+zeměd
+zkr
+zř
+mj
+dl
+atp
+sport
+Mgr
+horn
+MVDr
+JUDr
+RSDr
+Bc
+PhDr
+ThDr
+Ing
+aj
+apod
+PharmDr
+pomn
+ev
+slang
+nprap
+odp
+dop
+pol
+st
+stol
+p. n. l
+před n. l
+n. l
+př. Kr
+po Kr
+př. n. l
+odd
+RNDr
+tzv
+atd
+tzn
+resp
+tj
+p
+br
+č. j
+čj
+č. p
+čp
+a. s
+s. r. o
+spol. s r. o
+p. o
+s. p
+v. o. s
+k. s
+o. p. s
+o. s
+v. r
+v z
+ml
+vč
+kr
+mld
+hod
+popř
+ap
+event
+rus
+slov
+rum
+švýc
+P. T
+zvl
+hor
+dol
+S.O.S
\ No newline at end of file
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.de b/nonbreaking-prefixes/nonbreaking_prefix.de
new file mode 100644
index 0000000..35fdf5e
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.el b/nonbreaking-prefixes/nonbreaking_prefix.el
new file mode 100644
index 0000000..0470f91
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
+κ
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.en b/nonbreaking-prefixes/nonbreaking_prefix.en
new file mode 100644
index 0000000..e1a3733
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.es b/nonbreaking-prefixes/nonbreaking_prefix.es
new file mode 100644
index 0000000..d8b2755
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.es
@@ -0,0 +1,118 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
+
+A.C
+Apdo
+Av
+Bco
+CC.AA
+Da
+Dep
+Dn
+Dr
+Dra
+EE.UU
+Excmo
+FF.CC
+Fil 
+Gral
+J.C
+Let
+Lic
+N.B
+P.D
+P.V.P
+Prof
+Pts
+Rte
+S.A
+S.A.R
+S.E
+S.L
+S.R.C
+Sr
+Sra
+Srta
+Sta
+Sto
+T.V.E
+Tel
+Ud
+Uds
+V.B
+V.E
+Vd
+Vds
+a/c
+adj
+admón
+afmo
+apdo
+av
+c
+c.f
+c.g
+cap
+cm
+cta
+dcha
+doc
+ej
+entlo
+esq
+etc
+f.c
+gr 
+grs
+izq
+kg
+km
+mg
+mm
+nÃºm
+núm
+p
+p.a
+p.ej
+ptas
+pÃ¡g 
+pÃ¡gs
+pág
+págs
+q.e.g.e
+q.e.s.m
+s
+s.s.s
+vid
+vol
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.fr b/nonbreaking-prefixes/nonbreaking_prefix.fr
new file mode 100644
index 0000000..28126fa
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.fr
@@ -0,0 +1,153 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#
+#any single upper case letter  followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+#no French words end in single lower-case letters, so we throw those in too?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+# Period-final abbreviation list for French
+A.C.N
+A.M
+art
+ann
+apr
+av
+auj
+lib
+B.P
+boul
+ca
+c.-à-d
+cf
+ch.-l
+chap
+contr
+C.P.I
+C.Q.F.D
+C.N
+C.N.S
+C.S
+dir
+éd
+e.g
+env
+al
+etc
+E.V
+ex
+fasc
+fém
+fig
+fr
+hab
+ibid
+id
+i.e
+inf
+LL.AA
+LL.AA.II
+LL.AA.RR
+LL.AA.SS
+L.D
+LL.EE
+LL.MM
+LL.MM.II.RR
+loc.cit
+masc
+MM
+ms
+N.B
+N.D.A
+N.D.L.R
+N.D.T
+n/réf
+NN.SS
+N.S
+N.D
+N.P.A.I
+p.c.c
+pl
+pp
+p.ex
+p.j
+P.S
+R.A.S
+R.-V
+R.P
+R.I.P
+SS
+S.S
+S.A
+S.A.I
+S.A.R
+S.A.S
+S.E
+sec
+sect
+sing
+S.M
+S.M.I.R
+sq
+sqq
+suiv
+sup
+suppl
+tél
+T.S.V.P
+vb
+vol
+vs
+X.O
+Z.I
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.is b/nonbreaking-prefixes/nonbreaking_prefix.is
new file mode 100644
index 0000000..5b8a710
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.is
@@ -0,0 +1,251 @@
+no #NUMERIC_ONLY#
+No #NUMERIC_ONLY#
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+nR #NUMERIC_ONLY#
+NR #NUMERIC_ONLY#
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+^
+í
+á
+ó
+æ
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+ab.fn
+a.fn
+afs
+al
+alm
+alg
+andh
+ath
+aths
+atr
+ao
+au
+aukaf
+áfn
+áhrl.s
+áhrs
+ákv.gr
+ákv
+bh
+bls
+dr
+e.Kr
+et
+ef
+efn
+ennfr
+eink
+end
+e.st
+erl
+fél
+fskj
+fh
+f.hl
+físl
+fl
+fn
+fo
+forl
+frb
+frl
+frh
+frt
+fsl
+fsh
+fs
+fsk
+fst
+f.Kr
+ft
+fv
+fyrrn
+fyrrv
+germ
+gm
+gr
+hdl
+hdr
+hf
+hl
+hlsk
+hljsk
+hljv
+hljóðv
+hr
+hv
+hvk
+holl
+Hos
+höf
+hk
+hrl
+ísl
+kaf
+kap
+Khöfn
+kk
+kg
+kk
+km
+kl
+klst
+kr
+kt
+kgúrsk
+kvk
+leturbr
+lh
+lh.nt
+lh.þt
+lo
+ltr
+mlja
+mljó
+millj
+mm
+mms
+m.fl
+miðm
+mgr
+mst
+mín
+nf
+nh
+nhm
+nl
+nk
+nmgr
+no
+núv
+nt
+o.áfr
+o.m.fl
+ohf
+o.fl
+o.s.frv
+ófn
+ób
+óákv.gr
+óákv
+pfn
+PR
+pr
+Ritstj
+Rvík
+Rvk
+samb
+samhlj
+samn
+samn
+sbr
+sek
+sérn
+sf
+sfn
+sh
+sfn
+sh
+s.hl
+sk
+skv
+sl
+sn
+so
+ss.us
+s.st
+samþ
+sbr
+shlj
+sign
+skál
+st
+st.s
+stk
+sþ
+teg
+tbl
+tfn
+tl
+tvíhlj
+tvt
+till
+to
+umr
+uh
+us
+uppl
+útg
+vb
+Vf
+vh
+vkf
+Vl
+vl
+vlf
+vmf
+8vo
+vsk
+vth
+þt
+þf
+þjs
+þgf
+þlt
+þolm
+þm
+þml
+þýð
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.it b/nonbreaking-prefixes/nonbreaking_prefix.it
new file mode 100644
index 0000000..992b9ec
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.it
@@ -0,0 +1,180 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Amn 
+Arch 
+Asst
+Avv
+Bart
+Bcc
+Bldg
+Brig
+Bros
+C.A.P
+C.P
+Capt
+Cc
+Cmdr
+Co
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dott
+Dr
+Drs
+Egr
+Ens
+Gen
+Geom
+Gov
+Hon
+Hosp
+Hr
+Id
+Ing
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mo
+Mons
+Mr
+Mrs
+Ms
+Msgr
+N.B
+Op
+Ord
+P.S
+P.T
+Pfc
+Ph
+Prof
+Pvt
+RP
+RSVP
+Rag
+Rep
+Reps
+Res
+Rev
+Rif
+Rt
+S.A
+S.B.F
+S.P.M
+S.p.A
+S.r.l
+Sen
+Sens
+Sfc
+Sgt
+Sig
+Sigg
+Soc
+Spett
+Sr
+St
+Supt
+Surg
+V.P
+
+# other
+a.c 
+acc
+all 
+banc
+c.a
+c.c.p
+c.m
+c.p
+c.s
+c.v
+corr
+dott
+e.p.c
+ecc
+es 
+fatt
+gg
+int
+lett
+ogg
+on
+p.c
+p.c.c
+p.es
+p.f
+p.r
+p.v
+post
+pp
+racc
+ric
+s.n.c
+seg
+sgg
+ss
+tel
+u.s
+v.r
+v.s
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.nl b/nonbreaking-prefixes/nonbreaking_prefix.nl
new file mode 100644
index 0000000..c80c417
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.nl
@@ -0,0 +1,115 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
+#         http://nl.wikipedia.org/wiki/Aanspreekvorm
+#         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+bacc
+bc
+bgen
+c.i
+dhr
+dr
+dr.h.c
+drs
+drs
+ds
+eint
+fa
+Fa
+fam
+gen
+genm
+ing
+ir
+jhr
+jkvr
+jr
+kand
+kol
+lgen
+lkol
+Lt
+maj
+Mej
+mevr
+Mme
+mr
+mr
+Mw
+o.b.s
+plv
+prof
+ritm
+tint
+Vz
+Z.D
+Z.D.H
+Z.E
+Z.Em
+Z.H
+Z.K.H
+Z.K.M
+Z.M
+z.v
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+#we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
+a.g.v
+bijv
+bijz
+bv
+d.w.z
+e.c
+e.g
+e.k
+ev
+i.p.v
+i.s.m
+i.t.t
+i.v.m
+m.a.w
+m.b.t
+m.b.v
+m.h.o
+m.i
+m.i.v
+v.w.t
+
+#Numbers only. These are non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY# 
+Nrs 
+nrs
+nr #NUMERIC_ONLY#
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.pl b/nonbreaking-prefixes/nonbreaking_prefix.pl
new file mode 100644
index 0000000..6b7c106
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.pl
@@ -0,0 +1,283 @@
+adw
+afr
+akad
+al
+Al
+am
+amer
+arch
+art
+Art
+artyst
+astr
+austr
+bałt
+bdb
+bł
+bm
+br
+bryg
+bryt
+centr
+ces
+chem
+chiń
+chir
+c.k
+c.o
+cyg
+cyw
+cyt
+czes
+czw
+cd
+Cd
+czyt
+ćw
+ćwicz
+daw
+dcn
+dekl
+demokr
+det
+diec
+dł
+dn
+dot
+dol
+dop
+dost
+dosł
+h.c
+ds
+dst
+duszp
+dypl
+egz
+ekol
+ekon
+elektr
+em
+ew
+fab
+farm
+fot
+fr
+gat
+gastr
+geogr
+geol
+gimn
+głęb
+gm
+godz
+górn
+gosp
+gr
+gram
+hist
+hiszp
+hr
+Hr
+hot
+id
+in
+im
+iron
+jn
+kard
+kat
+katol
+k.k
+kk
+kol
+kl
+k.p.a
+kpc
+k.p.c
+kpt
+kr
+k.r
+krak
+k.r.o
+kryt
+kult
+laic
+łac
+niem
+woj
+nb
+np
+Nb
+Np
+pol
+pow
+m.in
+pt
+ps
+Pt
+Ps
+cdn
+jw
+ryc
+rys
+Ryc
+Rys
+tj
+tzw
+Tzw
+tzn
+zob
+ang
+ub
+ul
+pw
+pn
+pl
+al
+k
+n
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+ww
+wł
+ur
+zm
+żyd
+żarg
+żyw
+wył
+bp
+bp
+wyst
+tow
+Tow
+o
+sp
+Sp
+st
+spółdz
+Spółdz
+społ
+spółgł
+stoł
+stow
+Stoł
+Stow
+zn
+zew
+zewn
+zdr
+zazw
+zast
+zaw
+zał
+zal
+zam
+zak
+zakł
+zagr
+zach
+adw
+Adw
+lek
+Lek
+med
+mec
+Mec
+doc
+Doc
+dyw
+dyr
+Dyw
+Dyr
+inż
+Inż
+mgr
+Mgr
+dh
+dr
+Dh
+Dr
+p
+P
+red
+Red
+prof
+prok
+Prof
+Prok
+hab
+płk
+Płk
+nadkom
+Nadkom
+podkom
+Podkom
+ks
+Ks
+gen
+Gen
+por
+Por
+reż
+Reż
+przyp
+Przyp
+śp
+św
+śW
+Śp
+Św
+ŚW
+szer
+Szer
+pkt #NUMERIC_ONLY#
+str #NUMERIC_ONLY#
+tab #NUMERIC_ONLY#
+Tab #NUMERIC_ONLY#
+tel
+ust #NUMERIC_ONLY#
+par #NUMERIC_ONLY#
+poz
+pok
+oo
+oO
+Oo
+OO
+r #NUMERIC_ONLY#
+l #NUMERIC_ONLY#
+s #NUMERIC_ONLY#
+najśw
+Najśw
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Ś
+Ć
+Ż
+Ź
+Dz
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.pt b/nonbreaking-prefixes/nonbreaking_prefix.pt
new file mode 100644
index 0000000..5d65bf2
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.pt
@@ -0,0 +1,210 @@
+#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Art
+Ca
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+DRA
+Dr
+Dra
+Dras
+Drs
+Eng
+Enga
+Engas
+Engos
+Ex
+Exo
+Exmo
+Fig
+Gen
+Hosp
+Insp
+Lda
+MM
+MR
+MRS
+MS
+Maj
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+Sra
+Sras
+Srs
+Sto
+Supt
+Surg
+adj
+adm
+adv
+art
+cit
+col
+con
+corp
+cpl
+dr
+dra
+dras
+drs
+eng
+enga
+engas
+engos
+ex
+exo
+exmo
+fig
+op
+prof
+sr
+sra
+sras
+srs
+sto
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+p #NUMERIC_ONLY#
+pp #NUMERIC_ONLY#
+
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.ro b/nonbreaking-prefixes/nonbreaking_prefix.ro
new file mode 100644
index 0000000..d489f46
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.ro
@@ -0,0 +1,38 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+dpdv
+etc
+șamd
+M.Ap.N
+dl
+Dl
+d-na
+D-na
+dvs
+Dvs
+pt
+Pt
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.ru b/nonbreaking-prefixes/nonbreaking_prefix.ru
new file mode 100644
index 0000000..444465b
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.ru
@@ -0,0 +1,259 @@
+#TBD: Russian uppercase alphabet [А-Я]
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+0гг
+1гг
+2гг
+3гг
+4гг
+5гг
+6гг
+7гг
+8гг
+9гг
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+Xвв
+Vвв
+Iвв
+Lвв
+Mвв
+Cвв
+Xв
+Vв
+Iв
+Lв
+Mв
+Cв
+0м
+1м
+2м
+3м
+4м
+5м
+6м
+7м
+8м
+9м
+0мм
+1мм
+2мм
+3мм
+4мм
+5мм
+6мм
+7мм
+8мм
+9мм
+0см
+1см
+2см
+3см
+4см
+5см
+6см
+7см
+8см
+9см
+0дм
+1дм
+2дм
+3дм
+4дм
+5дм
+6дм
+7дм
+8дм
+9дм
+0л
+1л
+2л
+3л
+4л
+5л
+6л
+7л
+8л
+9л
+0км
+1км
+2км
+3км
+4км
+5км
+6км
+7км
+8км
+9км
+0га
+1га
+2га
+3га
+4га
+5га
+6га
+7га
+8га
+9га
+0кг
+1кг
+2кг
+3кг
+4кг
+5кг
+6кг
+7кг
+8кг
+9кг
+0т
+1т
+2т
+3т
+4т
+5т
+6т
+7т
+8т
+9т
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+0мг
+1мг
+2мг
+3мг
+4мг
+5мг
+6мг
+7мг
+8мг
+9мг
+бульв
+в
+вв
+г
+га
+гг
+гл
+гос
+д
+дм
+доп
+др
+е
+ед
+ед
+зам
+и
+инд
+исп
+Исп
+к
+кап
+кг
+кв
+кл
+км
+кол
+комн
+коп
+куб
+л
+лиц
+лл
+м
+макс
+мг
+мин
+мл
+млн
+млрд
+мм
+н
+наб
+нач
+неуд
+ном
+о
+обл
+обр
+общ
+ок
+ост
+отл
+п
+пер
+перераб
+пл
+пос
+пр
+просп
+проф
+р
+ред
+руб
+с
+сб
+св
+см
+соч
+ср
+ст
+стр
+т
+тел
+Тел
+тех
+тт
+туп
+тыс
+уд
+ул
+уч
+физ
+х
+хор
+ч
+чел
+шт
+экз
+э
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.sk b/nonbreaking-prefixes/nonbreaking_prefix.sk
new file mode 100644
index 0000000..1198d48
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.sk
@@ -0,0 +1,474 @@
+Bc
+Mgr
+RNDr
+PharmDr
+PhDr
+JUDr
+PaedDr
+ThDr
+Ing
+MUDr
+MDDr
+MVDr
+Dr
+ThLic
+PhD
+ArtD
+ThDr
+Dr
+DrSc
+CSs
+prof
+obr
+Obr
+Č
+č
+absol
+adj
+admin
+adr
+Adr
+adv
+advok
+afr
+ak
+akad
+akc
+akuz
+et
+al
+alch
+amer
+anat
+angl
+Angl
+anglosas
+anorg
+ap
+apod
+arch
+archeol
+archit
+arg
+art
+astr
+astrol
+astron
+atp
+atď
+austr
+Austr
+aut
+belg
+Belg
+bibl
+Bibl
+biol
+bot
+bud
+bás
+býv
+cest
+chem
+cirk
+csl
+čs
+Čs
+dat
+dep
+det
+dial
+diaľ
+dipl
+distrib
+dokl
+dosl
+dopr
+dram
+duš
+dv
+dvojčl
+dór
+ekol
+ekon
+el
+elektr
+elektrotech
+energet
+epic
+est
+etc
+etonym
+eufem
+európ
+Európ
+ev
+evid
+expr
+fa
+fam
+farm
+fem
+feud
+fil
+filat
+filoz
+fi
+fon
+form
+fot
+fr
+Fr
+franc
+Franc
+fraz
+fut
+fyz
+fyziol
+garb
+gen
+genet
+genpor
+geod
+geogr
+geol
+geom
+germ
+gr
+Gr
+gréc
+Gréc
+gréckokat
+hebr
+herald
+hist
+hlav
+hosp
+hromad
+hud
+hypok
+ident
+i.e
+ident
+imp
+impf
+indoeur
+inf
+inform
+instr
+int
+interj
+inšt
+inštr
+iron
+jap
+Jap
+jaz
+jedn
+juhoamer
+juhových
+juhozáp
+juž
+kanad
+Kanad
+kanc
+kapit
+kpt
+kart
+katastr
+knih
+kniž
+komp
+konj
+konkr
+kozmet
+krajč
+kresť
+kt
+kuch
+lat
+latinskoamer
+lek
+lex
+lingv
+lit
+litur
+log
+lok
+max
+Max
+maď
+Maď
+medzinár
+mest
+metr
+mil
+Mil
+min
+Min
+miner
+ml
+mld
+mn
+mod
+mytol
+napr
+nar
+Nar
+nasl
+nedok
+neg
+negat
+neklas
+nem
+Nem
+neodb
+neos
+neskl
+nesklon
+nespis
+nespráv
+neved
+než
+niekt
+niž
+nom
+náb
+nákl
+námor
+nár
+obch
+obj
+obv
+obyč
+obč
+občian
+odb
+odd
+ods
+ojed
+okr
+Okr
+opt
+opyt
+org
+os
+osob
+ot
+ovoc
+par
+part
+pejor
+pers
+pf
+Pf
+P.f
+p.f
+pl
+Plk
+pod
+podst
+pokl
+polit
+politol
+polygr
+pomn
+popl
+por
+porad
+porov
+posch
+potrav
+použ
+poz
+pozit
+poľ
+poľno
+poľnohosp
+poľov
+pošt
+pož
+prac
+predl
+pren
+prep
+preuk
+priezv
+Priezv
+privl
+prof
+práv
+príd
+príj
+prík
+príp
+prír
+prísl
+príslov
+príč
+psych
+publ
+pís
+písm
+pôv
+refl
+reg
+rep
+resp
+rozk
+rozlič
+rozpráv
+roč
+Roč
+ryb
+rádiotech
+rím
+samohl
+semest
+sev
+severoamer
+severových
+severozáp
+sg
+skr
+skup
+sl
+Sloven
+soc
+soch
+sociol
+sp
+spol
+Spol
+spoloč
+spoluhl
+správ
+spôs
+st
+star
+starogréc
+starorím
+s.r.o
+stol
+stor
+str
+stredoamer
+stredoškol
+subj
+subst
+superl
+sv
+sz
+súkr
+súp
+súvzť
+tal
+Tal
+tech
+tel
+Tel
+telef
+teles
+telev
+teol
+trans
+turist
+tuzem
+typogr
+tzn
+tzv
+ukaz
+ul
+Ul
+umel
+univ
+ust
+ved
+vedľ
+verb
+veter
+vin
+viď
+vl
+vod
+vodohosp
+pnl
+vulg
+vyj
+vys
+vysokoškol
+vzťaž
+vôb
+vých
+výd
+výrob
+výsk
+výsl
+výtv
+výtvar
+význ
+včel
+vš
+všeob
+zahr
+zar
+zariad
+zast
+zastar
+zastaráv
+zb
+zdravot
+združ
+zjemn
+zlat
+zn
+Zn
+zool
+zr
+zried
+zv
+záhr
+zák
+zákl
+zám
+záp
+západoeur
+zázn
+územ
+účt
+čast
+čes
+Čes
+čl
+čísl
+živ
+pr
+fak
+Kr
+p.n.l
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.sl b/nonbreaking-prefixes/nonbreaking_prefix.sl
new file mode 100644
index 0000000..230062c
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.sl
@@ -0,0 +1,78 @@
+dr
+Dr
+itd
+itn
+št #NUMERIC_ONLY#
+Št #NUMERIC_ONLY#
+d
+jan
+Jan
+feb
+Feb
+mar
+Mar
+apr
+Apr
+jun
+Jun
+jul
+Jul
+avg
+Avg
+sept
+Sept
+sep
+Sep
+okt
+Okt
+nov
+Nov
+dec
+Dec
+tj
+Tj
+npr
+Npr
+sl
+Sl
+op
+Op
+gl
+Gl
+oz
+Oz
+prev
+dipl
+ing
+prim
+Prim
+cf
+Cf
+gl
+Gl
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking-prefixes/nonbreaking_prefix.sv b/nonbreaking-prefixes/nonbreaking_prefix.sv
new file mode 100644
index 0000000..df5ef29
--- /dev/null
+++ b/nonbreaking-prefixes/nonbreaking_prefix.sv
@@ -0,0 +1,46 @@
+#single upper case letters are usually initials
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+#misc abbreviations
+AB
+G
+VG
+dvs
+etc
+from
+iaf
+jfr
+kl
+kr
+mao
+mfl
+mm
+osv
+pga
+tex
+tom
+vs
diff --git a/nonbreaking_prefixes/README.txt b/nonbreaking_prefixes/README.txt
deleted file mode 100644
index 02cdfcc..0000000
--- a/nonbreaking_prefixes/README.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-The language suffix can be found here:
-
-http://www.loc.gov/standards/iso639-2/php/code_list.php
-
-
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ca b/nonbreaking_prefixes/nonbreaking_prefix.ca
deleted file mode 100644
index 2f4fdfc..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.ca
+++ /dev/null
@@ -1,75 +0,0 @@
-Dr
-Dra
-pàg
-p
-c
-av
-Sr
-Sra
-adm
-esq
-Prof
-S.A
-S.L
-p.e
-ptes
-Sta
-St
-pl
-màx
-cast
-dir
-nre
-fra
-admdora
-Emm
-Excma
-espf
-dc
-admdor
-tel
-angl
-aprox
-ca
-dept
-dj
-dl
-dt
-ds
-dg
-dv
-ed
-entl
-al
-i.e
-maj
-smin
-n
-núm
-pta
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.cs b/nonbreaking_prefixes/nonbreaking_prefix.cs
deleted file mode 100644
index dce6167..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.cs
+++ /dev/null
@@ -1,390 +0,0 @@
-Bc
-BcA
-Ing
-Ing.arch
-MUDr
-MVDr
-MgA
-Mgr
-JUDr
-PhDr
-RNDr
-PharmDr
-ThLic
-ThDr
-Ph.D
-Th.D
-prof
-doc
-CSc
-DrSc
-dr. h. c
-PaedDr
-Dr
-PhMr
-DiS
-abt
-ad
-a.i
-aj
-angl
-anon
-apod
-atd
-atp
-aut
-bd
-biogr
-b.m
-b.p
-b.r
-cca
-cit
-cizojaz
-c.k
-col
-čes
-čín
-čj
-ed
-facs
-fasc
-fol
-fot
-franc
-h.c
-hist
-hl
-hrsg
-ibid
-il
-ind
-inv.č
-jap
-jhdt
-jv
-koed
-kol
-korej
-kl
-krit
-lat
-lit
-m.a
-maď
-mj
-mp
-násl
-např
-nepubl
-něm
-no
-nr
-n.s
-okr
-odd
-odp
-obr
-opr
-orig
-phil
-pl
-pokrač
-pol
-port
-pozn
-př.kr
-př.n.l
-přel
-přeprac
-příl
-pseud
-pt
-red
-repr
-resp
-revid
-rkp
-roč
-roz
-rozš
-samost
-sect
-sest
-seš
-sign
-sl
-srv
-stol
-sv
-šk
-šk.ro
-špan
-tab
-t.č
-tis
-tj
-tř
-tzv
-univ
-uspoř
-vol
-vl.jm
-vs
-vyd
-vyobr
-zal
-zejm
-zkr
-zprac
-zvl
-n.p
-např
-než
-MUDr
-abl
-absol
-adj
-adv
-ak
-ak. sl
-akt
-alch
-amer
-anat
-angl
-anglosas
-arab
-arch
-archit
-arg
-astr
-astrol
-att
-bás
-belg
-bibl
-biol
-boh
-bot
-bulh
-círk
-csl
-č
-čas
-čes
-dat
-děj
-dep
-dět
-dial
-dór
-dopr
-dosl
-ekon
-epic
-etnonym
-eufem
-f
-fam
-fem
-fil
-film
-form
-fot
-fr
-fut
-fyz
-gen
-geogr
-geol
-geom
-germ
-gram
-hebr
-herald
-hist
-hl
-hovor
-hud
-hut
-chcsl
-chem
-ie
-imp
-impf
-ind
-indoevr
-inf
-instr
-interj
-ión
-iron
-it
-kanad
-katalán
-klas
-kniž
-komp
-konj
- 
-konkr
-kř
-kuch
-lat
-lék
-les
-lid
-lit
-liturg
-lok
-log
-m
-mat
-meteor
-metr
-mod
-ms
-mysl
-n
-náb
-námoř
-neklas
-něm
-nesklon
-nom
-ob
-obch
-obyč
-ojed
-opt
-part
-pas
-pejor
-pers
-pf
-pl
-plpf
- 
-práv
-prep
-předl
-přivl
-r
-rcsl
-refl
-reg
-rkp
-ř
-řec
-s
-samohl
-sg
-sl
-souhl
-spec
-srov
-stfr
-střv
-stsl
-subj
-subst
-superl
-sv
-sz
-táz
-tech
-telev
-teol
-trans
-typogr
-var
-vedl
-verb
-vl. jm
-voj
-vok
-vůb
-vulg
-výtv
-vztaž
-zahr
-zájm
-zast
-zejm
- 
-zeměd
-zkr
-zř
-mj
-dl
-atp
-sport
-Mgr
-horn
-MVDr
-JUDr
-RSDr
-Bc
-PhDr
-ThDr
-Ing
-aj
-apod
-PharmDr
-pomn
-ev
-slang
-nprap
-odp
-dop
-pol
-st
-stol
-p. n. l
-před n. l
-n. l
-př. Kr
-po Kr
-př. n. l
-odd
-RNDr
-tzv
-atd
-tzn
-resp
-tj
-p
-br
-č. j
-čj
-č. p
-čp
-a. s
-s. r. o
-spol. s r. o
-p. o
-s. p
-v. o. s
-k. s
-o. p. s
-o. s
-v. r
-v z
-ml
-vč
-kr
-mld
-hod
-popř
-ap
-event
-rus
-slov
-rum
-švýc
-P. T
-zvl
-hor
-dol
-S.O.S
\ No newline at end of file
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking_prefixes/nonbreaking_prefix.de
deleted file mode 100644
index 35fdf5e..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.de
+++ /dev/null
@@ -1,325 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-#no german words end in single lower-case letters, so we throw those in too.
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-
-
-#Roman Numerals. A dot after one of these is not a sentence break in German.
-I
-II
-III
-IV
-V
-VI
-VII
-VIII
-IX
-X
-XI
-XII
-XIII
-XIV
-XV
-XVI
-XVII
-XVIII
-XIX
-XX
-i
-ii
-iii
-iv
-v
-vi
-vii
-viii
-ix
-x
-xi
-xii
-xiii
-xiv
-xv
-xvi
-xvii
-xviii
-xix
-xx
-
-#Titles and Honorifics
-Adj
-Adm
-Adv
-Asst
-Bart
-Bldg
-Brig
-Bros
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dr
-Ens
-Gen
-Gov
-Hon
-Hosp
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mr
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-St
-Supt
-Surg
-
-#Misc symbols
-Mio
-Mrd
-bzw
-v
-vs
-usw
-d.h
-z.B
-u.a
-etc
-Mrd
-MwSt
-ggf
-d.J
-D.h
-m.E
-vgl
-I.F
-z.T
-sogen
-ff
-u.E
-g.U
-g.g.A
-c.-à-d
-Buchst
-u.s.w
-sog
-u.ä
-Std
-evtl
-Zt
-Chr
-u.U
-o.ä
-Ltd
-b.A
-z.Zt
-spp
-sen
-SA
-k.o
-jun
-i.H.v
-dgl
-dergl
-Co
-zzt
-usf
-s.p.a
-Dkr
-Corp
-bzgl
-BSE
-
-#Number indicators
-# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
-No
-Nos
-Art
-Nr
-pp
-ca
-Ca
-
-#Ordinals are done with . in German - "1." = "1st" in English
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
-25
-26
-27
-28
-29
-30
-31
-32
-33
-34
-35
-36
-37
-38
-39
-40
-41
-42
-43
-44
-45
-46
-47
-48
-49
-50
-51
-52
-53
-54
-55
-56
-57
-58
-59
-60
-61
-62
-63
-64
-65
-66
-67
-68
-69
-70
-71
-72
-73
-74
-75
-76
-77
-78
-79
-80
-81
-82
-83
-84
-85
-86
-87
-88
-89
-90
-91
-92
-93
-94
-95
-96
-97
-98
-99
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.el b/nonbreaking_prefixes/nonbreaking_prefix.el
deleted file mode 100644
index 0470f91..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.el
+++ /dev/null
@@ -1,2 +0,0 @@
-# for now, just include the Greek equivalent of "Mr."
-κ
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.en b/nonbreaking_prefixes/nonbreaking_prefix.en
deleted file mode 100644
index e1a3733..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.en
+++ /dev/null
@@ -1,107 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Asst
-Bart
-Bldg
-Brig
-Bros
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dr
-Drs
-Ens
-Gen
-Gov
-Hon
-Hr
-Hosp
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mr
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-St
-Supt
-Surg
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-pp #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking_prefixes/nonbreaking_prefix.es
deleted file mode 100644
index d8b2755..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.es
+++ /dev/null
@@ -1,118 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
-
-A.C
-Apdo
-Av
-Bco
-CC.AA
-Da
-Dep
-Dn
-Dr
-Dra
-EE.UU
-Excmo
-FF.CC
-Fil 
-Gral
-J.C
-Let
-Lic
-N.B
-P.D
-P.V.P
-Prof
-Pts
-Rte
-S.A
-S.A.R
-S.E
-S.L
-S.R.C
-Sr
-Sra
-Srta
-Sta
-Sto
-T.V.E
-Tel
-Ud
-Uds
-V.B
-V.E
-Vd
-Vds
-a/c
-adj
-admón
-afmo
-apdo
-av
-c
-c.f
-c.g
-cap
-cm
-cta
-dcha
-doc
-ej
-entlo
-esq
-etc
-f.c
-gr 
-grs
-izq
-kg
-km
-mg
-mm
-nÃºm
-núm
-p
-p.a
-p.ej
-ptas
-pÃ¡g 
-pÃ¡gs
-pág
-págs
-q.e.g.e
-q.e.s.m
-s
-s.s.s
-vid
-vol
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.fr b/nonbreaking_prefixes/nonbreaking_prefix.fr
deleted file mode 100644
index 28126fa..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.fr
+++ /dev/null
@@ -1,153 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-#
-#any single upper case letter  followed by a period is not a sentence ender
-#usually upper case letters are initials in a name
-#no French words end in single lower-case letters, so we throw those in too?
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-
-# Period-final abbreviation list for French
-A.C.N
-A.M
-art
-ann
-apr
-av
-auj
-lib
-B.P
-boul
-ca
-c.-à-d
-cf
-ch.-l
-chap
-contr
-C.P.I
-C.Q.F.D
-C.N
-C.N.S
-C.S
-dir
-éd
-e.g
-env
-al
-etc
-E.V
-ex
-fasc
-fém
-fig
-fr
-hab
-ibid
-id
-i.e
-inf
-LL.AA
-LL.AA.II
-LL.AA.RR
-LL.AA.SS
-L.D
-LL.EE
-LL.MM
-LL.MM.II.RR
-loc.cit
-masc
-MM
-ms
-N.B
-N.D.A
-N.D.L.R
-N.D.T
-n/réf
-NN.SS
-N.S
-N.D
-N.P.A.I
-p.c.c
-pl
-pp
-p.ex
-p.j
-P.S
-R.A.S
-R.-V
-R.P
-R.I.P
-SS
-S.S
-S.A
-S.A.I
-S.A.R
-S.A.S
-S.E
-sec
-sect
-sing
-S.M
-S.M.I.R
-sq
-sqq
-suiv
-sup
-suppl
-tél
-T.S.V.P
-vb
-vol
-vs
-X.O
-Z.I
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.is b/nonbreaking_prefixes/nonbreaking_prefix.is
deleted file mode 100644
index 5b8a710..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.is
+++ /dev/null
@@ -1,251 +0,0 @@
-no #NUMERIC_ONLY#
-No #NUMERIC_ONLY#
-nr #NUMERIC_ONLY#
-Nr #NUMERIC_ONLY#
-nR #NUMERIC_ONLY#
-NR #NUMERIC_ONLY#
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-^
-í
-á
-ó
-æ
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-ab.fn
-a.fn
-afs
-al
-alm
-alg
-andh
-ath
-aths
-atr
-ao
-au
-aukaf
-áfn
-áhrl.s
-áhrs
-ákv.gr
-ákv
-bh
-bls
-dr
-e.Kr
-et
-ef
-efn
-ennfr
-eink
-end
-e.st
-erl
-fél
-fskj
-fh
-f.hl
-físl
-fl
-fn
-fo
-forl
-frb
-frl
-frh
-frt
-fsl
-fsh
-fs
-fsk
-fst
-f.Kr
-ft
-fv
-fyrrn
-fyrrv
-germ
-gm
-gr
-hdl
-hdr
-hf
-hl
-hlsk
-hljsk
-hljv
-hljóðv
-hr
-hv
-hvk
-holl
-Hos
-höf
-hk
-hrl
-ísl
-kaf
-kap
-Khöfn
-kk
-kg
-kk
-km
-kl
-klst
-kr
-kt
-kgúrsk
-kvk
-leturbr
-lh
-lh.nt
-lh.þt
-lo
-ltr
-mlja
-mljó
-millj
-mm
-mms
-m.fl
-miðm
-mgr
-mst
-mín
-nf
-nh
-nhm
-nl
-nk
-nmgr
-no
-núv
-nt
-o.áfr
-o.m.fl
-ohf
-o.fl
-o.s.frv
-ófn
-ób
-óákv.gr
-óákv
-pfn
-PR
-pr
-Ritstj
-Rvík
-Rvk
-samb
-samhlj
-samn
-samn
-sbr
-sek
-sérn
-sf
-sfn
-sh
-sfn
-sh
-s.hl
-sk
-skv
-sl
-sn
-so
-ss.us
-s.st
-samþ
-sbr
-shlj
-sign
-skál
-st
-st.s
-stk
-sþ
-teg
-tbl
-tfn
-tl
-tvíhlj
-tvt
-till
-to
-umr
-uh
-us
-uppl
-útg
-vb
-Vf
-vh
-vkf
-Vl
-vl
-vlf
-vmf
-8vo
-vsk
-vth
-þt
-þf
-þjs
-þgf
-þlt
-þolm
-þm
-þml
-þýð
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.it b/nonbreaking_prefixes/nonbreaking_prefix.it
deleted file mode 100644
index 992b9ec..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.it
+++ /dev/null
@@ -1,180 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Amn 
-Arch 
-Asst
-Avv
-Bart
-Bcc
-Bldg
-Brig
-Bros
-C.A.P
-C.P
-Capt
-Cc
-Cmdr
-Co
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dott
-Dr
-Drs
-Egr
-Ens
-Gen
-Geom
-Gov
-Hon
-Hosp
-Hr
-Id
-Ing
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mo
-Mons
-Mr
-Mrs
-Ms
-Msgr
-N.B
-Op
-Ord
-P.S
-P.T
-Pfc
-Ph
-Prof
-Pvt
-RP
-RSVP
-Rag
-Rep
-Reps
-Res
-Rev
-Rif
-Rt
-S.A
-S.B.F
-S.P.M
-S.p.A
-S.r.l
-Sen
-Sens
-Sfc
-Sgt
-Sig
-Sigg
-Soc
-Spett
-Sr
-St
-Supt
-Surg
-V.P
-
-# other
-a.c 
-acc
-all 
-banc
-c.a
-c.c.p
-c.m
-c.p
-c.s
-c.v
-corr
-dott
-e.p.c
-ecc
-es 
-fatt
-gg
-int
-lett
-ogg
-on
-p.c
-p.c.c
-p.es
-p.f
-p.r
-p.v
-post
-pp
-racc
-ric
-s.n.c
-seg
-sgg
-ss
-tel
-u.s
-v.r
-v.s
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-pp #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.nl b/nonbreaking_prefixes/nonbreaking_prefix.nl
deleted file mode 100644
index c80c417..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.nl
+++ /dev/null
@@ -1,115 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
-#         http://nl.wikipedia.org/wiki/Aanspreekvorm
-#         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-bacc
-bc
-bgen
-c.i
-dhr
-dr
-dr.h.c
-drs
-drs
-ds
-eint
-fa
-Fa
-fam
-gen
-genm
-ing
-ir
-jhr
-jkvr
-jr
-kand
-kol
-lgen
-lkol
-Lt
-maj
-Mej
-mevr
-Mme
-mr
-mr
-Mw
-o.b.s
-plv
-prof
-ritm
-tint
-Vz
-Z.D
-Z.D.H
-Z.E
-Z.Em
-Z.H
-Z.K.H
-Z.K.M
-Z.M
-z.v
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
-a.g.v
-bijv
-bijz
-bv
-d.w.z
-e.c
-e.g
-e.k
-ev
-i.p.v
-i.s.m
-i.t.t
-i.v.m
-m.a.w
-m.b.t
-m.b.v
-m.h.o
-m.i
-m.i.v
-v.w.t
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-Nr #NUMERIC_ONLY# 
-Nrs 
-nrs
-nr #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pl b/nonbreaking_prefixes/nonbreaking_prefix.pl
deleted file mode 100644
index 6b7c106..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.pl
+++ /dev/null
@@ -1,283 +0,0 @@
-adw
-afr
-akad
-al
-Al
-am
-amer
-arch
-art
-Art
-artyst
-astr
-austr
-bałt
-bdb
-bł
-bm
-br
-bryg
-bryt
-centr
-ces
-chem
-chiń
-chir
-c.k
-c.o
-cyg
-cyw
-cyt
-czes
-czw
-cd
-Cd
-czyt
-ćw
-ćwicz
-daw
-dcn
-dekl
-demokr
-det
-diec
-dł
-dn
-dot
-dol
-dop
-dost
-dosł
-h.c
-ds
-dst
-duszp
-dypl
-egz
-ekol
-ekon
-elektr
-em
-ew
-fab
-farm
-fot
-fr
-gat
-gastr
-geogr
-geol
-gimn
-głęb
-gm
-godz
-górn
-gosp
-gr
-gram
-hist
-hiszp
-hr
-Hr
-hot
-id
-in
-im
-iron
-jn
-kard
-kat
-katol
-k.k
-kk
-kol
-kl
-k.p.a
-kpc
-k.p.c
-kpt
-kr
-k.r
-krak
-k.r.o
-kryt
-kult
-laic
-łac
-niem
-woj
-nb
-np
-Nb
-Np
-pol
-pow
-m.in
-pt
-ps
-Pt
-Ps
-cdn
-jw
-ryc
-rys
-Ryc
-Rys
-tj
-tzw
-Tzw
-tzn
-zob
-ang
-ub
-ul
-pw
-pn
-pl
-al
-k
-n
-nr #NUMERIC_ONLY#
-Nr #NUMERIC_ONLY#
-ww
-wł
-ur
-zm
-żyd
-żarg
-żyw
-wył
-bp
-bp
-wyst
-tow
-Tow
-o
-sp
-Sp
-st
-spółdz
-Spółdz
-społ
-spółgł
-stoł
-stow
-Stoł
-Stow
-zn
-zew
-zewn
-zdr
-zazw
-zast
-zaw
-zał
-zal
-zam
-zak
-zakł
-zagr
-zach
-adw
-Adw
-lek
-Lek
-med
-mec
-Mec
-doc
-Doc
-dyw
-dyr
-Dyw
-Dyr
-inż
-Inż
-mgr
-Mgr
-dh
-dr
-Dh
-Dr
-p
-P
-red
-Red
-prof
-prok
-Prof
-Prok
-hab
-płk
-Płk
-nadkom
-Nadkom
-podkom
-Podkom
-ks
-Ks
-gen
-Gen
-por
-Por
-reż
-Reż
-przyp
-Przyp
-śp
-św
-śW
-Śp
-Św
-ŚW
-szer
-Szer
-pkt #NUMERIC_ONLY#
-str #NUMERIC_ONLY#
-tab #NUMERIC_ONLY#
-Tab #NUMERIC_ONLY#
-tel
-ust #NUMERIC_ONLY#
-par #NUMERIC_ONLY#
-poz
-pok
-oo
-oO
-Oo
-OO
-r #NUMERIC_ONLY#
-l #NUMERIC_ONLY#
-s #NUMERIC_ONLY#
-najśw
-Najśw
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-Ś
-Ć
-Ż
-Ź
-Dz
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pt b/nonbreaking_prefixes/nonbreaking_prefix.pt
deleted file mode 100644
index 5d65bf2..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.pt
+++ /dev/null
@@ -1,210 +0,0 @@
-#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-
-
-#Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
-I
-II
-III
-IV
-V
-VI
-VII
-VIII
-IX
-X
-XI
-XII
-XIII
-XIV
-XV
-XVI
-XVII
-XVIII
-XIX
-XX
-i
-ii
-iii
-iv
-v
-vi
-vii
-viii
-ix
-x
-xi
-xii
-xiii
-xiv
-xv
-xvi
-xvii
-xviii
-xix
-xx
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Art
-Ca
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-DRA
-Dr
-Dra
-Dras
-Drs
-Eng
-Enga
-Engas
-Engos
-Ex
-Exo
-Exmo
-Fig
-Gen
-Hosp
-Insp
-Lda
-MM
-MR
-MRS
-MS
-Maj
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-Sra
-Sras
-Srs
-Sto
-Supt
-Surg
-adj
-adm
-adv
-art
-cit
-col
-con
-corp
-cpl
-dr
-dra
-dras
-drs
-eng
-enga
-engas
-engos
-ex
-exo
-exmo
-fig
-op
-prof
-sr
-sra
-sras
-srs
-sto
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-p #NUMERIC_ONLY#
-pp #NUMERIC_ONLY#
-
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ro b/nonbreaking_prefixes/nonbreaking_prefix.ro
deleted file mode 100644
index d489f46..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.ro
+++ /dev/null
@@ -1,38 +0,0 @@
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-dpdv
-etc
-șamd
-M.Ap.N
-dl
-Dl
-d-na
-D-na
-dvs
-Dvs
-pt
-Pt
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ru b/nonbreaking_prefixes/nonbreaking_prefix.ru
deleted file mode 100644
index 444465b..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.ru
+++ /dev/null
@@ -1,259 +0,0 @@
-TBD: Russian uppercase alphabet [А-Я]
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-0гг
-1гг
-2гг
-3гг
-4гг
-5гг
-6гг
-7гг
-8гг
-9гг
-0г
-1г
-2г
-3г
-4г
-5г
-6г
-7г
-8г
-9г
-Xвв
-Vвв
-Iвв
-Lвв
-Mвв
-Cвв
-Xв
-Vв
-Iв
-Lв
-Mв
-Cв
-0м
-1м
-2м
-3м
-4м
-5м
-6м
-7м
-8м
-9м
-0мм
-1мм
-2мм
-3мм
-4мм
-5мм
-6мм
-7мм
-8мм
-9мм
-0см
-1см
-2см
-3см
-4см
-5см
-6см
-7см
-8см
-9см
-0дм
-1дм
-2дм
-3дм
-4дм
-5дм
-6дм
-7дм
-8дм
-9дм
-0л
-1л
-2л
-3л
-4л
-5л
-6л
-7л
-8л
-9л
-0км
-1км
-2км
-3км
-4км
-5км
-6км
-7км
-8км
-9км
-0га
-1га
-2га
-3га
-4га
-5га
-6га
-7га
-8га
-9га
-0кг
-1кг
-2кг
-3кг
-4кг
-5кг
-6кг
-7кг
-8кг
-9кг
-0т
-1т
-2т
-3т
-4т
-5т
-6т
-7т
-8т
-9т
-0г
-1г
-2г
-3г
-4г
-5г
-6г
-7г
-8г
-9г
-0мг
-1мг
-2мг
-3мг
-4мг
-5мг
-6мг
-7мг
-8мг
-9мг
-бульв
-в
-вв
-г
-га
-гг
-гл
-гос
-д
-дм
-доп
-др
-е
-ед
-ед
-зам
-и
-инд
-исп
-Исп
-к
-кап
-кг
-кв
-кл
-км
-кол
-комн
-коп
-куб
-л
-лиц
-лл
-м
-макс
-мг
-мин
-мл
-млн
-млрд
-мм
-н
-наб
-нач
-неуд
-ном
-о
-обл
-обр
-общ
-ок
-ост
-отл
-п
-пер
-перераб
-пл
-пос
-пр
-просп
-проф
-р
-ред
-руб
-с
-сб
-св
-см
-соч
-ср
-ст
-стр
-т
-тел
-Тел
-тех
-тт
-туп
-тыс
-уд
-ул
-уч
-физ
-х
-хор
-ч
-чел
-шт
-экз
-э
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sk b/nonbreaking_prefixes/nonbreaking_prefix.sk
deleted file mode 100644
index 1198d48..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.sk
+++ /dev/null
@@ -1,474 +0,0 @@
-Bc
-Mgr
-RNDr
-PharmDr
-PhDr
-JUDr
-PaedDr
-ThDr
-Ing
-MUDr
-MDDr
-MVDr
-Dr
-ThLic
-PhD
-ArtD
-ThDr
-Dr
-DrSc
-CSs
-prof
-obr
-Obr
-Č
-č
-absol
-adj
-admin
-adr
-Adr
-adv
-advok
-afr
-ak
-akad
-akc
-akuz
-et
-al
-alch
-amer
-anat
-angl
-Angl
-anglosas
-anorg
-ap
-apod
-arch
-archeol
-archit
-arg
-art
-astr
-astrol
-astron
-atp
-atď
-austr
-Austr
-aut
-belg
-Belg
-bibl
-Bibl
-biol
-bot
-bud
-bás
-býv
-cest
-chem
-cirk
-csl
-čs
-Čs
-dat
-dep
-det
-dial
-diaľ
-dipl
-distrib
-dokl
-dosl
-dopr
-dram
-duš
-dv
-dvojčl
-dór
-ekol
-ekon
-el
-elektr
-elektrotech
-energet
-epic
-est
-etc
-etonym
-eufem
-európ
-Európ
-ev
-evid
-expr
-fa
-fam
-farm
-fem
-feud
-fil
-filat
-filoz
-fi
-fon
-form
-fot
-fr
-Fr
-franc
-Franc
-fraz
-fut
-fyz
-fyziol
-garb
-gen
-genet
-genpor
-geod
-geogr
-geol
-geom
-germ
-gr
-Gr
-gréc
-Gréc
-gréckokat
-hebr
-herald
-hist
-hlav
-hosp
-hromad
-hud
-hypok
-ident
-i.e
-ident
-imp
-impf
-indoeur
-inf
-inform
-instr
-int
-interj
-inšt
-inštr
-iron
-jap
-Jap
-jaz
-jedn
-juhoamer
-juhových
-juhozáp
-juž
-kanad
-Kanad
-kanc
-kapit
-kpt
-kart
-katastr
-knih
-kniž
-komp
-konj
-konkr
-kozmet
-krajč
-kresť
-kt
-kuch
-lat
-latinskoamer
-lek
-lex
-lingv
-lit
-litur
-log
-lok
-max
-Max
-maď
-Maď
-medzinár
-mest
-metr
-mil
-Mil
-min
-Min
-miner
-ml
-mld
-mn
-mod
-mytol
-napr
-nar
-Nar
-nasl
-nedok
-neg
-negat
-neklas
-nem
-Nem
-neodb
-neos
-neskl
-nesklon
-nespis
-nespráv
-neved
-než
-niekt
-niž
-nom
-náb
-nákl
-námor
-nár
-obch
-obj
-obv
-obyč
-obč
-občian
-odb
-odd
-ods
-ojed
-okr
-Okr
-opt
-opyt
-org
-os
-osob
-ot
-ovoc
-par
-part
-pejor
-pers
-pf
-Pf 
-P.f
-p.f
-pl
-Plk
-pod
-podst
-pokl
-polit
-politol
-polygr
-pomn
-popl
-por
-porad
-porov
-posch
-potrav
-použ
-poz
-pozit
-poľ
-poľno
-poľnohosp
-poľov
-pošt
-pož
-prac
-predl
-pren
-prep
-preuk
-priezv
-Priezv
-privl
-prof
-práv
-príd
-príj
-prík
-príp
-prír
-prísl
-príslov
-príč
-psych
-publ
-pís
-písm
-pôv
-refl
-reg
-rep
-resp
-rozk
-rozlič
-rozpráv
-roč
-Roč
-ryb
-rádiotech
-rím
-samohl
-semest
-sev
-severoamer
-severových
-severozáp
-sg
-skr
-skup
-sl
-Sloven
-soc
-soch
-sociol
-sp
-spol
-Spol
-spoloč
-spoluhl
-správ
-spôs
-st
-star
-starogréc
-starorím
-s.r.o
-stol
-stor
-str
-stredoamer
-stredoškol
-subj
-subst
-superl
-sv
-sz
-súkr
-súp
-súvzť
-tal
-Tal
-tech
-tel
-Tel
-telef
-teles
-telev
-teol
-trans
-turist
-tuzem
-typogr
-tzn
-tzv
-ukaz
-ul
-Ul
-umel
-univ
-ust
-ved
-vedľ
-verb
-veter
-vin
-viď
-vl
-vod
-vodohosp
-pnl
-vulg
-vyj
-vys
-vysokoškol
-vzťaž
-vôb
-vých
-výd
-výrob
-výsk
-výsl
-výtv
-výtvar
-význ
-včel
-vš
-všeob
-zahr
-zar
-zariad
-zast
-zastar
-zastaráv
-zb
-zdravot
-združ
-zjemn
-zlat
-zn
-Zn
-zool
-zr
-zried
-zv
-záhr
-zák
-zákl
-zám
-záp
-západoeur
-zázn
-územ
-účt
-čast
-čes
-Čes
-čl
-čísl
-živ
-pr
-fak
-Kr
-p.n.l
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sl b/nonbreaking_prefixes/nonbreaking_prefix.sl
deleted file mode 100644
index 230062c..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.sl
+++ /dev/null
@@ -1,78 +0,0 @@
-dr
-Dr
-itd
-itn
-št #NUMERIC_ONLY#
-Št #NUMERIC_ONLY#
-d
-jan
-Jan
-feb
-Feb
-mar
-Mar
-apr
-Apr
-jun
-Jun
-jul
-Jul
-avg
-Avg
-sept
-Sept
-sep
-Sep
-okt
-Okt
-nov
-Nov
-dec
-Dec
-tj
-Tj
-npr
-Npr
-sl
-Sl
-op
-Op
-gl
-Gl
-oz
-Oz
-prev
-dipl
-ing
-prim
-Prim
-cf
-Cf
-gl
-Gl
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sv b/nonbreaking_prefixes/nonbreaking_prefix.sv
deleted file mode 100644
index df5ef29..0000000
--- a/nonbreaking_prefixes/nonbreaking_prefix.sv
+++ /dev/null
@@ -1,46 +0,0 @@
-#single upper case letter are usually initials
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-#misc abbreviations
-AB
-G
-VG
-dvs
-etc
-from
-iaf
-jfr
-kl
-kr
-mao
-mfl
-mm
-osv
-pga
-tex
-tom
-vs
diff --git a/norm b/norm
index 5573c37..3313216 100755
--- a/norm
+++ b/norm
@@ -5,4 +5,3 @@ sum=$(cat $1 | sum)
 for i in `cat $1`; do
   echo "$i" | div $sum
 done
-
diff --git a/norm-german b/norm-german
index 85a39da..5c41f98 100755
--- a/norm-german
+++ b/norm-german
@@ -1,23 +1,23 @@
 #!/usr/bin/env ruby
 
-require 'thread'
-require 'optimist'
+require "thread"
+require "optimist"
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 
 conf = Optimist::options do
   banner "norm_german < <file w/ lowercased tokens>"
   opt :upper, "uppercase", :type => :bool, :default => false
-  opt :threads, "#threads", :type => :int, :default => 1, :short => '-h'
+  opt :threads, "#threads", :type => :int, :default => 1, :short => "-h"
   opt :shard_size, "shard size", :type => :int, :default => 1000
   opt :train, "train", :type => :bool
   opt :apply, "apply", :type => :bool
 end
 
-pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ]
-pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ]
+pairs_lower = [ ["ß","ss"], ["ue", "ü"], ["ae","ä"], ["oe", "ö"] ]
+pairs_upper = [ ["Ä", "Ae"], ["Ö", "Oe"], ["Ü", "Ue"] ]
 if conf[:upper]
   PAIRS = pairs_lower
 else
@@ -84,4 +84,3 @@ token_stock.each { |i|
     h.merge! build_partial i
   end
 }
-
diff --git a/norm-hyphens b/norm-hyphens
index 4a152a1..6491d13 100755
--- a/norm-hyphens
+++ b/norm-hyphens
@@ -1,4 +1,3 @@
-#!/bin/zsh -x
+#!/bin/zsh
 
 sed "s|[ \t]\+\xc2\xad[ \t]\+||g"
-
diff --git a/normchr b/normchr
index f8e5798..02c6ce8 100755
--- a/normchr
+++ b/normchr
@@ -3,10 +3,10 @@
 # http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
 # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
 
-require 'htmlentities'
+require "htmlentities"
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 coder = HTMLEntities.new
 
@@ -24,7 +24,7 @@ while line = STDIN.gets
   line.gsub! /[\u{e000}-\u{f8ff}]/, " " # UTF-8 PUA
   line.gsub! /[\u{f0000}-\u{ffffd}]/, " "
   line.gsub! /[\u{100000}-\u{10fffd}]/, " "
-  line.gsub! "\r", " "                  # carriage return  
+  line.gsub! "\r", " "                  # carriage return
   line.gsub! /[\u{2000}-\u{200f}]/, " " #                   EN QUAD -- RIGHT-TO-LEFT MARK
   line.gsub! /[\u{2028}-\u{202f}]/, " " #            LINE SEPARATOR -- NARROW NO-BREAK SPACE
   line.gsub! /[\u{205f}-\u{206f}]/, " " # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
@@ -32,4 +32,3 @@ while line = STDIN.gets
   line.gsub! /[[:space:]]+/, " "        # collapse space
   puts coder.decode(line)
 end
-
diff --git a/num-tok b/num-tok
index 56cbae9..0c95aa8 100755
--- a/num-tok
+++ b/num-tok
@@ -1,9 +1,8 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 while line = STDIN.gets
   puts line.strip.split.length
 end
-
diff --git a/odd b/odd
index 0bd9336..ced2861 100755
--- a/odd
+++ b/odd
@@ -1,11 +1,10 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 i = 1
 while line = STDIN.gets
   puts line if i%2!=0
   i+=1
 end
-
diff --git a/overlap b/overlap
index 81f9c4b..95d27a3 100755
--- a/overlap
+++ b/overlap
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 a = {}
 a.default = 0
@@ -11,10 +11,9 @@ ReadFile.readlines_strip(ARGV[1]).map { |segment| b[segment] += 1 }
 
 overlap = 0
 a.each_key { |seg|
-  puts b[seg] 
-  overlap = overlap+b[seg] 
+  puts b[seg]
+  overlap = overlap+b[seg]
 }
 
 puts "---"
 puts overlap
-
diff --git a/paste-pairs b/paste-pairs
index f6b8b31..7e08329 100755
--- a/paste-pairs
+++ b/paste-pairs
@@ -1,10 +1,8 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 
 import sys
-from itertools import izip
-
-for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
-  print linenr, (src_line.strip())
-  print linenr, (tgt_line.strip())
-  print
 
+for linenr, (src_line, tgt_line) in enumerate(zip(open(sys.argv[1]), open(sys.argv[2]))):
+    print(linenr, src_line.strip())
+    print(linenr, tgt_line.strip())
+    print()
diff --git a/per-sentence-bleu b/per-sentence-bleu
index 257eb3a..d815dc9 100755
--- a/per-sentence-bleu
+++ b/per-sentence-bleu
@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def main
   conf = Optimist::options do
-    opt :input, "input", :type => :string, :default => '-'
+    opt :input, "input", :type => :string, :default => "-"
     opt :references, "references", :type => :string, :required => true
     opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0
     opt :n, "N", :default => 4
@@ -16,7 +16,7 @@ def main
   input = ReadFile.new conf[:input]
   while line = input.gets
     i += 1
-    if line.strip == ''
+    if line.strip == ""
       puts 0.0
       next
     end
@@ -26,4 +26,3 @@ def main
 end
 
 main
-
diff --git a/per-sentence-bleu-kbest b/per-sentence-bleu-kbest
index dad1607..12a9f6f 100755
--- a/per-sentence-bleu-kbest
+++ b/per-sentence-bleu-kbest
@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def main
   conf = Optimist::options do
-    opt :kbests, "kbests", :type => :string, :default => '-'
+    opt :kbests, "kbests", :type => :string, :default => "-"
     opt :references, "references", :type => :string, :required => true
   end
   refs = ReadFile.new conf[:references]
@@ -19,7 +19,7 @@ def main
     scores.each_with_index { |x,j|
       puts "#{j+1} ||| #{scores[j]} ||| #{list[j]}"
       if scores[j]==max && !o
-        puts "^^^ #{j+1} #{max}" 
+        puts "^^^ #{j+1} #{max}"
         o = true
       end
     }
@@ -29,4 +29,3 @@ def main
 end
 
 main
-
diff --git a/per-sentence-ter b/per-sentence-ter
index 1a7670e..777d39c 100755
--- a/per-sentence-ter
+++ b/per-sentence-ter
@@ -1,14 +1,14 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
-require 'tempfile'
+require "zipf"
+require "optimist"
+require "tempfile"
 
 def main
   conf = Optimist::options do
-    opt :input, "input", :type => :string, :default => '-'
+    opt :input, "input", :type => :string, :default => "-"
     opt :references, "references", :type => :string, :required => true
-    opt :mteval_bin, "cdec's mteval/fast_score", :type => :string, :default => '`/toolbox/cdec-dtrain/mteval/fast_score'
+    opt :mteval_bin, "cdec mteval/fast_score", :type => :string, :default => "`/toolbox/cdec-dtrain/mteval/fast_score"
   end
 
   refs = ReadFile.readlines_strip conf[:references]
@@ -17,8 +17,8 @@ def main
   while line = input.gets
     line.strip!
     i += 1
-    a = Tempfile.new 'pster'
-    b = Tempfile.new 'pster'
+    a = Tempfile.new "pster"
+    b = Tempfile.new "pster"
     a.write line+"\n"
     b.write refs[i]+"\n"
     a.close; b.close
@@ -30,4 +30,3 @@ def main
 end
 
 main
-
diff --git a/percentile b/percentile
index ba9ceb0..ec42a9a 100755
--- a/percentile
+++ b/percentile
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 data = []
 while line = STDIN.gets
@@ -18,4 +18,3 @@ if index.to_i == index
 else
   puts (data[index.to_i-1] + data[index.to_i]) / 2.0
 end
-
diff --git a/pot b/pot
index 24acabe..b703bca 100755
--- a/pot
+++ b/pot
@@ -4,4 +4,3 @@ pow = ARGV[0].to_f
 while line = STDIN.gets
   puts line.to_f**pow
 end
-
diff --git a/preprocess b/preprocess
index a46b0a8..91de3bb 100755
--- a/preprocess
+++ b/preprocess
@@ -1,9 +1,8 @@
 #!/bin/bash
 
-pushd `dirname $0` > /dev/null
-P=`pwd -P`
+pushd "$(dirname "$0")" > /dev/null
+P="$(pwd -P)"
 popd > /dev/null
 
 LANG=$1
 $P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | $P/lowercase.perl 2>lowercase.$LANG.err
-
diff --git a/preprocess-no-lower b/preprocess-no-lower
index afd87e9..7e3ad91 100755
--- a/preprocess-no-lower
+++ b/preprocess-no-lower
@@ -1,9 +1,8 @@
 #!/bin/bash
 
-pushd `dirname $0` > /dev/null
-P=`pwd -P`
+pushd "$(dirname "$0")" > /dev/null
+P="$(pwd -P)"
 popd > /dev/null
 
 LANG=$1
 $P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err
-
diff --git a/pt-bloom b/pt-bloom
index 35234f1..b38939d 100755
--- a/pt-bloom
+++ b/pt-bloom
@@ -1,10 +1,10 @@
 #!/usr/bin/env ruby
 
-require 'bloom-filter'
-require 'optimist'
+require "bloom-filter"
+require "optimist"
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 conf = Optimist::options do
   opt :size, "number of entries in the filter", :type => :int, :required => true
@@ -19,6 +19,5 @@ while line = STDIN.gets
   f.insert(src+" ||| "+tgt)
 end
 
-f.dump('pt.bloom')
+f.dump("pt.bloom")
 f.close
-
diff --git a/push-rules b/push-rules
index c97ab80..d0a4de7 100755
--- a/push-rules
+++ b/push-rules
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 a = ReadFile.readlines_strip ARGV[0]
 h = {}
@@ -21,4 +21,3 @@ while line = STDIN.gets
     puts line
   end
 end
-
diff --git a/remove-devtest b/remove-devtest
index 8e026f9..f322a6e 100755
--- a/remove-devtest
+++ b/remove-devtest
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 train_src = ReadFile.new ARGV[0]
 train_tgt = ReadFile.new ARGV[1]
@@ -39,7 +39,7 @@ while line_src = train_src.gets
     line_src_downcase = line_src
     line_tgt_downcase = line_tgt
   end
-  
+
   if not devtest_h_src.has_key? line_src_downcase and not devtest_h_src.has_key? line_tgt_downcase \
  and not devtest_h_tgt.has_key? line_src_downcase and not devtest_h_tgt.has_key? line_tgt_downcase
     train_src_out.write line_src
diff --git a/remove-test-from-bitext b/remove-test-from-bitext
index 43038d3..911a893 100755
--- a/remove-test-from-bitext
+++ b/remove-test-from-bitext
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 test_source = ReadFile.new ARGV[0]
 test_target = ReadFile.new ARGV[1]
@@ -13,7 +13,7 @@ while test_source_line = test_source.gets
   test_source_line.strip!
   test_target_line = test_target.gets
   test_target_line.strip!
-  
+
   all_test_source_lines[test_source_line] = true
   all_test_target_lines[test_target_line] = true
 end
diff --git a/repetition-rate b/repetition-rate
index 87938ae..12e0fab 100755
--- a/repetition-rate
+++ b/repetition-rate
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 windows = []
 cur = []
@@ -9,7 +9,7 @@ while line = STDIN.gets
   if cur_sz >= 1000
     windows << cur
     cur = []
-    cur_sz = 0 
+    cur_sz = 0
   end
   cur << line.strip
   cur_sz += cur.last.split.size
@@ -37,8 +37,7 @@ windows.each { |w|
 
 rr = 1.0
 enums.each_with_index { |i,j|
-  rr *= i/denoms[j] 
+  rr *= i/denoms[j]
 }
 
 puts ((rr**0.25)*100).round 2
-
diff --git a/round b/round
index dfef800..55919d7 100755
--- a/round
+++ b/round
@@ -4,4 +4,3 @@ r = ARGV[0].to_i
 while line = STDIN.gets
   puts line.to_f.round r
 end
-
diff --git a/rule-shapes b/rule-shapes
index 589a670..91f8092 100755
--- a/rule-shapes
+++ b/rule-shapes
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 def shape s
   res  = []
@@ -24,6 +24,5 @@ end
 while line = STDIN.gets
   f, e = line.split(/\t/)
   f.strip!; e.strip!
-  puts shape(f).join('_')+"-"+shape(e).join('_')
+  puts shape(f).join("_")+"-"+shape(e).join("_")
 end
-
diff --git a/sample b/sample
index aa46ddb..dcef148 100755
--- a/sample
+++ b/sample
@@ -1,15 +1,15 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 opts = Optimist::options do
   banner "sample --size <n> [--shuffle] --file <line separated data>"
   opt :size, "Sample P % or # lines from file or N.", :type => :float
   opt :shuffle, "Sample is shuffled.", :type => :bool
-  opt :file, "Input file.", :type => :string, :default => '-'
+  opt :file, "Input file.", :type => :string, :default => "-"
   opt :output_index, "Output index number.", :type => :bool
   opt :N, "Sample --size from N items.", :type => :int, :default => -1
   opt :absolute, "Sample absolute number of items.", :type => :bool
@@ -19,10 +19,10 @@ input = []
 index = []
 i = 0
 if opts[:N] == -1
-  if opts[:file] == '-'
+  if opts[:file] == "-"
     file = STDIN
   else
-    file = File.new opts[:file], 'r'
+    file = File.new opts[:file], "r"
   end
   while line = file.gets
     input << line
@@ -36,7 +36,6 @@ end
 sample = []
 if !opts[:absolute]
   sample = index.sample(index.size*(opts[:size]/100.0))
-  sample = index.sample(index.size*(opts[:size]/100.0))
 else
   sample = index.sample(opts[:size])
 end
@@ -56,4 +55,3 @@ while idx = sample.shift
     end
   end
 end
-
diff --git a/select b/select
index 36e4256..2c5616a 100755
--- a/select
+++ b/select
@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
-require 'zipf'
+require "optimist"
+require "zipf"
 
 opts = Optimist::options do
   banner "sample --index <n> [--shuffle] [--file <line separated data>]"
-  opt :file, "Input file.", :type => :string, :default => '-'
+  opt :file, "Input file.", :type => :string, :default => "-"
   opt :index, "Index file.", :type => :string, :required => true
 end
 
@@ -15,4 +15,3 @@ index = ReadFile.readlines_strip(opts[:index]).map{ |i| i.to_i }
 index.each { |i|
   puts input[i]
 }
-
diff --git a/select-from b/select-from
index 0ccfeac..e9a394d 100755
--- a/select-from
+++ b/select-from
@@ -1,13 +1,13 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
-require 'zipf'
+require "optimist"
+require "zipf"
 
 opts = Optimist::options do
   banner "select_from [--invert] -i <file> < <line separated data>"
-  opt :index,  "Line numbers to output.", :type => :string, :short => '-i', :required => true
-  opt :invert, "Invert selection.", :type => :bool, :short => '-j', :default => false
-  opt :from1,  "Index starting from 1.", :type => :bool, :short => '-k', :default => false
+  opt :index,  "Line numbers to output.", :type => :string, :short => "-i", :required => true
+  opt :invert, "Invert selection.", :type => :bool, :short => "-j", :default => false
+  opt :from1,  "Index starting from 1.", :type => :bool, :short => "-k", :default => false
 end
 
 accept = {}
@@ -30,4 +30,3 @@ while line = STDIN.gets
   end
   i += 1
 end
-
diff --git a/sentencepiece-decode b/sentencepiece-decode
index 5e07ffa..e715d09 100755
--- a/sentencepiece-decode
+++ b/sentencepiece-decode
@@ -1,9 +1,8 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 while line = STDIN.gets
   line = line.split.join ""
   puts line.gsub "▁", " "
 end
-
diff --git a/shard b/shard
index 5294afd..4b639c5 100755
--- a/shard
+++ b/shard
@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
 def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
   lc = `wc -l #{input}`.split.first.to_i
-  input_ext = input.split('.').last
-  refs_ext = refs.split('.').last
+  input_ext = input.split(".").last
+  refs_ext = refs.split(".").last
   index = (0..lc-1).to_a
   index.reverse!
   index.shuffle! if rand
@@ -68,13 +68,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false
 end
 
 opts = Optimist::options do
-  opt :input, 'input', :type => :string, :required => true
-  opt :references, 'references', :type => :string, :required => true
-  opt :alignments, 'alignments', :type => :string, :required => true
-  opt :output_prefix, 'output prefix', :type => :string, :required => true
-  opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
-  opt :num_shards, 'number of shards', :type => :int, :required => true
+  opt :input, "input", :type => :string, :required => true
+  opt :references, "references", :type => :string, :required => true
+  opt :alignments, "alignments", :type => :string, :required => true
+  opt :output_prefix, "output prefix", :type => :string, :required => true
+  opt :randomize, "randomize", :type => :bool, :default => false, :short => "-z"
+  opt :num_shards, "number of shards", :type => :int, :required => true
 end
 
 make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
-
diff --git a/sort-features b/sort-features
index 88bd779..a91fb00 100755
--- a/sort-features
+++ b/sort-features
@@ -7,4 +7,3 @@ while line = STDIN.gets
 end
 
 h.sort_by { |name, value| -value }.each { |name, value| puts "#{name}\t#{value}" }
-
diff --git a/source-sides b/source-sides
index b4490c6..9243f17 100755
--- a/source-sides
+++ b/source-sides
@@ -1,4 +1,3 @@
-#!/bin/zsh -x
+#!/bin/zsh
 
 split_pipes -f 2 | sort | uniq | sed "s| |_|g" | sed "s|\[X,[12]\]|NX|g"
-
diff --git a/split-kbest b/split-kbest
index ab425b0..52773e8 100755
--- a/split-kbest
+++ b/split-kbest
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 def write_kbest l, fn
   f = WriteFile.new fn
@@ -21,4 +21,3 @@ while line = STDIN.gets
   l << line
 end
 write_kbest l, "#{dir}/#{i}.gz" # last one
-
diff --git a/split-lines b/split-lines
index 14b3a0f..0d036c3 100755
--- a/split-lines
+++ b/split-lines
@@ -1,14 +1,13 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 dir = ARGV[0]
 i = 0
 while line = STDIN.gets
   src, tgt = line.split " ||| "
-  f = WriteFile.new "#{dir}/#{i}.src" 
+  f = WriteFile.new "#{dir}/#{i}.src"
   f.write line
   f.close
   i += 1
 end
-
diff --git a/split-pipes b/split-pipes
index 862e8be..58dcac4 100755
--- a/split-pipes
+++ b/split-pipes
@@ -1,9 +1,9 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 conf = Optimist::options do
   banner "splitpipes -f <n> < <input>"
@@ -32,10 +32,10 @@ end
 
 while line = STDIN.gets
   j = 1
-  line.strip.split(' ||| ').each { |i|
+  line.strip.split(" ||| ").each { |i|
     if range && (conf[:field]..conf[:to]).include?(j)
       a << i.strip
-    elsif j == conf[:field] 
+    elsif j == conf[:field]
       puts i.strip
       break
     end
@@ -46,6 +46,3 @@ while line = STDIN.gets
   end
   a.clear
 end
-
-
-
diff --git a/sqrt b/sqrt
index d0a67b1..39382e6 100755
--- a/sqrt
+++ b/sqrt
@@ -3,4 +3,3 @@
 while line = STDIN.gets
   puts Math.sqrt line.to_f
 end
-
diff --git a/stanford-parser-run b/stanford-parser-run
index f8d4210..37efacd 100755
--- a/stanford-parser-run
+++ b/stanford-parser-run
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 if [ $# != 1 ]; then
-	echo "$0 text-file" 
+	echo "$0 text-file"
 	exit 1
 fi
 
@@ -10,4 +10,3 @@ export CLASSPATH=:/toolbox/stanfordparser_3_2_0/*
 IN=$1
 
 cat $IN | java -server -mx25000m  edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp
-
diff --git a/stddev b/stddev
index 15c245e..1b24bb5 100755
--- a/stddev
+++ b/stddev
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
 conf = Optimist::options do
   banner "stddev [-r <d>] < <one number per line>"
@@ -37,4 +37,3 @@ if conf[:round] >= 0
 else
   puts stddev
 end
-
diff --git a/strips b/strips
index 11c00b4..05d41cb 100755
--- a/strips
+++ b/strips
@@ -3,4 +3,3 @@
 while line = STDIN.gets
   puts line.strip
 end
-
diff --git a/substract b/substract
deleted file mode 100755
index 212b6da..0000000
--- a/substract
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-f = ReadFile.new ARGV[0]
-g = ReadFile.new ARGV[1]
-
-while line1 = f.gets
-  line2 = g.gets
-  d = line1.to_f - line2.to_f
-  puts d
-end
-
diff --git a/subtract b/subtract
new file mode 100755
index 0000000..ecd6c11
--- /dev/null
+++ b/subtract
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+
+require "zipf"
+
+# Print, line by line, the number in ARGV[0] minus the number in ARGV[1].
+minuend = ReadFile.new ARGV[0]
+subtrahend = ReadFile.new ARGV[1]
+while (left = minuend.gets)
+  right = subtrahend.gets
+  # to_f turns unparsable text (and nil) into 0.0 instead of raising
+  puts left.to_f - right.to_f
+end
diff --git a/sum b/sum
index acfa563..a3502e6 100755
--- a/sum
+++ b/sum
@@ -6,4 +6,3 @@ while line = STDIN.gets
 end
 
 puts sum
-
diff --git a/tc b/tc
index 7eefdd5..dd16fdf 100755
--- a/tc
+++ b/tc
@@ -1,8 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
+require "zipf"
 
 while line = STDIN.gets
   puts tokenize(line.strip).size
 end
-
diff --git a/tf-idf b/tf-idf
index 22c3dac..02f4c7b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 def main
   conf = Optimist::options do
@@ -15,8 +15,8 @@ def main
   stopwords = []
   if conf[:filter_stopwords]
     stopwords = ReadFile.readlines(conf[:filter_stopwords]).map{ |i|
-      i.split('|').first.strip
-    }.reject{ |i| i=='' }
+      i.split("|").first.strip
+    }.reject{ |i| i=="" }
   end
 
   docs = {}
@@ -54,4 +54,3 @@ def main
 end
 
 main
-
diff --git a/tmx-extract b/tmx-extract
new file mode 100755
index 0000000..7791eb6
--- /dev/null
+++ b/tmx-extract
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+#
+# Adapted from Apertium
+# http://wiki.apertium.org/wiki/Tools_for_TMX
+#
+
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+
+import sys
+import codecs
+
+class TMXHandler(ContentHandler):  # SAX handler that writes one line per matching <tu> to each language file
+    def __init__ (self, slang, tlang, sfile, tfile):
+        self.pair = set([slang, tlang])  # the languages a <tu> must contain, no more and no fewer
+        self.files = {}  # language code -> object whose write() receives that language's lines
+        self.files[slang] = sfile
+        self.files[tlang] = tfile
+        self.inTag = ""  # name of the last tu/note/tuv/seg element opened; never cleared on close
+        self.note = ""
+        self.tuid = ""
+        self.type = ""
+        self.cur_pair = set()  # xml:lang values seen so far in the current <tu>
+        self.cur_lang = ""  # xml:lang of the most recently opened <tuv>
+        self.seg = {}  # language code -> text gathered from that language's latest <seg>
+        self.seg[slang] = ""
+        self.seg[tlang] = ""
+
+    def startElement(self, name, attrs):
+        """Track the element being entered: <tu> resets the language set, <seg> resets its text buffer."""
+        if name == "tu":
+            self.cur_pair = set()
+            self.inTag = "tu"
+            self.tuid = attrs.get("tuid", "")
+            self.type = attrs.get("datatype", "")
+        elif name == "note":
+            self.inTag = "note"
+            self.note = ""
+        elif name == "tuv":
+            self.inTag = "tuv"
+            self.cur_lang = attrs.get("xml:lang", "")
+            self.cur_pair.add(self.cur_lang)
+        elif name == "seg":
+            self.inTag = "seg"
+            if self.cur_lang in self.pair:
+                self.seg[self.cur_lang] = ""
+
+    def characters (self, c):  # appends c, since text may be delivered in several chunks
+        if self.inTag == "note":
+            self.note += c
+        elif self.inTag == "seg" and self.cur_lang in self.pair:
+            self.seg[self.cur_lang] += c
+
+    def endElement(self, name):  # on </tu>, emit the pair if exactly the requested languages were seen
+        if name == "tu" and self.pair == self.cur_pair:
+            for lang in self.cur_pair:
+                self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip()))
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # argv holds the script name plus <file> <slang> <tlang>: four entries are required.
+    if len(sys.argv) < 4:
+        print(f"Usage: {sys.argv[0]} <file> <slang> <tlang>")
+        print()
+        sys.exit(-1)
+    # Each language is written next to the input, as <file>.<slang> and <file>.<tlang>.
+    sfile_path = f"{sys.argv[1]}.{sys.argv[2]}"
+    tfile_path = f"{sys.argv[1]}.{sys.argv[3]}"
+
+    with open(sfile_path, "w+") as sfile, open(tfile_path, "w+") as tfile:
+        curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
+        parser.setContentHandler(curHandler)
+        with open(sys.argv[1], "r") as tmx_file:
+            parser.parse(tmx_file)
diff --git a/tmx-extract-original-py2 b/tmx-extract-original-py2
new file mode 100755
index 0000000..eb39d1d
--- /dev/null
+++ b/tmx-extract-original-py2
@@ -0,0 +1,75 @@
+#!/usr/bin/python2
+#
+# Adapted from Apertium
+# http://wiki.apertium.org/wiki/Tools_for_TMX
+#
+
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+
+import sys
+import codecs
+
+class TMXHandler(ContentHandler):
+    def __init__ (self, slang, tlang, sfile, tfile):
+        self.pair = set([slang, tlang])
+        self.files = {}
+        self.files[slang] = sfile
+        self.files[tlang] = tfile
+        self.inTag = ''
+        self.note = ''
+        self.tuid = ''
+        self.type = ''
+        self.cur_pair = set()
+        self.cur_lang = ''
+        self.seg = {}
+        self.seg[slang] = ''
+        self.seg[tlang] = ''
+
+    def startElement(self, name, attrs):
+
+        if name == 'tu':
+            self.cur_pair = set()
+            self.inTag = 'tu'
+            self.tuid = attrs.get('tuid','')
+            self.type = attrs.get('datatype','')
+        elif name == 'note':
+            self.inTag = 'note'
+            self.note = ""
+        elif name == 'tuv':
+            self.inTag = 'tuv'
+            self.cur_lang = attrs.get('xml:lang', '')
+            self.cur_pair.add(self.cur_lang)
+        elif name == 'seg':
+            self.inTag = 'seg'
+            if self.cur_lang in self.pair:
+                self.seg[self.cur_lang] = ''
+
+    def characters (self, c):
+        if self.inTag == 'note':
+            self.note += c
+        elif self.inTag == 'seg' and self.cur_lang in self.pair:
+            self.seg[self.cur_lang] += c
+
+    def endElement(self, name):
+        if name == 'tu' and self.pair == self.cur_pair:
+            for lang in self.cur_pair:
+                self.files[lang].write(self.seg[lang].encode('utf-8').replace("\n", " ").strip()+"\n")
+
+parser = make_parser()
+
+if len(sys.argv) < 3:
+    print 'Usage: tmx-extract.py <file> <slang> <tlang>'
+    print ''
+    sys.exit(-1)
+
+sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+')
+tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+')
+curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
+
+parser.setContentHandler(curHandler)
+
+parser.parse(open(sys.argv[1]))
+
+sfile.close()
+tfile.close()
diff --git a/tmx-extract-original-py2.py b/tmx-extract-original-py2.py
deleted file mode 100755
index cbdb491..0000000
--- a/tmx-extract-original-py2.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/python2
-#
-# Adapted from Apertium
-# http://wiki.apertium.org/wiki/Tools_for_TMX
-#
-
-from xml.sax import make_parser
-from xml.sax.handler import ContentHandler
-
-import sys
-import codecs
-
-class TMXHandler(ContentHandler):
-    def __init__ (self, slang, tlang, sfile, tfile):
-        self.pair = set([slang, tlang])
-        self.files = {}
-        self.files[slang] = sfile
-        self.files[tlang] = tfile
-        self.inTag = ''
-        self.note = ''
-        self.tuid = ''
-        self.type = ''
-        self.cur_pair = set()
-        self.cur_lang = ''
-        self.seg = {}
-        self.seg[slang] = ''
-        self.seg[tlang] = ''
-
-    def startElement(self, name, attrs):
-
-        if name == 'tu':
-            self.cur_pair = set()
-            self.inTag = 'tu'
-            self.tuid = attrs.get('tuid','')
-            self.type = attrs.get('datatype','')
-        elif name == 'note':
-            self.inTag = 'note'
-            self.note = ""
-        elif name == 'tuv':
-            self.inTag = 'tuv'
-            self.cur_lang = attrs.get('xml:lang', '')
-            self.cur_pair.add(self.cur_lang)
-        elif name == 'seg':
-            self.inTag = 'seg'
-            if self.cur_lang in self.pair:
-                self.seg[self.cur_lang] = ''
-
-    def characters (self, c):
-        if self.inTag == 'note':
-            self.note += c
-        elif self.inTag == 'seg' and self.cur_lang in self.pair:
-            self.seg[self.cur_lang] += c
-
-    def endElement(self, name):
-        if name == 'tu' and self.pair == self.cur_pair:
-            for lang in self.cur_pair:
-                self.files[lang].write(self.seg[lang].encode('utf-8').replace("\n", " ").strip()+"\n")
-
-parser = make_parser()
-
-if len(sys.argv) < 3:
-    print 'Usage: tmx-extract.py <file> <slang> <tlang>'
-    print ''
-    sys.exit(-1)
-
-sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+')
-tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+')
-curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
-
-parser.setContentHandler(curHandler)
-
-parser.parse(open(sys.argv[1]))
-
-sfile.close()
-tfile.close()
-
diff --git a/tmx-extract.py b/tmx-extract.py
deleted file mode 100755
index 00f18f5..0000000
--- a/tmx-extract.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/python3
-#
-# Adapted from Apertium
-# http://wiki.apertium.org/wiki/Tools_for_TMX
-#
-
-from xml.sax import make_parser
-from xml.sax.handler import ContentHandler
-
-import sys
-import codecs
-
-class TMXHandler(ContentHandler):
-    def __init__ (self, slang, tlang, sfile, tfile):
-        self.pair = set([slang, tlang])
-        self.files = {}
-        self.files[slang] = sfile
-        self.files[tlang] = tfile
-        self.inTag = ''
-        self.note = ''
-        self.tuid = ''
-        self.type = ''
-        self.cur_pair = set()
-        self.cur_lang = ''
-        self.seg = {}
-        self.seg[slang] = ''
-        self.seg[tlang] = ''
-
-    def startElement(self, name, attrs):
-
-        if name == 'tu':
-            self.cur_pair = set()
-            self.inTag = 'tu'
-            self.tuid = attrs.get('tuid','')
-            self.type = attrs.get('datatype','')
-        elif name == 'note':
-            self.inTag = 'note'
-            self.note = ""
-        elif name == 'tuv':
-            self.inTag = 'tuv'
-            self.cur_lang = attrs.get('xml:lang', '')
-            self.cur_pair.add(self.cur_lang)
-        elif name == 'seg':
-            self.inTag = 'seg'
-            if self.cur_lang in self.pair:
-                self.seg[self.cur_lang] = ''
-
-    def characters (self, c):
-        if self.inTag == 'note':
-            self.note += c
-        elif self.inTag == 'seg' and self.cur_lang in self.pair:
-            self.seg[self.cur_lang] += c
-
-    def endElement(self, name):
-        if name == 'tu' and self.pair == self.cur_pair:
-            for lang in self.cur_pair:
-                self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip()))
-
-
-if __name__ == "__main__":
-    parser = make_parser()
-
-    if len(sys.argv) < 3:
-        print('Usage: tmx-extract.py <file> <slang> <tlang>')
-        print('')
-        sys.exit(-1)
-
-    sfile_path = sys.argv[1] + "." + sys.argv[2]
-    tfile_path = sys.argv[1] + "." + sys.argv[3]
-
-    with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile:
-        curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
-        parser.setContentHandler(curHandler)
-        with open(sys.argv[1], 'r') as tmx_file:
-            parser.parse(tmx_file)
-
diff --git a/tmx-to-plain b/tmx-to-plain
new file mode 100755
index 0000000..025d6e4
--- /dev/null
+++ b/tmx-to-plain
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+
+import argparse
+import datetime
+import sys
+
+from translate.storage.tmx import tmxfile
+
+
+def extract_from_tmx(tmx_file_path,
+                     src_out_path,
+                     tgt_out_path,
+                     begin_date,
+                     date,
+                     src_out_after,
+                     tgt_out_after):
+    """Write the units of a TMX file to line-aligned source and target text files.
+
+    Units whose "lastusagedate" is earlier than begin_date are skipped; units whose
+    "changedate" is later than date go to the *_after files instead of the main ones.
+    Either date may be None to disable its check. The *_after files are opened only
+    when both of their paths are given.
+    """
+    from contextlib import ExitStack  # function-scope import keeps the file header as is
+
+    with open(tmx_file_path, "rb") as in_fp:
+        tmx_file = tmxfile(in_fp)
+
+    # ExitStack closes every output, the optional ones included, even if a unit fails.
+    with ExitStack() as stack:
+        if src_out_after is not None and tgt_out_after is not None:
+            src_out_after_fp = stack.enter_context(open(src_out_after, "w"))
+            tgt_out_after_fp = stack.enter_context(open(tgt_out_after, "w"))
+        src_out_fp = stack.enter_context(open(src_out_path, "w"))
+        tgt_out_fp = stack.enter_context(open(tgt_out_path, "w"))
+
+        for index, node in enumerate(tmx_file.unit_iter()):
+            src_out_fp_, tgt_out_fp_ = src_out_fp, tgt_out_fp
+            if begin_date is not None:
+                date_string = node.get_target_dom().get("lastusagedate")[:8]
+                if datetime.datetime.strptime(date_string, "%Y%m%d").date() < begin_date:
+                    continue
+            if date is not None:
+                date_string = node.get_target_dom().get("changedate")[:8]
+                if datetime.datetime.strptime(date_string, "%Y%m%d").date() > date:
+                    src_out_fp_, tgt_out_fp_ = src_out_after_fp, tgt_out_after_fp
+
+            # One unit per line: embedded newlines become spaces, carriage returns vanish.
+            src_string = f"{node.source}".replace("\n", " ").replace("\r", "")
+            tgt_string = f"{node.target}".replace("\n", " ").replace("\r", "")
+            src_out_fp_.write(f"{src_string}\n")
+            tgt_out_fp_.write(f"{tgt_string}\n")
+
+            if (index + 1) % 1000 == 0:
+                sys.stdout.write(f"Processed {index + 1} lines\r")
+                sys.stdout.flush()
+
+
+def main():
+    """Parse command-line options and run extract_from_tmx on the given TMX file."""
+
+    arg_parser = argparse.ArgumentParser(usage=f"Usage: {sys.argv[0]} [options]")
+    arg_parser.add_argument("-i", "--input", help="input tmx file")
+    arg_parser.add_argument("-d", "--date", help="date for splitting the output")
+    arg_parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data")
+    options = arg_parser.parse_args()
+
+    # Without an input file there is nothing to do: show the help text and fail.
+    if options.input is None:
+        arg_parser.print_help()
+        sys.exit(1)
+
+    def parse_day(text):
+        """Turn a YYYY-MM-DD string into a date object."""
+        return datetime.datetime.strptime(text, "%Y-%m-%d").date()
+
+    # The main outputs sit next to the input file.
+    src_out = f"{options.input}.src"
+    tgt_out = f"{options.input}.tgt"
+
+    # The split date is optional; its raw string also names the "after" files.
+    date = src_out_after = tgt_out_after = None
+    if options.date is not None:
+        date = parse_day(options.date)
+        src_out_after = f"{src_out}.after.{options.date}"
+        tgt_out_after = f"{tgt_out}.after.{options.date}"
+
+    # The earliest retained date is optional as well.
+    begin_date = None if options.begin_date is None else parse_day(options.begin_date)
+
+    extract_from_tmx(options.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tmx-to-plain.py b/tmx-to-plain.py
deleted file mode 100644
index 07cac6f..0000000
--- a/tmx-to-plain.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import argparse
-import datetime
-import sys
-
-from translate.storage.tmx import tmxfile
-
-
-def extract_from_tmx(tmx_file_path,
-                     src_out_path,
-                     tgt_out_path,
-                     begin_date,
-                     date,
-                     src_out_after,
-                     tgt_out_after):
-    with open(tmx_file_path, 'rb') as in_fp:
-        tmx_file = tmxfile(in_fp)
-    
-    if src_out_after is not None and tgt_out_after is not None:
-        src_out_after_fp = open(src_out_after, "w")
-        tgt_out_after_fp = open(tgt_out_after, "w")
-    
-        
-    with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp:
-        for index, node in enumerate(tmx_file.unit_iter()):
-            src_out_fp_ = src_out_fp
-            tgt_out_fp_ = tgt_out_fp
-            
-            if begin_date is not None:
-                date_string = node.get_target_dom().get('lastusagedate')[:8]
-                date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date()
-                if date_obj < begin_date:
-                    continue
-            
-            if date is not None:
-                date_string = node.get_target_dom().get('changedate')[:8]
-                date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date()
-                if date_obj > date:
-                    src_out_fp_ = src_out_after_fp
-                    tgt_out_fp_ = tgt_out_after_fp
-        
-            src_string = f"{node.source}"
-            tgt_string = f"{node.target}"
-            src_string = src_string.replace('\n', ' ').replace('\r', '')
-            tgt_string = tgt_string.replace('\n', ' ').replace('\r', '')
-        
-            src_out_fp_.write(f"{src_string}\n")
-            tgt_out_fp_.write(f"{tgt_string}\n")
-            if (index + 1) % 1000 == 0:
-                sys.stdout.write(f"Processed {index + 1} lines\r")
-                sys.stdout.flush()
-
-    if src_out_after is not None and tgt_out_after is not None:
-        src_out_after_fp.close()
-        tgt_out_after_fp.close()
-
-
-def main():
-
-    usage = "Usage: python tmx_to_plain.py [options]"
-    parser = argparse.ArgumentParser(usage=usage)
-    parser.add_argument("-i", "--input", help="input tmx file")
-    parser.add_argument("-d", "--date", help="date for splitting the output")
-    parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data")
-
-    args = parser.parse_args()
-
-    if args.input is None:
-        parser.print_help()
-        sys.exit(1)
-    
-    args.input
-    
-    src_out = args.input + ".src"
-    tgt_out = args.input + ".tgt"
-    
-    
-    if args.date is not None:
-        date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date()
-        src_out_after = src_out + ".after." + args.date
-        tgt_out_after = tgt_out + ".after." + args.date
-    else:
-        date = None
-        src_out_after = None
-        tgt_out_after = None
-        
-    if args.begin_date is not None:
-        begin_date = datetime.datetime.strptime(args.begin_date, '%Y-%m-%d').date()
-    else:
-        begin_date = None
-        
-    extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after)    
-    
-
-if __name__ == '__main__':
-    main()
diff --git a/to-ascii b/to-ascii
index 10fd1c2..7e2a842 100755
--- a/to-ascii
+++ b/to-ascii
@@ -4,9 +4,8 @@ while line = STDIN.gets
   encoding_options = {
     :invalid           => :replace,
     :undef             => :replace,
-    :replace           => '?',
+    :replace           => "?",
     :universal_newline => true
   }
-  puts line.encode 'ASCII', encoding_options
+  puts line.encode "ASCII", encoding_options
 end
-
diff --git a/toks b/toks
index 8bee29f..db8076f 100755
--- a/toks
+++ b/toks
@@ -1,9 +1,8 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
+STDIN.set_encoding "utf-8"
+STDOUT.set_encoding "utf-8"
 
 while line = STDIN.gets
   line.strip.split(/\s/).each { |i| puts i }
 end
-
diff --git a/toks-per-line b/toks-per-line
index 8a10cd4..9814f35 100755
--- a/toks-per-line
+++ b/toks-per-line
@@ -14,4 +14,3 @@ while line = STDIN.gets
     puts a.size
   end
 end
-
diff --git a/train-test-split b/train-test-split
index 6aa4796..db5aad4 100755
--- a/train-test-split
+++ b/train-test-split
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'optimist'
+require "zipf"
+require "optimist"
 
 conf = Optimist::options do
   opt :source, "source file", :type => :string, :required => true
@@ -13,11 +13,11 @@ conf = Optimist::options do
 end
 
 source_filename = conf[:source]
-source_extension = source_filename.split('.').last
+source_extension = source_filename.split(".").last
 source_lines = ReadFile.readlines source_filename
 
 target_filename = conf[:target]
-target_extension = target_filename.split('.').last
+target_extension = target_filename.split(".").last
 target_lines = ReadFile.readlines target_filename
 
 size = conf[:size]
diff --git a/tsv-exclude b/tsv-exclude
index e951ea1..cee3923 100755
--- a/tsv-exclude
+++ b/tsv-exclude
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'set'
+require "zipf"
+require "set"
 
 to_exclude0 = {}
 to_exclude1 = {}
diff --git a/tsv-joint-set b/tsv-joint-set
index c0dbdcf..ce77a9e 100755
--- a/tsv-joint-set
+++ b/tsv-joint-set
@@ -1,8 +1,8 @@
 #!/usr/bin/env ruby
 
-require 'set'
-require 'zipf'
-require 'optimist'
+require "set"
+require "zipf"
+require "optimist"
 
 conf = Optimist::options do
   opt :n, "Desired number segments in test set.", :type => :int, :required => true
@@ -50,4 +50,3 @@ outputs.each_with_index { |o,i|
     f.write o[0][j] + "\t" + o[1][j] + "\n"
   }
 }
-
diff --git a/tsv-uniq b/tsv-uniq
index fde79f2..6709e8d 100755
--- a/tsv-uniq
+++ b/tsv-uniq
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'set'
+require "set"
 
 strictness = ARGV[0].to_i # 1 one-side
                           # 2 just the pair
@@ -21,14 +21,14 @@ if strictness == 1
   seen = Set.new
   segments[side].each_with_index { |segment,i|
     if not seen.include? segment
-      puts "#{segments[i][0]}\t#{segments[i][1]}"
+      puts "#{segments[0][i]}\t#{segments[1][i]}"
     end
     seen << segment
   }
 elsif strictness == 2
   seen = Set.new
   segments[0].each_index { |i|
-    segment_pair = [segments[i][0], segments[i][1]]
+    segment_pair = [segments[0][i], segments[1][i]]
     if not seen.include? segment_pair
       puts "#{segment_pair[0]}\t#{segment_pair[1]}"
     end
@@ -46,4 +46,3 @@ elsif strictness == 3
     seen_pairs << segment_pair
   }
 end
-
diff --git a/var b/var
index 8ca6082..4e88f1e 100755
--- a/var
+++ b/var
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'optimist'
+require "optimist"
 
 conf = Optimist::options do
   banner "stddev [-r <d>] < <one number per line>"
@@ -32,4 +32,3 @@ if conf[:round] >= 0
 else
   puts var
 end
-
diff --git a/vocab b/vocab
index e6bdcd9..b2a2de9 100755
--- a/vocab
+++ b/vocab
@@ -1,4 +1,3 @@
 #!/bin/sh
 
 $(dirname $0)/toks ${1+"$@"} | sort | uniq -c
-
diff --git a/vocab-2 b/vocab-2
new file mode 100755
index 0000000..1004faf
--- /dev/null
+++ b/vocab-2
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+
+require "zipf"
+
+# Print the number of distinct whitespace-separated tokens read from STDIN.
+vocabulary = {}
+STDIN.each_line do |line|
+  # Only the keys matter; the hash serves as a set of seen tokens.
+  line.strip.split.each { |token| vocabulary[token] = true }
+end
+
+puts vocabulary.size
diff --git a/vocab2 b/vocab2
deleted file mode 100755
index 1991357..0000000
--- a/vocab2
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-d = {}
-while line = STDIN.gets
-  line.strip.split.each { |tok|
-    d[tok] = true
-  }
-end
-
-puts d.size
-
diff --git a/zh-ko-or-ja b/zh-ko-or-ja
index 0b42386..e049704 100755
--- a/zh-ko-or-ja
+++ b/zh-ko-or-ja
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
-require 'zipf'
-require 'script_detector'
+require "zipf"
+require "script_detector"
 
 $to_code = {}
 $to_code["Ambiguous Chinese"] = "??"
@@ -15,4 +15,3 @@ while line = STDIN.gets
   code = $to_code[line.identify_script]
   puts code
 end
-
-- 
cgit v1.2.3