From e0b634754d1bef33dc8e72509c6990cccc32745a Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Thu, 9 Oct 2014 20:47:23 +0100
Subject: May makes everything new
---
README.md | 14 +-
add_seg | 2 +-
avg | 1 -
avg_weights | 1 -
cdec_hg_to_json | 80 +++++++++
collapse_tags.rb | 40 -----
dot | 9 +
first_lower | 11 ++
firstlower | 12 --
gigaword_collapse_tags | 39 +++++
hg2json.py | 81 ---------
kbest_bleu_oracles | 2 -
key_count | 14 ++
keycount | 14 --
kmeans | 2 -
lin_reg | 2 -
log_reg | 2 -
max | 1 -
median | 1 -
mem_usage | 11 ++
memusg | 12 --
merge_files | 1 -
merge_ttable | 2 -
min | 1 -
min_max | 1 -
moses_1best | 1 -
mult | 1 -
no_empty | 1 -
num_tok | 1 -
odd | 1 -
parse-stanford.sh | 13 --
paste_pairs | 1 -
per_sentence_bleu | 2 -
per_sentence_bleu_kbest | 2 -
per_sentence_ter | 2 -
pot | 1 -
round | 1 -
ruby_eval | 1 -
rule_shapes | 1 -
shard | 1 -
split_pipes | 23 +++
splitpipes | 24 ---
stanford_parser_run | 13 ++
stddev | 1 -
sum | 1 -
tc | 1 -
test/cdec_hg_to_json/cdec.ini | 5 +
test/cdec_hg_to_json/grammar.gz | Bin 0 -> 1399915 bytes
test/cdec_hg_to_json/hg.json.gz | Bin 0 -> 318029 bytes
test/cdec_hg_to_json/hg.meta | 7 +
test/cdec_hg_to_json/in | 1 +
test/cdec_hg_to_json/toy.cdec.ini | 2 +
test/cdec_hg_to_json/toy.grammar | 12 ++
test/cdec_hg_to_json/toy.in | 1 +
test/cdec_hg_to_json/toy.weights | 3 +
test/cdec_hg_to_json/weights | 17 ++
test/hg2json/cdec.ini | 5 -
test/hg2json/grammar.gz | Bin 1399915 -> 0 bytes
test/hg2json/hg.json.gz | Bin 318029 -> 0 bytes
test/hg2json/hg.meta | 7 -
test/hg2json/in | 1 -
test/hg2json/toy.cdec.ini | 2 -
test/hg2json/toy.grammar | 12 --
test/hg2json/toy.in | 1 -
test/hg2json/toy.weights | 3 -
test/hg2json/weights | 17 --
tf-idf | 2 -
to_ascii | 1 -
tokenizer-no-escape.perl | 348 ++++++++++++++++++++++++++++++++++++++
tokenizer.no-escape.perl | 348 --------------------------------------
toks | 1 -
train_test_split | 50 ++++++
traintestsplit | 51 ------
var | 1 -
74 files changed, 653 insertions(+), 694 deletions(-)
create mode 100755 cdec_hg_to_json
delete mode 100755 collapse_tags.rb
create mode 100755 dot
create mode 100755 first_lower
delete mode 100755 firstlower
create mode 100755 gigaword_collapse_tags
delete mode 100755 hg2json.py
create mode 100755 key_count
delete mode 100755 keycount
create mode 100755 mem_usage
delete mode 100755 memusg
delete mode 100755 parse-stanford.sh
create mode 100755 split_pipes
delete mode 100755 splitpipes
create mode 100755 stanford_parser_run
create mode 100644 test/cdec_hg_to_json/cdec.ini
create mode 100644 test/cdec_hg_to_json/grammar.gz
create mode 100644 test/cdec_hg_to_json/hg.json.gz
create mode 100644 test/cdec_hg_to_json/hg.meta
create mode 100644 test/cdec_hg_to_json/in
create mode 100644 test/cdec_hg_to_json/toy.cdec.ini
create mode 100644 test/cdec_hg_to_json/toy.grammar
create mode 100644 test/cdec_hg_to_json/toy.in
create mode 100644 test/cdec_hg_to_json/toy.weights
create mode 100644 test/cdec_hg_to_json/weights
delete mode 100644 test/hg2json/cdec.ini
delete mode 100644 test/hg2json/grammar.gz
delete mode 100644 test/hg2json/hg.json.gz
delete mode 100644 test/hg2json/hg.meta
delete mode 100644 test/hg2json/in
delete mode 100644 test/hg2json/toy.cdec.ini
delete mode 100644 test/hg2json/toy.grammar
delete mode 100644 test/hg2json/toy.in
delete mode 100644 test/hg2json/toy.weights
delete mode 100644 test/hg2json/weights
create mode 100755 tokenizer-no-escape.perl
delete mode 100755 tokenizer.no-escape.perl
create mode 100755 train_test_split
delete mode 100755 traintestsplit
diff --git a/README.md b/README.md
index 3a6b1b7..fd42922 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,11 @@
-scripts
-=======
+a number of NLP related scripts. Some scripts require my zipf gem, see [1]
 
-A number of NLP related scripts.
-Some scripts require my zipf gem,
-see https://github.com/pks/zipf
+\*.perl taken from the moses [2] toolkit
 
-compound-splitter.perl and tokenizer.no-escape.perl
-taken from the moses [1] toolkit.
+mem\_usage taken from [3]
 
-[1] https://github.com/moses-smt/mosesdecoder
+[1] https://github.com/pks/zipf
+[2] https://github.com/moses-smt/mosesdecoder
+[3] https://gist.github.com/netj/526585
 
 
diff --git a/add_seg b/add_seg
index e4fe22d..7a4ca7a 100755
--- a/add_seg
+++ b/add_seg
@@ -24,8 +24,8 @@ while line = STDIN.gets
s = " 0
- puts s + " id=\"#{index[j]}\"> #{line.strip} "
if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{index[j]}#{ext}\"" end
+ puts s + " id=\"#{index[j]}\"> #{line.strip} "
else
if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end
puts s + " id=\"#{i}\"> #{line.strip} "
diff --git a/avg b/avg
index ed31465..07e3de9 100755
--- a/avg
+++ b/avg
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "avg < "
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/avg_weights b/avg_weights
index 1f9053f..2e23440 100755
--- a/avg_weights
+++ b/avg_weights
@@ -4,7 +4,6 @@ require 'zipf'
require 'trollop'
require 'zlib'
-
cfg = Trollop::options do
opt :weights_files, "a number of weights files: name value", :required => true
opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
diff --git a/cdec_hg_to_json b/cdec_hg_to_json
new file mode 100755
index 0000000..5a26cf7
--- /dev/null
+++ b/cdec_hg_to_json
@@ -0,0 +1,80 @@
+#!/usr/bin/env python2
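+# reads one sentence from stdin, decodes it with cdec, and writes a JSON
+# representation of the resulting hypergraph to stdout (stats go to stderr)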
+
+import cdec
+import sys, argparse
+
+def hg2json(hg, weights):
+ """
+  output a JSON representation of a cdec hypergraph
+ (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format )
+ """
+ res = ''
+ res += "{\n"
+ res += '"weights":{'+"\n"
+ a = []
+ for i in weights:
+ a.append( '"%s":%s'%(i[0], i[1]) )
+ res += ", ".join(a)+"\n"
+ res += "},\n"
+ res += '"nodes":'+"\n"
+ res += "[\n"
+ a = []
+ a.append( '{ "label":"root", "cat":"root" }' )
+ for i in hg.nodes:
+ a.append( '{ "label":"%s", "cat":"%s" }'%(i.id, i.cat) )
+ res += ",\n".join(a)+"\n"
+ res += "],\n"
+ res += '"edges":'+"\n"
+ res += "[\n"
+ a = []
+ for i in hg.edges:
+ s = "{"
+ s += '"head":"%s"'%(i.head_node.id)
+ xs = ' "f":{'
+ b = []
+ for j in i.feature_values:
+ b.append( '"%s":%s'%(j[0], j[1]) )
+ xs += ", ".join(b)
+ xs += "},"
+ c = []
+ for j in i.tail_nodes:
+ c.append( '"'+str(j.id)+'"' )
+ if len(c) > 0:
+ s += ', "tails":[ %s ],'%(",".join(c))
+ else:
+ s += ', "tails":[ "root" ],'
+ s += xs
+ s += ' "weight":%s }'%(i.prob)
+ a.append(s)
+ res += ",\n".join(a)+"\n"
+ res += "]\n"
+ res += "}\n"
+ return res
+
+def main():
+ parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
+ parser.add_argument('-c', '--config', required=True, help='decoder configuration')
+ parser.add_argument('-w', '--weights', required=True, help='feature weights')
+ args = parser.parse_args()
+ with open(args.config) as config:
+ config = config.read()
+ decoder = cdec.Decoder(config)
+ decoder.read_weights(args.weights)
+ ins = sys.stdin.readline().strip()
+ hg = decoder.translate(ins)
+
+ sys.stderr.write( "input:\n '%s'\n"%(ins) )
+ sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
+ num_nodes = 0
+ for i in hg.nodes: num_nodes+=1
+ sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
+ num_edges = 0
+ for i in hg.edges: num_edges+=1
+ sys.stderr.write( "# edges = %s\n"%(num_edges) )
+ sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
+
+ print hg2json(hg, decoder.weights)
+
+if __name__=="__main__":
+ main()
+
diff --git a/collapse_tags.rb b/collapse_tags.rb
deleted file mode 100755
index 75fcaf5..0000000
--- a/collapse_tags.rb
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env ruby
-
-# works with gigaword en v5
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-
-in_p = false
-in_dateline = false
-collect = []
-
-while line = STDIN.gets
- line.strip!
-  if line.downcase == "<dateline>"
- in_dateline = true
- next
-  elsif line.downcase == "</dateline>"
- in_dateline = false
- next
- elsif in_dateline
- next
-  elsif line.downcase == "<p>" and not in_p
- in_p = true
- collect = []
- next
- elsif line.downcase == "
" and in_p
- if collect.size > 0
- puts collect.join(" ").strip
- end
- in_p = false
- next
- elsif in_p
- collect.push line
- next
- else
- puts line
- end
-end
-
diff --git a/dot b/dot
new file mode 100755
index 0000000..da0dc58
--- /dev/null
+++ b/dot
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
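+# reads two sparse vectors from the files 'w' and 'f' ("key value" pairs,
+# space-separated) and prints the first vector and the dot product a.b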
+a = SparseVector.from_file 'w', ' '
+b = SparseVector.from_file 'f', ' '
+puts a.to_s
+puts a.dot b
+
diff --git a/first_lower b/first_lower
new file mode 100755
index 0000000..1cddb8e
--- /dev/null
+++ b/first_lower
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
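+# pass through only those input lines whose first character is lowercase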
+while line = STDIN.gets
+ line.strip!
+ if line && line!='' && line[0].downcase?
+ puts line
+ end
+end
+
diff --git a/firstlower b/firstlower
deleted file mode 100755
index 682a9b7..0000000
--- a/firstlower
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-
-while line = STDIN.gets
- line.strip!
- if line && line!='' && line[0].downcase?
- puts line
- end
-end
-
diff --git a/gigaword_collapse_tags b/gigaword_collapse_tags
new file mode 100755
index 0000000..cbaf7d7
--- /dev/null
+++ b/gigaword_collapse_tags
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+
+# works with gigaword en v5
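+# drops <DATELINE> blocks and joins the lines of each <P>...</P>
+# paragraph into a single line; all other lines pass through unchanged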
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+in_p = false
+in_dateline = false
+collect = []
+
+while line = STDIN.gets
+ line.strip!
+  if line.downcase == "<dateline>"
+ in_dateline = true
+ next
+  elsif line.downcase == "</dateline>"
+ in_dateline = false
+ next
+ elsif in_dateline
+ next
+  elsif line.downcase == "<p>" and not in_p
+ in_p = true
+ collect = []
+ next
+ elsif line.downcase == "
" and in_p
+ if collect.size > 0
+ puts collect.join(" ").strip
+ end
+ in_p = false
+ next
+ elsif in_p
+ collect.push line
+ next
+ else
+ puts line
+ end
+end
+
diff --git a/hg2json.py b/hg2json.py
deleted file mode 100755
index 5bd5c2c..0000000
--- a/hg2json.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env python2
-
-import cdec
-import sys, argparse
-
-def hg2json(hg, weights):
- """
-  output a JSON representation of a cdec hypergraph
- (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format )
- """
- res = ''
- res += "{\n"
- res += '"weights":{'+"\n"
- a = []
- for i in weights:
- a.append( '"%s":%s'%(i[0], i[1]) )
- res += ", ".join(a)+"\n"
- res += "},\n"
- res += '"nodes":'+"\n"
- res += "[\n"
- a = []
- a.append( '{ "label":"root", "cat":"root" }' )
- for i in hg.nodes:
- a.append( '{ "label":"%s", "cat":"%s" }'%(i.id, i.cat) )
- res += ",\n".join(a)+"\n"
- res += "],\n"
- res += '"edges":'+"\n"
- res += "[\n"
- a = []
- for i in hg.edges:
- s = "{"
- s += '"head":"%s"'%(i.head_node.id)
- xs = ' "f":{'
- b = []
- for j in i.feature_values:
- b.append( '"%s":%s'%(j[0], j[1]) )
- xs += ", ".join(b)
- xs += "},"
- c = []
- for j in i.tail_nodes:
- c.append( '"'+str(j.id)+'"' )
- if len(c) > 0:
- s += ', "tails":[ %s ],'%(",".join(c))
- else:
- s += ', "tails":[ "root" ],'
- s += xs
- s += ' "weight":%s }'%(i.prob)
- a.append(s)
- res += ",\n".join(a)+"\n"
- res += "]\n"
- res += "}\n"
- return res
-
-def main():
- parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
- parser.add_argument('-c', '--config', required=True, help='decoder configuration')
- parser.add_argument('-w', '--weights', required=True, help='feature weights')
- args = parser.parse_args()
- with open(args.config) as config:
- config = config.read()
- decoder = cdec.Decoder(config)
- decoder.read_weights(args.weights)
- ins = sys.stdin.readline().strip()
- hg = decoder.translate(ins)
-
- sys.stderr.write( "input:\n '%s'\n"%(ins) )
- sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
- num_nodes = 0
- for i in hg.nodes: num_nodes+=1
- sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
- num_edges = 0
- for i in hg.edges: num_edges+=1
- sys.stderr.write( "# edges = %s\n"%(num_edges) )
- sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
-
- print hg2json(hg, decoder.weights)
-
-
-if __name__=="__main__":
- main()
-
diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles
index 2ac344b..7db1c7e 100755
--- a/kbest_bleu_oracles
+++ b/kbest_bleu_oracles
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def get_context kbest_lists, references, n
a = []
kbest_lists.each_index { |i|
@@ -48,6 +47,5 @@ def main
}
end
-
main
diff --git a/key_count b/key_count
new file mode 100755
index 0000000..deaa522
--- /dev/null
+++ b/key_count
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
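+# count how often each distinct input line occurs and print "<line> <count>" pairs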
+h = {}
+h.default = 0
+while line = STDIN.gets
+ line.strip!
+ h[line] += 1
+end
+
+h.each_pair { |k,v| puts "#{k} #{v}" }
+
diff --git a/keycount b/keycount
deleted file mode 100755
index deaa522..0000000
--- a/keycount
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env ruby
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-h = {}
-h.default = 0
-while line = STDIN.gets
- line.strip!
- h[line] += 1
-end
-
-h.each_pair { |k,v| puts "#{k} #{v}" }
-
diff --git a/kmeans b/kmeans
index ec28897..201864b 100755
--- a/kmeans
+++ b/kmeans
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
@@ -114,6 +113,5 @@ def main
end
end
-
main
diff --git a/lin_reg b/lin_reg
index 168e7df..4a7c3b2 100755
--- a/lin_reg
+++ b/lin_reg
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def read_data fn, scale
f = ReadFile.new fn
data = []
@@ -67,6 +66,5 @@ def main
puts model.to_s
end
-
main
diff --git a/log_reg b/log_reg
index e6f47eb..3916d0c 100755
--- a/log_reg
+++ b/log_reg
@@ -4,7 +4,6 @@ require 'zipf'
require 'matrix'
require 'trollop'
-
def read_data fn
f = ReadFile.new fn
data = []
@@ -68,6 +67,5 @@ def main
puts model.to_s
end
-
main
diff --git a/max b/max
index 87f3c73..b2c1cae 100755
--- a/max
+++ b/max
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
max = -1.0/0
while line = STDIN.gets
v = line.to_f
diff --git a/median b/median
index 9499c95..0b1950b 100755
--- a/median
+++ b/median
@@ -2,7 +2,6 @@
require 'zipf'
-
a = []
while line = STDIN.gets
a << line.to_f
diff --git a/mem_usage b/mem_usage
new file mode 100755
index 0000000..5c2104f
--- /dev/null
+++ b/mem_usage
@@ -0,0 +1,11 @@
+#!/bin/bash
+
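+# usage: mem_usage <command> [arg ...]
+# runs the command in the background, samples its resident set size (KB)
+# via ps once per second, and prints the peak in megabytes when it exits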
+"$@" &
+pid=$! peak=0
+while true; do
+ sleep 1
+ sample="$(ps -o rss= $pid 2> /dev/null)" || break
+ let peak='sample > peak ? sample : peak'
+done
+echo "$(( ${peak%% *} / 1024)) m"
+
diff --git a/memusg b/memusg
deleted file mode 100755
index a69daaa..0000000
--- a/memusg
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-
-"$@" &
-pid=$! peak=0
-while true; do
- sleep 1
- sample="$(ps -o rss= $pid 2> /dev/null)" || break
- let peak='sample > peak ? sample : peak'
-done
-echo "$(( ${peak%% *} / 1024)) m"
-
diff --git a/merge_files b/merge_files
index 0b4941e..714b57d 100755
--- a/merge_files
+++ b/merge_files
@@ -2,7 +2,6 @@
require 'zipf'
-
def usage
  STDERR.write "merge_files <file>+\n"
exit 1
diff --git a/merge_ttable b/merge_ttable
index 20d86d3..e4621f5 100755
--- a/merge_ttable
+++ b/merge_ttable
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :f, "f files", :type => :string, :required => true
@@ -31,6 +30,5 @@ def main
}
end
-
main
diff --git a/min b/min
index 398b0fb..f8a7e42 100755
--- a/min
+++ b/min
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
min = 1.0/0
while line = STDIN.gets
v = line.to_f
diff --git a/min_max b/min_max
index 17dc566..b79a743 100755
--- a/min_max
+++ b/min_max
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
cfg = Trollop::options do
opt :min, "minimum #tokens", :type => :int, :default => 1
opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
diff --git a/moses_1best b/moses_1best
index 849ebf1..fd35cf8 100755
--- a/moses_1best
+++ b/moses_1best
@@ -2,7 +2,6 @@
require 'zipf'
-
prev_idx = nil
while line = STDIN.gets
line.strip!
diff --git a/mult b/mult
index 2ef0149..478ec5e 100755
--- a/mult
+++ b/mult
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
factor = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f * factor
diff --git a/no_empty b/no_empty
index 96c9ce4..da57e23 100755
--- a/no_empty
+++ b/no_empty
@@ -2,7 +2,6 @@
require 'zipf'
-
files = []
(0..1).each { |i| files << ReadFile.new(ARGV[i]) }
(2..3).each { |i| files << WriteFile.new(ARGV[i]) }
diff --git a/num_tok b/num_tok
index 53b99a0..56cbae9 100755
--- a/num_tok
+++ b/num_tok
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
while line = STDIN.gets
puts line.strip.split.length
end
diff --git a/odd b/odd
index 93aaa80..0bd9336 100755
--- a/odd
+++ b/odd
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
i = 1
while line = STDIN.gets
puts line if i%2!=0
diff --git a/parse-stanford.sh b/parse-stanford.sh
deleted file mode 100755
index f8d4210..0000000
--- a/parse-stanford.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-if [ $# != 1 ]; then
- echo "$0 text-file"
- exit 1
-fi
-
-export CLASSPATH=:/toolbox/stanfordparser_3_2_0/*
-
-IN=$1
-
-cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp
-
diff --git a/paste_pairs b/paste_pairs
index 07c1f22..f6b8b31 100755
--- a/paste_pairs
+++ b/paste_pairs
@@ -3,7 +3,6 @@
import sys
from itertools import izip
-
for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
print linenr, (src_line.strip())
print linenr, (tgt_line.strip())
diff --git a/per_sentence_bleu b/per_sentence_bleu
index 76fcf38..5bacd1a 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -26,6 +25,5 @@ def main
input.close
end
-
main
diff --git a/per_sentence_bleu_kbest b/per_sentence_bleu_kbest
index 4d821b3..e6a31cb 100755
--- a/per_sentence_bleu_kbest
+++ b/per_sentence_bleu_kbest
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :kbests, "kbests", :type => :string, :default => '-'
@@ -29,6 +28,5 @@ def main
}
end
-
main
diff --git a/per_sentence_ter b/per_sentence_ter
index 8b04be5..343708e 100755
--- a/per_sentence_ter
+++ b/per_sentence_ter
@@ -4,7 +4,6 @@ require 'zipf'
require 'trollop'
require 'tempfile'
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -30,6 +29,5 @@ def main
input.close
end
-
main
diff --git a/pot b/pot
index ec199ea..24acabe 100755
--- a/pot
+++ b/pot
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
pow = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f**pow
diff --git a/round b/round
index 3dfbb6f..dfef800 100755
--- a/round
+++ b/round
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
r = ARGV[0].to_i
while line = STDIN.gets
puts line.to_f.round r
diff --git a/ruby_eval b/ruby_eval
index 96b2ecb..fe0d181 100755
--- a/ruby_eval
+++ b/ruby_eval
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
while line = STDIN.gets
puts "#{eval line}"
end
diff --git a/rule_shapes b/rule_shapes
index fd42249..589a670 100755
--- a/rule_shapes
+++ b/rule_shapes
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
def shape s
res = []
in_t = false
diff --git a/shard b/shard
index f952104..6155123 100755
--- a/shard
+++ b/shard
@@ -2,7 +2,6 @@
require 'trollop'
-
def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
lc = `wc -l #{input}`.split.first.to_i
input_ext = input.split('.').last
diff --git a/split_pipes b/split_pipes
new file mode 100755
index 0000000..eeba69b
--- /dev/null
+++ b/split_pipes
@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
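+# print field -f (1-based) of each line, with fields separated by ' ||| '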
+cfg = Trollop::options do
+ banner "splitpipes -f < "
+ opt :field, "field", :type => :int
+end
+
+while line = STDIN.gets
+ j = 1
+ line.strip.split(' ||| ').each { |i|
+ if j == cfg[:field]
+ puts i.strip
+ break
+ end
+ j += 1
+ }
+end
+
diff --git a/splitpipes b/splitpipes
deleted file mode 100755
index 35ee176..0000000
--- a/splitpipes
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'trollop'
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-
-cfg = Trollop::options do
- banner "splitpipes -f < "
- opt :field, "field", :type => :int
-end
-
-while line = STDIN.gets
- j = 1
- line.strip.split(' ||| ').each { |i|
- if j == cfg[:field]
- puts i.strip
- break
- end
- j += 1
- }
-end
-
diff --git a/stanford_parser_run b/stanford_parser_run
new file mode 100755
index 0000000..f8d4210
--- /dev/null
+++ b/stanford_parser_run
@@ -0,0 +1,13 @@
+#!/bin/bash
+
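+# usage: stanford_parser_run <text-file>
+# parses tokenized, one-sentence-per-line input with the Stanford parser and
+# writes basic typed dependencies, one sentence per line, to <text-file>.stp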
+if [ $# != 1 ]; then
+ echo "$0 text-file"
+ exit 1
+fi
+
+export CLASSPATH=:/toolbox/stanfordparser_3_2_0/*
+
+IN=$1
+
+cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp
+
diff --git a/stddev b/stddev
index 5cda0e0..a7397b2 100755
--- a/stddev
+++ b/stddev
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "stddev [-r ] < "
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/sum b/sum
index dac72d3..acfa563 100755
--- a/sum
+++ b/sum
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
sum = 0.0
while line = STDIN.gets
sum += line.to_f
diff --git a/tc b/tc
index 993086a..7eefdd5 100755
--- a/tc
+++ b/tc
@@ -2,7 +2,6 @@
require 'zipf'
-
while line = STDIN.gets
puts tokenize(line.strip).size
end
diff --git a/test/cdec_hg_to_json/cdec.ini b/test/cdec_hg_to_json/cdec.ini
new file mode 100644
index 0000000..1ad25b5
--- /dev/null
+++ b/test/cdec_hg_to_json/cdec.ini
@@ -0,0 +1,5 @@
+formalism=scfg
+grammar=test/cdec_hg_to_json/grammar.gz
+add_pass_through_rules=true
+feature_function=WordPenalty
+intersection_strategy=full
diff --git a/test/cdec_hg_to_json/grammar.gz b/test/cdec_hg_to_json/grammar.gz
new file mode 100644
index 0000000..78dda98
Binary files /dev/null and b/test/cdec_hg_to_json/grammar.gz differ
diff --git a/test/cdec_hg_to_json/hg.json.gz b/test/cdec_hg_to_json/hg.json.gz
new file mode 100644
index 0000000..ed178c6
Binary files /dev/null and b/test/cdec_hg_to_json/hg.json.gz differ
diff --git a/test/cdec_hg_to_json/hg.meta b/test/cdec_hg_to_json/hg.meta
new file mode 100644
index 0000000..d33a54c
--- /dev/null
+++ b/test/cdec_hg_to_json/hg.meta
@@ -0,0 +1,7 @@
+input:
+ 'in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .'
+viterbi translation:
+ 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .'
+# nodes = 220
+# edges = 16640
+viterbi score = 228.95
diff --git a/test/cdec_hg_to_json/in b/test/cdec_hg_to_json/in
new file mode 100644
index 0000000..7dc411d
--- /dev/null
+++ b/test/cdec_hg_to_json/in
@@ -0,0 +1 @@
+in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .
diff --git a/test/cdec_hg_to_json/toy.cdec.ini b/test/cdec_hg_to_json/toy.cdec.ini
new file mode 100644
index 0000000..d4a2896
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.cdec.ini
@@ -0,0 +1,2 @@
+formalism=scfg
+grammar=test/cdec_hg_to_json/toy.grammar
diff --git a/test/cdec_hg_to_json/toy.grammar b/test/cdec_hg_to_json/toy.grammar
new file mode 100644
index 0000000..382c94f
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
+[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
+[V] ||| fand ||| found ||| logp=0
diff --git a/test/cdec_hg_to_json/toy.in b/test/cdec_hg_to_json/toy.in
new file mode 100644
index 0000000..e6df927
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/test/cdec_hg_to_json/toy.weights b/test/cdec_hg_to_json/toy.weights
new file mode 100644
index 0000000..70075b7
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.weights
@@ -0,0 +1,3 @@
+logp 2
+use_house 0
+use_shell 1
diff --git a/test/cdec_hg_to_json/weights b/test/cdec_hg_to_json/weights
new file mode 100644
index 0000000..7f96f1d
--- /dev/null
+++ b/test/cdec_hg_to_json/weights
@@ -0,0 +1,17 @@
+PhraseModel_0 1.0
+PhraseModel_1 1.0
+PhraseModel_2 1.0
+PhraseModel_3 1.0
+PhraseModel_4 1.0
+PhraseModel_5 1.0
+PhraseModel_6 1.0
+PassThrough -1.0
+PassThrough_1 -1.0
+PassThrough_2 -1.0
+PassThrough_3 -1.0
+PassThrough_4 -1.0
+PassThrough_5 -1.0
+PassThrough_6 -1.0
+Glue 0.1
+LanguageModel 10.0
+LanguageModel_OOV -10
diff --git a/test/hg2json/cdec.ini b/test/hg2json/cdec.ini
deleted file mode 100644
index 1ad25b5..0000000
--- a/test/hg2json/cdec.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-formalism=scfg
-grammar=test/hg2json/grammar.gz
-add_pass_through_rules=true
-feature_function=WordPenalty
-intersection_strategy=full
diff --git a/test/hg2json/grammar.gz b/test/hg2json/grammar.gz
deleted file mode 100644
index 78dda98..0000000
Binary files a/test/hg2json/grammar.gz and /dev/null differ
diff --git a/test/hg2json/hg.json.gz b/test/hg2json/hg.json.gz
deleted file mode 100644
index ed178c6..0000000
Binary files a/test/hg2json/hg.json.gz and /dev/null differ
diff --git a/test/hg2json/hg.meta b/test/hg2json/hg.meta
deleted file mode 100644
index d33a54c..0000000
--- a/test/hg2json/hg.meta
+++ /dev/null
@@ -1,7 +0,0 @@
-input:
- 'in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .'
-viterbi translation:
- 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .'
-# nodes = 220
-# edges = 16640
-viterbi score = 228.95
diff --git a/test/hg2json/in b/test/hg2json/in
deleted file mode 100644
index 7dc411d..0000000
--- a/test/hg2json/in
+++ /dev/null
@@ -1 +0,0 @@
-in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .
diff --git a/test/hg2json/toy.cdec.ini b/test/hg2json/toy.cdec.ini
deleted file mode 100644
index d4a2896..0000000
--- a/test/hg2json/toy.cdec.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-formalism=scfg
-grammar=test/hg2json/toy.grammar
diff --git a/test/hg2json/toy.grammar b/test/hg2json/toy.grammar
deleted file mode 100644
index 382c94f..0000000
--- a/test/hg2json/toy.grammar
+++ /dev/null
@@ -1,12 +0,0 @@
-[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
-[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
-[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
-[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
-[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
-[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
-[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
-[JJ] ||| grosses ||| big ||| logp=0
-[JJ] ||| grosses ||| large ||| logp=0
-[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
-[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
-[V] ||| fand ||| found ||| logp=0
diff --git a/test/hg2json/toy.in b/test/hg2json/toy.in
deleted file mode 100644
index e6df927..0000000
--- a/test/hg2json/toy.in
+++ /dev/null
@@ -1 +0,0 @@
-ich sah ein kleines haus
diff --git a/test/hg2json/toy.weights b/test/hg2json/toy.weights
deleted file mode 100644
index 70075b7..0000000
--- a/test/hg2json/toy.weights
+++ /dev/null
@@ -1,3 +0,0 @@
-logp 2
-use_house 0
-use_shell 1
diff --git a/test/hg2json/weights b/test/hg2json/weights
deleted file mode 100644
index 7f96f1d..0000000
--- a/test/hg2json/weights
+++ /dev/null
@@ -1,17 +0,0 @@
-PhraseModel_0 1.0
-PhraseModel_1 1.0
-PhraseModel_2 1.0
-PhraseModel_3 1.0
-PhraseModel_4 1.0
-PhraseModel_5 1.0
-PhraseModel_6 1.0
-PassThrough -1.0
-PassThrough_1 -1.0
-PassThrough_2 -1.0
-PassThrough_3 -1.0
-PassThrough_4 -1.0
-PassThrough_5 -1.0
-PassThrough_6 -1.0
-Glue 0.1
-LanguageModel 10.0
-LanguageModel_OOV -10
diff --git a/tf-idf b/tf-idf
index fc6c2ec..450de6b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :documents, "input files (documents)", :type => :string, :required => true
@@ -48,6 +47,5 @@ def main
docs.each { |i| puts i.to_s }
end
-
main
diff --git a/to_ascii b/to_ascii
index 6c1d23e..10fd1c2 100755
--- a/to_ascii
+++ b/to_ascii
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
while line = STDIN.gets
encoding_options = {
:invalid => :replace,
diff --git a/tokenizer-no-escape.perl b/tokenizer-no-escape.perl
new file mode 100755
index 0000000..4397360
--- /dev/null
+++ b/tokenizer-no-escape.perl
@@ -0,0 +1,348 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+#use Thread;
+
+my $mydir = "$RealBin/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+
+while (@ARGV)
+{
+ $_ = shift;
+ /^-b$/ && ($| = 1, next);
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+ /^-x$/ && ($SKIP_XML = 1, next);
+ /^-a$/ && ($AGGRESSIVE = 1, next);
+ /^-time$/ && ($TIMING = 1, next);
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+if ($HELP)
+{
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+ print "Options:\n";
+ print " -q ... quiet.\n";
+ print " -a ... aggressive hyphen splitting.\n";
+ print " -b ... disable Perl buffering.\n";
+ print " -time ... enable processing time calculation.\n";
+ exit;
+}
+
+if (!$QUIET)
+{
+ print STDERR "Tokenizer Version 1.1\n";
+ print STDERR "Language: $language\n";
+ print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+  while(<STDIN>)
+ {
+ $count_sentences = $count_sentences + 1;
+ push(@batch_sentences, $_);
+ if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ # reset for the new run
+ @thread_list = ();
+ @batch_sentences = ();
+ }
+ }
+ # the last batch
+ if (scalar(@batch_sentences)>0)
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ if ($start_index >= scalar(@batch_sentences))
+ {
+ last;
+ }
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ if ($end_index >= scalar(@batch_sentences))
+ {
+ $end_index = scalar(@batch_sentences)-1;
+ }
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ }
+}
+else
+{# single thread only
+  while(<STDIN>)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ print $_;
+ }
+ else
+ {
+ print &tokenize($_);
+ }
+ }
+}
+
+if ($TIMING)
+{
+ my $duration = Time::HiRes::tv_interval( $start_time );
+ print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+ print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: another array containing a batch of tokenized texts for the input array
+sub tokenize_batch
+{
+ my(@text_list) = @_;
+ my(@tokenized_list) = ();
+ foreach (@text_list)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ push(@tokenized_list, $_);
+ }
+ else
+ {
+ push(@tokenized_list, &tokenize($_));
+ }
+ }
+ return \@tokenized_list;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+
+ # remove ASCII junk
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
+
+  # separate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting
+ if ($AGGRESSIVE)
+ {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
+
+ #multi-dots stay together
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./)
+ {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # seperate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn `into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en")
+ {
+ #split contractions right
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+ }
+ elsif (($language eq "fr") or ($language eq "it"))
+ {
+ #split contractions left
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ }
+ else
+ {
+ $text =~ s/\'/ \' /g;
+ }
+
+ #word token method
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++)
+ {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/)
+ {
+ my $pre = $1;
+      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+      {
+        #no change
+      }
+      elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+      {
+        #no change
+      }
+      else
+      {
+        $word = $pre." .";
+      }
+    }
+    $text .= $word." ";
+  }
+
+  # clean up extraneous spaces
+  $text =~ s/ +/ /g;
+  $text =~ s/^ //g;
+  $text =~ s/ $//g;
+
+  #restore multi-dots
+  while($text =~ /DOTDOTMULTI/)
+  {
+    $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+  }
+  $text =~ s/DOTMULTI/\./g;
+
+  #escape special chars
+  #$text =~ s/\&/\&amp;/g;   # escape escape
+  #$text =~ s/\|/\&#124;/g;  # factor separator
+  #$text =~ s/\</\&lt;/g;    # xml
+  #$text =~ s/\>/\&gt;/g;    # xml
+  #$text =~ s/\'/\&apos;/g;  # xml
+  #$text =~ s/\"/\&quot;/g;  # xml
+  #$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+  #$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+sub load_prefixes
+{
+ my ($language, $PREFIX_REF) = @_;
+
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+ #default back to English if we don't have a language-specific prefix file
+ if (!(-e $prefixfile))
+ {
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+ }
+
+ if (-e "$prefixfile")
+ {
+ open(PREFIX, "<:utf8", "$prefixfile");
+  while (<PREFIX>)
+ {
+ my $item = $_;
+ chomp($item);
+ if (($item) && (substr($item,0,1) ne "#"))
+ {
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+ {
+ $PREFIX_REF->{$1} = 2;
+ }
+ else
+ {
+ $PREFIX_REF->{$item} = 1;
+ }
+ }
+ }
+ close(PREFIX);
+ }
+}
+
diff --git a/tokenizer.no-escape.perl b/tokenizer.no-escape.perl
deleted file mode 100755
index 4397360..0000000
--- a/tokenizer.no-escape.perl
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/usr/bin/perl -w
-
-# Sample Tokenizer
-### Version 1.1
-# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
-# Version 1.1 updates:
-# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
-# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
-# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
-### Version 1.0
-# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
-# written by Josh Schroeder, based on code by Philipp Koehn
-
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
-
-use FindBin qw($RealBin);
-use strict;
-use Time::HiRes;
-#use Thread;
-
-my $mydir = "$RealBin/nonbreaking_prefixes";
-
-my %NONBREAKING_PREFIX = ();
-my $language = "en";
-my $QUIET = 0;
-my $HELP = 0;
-my $AGGRESSIVE = 0;
-my $SKIP_XML = 0;
-my $TIMING = 0;
-my $NUM_THREADS = 1;
-my $NUM_SENTENCES_PER_THREAD = 2000;
-
-while (@ARGV)
-{
- $_ = shift;
- /^-b$/ && ($| = 1, next);
- /^-l$/ && ($language = shift, next);
- /^-q$/ && ($QUIET = 1, next);
- /^-h$/ && ($HELP = 1, next);
- /^-x$/ && ($SKIP_XML = 1, next);
- /^-a$/ && ($AGGRESSIVE = 1, next);
- /^-time$/ && ($TIMING = 1, next);
- /^-threads$/ && ($NUM_THREADS = int(shift), next);
- /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
-}
-
-# for time calculation
-my $start_time;
-if ($TIMING)
-{
- $start_time = [ Time::HiRes::gettimeofday( ) ];
-}
-
-# print help message
-if ($HELP)
-{
- print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
- print "Options:\n";
- print " -q ... quiet.\n";
- print " -a ... aggressive hyphen splitting.\n";
- print " -b ... disable Perl buffering.\n";
- print " -time ... enable processing time calculation.\n";
- exit;
-}
-
-if (!$QUIET)
-{
- print STDERR "Tokenizer Version 1.1\n";
- print STDERR "Language: $language\n";
- print STDERR "Number of threads: $NUM_THREADS\n";
-}
-
-# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
-load_prefixes($language,\%NONBREAKING_PREFIX);
-
-if (scalar(%NONBREAKING_PREFIX) eq 0)
-{
- print STDERR "Warning: No known abbreviations for language '$language'\n";
-}
-
-my @batch_sentences = ();
-my @thread_list = ();
-my $count_sentences = 0;
-
-if ($NUM_THREADS > 1)
-{# multi-threading tokenization
-  while(<STDIN>)
- {
- $count_sentences = $count_sentences + 1;
- push(@batch_sentences, $_);
- if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
- {
- # assign each thread work
- for (my $i=0; $i<$NUM_THREADS; $i++)
- {
- my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
- my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
- my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
- my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
- push(@thread_list, $new_thread);
- }
- foreach (@thread_list)
- {
- my $tokenized_list = $_->join;
- foreach (@$tokenized_list)
- {
- print $_;
- }
- }
- # reset for the new run
- @thread_list = ();
- @batch_sentences = ();
- }
- }
- # the last batch
- if (scalar(@batch_sentences)>0)
- {
- # assign each thread work
- for (my $i=0; $i<$NUM_THREADS; $i++)
- {
- my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
- if ($start_index >= scalar(@batch_sentences))
- {
- last;
- }
- my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
- if ($end_index >= scalar(@batch_sentences))
- {
- $end_index = scalar(@batch_sentences)-1;
- }
- my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
- my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
- push(@thread_list, $new_thread);
- }
- foreach (@thread_list)
- {
- my $tokenized_list = $_->join;
- foreach (@$tokenized_list)
- {
- print $_;
- }
- }
- }
-}
-else
-{# single thread only
-  while(<STDIN>)
- {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
- {
- #don't try to tokenize XML/HTML tag lines
- print $_;
- }
- else
- {
- print &tokenize($_);
- }
- }
-}
-
-if ($TIMING)
-{
- my $duration = Time::HiRes::tv_interval( $start_time );
- print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
- print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
-}
-
-#####################################################################################
-# subroutines afterward
-
-# tokenize a batch of texts saved in an array
-# input: an array containing a batch of texts
-# return: another array containing a batch of tokenized texts for the input array
-sub tokenize_batch
-{
- my(@text_list) = @_;
- my(@tokenized_list) = ();
- foreach (@text_list)
- {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
- {
- #don't try to tokenize XML/HTML tag lines
- push(@tokenized_list, $_);
- }
- else
- {
- push(@tokenized_list, &tokenize($_));
- }
- }
- return \@tokenized_list;
-}
-
-# the actual tokenize function which tokenizes one input string
-# input: one string
-# return: the tokenized string for the input string
-sub tokenize
-{
- my($text) = @_;
- chomp($text);
- $text = " $text ";
-
- # remove ASCII junk
- $text =~ s/\s+/ /g;
- $text =~ s/[\000-\037]//g;
-
-  # separate out all "other" special characters
- $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
-
- # aggressive hyphen splitting
- if ($AGGRESSIVE)
- {
- $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
- }
-
- #multi-dots stay together
- $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
- while($text =~ /DOTMULTI\./)
- {
- $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
- $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
- }
-
- # seperate out "," except if within numbers (5,300)
- $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- # separate , pre and post number
- $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
-
- # turn `into '
- $text =~ s/\`/\'/g;
-
- #turn '' into "
- $text =~ s/\'\'/ \" /g;
-
- if ($language eq "en")
- {
- #split contractions right
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
- #special case for "1990's"
- $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
- }
- elsif (($language eq "fr") or ($language eq "it"))
- {
- #split contractions left
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
- }
- else
- {
- $text =~ s/\'/ \' /g;
- }
-
- #word token method
- my @words = split(/\s/,$text);
- $text = "";
- for (my $i=0;$i<(scalar(@words));$i++)
- {
- my $word = $words[$i];
- if ( $word =~ /^(\S+)\.$/)
- {
- my $pre = $1;
-      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
-      {
-        #no change
-      }
-      elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
-      {
-        #no change
-      }
-      else
-      {
-        $word = $pre." .";
-      }
-    }
-    $text .= $word." ";
-  }
-
-  # clean up extraneous spaces
-  $text =~ s/ +/ /g;
-  $text =~ s/^ //g;
-  $text =~ s/ $//g;
-
-  #restore multi-dots
-  while($text =~ /DOTDOTMULTI/)
-  {
-    $text =~ s/DOTDOTMULTI/DOTMULTI./g;
-  }
-  $text =~ s/DOTMULTI/\./g;
-
-  #escape special chars
-  #$text =~ s/\&/\&amp;/g;   # escape escape
-  #$text =~ s/\|/\&#124;/g;  # factor separator
-  #$text =~ s/\</\&lt;/g;    # xml
-  #$text =~ s/\>/\&gt;/g;    # xml
-  #$text =~ s/\'/\&apos;/g;  # xml
-  #$text =~ s/\"/\&quot;/g;  # xml
-  #$text =~ s/\[/\&#91;/g;   # syntax non-terminal
-  #$text =~ s/\]/\&#93;/g;   # syntax non-terminal
-
- #ensure final line break
- $text .= "\n" unless $text =~ /\n$/;
-
- return $text;
-}
-
-sub load_prefixes
-{
- my ($language, $PREFIX_REF) = @_;
-
- my $prefixfile = "$mydir/nonbreaking_prefix.$language";
-
- #default back to English if we don't have a language-specific prefix file
- if (!(-e $prefixfile))
- {
- $prefixfile = "$mydir/nonbreaking_prefix.en";
- print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
- die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
- }
-
- if (-e "$prefixfile")
- {
- open(PREFIX, "<:utf8", "$prefixfile");
-  while (<PREFIX>)
- {
- my $item = $_;
- chomp($item);
- if (($item) && (substr($item,0,1) ne "#"))
- {
- if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
- {
- $PREFIX_REF->{$1} = 2;
- }
- else
- {
- $PREFIX_REF->{$item} = 1;
- }
- }
- }
- close(PREFIX);
- }
-}
-
diff --git a/toks b/toks
index ed40dbb..8bee29f 100755
--- a/toks
+++ b/toks
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
while line = STDIN.gets
line.strip.split(/\s/).each { |i| puts i }
end
diff --git a/train_test_split b/train_test_split
new file mode 100755
index 0000000..db56de9
--- /dev/null
+++ b/train_test_split
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'trollop'
+
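+# draws --repeat random splits of a parallel corpus: --size lines are held
+# out as the test set, the rest become the training set; split i is written
+# to split_i/<prefix>.{train,test}.<i>.<ext>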
+cfg = Trollop::options do
+ opt :foreign, "foreign file", :type => :string, :required => true
+ opt :english, "english file", :type => :string, :required => true
+ opt :size, "one size", :type => :int, :required => true
+ opt :repeat, "number of repetitions", :type => :int, :default => 1
+ opt :prefix, "prefix for output files", :type => :string
+end
+fn = cfg[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.readlines fn
+en = cfg[:english]
+en_ext = en.split('.').last
+e = ReadFile.readlines en
+size = cfg[:size]
+nlines_f = `wc -l #{fn}`.split()[0].to_i
+nlines_e = `wc -l #{en}`.split()[0].to_i
+if nlines_f != nlines_e
+ STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+ exit 1
+end
+
+prefix = cfg[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+cfg[:repeat].times {
+ b = a.sample(size)
+ ax = a.reject{|j| b.include? j}
+ `mkdir split_#{i}`
+ new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
+ ax.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
+ b.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ i += 1
+}
+
diff --git a/traintestsplit b/traintestsplit
deleted file mode 100755
index ec88df1..0000000
--- a/traintestsplit
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-require 'trollop'
-
-
-cfg = Trollop::options do
- opt :foreign, "foreign file", :type => :string, :required => true
- opt :english, "english file", :type => :string, :required => true
- opt :size, "one size", :type => :int, :required => true
- opt :repeat, "number of repetitions", :type => :int, :default => 1
- opt :prefix, "prefix for output files", :type => :string
-end
-fn = cfg[:foreign]
-fn_ext = fn.split('.').last
-f = ReadFile.readlines fn
-en = cfg[:english]
-en_ext = en.split('.').last
-e = ReadFile.readlines en
-size = cfg[:size]
-nlines_f = `wc -l #{fn}`.split()[0].to_i
-nlines_e = `wc -l #{en}`.split()[0].to_i
-if nlines_f != nlines_e
- STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
- exit 1
-end
-
-prefix = cfg[:prefix]
-a = (0..nlines_e-1).to_a
-i = 0
-cfg[:repeat].times {
- b = a.sample(size)
- ax = a.reject{|j| b.include? j}
- `mkdir split_#{i}`
- new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
- ax.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
- b.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- i += 1
-}
-
diff --git a/var b/var
index fe4aa22..faccefa 100755
--- a/var
+++ b/var
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "stddev [-r ] < "
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
--