From e0b634754d1bef33dc8e72509c6990cccc32745a Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Thu, 9 Oct 2014 20:47:23 +0100
Subject: May makes everything new
---
README.md | 14 +-
add_seg | 2 +-
avg | 1 -
avg_weights | 1 -
cdec_hg_to_json | 80 +++++++++
collapse_tags.rb | 40 -----
dot | 9 +
first_lower | 11 ++
firstlower | 12 --
gigaword_collapse_tags | 39 +++++
hg2json.py | 81 ---------
kbest_bleu_oracles | 2 -
key_count | 14 ++
keycount | 14 --
kmeans | 2 -
lin_reg | 2 -
log_reg | 2 -
max | 1 -
median | 1 -
mem_usage | 11 ++
memusg | 12 --
merge_files | 1 -
merge_ttable | 2 -
min | 1 -
min_max | 1 -
moses_1best | 1 -
mult | 1 -
no_empty | 1 -
num_tok | 1 -
odd | 1 -
parse-stanford.sh | 13 --
paste_pairs | 1 -
per_sentence_bleu | 2 -
per_sentence_bleu_kbest | 2 -
per_sentence_ter | 2 -
pot | 1 -
round | 1 -
ruby_eval | 1 -
rule_shapes | 1 -
shard | 1 -
split_pipes | 23 +++
splitpipes | 24 ---
stanford_parser_run | 13 ++
stddev | 1 -
sum | 1 -
tc | 1 -
test/cdec_hg_to_json/cdec.ini | 5 +
test/cdec_hg_to_json/grammar.gz | Bin 0 -> 1399915 bytes
test/cdec_hg_to_json/hg.json.gz | Bin 0 -> 318029 bytes
test/cdec_hg_to_json/hg.meta | 7 +
test/cdec_hg_to_json/in | 1 +
test/cdec_hg_to_json/toy.cdec.ini | 2 +
test/cdec_hg_to_json/toy.grammar | 12 ++
test/cdec_hg_to_json/toy.in | 1 +
test/cdec_hg_to_json/toy.weights | 3 +
test/cdec_hg_to_json/weights | 17 ++
test/hg2json/cdec.ini | 5 -
test/hg2json/grammar.gz | Bin 1399915 -> 0 bytes
test/hg2json/hg.json.gz | Bin 318029 -> 0 bytes
test/hg2json/hg.meta | 7 -
test/hg2json/in | 1 -
test/hg2json/toy.cdec.ini | 2 -
test/hg2json/toy.grammar | 12 --
test/hg2json/toy.in | 1 -
test/hg2json/toy.weights | 3 -
test/hg2json/weights | 17 --
tf-idf | 2 -
to_ascii | 1 -
tokenizer-no-escape.perl | 348 ++++++++++++++++++++++++++++++++++++++
tokenizer.no-escape.perl | 348 --------------------------------------
toks | 1 -
train_test_split | 50 ++++++
traintestsplit | 51 ------
var | 1 -
74 files changed, 653 insertions(+), 694 deletions(-)
create mode 100755 cdec_hg_to_json
delete mode 100755 collapse_tags.rb
create mode 100755 dot
create mode 100755 first_lower
delete mode 100755 firstlower
create mode 100755 gigaword_collapse_tags
delete mode 100755 hg2json.py
create mode 100755 key_count
delete mode 100755 keycount
create mode 100755 mem_usage
delete mode 100755 memusg
delete mode 100755 parse-stanford.sh
create mode 100755 split_pipes
delete mode 100755 splitpipes
create mode 100755 stanford_parser_run
create mode 100644 test/cdec_hg_to_json/cdec.ini
create mode 100644 test/cdec_hg_to_json/grammar.gz
create mode 100644 test/cdec_hg_to_json/hg.json.gz
create mode 100644 test/cdec_hg_to_json/hg.meta
create mode 100644 test/cdec_hg_to_json/in
create mode 100644 test/cdec_hg_to_json/toy.cdec.ini
create mode 100644 test/cdec_hg_to_json/toy.grammar
create mode 100644 test/cdec_hg_to_json/toy.in
create mode 100644 test/cdec_hg_to_json/toy.weights
create mode 100644 test/cdec_hg_to_json/weights
delete mode 100644 test/hg2json/cdec.ini
delete mode 100644 test/hg2json/grammar.gz
delete mode 100644 test/hg2json/hg.json.gz
delete mode 100644 test/hg2json/hg.meta
delete mode 100644 test/hg2json/in
delete mode 100644 test/hg2json/toy.cdec.ini
delete mode 100644 test/hg2json/toy.grammar
delete mode 100644 test/hg2json/toy.in
delete mode 100644 test/hg2json/toy.weights
delete mode 100644 test/hg2json/weights
create mode 100755 tokenizer-no-escape.perl
delete mode 100755 tokenizer.no-escape.perl
create mode 100755 train_test_split
delete mode 100755 traintestsplit
diff --git a/README.md b/README.md
index 3a6b1b7..fd42922 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,11 @@
-scripts
-=======
+a number of NLP related scripts. Some scripts require my zipf gem, see [1]
 
-A number of NLP related scripts.
-Some scripts require my zipf gem,
-see https://github.com/pks/zipf
+\*.perl taken from the moses [2] toolkit
 
-compound-splitter.perl and tokenizer.no-escape.perl
-taken from the moses [1] toolkit.
+mem\_usage taken from [3]
 
-[1] https://github.com/moses-smt/mosesdecoder
+[1] https://github.com/pks/zipf
+[2] https://github.com/moses-smt/mosesdecoder
+[3] https://gist.github.com/netj/526585
 
 
diff --git a/add_seg b/add_seg
index e4fe22d..7a4ca7a 100755
--- a/add_seg
+++ b/add_seg
@@ -24,8 +24,8 @@ while line = STDIN.gets
s = " 0
- puts s + " id=\"#{index[j]}\"> #{line.strip} "
if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{index[j]}#{ext}\"" end
+ puts s + " id=\"#{index[j]}\"> #{line.strip} "
else
if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end
puts s + " id=\"#{i}\"> #{line.strip} "
diff --git a/avg b/avg
index ed31465..07e3de9 100755
--- a/avg
+++ b/avg
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "avg < "
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/avg_weights b/avg_weights
index 1f9053f..2e23440 100755
--- a/avg_weights
+++ b/avg_weights
@@ -4,7 +4,6 @@ require 'zipf'
require 'trollop'
require 'zlib'
-
cfg = Trollop::options do
opt :weights_files, "a number of weights files: name value", :required => true
opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false
diff --git a/cdec_hg_to_json b/cdec_hg_to_json
new file mode 100755
index 0000000..5a26cf7
--- /dev/null
+++ b/cdec_hg_to_json
@@ -0,0 +1,80 @@
+#!/usr/bin/env python2
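+# reads one sentence from stdin, decodes it with cdec, and writes a JSON
+# representation of the resulting hypergraph to stdout (stats go to stderr)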
+
+import cdec
+import sys, argparse
+
+def hg2json(hg, weights):
+ """
+  output a JSON representation of a cdec hypergraph
+ (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format )
+ """
+ res = ''
+ res += "{\n"
+ res += '"weights":{'+"\n"
+ a = []
+ for i in weights:
+ a.append( '"%s":%s'%(i[0], i[1]) )
+ res += ", ".join(a)+"\n"
+ res += "},\n"
+ res += '"nodes":'+"\n"
+ res += "[\n"
+ a = []
+ a.append( '{ "label":"root", "cat":"root" }' )
+ for i in hg.nodes:
+ a.append( '{ "label":"%s", "cat":"%s" }'%(i.id, i.cat) )
+ res += ",\n".join(a)+"\n"
+ res += "],\n"
+ res += '"edges":'+"\n"
+ res += "[\n"
+ a = []
+ for i in hg.edges:
+ s = "{"
+ s += '"head":"%s"'%(i.head_node.id)
+ xs = ' "f":{'
+ b = []
+ for j in i.feature_values:
+ b.append( '"%s":%s'%(j[0], j[1]) )
+ xs += ", ".join(b)
+ xs += "},"
+ c = []
+ for j in i.tail_nodes:
+ c.append( '"'+str(j.id)+'"' )
+ if len(c) > 0:
+ s += ', "tails":[ %s ],'%(",".join(c))
+ else:
+ s += ', "tails":[ "root" ],'
+ s += xs
+ s += ' "weight":%s }'%(i.prob)
+ a.append(s)
+ res += ",\n".join(a)+"\n"
+ res += "]\n"
+ res += "}\n"
+ return res
+
+def main():
+ parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
+ parser.add_argument('-c', '--config', required=True, help='decoder configuration')
+ parser.add_argument('-w', '--weights', required=True, help='feature weights')
+ args = parser.parse_args()
+ with open(args.config) as config:
+ config = config.read()
+ decoder = cdec.Decoder(config)
+ decoder.read_weights(args.weights)
+ ins = sys.stdin.readline().strip()
+ hg = decoder.translate(ins)
+
+ sys.stderr.write( "input:\n '%s'\n"%(ins) )
+ sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
+ num_nodes = 0
+ for i in hg.nodes: num_nodes+=1
+ sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
+ num_edges = 0
+ for i in hg.edges: num_edges+=1
+ sys.stderr.write( "# edges = %s\n"%(num_edges) )
+ sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
+
+ print hg2json(hg, decoder.weights)
+
+if __name__=="__main__":
+ main()
+
diff --git a/collapse_tags.rb b/collapse_tags.rb
deleted file mode 100755
index 75fcaf5..0000000
--- a/collapse_tags.rb
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env ruby
-
-# works with gigaword en v5
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-
-in_p = false
-in_dateline = false
-collect = []
-
-while line = STDIN.gets
- line.strip!
-  if line.downcase == "<dateline>"
- in_dateline = true
- next
-  elsif line.downcase == "</dateline>"
- in_dateline = false
- next
- elsif in_dateline
- next
-  elsif line.downcase == "<p>" and not in_p
- in_p = true
- collect = []
- next
- elsif line.downcase == "
" and in_p
- if collect.size > 0
- puts collect.join(" ").strip
- end
- in_p = false
- next
- elsif in_p
- collect.push line
- next
- else
- puts line
- end
-end
-
diff --git a/dot b/dot
new file mode 100755
index 0000000..da0dc58
--- /dev/null
+++ b/dot
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
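+# reads two sparse vectors from the files 'w' and 'f' ("key value" pairs,
+# space-separated) and prints the first vector and the dot product a.b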
+a = SparseVector.from_file 'w', ' '
+b = SparseVector.from_file 'f', ' '
+puts a.to_s
+puts a.dot b
+
diff --git a/first_lower b/first_lower
new file mode 100755
index 0000000..1cddb8e
--- /dev/null
+++ b/first_lower
@@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
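+# pass through only those input lines whose first character is lowercase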
+while line = STDIN.gets
+ line.strip!
+ if line && line!='' && line[0].downcase?
+ puts line
+ end
+end
+
diff --git a/firstlower b/firstlower
deleted file mode 100755
index 682a9b7..0000000
--- a/firstlower
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-
-while line = STDIN.gets
- line.strip!
- if line && line!='' && line[0].downcase?
- puts line
- end
-end
-
diff --git a/gigaword_collapse_tags b/gigaword_collapse_tags
new file mode 100755
index 0000000..cbaf7d7
--- /dev/null
+++ b/gigaword_collapse_tags
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+
+# works with gigaword en v5
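+# drops <DATELINE> blocks and joins the lines of each <P>...</P>
+# paragraph into a single line; all other lines pass through unchanged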
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+in_p = false
+in_dateline = false
+collect = []
+
+while line = STDIN.gets
+ line.strip!
+  if line.downcase == "<dateline>"
+ in_dateline = true
+ next
+  elsif line.downcase == "</dateline>"
+ in_dateline = false
+ next
+ elsif in_dateline
+ next
+  elsif line.downcase == "<p>" and not in_p
+ in_p = true
+ collect = []
+ next
+ elsif line.downcase == "
" and in_p
+ if collect.size > 0
+ puts collect.join(" ").strip
+ end
+ in_p = false
+ next
+ elsif in_p
+ collect.push line
+ next
+ else
+ puts line
+ end
+end
+
diff --git a/hg2json.py b/hg2json.py
deleted file mode 100755
index 5bd5c2c..0000000
--- a/hg2json.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env python2
-
-import cdec
-import sys, argparse
-
-def hg2json(hg, weights):
- """
-  output a JSON representation of a cdec hypergraph
- (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format )
- """
- res = ''
- res += "{\n"
- res += '"weights":{'+"\n"
- a = []
- for i in weights:
- a.append( '"%s":%s'%(i[0], i[1]) )
- res += ", ".join(a)+"\n"
- res += "},\n"
- res += '"nodes":'+"\n"
- res += "[\n"
- a = []
- a.append( '{ "label":"root", "cat":"root" }' )
- for i in hg.nodes:
- a.append( '{ "label":"%s", "cat":"%s" }'%(i.id, i.cat) )
- res += ",\n".join(a)+"\n"
- res += "],\n"
- res += '"edges":'+"\n"
- res += "[\n"
- a = []
- for i in hg.edges:
- s = "{"
- s += '"head":"%s"'%(i.head_node.id)
- xs = ' "f":{'
- b = []
- for j in i.feature_values:
- b.append( '"%s":%s'%(j[0], j[1]) )
- xs += ", ".join(b)
- xs += "},"
- c = []
- for j in i.tail_nodes:
- c.append( '"'+str(j.id)+'"' )
- if len(c) > 0:
- s += ', "tails":[ %s ],'%(",".join(c))
- else:
- s += ', "tails":[ "root" ],'
- s += xs
- s += ' "weight":%s }'%(i.prob)
- a.append(s)
- res += ",\n".join(a)+"\n"
- res += "]\n"
- res += "}\n"
- return res
-
-def main():
- parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs')
- parser.add_argument('-c', '--config', required=True, help='decoder configuration')
- parser.add_argument('-w', '--weights', required=True, help='feature weights')
- args = parser.parse_args()
- with open(args.config) as config:
- config = config.read()
- decoder = cdec.Decoder(config)
- decoder.read_weights(args.weights)
- ins = sys.stdin.readline().strip()
- hg = decoder.translate(ins)
-
- sys.stderr.write( "input:\n '%s'\n"%(ins) )
- sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) )
- num_nodes = 0
- for i in hg.nodes: num_nodes+=1
- sys.stderr.write( "# nodes = %s\n"%(num_nodes) )
- num_edges = 0
- for i in hg.edges: num_edges+=1
- sys.stderr.write( "# edges = %s\n"%(num_edges) )
- sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) )
-
- print hg2json(hg, decoder.weights)
-
-
-if __name__=="__main__":
- main()
-
diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles
index 2ac344b..7db1c7e 100755
--- a/kbest_bleu_oracles
+++ b/kbest_bleu_oracles
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def get_context kbest_lists, references, n
a = []
kbest_lists.each_index { |i|
@@ -48,6 +47,5 @@ def main
}
end
-
main
diff --git a/key_count b/key_count
new file mode 100755
index 0000000..deaa522
--- /dev/null
+++ b/key_count
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
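+# count how often each distinct input line occurs and print "<line> <count>" pairs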
+h = {}
+h.default = 0
+while line = STDIN.gets
+ line.strip!
+ h[line] += 1
+end
+
+h.each_pair { |k,v| puts "#{k} #{v}" }
+
diff --git a/keycount b/keycount
deleted file mode 100755
index deaa522..0000000
--- a/keycount
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env ruby
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-h = {}
-h.default = 0
-while line = STDIN.gets
- line.strip!
- h[line] += 1
-end
-
-h.each_pair { |k,v| puts "#{k} #{v}" }
-
diff --git a/kmeans b/kmeans
index ec28897..201864b 100755
--- a/kmeans
+++ b/kmeans
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
@@ -114,6 +113,5 @@ def main
end
end
-
main
diff --git a/lin_reg b/lin_reg
index 168e7df..4a7c3b2 100755
--- a/lin_reg
+++ b/lin_reg
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def read_data fn, scale
f = ReadFile.new fn
data = []
@@ -67,6 +66,5 @@ def main
puts model.to_s
end
-
main
diff --git a/log_reg b/log_reg
index e6f47eb..3916d0c 100755
--- a/log_reg
+++ b/log_reg
@@ -4,7 +4,6 @@ require 'zipf'
require 'matrix'
require 'trollop'
-
def read_data fn
f = ReadFile.new fn
data = []
@@ -68,6 +67,5 @@ def main
puts model.to_s
end
-
main
diff --git a/max b/max
index 87f3c73..b2c1cae 100755
--- a/max
+++ b/max
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
max = -1.0/0
while line = STDIN.gets
v = line.to_f
diff --git a/median b/median
index 9499c95..0b1950b 100755
--- a/median
+++ b/median
@@ -2,7 +2,6 @@
require 'zipf'
-
a = []
while line = STDIN.gets
a << line.to_f
diff --git a/mem_usage b/mem_usage
new file mode 100755
index 0000000..5c2104f
--- /dev/null
+++ b/mem_usage
@@ -0,0 +1,11 @@
+#!/bin/bash
+
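+# usage: mem_usage <command> [arg ...]
+# runs the command in the background, samples its resident set size (KB)
+# via ps once per second, and prints the peak in megabytes when it exits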
+"$@" &
+pid=$! peak=0
+while true; do
+ sleep 1
+ sample="$(ps -o rss= $pid 2> /dev/null)" || break
+ let peak='sample > peak ? sample : peak'
+done
+echo "$(( ${peak%% *} / 1024)) m"
+
diff --git a/memusg b/memusg
deleted file mode 100755
index a69daaa..0000000
--- a/memusg
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-
-"$@" &
-pid=$! peak=0
-while true; do
- sleep 1
- sample="$(ps -o rss= $pid 2> /dev/null)" || break
- let peak='sample > peak ? sample : peak'
-done
-echo "$(( ${peak%% *} / 1024)) m"
-
diff --git a/merge_files b/merge_files
index 0b4941e..714b57d 100755
--- a/merge_files
+++ b/merge_files
@@ -2,7 +2,6 @@
require 'zipf'
-
def usage
  STDERR.write "merge_files <file>+\n"
exit 1
diff --git a/merge_ttable b/merge_ttable
index 20d86d3..e4621f5 100755
--- a/merge_ttable
+++ b/merge_ttable
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :f, "f files", :type => :string, :required => true
@@ -31,6 +30,5 @@ def main
}
end
-
main
diff --git a/min b/min
index 398b0fb..f8a7e42 100755
--- a/min
+++ b/min
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
min = 1.0/0
while line = STDIN.gets
v = line.to_f
diff --git a/min_max b/min_max
index 17dc566..b79a743 100755
--- a/min_max
+++ b/min_max
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
cfg = Trollop::options do
opt :min, "minimum #tokens", :type => :int, :default => 1
opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n'
diff --git a/moses_1best b/moses_1best
index 849ebf1..fd35cf8 100755
--- a/moses_1best
+++ b/moses_1best
@@ -2,7 +2,6 @@
require 'zipf'
-
prev_idx = nil
while line = STDIN.gets
line.strip!
diff --git a/mult b/mult
index 2ef0149..478ec5e 100755
--- a/mult
+++ b/mult
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
factor = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f * factor
diff --git a/no_empty b/no_empty
index 96c9ce4..da57e23 100755
--- a/no_empty
+++ b/no_empty
@@ -2,7 +2,6 @@
require 'zipf'
-
files = []
(0..1).each { |i| files << ReadFile.new(ARGV[i]) }
(2..3).each { |i| files << WriteFile.new(ARGV[i]) }
diff --git a/num_tok b/num_tok
index 53b99a0..56cbae9 100755
--- a/num_tok
+++ b/num_tok
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
while line = STDIN.gets
puts line.strip.split.length
end
diff --git a/odd b/odd
index 93aaa80..0bd9336 100755
--- a/odd
+++ b/odd
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
i = 1
while line = STDIN.gets
puts line if i%2!=0
diff --git a/parse-stanford.sh b/parse-stanford.sh
deleted file mode 100755
index f8d4210..0000000
--- a/parse-stanford.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-if [ $# != 1 ]; then
- echo "$0 text-file"
- exit 1
-fi
-
-export CLASSPATH=:/toolbox/stanfordparser_3_2_0/*
-
-IN=$1
-
-cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp
-
diff --git a/paste_pairs b/paste_pairs
index 07c1f22..f6b8b31 100755
--- a/paste_pairs
+++ b/paste_pairs
@@ -3,7 +3,6 @@
import sys
from itertools import izip
-
for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))):
print linenr, (src_line.strip())
print linenr, (tgt_line.strip())
diff --git a/per_sentence_bleu b/per_sentence_bleu
index 76fcf38..5bacd1a 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -26,6 +25,5 @@ def main
input.close
end
-
main
diff --git a/per_sentence_bleu_kbest b/per_sentence_bleu_kbest
index 4d821b3..e6a31cb 100755
--- a/per_sentence_bleu_kbest
+++ b/per_sentence_bleu_kbest
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :kbests, "kbests", :type => :string, :default => '-'
@@ -29,6 +28,5 @@ def main
}
end
-
main
diff --git a/per_sentence_ter b/per_sentence_ter
index 8b04be5..343708e 100755
--- a/per_sentence_ter
+++ b/per_sentence_ter
@@ -4,7 +4,6 @@ require 'zipf'
require 'trollop'
require 'tempfile'
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -30,6 +29,5 @@ def main
input.close
end
-
main
diff --git a/pot b/pot
index ec199ea..24acabe 100755
--- a/pot
+++ b/pot
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
pow = ARGV[0].to_f
while line = STDIN.gets
puts line.to_f**pow
diff --git a/round b/round
index 3dfbb6f..dfef800 100755
--- a/round
+++ b/round
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
r = ARGV[0].to_i
while line = STDIN.gets
puts line.to_f.round r
diff --git a/ruby_eval b/ruby_eval
index 96b2ecb..fe0d181 100755
--- a/ruby_eval
+++ b/ruby_eval
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
while line = STDIN.gets
puts "#{eval line}"
end
diff --git a/rule_shapes b/rule_shapes
index fd42249..589a670 100755
--- a/rule_shapes
+++ b/rule_shapes
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
def shape s
res = []
in_t = false
diff --git a/shard b/shard
index f952104..6155123 100755
--- a/shard
+++ b/shard
@@ -2,7 +2,6 @@
require 'trollop'
-
def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
lc = `wc -l #{input}`.split.first.to_i
input_ext = input.split('.').last
diff --git a/split_pipes b/split_pipes
new file mode 100755
index 0000000..eeba69b
--- /dev/null
+++ b/split_pipes
@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
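+# print field -f (1-based) of each line, with fields separated by ' ||| '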
+cfg = Trollop::options do
+ banner "splitpipes -f < "
+ opt :field, "field", :type => :int
+end
+
+while line = STDIN.gets
+ j = 1
+ line.strip.split(' ||| ').each { |i|
+ if j == cfg[:field]
+ puts i.strip
+ break
+ end
+ j += 1
+ }
+end
+
diff --git a/splitpipes b/splitpipes
deleted file mode 100755
index 35ee176..0000000
--- a/splitpipes
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'trollop'
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-
-
-cfg = Trollop::options do
- banner "splitpipes -f < "
- opt :field, "field", :type => :int
-end
-
-while line = STDIN.gets
- j = 1
- line.strip.split(' ||| ').each { |i|
- if j == cfg[:field]
- puts i.strip
- break
- end
- j += 1
- }
-end
-
diff --git a/stanford_parser_run b/stanford_parser_run
new file mode 100755
index 0000000..f8d4210
--- /dev/null
+++ b/stanford_parser_run
@@ -0,0 +1,13 @@
+#!/bin/bash
+
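+# usage: stanford_parser_run <text-file>
+# parses tokenized, one-sentence-per-line input with the Stanford parser and
+# writes basic typed dependencies, one sentence per line, to <text-file>.stp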
+if [ $# != 1 ]; then
+ echo "$0 text-file"
+ exit 1
+fi
+
+export CLASSPATH=:/toolbox/stanfordparser_3_2_0/*
+
+IN=$1
+
+cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp
+
diff --git a/stddev b/stddev
index 5cda0e0..a7397b2 100755
--- a/stddev
+++ b/stddev
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "stddev [-r ] < "
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
diff --git a/sum b/sum
index dac72d3..acfa563 100755
--- a/sum
+++ b/sum
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
sum = 0.0
while line = STDIN.gets
sum += line.to_f
diff --git a/tc b/tc
index 993086a..7eefdd5 100755
--- a/tc
+++ b/tc
@@ -2,7 +2,6 @@
require 'zipf'
-
while line = STDIN.gets
puts tokenize(line.strip).size
end
diff --git a/test/cdec_hg_to_json/cdec.ini b/test/cdec_hg_to_json/cdec.ini
new file mode 100644
index 0000000..1ad25b5
--- /dev/null
+++ b/test/cdec_hg_to_json/cdec.ini
@@ -0,0 +1,5 @@
+formalism=scfg
+grammar=test/cdec_hg_to_json/grammar.gz
+add_pass_through_rules=true
+feature_function=WordPenalty
+intersection_strategy=full
diff --git a/test/cdec_hg_to_json/grammar.gz b/test/cdec_hg_to_json/grammar.gz
new file mode 100644
index 0000000..78dda98
Binary files /dev/null and b/test/cdec_hg_to_json/grammar.gz differ
diff --git a/test/cdec_hg_to_json/hg.json.gz b/test/cdec_hg_to_json/hg.json.gz
new file mode 100644
index 0000000..ed178c6
Binary files /dev/null and b/test/cdec_hg_to_json/hg.json.gz differ
diff --git a/test/cdec_hg_to_json/hg.meta b/test/cdec_hg_to_json/hg.meta
new file mode 100644
index 0000000..d33a54c
--- /dev/null
+++ b/test/cdec_hg_to_json/hg.meta
@@ -0,0 +1,7 @@
+input:
+ 'in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .'
+viterbi translation:
+ 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .'
+# nodes = 220
+# edges = 16640
+viterbi score = 228.95
diff --git a/test/cdec_hg_to_json/in b/test/cdec_hg_to_json/in
new file mode 100644
index 0000000..7dc411d
--- /dev/null
+++ b/test/cdec_hg_to_json/in
@@ -0,0 +1 @@
+in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .
diff --git a/test/cdec_hg_to_json/toy.cdec.ini b/test/cdec_hg_to_json/toy.cdec.ini
new file mode 100644
index 0000000..d4a2896
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.cdec.ini
@@ -0,0 +1,2 @@
+formalism=scfg
+grammar=test/cdec_hg_to_json/toy.grammar
diff --git a/test/cdec_hg_to_json/toy.grammar b/test/cdec_hg_to_json/toy.grammar
new file mode 100644
index 0000000..382c94f
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.grammar
@@ -0,0 +1,12 @@
+[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
+[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
+[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
+[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
+[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
+[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
+[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
+[JJ] ||| grosses ||| big ||| logp=0
+[JJ] ||| grosses ||| large ||| logp=0
+[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
+[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
+[V] ||| fand ||| found ||| logp=0
diff --git a/test/cdec_hg_to_json/toy.in b/test/cdec_hg_to_json/toy.in
new file mode 100644
index 0000000..e6df927
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.in
@@ -0,0 +1 @@
+ich sah ein kleines haus
diff --git a/test/cdec_hg_to_json/toy.weights b/test/cdec_hg_to_json/toy.weights
new file mode 100644
index 0000000..70075b7
--- /dev/null
+++ b/test/cdec_hg_to_json/toy.weights
@@ -0,0 +1,3 @@
+logp 2
+use_house 0
+use_shell 1
diff --git a/test/cdec_hg_to_json/weights b/test/cdec_hg_to_json/weights
new file mode 100644
index 0000000..7f96f1d
--- /dev/null
+++ b/test/cdec_hg_to_json/weights
@@ -0,0 +1,17 @@
+PhraseModel_0 1.0
+PhraseModel_1 1.0
+PhraseModel_2 1.0
+PhraseModel_3 1.0
+PhraseModel_4 1.0
+PhraseModel_5 1.0
+PhraseModel_6 1.0
+PassThrough -1.0
+PassThrough_1 -1.0
+PassThrough_2 -1.0
+PassThrough_3 -1.0
+PassThrough_4 -1.0
+PassThrough_5 -1.0
+PassThrough_6 -1.0
+Glue 0.1
+LanguageModel 10.0
+LanguageModel_OOV -10
diff --git a/test/hg2json/cdec.ini b/test/hg2json/cdec.ini
deleted file mode 100644
index 1ad25b5..0000000
--- a/test/hg2json/cdec.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-formalism=scfg
-grammar=test/hg2json/grammar.gz
-add_pass_through_rules=true
-feature_function=WordPenalty
-intersection_strategy=full
diff --git a/test/hg2json/grammar.gz b/test/hg2json/grammar.gz
deleted file mode 100644
index 78dda98..0000000
Binary files a/test/hg2json/grammar.gz and /dev/null differ
diff --git a/test/hg2json/hg.json.gz b/test/hg2json/hg.json.gz
deleted file mode 100644
index ed178c6..0000000
Binary files a/test/hg2json/hg.json.gz and /dev/null differ
diff --git a/test/hg2json/hg.meta b/test/hg2json/hg.meta
deleted file mode 100644
index d33a54c..0000000
--- a/test/hg2json/hg.meta
+++ /dev/null
@@ -1,7 +0,0 @@
-input:
- 'in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .'
-viterbi translation:
- 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .'
-# nodes = 220
-# edges = 16640
-viterbi score = 228.95
diff --git a/test/hg2json/in b/test/hg2json/in
deleted file mode 100644
index 7dc411d..0000000
--- a/test/hg2json/in
+++ /dev/null
@@ -1 +0,0 @@
-in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .
diff --git a/test/hg2json/toy.cdec.ini b/test/hg2json/toy.cdec.ini
deleted file mode 100644
index d4a2896..0000000
--- a/test/hg2json/toy.cdec.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-formalism=scfg
-grammar=test/hg2json/toy.grammar
diff --git a/test/hg2json/toy.grammar b/test/hg2json/toy.grammar
deleted file mode 100644
index 382c94f..0000000
--- a/test/hg2json/toy.grammar
+++ /dev/null
@@ -1,12 +0,0 @@
-[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
-[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
-[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
-[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1
-[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1
-[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0
-[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0
-[JJ] ||| grosses ||| big ||| logp=0
-[JJ] ||| grosses ||| large ||| logp=0
-[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0
-[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0
-[V] ||| fand ||| found ||| logp=0
diff --git a/test/hg2json/toy.in b/test/hg2json/toy.in
deleted file mode 100644
index e6df927..0000000
--- a/test/hg2json/toy.in
+++ /dev/null
@@ -1 +0,0 @@
-ich sah ein kleines haus
diff --git a/test/hg2json/toy.weights b/test/hg2json/toy.weights
deleted file mode 100644
index 70075b7..0000000
--- a/test/hg2json/toy.weights
+++ /dev/null
@@ -1,3 +0,0 @@
-logp 2
-use_house 0
-use_shell 1
diff --git a/test/hg2json/weights b/test/hg2json/weights
deleted file mode 100644
index 7f96f1d..0000000
--- a/test/hg2json/weights
+++ /dev/null
@@ -1,17 +0,0 @@
-PhraseModel_0 1.0
-PhraseModel_1 1.0
-PhraseModel_2 1.0
-PhraseModel_3 1.0
-PhraseModel_4 1.0
-PhraseModel_5 1.0
-PhraseModel_6 1.0
-PassThrough -1.0
-PassThrough_1 -1.0
-PassThrough_2 -1.0
-PassThrough_3 -1.0
-PassThrough_4 -1.0
-PassThrough_5 -1.0
-PassThrough_6 -1.0
-Glue 0.1
-LanguageModel 10.0
-LanguageModel_OOV -10
diff --git a/tf-idf b/tf-idf
index fc6c2ec..450de6b 100755
--- a/tf-idf
+++ b/tf-idf
@@ -3,7 +3,6 @@
require 'zipf'
require 'trollop'
-
def main
cfg = Trollop::options do
opt :documents, "input files (documents)", :type => :string, :required => true
@@ -48,6 +47,5 @@ def main
docs.each { |i| puts i.to_s }
end
-
main
diff --git a/to_ascii b/to_ascii
index 6c1d23e..10fd1c2 100755
--- a/to_ascii
+++ b/to_ascii
@@ -1,6 +1,5 @@
#!/usr/bin/env ruby
-
while line = STDIN.gets
encoding_options = {
:invalid => :replace,
diff --git a/tokenizer-no-escape.perl b/tokenizer-no-escape.perl
new file mode 100755
index 0000000..4397360
--- /dev/null
+++ b/tokenizer-no-escape.perl
@@ -0,0 +1,348 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+#use Thread;
+
+my $mydir = "$RealBin/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+
+while (@ARGV)
+{
+ $_ = shift;
+ /^-b$/ && ($| = 1, next);
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+ /^-x$/ && ($SKIP_XML = 1, next);
+ /^-a$/ && ($AGGRESSIVE = 1, next);
+ /^-time$/ && ($TIMING = 1, next);
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+if ($HELP)
+{
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+ print "Options:\n";
+ print " -q ... quiet.\n";
+ print " -a ... aggressive hyphen splitting.\n";
+ print " -b ... disable Perl buffering.\n";
+ print " -time ... enable processing time calculation.\n";
+ exit;
+}
+
+if (!$QUIET)
+{
+ print STDERR "Tokenizer Version 1.1\n";
+ print STDERR "Language: $language\n";
+ print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+  while(<STDIN>)
+ {
+ $count_sentences = $count_sentences + 1;
+ push(@batch_sentences, $_);
+ if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ # reset for the new run
+ @thread_list = ();
+ @batch_sentences = ();
+ }
+ }
+ # the last batch
+ if (scalar(@batch_sentences)>0)
+ {
+ # assign each thread work
+ for (my $i=0; $i<$NUM_THREADS; $i++)
+ {
+ my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+ if ($start_index >= scalar(@batch_sentences))
+ {
+ last;
+ }
+ my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+ if ($end_index >= scalar(@batch_sentences))
+ {
+ $end_index = scalar(@batch_sentences)-1;
+ }
+ my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+ my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+ push(@thread_list, $new_thread);
+ }
+ foreach (@thread_list)
+ {
+ my $tokenized_list = $_->join;
+ foreach (@$tokenized_list)
+ {
+ print $_;
+ }
+ }
+ }
+}
+else
+{# single thread only
+  while(<STDIN>)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ print $_;
+ }
+ else
+ {
+ print &tokenize($_);
+ }
+ }
+}
+
+if ($TIMING)
+{
+ my $duration = Time::HiRes::tv_interval( $start_time );
+ print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+ print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: another array containing a batch of tokenized texts for the input array
+sub tokenize_batch
+{
+ my(@text_list) = @_;
+ my(@tokenized_list) = ();
+ foreach (@text_list)
+ {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+ {
+ #don't try to tokenize XML/HTML tag lines
+ push(@tokenized_list, $_);
+ }
+ else
+ {
+ push(@tokenized_list, &tokenize($_));
+ }
+ }
+ return \@tokenized_list;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+
+ # remove ASCII junk
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
+
+  # separate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting
+ if ($AGGRESSIVE)
+ {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
+
+ #multi-dots stay together
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./)
+ {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # seperate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn `into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en")
+ {
+ #split contractions right
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+ }
+ elsif (($language eq "fr") or ($language eq "it"))
+ {
+ #split contractions left
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ }
+ else
+ {
+ $text =~ s/\'/ \' /g;
+ }
+
+ #word token method
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++)
+ {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/)
+ {
+ my $pre = $1;
+      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+      {
+        #no change
+      }
+      elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+      {
+        #no change
+      }
+      else
+      {
+        $word = $pre." .";
+      }
+    }
+    $text .= $word." ";
+  }
+
+  # clean up extraneous spaces
+  $text =~ s/ +/ /g;
+  $text =~ s/^ //g;
+  $text =~ s/ $//g;
+
+  #restore multi-dots
+  while($text =~ /DOTDOTMULTI/)
+  {
+    $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+  }
+  $text =~ s/DOTMULTI/\./g;
+
+  #escape special chars
+  #$text =~ s/\&/\&amp;/g;   # escape escape
+  #$text =~ s/\|/\&#124;/g;  # factor separator
+  #$text =~ s/\</\&lt;/g;    # xml
+  #$text =~ s/\>/\&gt;/g;    # xml
+  #$text =~ s/\'/\&apos;/g;  # xml
+  #$text =~ s/\"/\&quot;/g;  # xml
+  #$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+  #$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+sub load_prefixes
+{
+ my ($language, $PREFIX_REF) = @_;
+
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+ #default back to English if we don't have a language-specific prefix file
+ if (!(-e $prefixfile))
+ {
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+ }
+
+ if (-e "$prefixfile")
+ {
+ open(PREFIX, "<:utf8", "$prefixfile");
+  while (<PREFIX>)
+ {
+ my $item = $_;
+ chomp($item);
+ if (($item) && (substr($item,0,1) ne "#"))
+ {
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+ {
+ $PREFIX_REF->{$1} = 2;
+ }
+ else
+ {
+ $PREFIX_REF->{$item} = 1;
+ }
+ }
+ }
+ close(PREFIX);
+ }
+}
+
diff --git a/tokenizer.no-escape.perl b/tokenizer.no-escape.perl
deleted file mode 100755
index 4397360..0000000
--- a/tokenizer.no-escape.perl
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/usr/bin/perl -w
-
-# Sample Tokenizer
-### Version 1.1
-# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
-# Version 1.1 updates:
-# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
-# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
-# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
-### Version 1.0
-# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
-# written by Josh Schroeder, based on code by Philipp Koehn
-
-binmode(STDIN, ":utf8");
-binmode(STDOUT, ":utf8");
-
-use FindBin qw($RealBin);
-use strict;
-use Time::HiRes;
-#use Thread;
-
-my $mydir = "$RealBin/nonbreaking_prefixes";
-
-my %NONBREAKING_PREFIX = ();
-my $language = "en";
-my $QUIET = 0;
-my $HELP = 0;
-my $AGGRESSIVE = 0;
-my $SKIP_XML = 0;
-my $TIMING = 0;
-my $NUM_THREADS = 1;
-my $NUM_SENTENCES_PER_THREAD = 2000;
-
-while (@ARGV)
-{
- $_ = shift;
- /^-b$/ && ($| = 1, next);
- /^-l$/ && ($language = shift, next);
- /^-q$/ && ($QUIET = 1, next);
- /^-h$/ && ($HELP = 1, next);
- /^-x$/ && ($SKIP_XML = 1, next);
- /^-a$/ && ($AGGRESSIVE = 1, next);
- /^-time$/ && ($TIMING = 1, next);
- /^-threads$/ && ($NUM_THREADS = int(shift), next);
- /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
-}
-
-# for time calculation
-my $start_time;
-if ($TIMING)
-{
- $start_time = [ Time::HiRes::gettimeofday( ) ];
-}
-
-# print help message
-if ($HELP)
-{
- print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
- print "Options:\n";
- print " -q ... quiet.\n";
- print " -a ... aggressive hyphen splitting.\n";
- print " -b ... disable Perl buffering.\n";
- print " -time ... enable processing time calculation.\n";
- exit;
-}
-
-if (!$QUIET)
-{
- print STDERR "Tokenizer Version 1.1\n";
- print STDERR "Language: $language\n";
- print STDERR "Number of threads: $NUM_THREADS\n";
-}
-
-# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
-load_prefixes($language,\%NONBREAKING_PREFIX);
-
-if (scalar(%NONBREAKING_PREFIX) eq 0)
-{
- print STDERR "Warning: No known abbreviations for language '$language'\n";
-}
-
-my @batch_sentences = ();
-my @thread_list = ();
-my $count_sentences = 0;
-
-if ($NUM_THREADS > 1)
-{# multi-threading tokenization
-  while(<STDIN>)
- {
- $count_sentences = $count_sentences + 1;
- push(@batch_sentences, $_);
- if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
- {
- # assign each thread work
- for (my $i=0; $i<$NUM_THREADS; $i++)
- {
- my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
- my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
- my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
- my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
- push(@thread_list, $new_thread);
- }
- foreach (@thread_list)
- {
- my $tokenized_list = $_->join;
- foreach (@$tokenized_list)
- {
- print $_;
- }
- }
- # reset for the new run
- @thread_list = ();
- @batch_sentences = ();
- }
- }
- # the last batch
- if (scalar(@batch_sentences)>0)
- {
- # assign each thread work
- for (my $i=0; $i<$NUM_THREADS; $i++)
- {
- my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
- if ($start_index >= scalar(@batch_sentences))
- {
- last;
- }
- my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
- if ($end_index >= scalar(@batch_sentences))
- {
- $end_index = scalar(@batch_sentences)-1;
- }
- my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
- my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
- push(@thread_list, $new_thread);
- }
- foreach (@thread_list)
- {
- my $tokenized_list = $_->join;
- foreach (@$tokenized_list)
- {
- print $_;
- }
- }
- }
-}
-else
-{# single thread only
-  while(<STDIN>)
- {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
- {
- #don't try to tokenize XML/HTML tag lines
- print $_;
- }
- else
- {
- print &tokenize($_);
- }
- }
-}
-
-if ($TIMING)
-{
- my $duration = Time::HiRes::tv_interval( $start_time );
- print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
- print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
-}
-
-#####################################################################################
-# subroutines afterward
-
-# tokenize a batch of texts saved in an array
-# input: an array containing a batch of texts
-# return: another array containing a batch of tokenized texts for the input array
-sub tokenize_batch
-{
- my(@text_list) = @_;
- my(@tokenized_list) = ();
- foreach (@text_list)
- {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
- {
- #don't try to tokenize XML/HTML tag lines
- push(@tokenized_list, $_);
- }
- else
- {
- push(@tokenized_list, &tokenize($_));
- }
- }
- return \@tokenized_list;
-}
-
-# the actual tokenize function which tokenizes one input string
-# input: one string
-# return: the tokenized string for the input string
-sub tokenize
-{
- my($text) = @_;
- chomp($text);
- $text = " $text ";
-
- # remove ASCII junk
- $text =~ s/\s+/ /g;
- $text =~ s/[\000-\037]//g;
-
-  # separate out all "other" special characters
- $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
-
- # aggressive hyphen splitting
- if ($AGGRESSIVE)
- {
- $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
- }
-
- #multi-dots stay together
- $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
- while($text =~ /DOTMULTI\./)
- {
- $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
- $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
- }
-
- # seperate out "," except if within numbers (5,300)
- $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- # separate , pre and post number
- $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
-
- # turn `into '
- $text =~ s/\`/\'/g;
-
- #turn '' into "
- $text =~ s/\'\'/ \" /g;
-
- if ($language eq "en")
- {
- #split contractions right
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
- #special case for "1990's"
- $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
- }
- elsif (($language eq "fr") or ($language eq "it"))
- {
- #split contractions left
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
- }
- else
- {
- $text =~ s/\'/ \' /g;
- }
-
- #word token method
- my @words = split(/\s/,$text);
- $text = "";
- for (my $i=0;$i<(scalar(@words));$i++)
- {
- my $word = $words[$i];
- if ( $word =~ /^(\S+)\.$/)
- {
- my $pre = $1;
-      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
-      {
-        #no change
-      }
-      elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
-      {
-        #no change
-      }
-      else
-      {
-        $word = $pre." .";
-      }
-    }
-    $text .= $word." ";
-  }
-
-  # clean up extraneous spaces
-  $text =~ s/ +/ /g;
-  $text =~ s/^ //g;
-  $text =~ s/ $//g;
-
-  #restore multi-dots
-  while($text =~ /DOTDOTMULTI/)
-  {
-    $text =~ s/DOTDOTMULTI/DOTMULTI./g;
-  }
-  $text =~ s/DOTMULTI/\./g;
-
-  #escape special chars
-  #$text =~ s/\&/\&amp;/g;   # escape escape
-  #$text =~ s/\|/\&#124;/g;  # factor separator
-  #$text =~ s/\</\&lt;/g;    # xml
-  #$text =~ s/\>/\&gt;/g;    # xml
-  #$text =~ s/\'/\&apos;/g;  # xml
-  #$text =~ s/\"/\&quot;/g;  # xml
-  #$text =~ s/\[/\&#91;/g;   # syntax non-terminal
-  #$text =~ s/\]/\&#93;/g;   # syntax non-terminal
-
- #ensure final line break
- $text .= "\n" unless $text =~ /\n$/;
-
- return $text;
-}
-
-sub load_prefixes
-{
- my ($language, $PREFIX_REF) = @_;
-
- my $prefixfile = "$mydir/nonbreaking_prefix.$language";
-
- #default back to English if we don't have a language-specific prefix file
- if (!(-e $prefixfile))
- {
- $prefixfile = "$mydir/nonbreaking_prefix.en";
- print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
- die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
- }
-
- if (-e "$prefixfile")
- {
- open(PREFIX, "<:utf8", "$prefixfile");
-  while (<PREFIX>)
- {
- my $item = $_;
- chomp($item);
- if (($item) && (substr($item,0,1) ne "#"))
- {
- if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
- {
- $PREFIX_REF->{$1} = 2;
- }
- else
- {
- $PREFIX_REF->{$item} = 1;
- }
- }
- }
- close(PREFIX);
- }
-}
-
diff --git a/toks b/toks
index ed40dbb..8bee29f 100755
--- a/toks
+++ b/toks
@@ -3,7 +3,6 @@
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
-
while line = STDIN.gets
line.strip.split(/\s/).each { |i| puts i }
end
diff --git a/train_test_split b/train_test_split
new file mode 100755
index 0000000..db56de9
--- /dev/null
+++ b/train_test_split
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'trollop'
+
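+# draws --repeat random splits of a parallel corpus: --size lines are held
+# out as the test set, the rest become the training set; split i is written
+# to split_i/<prefix>.{train,test}.<i>.<ext>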
+cfg = Trollop::options do
+ opt :foreign, "foreign file", :type => :string, :required => true
+ opt :english, "english file", :type => :string, :required => true
+ opt :size, "one size", :type => :int, :required => true
+ opt :repeat, "number of repetitions", :type => :int, :default => 1
+ opt :prefix, "prefix for output files", :type => :string
+end
+fn = cfg[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.readlines fn
+en = cfg[:english]
+en_ext = en.split('.').last
+e = ReadFile.readlines en
+size = cfg[:size]
+nlines_f = `wc -l #{fn}`.split()[0].to_i
+nlines_e = `wc -l #{en}`.split()[0].to_i
+if nlines_f != nlines_e
+ STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+ exit 1
+end
+
+prefix = cfg[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+cfg[:repeat].times {
+ b = a.sample(size)
+ ax = a.reject{|j| b.include? j}
+ `mkdir split_#{i}`
+ new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
+ ax.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
+ b.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ i += 1
+}
+
diff --git a/traintestsplit b/traintestsplit
deleted file mode 100755
index ec88df1..0000000
--- a/traintestsplit
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-require 'trollop'
-
-
-cfg = Trollop::options do
- opt :foreign, "foreign file", :type => :string, :required => true
- opt :english, "english file", :type => :string, :required => true
- opt :size, "one size", :type => :int, :required => true
- opt :repeat, "number of repetitions", :type => :int, :default => 1
- opt :prefix, "prefix for output files", :type => :string
-end
-fn = cfg[:foreign]
-fn_ext = fn.split('.').last
-f = ReadFile.readlines fn
-en = cfg[:english]
-en_ext = en.split('.').last
-e = ReadFile.readlines en
-size = cfg[:size]
-nlines_f = `wc -l #{fn}`.split()[0].to_i
-nlines_e = `wc -l #{en}`.split()[0].to_i
-if nlines_f != nlines_e
- STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
- exit 1
-end
-
-prefix = cfg[:prefix]
-a = (0..nlines_e-1).to_a
-i = 0
-cfg[:repeat].times {
- b = a.sample(size)
- ax = a.reject{|j| b.include? j}
- `mkdir split_#{i}`
- new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
- ax.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
- b.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- i += 1
-}
-
diff --git a/var b/var
index fe4aa22..faccefa 100755
--- a/var
+++ b/var
@@ -2,7 +2,6 @@
require 'trollop'
-
cfg = Trollop::options do
banner "stddev [-r ] < "
opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
--