From e0b634754d1bef33dc8e72509c6990cccc32745a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 9 Oct 2014 20:47:23 +0100 Subject: may makes everything new --- README.md | 14 +- add_seg | 2 +- avg | 1 - avg_weights | 1 - cdec_hg_to_json | 80 +++++++++ collapse_tags.rb | 40 ----- dot | 9 + first_lower | 11 ++ firstlower | 12 -- gigaword_collapse_tags | 39 +++++ hg2json.py | 81 --------- kbest_bleu_oracles | 2 - key_count | 14 ++ keycount | 14 -- kmeans | 2 - lin_reg | 2 - log_reg | 2 - max | 1 - median | 1 - mem_usage | 11 ++ memusg | 12 -- merge_files | 1 - merge_ttable | 2 - min | 1 - min_max | 1 - moses_1best | 1 - mult | 1 - no_empty | 1 - num_tok | 1 - odd | 1 - parse-stanford.sh | 13 -- paste_pairs | 1 - per_sentence_bleu | 2 - per_sentence_bleu_kbest | 2 - per_sentence_ter | 2 - pot | 1 - round | 1 - ruby_eval | 1 - rule_shapes | 1 - shard | 1 - split_pipes | 23 +++ splitpipes | 24 --- stanford_parser_run | 13 ++ stddev | 1 - sum | 1 - tc | 1 - test/cdec_hg_to_json/cdec.ini | 5 + test/cdec_hg_to_json/grammar.gz | Bin 0 -> 1399915 bytes test/cdec_hg_to_json/hg.json.gz | Bin 0 -> 318029 bytes test/cdec_hg_to_json/hg.meta | 7 + test/cdec_hg_to_json/in | 1 + test/cdec_hg_to_json/toy.cdec.ini | 2 + test/cdec_hg_to_json/toy.grammar | 12 ++ test/cdec_hg_to_json/toy.in | 1 + test/cdec_hg_to_json/toy.weights | 3 + test/cdec_hg_to_json/weights | 17 ++ test/hg2json/cdec.ini | 5 - test/hg2json/grammar.gz | Bin 1399915 -> 0 bytes test/hg2json/hg.json.gz | Bin 318029 -> 0 bytes test/hg2json/hg.meta | 7 - test/hg2json/in | 1 - test/hg2json/toy.cdec.ini | 2 - test/hg2json/toy.grammar | 12 -- test/hg2json/toy.in | 1 - test/hg2json/toy.weights | 3 - test/hg2json/weights | 17 -- tf-idf | 2 - to_ascii | 1 - tokenizer-no-escape.perl | 348 ++++++++++++++++++++++++++++++++++++++ tokenizer.no-escape.perl | 348 -------------------------------------- toks | 1 - train_test_split | 50 ++++++ traintestsplit | 51 ------ var | 1 - 74 files changed, 653 insertions(+), 694 deletions(-) create mode 100755 cdec_hg_to_json delete mode 100755 collapse_tags.rb create mode 100755 dot create mode 100755 first_lower delete mode 100755 firstlower create mode 100755 gigaword_collapse_tags delete mode 100755 hg2json.py create mode 100755 key_count delete mode 100755 keycount create mode 100755 mem_usage delete mode 100755 memusg delete mode 100755 parse-stanford.sh create mode 100755 split_pipes delete mode 100755 splitpipes create mode 100755 stanford_parser_run create mode 100644 test/cdec_hg_to_json/cdec.ini create mode 100644 test/cdec_hg_to_json/grammar.gz create mode 100644 test/cdec_hg_to_json/hg.json.gz create mode 100644 test/cdec_hg_to_json/hg.meta create mode 100644 test/cdec_hg_to_json/in create mode 100644 test/cdec_hg_to_json/toy.cdec.ini create mode 100644 test/cdec_hg_to_json/toy.grammar create mode 100644 test/cdec_hg_to_json/toy.in create mode 100644 test/cdec_hg_to_json/toy.weights create mode 100644 test/cdec_hg_to_json/weights delete mode 100644 test/hg2json/cdec.ini delete mode 100644 test/hg2json/grammar.gz delete mode 100644 test/hg2json/hg.json.gz delete mode 100644 test/hg2json/hg.meta delete mode 100644 test/hg2json/in delete mode 100644 test/hg2json/toy.cdec.ini delete mode 100644 test/hg2json/toy.grammar delete mode 100644 test/hg2json/toy.in delete mode 100644 test/hg2json/toy.weights delete mode 100644 test/hg2json/weights create mode 100755 tokenizer-no-escape.perl delete mode 100755 tokenizer.no-escape.perl create mode 100755 train_test_split delete mode 100755 traintestsplit diff --git
a/README.md b/README.md index 3a6b1b7..fd42922 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,11 @@ -scripts -======= +A number of NLP-related scripts. Some scripts require my zipf gem, see [1]. -A number of NLP related scripts. -Some scripts require my zipf gem, -see https://github.com/pks/zipf +\*.perl taken from the moses [2] toolkit -compound-splitter.perl and tokenizer.no-escape.perl -taken from the moses [1] toolkit. +mem\_usage taken from [3] -[1] https://github.com/moses-smt/mosesdecoder +[1] https://github.com/pks/zipf +[2] https://github.com/moses-smt/mosesdecoder +[3] https://gist.github.com/netj/526585 diff --git a/add_seg b/add_seg index e4fe22d..7a4ca7a 100755 --- a/add_seg +++ b/add_seg @@ -24,8 +24,8 @@ while line = STDIN.gets s = "<seg" if index.size > 0 - puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>" if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{index[j]}#{ext}\"" end + puts s + " id=\"#{index[j]}\"> #{line.strip} </seg>" else if cfg[:grammar] then s += " grammar=\"#{cfg[:grammar]}/grammar.#{i}#{ext}\"" end puts s + " id=\"#{i}\"> #{line.strip} </seg>" diff --git a/avg b/avg index ed31465..07e3de9 100755 --- a/avg +++ b/avg @@ -2,7 +2,6 @@ require 'trollop' - cfg = Trollop::options do banner "avg < " opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 diff --git a/avg_weights b/avg_weights index 1f9053f..2e23440 100755 --- a/avg_weights +++ b/avg_weights @@ -4,7 +4,6 @@ require 'zipf' require 'trollop' require 'zlib' - cfg = Trollop::options do opt :weights_files, "a number of weights files: name value", :required => true opt :filter, "Filter if key does not appear in every file.", :type => :bool, :default => false diff --git a/cdec_hg_to_json b/cdec_hg_to_json new file mode 100755 index 0000000..5a26cf7 --- /dev/null +++ b/cdec_hg_to_json @@ -0,0 +1,80 @@ +#!/usr/bin/env python2 + +import cdec +import sys, argparse + +def hg2json(hg, weights): + """ + output a JSON representation of a cdec hypergraph + (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format ) + """ + res = '' + res += "{\n" + res += '"weights":{'+"\n" + a = [] + for i in weights: + a.append( '"%s":%s'%(i[0], i[1]) ) + res += ", ".join(a)+"\n" + res += "},\n" + res += '"nodes":'+"\n" + res += "[\n" + a = [] + a.append( '{ "label":"root", "cat":"root" }' ) + for i in hg.nodes: + a.append( '{ "label":"%s", "cat":"%s" }'%(i.id, i.cat) ) + res += ",\n".join(a)+"\n" + res += "],\n" + res += '"edges":'+"\n" + res += "[\n" + a = [] + for i in hg.edges: + s = "{" + s += '"head":"%s"'%(i.head_node.id) + xs = ' "f":{' + b = [] + for j in i.feature_values: + b.append( '"%s":%s'%(j[0], j[1]) ) + xs += ", ".join(b) + xs += "}," + c = [] + for j in i.tail_nodes: + c.append( '"'+str(j.id)+'"' ) + if len(c) > 0: + s += ', "tails":[ %s ],'%(",".join(c)) + else: + s += ', "tails":[ "root" ],' + s += xs + s += ' "weight":%s }'%(i.prob) + a.append(s) + res += ",\n".join(a)+"\n" + res += "]\n" + res += "}\n" + return res + +def main(): + parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') + parser.add_argument('-c', '--config', required=True, help='decoder configuration') + parser.add_argument('-w', '--weights', required=True, help='feature weights') + args = parser.parse_args() + with open(args.config) as config: + config = config.read() + decoder = cdec.Decoder(config) + decoder.read_weights(args.weights) + ins = sys.stdin.readline().strip() + hg = decoder.translate(ins) + + sys.stderr.write( "input:\n '%s'\n"%(ins) ) + sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) + num_nodes = 0 + for i in hg.nodes: num_nodes+=1 + sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) + num_edges = 0 + for i in hg.edges: num_edges+=1 + sys.stderr.write( "# edges = %s\n"%(num_edges) ) + sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) + + print hg2json(hg, decoder.weights) + +if __name__=="__main__": + main() +
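[Editor's note: cdec_hg_to_json takes a decoder config (-c) and a weights file (-w), reads one source sentence from stdin, writes diagnostics to stderr, and prints the JSON hypergraph to stdout. A minimal invocation sketch against the toy fixtures added later in this patch; the output filename is illustrative:

    ./cdec_hg_to_json -c test/cdec_hg_to_json/toy.cdec.ini -w test/cdec_hg_to_json/toy.weights \
        < test/cdec_hg_to_json/toy.in > toy.hg.json
]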
diff --git a/collapse_tags.rb b/collapse_tags.rb deleted file mode 100755 index 75fcaf5..0000000 --- a/collapse_tags.rb +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env ruby - -# works with gigaword en v5 - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - - -in_p = false -in_dateline = false -collect = [] - -while line = STDIN.gets - line.strip! - if line.downcase == "<dateline>" - in_dateline = true - next - elsif line.downcase == "</dateline>" - in_dateline = false - next - elsif in_dateline - next - elsif line.downcase == "<p>" and not in_p - in_p = true - collect = [] - next - elsif line.downcase == "</p>" and in_p - if collect.size > 0 - puts collect.join(" ").strip - end - in_p = false - next - elsif in_p - collect.push line - next - else - puts line - end -end - diff --git a/dot b/dot new file mode 100755 index 0000000..da0dc58 --- /dev/null +++ b/dot @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +require 'zipf' + +a = SparseVector.from_file 'w', ' ' +b = SparseVector.from_file 'f', ' ' +puts a.to_s +puts a.dot b + diff --git a/first_lower b/first_lower new file mode 100755 index 0000000..1cddb8e --- /dev/null +++ b/first_lower @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby + +require 'zipf' + +while line = STDIN.gets + line.strip! + if line && line!='' && line[0].downcase? + puts line + end +end + diff --git a/firstlower b/firstlower deleted file mode 100755 index 682a9b7..0000000 --- a/firstlower +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - - -while line = STDIN.gets - line.strip! - if line && line!='' && line[0].downcase? - puts line - end -end - diff --git a/gigaword_collapse_tags b/gigaword_collapse_tags new file mode 100755 index 0000000..cbaf7d7 --- /dev/null +++ b/gigaword_collapse_tags @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby + +# works with gigaword en v5 + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +in_p = false +in_dateline = false +collect = [] + +while line = STDIN.gets + line.strip! + if line.downcase == "<dateline>" + in_dateline = true + next + elsif line.downcase == "</dateline>" + in_dateline = false + next + elsif in_dateline + next + elsif line.downcase == "<p>" and not in_p + in_p = true + collect = [] + next + elsif line.downcase == "</p>" and in_p + if collect.size > 0 + puts collect.join(" ").strip + end + in_p = false + next + elsif in_p + collect.push line + next + else + puts line + end +end +
diff --git a/hg2json.py b/hg2json.py deleted file mode 100755 index 5bd5c2c..0000000 --- a/hg2json.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python2 - -import cdec -import sys, argparse - -def hg2json(hg, weights): - """ - output a JSON representation of a cdec hypergraph - (see http://aclweb.org/aclwiki/index.php?title=Hypergraph_Format ) - """ - res = '' - res += "{\n" - res += '"weights":{'+"\n" - a = [] - for i in weights: - a.append( '"%s":%s'%(i[0], i[1]) ) - res += ", ".join(a)+"\n" - res += "},\n" - res += '"nodes":'+"\n" - res += "[\n" - a = [] - a.append( '{ "label":"root", "cat":"root" }' ) - for i in hg.nodes: - a.append( '{ "label":"%s", "cat":"%s" }'%(i.id, i.cat) ) - res += ",\n".join(a)+"\n" - res += "],\n" - res += '"edges":'+"\n" - res += "[\n" - a = [] - for i in hg.edges: - s = "{" - s += '"head":"%s"'%(i.head_node.id) - xs = ' "f":{' - b = [] - for j in i.feature_values: - b.append( '"%s":%s'%(j[0], j[1]) ) - xs += ", ".join(b) - xs += "}," - c = [] - for j in i.tail_nodes: - c.append( '"'+str(j.id)+'"' ) - if len(c) > 0: - s += ', "tails":[ %s ],'%(",".join(c)) - else: - s += ', "tails":[ "root" ],' - s += xs - s += ' "weight":%s }'%(i.prob) - a.append(s) - res += ",\n".join(a)+"\n" - res += "]\n" - res += "}\n" - return res - -def main(): - parser = argparse.ArgumentParser(description='get a proper json representation of cdec hypergraphs') - parser.add_argument('-c', '--config', required=True, help='decoder configuration') - parser.add_argument('-w', '--weights', required=True, help='feature weights') - args = parser.parse_args() - with open(args.config) as config: - config = config.read() - decoder = cdec.Decoder(config) - decoder.read_weights(args.weights) - ins = sys.stdin.readline().strip() - hg = decoder.translate(ins) - - sys.stderr.write( "input:\n '%s'\n"%(ins) ) - sys.stderr.write( "viterbi translation:\n '%s'\n"%(hg.viterbi()) ) - num_nodes = 0 - for i in hg.nodes: num_nodes+=1 - sys.stderr.write( "# nodes = %s\n"%(num_nodes) ) - num_edges = 0 - for i in hg.edges: num_edges+=1 - sys.stderr.write( "# edges = %s\n"%(num_edges) ) - sys.stderr.write( "viterbi score = %s\n"%(round(hg.viterbi_features().dot(decoder.weights), 2)) ) - - print hg2json(hg, decoder.weights) - - -if __name__=="__main__": - main() - diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles index 2ac344b..7db1c7e 100755 --- a/kbest_bleu_oracles +++ b/kbest_bleu_oracles @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def get_context kbest_lists, references, n a = [] kbest_lists.each_index { |i| @@ -48,6 +47,5 @@ def main } end - main diff --git a/key_count b/key_count new file mode 100755 index 0000000..deaa522 --- /dev/null +++ b/key_count @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +h = {} +h.default = 0 +while line = STDIN.gets + line.strip! + h[line] += 1 +end + +h.each_pair { |k,v| puts "#{k} #{v}" } +
diff --git a/keycount b/keycount deleted file mode 100755 index deaa522..0000000 --- a/keycount +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env ruby - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - -h = {} -h.default = 0 -while line = STDIN.gets - line.strip!
- h[line] += 1 -end - -h.each_pair { |k,v| puts "#{k} #{v}" } - diff --git a/kmeans b/kmeans index ec28897..201864b 100755 --- a/kmeans +++ b/kmeans @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def read_data fn data = {} ReadFile.new(fn).readlines_strip.map{ |i| @@ -114,6 +113,5 @@ def main end end - main diff --git a/lin_reg b/lin_reg index 168e7df..4a7c3b2 100755 --- a/lin_reg +++ b/lin_reg @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def read_data fn, scale f = ReadFile.new fn data = [] @@ -67,6 +66,5 @@ def main puts model.to_s end - main diff --git a/log_reg b/log_reg index e6f47eb..3916d0c 100755 --- a/log_reg +++ b/log_reg @@ -4,7 +4,6 @@ require 'zipf' require 'matrix' require 'trollop' - def read_data fn f = ReadFile.new fn data = [] @@ -68,6 +67,5 @@ def main puts model.to_s end - main diff --git a/max b/max index 87f3c73..b2c1cae 100755 --- a/max +++ b/max @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - max = -1.0/0 while line = STDIN.gets v = line.to_f diff --git a/median b/median index 9499c95..0b1950b 100755 --- a/median +++ b/median @@ -2,7 +2,6 @@ require 'zipf' - a = [] while line = STDIN.gets a << line.to_f diff --git a/mem_usage b/mem_usage new file mode 100755 index 0000000..5c2104f --- /dev/null +++ b/mem_usage @@ -0,0 +1,11 @@ +#!/bin/bash + +"$@" & +pid=$! peak=0 +while true; do + sleep 1 + sample="$(ps -o rss= $pid 2> /dev/null)" || break + let peak='sample > peak ? sample : peak' +done +echo "$(( ${peak%% *} / 1024)) m" + diff --git a/memusg b/memusg deleted file mode 100755 index a69daaa..0000000 --- a/memusg +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - - -"$@" & -pid=$! peak=0 -while true; do - sleep 1 - sample="$(ps -o rss= $pid 2> /dev/null)" || break - let peak='sample > peak ? sample : peak' -done -echo "$(( ${peak%% *} / 1024)) m" - diff --git a/merge_files b/merge_files index 0b4941e..714b57d 100755 --- a/merge_files +++ b/merge_files @@ -2,7 +2,6 @@ require 'zipf' - def usage STDERR.write "merge_files +\n" exit 1 diff --git a/merge_ttable b/merge_ttable index 20d86d3..e4621f5 100755 --- a/merge_ttable +++ b/merge_ttable @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :f, "f files", :type => :string, :required => true @@ -31,6 +30,5 @@ def main } end - main diff --git a/min b/min index 398b0fb..f8a7e42 100755 --- a/min +++ b/min @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - min = 1.0/0 while line = STDIN.gets v = line.to_f diff --git a/min_max b/min_max index 17dc566..b79a743 100755 --- a/min_max +++ b/min_max @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - cfg = Trollop::options do opt :min, "minimum #tokens", :type => :int, :default => 1 opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n' diff --git a/moses_1best b/moses_1best index 849ebf1..fd35cf8 100755 --- a/moses_1best +++ b/moses_1best @@ -2,7 +2,6 @@ require 'zipf' - prev_idx = nil while line = STDIN.gets line.strip! 
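[Editor's note: mem_usage (the renamed memusg) launches its argument vector in the background, samples ps -o rss= once per second until the process exits, and echoes the peak resident set size in megabytes. A usage sketch; corpus.txt is hypothetical, and commands that finish in under a second may exit before the first sample:

    ./mem_usage gzip -9 corpus.txt
    # prints the peak RSS on exit, e.g. "37 m"
]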
diff --git a/mult b/mult index 2ef0149..478ec5e 100755 --- a/mult +++ b/mult @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - factor = ARGV[0].to_f while line = STDIN.gets puts line.to_f * factor diff --git a/no_empty b/no_empty index 96c9ce4..da57e23 100755 --- a/no_empty +++ b/no_empty @@ -2,7 +2,6 @@ require 'zipf' - files = [] (0..1).each { |i| files << ReadFile.new(ARGV[i]) } (2..3).each { |i| files << WriteFile.new(ARGV[i]) } diff --git a/num_tok b/num_tok index 53b99a0..56cbae9 100755 --- a/num_tok +++ b/num_tok @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - while line = STDIN.gets puts line.strip.split.length end diff --git a/odd b/odd index 93aaa80..0bd9336 100755 --- a/odd +++ b/odd @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - i = 1 while line = STDIN.gets puts line if i%2!=0 diff --git a/parse-stanford.sh b/parse-stanford.sh deleted file mode 100755 index f8d4210..0000000 --- a/parse-stanford.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if [ $# != 1 ]; then - echo "$0 text-file" - exit 1 -fi - -export CLASSPATH=:/toolbox/stanfordparser_3_2_0/* - -IN=$1 - -cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp - diff --git a/paste_pairs b/paste_pairs index 07c1f22..f6b8b31 100755 --- a/paste_pairs +++ b/paste_pairs @@ -3,7 +3,6 @@ import sys from itertools import izip - for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))): print linenr, (src_line.strip()) print linenr, (tgt_line.strip()) diff --git a/per_sentence_bleu b/per_sentence_bleu index 76fcf38..5bacd1a 100755 --- a/per_sentence_bleu +++ b/per_sentence_bleu @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' @@ -26,6 +25,5 @@ def main input.close end - main diff --git a/per_sentence_bleu_kbest b/per_sentence_bleu_kbest index 4d821b3..e6a31cb 100755 --- a/per_sentence_bleu_kbest +++ b/per_sentence_bleu_kbest @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :kbests, "kbests", :type => :string, :default => '-' @@ -29,6 +28,5 @@ def main } end - main diff --git a/per_sentence_ter b/per_sentence_ter index 8b04be5..343708e 100755 --- a/per_sentence_ter +++ b/per_sentence_ter @@ -4,7 +4,6 @@ require 'zipf' require 'trollop' require 'tempfile' - def main cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' @@ -30,6 +29,5 @@ def main input.close end - main diff --git a/pot b/pot index ec199ea..24acabe 100755 --- a/pot +++ b/pot @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - pow = ARGV[0].to_f while line = STDIN.gets puts line.to_f**pow diff --git a/round b/round index 3dfbb6f..dfef800 100755 --- a/round +++ b/round @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - r = ARGV[0].to_i while line = STDIN.gets puts line.to_f.round r diff --git a/ruby_eval b/ruby_eval index 96b2ecb..fe0d181 100755 --- a/ruby_eval +++ b/ruby_eval @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - while line = STDIN.gets puts "#{eval line}" end diff --git a/rule_shapes b/rule_shapes index fd42249..589a670 100755 --- a/rule_shapes +++ b/rule_shapes @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - def shape s res = [] in_t 
= false diff --git a/shard b/shard index f952104..6155123 100755 --- a/shard +++ b/shard @@ -2,7 +2,6 @@ require 'trollop' - def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false) lc = `wc -l #{input}`.split.first.to_i input_ext = input.split('.').last diff --git a/split_pipes b/split_pipes new file mode 100755 index 0000000..eeba69b --- /dev/null +++ b/split_pipes @@ -0,0 +1,23 @@ +#!/usr/bin/env ruby + +require 'trollop' + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +cfg = Trollop::options do + banner "split_pipes -f < " + opt :field, "field", :type => :int +end + +while line = STDIN.gets + j = 1 + line.strip.split(' ||| ').each { |i| + if j == cfg[:field] + puts i.strip + break + end + j += 1 + } +end +
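[Editor's note: split_pipes (the renamed splitpipes) prints the n-th ' ||| '-separated field of each stdin line, handy for moses/cdec k-best lists. A sketch:

    echo 'foo ||| bar baz ||| 0.5' | ./split_pipes -f 2
    # prints: bar baz
]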
diff --git a/splitpipes b/splitpipes deleted file mode 100755 index 35ee176..0000000 --- a/splitpipes +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env ruby - -require 'trollop' - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' - - -cfg = Trollop::options do - banner "splitpipes -f < " - opt :field, "field", :type => :int -end - -while line = STDIN.gets - j = 1 - line.strip.split(' ||| ').each { |i| - if j == cfg[:field] - puts i.strip - break - end - j += 1 - } -end - diff --git a/stanford_parser_run b/stanford_parser_run new file mode 100755 index 0000000..f8d4210 --- /dev/null +++ b/stanford_parser_run @@ -0,0 +1,13 @@ +#!/bin/bash + +if [ $# != 1 ]; then + echo "$0 text-file" + exit 1 +fi + +export CLASSPATH=:/toolbox/stanfordparser_3_2_0/* + +IN=$1 + +cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp + diff --git a/stddev b/stddev index 5cda0e0..a7397b2 100755 --- a/stddev +++ b/stddev @@ -2,7 +2,6 @@ require 'trollop' - cfg = Trollop::options do banner "stddev [-r ] < " opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 diff --git a/sum b/sum index dac72d3..acfa563 100755 --- a/sum +++ b/sum @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - sum = 0.0 while line = STDIN.gets sum += line.to_f diff --git a/tc b/tc index 993086a..7eefdd5 100755 --- a/tc +++ b/tc @@ -2,7 +2,6 @@ require 'zipf' - while line = STDIN.gets puts tokenize(line.strip).size end diff --git a/test/cdec_hg_to_json/cdec.ini b/test/cdec_hg_to_json/cdec.ini new file mode 100644 index 0000000..1ad25b5 --- /dev/null +++ b/test/cdec_hg_to_json/cdec.ini @@ -0,0 +1,5 @@ +formalism=scfg +grammar=test/cdec_hg_to_json/grammar.gz +add_pass_through_rules=true +feature_function=WordPenalty +intersection_strategy=full diff --git a/test/cdec_hg_to_json/grammar.gz b/test/cdec_hg_to_json/grammar.gz new file mode 100644 index 0000000..78dda98 Binary files /dev/null and b/test/cdec_hg_to_json/grammar.gz differ diff --git a/test/cdec_hg_to_json/hg.json.gz b/test/cdec_hg_to_json/hg.json.gz new file mode 100644 index 0000000..ed178c6 Binary files /dev/null and b/test/cdec_hg_to_json/hg.json.gz differ diff --git a/test/cdec_hg_to_json/hg.meta b/test/cdec_hg_to_json/hg.meta new file mode 100644 index 0000000..d33a54c --- /dev/null +++ b/test/cdec_hg_to_json/hg.meta @@ -0,0 +1,7 @@ +input: + 'in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .' +viterbi translation: + 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .' +# nodes = 220 +# edges = 16640 +viterbi score = 228.95 diff --git a/test/cdec_hg_to_json/in b/test/cdec_hg_to_json/in new file mode 100644 index 0000000..7dc411d --- /dev/null +++ b/test/cdec_hg_to_json/in @@ -0,0 +1 @@ +in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen . diff --git a/test/cdec_hg_to_json/toy.cdec.ini b/test/cdec_hg_to_json/toy.cdec.ini new file mode 100644 index 0000000..d4a2896 --- /dev/null +++ b/test/cdec_hg_to_json/toy.cdec.ini @@ -0,0 +1,2 @@ +formalism=scfg +grammar=test/cdec_hg_to_json/toy.grammar diff --git a/test/cdec_hg_to_json/toy.grammar b/test/cdec_hg_to_json/toy.grammar new file mode 100644 index 0000000..382c94f --- /dev/null +++ b/test/cdec_hg_to_json/toy.grammar @@ -0,0 +1,12 @@ +[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 +[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0 +[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0 +[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 +[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 +[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0 +[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0 +[JJ] ||| grosses ||| big ||| logp=0 +[JJ] ||| grosses ||| large ||| logp=0 +[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 +[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0 +[V] ||| fand ||| found ||| logp=0 diff --git a/test/cdec_hg_to_json/toy.in b/test/cdec_hg_to_json/toy.in new file mode 100644 index 0000000..e6df927 --- /dev/null +++ b/test/cdec_hg_to_json/toy.in @@ -0,0 +1 @@ +ich sah ein kleines haus diff --git a/test/cdec_hg_to_json/toy.weights b/test/cdec_hg_to_json/toy.weights new file mode 100644 index 0000000..70075b7 --- /dev/null +++ b/test/cdec_hg_to_json/toy.weights @@ -0,0 +1,3 @@ +logp 2 +use_house 0 +use_shell 1 diff --git a/test/cdec_hg_to_json/weights b/test/cdec_hg_to_json/weights new file mode 100644 index 0000000..7f96f1d --- /dev/null +++ b/test/cdec_hg_to_json/weights @@ -0,0 +1,17 @@ +PhraseModel_0 1.0 +PhraseModel_1 1.0 +PhraseModel_2 1.0 +PhraseModel_3 1.0 +PhraseModel_4 1.0 +PhraseModel_5 1.0 +PhraseModel_6 1.0 +PassThrough -1.0 +PassThrough_1 -1.0 +PassThrough_2 -1.0 +PassThrough_3 -1.0 +PassThrough_4 -1.0 +PassThrough_5 -1.0 +PassThrough_6 -1.0 +Glue 0.1 +LanguageModel 10.0 +LanguageModel_OOV -10 diff --git a/test/hg2json/cdec.ini b/test/hg2json/cdec.ini deleted file mode 100644 index 1ad25b5..0000000 --- a/test/hg2json/cdec.ini +++ /dev/null @@ -1,5 +0,0 @@ -formalism=scfg -grammar=test/hg2json/grammar.gz -add_pass_through_rules=true -feature_function=WordPenalty -intersection_strategy=full diff --git a/test/hg2json/grammar.gz b/test/hg2json/grammar.gz deleted file mode 100644 index 78dda98..0000000 Binary files a/test/hg2json/grammar.gz and /dev/null differ diff --git a/test/hg2json/hg.json.gz b/test/hg2json/hg.json.gz deleted file mode 100644 index ed178c6..0000000 Binary files a/test/hg2json/hg.json.gz and /dev/null differ diff --git a/test/hg2json/hg.meta b/test/hg2json/hg.meta deleted file mode 100644 index d33a54c..0000000 --- a/test/hg2json/hg.meta +++ /dev/null @@ -1,7 +0,0 @@ -input: - 'in dem verzweifelten versuch , an der
macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen .' -viterbi translation: - 'which labor market desperate transformed into attempting gathered by failed to show any the non - is making festzuhalten gathered by pervez musharraf meant to its borders with within than the non - have pakistan 's intelligence relied constitutional for security as a its borders with declared a state of emergency - range missiles .' -# nodes = 220 -# edges = 16640 -viterbi score = 228.95 diff --git a/test/hg2json/in b/test/hg2json/in deleted file mode 100644 index 7dc411d..0000000 --- a/test/hg2json/in +++ /dev/null @@ -1 +0,0 @@ -in dem verzweifelten versuch , an der macht festzuhalten , hat pervez musharraf den rahmen der pakistanischen verfassung verlassen und den notstand ausgerufen . diff --git a/test/hg2json/toy.cdec.ini b/test/hg2json/toy.cdec.ini deleted file mode 100644 index d4a2896..0000000 --- a/test/hg2json/toy.cdec.ini +++ /dev/null @@ -1,2 +0,0 @@ -formalism=scfg -grammar=test/hg2json/toy.grammar diff --git a/test/hg2json/toy.grammar b/test/hg2json/toy.grammar deleted file mode 100644 index 382c94f..0000000 --- a/test/hg2json/toy.grammar +++ /dev/null @@ -1,12 +0,0 @@ -[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 -[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0 -[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0 -[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 -[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 -[JJ] ||| kleines ||| small ||| logp=0 use_small=1.0 -[JJ] ||| kleines ||| little ||| logp=0 use_little=1.0 -[JJ] ||| grosses ||| big ||| logp=0 -[JJ] ||| grosses ||| large ||| logp=0 -[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 -[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0 -[V] ||| fand ||| found ||| logp=0 diff --git a/test/hg2json/toy.in b/test/hg2json/toy.in deleted file mode 100644 index e6df927..0000000 --- a/test/hg2json/toy.in +++ /dev/null @@ -1 +0,0 @@ -ich sah ein kleines haus diff --git a/test/hg2json/toy.weights b/test/hg2json/toy.weights deleted file mode 100644 index 70075b7..0000000 --- a/test/hg2json/toy.weights +++ /dev/null @@ -1,3 +0,0 @@ -logp 2 -use_house 0 -use_shell 1 diff --git a/test/hg2json/weights b/test/hg2json/weights deleted file mode 100644 index 7f96f1d..0000000 --- a/test/hg2json/weights +++ /dev/null @@ -1,17 +0,0 @@ -PhraseModel_0 1.0 -PhraseModel_1 1.0 -PhraseModel_2 1.0 -PhraseModel_3 1.0 -PhraseModel_4 1.0 -PhraseModel_5 1.0 -PhraseModel_6 1.0 -PassThrough -1.0 -PassThrough_1 -1.0 -PassThrough_2 -1.0 -PassThrough_3 -1.0 -PassThrough_4 -1.0 -PassThrough_5 -1.0 -PassThrough_6 -1.0 -Glue 0.1 -LanguageModel 10.0 -LanguageModel_OOV -10 diff --git a/tf-idf b/tf-idf index fc6c2ec..450de6b 100755 --- a/tf-idf +++ b/tf-idf @@ -3,7 +3,6 @@ require 'zipf' require 'trollop' - def main cfg = Trollop::options do opt :documents, "input files (documents)", :type => :string, :required => true @@ -48,6 +47,5 @@ def main docs.each { |i| puts i.to_s } end - main diff --git a/to_ascii b/to_ascii index 6c1d23e..10fd1c2 100755 --- a/to_ascii +++ b/to_ascii @@ -1,6 +1,5 @@ #!/usr/bin/env ruby - while line = STDIN.gets encoding_options = { :invalid => :replace, diff --git a/tokenizer-no-escape.perl b/tokenizer-no-escape.perl new file mode 100755 index 0000000..4397360 --- /dev/null +++ b/tokenizer-no-escape.perl @@ -0,0 +1,348 @@ +#!/usr/bin/perl -w + +# Sample Tokenizer +### Version 1.1 +# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn +# Version 1.1 
updates: +# (1) add multithreading option "-threads NUM_THREADS" (default is 1); +# (2) add a timing option "-time" to calculate the average speed of this tokenizer; +# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); +### Version 1.0 +# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ +# written by Josh Schroeder, based on code by Philipp Koehn + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use FindBin qw($RealBin); +use strict; +use Time::HiRes; +#use Thread; + +my $mydir = "$RealBin/nonbreaking_prefixes"; + +my %NONBREAKING_PREFIX = (); +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $AGGRESSIVE = 0; +my $SKIP_XML = 0; +my $TIMING = 0; +my $NUM_THREADS = 1; +my $NUM_SENTENCES_PER_THREAD = 2000; + +while (@ARGV) +{ + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); + /^-a$/ && ($AGGRESSIVE = 1, next); + /^-time$/ && ($TIMING = 1, next); + /^-threads$/ && ($NUM_THREADS = int(shift), next); + /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); +} + +# for time calculation +my $start_time; +if ($TIMING) +{ + $start_time = [ Time::HiRes::gettimeofday( ) ]; +} + +# print help message +if ($HELP) +{ + print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; + print "Options:\n"; + print " -q ... quiet.\n"; + print " -a ... aggressive hyphen splitting.\n"; + print " -b ... disable Perl buffering.\n"; + print " -time ... enable processing time calculation.\n"; + exit; +} + +if (!$QUIET) +{ + print STDERR "Tokenizer Version 1.1\n"; + print STDERR "Language: $language\n"; + print STDERR "Number of threads: $NUM_THREADS\n"; +} + +# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes +load_prefixes($language,\%NONBREAKING_PREFIX); + +if (scalar(%NONBREAKING_PREFIX) eq 0) +{ + print STDERR "Warning: No known abbreviations for language '$language'\n"; +} + +my @batch_sentences = (); +my @thread_list = (); +my $count_sentences = 0; + +if ($NUM_THREADS > 1) +{# multi-threading tokenization + while(<STDIN>) + { + $count_sentences = $count_sentences + 1; + push(@batch_sentences, $_); + if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + # reset for the new run + @thread_list = (); + @batch_sentences = (); + } + } + # the last batch + if (scalar(@batch_sentences)>0) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + if ($start_index >= scalar(@batch_sentences)) + { + last; + } + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + if ($end_index >= scalar(@batch_sentences)) + { + $end_index = scalar(@batch_sentences)-1; + } + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + } +} +else +{# single thread only + while(<STDIN>) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + print $_; + } + else + { + print &tokenize($_); + } + } +} + +if ($TIMING) +{ + my $duration = Time::HiRes::tv_interval( $start_time ); + print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); + print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n"); +} + +##################################################################################### +# subroutines afterward + +# tokenize a batch of texts saved in an array +# input: an array containing a batch of texts +# return: another array containing a batch of tokenized texts for the input array +sub tokenize_batch +{ + my(@text_list) = @_; + my(@tokenized_list) = (); + foreach (@text_list) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + push(@tokenized_list, $_); + } + else + { + push(@tokenized_list, &tokenize($_)); + } + } + return \@tokenized_list; +} + +# the actual tokenize function which tokenizes one input string +# input: one string +# return: the tokenized string for the input string +sub tokenize +{ + my($text) = @_; + chomp($text); + $text = " $text "; + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # separate out all "other" special characters + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + + # aggressive hyphen splitting + if ($AGGRESSIVE) + { + $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; + } + + #multi-dots stay together + $text =~ s/\.([\.]+)/ DOTMULTI$1/g; + while($text =~ /DOTMULTI\./) + { + $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; + $text =~ s/DOTMULTI\./DOTDOTMULTI/g; + } + + # separate out "," except if within numbers (5,300) + $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + # separate , pre and post number + $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + # turn ` into ' + $text =~ s/\`/\'/g; + + #turn '' into " + $text =~ s/\'\'/ \" /g; + + if ($language eq "en") + { + #split contractions right + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; + #special case for "1990's" + $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; + } + elsif (($language eq "fr") or ($language eq "it")) + { + #split contractions left + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; + } + else + { + $text =~ s/\'/ \' /g; + } + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) + { + #no change + } + elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) + { + #no change + } + else + { + $word = $pre." ."; + } + } + $text .= $word." "; + } + + # clean up extraneous spaces + $text =~ s/ +/ /g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + #restore multi-dots + while($text =~ /DOTDOTMULTI/) + { + $text =~ s/DOTDOTMULTI/DOTMULTI./g; + } + $text =~ s/DOTMULTI/./g; + + #escape special chars + #$text =~ s/\&/\&amp;/g; # escape escape + #$text =~ s/\|/\&#124;/g; # factor separator + #$text =~ s/\</\&lt;/g; # xml + #$text =~ s/\>/\&gt;/g; # xml + #$text =~ s/\'/\&apos;/g; # xml + #$text =~ s/\"/\&quot;/g; # xml + #$text =~ s/\[/\&#91;/g; # syntax non-terminal + #$text =~ s/\]/\&#93;/g; # syntax non-terminal + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub load_prefixes +{ + my ($language, $PREFIX_REF) = @_; + + my $prefixfile = "$mydir/nonbreaking_prefix.$language"; + + #default back to English if we don't have a language-specific prefix file + if (!(-e $prefixfile)) + { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } + + if (-e "$prefixfile") + { + open(PREFIX, "<:utf8", "$prefixfile"); + while (<PREFIX>) + { + my $item = $_; + chomp($item); + if (($item) && (substr($item,0,1) ne "#")) + { + if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) + { + $PREFIX_REF->{$1} = 2; + } + else + { + $PREFIX_REF->{$item} = 1; + } + } + } + close(PREFIX); + } +} + diff --git a/tokenizer.no-escape.perl b/tokenizer.no-escape.perl deleted file mode 100755 index 4397360..0000000 --- a/tokenizer.no-escape.perl +++ /dev/null @@ -1,348 +0,0 @@ -#!/usr/bin/perl -w - -# Sample Tokenizer -### Version 1.1 -# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn -# Version 1.1 updates: -# (1) add multithreading option "-threads NUM_THREADS" (default is 1); -# (2) add a timing option "-time" to calculate the average speed of this tokenizer; -# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); -### Version 1.0 -# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ -# written by Josh Schroeder, based on code by Philipp Koehn - -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); - -use FindBin qw($RealBin); -use strict; -use Time::HiRes; -#use Thread; - -my $mydir = "$RealBin/nonbreaking_prefixes"; - -my %NONBREAKING_PREFIX = (); -my $language = "en"; -my $QUIET = 0; -my $HELP = 0; -my $AGGRESSIVE = 0; -my $SKIP_XML = 0; -my $TIMING = 0; -my $NUM_THREADS = 1; -my $NUM_SENTENCES_PER_THREAD = 2000; - -while (@ARGV) -{ - $_ = shift; - /^-b$/ && ($| = 1, next); - /^-l$/ && ($language = shift, next); - /^-q$/ && ($QUIET = 1, next); - /^-h$/ && ($HELP = 1, next); - /^-x$/ && ($SKIP_XML = 1, next); - /^-a$/ && ($AGGRESSIVE = 1, next); - /^-time$/ && ($TIMING = 1, next); - /^-threads$/ && ($NUM_THREADS = int(shift), next); - /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); -} - -# for time calculation -my $start_time; -if ($TIMING) -{ - $start_time = [ Time::HiRes::gettimeofday( ) ]; -} - -# print help message -if ($HELP) -{ - print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; - print "Options:\n"; - print " -q ... quiet.\n"; - print " -a ... aggressive hyphen splitting.\n"; - print " -b ... disable Perl buffering.\n"; - print " -time ... enable processing time calculation.\n"; - exit; -} - -if (!$QUIET) -{ - print STDERR "Tokenizer Version 1.1\n"; - print STDERR "Language: $language\n"; - print STDERR "Number of threads: $NUM_THREADS\n"; -} - -# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes -load_prefixes($language,\%NONBREAKING_PREFIX); - -if (scalar(%NONBREAKING_PREFIX) eq 0) -{ - print STDERR "Warning: No known abbreviations for language '$language'\n"; -} - -my @batch_sentences = (); -my @thread_list = (); -my $count_sentences = 0; - -if ($NUM_THREADS > 1) -{# multi-threading tokenization - while(<STDIN>) - { - $count_sentences = $count_sentences + 1; - push(@batch_sentences, $_); - if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) - { - # assign each thread work - for (my $i=0; $i<$NUM_THREADS; $i++) - { - my $start_index = $i*$NUM_SENTENCES_PER_THREAD; - my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; - my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; - my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; - push(@thread_list, $new_thread); - } - foreach (@thread_list) - { - my $tokenized_list = $_->join; - foreach (@$tokenized_list) - { - print $_; - } - } - # reset for the new run - @thread_list = (); - @batch_sentences = (); - } - } - # the last batch - if (scalar(@batch_sentences)>0) - { - # assign each thread work - for (my $i=0; $i<$NUM_THREADS; $i++) - { - my $start_index = $i*$NUM_SENTENCES_PER_THREAD; - if ($start_index >= scalar(@batch_sentences)) - { - last; - } - my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; - if ($end_index >= scalar(@batch_sentences)) - { - $end_index = scalar(@batch_sentences)-1; - } - my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; - my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; - push(@thread_list, $new_thread); - } - foreach (@thread_list) - { - my $tokenized_list = $_->join; - foreach (@$tokenized_list) - { - print $_; - } - } - } -} -else -{# single thread only - while(<STDIN>) - { - if (($SKIP_XML && /^<.+>$/) || /^\s*$/) - { - #don't try to tokenize XML/HTML tag lines - print $_; - } - else - { - print &tokenize($_); - } - } -} - -if ($TIMING) -{ - my $duration = Time::HiRes::tv_interval( $start_time ); - print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); - print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n"); -} - -##################################################################################### -# subroutines afterward - -# tokenize a batch of texts saved in an array -# input: an array containing a batch of texts -# return: another array containing a batch of tokenized texts for the input array -sub tokenize_batch -{ - my(@text_list) = @_; - my(@tokenized_list) = (); - foreach (@text_list) - { - if (($SKIP_XML && /^<.+>$/) || /^\s*$/) - { - #don't try to tokenize XML/HTML tag lines - push(@tokenized_list, $_); - } - else - { - push(@tokenized_list, &tokenize($_)); - } - } - return \@tokenized_list; -} - -# the actual tokenize function which tokenizes one input string -# input: one string -# return: the tokenized string for the input string -sub tokenize -{ - my($text) = @_; - chomp($text); - $text = " $text "; - - # remove ASCII junk - $text =~ s/\s+/ /g; - $text =~ s/[\000-\037]//g; - - # separate out all "other" special characters - $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; - - # aggressive hyphen splitting - if ($AGGRESSIVE) - { - $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; - } - - #multi-dots stay together - $text =~ s/\.([\.]+)/ DOTMULTI$1/g; - while($text =~ /DOTMULTI\./) - { - $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; - $text =~ s/DOTMULTI\./DOTDOTMULTI/g; - } - - # separate out "," except if within numbers (5,300) - $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; - # separate , pre and post number - $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; - $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; - - # turn ` into ' - $text =~ s/\`/\'/g; - - #turn '' into " - $text =~ s/\'\'/ \" /g; - - if ($language eq "en") - { - #split contractions right - $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; - #special case for "1990's" - $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; - } - elsif (($language eq "fr") or ($language eq "it")) - { - #split contractions left - $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; - $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; - } - else - { - $text =~ s/\'/ \' /g; - } - - #word token method - my @words = split(/\s/,$text); - $text = ""; - for (my $i=0;$i<(scalar(@words));$i++) - { - my $word = $words[$i]; - if ( $word =~ /^(\S+)\.$/) - { - my $pre = $1; - if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) - { - #no change - } - elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) - { - #no change - } - else - { - $word = $pre." ."; - } - } - $text .= $word." "; - } - - # clean up extraneous spaces - $text =~ s/ +/ /g; - $text =~ s/^ //g; - $text =~ s/ $//g; - - #restore multi-dots - while($text =~ /DOTDOTMULTI/) - { - $text =~ s/DOTDOTMULTI/DOTMULTI./g; - } - $text =~ s/DOTMULTI/./g; - - #escape special chars - #$text =~ s/\&/\&amp;/g; # escape escape - #$text =~ s/\|/\&#124;/g; # factor separator - #$text =~ s/\</\&lt;/g; # xml - #$text =~ s/\>/\&gt;/g; # xml - #$text =~ s/\'/\&apos;/g; # xml - #$text =~ s/\"/\&quot;/g; # xml - #$text =~ s/\[/\&#91;/g; # syntax non-terminal - #$text =~ s/\]/\&#93;/g; # syntax non-terminal - - #ensure final line break - $text .= "\n" unless $text =~ /\n$/; - - return $text; -} - -sub load_prefixes -{ - my ($language, $PREFIX_REF) = @_; - - my $prefixfile = "$mydir/nonbreaking_prefix.$language"; - - #default back to English if we don't have a language-specific prefix file - if (!(-e $prefixfile)) - { - $prefixfile = "$mydir/nonbreaking_prefix.en"; - print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; - die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); - } - - if (-e "$prefixfile") - { - open(PREFIX, "<:utf8", "$prefixfile"); - while (<PREFIX>) - { - my $item = $_; -
chomp($item); - if (($item) && (substr($item,0,1) ne "#")) - { - if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) - { - $PREFIX_REF->{$1} = 2; - } - else - { - $PREFIX_REF->{$item} = 1; - } - } - } - close(PREFIX); - } -} - diff --git a/toks b/toks index ed40dbb..8bee29f 100755 --- a/toks +++ b/toks @@ -3,7 +3,6 @@ STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' - while line = STDIN.gets line.strip.split(/\s/).each { |i| puts i } end diff --git a/train_test_split b/train_test_split new file mode 100755 index 0000000..db56de9 --- /dev/null +++ b/train_test_split @@ -0,0 +1,50 @@ +#!/usr/bin/env ruby + +require 'zipf' +require 'trollop' + +cfg = Trollop::options do + opt :foreign, "foreign file", :type => :string, :required => true + opt :english, "english file", :type => :string, :required => true + opt :size, "one size", :type => :int, :required => true + opt :repeat, "number of repetitions", :type => :int, :default => 1 + opt :prefix, "prefix for output files", :type => :string +end +fn = cfg[:foreign] +fn_ext = fn.split('.').last +f = ReadFile.readlines fn +en = cfg[:english] +en_ext = en.split('.').last +e = ReadFile.readlines en +size = cfg[:size] +nlines_f = `wc -l #{fn}`.split()[0].to_i +nlines_e = `wc -l #{en}`.split()[0].to_i +if nlines_f != nlines_e + STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n" + exit 1 +end + +prefix = cfg[:prefix] +a = (0..nlines_e-1).to_a +i = 0 +cfg[:repeat].times { + b = a.sample(size) + ax = a.reject{|j| b.include? j} + `mkdir split_#{i}` + new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}" + new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}" + ax.each { |j| + new_f.write f[j] + new_e.write e[j] + } + new_f.close; new_e.close + new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}" + new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}" + b.each { |j| + new_f.write f[j] + new_e.write e[j] + } + new_f.close; new_e.close + i += 1 +} + diff --git a/traintestsplit b/traintestsplit deleted file mode 100755 index ec88df1..0000000 --- a/traintestsplit +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' -require 'trollop' - - -cfg = Trollop::options do - opt :foreign, "foreign file", :type => :string, :required => true - opt :english, "english file", :type => :string, :required => true - opt :size, "one size", :type => :int, :required => true - opt :repeat, "number of repetitions", :type => :int, :default => 1 - opt :prefix, "prefix for output files", :type => :string -end -fn = cfg[:foreign] -fn_ext = fn.split('.').last -f = ReadFile.readlines fn -en = cfg[:english] -en_ext = en.split('.').last -e = ReadFile.readlines en -size = cfg[:size] -nlines_f = `wc -l #{fn}`.split()[0].to_i -nlines_e = `wc -l #{en}`.split()[0].to_i -if nlines_f != nlines_e - STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n" - exit 1 -end - -prefix = cfg[:prefix] -a = (0..nlines_e-1).to_a -i = 0 -cfg[:repeat].times { - b = a.sample(size) - ax = a.reject{|j| b.include? 
j} - `mkdir split_#{i}` - new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}" - new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}" - ax.each { |j| - new_f.write f[j] - new_e.write e[j] - } - new_f.close; new_e.close - new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}" - new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}" - b.each { |j| - new_f.write f[j] - new_e.write e[j] - } - new_f.close; new_e.close - i += 1 -} - diff --git a/var b/var index fe4aa22..faccefa 100755 --- a/var +++ b/var @@ -2,7 +2,6 @@ require 'trollop' - cfg = Trollop::options do banner "var [-r ] < " opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 -- cgit v1.2.3
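[Editor's note: a closing usage sketch for train_test_split (the renamed traintestsplit); the corpus filenames are hypothetical. For each of --repeat runs it samples --size held-out line indices without replacement and writes the complementary train/test pairs into split_0, split_1, ...:

    ./train_test_split --foreign corpus.de --english corpus.en --size 500 --repeat 3 --prefix wmt
    # => split_0/wmt.train.0.de, split_0/wmt.test.0.de, ... up to split_2
]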