From b31ace79ea5f6b3f279c544cd3a443d6fbf2a24d Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 26 Feb 2026 10:05:59 +0000 Subject: overhaul --- NFC | 9 - add-index | 1 - add-ln | 3 +- add-seg | 13 +- add-start-end | 5 +- avg | 2 +- avg-seg-len | 1 - avg-weights | 7 +- bishuf | 1 - bitext-filter-length | 9 +- bitext2tmx | 39 +++ bitext2tmx.py | 41 --- biuniq | 5 +- bleu-cmp | 3 +- cdec-hg-to-json | 1 - chars | 3 +- cma | 3 +- cumul | 17 +- de-sgm | 1 - div | 1 - dot | 7 +- even | 5 +- exclude | 5 +- feature-dict | 5 +- filter-illegal | 5 +- filter-len | 3 +- filter-tokens | 5 +- first-upper | 3 +- fix-utf-8-pua | 1 - gigaword-collapse-tags | 5 +- hadoop-uniq | 1 - hist-tok | 1 - htmlentities | 9 +- inv | 3 +- is-first-lower | 5 +- joint-set | 5 +- kbest-bleu-oracles | 5 +- kendalls-tau | 11 +- key-count | 5 +- kmeans | 17 +- lang | 11 +- langid-polyglot | 3 +- length-ratio | 3 +- lin-reg | 7 +- log-reg | 11 +- ltok | 7 +- make-rule-features | 7 +- max | 3 +- max-len | 5 +- median | 3 +- merge-files | 3 +- merge-ttable | 17 +- min | 3 +- min-max | 17 +- mkidx | 3 +- moses-1best | 3 +- moving-sum | 3 +- mult | 1 - nfc | 8 + ng | 9 +- nn | 1 - no-empty | 5 +- no-non-printables | 3 +- nonbreaking-prefixes/README.txt | 5 + nonbreaking-prefixes/nonbreaking_prefix.ca | 75 +++++ nonbreaking-prefixes/nonbreaking_prefix.cs | 390 ++++++++++++++++++++++++ nonbreaking-prefixes/nonbreaking_prefix.de | 325 ++++++++++++++++++++ nonbreaking-prefixes/nonbreaking_prefix.el | 2 + nonbreaking-prefixes/nonbreaking_prefix.en | 107 +++++++ nonbreaking-prefixes/nonbreaking_prefix.es | 118 +++++++ nonbreaking-prefixes/nonbreaking_prefix.fr | 153 ++++++++++ nonbreaking-prefixes/nonbreaking_prefix.is | 251 +++++++++++++++ nonbreaking-prefixes/nonbreaking_prefix.it | 180 +++++++++++ nonbreaking-prefixes/nonbreaking_prefix.nl | 115 +++++++ nonbreaking-prefixes/nonbreaking_prefix.pl | 283 +++++++++++++++++ nonbreaking-prefixes/nonbreaking_prefix.pt | 210 +++++++++++++ 
nonbreaking-prefixes/nonbreaking_prefix.ro | 38 +++ nonbreaking-prefixes/nonbreaking_prefix.ru | 259 ++++++++++++++++ nonbreaking-prefixes/nonbreaking_prefix.sk | 474 +++++++++++++++++++++++++++++ nonbreaking-prefixes/nonbreaking_prefix.sl | 78 +++++ nonbreaking-prefixes/nonbreaking_prefix.sv | 46 +++ nonbreaking_prefixes/README.txt | 5 - nonbreaking_prefixes/nonbreaking_prefix.ca | 75 ----- nonbreaking_prefixes/nonbreaking_prefix.cs | 390 ------------------------ nonbreaking_prefixes/nonbreaking_prefix.de | 325 -------------------- nonbreaking_prefixes/nonbreaking_prefix.el | 2 - nonbreaking_prefixes/nonbreaking_prefix.en | 107 ------- nonbreaking_prefixes/nonbreaking_prefix.es | 118 ------- nonbreaking_prefixes/nonbreaking_prefix.fr | 153 ---------- nonbreaking_prefixes/nonbreaking_prefix.is | 251 --------------- nonbreaking_prefixes/nonbreaking_prefix.it | 180 ----------- nonbreaking_prefixes/nonbreaking_prefix.nl | 115 ------- nonbreaking_prefixes/nonbreaking_prefix.pl | 283 ----------------- nonbreaking_prefixes/nonbreaking_prefix.pt | 210 ------------- nonbreaking_prefixes/nonbreaking_prefix.ro | 38 --- nonbreaking_prefixes/nonbreaking_prefix.ru | 259 ---------------- nonbreaking_prefixes/nonbreaking_prefix.sk | 474 ----------------------------- nonbreaking_prefixes/nonbreaking_prefix.sl | 78 ----- nonbreaking_prefixes/nonbreaking_prefix.sv | 46 --- norm | 1 - norm-german | 15 +- norm-hyphens | 3 +- normchr | 9 +- num-tok | 5 +- odd | 5 +- overlap | 7 +- paste-pairs | 12 +- per-sentence-bleu | 9 +- per-sentence-bleu-kbest | 9 +- per-sentence-ter | 15 +- percentile | 3 +- pot | 1 - preprocess | 5 +- preprocess-no-lower | 5 +- pt-bloom | 11 +- push-rules | 3 +- remove-devtest | 4 +- remove-test-from-bitext | 4 +- repetition-rate | 7 +- round | 1 - rule-shapes | 7 +- sample | 14 +- select | 7 +- select-from | 11 +- sentencepiece-decode | 3 +- shard | 19 +- sort-features | 1 - source-sides | 3 +- split-kbest | 3 +- split-lines | 5 +- split-pipes | 13 +- sqrt | 1 
- stanford-parser-run | 3 +- stddev | 3 +- strips | 1 - substract | 13 - subtract | 12 + sum | 1 - tc | 3 +- tf-idf | 9 +- tmx-extract | 75 +++++ tmx-extract-original-py2 | 75 +++++ tmx-extract-original-py2.py | 76 ----- tmx-extract.py | 76 ----- tmx-to-plain | 95 ++++++ tmx-to-plain.py | 95 ------ to-ascii | 5 +- toks | 7 +- toks-per-line | 1 - train-test-split | 8 +- tsv-exclude | 4 +- tsv-joint-set | 7 +- tsv-uniq | 7 +- var | 3 +- vocab | 1 - vocab-2 | 12 + vocab2 | 13 - zh-ko-or-ja | 5 +- 158 files changed, 3672 insertions(+), 3784 deletions(-) delete mode 100755 NFC create mode 100755 bitext2tmx delete mode 100755 bitext2tmx.py create mode 100755 nfc create mode 100644 nonbreaking-prefixes/README.txt create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.ca create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.cs create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.de create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.el create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.en create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.es create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.fr create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.is create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.it create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.nl create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.pl create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.pt create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.ro create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.ru create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.sk create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.sl create mode 100644 nonbreaking-prefixes/nonbreaking_prefix.sv delete mode 100644 nonbreaking_prefixes/README.txt delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ca delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.cs delete mode 100644 
nonbreaking_prefixes/nonbreaking_prefix.de delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.el delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.en delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.es delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.fr delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.is delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.it delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.nl delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.pl delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.pt delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ro delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ru delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sk delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sl delete mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sv delete mode 100755 substract create mode 100755 subtract create mode 100755 tmx-extract create mode 100755 tmx-extract-original-py2 delete mode 100755 tmx-extract-original-py2.py delete mode 100755 tmx-extract.py create mode 100755 tmx-to-plain delete mode 100644 tmx-to-plain.py create mode 100755 vocab-2 delete mode 100755 vocab2 diff --git a/NFC b/NFC deleted file mode 100755 index aec1c58..0000000 --- a/NFC +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python - -import fileinput -import unicodedata -import sys - -for line in fileinput.input(): - sys.stdout.write(unicodedata.normalize('NFC', line)) - diff --git a/add-index b/add-index index 77a7e8d..b23fefe 100755 --- a/add-index +++ b/add-index @@ -9,4 +9,3 @@ while line = STDIN.gets puts "#{i}\t#{line}" i += 1 end - diff --git a/add-ln b/add-ln index 35bc44d..c98f0a0 100755 --- a/add-ln +++ b/add-ln @@ -3,6 +3,5 @@ i = 0 while line = STDIN.gets puts "#{i}\t#{line}" - i += 1 + i += 1 end - diff --git a/add-seg b/add-seg index 14b8b6b..3825494 100755 --- a/add-seg +++ b/add-seg @@ -1,12 +1,12 @@ #!/usr/bin/env ruby 
-require 'optimist' -require 'zipf' +require "optimist" +require "zipf" o = Optimist::options do - opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => '-g', :default => nil + opt :grammar, "(Absolute) path of folder containing grammars.", :type => :string, :short => "-g", :default => nil opt :loo, "leave one out", :type => :bool, :default => false - opt :start_id, "start with this id", :type => :int, :default => 0, :short => '-i' + opt :start_id, "start with this id", :type => :int, :default => 0, :short => "-i" opt :nogz, "grammar files not gzipped", :type => :bool, :default => false opt :index, "number according to index", :type => :string, :default => nil end @@ -19,8 +19,8 @@ end i = o[:start_id] j = 0 while line = STDIN.gets - ext = '.gz' - ext = '' if o[:nogz] + ext = ".gz" + ext = "" if o[:nogz] s = " 0 @@ -33,4 +33,3 @@ while line = STDIN.gets i += 1 j += 1 end - diff --git a/add-start-end b/add-start-end index 30deaec..1e1061d 100755 --- a/add-start-end +++ b/add-start-end @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets puts " #{line.strip} " end - diff --git a/avg b/avg index ac912d6..6d28fa9 100755 --- a/avg +++ b/avg @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "avg < " diff --git a/avg-seg-len b/avg-seg-len index ee68827..bfd4f6c 100755 --- a/avg-seg-len +++ b/avg-seg-len @@ -6,4 +6,3 @@ while line = STDIN.gets end puts lens.inject(:+)/lens.size.to_f - diff --git a/avg-weights b/avg-weights index f090da9..bc734e8 100755 --- a/avg-weights +++ b/avg-weights @@ -1,8 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' -require 'zlib' +require "zipf" +require "optimist" +require "zlib" conf = Optimist::options do opt :weights_files, "a number of weights files: name value", :required => true @@ -30,4 +30,3 @@ 
h.each_pair { |k,w| next if conf[:filter] and w.size < n puts "#{k} #{w.inject(:+)/n}" } - diff --git a/bishuf b/bishuf index 62689aa..dd86e23 100755 --- a/bishuf +++ b/bishuf @@ -15,4 +15,3 @@ get_random() { seed="$1"; openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt < echo "shuffling ..." $SHUF --random-source=<(get_random 42) $1 > $1.shuf $SHUF --random-source=<(get_random 42) $2 > $2.shuf - diff --git a/bitext-filter-length b/bitext-filter-length index d1dc973..a77f10e 100755 --- a/bitext-filter-length +++ b/bitext-filter-length @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do @@ -17,8 +17,8 @@ def main opt :reverse, "length ratios alway > 1", :type => :bool, :default => false, :short => "-r" end - fna,fnb = conf[:inputs].split ',' - a = ReadFile.new fna + fna,fnb = conf[:inputs].split "," + a = ReadFile.new fna b = ReadFile.new fnb if not conf[:output_index] @@ -62,4 +62,3 @@ def main end main - diff --git a/bitext2tmx b/bitext2tmx new file mode 100755 index 0000000..e9c8e23 --- /dev/null +++ b/bitext2tmx @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import sys +from xml.sax.saxutils import escape + + +if __name__ == "__main__": + prefix = """ +
+ """ + + with open(sys.argv[1], "r") as src_file, open(sys.argv[2], "r") as tgt_file: + src_lang = sys.argv[1].split(".")[-1] + tgt_lang = sys.argv[2].split(".")[-1] + + tus = [] + for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()): + src_line = src_line.rstrip("\n") + tgt_line = tgt_line.rstrip("\n") + tus.append(f""" + + + {escape(src_line)} + + + {escape(tgt_line)} + + """) + + suffix = """ +""" + + complete = "\n".join([prefix] + tus + [suffix]) + + print(complete) diff --git a/bitext2tmx.py b/bitext2tmx.py deleted file mode 100755 index 1cdc4b3..0000000 --- a/bitext2tmx.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -import sys -from xml.sax.saxutils import escape - - -if __name__ == "__main__": - prefix = """ -
- """ - - src_file = open(sys.argv[1], "r") - tgt_file = open(sys.argv[2], "r") - - src_lang = sys.argv[1].split(".")[-1] - tgt_lang = sys.argv[2].split(".")[-1] - - tus = [] - for src_line, tgt_line in zip(src_file.readlines(), tgt_file.readlines()): - src_line = src_line.rstrip("\n") - tgt_line = tgt_line.rstrip("\n") - tus.append(f""" - - - {escape(src_line)} - - - {escape(tgt_line)} - - """) - - suffix = """ -""" - - complete = "\n".join([prefix] + tus + [suffix]) - - print(complete) diff --git a/biuniq b/biuniq index b191ab0..9ad2d76 100755 --- a/biuniq +++ b/biuniq @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" f1 = ReadFile.new ARGV[0] f2 = ReadFile.new ARGV[1] @@ -16,7 +16,7 @@ while line1 = f1.gets line2 = f2.gets if line2 == nil then line2 = "" end line2.strip! - + if !d1.include? line1 and !d2.include? line2 a1 << line1 a2 << line2 @@ -33,4 +33,3 @@ a1.each_with_index { |line1,i| o1.write line1 + "\n" o2.write a2[i] + "\n" } - diff --git a/bleu-cmp b/bleu-cmp index ed8460c..fe5370d 100755 --- a/bleu-cmp +++ b/bleu-cmp @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" f = ReadFile.new ARGV[0] g = ReadFile.new ARGV[1] @@ -20,4 +20,3 @@ while line = f.gets puts i += 1 end - diff --git a/cdec-hg-to-json b/cdec-hg-to-json index 5a26cf7..955cd6d 100755 --- a/cdec-hg-to-json +++ b/cdec-hg-to-json @@ -77,4 +77,3 @@ def main(): if __name__=="__main__": main() - diff --git a/chars b/chars index 359c2ab..5fed1c7 100755 --- a/chars +++ b/chars @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line.strip.each_char { |c| puts c } end - diff --git a/cma b/cma index 4647710..9e0f1f0 100755 --- a/cma +++ b/cma @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "cma < " @@ -20,4 +20,3 @@ while line = STDIN.gets end STDOUT.flush end - diff --git a/cumul b/cumul index 93a7e90..45ff03e 100755 --- a/cumul +++ b/cumul @@ -1,6 +1,7 @@ 
#!/usr/bin/env ruby -require 'zipf' +require "zipf" +require "tempfile" f = ReadFile.new ARGV[0] g = ReadFile.new ARGV[1] @@ -17,16 +18,16 @@ while line = f.gets sys1 << line1 sys2 << line2 - ff=File.new("/tmp/refs",'w+');ff.write(refs.join(""));ff.close - ff=File.new("/tmp/sys1",'w+');ff.write(sys1.join(""));ff.close - ff=File.new("/tmp/sys2",'w+');ff.write(sys2.join(""));ff.close + tmp_refs = Tempfile.new("refs"); tmp_refs.write(refs.join("")); tmp_refs.close + tmp_sys1 = Tempfile.new("sys1"); tmp_sys1.write(sys1.join("")); tmp_sys1.close + tmp_sys2 = Tempfile.new("sys2"); tmp_sys2.write(sys2.join("")); tmp_sys2.close - #a = `~/multi-bleu.perl /tmp/refs < /tmp/sys1`.split[2].gsub(',','').to_f - a = BLEU::bleu("/tmp/sys1", "/tmp/refs", 4) - b = BLEU::bleu("/tmp/sys2", "/tmp/refs", 4) + a = BLEU::bleu(tmp_sys1.path, tmp_refs.path, 4) + b = BLEU::bleu(tmp_sys2.path, tmp_refs.path, 4) + + tmp_refs.unlink; tmp_sys1.unlink; tmp_sys2.unlink diffs << b-a #puts ((diffs.inject(:+)/diffs.size)*100).round 2 puts (diffs[-1]*100).round 2 end - diff --git a/de-sgm b/de-sgm index 3b3a8e0..8598aef 100755 --- a/de-sgm +++ b/de-sgm @@ -9,4 +9,3 @@ egrep -v -i "^[[:space:]]*(<\?xml.*\?>|\s*$||" \ | sed "s|\s*\s*$||" \ | sed "s|\s*\s*$||" - diff --git a/div b/div index 93585dc..d0e036e 100755 --- a/div +++ b/div @@ -5,4 +5,3 @@ exit if factor==0 while line = STDIN.gets puts line.to_f / factor end - diff --git a/dot b/dot index da0dc58..9856069 100755 --- a/dot +++ b/dot @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" -a = SparseVector.from_file 'w', ' ' -b = SparseVector.from_file 'f', ' ' +a = SparseVector.from_file "w", " " +b = SparseVector.from_file "f", " " puts a.to_s puts a.dot b - diff --git a/even b/even index dcee3d9..1a9bfd4 100755 --- a/even +++ b/even @@ -1,11 +1,10 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" i = 1 while line = STDIN.gets puts line if i%2==0 
i+=1 end - diff --git a/exclude b/exclude index b5fe3cb..ee5a144 100755 --- a/exclude +++ b/exclude @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'set' +require "zipf" +require "set" to_exclude = {} f = ReadFile.new ARGV[0] @@ -14,4 +14,3 @@ while line = STDIN.gets puts line end end - diff --git a/feature-dict b/feature-dict index 6849769..59ff020 100755 --- a/feature-dict +++ b/feature-dict @@ -7,7 +7,7 @@ l_i = 1 while line = STDIN.gets STDERR.write "#{l_i}\n" if l_i%1000==0&&not_quiet line.split.each { |i| - f, v = i.split('=', 2) + f, v = i.split("=", 2) if !feature_dict.has_key? f feature_dict[f] = n n += 1 @@ -16,9 +16,8 @@ while line = STDIN.gets l_i += 1 end -f = File.new ARGV[0], 'w' +f = File.new ARGV[0], "w" f.write Marshal.dump feature_dict f.close STDERR.write "size = #{feature_dict.size}\n" - diff --git a/filter-illegal b/filter-illegal index 8b29f3e..e44b2ac 100755 --- a/filter-illegal +++ b/filter-illegal @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" illegal = [ "[", "]", "|||" ] @@ -15,11 +15,10 @@ while line0 = in0.gets illegal.each { |k| if line0.index(k) or line1.index(k) then skip = true - skipi << i + skipi << i end } i += 1 end skipi.each { |j| puts j } - diff --git a/filter-len b/filter-len index fe45b57..1756849 100755 --- a/filter-len +++ b/filter-len @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = ReadFile.new ARGV[0] b = ReadFile.new ARGV[1] @@ -24,4 +24,3 @@ a.close b.close a_out.close b_out.close - diff --git a/filter-tokens b/filter-tokens index 00c8f2c..c851bd3 100755 --- a/filter-tokens +++ b/filter-tokens @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" bad_words = {} ReadFile.readlines_strip(ARGV[0]).each { |line| @@ -13,11 +13,10 @@ while line = STDIN.gets tokens = line.split bad_words.keys.each { |w| if tokens.include? 
w - bad = true + bad = true break end } puts i if bad i += 1 end - diff --git a/first-upper b/first-upper index 610e62c..f9b2ce9 100755 --- a/first-upper +++ b/first-upper @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line.strip! line[0] = line[0].upcase puts line end - diff --git a/fix-utf-8-pua b/fix-utf-8-pua index 674d424..da77850 100755 --- a/fix-utf-8-pua +++ b/fix-utf-8-pua @@ -7,4 +7,3 @@ while line = STDIN.gets line.gsub! /[\u{e000}-\u{f8ff}]/, " " puts line end - diff --git a/gigaword-collapse-tags b/gigaword-collapse-tags index cbaf7d7..f2339c4 100755 --- a/gigaword-collapse-tags +++ b/gigaword-collapse-tags @@ -2,8 +2,8 @@ # works with gigaword en v5 -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" in_p = false in_dateline = false @@ -36,4 +36,3 @@ while line = STDIN.gets puts line end end - diff --git a/hadoop-uniq b/hadoop-uniq index 5052419..5f37fa4 100755 --- a/hadoop-uniq +++ b/hadoop-uniq @@ -8,4 +8,3 @@ $HADOOP_HOME/bin/hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \ -output d.uniq \ -mapper 'cut -d " " -f 1' \ -reducer /usr/bin/uniq - diff --git a/hist-tok b/hist-tok index b81604f..3e1d453 100755 --- a/hist-tok +++ b/hist-tok @@ -21,4 +21,3 @@ sorted.sort_by! 
{ |i| sorted.each { |i| puts "#{i[0]}\t#{i[1]}" } - diff --git a/htmlentities b/htmlentities index f3c2d34..c0ccc0a 100755 --- a/htmlentities +++ b/htmlentities @@ -1,9 +1,9 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -require 'htmlentities' +require "htmlentities" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" coder = HTMLEntities.new @@ -11,4 +11,3 @@ coder = HTMLEntities.new while line = STDIN.gets puts coder.decode(line.strip) end - diff --git a/inv b/inv index b13443f..aaa4783 100755 --- a/inv +++ b/inv @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" def main conf = Optimist::options do @@ -30,4 +30,3 @@ def main end main - diff --git a/is-first-lower b/is-first-lower index 1cddb8e..a7e2073 100755 --- a/is-first-lower +++ b/is-first-lower @@ -1,11 +1,10 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line.strip! - if line && line!='' && line[0].downcase? + if line && line!="" && line[0].downcase? 
puts line end end - diff --git a/joint-set b/joint-set index b9b9b22..a295862 100755 --- a/joint-set +++ b/joint-set @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'set' -require 'zipf' +require "set" +require "zipf" n = ARGV.pop.to_i @@ -27,4 +27,3 @@ all_sets.each { |set| joint_set.each { |i| puts i } - diff --git a/kbest-bleu-oracles b/kbest-bleu-oracles index ea76ab1..03f321d 100755 --- a/kbest-bleu-oracles +++ b/kbest-bleu-oracles @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def get_context kbest_lists, references, n a = [] @@ -48,4 +48,3 @@ def main end main - diff --git a/kendalls-tau b/kendalls-tau index c0c20be..24f0744 100755 --- a/kendalls-tau +++ b/kendalls-tau @@ -2,7 +2,7 @@ ################################################# # reads space delimted pairs of scores as input, -# outputs Kendall's τ +# outputs Kendall's τ ################################################# def kendall_with_ties l @@ -13,7 +13,7 @@ def kendall_with_ties l l.each_with_index { |k,i| l[i+1,l.size].each_with_index { |m,j| if (k.first < m.first && k[1] < m[1]) || - (k.first > m.first && k[1] > m[1]) + (k.first > m.first && k[1] > m[1]) concordant += 1 elsif (k.first == m.first && k[1] != m[1]) tie_a += 1 @@ -24,7 +24,7 @@ def kendall_with_ties l end } } - + return (concordant-disconcordant)/(Math.sqrt((concordant+disconcordant+tie_a)*(concordant+disconcordant+tie_b))) end @@ -34,7 +34,7 @@ def kendall l l.each_with_index { |k,i| l[i+1,l.size].each_with_index { |m,j| if (k.first <= m.first && k[1] <= m[1]) || - (k.first >= m.first && k[1] >= m[1]) + (k.first >= m.first && k[1] >= m[1]) concordant += 1 else disconcordant += 1 @@ -60,7 +60,7 @@ def main a,b = line.split l << [a.to_f, b.to_f] end - + v = -1 if has_ties? 
l v = kendall_with_ties l @@ -72,4 +72,3 @@ def main end main - diff --git a/key-count b/key-count index deaa522..b853362 100755 --- a/key-count +++ b/key-count @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" h = {} h.default = 0 @@ -11,4 +11,3 @@ while line = STDIN.gets end h.each_pair { |k,v| puts "#{k} #{v}" } - diff --git a/kmeans b/kmeans index dcf7774..f49fc53 100755 --- a/kmeans +++ b/kmeans @@ -1,12 +1,12 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def read_data fn data = {} ReadFile.new(fn).readlines_strip.map{ |i| - a = i.split ' ', 2 + a = i.split " ", 2 v = SparseVector.from_kv a.last data[a.first] = v } @@ -30,7 +30,7 @@ end def assign centroids, data assignment = {} data.each_pair { |name,feature_vector| - min = 1.0/0 + min = Float::INFINITY min_index = nil centroids.each_with_index { |c,i| dist = c.euclidian_dist(feature_vector) @@ -61,10 +61,10 @@ def main opt :k, "k", :type => :int, :required => true opt :input, "input: one feature vector per line", :type => :string, :required => true opt :max_iterations, "max. number of iterations", :type => :int, :default => 100 - opt :max_no_change, "max. No of stalled iterations before stopping ", :type => :int, :short => '-n', :default => 3 - opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => '-j', :default => 2 + opt :max_no_change, "max. 
No of stalled iterations before stopping ", :type => :int, :short => "-n", :default => 3 + opt :init, "centroid initialization (1: sample k features vectors, 2: k-times do sample k feature and build mean)", :type => :int, :short => "-j", :default => 2 end - # data is 'ID f1=v1 f2=v2' + # data is "ID f1=v1 f2=v2" data = read_data conf[:input] k = conf[:k] centroids = nil @@ -86,7 +86,7 @@ def main STDERR.write "expected cluster sz=#{data.size/k.to_f}\n\n" 0.upto(conf[:max_iterations]) do |i| s = "iteration #{i}" - STDERR.write "#{s}\n#{'-'*s.size}\n" + STDERR.write "#{s}\n#{"-" * s.size}\n" assignment = assign centroids, data sizes = [] assignment.each_pair { |centroid_index, a| @@ -114,4 +114,3 @@ def main end main - diff --git a/lang b/lang index 5caebd1..1b498d1 100755 --- a/lang +++ b/lang @@ -1,14 +1,14 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import langdetect from_stdin = False -if sys.argv[1] == '-': +if sys.argv[1] == "-": f = sys.stdin from_stdin = True else: - f = open(sys.argv[1], 'r') + f = open(sys.argv[1], "r") try: l = sys.argv[2].strip() @@ -32,7 +32,7 @@ if min_p and not l: if strict and not min_p: strict = False - + factory = langdetect.detector_factory.DetectorFactory() factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY) @@ -71,5 +71,4 @@ for line in f: print("unk") if not from_stdin: - f.close - + f.close() diff --git a/langid-polyglot b/langid-polyglot index 0b0b20c..04f6b3b 100755 --- a/langid-polyglot +++ b/langid-polyglot @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import polyglot from polyglot.detect import Detector @@ -15,4 +15,3 @@ for line in fileinput.input(): except polyglot.detect.base.UnknownLanguage: print("??") pass - diff --git a/length-ratio b/length-ratio index 4b4432d..5b38826 100755 --- a/length-ratio +++ b/length-ratio @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = ReadFile.new ARGV[0] b = ReadFile.new ARGV[1] @@ -9,4 +9,3 @@ while linea = a.gets 
lineb = b.gets puts linea.strip.split.size.to_f / lineb.strip.split.size.to_f end - diff --git a/lin-reg b/lin-reg index 87dded5..eb9193e 100755 --- a/lin-reg +++ b/lin-reg @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def read_data fn, scale f = ReadFile.new fn @@ -29,7 +29,7 @@ def main opt :output, "output data", :type => :string, :required => true opt :learning_rate, "learning rate", :type => :float, :default => 0.07 opt :stop, "stopping criterion", :type => :int, :default => 100 - opt :scale_features,"scale features", :type => :bool, :default => false, :short => '-t' + opt :scale_features,"scale features", :type => :bool, :default => false, :short => "-t" opt :show_loss, "show loss per iter", :type => :bool, :default => false end data = read_data conf[:input], conf[:scale_features] @@ -67,4 +67,3 @@ def main end main - diff --git a/log-reg b/log-reg index 5e43555..99d9153 100755 --- a/log-reg +++ b/log-reg @@ -1,8 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' -require 'matrix' -require 'optimist' +require "zipf" +require "matrix" +require "optimist" def read_data fn f = ReadFile.new fn @@ -30,7 +30,7 @@ def approx_eql x, y, eps=10**-10 return false if !x||!y return false if x.size!=y.size x.each_with_index { |_,i| - return false if (x[i]-y[i]).abs>eps + return false if (x[i]-y[i]).abs>eps } return true end @@ -48,7 +48,7 @@ def main prev_model = nil gradient = Vector.elements zeros hessian = Matrix.build(dim,dim) { |i,j| 0.0 } - i = 0 + i = 0 while true i += 1 data.each_with_index { |x,j| @@ -68,4 +68,3 @@ def main end main - diff --git a/ltok b/ltok index c90823e..fc25a65 100755 --- a/ltok +++ b/ltok @@ -1,9 +1,8 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets puts line.strip.split(/\s/).size end - diff --git a/make-rule-features b/make-rule-features index 
7adb6e9..ae2cecc 100755 --- a/make-rule-features +++ b/make-rule-features @@ -1,10 +1,10 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" def mkrf src, tgt s = src.gsub /\[X,[1-9]\]/, "NX" - t = tgt.gsub /\[X,([1-9])\]/,'N\1' + t = tgt.gsub /\[X,([1-9])\]/,"N\\1" return "R:X:#{s.gsub(" ","_")}:#{t.gsub(" ","_")}" end @@ -13,7 +13,7 @@ def mkrbf s, t if t == "S" s.gsub! /\[X,[1-9]\]/, "X" else - s.gsub! /\[X,([1-9])\]/, 'X\1' + s.gsub! /\[X,([1-9])\]/, "X\\1" end s.reverse! s += " >r<" @@ -41,4 +41,3 @@ while line = STDIN.gets end h.keys.each { |f| puts f } - diff --git a/max b/max index b2c1cae..15d0003 100755 --- a/max +++ b/max @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -max = -1.0/0 +max = -Float::INFINITY while line = STDIN.gets v = line.to_f max = v if v > max end puts max - diff --git a/max-len b/max-len index 69013b5..dab684f 100755 --- a/max-len +++ b/max-len @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" max = ARGV[0].to_i i = 0 -while line = STDIN.gets +while line = STDIN.gets if tokenize(line).size <= max puts i else @@ -13,4 +13,3 @@ while line = STDIN.gets end i += 1 end - diff --git a/median b/median index 0b1950b..cc47dcd 100755 --- a/median +++ b/median @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = [] while line = STDIN.gets @@ -10,4 +10,3 @@ end a.sort! 
puts a[a.size/2] - diff --git a/merge-files b/merge-files index 714b57d..78644ef 100755 --- a/merge-files +++ b/merge-files @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" def usage STDERR.write "merge_files +\n" @@ -28,4 +28,3 @@ hashes.each { |h| counts.max.times { puts k } } } - diff --git a/merge-ttable b/merge-ttable index 77eae9f..20e5429 100755 --- a/merge-ttable +++ b/merge-ttable @@ -1,20 +1,20 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do opt :f, "f files", :type => :string, :required => true opt :e, "e files", :type => :string, :required => true end - + f_files = conf[:f].split e_files = conf[:e].split - + h = {} f_files.each_with_index { |fn,i| - fa = ReadFile.readlines_strip fn + fa = ReadFile.readlines_strip fn ea = ReadFile.readlines_strip e_files[i] fa.each_with_index { |fw,j| if h.has_key? fw @@ -24,11 +24,10 @@ def main end } } - + h.each_pair { |f,ea| - puts "#{f}\t#{ea.first}" - } + puts "#{f}\t#{ea.first}" + } end main - diff --git a/min b/min index f8a7e42..edfecea 100755 --- a/min +++ b/min @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -min = 1.0/0 +min = Float::INFINITY while line = STDIN.gets v = line.to_f min = v if v :int, :default => 1 - opt :max, "maximum #tokens", :type => :int, :default => 80, :short => '-n' - opt :in_f, "input 'French' file", :type => :string, :required => true - opt :in_e, "input 'English' file", :type => :string, :required => true - opt :out_f, "output 'French' file", :type => :string, :required => true - opt :out_e, "output 'English' file", :type => :string, :required => true + opt :max, "maximum #tokens", :type => :int, :default => 80, :short => "-n" + opt :in_f, "input French file", :type => :string, :required => true + opt :in_e, "input English file", :type => :string, :required => true + opt :out_f, "output French file", :type => :string, :required => true + opt :out_e, "output English file", :type => :string, 
:required => true opt :out_id, "output line Nos", :type => :string, :required => true end @@ -37,4 +37,3 @@ while f_line = files[:f_file].gets end files.values.each{ |f| f.close } - diff --git a/mkidx b/mkidx index 046e131..6e67ba9 100755 --- a/mkidx +++ b/mkidx @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" i = ARGV[0].to_i while line = STDIN.gets puts i i += 1 end - diff --git a/moses-1best b/moses-1best index fd35cf8..ffe5e22 100755 --- a/moses-1best +++ b/moses-1best @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" prev_idx = nil while line = STDIN.gets @@ -11,4 +11,3 @@ while line = STDIN.gets prev_idx = idx end end - diff --git a/moving-sum b/moving-sum index 697f47f..aff3527 100755 --- a/moving-sum +++ b/moving-sum @@ -1,10 +1,9 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" sum = 0.0 ReadFile.readlines_strip(ARGV[0]).each { |i| sum += i.to_f puts sum } - diff --git a/mult b/mult index 478ec5e..42dd74c 100755 --- a/mult +++ b/mult @@ -4,4 +4,3 @@ factor = ARGV[0].to_f while line = STDIN.gets puts line.to_f * factor end - diff --git a/nfc b/nfc new file mode 100755 index 0000000..4af1aef --- /dev/null +++ b/nfc @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 + +import fileinput +import unicodedata +import sys + +for line in fileinput.input(): + sys.stdout.write(unicodedata.normalize("NFC", line)) diff --git a/ng b/ng index f3a031d..af8015a 100755 --- a/ng +++ b/ng @@ -1,19 +1,18 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" conf = Optimist::options do banner "ng < " opt :n, "n for Ngrams", :type => :int, :default => 4 - opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => false + opt :fix, "Do not output lower order Ngrams.", :type => :bool, :default => false opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n" end while line = STDIN.gets a = [] - ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(' ') } + 
ngrams(line, conf[:n], conf[:fix]) { |ng| a << ng.join(" ") } a.reject! { |i| i.strip.size==0 } puts a.join conf[:separator] if a.size>0 end - diff --git a/nn b/nn index 4d1dab7..d43a235 100755 --- a/nn +++ b/nn @@ -1,4 +1,3 @@ #!/bin/sh tr '[:digit:]' $1 < $2 > $(basename $2 ${2##*.})nn.${2##*.} - diff --git a/no-empty b/no-empty index da57e23..5a05fc1 100755 --- a/no-empty +++ b/no-empty @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" files = [] (0..1).each { |i| files << ReadFile.new(ARGV[i]) } @@ -9,10 +9,9 @@ files = [] while line_f = files[0].gets line_e = files[1].gets line_f.strip!; line_e.strip! - next if line_f=='' || line_e=='' + next if line_f=="" || line_e=="" files[2].write line_f+"\n" files[3].write line_e+"\n" end files.each { |f| f.close } - diff --git a/no-non-printables b/no-non-printables index 9f9e3f9..2fb6f65 100755 --- a/no-non-printables +++ b/no-non-printables @@ -1,4 +1,3 @@ #!/bin/sh -sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g' - +sed 's/\xEF\xBB\xBF//g' | sed 's/\xEF\xB7\x93//g' | sed 's/[[:cntrl:]]//g' diff --git a/nonbreaking-prefixes/README.txt b/nonbreaking-prefixes/README.txt new file mode 100644 index 0000000..02cdfcc --- /dev/null +++ b/nonbreaking-prefixes/README.txt @@ -0,0 +1,5 @@ +The language suffix can be found here: + +http://www.loc.gov/standards/iso639-2/php/code_list.php + + diff --git a/nonbreaking-prefixes/nonbreaking_prefix.ca b/nonbreaking-prefixes/nonbreaking_prefix.ca new file mode 100644 index 0000000..2f4fdfc --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.ca @@ -0,0 +1,75 @@ +Dr +Dra +pàg +p +c +av +Sr +Sra +adm +esq +Prof +S.A +S.L +p.e +ptes +Sta +St +pl +màx +cast +dir +nre +fra +admdora +Emm +Excma +espf +dc +admdor +tel +angl +aprox +ca +dept +dj +dl +dt +ds +dg +dv +ed +entl +al +i.e +maj +smin +n +núm +pta +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/nonbreaking-prefixes/nonbreaking_prefix.cs 
b/nonbreaking-prefixes/nonbreaking_prefix.cs new file mode 100644 index 0000000..dce6167 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.cs @@ -0,0 +1,390 @@ +Bc +BcA +Ing +Ing.arch +MUDr +MVDr +MgA +Mgr +JUDr +PhDr +RNDr +PharmDr +ThLic +ThDr +Ph.D +Th.D +prof +doc +CSc +DrSc +dr. h. c +PaedDr +Dr +PhMr +DiS +abt +ad +a.i +aj +angl +anon +apod +atd +atp +aut +bd +biogr +b.m +b.p +b.r +cca +cit +cizojaz +c.k +col +čes +čín +čj +ed +facs +fasc +fol +fot +franc +h.c +hist +hl +hrsg +ibid +il +ind +inv.č +jap +jhdt +jv +koed +kol +korej +kl +krit +lat +lit +m.a +maď +mj +mp +násl +např +nepubl +něm +no +nr +n.s +okr +odd +odp +obr +opr +orig +phil +pl +pokrač +pol +port +pozn +př.kr +př.n.l +přel +přeprac +příl +pseud +pt +red +repr +resp +revid +rkp +roč +roz +rozš +samost +sect +sest +seš +sign +sl +srv +stol +sv +šk +šk.ro +špan +tab +t.č +tis +tj +tř +tzv +univ +uspoř +vol +vl.jm +vs +vyd +vyobr +zal +zejm +zkr +zprac +zvl +n.p +např +než +MUDr +abl +absol +adj +adv +ak +ak. sl +akt +alch +amer +anat +angl +anglosas +arab +arch +archit +arg +astr +astrol +att +bás +belg +bibl +biol +boh +bot +bulh +círk +csl +č +čas +čes +dat +děj +dep +dět +dial +dór +dopr +dosl +ekon +epic +etnonym +eufem +f +fam +fem +fil +film +form +fot +fr +fut +fyz +gen +geogr +geol +geom +germ +gram +hebr +herald +hist +hl +hovor +hud +hut +chcsl +chem +ie +imp +impf +ind +indoevr +inf +instr +interj +ión +iron +it +kanad +katalán +klas +kniž +komp +konj + +konkr +kř +kuch +lat +lék +les +lid +lit +liturg +lok +log +m +mat +meteor +metr +mod +ms +mysl +n +náb +námoř +neklas +něm +nesklon +nom +ob +obch +obyč +ojed +opt +part +pas +pejor +pers +pf +pl +plpf + +práv +prep +předl +přivl +r +rcsl +refl +reg +rkp +ř +řec +s +samohl +sg +sl +souhl +spec +srov +stfr +střv +stsl +subj +subst +superl +sv +sz +táz +tech +telev +teol +trans +typogr +var +vedl +verb +vl. 
jm +voj +vok +vůb +vulg +výtv +vztaž +zahr +zájm +zast +zejm + +zeměd +zkr +zř +mj +dl +atp +sport +Mgr +horn +MVDr +JUDr +RSDr +Bc +PhDr +ThDr +Ing +aj +apod +PharmDr +pomn +ev +slang +nprap +odp +dop +pol +st +stol +p. n. l +před n. l +n. l +př. Kr +po Kr +př. n. l +odd +RNDr +tzv +atd +tzn +resp +tj +p +br +č. j +čj +č. p +čp +a. s +s. r. o +spol. s r. o +p. o +s. p +v. o. s +k. s +o. p. s +o. s +v. r +v z +ml +vč +kr +mld +hod +popř +ap +event +rus +slov +rum +švýc +P. T +zvl +hor +dol +S.O.S \ No newline at end of file diff --git a/nonbreaking-prefixes/nonbreaking_prefix.de b/nonbreaking-prefixes/nonbreaking_prefix.de new file mode 100644 index 0000000..35fdf5e --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.de @@ -0,0 +1,325 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +#no german words end in single lower-case letters, so we throw those in too. +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not a sentence break in German. 
+I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#Titles and Honorifics +Adj +Adm +Adv +Asst +Bart +Bldg +Brig +Bros +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +Dr +Ens +Gen +Gov +Hon +Hosp +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +St +Supt +Surg + +#Misc symbols +Mio +Mrd +bzw +v +vs +usw +d.h +z.B +u.a +etc +Mrd +MwSt +ggf +d.J +D.h +m.E +vgl +I.F +z.T +sogen +ff +u.E +g.U +g.g.A +c.-à-d +Buchst +u.s.w +sog +u.ä +Std +evtl +Zt +Chr +u.U +o.ä +Ltd +b.A +z.Zt +spp +sen +SA +k.o +jun +i.H.v +dgl +dergl +Co +zzt +usf +s.p.a +Dkr +Corp +bzgl +BSE + +#Number indicators +# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it +No +Nos +Art +Nr +pp +ca +Ca + +#Ordinals are done with . in German - "1." = "1st" in English +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 diff --git a/nonbreaking-prefixes/nonbreaking_prefix.el b/nonbreaking-prefixes/nonbreaking_prefix.el new file mode 100644 index 0000000..0470f91 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.el @@ -0,0 +1,2 @@ +# for now, just include the Greek equivalent of "Mr." 
+κ diff --git a/nonbreaking-prefixes/nonbreaking_prefix.en b/nonbreaking-prefixes/nonbreaking_prefix.en new file mode 100644 index 0000000..e1a3733 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.en @@ -0,0 +1,107 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Asst +Bart +Bldg +Brig +Bros +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +Dr +Drs +Ens +Gen +Gov +Hon +Hr +Hosp +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +St +Supt +Surg + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +pp #NUMERIC_ONLY# diff --git a/nonbreaking-prefixes/nonbreaking_prefix.es b/nonbreaking-prefixes/nonbreaking_prefix.es new file mode 100644 index 0000000..d8b2755 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.es @@ -0,0 +1,118 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
+ +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm + +A.C +Apdo +Av +Bco +CC.AA +Da +Dep +Dn +Dr +Dra +EE.UU +Excmo +FF.CC +Fil +Gral +J.C +Let +Lic +N.B +P.D +P.V.P +Prof +Pts +Rte +S.A +S.A.R +S.E +S.L +S.R.C +Sr +Sra +Srta +Sta +Sto +T.V.E +Tel +Ud +Uds +V.B +V.E +Vd +Vds +a/c +adj +admón +afmo +apdo +av +c +c.f +c.g +cap +cm +cta +dcha +doc +ej +entlo +esq +etc +f.c +gr +grs +izq +kg +km +mg +mm +núm +núm +p +p.a +p.ej +ptas +pág +págs +pág +págs +q.e.g.e +q.e.s.m +s +s.s.s +vid +vol diff --git a/nonbreaking-prefixes/nonbreaking_prefix.fr b/nonbreaking-prefixes/nonbreaking_prefix.fr new file mode 100644 index 0000000..28126fa --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.fr @@ -0,0 +1,153 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +# +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +#no French words end in single lower-case letters, so we throw those in too? 
+A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + +# Period-final abbreviation list for French +A.C.N +A.M +art +ann +apr +av +auj +lib +B.P +boul +ca +c.-à-d +cf +ch.-l +chap +contr +C.P.I +C.Q.F.D +C.N +C.N.S +C.S +dir +éd +e.g +env +al +etc +E.V +ex +fasc +fém +fig +fr +hab +ibid +id +i.e +inf +LL.AA +LL.AA.II +LL.AA.RR +LL.AA.SS +L.D +LL.EE +LL.MM +LL.MM.II.RR +loc.cit +masc +MM +ms +N.B +N.D.A +N.D.L.R +N.D.T +n/réf +NN.SS +N.S +N.D +N.P.A.I +p.c.c +pl +pp +p.ex +p.j +P.S +R.A.S +R.-V +R.P +R.I.P +SS +S.S +S.A +S.A.I +S.A.R +S.A.S +S.E +sec +sect +sing +S.M +S.M.I.R +sq +sqq +suiv +sup +suppl +tél +T.S.V.P +vb +vol +vs +X.O +Z.I diff --git a/nonbreaking-prefixes/nonbreaking_prefix.is b/nonbreaking-prefixes/nonbreaking_prefix.is new file mode 100644 index 0000000..5b8a710 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.is @@ -0,0 +1,251 @@ +no #NUMERIC_ONLY# +No #NUMERIC_ONLY# +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +nR #NUMERIC_ONLY# +NR #NUMERIC_ONLY# +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +^ +í +á +ó +æ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +ab.fn +a.fn +afs +al +alm +alg +andh +ath +aths +atr +ao +au +aukaf +áfn +áhrl.s +áhrs +ákv.gr +ákv +bh +bls +dr +e.Kr +et +ef +efn +ennfr +eink +end +e.st +erl +fél +fskj +fh +f.hl +físl +fl +fn +fo +forl +frb +frl +frh +frt +fsl +fsh +fs +fsk +fst +f.Kr +ft +fv +fyrrn +fyrrv +germ +gm +gr +hdl +hdr +hf +hl +hlsk +hljsk +hljv +hljóðv +hr +hv +hvk +holl +Hos +höf +hk +hrl +ísl +kaf +kap +Khöfn +kk +kg +kk +km +kl +klst +kr +kt +kgúrsk +kvk +leturbr +lh +lh.nt +lh.þt +lo +ltr +mlja +mljó +millj +mm +mms +m.fl +miðm +mgr +mst +mín +nf +nh +nhm +nl +nk +nmgr +no +núv +nt +o.áfr +o.m.fl +ohf +o.fl +o.s.frv +ófn +ób +óákv.gr +óákv +pfn +PR +pr +Ritstj +Rvík +Rvk +samb +samhlj +samn +samn +sbr +sek +sérn +sf +sfn +sh +sfn +sh +s.hl 
+sk +skv +sl +sn +so +ss.us +s.st +samþ +sbr +shlj +sign +skál +st +st.s +stk +sþ +teg +tbl +tfn +tl +tvíhlj +tvt +till +to +umr +uh +us +uppl +útg +vb +Vf +vh +vkf +Vl +vl +vlf +vmf +8vo +vsk +vth +þt +þf +þjs +þgf +þlt +þolm +þm +þml +þýð diff --git a/nonbreaking-prefixes/nonbreaking_prefix.it b/nonbreaking-prefixes/nonbreaking_prefix.it new file mode 100644 index 0000000..992b9ec --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.it @@ -0,0 +1,180 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Amn +Arch +Asst +Avv +Bart +Bcc +Bldg +Brig +Bros +C.A.P +C.P +Capt +Cc +Cmdr +Co +Col +Comdr +Con +Corp +Cpl +DR +Dott +Dr +Drs +Egr +Ens +Gen +Geom +Gov +Hon +Hosp +Hr +Id +Ing +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mo +Mons +Mr +Mrs +Ms +Msgr +N.B +Op +Ord +P.S +P.T +Pfc +Ph +Prof +Pvt +RP +RSVP +Rag +Rep +Reps +Res +Rev +Rif +Rt +S.A +S.B.F +S.P.M +S.p.A +S.r.l +Sen +Sens +Sfc +Sgt +Sig +Sigg +Soc +Spett +Sr +St +Supt +Surg +V.P + +# other +a.c +acc +all +banc +c.a +c.c.p +c.m +c.p +c.s +c.v +corr +dott +e.p.c +ecc +es +fatt +gg +int +lett +ogg +on +p.c +p.c.c +p.es +p.f +p.r +p.v +post +pp +racc +ric +s.n.c +seg +sgg +ss +tel +u.s +v.r +v.s + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. 
These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +pp #NUMERIC_ONLY# diff --git a/nonbreaking-prefixes/nonbreaking_prefix.nl b/nonbreaking-prefixes/nonbreaking_prefix.nl new file mode 100644 index 0000000..c80c417 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.nl @@ -0,0 +1,115 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen +# http://nl.wikipedia.org/wiki/Aanspreekvorm +# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +bacc +bc +bgen +c.i +dhr +dr +dr.h.c +drs +drs +ds +eint +fa +Fa +fam +gen +genm +ing +ir +jhr +jkvr +jr +kand +kol +lgen +lkol +Lt +maj +Mej +mevr +Mme +mr +mr +Mw +o.b.s +plv +prof +ritm +tint +Vz +Z.D +Z.D.H +Z.E +Z.Em +Z.H +Z.K.H +Z.K.M +Z.M +z.v + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence +a.g.v +bijv +bijz +bv +d.w.z +e.c +e.g +e.k +ev +i.p.v +i.s.m +i.t.t +i.v.m +m.a.w +m.b.t +m.b.v +m.h.o +m.i +m.i.v +v.w.t + +#Numbers only. 
These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +Nr #NUMERIC_ONLY# +Nrs +nrs +nr #NUMERIC_ONLY# diff --git a/nonbreaking-prefixes/nonbreaking_prefix.pl b/nonbreaking-prefixes/nonbreaking_prefix.pl new file mode 100644 index 0000000..6b7c106 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.pl @@ -0,0 +1,283 @@ +adw +afr +akad +al +Al +am +amer +arch +art +Art +artyst +astr +austr +bałt +bdb +bł +bm +br +bryg +bryt +centr +ces +chem +chiń +chir +c.k +c.o +cyg +cyw +cyt +czes +czw +cd +Cd +czyt +ćw +ćwicz +daw +dcn +dekl +demokr +det +diec +dł +dn +dot +dol +dop +dost +dosł +h.c +ds +dst +duszp +dypl +egz +ekol +ekon +elektr +em +ew +fab +farm +fot +fr +gat +gastr +geogr +geol +gimn +głęb +gm +godz +górn +gosp +gr +gram +hist +hiszp +hr +Hr +hot +id +in +im +iron +jn +kard +kat +katol +k.k +kk +kol +kl +k.p.a +kpc +k.p.c +kpt +kr +k.r +krak +k.r.o +kryt +kult +laic +łac +niem +woj +nb +np +Nb +Np +pol +pow +m.in +pt +ps +Pt +Ps +cdn +jw +ryc +rys +Ryc +Rys +tj +tzw +Tzw +tzn +zob +ang +ub +ul +pw +pn +pl +al +k +n +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +ww +wł +ur +zm +żyd +żarg +żyw +wył +bp +bp +wyst +tow +Tow +o +sp +Sp +st +spółdz +Spółdz +społ +spółgł +stoł +stow +Stoł +Stow +zn +zew +zewn +zdr +zazw +zast +zaw +zał +zal +zam +zak +zakł +zagr +zach +adw +Adw +lek +Lek +med +mec +Mec +doc +Doc +dyw +dyr +Dyw +Dyr +inż +Inż +mgr +Mgr +dh +dr +Dh +Dr +p +P +red +Red +prof +prok +Prof +Prok +hab +płk +Płk +nadkom +Nadkom +podkom +Podkom +ks +Ks +gen +Gen +por +Por +reż +Reż +przyp +Przyp +śp +św +śW +Śp +Św +ŚW +szer +Szer +pkt #NUMERIC_ONLY# +str #NUMERIC_ONLY# +tab #NUMERIC_ONLY# +Tab #NUMERIC_ONLY# +tel +ust #NUMERIC_ONLY# +par #NUMERIC_ONLY# +poz +pok +oo +oO +Oo +OO +r #NUMERIC_ONLY# +l #NUMERIC_ONLY# +s #NUMERIC_ONLY# +najśw +Najśw +A +B +C +D 
+E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Ś +Ć +Ż +Ź +Dz diff --git a/nonbreaking-prefixes/nonbreaking_prefix.pt b/nonbreaking-prefixes/nonbreaking_prefix.pt new file mode 100644 index 0000000..5d65bf2 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.pt @@ -0,0 +1,210 @@ +#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not a sentence break in Portuguese. +I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Art +Ca +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +DRA +Dr +Dra +Dras +Drs +Eng +Enga +Engas +Engos +Ex +Exo +Exmo +Fig +Gen +Hosp +Insp +Lda +MM +MR +MRS +MS +Maj +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +Sra +Sras +Srs +Sto +Supt +Surg +adj +adm +adv +art +cit +col +con +corp +cpl +dr +dra +dras +drs +eng +enga +engas +engos +ex +exo +exmo +fig +op +prof +sr +sra +sras +srs +sto + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. 
These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +p #NUMERIC_ONLY# +pp #NUMERIC_ONLY# + diff --git a/nonbreaking-prefixes/nonbreaking_prefix.ro b/nonbreaking-prefixes/nonbreaking_prefix.ro new file mode 100644 index 0000000..d489f46 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.ro @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +dpdv +etc +șamd +M.Ap.N +dl +Dl +d-na +D-na +dvs +Dvs +pt +Pt diff --git a/nonbreaking-prefixes/nonbreaking_prefix.ru b/nonbreaking-prefixes/nonbreaking_prefix.ru new file mode 100644 index 0000000..444465b --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.ru @@ -0,0 +1,259 @@ +TBD: Russian uppercase alphabet [А-Я] +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +0гг +1гг +2гг +3гг +4гг +5гг +6гг +7гг +8гг +9гг +0г +1г +2г +3г +4г +5г +6г +7г +8г +9г +Xвв +Vвв +Iвв +Lвв +Mвв +Cвв +Xв +Vв +Iв +Lв +Mв +Cв +0м +1м +2м +3м +4м +5м +6м +7м +8м +9м +0мм +1мм +2мм +3мм +4мм +5мм +6мм +7мм +8мм +9мм +0см +1см +2см +3см +4см +5см +6см +7см +8см +9см +0дм +1дм +2дм +3дм +4дм +5дм +6дм +7дм +8дм +9дм +0л +1л +2л +3л +4л +5л +6л +7л +8л +9л +0км +1км +2км +3км +4км +5км +6км +7км +8км +9км +0га +1га +2га +3га +4га +5га +6га +7га +8га +9га +0кг +1кг +2кг +3кг +4кг +5кг +6кг +7кг +8кг +9кг +0т +1т +2т +3т +4т +5т +6т +7т +8т +9т +0г +1г +2г +3г +4г +5г +6г +7г +8г +9г +0мг +1мг +2мг +3мг +4мг +5мг +6мг +7мг +8мг +9мг +бульв +в +вв +г +га +гг +гл +гос +д +дм +доп +др +е +ед +ед +зам +и +инд +исп +Исп +к +кап +кг +кв +кл +км +кол +комн +коп +куб +л +лиц +лл +м +макс +мг +мин +мл +млн +млрд +мм +н +наб +нач +неуд +ном +о +обл +обр +общ +ок +ост +отл +п +пер +перераб +пл +пос +пр +просп +проф +р +ред +руб 
+с +сб +св +см +соч +ср +ст +стр +т +тел +Тел +тех +тт +туп +тыс +уд +ул +уч +физ +х +хор +ч +чел +шт +экз +э diff --git a/nonbreaking-prefixes/nonbreaking_prefix.sk b/nonbreaking-prefixes/nonbreaking_prefix.sk new file mode 100644 index 0000000..1198d48 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.sk @@ -0,0 +1,474 @@ +Bc +Mgr +RNDr +PharmDr +PhDr +JUDr +PaedDr +ThDr +Ing +MUDr +MDDr +MVDr +Dr +ThLic +PhD +ArtD +ThDr +Dr +DrSc +CSs +prof +obr +Obr +Č +č +absol +adj +admin +adr +Adr +adv +advok +afr +ak +akad +akc +akuz +et +al +alch +amer +anat +angl +Angl +anglosas +anorg +ap +apod +arch +archeol +archit +arg +art +astr +astrol +astron +atp +atď +austr +Austr +aut +belg +Belg +bibl +Bibl +biol +bot +bud +bás +býv +cest +chem +cirk +csl +čs +Čs +dat +dep +det +dial +diaľ +dipl +distrib +dokl +dosl +dopr +dram +duš +dv +dvojčl +dór +ekol +ekon +el +elektr +elektrotech +energet +epic +est +etc +etonym +eufem +európ +Európ +ev +evid +expr +fa +fam +farm +fem +feud +fil +filat +filoz +fi +fon +form +fot +fr +Fr +franc +Franc +fraz +fut +fyz +fyziol +garb +gen +genet +genpor +geod +geogr +geol +geom +germ +gr +Gr +gréc +Gréc +gréckokat +hebr +herald +hist +hlav +hosp +hromad +hud +hypok +ident +i.e +ident +imp +impf +indoeur +inf +inform +instr +int +interj +inšt +inštr +iron +jap +Jap +jaz +jedn +juhoamer +juhových +juhozáp +juž +kanad +Kanad +kanc +kapit +kpt +kart +katastr +knih +kniž +komp +konj +konkr +kozmet +krajč +kresť +kt +kuch +lat +latinskoamer +lek +lex +lingv +lit +litur +log +lok +max +Max +maď +Maď +medzinár +mest +metr +mil +Mil +min +Min +miner +ml +mld +mn +mod +mytol +napr +nar +Nar +nasl +nedok +neg +negat +neklas +nem +Nem +neodb +neos +neskl +nesklon +nespis +nespráv +neved +než +niekt +niž +nom +náb +nákl +námor +nár +obch +obj +obv +obyč +obč +občian +odb +odd +ods +ojed +okr +Okr +opt +opyt +org +os +osob +ot +ovoc +par +part +pejor +pers +pf +Pf +P.f +p.f +pl +Plk +pod +podst +pokl +polit +politol +polygr +pomn +popl +por 
+porad +porov +posch +potrav +použ +poz +pozit +poľ +poľno +poľnohosp +poľov +pošt +pož +prac +predl +pren +prep +preuk +priezv +Priezv +privl +prof +práv +príd +príj +prík +príp +prír +prísl +príslov +príč +psych +publ +pís +písm +pôv +refl +reg +rep +resp +rozk +rozlič +rozpráv +roč +Roč +ryb +rádiotech +rím +samohl +semest +sev +severoamer +severových +severozáp +sg +skr +skup +sl +Sloven +soc +soch +sociol +sp +spol +Spol +spoloč +spoluhl +správ +spôs +st +star +starogréc +starorím +s.r.o +stol +stor +str +stredoamer +stredoškol +subj +subst +superl +sv +sz +súkr +súp +súvzť +tal +Tal +tech +tel +Tel +telef +teles +telev +teol +trans +turist +tuzem +typogr +tzn +tzv +ukaz +ul +Ul +umel +univ +ust +ved +vedľ +verb +veter +vin +viď +vl +vod +vodohosp +pnl +vulg +vyj +vys +vysokoškol +vzťaž +vôb +vých +výd +výrob +výsk +výsl +výtv +výtvar +význ +včel +vš +všeob +zahr +zar +zariad +zast +zastar +zastaráv +zb +zdravot +združ +zjemn +zlat +zn +Zn +zool +zr +zried +zv +záhr +zák +zákl +zám +záp +západoeur +zázn +územ +účt +čast +čes +Čes +čl +čísl +živ +pr +fak +Kr +p.n.l +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/nonbreaking-prefixes/nonbreaking_prefix.sl b/nonbreaking-prefixes/nonbreaking_prefix.sl new file mode 100644 index 0000000..230062c --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.sl @@ -0,0 +1,78 @@ +dr +Dr +itd +itn +št #NUMERIC_ONLY# +Št #NUMERIC_ONLY# +d +jan +Jan +feb +Feb +mar +Mar +apr +Apr +jun +Jun +jul +Jul +avg +Avg +sept +Sept +sep +Sep +okt +Okt +nov +Nov +dec +Dec +tj +Tj +npr +Npr +sl +Sl +op +Op +gl +Gl +oz +Oz +prev +dipl +ing +prim +Prim +cf +Cf +gl +Gl +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/nonbreaking-prefixes/nonbreaking_prefix.sv b/nonbreaking-prefixes/nonbreaking_prefix.sv new file mode 100644 index 0000000..df5ef29 --- /dev/null +++ b/nonbreaking-prefixes/nonbreaking_prefix.sv @@ -0,0 +1,46 @@ +#single upper case letter are 
usually initials +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +#misc abbreviations +AB +G +VG +dvs +etc +from +iaf +jfr +kl +kr +mao +mfl +mm +osv +pga +tex +tom +vs diff --git a/nonbreaking_prefixes/README.txt b/nonbreaking_prefixes/README.txt deleted file mode 100644 index 02cdfcc..0000000 --- a/nonbreaking_prefixes/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -The language suffix can be found here: - -http://www.loc.gov/standards/iso639-2/php/code_list.php - - diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ca b/nonbreaking_prefixes/nonbreaking_prefix.ca deleted file mode 100644 index 2f4fdfc..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.ca +++ /dev/null @@ -1,75 +0,0 @@ -Dr -Dra -pàg -p -c -av -Sr -Sra -adm -esq -Prof -S.A -S.L -p.e -ptes -Sta -St -pl -màx -cast -dir -nre -fra -admdora -Emm -Excma -espf -dc -admdor -tel -angl -aprox -ca -dept -dj -dl -dt -ds -dg -dv -ed -entl -al -i.e -maj -smin -n -núm -pta -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z diff --git a/nonbreaking_prefixes/nonbreaking_prefix.cs b/nonbreaking_prefixes/nonbreaking_prefix.cs deleted file mode 100644 index dce6167..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.cs +++ /dev/null @@ -1,390 +0,0 @@ -Bc -BcA -Ing -Ing.arch -MUDr -MVDr -MgA -Mgr -JUDr -PhDr -RNDr -PharmDr -ThLic -ThDr -Ph.D -Th.D -prof -doc -CSc -DrSc -dr. h. 
c -PaedDr -Dr -PhMr -DiS -abt -ad -a.i -aj -angl -anon -apod -atd -atp -aut -bd -biogr -b.m -b.p -b.r -cca -cit -cizojaz -c.k -col -čes -čín -čj -ed -facs -fasc -fol -fot -franc -h.c -hist -hl -hrsg -ibid -il -ind -inv.č -jap -jhdt -jv -koed -kol -korej -kl -krit -lat -lit -m.a -maď -mj -mp -násl -např -nepubl -něm -no -nr -n.s -okr -odd -odp -obr -opr -orig -phil -pl -pokrač -pol -port -pozn -př.kr -př.n.l -přel -přeprac -příl -pseud -pt -red -repr -resp -revid -rkp -roč -roz -rozš -samost -sect -sest -seš -sign -sl -srv -stol -sv -šk -šk.ro -špan -tab -t.č -tis -tj -tř -tzv -univ -uspoř -vol -vl.jm -vs -vyd -vyobr -zal -zejm -zkr -zprac -zvl -n.p -např -než -MUDr -abl -absol -adj -adv -ak -ak. sl -akt -alch -amer -anat -angl -anglosas -arab -arch -archit -arg -astr -astrol -att -bás -belg -bibl -biol -boh -bot -bulh -círk -csl -č -čas -čes -dat -děj -dep -dět -dial -dór -dopr -dosl -ekon -epic -etnonym -eufem -f -fam -fem -fil -film -form -fot -fr -fut -fyz -gen -geogr -geol -geom -germ -gram -hebr -herald -hist -hl -hovor -hud -hut -chcsl -chem -ie -imp -impf -ind -indoevr -inf -instr -interj -ión -iron -it -kanad -katalán -klas -kniž -komp -konj - -konkr -kř -kuch -lat -lék -les -lid -lit -liturg -lok -log -m -mat -meteor -metr -mod -ms -mysl -n -náb -námoř -neklas -něm -nesklon -nom -ob -obch -obyč -ojed -opt -part -pas -pejor -pers -pf -pl -plpf - -práv -prep -předl -přivl -r -rcsl -refl -reg -rkp -ř -řec -s -samohl -sg -sl -souhl -spec -srov -stfr -střv -stsl -subj -subst -superl -sv -sz -táz -tech -telev -teol -trans -typogr -var -vedl -verb -vl. jm -voj -vok -vůb -vulg -výtv -vztaž -zahr -zájm -zast -zejm - -zeměd -zkr -zř -mj -dl -atp -sport -Mgr -horn -MVDr -JUDr -RSDr -Bc -PhDr -ThDr -Ing -aj -apod -PharmDr -pomn -ev -slang -nprap -odp -dop -pol -st -stol -p. n. l -před n. l -n. l -př. Kr -po Kr -př. n. l -odd -RNDr -tzv -atd -tzn -resp -tj -p -br -č. j -čj -č. p -čp -a. s -s. r. o -spol. s r. o -p. o -s. p -v. o. s -k. s -o. p. s -o. s -v. 
r -v z -ml -vč -kr -mld -hod -popř -ap -event -rus -slov -rum -švýc -P. T -zvl -hor -dol -S.O.S \ No newline at end of file diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking_prefixes/nonbreaking_prefix.de deleted file mode 100644 index 35fdf5e..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.de +++ /dev/null @@ -1,325 +0,0 @@ -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. - -#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) -#usually upper case letters are initials in a name -#no german words end in single lower-case letters, so we throw those in too. -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z - - -#Roman Numerals. A dot after one of these is not a sentence break in German. 
-I -II -III -IV -V -VI -VII -VIII -IX -X -XI -XII -XIII -XIV -XV -XVI -XVII -XVIII -XIX -XX -i -ii -iii -iv -v -vi -vii -viii -ix -x -xi -xii -xiii -xiv -xv -xvi -xvii -xviii -xix -xx - -#Titles and Honorifics -Adj -Adm -Adv -Asst -Bart -Bldg -Brig -Bros -Capt -Cmdr -Col -Comdr -Con -Corp -Cpl -DR -Dr -Ens -Gen -Gov -Hon -Hosp -Insp -Lt -MM -MR -MRS -MS -Maj -Messrs -Mlle -Mme -Mr -Mrs -Ms -Msgr -Op -Ord -Pfc -Ph -Prof -Pvt -Rep -Reps -Res -Rev -Rt -Sen -Sens -Sfc -Sgt -Sr -St -Supt -Surg - -#Misc symbols -Mio -Mrd -bzw -v -vs -usw -d.h -z.B -u.a -etc -Mrd -MwSt -ggf -d.J -D.h -m.E -vgl -I.F -z.T -sogen -ff -u.E -g.U -g.g.A -c.-à-d -Buchst -u.s.w -sog -u.ä -Std -evtl -Zt -Chr -u.U -o.ä -Ltd -b.A -z.Zt -spp -sen -SA -k.o -jun -i.H.v -dgl -dergl -Co -zzt -usf -s.p.a -Dkr -Corp -bzgl -BSE - -#Number indicators -# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it -No -Nos -Art -Nr -pp -ca -Ca - -#Ordinals are done with . in German - "1." = "1st" in English -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -96 -97 -98 -99 diff --git a/nonbreaking_prefixes/nonbreaking_prefix.el b/nonbreaking_prefixes/nonbreaking_prefix.el deleted file mode 100644 index 0470f91..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.el +++ /dev/null @@ -1,2 +0,0 @@ -# for now, just include the Greek equivalent of "Mr." 
-κ diff --git a/nonbreaking_prefixes/nonbreaking_prefix.en b/nonbreaking_prefixes/nonbreaking_prefix.en deleted file mode 100644 index e1a3733..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.en +++ /dev/null @@ -1,107 +0,0 @@ -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. - -#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) -#usually upper case letters are initials in a name -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z - -#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks -Adj -Adm -Adv -Asst -Bart -Bldg -Brig -Bros -Capt -Cmdr -Col -Comdr -Con -Corp -Cpl -DR -Dr -Drs -Ens -Gen -Gov -Hon -Hr -Hosp -Insp -Lt -MM -MR -MRS -MS -Maj -Messrs -Mlle -Mme -Mr -Mrs -Ms -Msgr -Op -Ord -Pfc -Ph -Prof -Pvt -Rep -Reps -Res -Rev -Rt -Sen -Sens -Sfc -Sgt -Sr -St -Supt -Surg - -#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) -v -vs -i.e -rev -e.g - -#Numbers only. These should only induce breaks when followed by a numeric sequence -# add NUMERIC_ONLY after the word for this function -#This case is mostly for the english "No." which can either be a sentence of its own, or -#if followed by a number, a non-breaking prefix -No #NUMERIC_ONLY# -Nos -Art #NUMERIC_ONLY# -Nr -pp #NUMERIC_ONLY# diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking_prefixes/nonbreaking_prefix.es deleted file mode 100644 index d8b2755..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.es +++ /dev/null @@ -1,118 +0,0 @@ -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers. - -#any single upper case letter followed by a period is not a sentence ender -#usually upper case letters are initials in a name -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z - -# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm - -A.C -Apdo -Av -Bco -CC.AA -Da -Dep -Dn -Dr -Dra -EE.UU -Excmo -FF.CC -Fil -Gral -J.C -Let -Lic -N.B -P.D -P.V.P -Prof -Pts -Rte -S.A -S.A.R -S.E -S.L -S.R.C -Sr -Sra -Srta -Sta -Sto -T.V.E -Tel -Ud -Uds -V.B -V.E -Vd -Vds -a/c -adj -admón -afmo -apdo -av -c -c.f -c.g -cap -cm -cta -dcha -doc -ej -entlo -esq -etc -f.c -gr -grs -izq -kg -km -mg -mm -núm -núm -p -p.a -p.ej -ptas -pág -págs -pág -págs -q.e.g.e -q.e.s.m -s -s.s.s -vid -vol diff --git a/nonbreaking_prefixes/nonbreaking_prefix.fr b/nonbreaking_prefixes/nonbreaking_prefix.fr deleted file mode 100644 index 28126fa..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.fr +++ /dev/null @@ -1,153 +0,0 @@ -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. -# -#any single upper case letter followed by a period is not a sentence ender -#usually upper case letters are initials in a name -#no French words end in single lower-case letters, so we throw those in too? 
-A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z - -# Period-final abbreviation list for French -A.C.N -A.M -art -ann -apr -av -auj -lib -B.P -boul -ca -c.-à-d -cf -ch.-l -chap -contr -C.P.I -C.Q.F.D -C.N -C.N.S -C.S -dir -éd -e.g -env -al -etc -E.V -ex -fasc -fém -fig -fr -hab -ibid -id -i.e -inf -LL.AA -LL.AA.II -LL.AA.RR -LL.AA.SS -L.D -LL.EE -LL.MM -LL.MM.II.RR -loc.cit -masc -MM -ms -N.B -N.D.A -N.D.L.R -N.D.T -n/réf -NN.SS -N.S -N.D -N.P.A.I -p.c.c -pl -pp -p.ex -p.j -P.S -R.A.S -R.-V -R.P -R.I.P -SS -S.S -S.A -S.A.I -S.A.R -S.A.S -S.E -sec -sect -sing -S.M -S.M.I.R -sq -sqq -suiv -sup -suppl -tél -T.S.V.P -vb -vol -vs -X.O -Z.I diff --git a/nonbreaking_prefixes/nonbreaking_prefix.is b/nonbreaking_prefixes/nonbreaking_prefix.is deleted file mode 100644 index 5b8a710..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.is +++ /dev/null @@ -1,251 +0,0 @@ -no #NUMERIC_ONLY# -No #NUMERIC_ONLY# -nr #NUMERIC_ONLY# -Nr #NUMERIC_ONLY# -nR #NUMERIC_ONLY# -NR #NUMERIC_ONLY# -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z -^ -í -á -ó -æ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -ab.fn -a.fn -afs -al -alm -alg -andh -ath -aths -atr -ao -au -aukaf -áfn -áhrl.s -áhrs -ákv.gr -ákv -bh -bls -dr -e.Kr -et -ef -efn -ennfr -eink -end -e.st -erl -fél -fskj -fh -f.hl -físl -fl -fn -fo -forl -frb -frl -frh -frt -fsl -fsh -fs -fsk -fst -f.Kr -ft -fv -fyrrn -fyrrv -germ -gm -gr -hdl -hdr -hf -hl -hlsk -hljsk -hljv -hljóðv -hr -hv -hvk -holl -Hos -höf -hk -hrl -ísl -kaf -kap -Khöfn -kk -kg -kk -km -kl -klst -kr -kt -kgúrsk -kvk -leturbr -lh -lh.nt -lh.þt -lo -ltr -mlja -mljó -millj -mm -mms -m.fl -miðm -mgr -mst -mín -nf -nh -nhm -nl -nk -nmgr -no -núv -nt -o.áfr -o.m.fl -ohf -o.fl -o.s.frv -ófn -ób -óákv.gr -óákv -pfn -PR -pr -Ritstj -Rvík -Rvk -samb -samhlj -samn -samn -sbr -sek -sérn -sf -sfn -sh -sfn -sh 
-s.hl -sk -skv -sl -sn -so -ss.us -s.st -samþ -sbr -shlj -sign -skál -st -st.s -stk -sþ -teg -tbl -tfn -tl -tvíhlj -tvt -till -to -umr -uh -us -uppl -útg -vb -Vf -vh -vkf -Vl -vl -vlf -vmf -8vo -vsk -vth -þt -þf -þjs -þgf -þlt -þolm -þm -þml -þýð diff --git a/nonbreaking_prefixes/nonbreaking_prefix.it b/nonbreaking_prefixes/nonbreaking_prefix.it deleted file mode 100644 index 992b9ec..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.it +++ /dev/null @@ -1,180 +0,0 @@ -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. - -#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) -#usually upper case letters are initials in a name -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z - -#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks -Adj -Adm -Adv -Amn -Arch -Asst -Avv -Bart -Bcc -Bldg -Brig -Bros -C.A.P -C.P -Capt -Cc -Cmdr -Co -Col -Comdr -Con -Corp -Cpl -DR -Dott -Dr -Drs -Egr -Ens -Gen -Geom -Gov -Hon -Hosp -Hr -Id -Ing -Insp -Lt -MM -MR -MRS -MS -Maj -Messrs -Mlle -Mme -Mo -Mons -Mr -Mrs -Ms -Msgr -N.B -Op -Ord -P.S -P.T -Pfc -Ph -Prof -Pvt -RP -RSVP -Rag -Rep -Reps -Res -Rev -Rif -Rt -S.A -S.B.F -S.P.M -S.p.A -S.r.l -Sen -Sens -Sfc -Sgt -Sig -Sigg -Soc -Spett -Sr -St -Supt -Surg -V.P - -# other -a.c -acc -all -banc -c.a -c.c.p -c.m -c.p -c.s -c.v -corr -dott -e.p.c -ecc -es -fatt -gg -int -lett -ogg -on -p.c -p.c.c -p.es -p.f -p.r -p.v -post -pp -racc -ric -s.n.c -seg -sgg -ss -tel -u.s -v.r -v.s - -#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) -v -vs -i.e -rev -e.g - -#Numbers only. 
These should only induce breaks when followed by a numeric sequence -# add NUMERIC_ONLY after the word for this function -#This case is mostly for the english "No." which can either be a sentence of its own, or -#if followed by a number, a non-breaking prefix -No #NUMERIC_ONLY# -Nos -Art #NUMERIC_ONLY# -Nr -pp #NUMERIC_ONLY# diff --git a/nonbreaking_prefixes/nonbreaking_prefix.nl b/nonbreaking_prefixes/nonbreaking_prefix.nl deleted file mode 100644 index c80c417..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.nl +++ /dev/null @@ -1,115 +0,0 @@ -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. -#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen -# http://nl.wikipedia.org/wiki/Aanspreekvorm -# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs -#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) -#usually upper case letters are initials in a name -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z - -#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks -bacc -bc -bgen -c.i -dhr -dr -dr.h.c -drs -drs -ds -eint -fa -Fa -fam -gen -genm -ing -ir -jhr -jkvr -jr -kand -kol -lgen -lkol -Lt -maj -Mej -mevr -Mme -mr -mr -Mw -o.b.s -plv -prof -ritm -tint -Vz -Z.D -Z.D.H -Z.E -Z.Em -Z.H -Z.K.H -Z.K.M -Z.M -z.v - -#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) -#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence -a.g.v -bijv -bijz -bv -d.w.z -e.c -e.g -e.k -ev -i.p.v -i.s.m -i.t.t -i.v.m -m.a.w -m.b.t -m.b.v -m.h.o -m.i -m.i.v -v.w.t - -#Numbers only. 
These should only induce breaks when followed by a numeric sequence -# add NUMERIC_ONLY after the word for this function -#This case is mostly for the english "No." which can either be a sentence of its own, or -#if followed by a number, a non-breaking prefix -Nr #NUMERIC_ONLY# -Nrs -nrs -nr #NUMERIC_ONLY# diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pl b/nonbreaking_prefixes/nonbreaking_prefix.pl deleted file mode 100644 index 6b7c106..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.pl +++ /dev/null @@ -1,283 +0,0 @@ -adw -afr -akad -al -Al -am -amer -arch -art -Art -artyst -astr -austr -bałt -bdb -bł -bm -br -bryg -bryt -centr -ces -chem -chiń -chir -c.k -c.o -cyg -cyw -cyt -czes -czw -cd -Cd -czyt -ćw -ćwicz -daw -dcn -dekl -demokr -det -diec -dł -dn -dot -dol -dop -dost -dosł -h.c -ds -dst -duszp -dypl -egz -ekol -ekon -elektr -em -ew -fab -farm -fot -fr -gat -gastr -geogr -geol -gimn -głęb -gm -godz -górn -gosp -gr -gram -hist -hiszp -hr -Hr -hot -id -in -im -iron -jn -kard -kat -katol -k.k -kk -kol -kl -k.p.a -kpc -k.p.c -kpt -kr -k.r -krak -k.r.o -kryt -kult -laic -łac -niem -woj -nb -np -Nb -Np -pol -pow -m.in -pt -ps -Pt -Ps -cdn -jw -ryc -rys -Ryc -Rys -tj -tzw -Tzw -tzn -zob -ang -ub -ul -pw -pn -pl -al -k -n -nr #NUMERIC_ONLY# -Nr #NUMERIC_ONLY# -ww -wł -ur -zm -żyd -żarg -żyw -wył -bp -bp -wyst -tow -Tow -o -sp -Sp -st -spółdz -Spółdz -społ -spółgł -stoł -stow -Stoł -Stow -zn -zew -zewn -zdr -zazw -zast -zaw -zał -zal -zam -zak -zakł -zagr -zach -adw -Adw -lek -Lek -med -mec -Mec -doc -Doc -dyw -dyr -Dyw -Dyr -inż -Inż -mgr -Mgr -dh -dr -Dh -Dr -p -P -red -Red -prof -prok -Prof -Prok -hab -płk -Płk -nadkom -Nadkom -podkom -Podkom -ks -Ks -gen -Gen -por -Por -reż -Reż -przyp -Przyp -śp -św -śW -Śp -Św -ŚW -szer -Szer -pkt #NUMERIC_ONLY# -str #NUMERIC_ONLY# -tab #NUMERIC_ONLY# -Tab #NUMERIC_ONLY# -tel -ust #NUMERIC_ONLY# -par #NUMERIC_ONLY# -poz -pok -oo -oO -Oo -OO -r #NUMERIC_ONLY# -l #NUMERIC_ONLY# -s #NUMERIC_ONLY# -najśw -Najśw -A -B 
-C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -Ś -Ć -Ż -Ź -Dz diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pt b/nonbreaking_prefixes/nonbreaking_prefix.pt deleted file mode 100644 index 5d65bf2..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.pt +++ /dev/null @@ -1,210 +0,0 @@ -#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. -#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. - -#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) -#usually upper case letters are initials in a name -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -a -b -c -d -e -f -g -h -i -j -k -l -m -n -o -p -q -r -s -t -u -v -w -x -y -z - - -#Roman Numerals. A dot after one of these is not a sentence break in Portuguese. -I -II -III -IV -V -VI -VII -VIII -IX -X -XI -XII -XIII -XIV -XV -XVI -XVII -XVIII -XIX -XX -i -ii -iii -iv -v -vi -vii -viii -ix -x -xi -xii -xiii -xiv -xv -xvi -xvii -xviii -xix -xx - -#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks -Adj -Adm -Adv -Art -Ca -Capt -Cmdr -Col -Comdr -Con -Corp -Cpl -DR -DRA -Dr -Dra -Dras -Drs -Eng -Enga -Engas -Engos -Ex -Exo -Exmo -Fig -Gen -Hosp -Insp -Lda -MM -MR -MRS -MS -Maj -Mrs -Ms -Msgr -Op -Ord -Pfc -Ph -Prof -Pvt -Rep -Reps -Res -Rev -Rt -Sen -Sens -Sfc -Sgt -Sr -Sra -Sras -Srs -Sto -Supt -Surg -adj -adm -adv -art -cit -col -con -corp -cpl -dr -dra -dras -drs -eng -enga -engas -engos -ex -exo -exmo -fig -op -prof -sr -sra -sras -srs -sto - -#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) -v -vs -i.e -rev -e.g - -#Numbers only. 
These should only induce breaks when followed by a numeric sequence -# add NUMERIC_ONLY after the word for this function -#This case is mostly for the english "No." which can either be a sentence of its own, or -#if followed by a number, a non-breaking prefix -No #NUMERIC_ONLY# -Nos -Art #NUMERIC_ONLY# -Nr -p #NUMERIC_ONLY# -pp #NUMERIC_ONLY# - diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ro b/nonbreaking_prefixes/nonbreaking_prefix.ro deleted file mode 100644 index d489f46..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.ro +++ /dev/null @@ -1,38 +0,0 @@ -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -dpdv -etc -șamd -M.Ap.N -dl -Dl -d-na -D-na -dvs -Dvs -pt -Pt diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ru b/nonbreaking_prefixes/nonbreaking_prefix.ru deleted file mode 100644 index 444465b..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.ru +++ /dev/null @@ -1,259 +0,0 @@ -TBD: Russian uppercase alphabet [А-Я] -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -0гг -1гг -2гг -3гг -4гг -5гг -6гг -7гг -8гг -9гг -0г -1г -2г -3г -4г -5г -6г -7г -8г -9г -Xвв -Vвв -Iвв -Lвв -Mвв -Cвв -Xв -Vв -Iв -Lв -Mв -Cв -0м -1м -2м -3м -4м -5м -6м -7м -8м -9м -0мм -1мм -2мм -3мм -4мм -5мм -6мм -7мм -8мм -9мм -0см -1см -2см -3см -4см -5см -6см -7см -8см -9см -0дм -1дм -2дм -3дм -4дм -5дм -6дм -7дм -8дм -9дм -0л -1л -2л -3л -4л -5л -6л -7л -8л -9л -0км -1км -2км -3км -4км -5км -6км -7км -8км -9км -0га -1га -2га -3га -4га -5га -6га -7га -8га -9га -0кг -1кг -2кг -3кг -4кг -5кг -6кг -7кг -8кг -9кг -0т -1т -2т -3т -4т -5т -6т -7т -8т -9т -0г -1г -2г -3г -4г -5г -6г -7г -8г -9г -0мг -1мг -2мг -3мг -4мг -5мг -6мг -7мг -8мг -9мг -бульв -в -вв -г -га -гг -гл -гос -д -дм -доп -др -е -ед -ед -зам -и -инд -исп -Исп -к -кап -кг -кв -кл -км -кол -комн -коп -куб -л -лиц -лл -м -макс -мг -мин -мл -млн -млрд -мм -н -наб -нач -неуд -ном -о -обл -обр -общ -ок -ост -отл -п -пер -перераб -пл -пос -пр -просп -проф -р 
-ред -руб -с -сб -св -см -соч -ср -ст -стр -т -тел -Тел -тех -тт -туп -тыс -уд -ул -уч -физ -х -хор -ч -чел -шт -экз -э diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sk b/nonbreaking_prefixes/nonbreaking_prefix.sk deleted file mode 100644 index 1198d48..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.sk +++ /dev/null @@ -1,474 +0,0 @@ -Bc -Mgr -RNDr -PharmDr -PhDr -JUDr -PaedDr -ThDr -Ing -MUDr -MDDr -MVDr -Dr -ThLic -PhD -ArtD -ThDr -Dr -DrSc -CSs -prof -obr -Obr -Č -č -absol -adj -admin -adr -Adr -adv -advok -afr -ak -akad -akc -akuz -et -al -alch -amer -anat -angl -Angl -anglosas -anorg -ap -apod -arch -archeol -archit -arg -art -astr -astrol -astron -atp -atď -austr -Austr -aut -belg -Belg -bibl -Bibl -biol -bot -bud -bás -býv -cest -chem -cirk -csl -čs -Čs -dat -dep -det -dial -diaľ -dipl -distrib -dokl -dosl -dopr -dram -duš -dv -dvojčl -dór -ekol -ekon -el -elektr -elektrotech -energet -epic -est -etc -etonym -eufem -európ -Európ -ev -evid -expr -fa -fam -farm -fem -feud -fil -filat -filoz -fi -fon -form -fot -fr -Fr -franc -Franc -fraz -fut -fyz -fyziol -garb -gen -genet -genpor -geod -geogr -geol -geom -germ -gr -Gr -gréc -Gréc -gréckokat -hebr -herald -hist -hlav -hosp -hromad -hud -hypok -ident -i.e -ident -imp -impf -indoeur -inf -inform -instr -int -interj -inšt -inštr -iron -jap -Jap -jaz -jedn -juhoamer -juhových -juhozáp -juž -kanad -Kanad -kanc -kapit -kpt -kart -katastr -knih -kniž -komp -konj -konkr -kozmet -krajč -kresť -kt -kuch -lat -latinskoamer -lek -lex -lingv -lit -litur -log -lok -max -Max -maď -Maď -medzinár -mest -metr -mil -Mil -min -Min -miner -ml -mld -mn -mod -mytol -napr -nar -Nar -nasl -nedok -neg -negat -neklas -nem -Nem -neodb -neos -neskl -nesklon -nespis -nespráv -neved -než -niekt -niž -nom -náb -nákl -námor -nár -obch -obj -obv -obyč -obč -občian -odb -odd -ods -ojed -okr -Okr -opt -opyt -org -os -osob -ot -ovoc -par -part -pejor -pers -pf -Pf -P.f -p.f -pl -Plk -pod -podst -pokl -polit -politol -polygr -pomn 
-popl -por -porad -porov -posch -potrav -použ -poz -pozit -poľ -poľno -poľnohosp -poľov -pošt -pož -prac -predl -pren -prep -preuk -priezv -Priezv -privl -prof -práv -príd -príj -prík -príp -prír -prísl -príslov -príč -psych -publ -pís -písm -pôv -refl -reg -rep -resp -rozk -rozlič -rozpráv -roč -Roč -ryb -rádiotech -rím -samohl -semest -sev -severoamer -severových -severozáp -sg -skr -skup -sl -Sloven -soc -soch -sociol -sp -spol -Spol -spoloč -spoluhl -správ -spôs -st -star -starogréc -starorím -s.r.o -stol -stor -str -stredoamer -stredoškol -subj -subst -superl -sv -sz -súkr -súp -súvzť -tal -Tal -tech -tel -Tel -telef -teles -telev -teol -trans -turist -tuzem -typogr -tzn -tzv -ukaz -ul -Ul -umel -univ -ust -ved -vedľ -verb -veter -vin -viď -vl -vod -vodohosp -pnl -vulg -vyj -vys -vysokoškol -vzťaž -vôb -vých -výd -výrob -výsk -výsl -výtv -výtvar -význ -včel -vš -všeob -zahr -zar -zariad -zast -zastar -zastaráv -zb -zdravot -združ -zjemn -zlat -zn -Zn -zool -zr -zried -zv -záhr -zák -zákl -zám -záp -západoeur -zázn -územ -účt -čast -čes -Čes -čl -čísl -živ -pr -fak -Kr -p.n.l -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sl b/nonbreaking_prefixes/nonbreaking_prefix.sl deleted file mode 100644 index 230062c..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.sl +++ /dev/null @@ -1,78 +0,0 @@ -dr -Dr -itd -itn -št #NUMERIC_ONLY# -Št #NUMERIC_ONLY# -d -jan -Jan -feb -Feb -mar -Mar -apr -Apr -jun -Jun -jul -Jul -avg -Avg -sept -Sept -sep -Sep -okt -Okt -nov -Nov -dec -Dec -tj -Tj -npr -Npr -sl -Sl -op -Op -gl -Gl -oz -Oz -prev -dipl -ing -prim -Prim -cf -Cf -gl -Gl -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sv b/nonbreaking_prefixes/nonbreaking_prefix.sv deleted file mode 100644 index df5ef29..0000000 --- a/nonbreaking_prefixes/nonbreaking_prefix.sv +++ /dev/null @@ -1,46 +0,0 @@ -#single 
upper case letter are usually initials -A -B -C -D -E -F -G -H -I -J -K -L -M -N -O -P -Q -R -S -T -U -V -W -X -Y -Z -#misc abbreviations -AB -G -VG -dvs -etc -from -iaf -jfr -kl -kr -mao -mfl -mm -osv -pga -tex -tom -vs diff --git a/norm b/norm index 5573c37..3313216 100755 --- a/norm +++ b/norm @@ -5,4 +5,3 @@ sum=$(cat $1 | sum) for i in `cat $1`; do echo "$i" | div $sum done - diff --git a/norm-german b/norm-german index 85a39da..5c41f98 100755 --- a/norm-german +++ b/norm-german @@ -1,23 +1,23 @@ #!/usr/bin/env ruby -require 'thread' -require 'optimist' +require "thread" +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" conf = Optimist::options do banner "norm_german < " opt :upper, "uppercase", :type => :bool, :default => false - opt :threads, "#threads", :type => :int, :default => 1, :short => '-h' + opt :threads, "#threads", :type => :int, :default => 1, :short => "-h" opt :shard_size, "shard size", :type => :int, :default => 1000 opt :train, "train", :type => :bool opt :apply, "apply", :type => :bool end -pairs_lower = [ ['ß','ss'], ['ue', 'ü'], ['ae','ä'], ['oe', 'ö'] ] -pairs_upper = [ ['Ä', 'Ae'], ['Ö', 'Oe'], ['Ü', 'Ue'] ] +pairs_lower = [ ["ß","ss"], ["ue", "ü"], ["ae","ä"], ["oe", "ö"] ] +pairs_upper = [ ["Ä", "Ae"], ["Ö", "Oe"], ["Ü", "Ue"] ] if conf[:upper] PAIRS = pairs_lower else @@ -84,4 +84,3 @@ token_stock.each { |i| h.merge! 
build_partial i end } - diff --git a/norm-hyphens b/norm-hyphens index 4a152a1..6491d13 100755 --- a/norm-hyphens +++ b/norm-hyphens @@ -1,4 +1,3 @@ -#!/bin/zsh -x +#!/bin/zsh sed "s|[ \t]\+\xc2\xad[ \t]\+||g" - diff --git a/normchr b/normchr index f8e5798..02c6ce8 100755 --- a/normchr +++ b/normchr @@ -3,10 +3,10 @@ # http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal # https://www.cs.tut.fi/~jkorpela/chars/spaces.html -require 'htmlentities' +require "htmlentities" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" coder = HTMLEntities.new @@ -24,7 +24,7 @@ while line = STDIN.gets line.gsub! /[\u{e000}-\u{f8ff}]/, " " # UTF-8 PUA line.gsub! /[\u{f0000}-\u{ffffd}]/, " " line.gsub! /[\u{100000}-\u{10fffd}]/, " " - line.gsub! "\r", " " # carriage return + line.gsub! "\r", " " # carriage return line.gsub! /[\u{2000}-\u{200f}]/, " " # EN QUAD -- RIGHT-TO-LEFT MARK line.gsub! /[\u{2028}-\u{202f}]/, " " # LINE SEPARATOR -- NARROW NO-BREAK SPACE line.gsub! /[\u{205f}-\u{206f}]/, " " # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES @@ -32,4 +32,3 @@ while line = STDIN.gets line.gsub! 
/[[:space:]]+/, " " # collapse space puts coder.decode(line) end - diff --git a/num-tok b/num-tok index 56cbae9..0c95aa8 100755 --- a/num-tok +++ b/num-tok @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets puts line.strip.split.length end - diff --git a/odd b/odd index 0bd9336..ced2861 100755 --- a/odd +++ b/odd @@ -1,11 +1,10 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" i = 1 while line = STDIN.gets puts line if i%2!=0 i+=1 end - diff --git a/overlap b/overlap index 81f9c4b..95d27a3 100755 --- a/overlap +++ b/overlap @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = {} a.default = 0 @@ -11,10 +11,9 @@ ReadFile.readlines_strip(ARGV[1]).map { |segment| b[segment] += 1 } overlap = 0 a.each_key { |seg| - puts b[seg] - overlap = overlap+b[seg] + puts b[seg] + overlap = overlap+b[seg] } puts "---" puts overlap - diff --git a/paste-pairs b/paste-pairs index f6b8b31..7e08329 100755 --- a/paste-pairs +++ b/paste-pairs @@ -1,10 +1,8 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import sys -from itertools import izip - -for linenr, (src_line, tgt_line) in enumerate(izip(open(sys.argv[1]), open(sys.argv[2]))): - print linenr, (src_line.strip()) - print linenr, (tgt_line.strip()) - print +for linenr, (src_line, tgt_line) in enumerate(zip(open(sys.argv[1]), open(sys.argv[2]))): + print(linenr, src_line.strip()) + print(linenr, tgt_line.strip()) + print() diff --git a/per-sentence-bleu b/per-sentence-bleu index 257eb3a..d815dc9 100755 --- a/per-sentence-bleu +++ b/per-sentence-bleu @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do - opt :input, "input", :type => :string, :default => '-' + opt :input, "input", :type => :string, :default => "-" opt 
:references, "references", :type => :string, :required => true opt :len_hack, "hack of Nakov et al", :type => :int, :default => 0 opt :n, "N", :default => 4 @@ -16,7 +16,7 @@ def main input = ReadFile.new conf[:input] while line = input.gets i += 1 - if line.strip == '' + if line.strip == "" puts 0.0 next end @@ -26,4 +26,3 @@ def main end main - diff --git a/per-sentence-bleu-kbest b/per-sentence-bleu-kbest index dad1607..12a9f6f 100755 --- a/per-sentence-bleu-kbest +++ b/per-sentence-bleu-kbest @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do - opt :kbests, "kbests", :type => :string, :default => '-' + opt :kbests, "kbests", :type => :string, :default => "-" opt :references, "references", :type => :string, :required => true end refs = ReadFile.new conf[:references] @@ -19,7 +19,7 @@ def main scores.each_with_index { |x,j| puts "#{j+1} ||| #{scores[j]} ||| #{list[j]}" if scores[j]==max && !o - puts "^^^ #{j+1} #{max}" + puts "^^^ #{j+1} #{max}" o = true end } @@ -29,4 +29,3 @@ def main end main - diff --git a/per-sentence-ter b/per-sentence-ter index 1a7670e..777d39c 100755 --- a/per-sentence-ter +++ b/per-sentence-ter @@ -1,14 +1,14 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' -require 'tempfile' +require "zipf" +require "optimist" +require "tempfile" def main conf = Optimist::options do - opt :input, "input", :type => :string, :default => '-' + opt :input, "input", :type => :string, :default => "-" opt :references, "references", :type => :string, :required => true - opt :mteval_bin, "cdec's mteval/fast_score", :type => :string, :default => '`/toolbox/cdec-dtrain/mteval/fast_score' + opt :mteval_bin, "cdec mteval/fast_score", :type => :string, :default => "`/toolbox/cdec-dtrain/mteval/fast_score" end refs = ReadFile.readlines_strip conf[:references] @@ -17,8 +17,8 @@ def main while line = input.gets line.strip! 
i += 1 - a = Tempfile.new 'pster' - b = Tempfile.new 'pster' + a = Tempfile.new "pster" + b = Tempfile.new "pster" a.write line+"\n" b.write refs[i]+"\n" a.close; b.close @@ -30,4 +30,3 @@ def main end main - diff --git a/percentile b/percentile index ba9ceb0..ec42a9a 100755 --- a/percentile +++ b/percentile @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" data = [] while line = STDIN.gets @@ -18,4 +18,3 @@ if index.to_i == index else puts (data[index.to_i-1] + data[index.to_i]) / 2.0 end - diff --git a/pot b/pot index 24acabe..b703bca 100755 --- a/pot +++ b/pot @@ -4,4 +4,3 @@ pow = ARGV[0].to_f while line = STDIN.gets puts line.to_f**pow end - diff --git a/preprocess b/preprocess index a46b0a8..91de3bb 100755 --- a/preprocess +++ b/preprocess @@ -1,9 +1,8 @@ #!/bin/bash -pushd `dirname $0` > /dev/null -P=`pwd -P` +pushd "$(dirname "$0")" > /dev/null +P="$(pwd -P)" popd > /dev/null LANG=$1 $P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | $P/lowercase.perl 2>lowercase.$LANG.err - diff --git a/preprocess-no-lower b/preprocess-no-lower index afd87e9..7e3ad91 100755 --- a/preprocess-no-lower +++ b/preprocess-no-lower @@ -1,9 +1,8 @@ #!/bin/bash -pushd `dirname $0` > /dev/null -P=`pwd -P` +pushd "$(dirname "$0")" > /dev/null +P="$(pwd -P)" popd > /dev/null LANG=$1 $P/no-non-printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize-punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err - diff --git a/pt-bloom b/pt-bloom index 35234f1..b38939d 100755 --- a/pt-bloom +++ b/pt-bloom @@ -1,10 +1,10 @@ #!/usr/bin/env ruby -require 'bloom-filter' -require 'optimist' +require "bloom-filter" +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' 
+STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" conf = Optimist::options do opt :size, "number of entries in the filter", :type => :int, :required => true @@ -19,6 +19,5 @@ while line = STDIN.gets f.insert(src+" ||| "+tgt) end -f.dump('pt.bloom') +f.dump("pt.bloom") f.close - diff --git a/push-rules b/push-rules index c97ab80..d0a4de7 100755 --- a/push-rules +++ b/push-rules @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" a = ReadFile.readlines_strip ARGV[0] h = {} @@ -21,4 +21,3 @@ while line = STDIN.gets puts line end end - diff --git a/remove-devtest b/remove-devtest index 8e026f9..f322a6e 100755 --- a/remove-devtest +++ b/remove-devtest @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" train_src = ReadFile.new ARGV[0] train_tgt = ReadFile.new ARGV[1] @@ -39,7 +39,7 @@ while line_src = train_src.gets line_src_downcase = line_src line_tgt_downcase = line_tgt end - + if not devtest_h_src.has_key? line_src_downcase and not devtest_h_src.has_key? line_tgt_downcase \ and not devtest_h_tgt.has_key? line_src_downcase and not devtest_h_tgt.has_key? line_tgt_downcase train_src_out.write line_src diff --git a/remove-test-from-bitext b/remove-test-from-bitext index 43038d3..911a893 100755 --- a/remove-test-from-bitext +++ b/remove-test-from-bitext @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" test_source = ReadFile.new ARGV[0] test_target = ReadFile.new ARGV[1] @@ -13,7 +13,7 @@ while test_source_line = test_source.gets test_source_line.strip! test_target_line = test_target.gets test_target_line.strip! 
- + all_test_source_lines[test_source_line] = true all_test_target_lines[test_target_line] = true end diff --git a/repetition-rate b/repetition-rate index 87938ae..12e0fab 100755 --- a/repetition-rate +++ b/repetition-rate @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" windows = [] cur = [] @@ -9,7 +9,7 @@ while line = STDIN.gets if cur_sz >= 1000 windows << cur cur = [] - cur_sz = 0 + cur_sz = 0 end cur << line.strip cur_sz += cur.last.split.size @@ -37,8 +37,7 @@ windows.each { |w| rr = 1.0 enums.each_with_index { |i,j| - rr *= i/denoms[j] + rr *= i/denoms[j] } puts ((rr**0.25)*100).round 2 - diff --git a/round b/round index dfef800..55919d7 100755 --- a/round +++ b/round @@ -4,4 +4,3 @@ r = ARGV[0].to_i while line = STDIN.gets puts line.to_f.round r end - diff --git a/rule-shapes b/rule-shapes index 589a670..91f8092 100755 --- a/rule-shapes +++ b/rule-shapes @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" def shape s res = [] @@ -24,6 +24,5 @@ end while line = STDIN.gets f, e = line.split(/\t/) f.strip!; e.strip! 
- puts shape(f).join('_')+"-"+shape(e).join('_') + puts shape(f).join("_")+"-"+shape(e).join("_") end - diff --git a/sample b/sample index aa46ddb..dcef148 100755 --- a/sample +++ b/sample @@ -1,15 +1,15 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" opts = Optimist::options do banner "sample --size [--shuffle] --file " opt :size, "Sample P % or # lines from file or N.", :type => :float opt :shuffle, "Sample is shuffled.", :type => :bool - opt :file, "Input file.", :type => :string, :default => '-' + opt :file, "Input file.", :type => :string, :default => "-" opt :output_index, "Output index number.", :type => :bool opt :N, "Sample --size from N items.", :type => :int, :default => -1 opt :absolute, "Sample absolute number of items.", :type => :bool @@ -19,10 +19,10 @@ input = [] index = [] i = 0 if opts[:N] == -1 - if opts[:file] == '-' + if opts[:file] == "-" file = STDIN else - file = File.new opts[:file], 'r' + file = File.new opts[:file], "r" end while line = file.gets input << line @@ -36,7 +36,6 @@ end sample = [] if !opts[:absolute] sample = index.sample(index.size*(opts[:size]/100.0)) - sample = index.sample(index.size*(opts[:size]/100.0)) else sample = index.sample(opts[:size]) end @@ -56,4 +55,3 @@ while idx = sample.shift end end end - diff --git a/select b/select index 36e4256..2c5616a 100755 --- a/select +++ b/select @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'optimist' -require 'zipf' +require "optimist" +require "zipf" opts = Optimist::options do banner "sample --index [--shuffle] [--file ]" - opt :file, "Input file.", :type => :string, :default => '-' + opt :file, "Input file.", :type => :string, :default => "-" opt :index, "Index file.", :type => :string, :required => true end @@ -15,4 +15,3 @@ index = ReadFile.readlines_strip(opts[:index]).map{ |i| i.to_i } index.each { |i| puts input[i] } - diff --git a/select-from 
b/select-from index 0ccfeac..e9a394d 100755 --- a/select-from +++ b/select-from @@ -1,13 +1,13 @@ #!/usr/bin/env ruby -require 'optimist' -require 'zipf' +require "optimist" +require "zipf" opts = Optimist::options do banner "select_from [--invert] -i < " - opt :index, "Line numbers to output.", :type => :string, :short => '-i', :required => true - opt :invert, "Invert selection.", :type => :bool, :short => '-j', :default => false - opt :from1, "Index starting from 1.", :type => :bool, :short => '-k', :default => false + opt :index, "Line numbers to output.", :type => :string, :short => "-i", :required => true + opt :invert, "Invert selection.", :type => :bool, :short => "-j", :default => false + opt :from1, "Index starting from 1.", :type => :bool, :short => "-k", :default => false end accept = {} @@ -30,4 +30,3 @@ while line = STDIN.gets end i += 1 end - diff --git a/sentencepiece-decode b/sentencepiece-decode index 5e07ffa..e715d09 100755 --- a/sentencepiece-decode +++ b/sentencepiece-decode @@ -1,9 +1,8 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets line = line.split.join "" puts line.gsub "▁", " " end - diff --git a/shard b/shard index 5294afd..4b639c5 100755 --- a/shard +++ b/shard @@ -1,11 +1,11 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false) lc = `wc -l #{input}`.split.first.to_i - input_ext = input.split('.').last - refs_ext = refs.split('.').last + input_ext = input.split(".").last + refs_ext = refs.split(".").last index = (0..lc-1).to_a index.reverse! index.shuffle! 
if rand @@ -68,13 +68,12 @@ def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false end opts = Optimist::options do - opt :input, 'input', :type => :string, :required => true - opt :references, 'references', :type => :string, :required => true - opt :alignments, 'alignments', :type => :string, :required => true - opt :output_prefix, 'output prefix', :type => :string, :required => true - opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z' - opt :num_shards, 'number of shards', :type => :int, :required => true + opt :input, "input", :type => :string, :required => true + opt :references, "references", :type => :string, :required => true + opt :alignments, "alignments", :type => :string, :required => true + opt :output_prefix, "output prefix", :type => :string, :required => true + opt :randomize, "randomize", :type => :bool, :default => false, :short => "-z" + opt :num_shards, "number of shards", :type => :int, :required => true end make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize]) - diff --git a/sort-features b/sort-features index 88bd779..a91fb00 100755 --- a/sort-features +++ b/sort-features @@ -7,4 +7,3 @@ while line = STDIN.gets end h.sort_by { |name, value| -value }.each { |name, value| puts "#{name}\t#{value}" } - diff --git a/source-sides b/source-sides index b4490c6..9243f17 100755 --- a/source-sides +++ b/source-sides @@ -1,4 +1,3 @@ -#!/bin/zsh -x +#!/bin/zsh split_pipes -f 2 | sort | uniq | sed "s| |_|g" | sed "s|\[X,[12]\]|NX|g" - diff --git a/split-kbest b/split-kbest index ab425b0..52773e8 100755 --- a/split-kbest +++ b/split-kbest @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" def write_kbest l, fn f = WriteFile.new fn @@ -21,4 +21,3 @@ while line = STDIN.gets l << line end write_kbest l, "#{dir}/#{i}.gz" # last one - diff --git a/split-lines b/split-lines index 14b3a0f..0d036c3 100755 --- a/split-lines +++ 
b/split-lines @@ -1,14 +1,13 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" dir = ARGV[0] i = 0 while line = STDIN.gets src, tgt = line.split " ||| " - f = WriteFile.new "#{dir}/#{i}.src" + f = WriteFile.new "#{dir}/#{i}.src" f.write line f.close i += 1 end - diff --git a/split-pipes b/split-pipes index 862e8be..58dcac4 100755 --- a/split-pipes +++ b/split-pipes @@ -1,9 +1,9 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" conf = Optimist::options do banner "splitpipes -f < " @@ -32,10 +32,10 @@ end while line = STDIN.gets j = 1 - line.strip.split(' ||| ').each { |i| + line.strip.split(" ||| ").each { |i| if range && (conf[:field]..conf[:to]).include?(j) a << i.strip - elsif j == conf[:field] + elsif j == conf[:field] puts i.strip break end @@ -46,6 +46,3 @@ while line = STDIN.gets end a.clear end - - - diff --git a/sqrt b/sqrt index d0a67b1..39382e6 100755 --- a/sqrt +++ b/sqrt @@ -3,4 +3,3 @@ while line = STDIN.gets puts Math.sqrt line.to_f end - diff --git a/stanford-parser-run b/stanford-parser-run index f8d4210..37efacd 100755 --- a/stanford-parser-run +++ b/stanford-parser-run @@ -1,7 +1,7 @@ #!/bin/bash if [ $# != 1 ]; then - echo "$0 text-file" + echo "$0 text-file" exit 1 fi @@ -10,4 +10,3 @@ export CLASSPATH=:/toolbox/stanfordparser_3_2_0/* IN=$1 cat $IN | java -server -mx25000m edu.stanford.nlp.parser.lexparser.LexicalizedParser -nthreads 8 -sentences newline -encoding utf-8 -tokenized -outputFormat "typedDependencies" -outputFormatOptions "basicDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz - | tr '\n' '\t' | sed 's/\t\t/\n/g' | sed 's/\t/ /g' | sed 's/ *$//' | sed 's/, /,/g' > $IN.stp - diff --git a/stddev b/stddev index 15c245e..1b24bb5 100755 --- a/stddev +++ b/stddev @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "stddev [-r ] < " @@ 
-37,4 +37,3 @@ if conf[:round] >= 0 else puts stddev end - diff --git a/strips b/strips index 11c00b4..05d41cb 100755 --- a/strips +++ b/strips @@ -3,4 +3,3 @@ while line = STDIN.gets puts line.strip end - diff --git a/substract b/substract deleted file mode 100755 index 212b6da..0000000 --- a/substract +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -f = ReadFile.new ARGV[0] -g = ReadFile.new ARGV[1] - -while line1 = f.gets - line2 = g.gets - d = line1.to_f - line2.to_f - puts d -end - diff --git a/subtract b/subtract new file mode 100755 index 0000000..ecd6c11 --- /dev/null +++ b/subtract @@ -0,0 +1,12 @@ +#!/usr/bin/env ruby + +require "zipf" + +f = ReadFile.new ARGV[0] +g = ReadFile.new ARGV[1] + +while line1 = f.gets + line2 = g.gets + d = line1.to_f - line2.to_f + puts d +end diff --git a/sum b/sum index acfa563..a3502e6 100755 --- a/sum +++ b/sum @@ -6,4 +6,3 @@ while line = STDIN.gets end puts sum - diff --git a/tc b/tc index 7eefdd5..dd16fdf 100755 --- a/tc +++ b/tc @@ -1,8 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' +require "zipf" while line = STDIN.gets puts tokenize(line.strip).size end - diff --git a/tf-idf b/tf-idf index 22c3dac..02f4c7b 100755 --- a/tf-idf +++ b/tf-idf @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" def main conf = Optimist::options do @@ -15,8 +15,8 @@ def main stopwords = [] if conf[:filter_stopwords] stopwords = ReadFile.readlines(conf[:filter_stopwords]).map{ |i| - i.split('|').first.strip - }.reject{ |i| i=='' } + i.split("|").first.strip + }.reject{ |i| i=="" } end docs = {} @@ -54,4 +54,3 @@ def main end main - diff --git a/tmx-extract b/tmx-extract new file mode 100755 index 0000000..7791eb6 --- /dev/null +++ b/tmx-extract @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# +# Adapted from Apertium +# http://wiki.apertium.org/wiki/Tools_for_TMX +# + +from xml.sax import make_parser +from xml.sax.handler import ContentHandler + +import sys +import 
codecs + +class TMXHandler(ContentHandler): + def __init__ (self, slang, tlang, sfile, tfile): + self.pair = set([slang, tlang]) + self.files = {} + self.files[slang] = sfile + self.files[tlang] = tfile + self.inTag = "" + self.note = "" + self.tuid = "" + self.type = "" + self.cur_pair = set() + self.cur_lang = "" + self.seg = {} + self.seg[slang] = "" + self.seg[tlang] = "" + + def startElement(self, name, attrs): + + if name == "tu": + self.cur_pair = set() + self.inTag = "tu" + self.tuid = attrs.get("tuid", "") + self.type = attrs.get("datatype", "") + elif name == "note": + self.inTag = "note" + self.note = "" + elif name == "tuv": + self.inTag = "tuv" + self.cur_lang = attrs.get("xml:lang", "") + self.cur_pair.add(self.cur_lang) + elif name == "seg": + self.inTag = "seg" + if self.cur_lang in self.pair: + self.seg[self.cur_lang] = "" + + def characters (self, c): + if self.inTag == "note": + self.note += c + elif self.inTag == "seg" and self.cur_lang in self.pair: + self.seg[self.cur_lang] += c + + def endElement(self, name): + if name == "tu" and self.pair == self.cur_pair: + for lang in self.cur_pair: + self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip())) + + +if __name__ == "__main__": + parser = make_parser() + + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + print() + sys.exit(-1) + + sfile_path = f"{sys.argv[1]}.{sys.argv[2]}" + tfile_path = f"{sys.argv[1]}.{sys.argv[3]}" + + with open(sfile_path, "w+") as sfile, open(tfile_path, "w+") as tfile: + curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) + parser.setContentHandler(curHandler) + with open(sys.argv[1], "r") as tmx_file: + parser.parse(tmx_file) diff --git a/tmx-extract-original-py2 b/tmx-extract-original-py2 new file mode 100755 index 0000000..eb39d1d --- /dev/null +++ b/tmx-extract-original-py2 @@ -0,0 +1,75 @@ +#!/usr/bin/python2 +# +# Adapted from Apertium +# http://wiki.apertium.org/wiki/Tools_for_TMX +# + +from xml.sax import make_parser 
+from xml.sax.handler import ContentHandler + +import sys +import codecs + +class TMXHandler(ContentHandler): + def __init__ (self, slang, tlang, sfile, tfile): + self.pair = set([slang, tlang]) + self.files = {} + self.files[slang] = sfile + self.files[tlang] = tfile + self.inTag = '' + self.note = '' + self.tuid = '' + self.type = '' + self.cur_pair = set() + self.cur_lang = '' + self.seg = {} + self.seg[slang] = '' + self.seg[tlang] = '' + + def startElement(self, name, attrs): + + if name == 'tu': + self.cur_pair = set() + self.inTag = 'tu' + self.tuid = attrs.get('tuid','') + self.type = attrs.get('datatype','') + elif name == 'note': + self.inTag = 'note' + self.note = "" + elif name == 'tuv': + self.inTag = 'tuv' + self.cur_lang = attrs.get('xml:lang', '') + self.cur_pair.add(self.cur_lang) + elif name == 'seg': + self.inTag = 'seg' + if self.cur_lang in self.pair: + self.seg[self.cur_lang] = '' + + def characters (self, c): + if self.inTag == 'note': + self.note += c + elif self.inTag == 'seg' and self.cur_lang in self.pair: + self.seg[self.cur_lang] += c + + def endElement(self, name): + if name == 'tu' and self.pair == self.cur_pair: + for lang in self.cur_pair: + self.files[lang].write(self.seg[lang].encode('utf-8').replace("\n", " ").strip()+"\n") + +parser = make_parser() + +if len(sys.argv) < 3: + print 'Usage: tmx-extract.py ' + print '' + sys.exit(-1) + +sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+') +tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+') +curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) + +parser.setContentHandler(curHandler) + +parser.parse(open(sys.argv[1])) + +sfile.close() +tfile.close() diff --git a/tmx-extract-original-py2.py b/tmx-extract-original-py2.py deleted file mode 100755 index cbdb491..0000000 --- a/tmx-extract-original-py2.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/python2 -# -# Adapted from Apertium -# http://wiki.apertium.org/wiki/Tools_for_TMX -# - -from xml.sax import make_parser -from 
xml.sax.handler import ContentHandler - -import sys -import codecs - -class TMXHandler(ContentHandler): - def __init__ (self, slang, tlang, sfile, tfile): - self.pair = set([slang, tlang]) - self.files = {} - self.files[slang] = sfile - self.files[tlang] = tfile - self.inTag = '' - self.note = '' - self.tuid = '' - self.type = '' - self.cur_pair = set() - self.cur_lang = '' - self.seg = {} - self.seg[slang] = '' - self.seg[tlang] = '' - - def startElement(self, name, attrs): - - if name == 'tu': - self.cur_pair = set() - self.inTag = 'tu' - self.tuid = attrs.get('tuid','') - self.type = attrs.get('datatype','') - elif name == 'note': - self.inTag = 'note' - self.note = "" - elif name == 'tuv': - self.inTag = 'tuv' - self.cur_lang = attrs.get('xml:lang', '') - self.cur_pair.add(self.cur_lang) - elif name == 'seg': - self.inTag = 'seg' - if self.cur_lang in self.pair: - self.seg[self.cur_lang] = '' - - def characters (self, c): - if self.inTag == 'note': - self.note += c - elif self.inTag == 'seg' and self.cur_lang in self.pair: - self.seg[self.cur_lang] += c - - def endElement(self, name): - if name == 'tu' and self.pair == self.cur_pair: - for lang in self.cur_pair: - self.files[lang].write(self.seg[lang].encode('utf-8').replace("\n", " ").strip()+"\n") - -parser = make_parser() - -if len(sys.argv) < 3: - print 'Usage: tmx-extract.py ' - print '' - sys.exit(-1) - -sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+') -tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+') -curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) - -parser.setContentHandler(curHandler) - -parser.parse(open(sys.argv[1])) - -sfile.close() -tfile.close() - diff --git a/tmx-extract.py b/tmx-extract.py deleted file mode 100755 index 00f18f5..0000000 --- a/tmx-extract.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/python3 -# -# Adapted from Apertium -# http://wiki.apertium.org/wiki/Tools_for_TMX -# - -from xml.sax import make_parser -from xml.sax.handler import ContentHandler - -import sys 
-import codecs - -class TMXHandler(ContentHandler): - def __init__ (self, slang, tlang, sfile, tfile): - self.pair = set([slang, tlang]) - self.files = {} - self.files[slang] = sfile - self.files[tlang] = tfile - self.inTag = '' - self.note = '' - self.tuid = '' - self.type = '' - self.cur_pair = set() - self.cur_lang = '' - self.seg = {} - self.seg[slang] = '' - self.seg[tlang] = '' - - def startElement(self, name, attrs): - - if name == 'tu': - self.cur_pair = set() - self.inTag = 'tu' - self.tuid = attrs.get('tuid','') - self.type = attrs.get('datatype','') - elif name == 'note': - self.inTag = 'note' - self.note = "" - elif name == 'tuv': - self.inTag = 'tuv' - self.cur_lang = attrs.get('xml:lang', '') - self.cur_pair.add(self.cur_lang) - elif name == 'seg': - self.inTag = 'seg' - if self.cur_lang in self.pair: - self.seg[self.cur_lang] = '' - - def characters (self, c): - if self.inTag == 'note': - self.note += c - elif self.inTag == 'seg' and self.cur_lang in self.pair: - self.seg[self.cur_lang] += c - - def endElement(self, name): - if name == 'tu' and self.pair == self.cur_pair: - for lang in self.cur_pair: - self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip())) - - -if __name__ == "__main__": - parser = make_parser() - - if len(sys.argv) < 3: - print('Usage: tmx-extract.py ') - print('') - sys.exit(-1) - - sfile_path = sys.argv[1] + "." + sys.argv[2] - tfile_path = sys.argv[1] + "." 
+ sys.argv[3] - - with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile: - curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) - parser.setContentHandler(curHandler) - with open(sys.argv[1], 'r') as tmx_file: - parser.parse(tmx_file) - diff --git a/tmx-to-plain b/tmx-to-plain new file mode 100755 index 0000000..025d6e4 --- /dev/null +++ b/tmx-to-plain @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +import argparse +import datetime +import sys + +from translate.storage.tmx import tmxfile + + +def extract_from_tmx(tmx_file_path, + src_out_path, + tgt_out_path, + begin_date, + date, + src_out_after, + tgt_out_after): + with open(tmx_file_path, "rb") as in_fp: + tmx_file = tmxfile(in_fp) + + if src_out_after is not None and tgt_out_after is not None: + src_out_after_fp = open(src_out_after, "w") + tgt_out_after_fp = open(tgt_out_after, "w") + + + with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp: + for index, node in enumerate(tmx_file.unit_iter()): + src_out_fp_ = src_out_fp + tgt_out_fp_ = tgt_out_fp + + if begin_date is not None: + date_string = node.get_target_dom().get("lastusagedate")[:8] + date_obj = datetime.datetime.strptime(date_string, "%Y%m%d").date() + if date_obj < begin_date: + continue + + if date is not None: + date_string = node.get_target_dom().get("changedate")[:8] + date_obj = datetime.datetime.strptime(date_string, "%Y%m%d").date() + if date_obj > date: + src_out_fp_ = src_out_after_fp + tgt_out_fp_ = tgt_out_after_fp + + src_string = f"{node.source}" + tgt_string = f"{node.target}" + src_string = src_string.replace("\n", " ").replace("\r", "") + tgt_string = tgt_string.replace("\n", " ").replace("\r", "") + + src_out_fp_.write(f"{src_string}\n") + tgt_out_fp_.write(f"{tgt_string}\n") + if (index + 1) % 1000 == 0: + sys.stdout.write(f"Processed {index + 1} lines\r") + sys.stdout.flush() + + if src_out_after is not None and tgt_out_after is not None: + src_out_after_fp.close() + 
tgt_out_after_fp.close() + + +def main(): + + usage = f"Usage: {sys.argv[0]} [options]" + parser = argparse.ArgumentParser(usage=usage) + parser.add_argument("-i", "--input", help="input tmx file") + parser.add_argument("-d", "--date", help="date for splitting the output") + parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data") + + args = parser.parse_args() + + if args.input is None: + parser.print_help() + sys.exit(1) + + src_out = f"{args.input}.src" + tgt_out = f"{args.input}.tgt" + + + if args.date is not None: + date = datetime.datetime.strptime(args.date, "%Y-%m-%d").date() + src_out_after = f"{src_out}.after.{args.date}" + tgt_out_after = f"{tgt_out}.after.{args.date}" + else: + date = None + src_out_after = None + tgt_out_after = None + + if args.begin_date is not None: + begin_date = datetime.datetime.strptime(args.begin_date, "%Y-%m-%d").date() + else: + begin_date = None + + extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after) + + +if __name__ == "__main__": + main() diff --git a/tmx-to-plain.py b/tmx-to-plain.py deleted file mode 100644 index 07cac6f..0000000 --- a/tmx-to-plain.py +++ /dev/null @@ -1,95 +0,0 @@ -import argparse -import datetime -import sys - -from translate.storage.tmx import tmxfile - - -def extract_from_tmx(tmx_file_path, - src_out_path, - tgt_out_path, - begin_date, - date, - src_out_after, - tgt_out_after): - with open(tmx_file_path, 'rb') as in_fp: - tmx_file = tmxfile(in_fp) - - if src_out_after is not None and tgt_out_after is not None: - src_out_after_fp = open(src_out_after, "w") - tgt_out_after_fp = open(tgt_out_after, "w") - - - with open(src_out_path, "w") as src_out_fp, open(tgt_out_path, "w") as tgt_out_fp: - for index, node in enumerate(tmx_file.unit_iter()): - src_out_fp_ = src_out_fp - tgt_out_fp_ = tgt_out_fp - - if begin_date is not None: - date_string = node.get_target_dom().get('lastusagedate')[:8] - date_obj = 
datetime.datetime.strptime(date_string, '%Y%m%d').date() - if date_obj < begin_date: - continue - - if date is not None: - date_string = node.get_target_dom().get('changedate')[:8] - date_obj = datetime.datetime.strptime(date_string, '%Y%m%d').date() - if date_obj > date: - src_out_fp_ = src_out_after_fp - tgt_out_fp_ = tgt_out_after_fp - - src_string = f"{node.source}" - tgt_string = f"{node.target}" - src_string = src_string.replace('\n', ' ').replace('\r', '') - tgt_string = tgt_string.replace('\n', ' ').replace('\r', '') - - src_out_fp_.write(f"{src_string}\n") - tgt_out_fp_.write(f"{tgt_string}\n") - if (index + 1) % 1000 == 0: - sys.stdout.write(f"Processed {index + 1} lines\r") - sys.stdout.flush() - - if src_out_after is not None and tgt_out_after is not None: - src_out_after_fp.close() - tgt_out_after_fp.close() - - -def main(): - - usage = "Usage: python tmx_to_plain.py [options]" - parser = argparse.ArgumentParser(usage=usage) - parser.add_argument("-i", "--input", help="input tmx file") - parser.add_argument("-d", "--date", help="date for splitting the output") - parser.add_argument("-b", "--begin_date", help="earliest date (lastusage) to retain data") - - args = parser.parse_args() - - if args.input is None: - parser.print_help() - sys.exit(1) - - args.input - - src_out = args.input + ".src" - tgt_out = args.input + ".tgt" - - - if args.date is not None: - date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date() - src_out_after = src_out + ".after." + args.date - tgt_out_after = tgt_out + ".after." 
+ args.date - else: - date = None - src_out_after = None - tgt_out_after = None - - if args.begin_date is not None: - begin_date = datetime.datetime.strptime(args.begin_date, '%Y-%m-%d').date() - else: - begin_date = None - - extract_from_tmx(args.input, src_out, tgt_out, begin_date, date, src_out_after, tgt_out_after) - - -if __name__ == '__main__': - main() diff --git a/to-ascii b/to-ascii index 10fd1c2..7e2a842 100755 --- a/to-ascii +++ b/to-ascii @@ -4,9 +4,8 @@ while line = STDIN.gets encoding_options = { :invalid => :replace, :undef => :replace, - :replace => '?', + :replace => "?", :universal_newline => true } - puts line.encode 'ASCII', encoding_options + puts line.encode "ASCII", encoding_options end - diff --git a/toks b/toks index 8bee29f..db8076f 100755 --- a/toks +++ b/toks @@ -1,9 +1,8 @@ -#!/usr/bin/ruby +#!/usr/bin/env ruby -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' +STDIN.set_encoding "utf-8" +STDOUT.set_encoding "utf-8" while line = STDIN.gets line.strip.split(/\s/).each { |i| puts i } end - diff --git a/toks-per-line b/toks-per-line index 8a10cd4..9814f35 100755 --- a/toks-per-line +++ b/toks-per-line @@ -14,4 +14,3 @@ while line = STDIN.gets puts a.size end end - diff --git a/train-test-split b/train-test-split index 6aa4796..db5aad4 100755 --- a/train-test-split +++ b/train-test-split @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'optimist' +require "zipf" +require "optimist" conf = Optimist::options do opt :source, "source file", :type => :string, :required => true @@ -13,11 +13,11 @@ conf = Optimist::options do end source_filename = conf[:source] -source_extension = source_filename.split('.').last +source_extension = source_filename.split(".").last source_lines = ReadFile.readlines source_filename target_filename = conf[:target] -target_extension = target_filename.split('.').last +target_extension = target_filename.split(".").last target_lines = ReadFile.readlines target_filename size = conf[:size] diff --git 
a/tsv-exclude b/tsv-exclude index e951ea1..cee3923 100755 --- a/tsv-exclude +++ b/tsv-exclude @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'set' +require "zipf" +require "set" to_exclude0 = {} to_exclude1 = {} diff --git a/tsv-joint-set b/tsv-joint-set index c0dbdcf..ce77a9e 100755 --- a/tsv-joint-set +++ b/tsv-joint-set @@ -1,8 +1,8 @@ #!/usr/bin/env ruby -require 'set' -require 'zipf' -require 'optimist' +require "set" +require "zipf" +require "optimist" conf = Optimist::options do opt :n, "Desired number segments in test set.", :type => :int, :required => true @@ -50,4 +50,3 @@ outputs.each_with_index { |o,i| f.write o[0][j] + "\t" + o[1][j] + "\n" } } - diff --git a/tsv-uniq b/tsv-uniq index fde79f2..6709e8d 100755 --- a/tsv-uniq +++ b/tsv-uniq @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'set' +require "set" strictness = ARGV[0].to_i # 1 one-side # 2 just the pair @@ -21,14 +21,14 @@ if strictness == 1 seen = Set.new segments[side].each_with_index { |segment,i| if not seen.include? segment - puts "#{segments[i][0]}\t#{segments[i][1]}" + puts "#{segments[0][i]}\t#{segments[1][i]}" end seen << segment } elsif strictness == 2 seen = Set.new segments[0].each_index { |i| - segment_pair = [segments[i][0], segments[i][1]] + segment_pair = [segments[0][i], segments[1][i]] if not seen.include? 
segment_pair puts "#{segment_pair[0]}\t#{segment_pair[1]}" end @@ -46,4 +46,3 @@ elsif strictness == 3 seen_pairs << segment_pair } end - diff --git a/var b/var index 8ca6082..4e88f1e 100755 --- a/var +++ b/var @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'optimist' +require "optimist" conf = Optimist::options do banner "stddev [-r ] < " @@ -32,4 +32,3 @@ if conf[:round] >= 0 else puts var end - diff --git a/vocab b/vocab index e6bdcd9..b2a2de9 100755 --- a/vocab +++ b/vocab @@ -1,4 +1,3 @@ #!/bin/sh $(dirname $0)/toks ${1+"$@"} | sort | uniq -c - diff --git a/vocab-2 b/vocab-2 new file mode 100755 index 0000000..1004faf --- /dev/null +++ b/vocab-2 @@ -0,0 +1,12 @@ +#!/usr/bin/env ruby + +require "zipf" + +d = {} +while line = STDIN.gets + line.strip.split.each { |tok| + d[tok] = true + } +end + +puts d.size diff --git a/vocab2 b/vocab2 deleted file mode 100755 index 1991357..0000000 --- a/vocab2 +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - -d = {} -while line = STDIN.gets - line.strip.split.each { |tok| - d[tok] = true - } -end - -puts d.size - diff --git a/zh-ko-or-ja b/zh-ko-or-ja index 0b42386..e049704 100755 --- a/zh-ko-or-ja +++ b/zh-ko-or-ja @@ -1,7 +1,7 @@ #!/usr/bin/env ruby -require 'zipf' -require 'script_detector' +require "zipf" +require "script_detector" $to_code = {} $to_code["Ambiguous Chinese"] = "??" @@ -15,4 +15,3 @@ while line = STDIN.gets code = $to_code[line.identify_script] puts code end - -- cgit v1.2.3