From 587af4bea66b5bbe89163154141f4f5fa4c518e5 Mon Sep 17 00:00:00 2001 From: thomasZen Date: Mon, 13 Jan 2020 14:17:30 +0100 Subject: Remove paragraph opening and closing tag (#1) --- de-sgm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/de-sgm b/de-sgm index 664c18c..0b9177d 100755 --- a/de-sgm +++ b/de-sgm @@ -3,5 +3,5 @@ egrep -v "^[[:space:]]*(<\?xml.*\?>|]*>)[[:space:]]*$" \ | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*[[:space:]]*$" \ - | sed "s|]*>\s*||" | sed "s|\s*$||" + | sed "s|]*>\s*||" | sed "s|\s*$||" | egrep -v "^[[:space:]]*

[[:space:]]*$|^[[:space:]]*

[[:space:]]*$" -- cgit v1.2.3 From 53c3d328dbe9a56b54f6d7dc51491ecf92081fef Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 3 Feb 2020 15:23:47 +0000 Subject: de-sgm --- de-sgm | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/de-sgm b/de-sgm index 0b9177d..452edfe 100755 --- a/de-sgm +++ b/de-sgm @@ -1,7 +1,10 @@ #!/bin/sh - egrep -v "^[[:space:]]*(<\?xml.*\?>|]*>)[[:space:]]*$" \ | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*[[:space:]]*$" \ - | sed "s|]*>\s*||" | sed "s|\s*$||" | egrep -v "^[[:space:]]*

[[:space:]]*$|^[[:space:]]*

[[:space:]]*$" + | sed "s|]*>\s*||" \ + | sed "s|\s*\s*$||" \ + | egrep -v "^[[:space:]]*

[[:space:]]*$|^[[:space:]]*

[[:space:]]*$" \ + | sed "s|\s*||" \ + | sed "s|\s*\s*$||" -- cgit v1.2.3 From 8a80dcf1353f93f8bfc9d5976c58861687add941 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 3 Feb 2020 15:24:01 +0000 Subject: print out all chars --- chars | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100755 chars diff --git a/chars b/chars new file mode 100755 index 0000000..359c2ab --- /dev/null +++ b/chars @@ -0,0 +1,10 @@ +#!/usr/bin/env ruby + +require 'zipf' + +while line = STDIN.gets + line.strip.each_char { |c| + puts c + } +end + -- cgit v1.2.3 From 4bcee680245bb606c2ce06a304d78c1ac4c83134 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 3 Feb 2020 15:24:31 +0000 Subject: NFC normalization in python --- NFC | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 NFC diff --git a/NFC b/NFC new file mode 100755 index 0000000..aec1c58 --- /dev/null +++ b/NFC @@ -0,0 +1,9 @@ +#!/usr/bin/env python + +import fileinput +import unicodedata +import sys + +for line in fileinput.input(): + sys.stdout.write(unicodedata.normalize('NFC', line)) + -- cgit v1.2.3 From 0ff116339aff9b421e4ca5d4680b3981530d9b99 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 3 Feb 2020 15:25:12 +0000 Subject: zh-ko-or-ja --- zh-ko-or-ja | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 zh-ko-or-ja diff --git a/zh-ko-or-ja b/zh-ko-or-ja new file mode 100755 index 0000000..0b42386 --- /dev/null +++ b/zh-ko-or-ja @@ -0,0 +1,18 @@ +#!/usr/bin/env ruby + +require 'zipf' +require 'script_detector' + +$to_code = {} +$to_code["Ambiguous Chinese"] = "??" +$to_code["Simplified Chinese"] = "zh" +$to_code["Traditional Chinese"] = "zt" +$to_code["Korean"] = "ko" +$to_code["Japanese"] = "ja" +$to_code.default = "??" + +while line = STDIN.gets + code = $to_code[line.identify_script] + puts code +end + -- cgit v1.2.3 From 276b954705f7a9d46c4241aa25e6bb19be8716e1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 3 Feb 2020 15:25:42 +0000 Subject: langid-polyglot --- langid-polyglot | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 langid-polyglot diff --git a/langid-polyglot b/langid-polyglot new file mode 100755 index 0000000..0b0b20c --- /dev/null +++ b/langid-polyglot @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import polyglot +from polyglot.detect import Detector +import fileinput + +for line in fileinput.input(): + try: + for lang in Detector(line).languages: + if lang.confidence > 80.0: + print(lang.confidence) + else: + print("??") + break + except polyglot.detect.base.UnknownLanguage: + print("??") + pass + -- cgit v1.2.3 From 4bb4c4b4e35d00a4a8e96a3f1de4301f481b9cc6 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 19 Feb 2020 15:16:50 +0000 Subject: sentencepiece-decode --- sentencepiece-decode | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 sentencepiece-decode diff --git a/sentencepiece-decode b/sentencepiece-decode new file mode 100755 index 0000000..5e07ffa --- /dev/null +++ b/sentencepiece-decode @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +require 'zipf' + +while line = STDIN.gets + line = line.split.join "" + puts line.gsub "▁", " " +end + -- cgit v1.2.3 From 8c1c7c21f16e5800d615130831a8932a5d2b2bd2 Mon Sep 17 00:00:00 2001 From: pks Date: Wed, 19 Feb 2020 16:19:09 +0100 Subject: misc. scripts --- add-index | 12 ++++++++++++ exclude | 17 +++++++++++++++++ joint-set | 30 ++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100755 add-index create mode 100755 exclude create mode 100755 joint-set diff --git a/add-index b/add-index new file mode 100755 index 0000000..77a7e8d --- /dev/null +++ b/add-index @@ -0,0 +1,12 @@ +#!/usr/bin/env ruby + +i = 0 +if ARGV.size > 0 + i = ARGV[0].to_i +end + +while line = STDIN.gets + puts "#{i}\t#{line}" + i += 1 +end + diff --git a/exclude b/exclude new file mode 100755 index 0000000..b5fe3cb --- /dev/null +++ b/exclude @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby + +require 'zipf' +require 'set' + +to_exclude = {} +f = ReadFile.new ARGV[0] +while line = f.gets + to_exclude[line] = true +end + +while line = STDIN.gets + if not to_exclude.has_key? line + puts line + end +end + diff --git a/joint-set b/joint-set new file mode 100755 index 0000000..b9b9b22 --- /dev/null +++ b/joint-set @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby + +require 'set' +require 'zipf' + +n = ARGV.pop.to_i + +all = [] +all_sets = [] +ARGV.each { |file| + fp = ReadFile.new file + a = [] + s = Set.new + while line = fp.gets + a << line + s << line + end + all << a + all_sets << s +} + +joint_set = all_sets.pop +all_sets.each { |set| + joint_set &= set +} + +joint_set.each { |i| + puts i +} + -- cgit v1.2.3 From 5178c4f31dd3b8eb1f1cba2b632863f8a92af029 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 19 Feb 2020 16:19:45 +0100 Subject: TSV utils --- tsv-joint-set | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ tsv-uniq | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100755 tsv-joint-set create mode 100755 tsv-uniq diff --git a/tsv-joint-set b/tsv-joint-set new file mode 100755 index 0000000..c0dbdcf --- /dev/null +++ b/tsv-joint-set @@ -0,0 +1,53 @@ +#!/usr/bin/env ruby + +require 'set' +require 'zipf' +require 'optimist' + +conf = Optimist::options do + opt :n, "Desired number segments in test set.", :type => :int, :required => true + opt :tsv, ".tsv files", :type => :strings, :required => true + opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true +end + +all = [] +all_sets = [] +conf[:tsv].each_with_index { |file,file_index| + fp = ReadFile.new file + a = [[],[]] + s = Set.new + while line = fp.gets + p0, p1 = line.strip.split "\t" + a[0] << p0 + a[1] << p1 + s << a[conf[:fields][file_index]].last + end + all << a + all_sets << s +} + +joint_set = all_sets.pop +all_sets.each { |set| + joint_set &= set +} +sample = joint_set.to_a.shuffle.take conf[:n] + +outputs = [] +all.each_with_index { |a,i| + o = [[],[]] + a[conf[:fields][i]].each_with_index { |segment,j| + if sample.include? segment + o[0] << a[0][j] + o[1] << a[1][j] + end + } + outputs << o +} + +outputs.each_with_index { |o,i| + f = WriteFile.new (conf[:tsv][i] + ".joint") + o[0].each_index { |j| + f.write o[0][j] + "\t" + o[1][j] + "\n" + } +} + diff --git a/tsv-uniq b/tsv-uniq new file mode 100755 index 0000000..fde79f2 --- /dev/null +++ b/tsv-uniq @@ -0,0 +1,49 @@ +#!/usr/bin/env ruby + +require 'set' + +strictness = ARGV[0].to_i # 1 one-side + # 2 just the pair + # 3 the pair and one side + +if strictness == 1 or strictness == 3 + side = ARGV[1].to_i # 0 or 1 +end + +segments = [[],[]] +while line = STDIN.gets + src, tgt = line.strip.split "\t" + segments[0] << src + segments[1] << tgt +end + +if strictness == 1 + seen = Set.new + segments[side].each_with_index { |segment,i| + if not seen.include? segment + puts "#{segments[i][0]}\t#{segments[i][1]}" + end + seen << segment + } +elsif strictness == 2 + seen = Set.new + segments[0].each_index { |i| + segment_pair = [segments[i][0], segments[i][1]] + if not seen.include? segment_pair + puts "#{segment_pair[0]}\t#{segment_pair[1]}" + end + seen << segment_pair + } +elsif strictness == 3 + seen = Set.new + seen_pairs = Set.new + segments[side].each_with_index { |segment,i| + segment_pair = [segments[0][i], segments[1][i]] + if not seen_pairs.include? segment_pair and not seen.include? segment + puts "#{segment_pair[0]}\t#{segment_pair[1]}" + end + seen << segment + seen_pairs << segment_pair + } +end + -- cgit v1.2.3 From 7d2fd2bf643671377e990b1c944aa3650397e3da Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 9 Mar 2020 17:48:55 +0000 Subject: de-sgm: match more stuff --- de-sgm | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/de-sgm b/de-sgm index 452edfe..fd4546e 100755 --- a/de-sgm +++ b/de-sgm @@ -1,10 +1,13 @@ #!/bin/sh -egrep -v "^[[:space:]]*(<\?xml.*\?>|]*>)[[:space:]]*$" \ - | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*[[:space:]]*$" \ +egrep -v -i "^[[:space:]]*(<\?xml.*\?>|]*>)[[:space:]]*$" \ + | egrep -v -i "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*[[:space:]]*$" \ | sed "s|]*>\s*||" \ | sed "s|\s*\s*$||" \ - | egrep -v "^[[:space:]]*

[[:space:]]*$|^[[:space:]]*

[[:space:]]*$" \ + | egrep -v -i "^[[:space:]]*

[[:space:]]*$|^[[:space:]]*

[[:space:]]*$" \ | sed "s|\s*||" \ - | sed "s|\s*\s*$||" + | sed "s|\s*\s*$||" \ + | sed "s|\s*\s*$||" \ + | sed "s|\s*\s*$||" \ + | grep -v -P "^\s*$" -- cgit v1.2.3 From a4215712a7611c8f2e9d907e0cd9c734de14a8af Mon Sep 17 00:00:00 2001 From: thomasZen Date: Mon, 11 May 2020 15:55:14 +0200 Subject: Update tmx-extract.py to use python3 --- tmx-extract.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tmx-extract.py b/tmx-extract.py index 90a298a..e8ec959 100755 --- a/tmx-extract.py +++ b/tmx-extract.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/python3 # # Adapted from Apertium # http://wiki.apertium.org/wiki/Tools_for_TMX @@ -54,23 +54,23 @@ class TMXHandler(ContentHandler): def endElement(self, name): if name == 'tu' and self.pair == self.cur_pair: for lang in self.cur_pair: - self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n") + self.files[lang].write("{}\n".format(self.seg[lang].strip())) -parser = make_parser() -if len(sys.argv) < 3: - print 'Usage: tmx-extract.py ' - print '' - sys.exit(-1) +if __name__ == "__main__": + parser = make_parser() -sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+') -tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+') -curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) + if len(sys.argv) < 3: + print('Usage: tmx-extract.py ') + print('') + sys.exit(-1) -parser.setContentHandler(curHandler) + sfile_path = sys.argv[1] + "." + sys.argv[2] + tfile_path = sys.argv[1] + "." + sys.argv[3] -parser.parse(open(sys.argv[1])) - -sfile.close() -tfile.close() + with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile: + curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) + parser.setContentHandler(curHandler) + with open(sys.argv[1], 'r') as tmx_file: + parser.parse(tmx_file) -- cgit v1.2.3 From 4732fb3be94ba3f88b18295cf1c00e8c616eec73 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 30 Jun 2020 12:28:21 +0200 Subject: tmx-extract.py: replace newlines --- tmx-extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmx-extract.py b/tmx-extract.py index e8ec959..00f18f5 100755 --- a/tmx-extract.py +++ b/tmx-extract.py @@ -54,7 +54,7 @@ class TMXHandler(ContentHandler): def endElement(self, name): if name == 'tu' and self.pair == self.cur_pair: for lang in self.cur_pair: - self.files[lang].write("{}\n".format(self.seg[lang].strip())) + self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip())) if __name__ == "__main__": -- cgit v1.2.3