summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
committer Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
commit 64e8bdba930479249b8dfbc4b5d4b659a95433f0 (patch)
tree e26969b03d8380ee8d2cbc1328f851772006133c
parent 74e20e00dfbffdcf117778049e47acd79e320110 (diff)
parent 4732fb3be94ba3f88b18295cf1c00e8c616eec73 (diff)
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
-rwxr-xr-x NFC 9
-rwxr-xr-x add-index 12
-rwxr-xr-x chars 10
-rwxr-xr-x de-sgm 14
-rwxr-xr-x exclude 17
-rwxr-xr-x joint-set 30
-rwxr-xr-x langid-polyglot 18
-rwxr-xr-x sentencepiece-decode 9
-rwxr-xr-x tmx-extract.py 30
-rwxr-xr-x tsv-joint-set 53
-rwxr-xr-x tsv-uniq 49
-rwxr-xr-x zh-ko-or-ja 18
12 files changed, 250 insertions, 19 deletions
diff --git a/NFC b/NFC
new file mode 100755
index 0000000..aec1c58
--- /dev/null
+++ b/NFC
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+# Copy the input (files named in argv, else stdin) to stdout in Unicode NFC.
+
+import sys
+import fileinput
+import unicodedata
+
+nfc_lines = (unicodedata.normalize('NFC', raw) for raw in fileinput.input())
+sys.stdout.writelines(nfc_lines)
diff --git a/add-index b/add-index
new file mode 100755
index 0000000..77a7e8d
--- /dev/null
+++ b/add-index
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+# Prefix every stdin line with a running index and a tab.
+# The optional first argument sets the starting index (default: 0).
+
+start = ARGV.empty? ? 0 : ARGV[0].to_i
+
+index = start
+STDIN.each_line do |row|
+  puts "#{index}\t#{row}"
+  index += 1
+end
+
diff --git a/chars b/chars
new file mode 100755
index 0000000..359c2ab
--- /dev/null
+++ b/chars
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+# Print each character of every stripped stdin line on a line of its own.
+
+require 'zipf'
+
+STDIN.each_line do |row|
+  trimmed = row.strip
+  trimmed.each_char { |ch| puts ch }
+end
+
diff --git a/de-sgm b/de-sgm
index 664c18c..fd4546e 100755
--- a/de-sgm
+++ b/de-sgm
@@ -1,7 +1,13 @@
#!/bin/sh
-
-egrep -v "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
- | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
- | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||"
+egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" |  # drop the XML declaration and structural tag lines
+  egrep -v -i "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" |  # drop one-line metadata elements
+  sed "s|<seg[^>]*>\s*||" |  # strip the first opening <seg ...> tag
+  sed "s|\s*</seg>\s*$||" |  # strip a closing </seg> at end of line
+  egrep -v -i "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$" |  # drop lines holding only <p> or </p>
+  sed "s|<speaker>\s*||" |  # strip the first opening <speaker> tag
+  sed "s|\s*</speaker>\s*$||" |  # strip a closing </speaker> at end of line
+  sed "s|\s*<hl>\s*$||" |  # strip an <hl> tag at end of line
+  sed "s|\s*</hl>\s*$||" |  # strip an </hl> tag at end of line
+  grep -v -P "^\s*$"  # drop empty and whitespace-only lines
diff --git a/exclude b/exclude
new file mode 100755
index 0000000..b5fe3cb
--- /dev/null
+++ b/exclude
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+# Print the stdin lines that do not occur in the file named by the first
+# argument. Lines are compared verbatim (no strip or chomp is applied).
+
+require 'zipf'
+require 'set'
+
+blocked = Set.new
+listing = ReadFile.new ARGV[0]
+while entry = listing.gets
+  blocked << entry
+end
+
+STDIN.each_line do |row|
+  puts row unless blocked.include? row
+end
+
diff --git a/joint-set b/joint-set
new file mode 100755
index 0000000..b9b9b22
--- /dev/null
+++ b/joint-set
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+# Print the lines that occur in every one of the given files.
+#
+# Usage: joint-set <file> [<file> ...] <n>
+# The last argument is consumed as an integer but is not used afterwards;
+# all remaining arguments are read as files via ReadFile.
+
+require 'set'
+require 'zipf'
+
+# Take the trailing argument off ARGV so that it is not opened as a file.
+_n = ARGV.pop.to_i
+
+# One Set of lines per input file. Only the Sets are kept; nothing else
+# about the files is needed to compute the intersection.
+line_sets = ARGV.map do |path|
+  reader = ReadFile.new path
+  lines = Set.new
+  while line = reader.gets
+    lines << line
+  end
+  lines
+end
+
+# Seed with the last file's Set and intersect with the others in order.
+*others, joint_set = line_sets
+abort "joint-set: no input files given" if joint_set.nil?
+others.each { |set| joint_set &= set }
+
+joint_set.each { |line| puts line }
diff --git a/langid-polyglot b/langid-polyglot
new file mode 100755
index 0000000..0b0b20c
--- /dev/null
+++ b/langid-polyglot
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# Per input line, print the confidence of the first language polyglot reports
+# when it exceeds 80.0, else "??" (also printed on UnknownLanguage).
+# NOTE(review): a confidence is printed, not a language code -- confirm intent.
+import polyglot
+from polyglot.detect import Detector
+import fileinput
+
+for line in fileinput.input():
+    try:
+        for lang in Detector(line).languages:
+            if lang.confidence > 80.0:
+                print(lang.confidence)
+            else:
+                print("??")
+            break  # stop after the first entry of .languages
+    except polyglot.detect.base.UnknownLanguage:
+        print("??")
diff --git a/sentencepiece-decode b/sentencepiece-decode
new file mode 100755
index 0000000..5e07ffa
--- /dev/null
+++ b/sentencepiece-decode
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+while line = STDIN.gets
+ line = line.split.join ""
+ puts line.gsub "▁", " "
+end
+
diff --git a/tmx-extract.py b/tmx-extract.py
index 90a298a..00f18f5 100755
--- a/tmx-extract.py
+++ b/tmx-extract.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python2
+#!/usr/bin/python3
#
# Adapted from Apertium
# http://wiki.apertium.org/wiki/Tools_for_TMX
@@ -54,23 +54,23 @@ class TMXHandler(ContentHandler):
def endElement(self, name):
if name == 'tu' and self.pair == self.cur_pair:
for lang in self.cur_pair:
- self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n")
+ self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip()))
-parser = make_parser()
-if len(sys.argv) < 3:
- print 'Usage: tmx-extract.py <file> <slang> <tlang>'
- print ''
- sys.exit(-1)
+if __name__ == "__main__":
+    parser = make_parser()
-sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+')
-tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+')
-curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
+    if len(sys.argv) < 4:  # <file>, <slang> and <tlang> are all required (argv[1..3])
+        print('Usage: tmx-extract.py <file> <slang> <tlang>')
+        print('')
+        sys.exit(-1)
-parser.setContentHandler(curHandler)
+    sfile_path = sys.argv[1] + "." + sys.argv[2]  # output: <file>.<slang>
+    tfile_path = sys.argv[1] + "." + sys.argv[3]  # output: <file>.<tlang>
-parser.parse(open(sys.argv[1]))
-
-sfile.close()
-tfile.close()
+    with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile:
+        curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
+        parser.setContentHandler(curHandler)
+        with open(sys.argv[1], 'r') as tmx_file:
+            parser.parse(tmx_file)
diff --git a/tsv-joint-set b/tsv-joint-set
new file mode 100755
index 0000000..c0dbdcf
--- /dev/null
+++ b/tsv-joint-set
@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+# Sample segments shared by several two-column .tsv files.
+#
+# For every file, one column (chosen per file via --fields) is collected
+# into a Set; the Sets are intersected, up to --n members of the
+# intersection are picked at random, and every row whose chosen column is
+# in the sample is written to "<file>.joint".
+
+require 'set'
+require 'zipf'
+require 'optimist'
+
+conf = Optimist::options do
+  opt :n, "Desired number segments in test set.", :type => :int, :required => true
+  opt :tsv, ".tsv files", :type => :strings, :required => true
+  opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true
+end
+# Each file needs its own field index; a missing one would index with nil.
+Optimist::die :fields, "needs one value per --tsv file" if conf[:fields].size < conf[:tsv].size
+
+# Per file: both columns in row order, plus the Set of the chosen column.
+columns_by_file = []
+key_sets = []
+conf[:tsv].each_with_index do |file, file_index|
+  fp = ReadFile.new file
+  columns = [[], []]
+  keys = Set.new
+  while line = fp.gets
+    p0, p1 = line.strip.split "\t"
+    columns[0] << p0
+    columns[1] << p1
+    keys << columns[conf[:fields][file_index]].last
+  end
+  columns_by_file << columns
+  key_sets << keys
+end
+
+# Segments common to all files (intersection seeded with the last file's Set).
+joint_set = key_sets.pop
+key_sets.each { |set| joint_set &= set }
+# A Set keeps the per-row membership test below constant-time.
+sample = joint_set.to_a.shuffle.take(conf[:n]).to_set
+
+# Rows are written with interpolation so that a row lacking its second
+# column yields an empty field instead of raising on nil.
+columns_by_file.each_with_index do |columns, i|
+  f = WriteFile.new(conf[:tsv][i] + ".joint")
+  columns[conf[:fields][i]].each_with_index do |segment, j|
+    next unless sample.include? segment
+    f.write "#{columns[0][j]}\t#{columns[1][j]}\n"
+  end
+end
+
diff --git a/tsv-uniq b/tsv-uniq
new file mode 100755
index 0000000..fde79f2
--- /dev/null
+++ b/tsv-uniq
@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+# Remove duplicates from tab-separated (source, target) rows on stdin.
+#
+# Usage: tsv-uniq <strictness> [<side>]
+#   strictness 1: keep a row only if its <side> column is new
+#   strictness 2: keep a row only if the (source, target) pair is new
+#   strictness 3: keep a row only if the pair and its <side> column are new
+#   side: 0 (source) or 1 (target); read for strictness 1 and 3,
+#         and 0 when the argument is omitted
+# Any other strictness prints nothing.
+
+require 'set'
+
+strictness = ARGV[0].to_i
+side = ARGV[1].to_i if strictness == 1 || strictness == 3
+
+# Rows as [source, target]; a missing column is nil and prints as empty.
+rows = STDIN.each_line.map do |line|
+  src, tgt = line.strip.split "\t"
+  [src, tgt]
+end
+
+# Print one row in the original tab-separated form.
+emit = ->(row) { puts "#{row[0]}\t#{row[1]}" }
+
+# Set#add? returns nil when the member is already present, so it records the
+# key and reports whether it was new in one step. Array#fetch raises on a
+# side outside the two columns instead of silently yielding nil.
+if strictness == 1
+  seen = Set.new
+  rows.each do |row|
+    emit.call(row) if seen.add?(row.fetch(side))
+  end
+elsif strictness == 2
+  seen_pairs = Set.new
+  rows.each do |row|
+    emit.call(row) if seen_pairs.add?(row)
+  end
+elsif strictness == 3
+  seen = Set.new
+  seen_pairs = Set.new
+  rows.each do |row|
+    # Both sets are updated for every row, whether or not it is printed.
+    new_pair = seen_pairs.add?(row)
+    new_side = seen.add?(row.fetch(side))
+    emit.call(row) if new_pair && new_side
+  end
+end
+
diff --git a/zh-ko-or-ja b/zh-ko-or-ja
new file mode 100755
index 0000000..0b42386
--- /dev/null
+++ b/zh-ko-or-ja
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+# Print a code for every stdin line, looked up from the value returned by
+# line.identify_script (presumably provided by the script_detector gem).
+
+require 'zipf'
+require 'script_detector'
+
+# Script name => code; any name not listed maps to "??".
+TO_CODE = Hash.new("??").update(
+  "Ambiguous Chinese" => "??",
+  "Simplified Chinese" => "zh",
+  "Traditional Chinese" => "zt",
+  "Korean" => "ko",
+  "Japanese" => "ja"
+).freeze
+
+STDIN.each_line { |row| puts TO_CODE[row.identify_script] }
+