diff options
| author | Patrick Simianer <pks@pks.rocks> | 2020-08-12 07:32:06 +0200 | 
|---|---|---|
| committer | Patrick Simianer <pks@pks.rocks> | 2020-08-12 07:32:06 +0200 | 
| commit | 64e8bdba930479249b8dfbc4b5d4b659a95433f0 (patch) | |
| tree | e26969b03d8380ee8d2cbc1328f851772006133c | |
| parent | 74e20e00dfbffdcf117778049e47acd79e320110 (diff) | |
| parent | 4732fb3be94ba3f88b18295cf1c00e8c616eec73 (diff) | |
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
| -rwxr-xr-x | NFC | 9 | ||||
| -rwxr-xr-x | add-index | 12 | ||||
| -rwxr-xr-x | chars | 10 | ||||
| -rwxr-xr-x | de-sgm | 14 | ||||
| -rwxr-xr-x | exclude | 17 | ||||
| -rwxr-xr-x | joint-set | 30 | ||||
| -rwxr-xr-x | langid-polyglot | 18 | ||||
| -rwxr-xr-x | sentencepiece-decode | 9 | ||||
| -rwxr-xr-x | tmx-extract.py | 30 | ||||
| -rwxr-xr-x | tsv-joint-set | 53 | ||||
| -rwxr-xr-x | tsv-uniq | 49 | ||||
| -rwxr-xr-x | zh-ko-or-ja | 18 | 
12 files changed, 250 insertions, 19 deletions
| @@ -0,0 +1,9 @@ +#!/usr/bin/env python + +import fileinput +import unicodedata +import sys + +for line in fileinput.input(): +    sys.stdout.write(unicodedata.normalize('NFC', line)) + diff --git a/add-index b/add-index new file mode 100755 index 0000000..77a7e8d --- /dev/null +++ b/add-index @@ -0,0 +1,12 @@ +#!/usr/bin/env ruby + +i = 0 +if ARGV.size > 0 +  i = ARGV[0].to_i +end + +while line = STDIN.gets +  puts "#{i}\t#{line}" +  i += 1 +end + @@ -0,0 +1,10 @@ +#!/usr/bin/env ruby + +require 'zipf' + +while line = STDIN.gets +  line.strip.each_char { |c| +    puts c +  } +end + @@ -1,7 +1,13 @@  #!/bin/sh - -egrep -v "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \ -  | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \ -  | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||" +egrep -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \ +  | egrep -v -i "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \ +  | sed "s|<seg[^>]*>\s*||" \ +  | sed "s|\s*</seg>\s*$||" \ +  | egrep -v -i "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$" \ +  | sed "s|<speaker>\s*||" \ +  | sed "s|\s*</speaker>\s*$||" \ +  | sed "s|\s*<hl>\s*$||" \ +  | sed "s|\s*</hl>\s*$||" \ +  | grep -v -P "^\s*$" @@ -0,0 +1,17 @@ +#!/usr/bin/env ruby + +require 'zipf' +require 'set' + +to_exclude = {} +f = ReadFile.new ARGV[0] +while line = f.gets +  to_exclude[line] = true +end + +while line = STDIN.gets +  if not to_exclude.has_key? line +    puts line +  end +end + diff --git a/joint-set b/joint-set new file mode 100755 index 0000000..b9b9b22 --- /dev/null +++ b/joint-set @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby + +require 'set' +require 'zipf' + +n = ARGV.pop.to_i + +all = [] +all_sets = [] +ARGV.each { |file| +  fp = ReadFile.new file +  a = [] +  s = Set.new +  while line = fp.gets +    a << line +    s << line +  end +  all << a +  all_sets << s +} + +joint_set = all_sets.pop +all_sets.each { |set| +  joint_set &= set +} + +joint_set.each { |i| +  puts i +} + diff --git a/langid-polyglot b/langid-polyglot new file mode 100755 index 0000000..0b0b20c --- /dev/null +++ b/langid-polyglot @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import polyglot +from polyglot.detect import Detector +import fileinput + +for line in fileinput.input(): +    try: +        for lang in Detector(line).languages: +            if lang.confidence > 80.0: +                print(lang.confidence) +            else: +                print("??") +            break +    except polyglot.detect.base.UnknownLanguage: +        print("??") +        pass + diff --git a/sentencepiece-decode b/sentencepiece-decode new file mode 100755 index 0000000..5e07ffa --- /dev/null +++ b/sentencepiece-decode @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +require 'zipf' + +while line = STDIN.gets +  line = line.split.join "" +  puts line.gsub "▁", " " +end + diff --git a/tmx-extract.py b/tmx-extract.py index 90a298a..00f18f5 100755 --- a/tmx-extract.py +++ b/tmx-extract.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/python3  #  # Adapted from Apertium  # http://wiki.apertium.org/wiki/Tools_for_TMX @@ -54,23 +54,23 @@ class TMXHandler(ContentHandler):      def endElement(self, name):          if name == 'tu' and self.pair == self.cur_pair:              for lang in self.cur_pair: -                self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n") +                self.files[lang].write("{}\n".format(self.seg[lang].replace("\n", " ").strip())) -parser = make_parser() -if len(sys.argv) < 3: -    print 'Usage: tmx-extract.py <file> <slang> <tlang>' -    print '' -    sys.exit(-1) +if __name__ == "__main__": +    parser = make_parser() -sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+') -tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+') -curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) +    if len(sys.argv) < 3: +        print('Usage: tmx-extract.py <file> <slang> <tlang>') +        print('') +        sys.exit(-1) -parser.setContentHandler(curHandler) +    sfile_path = sys.argv[1] + "." + sys.argv[2] +    tfile_path = sys.argv[1] + "." + sys.argv[3] -parser.parse(open(sys.argv[1])) - -sfile.close() -tfile.close() +    with open(sfile_path, 'w+') as sfile, open(tfile_path, 'w+') as tfile: +        curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) +        parser.setContentHandler(curHandler) +        with open(sys.argv[1], 'r') as tmx_file: +            parser.parse(tmx_file) diff --git a/tsv-joint-set b/tsv-joint-set new file mode 100755 index 0000000..c0dbdcf --- /dev/null +++ b/tsv-joint-set @@ -0,0 +1,53 @@ +#!/usr/bin/env ruby + +require 'set' +require 'zipf' +require 'optimist' + +conf = Optimist::options do +  opt :n, "Desired number segments in test set.", :type => :int, :required => true +  opt :tsv, ".tsv files", :type => :strings, :required => true +  opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true +end + +all = [] +all_sets = [] +conf[:tsv].each_with_index { |file,file_index| +  fp = ReadFile.new file +  a = [[],[]] +  s = Set.new +  while line = fp.gets +    p0, p1 = line.strip.split "\t" +    a[0] << p0 +    a[1] << p1 +    s << a[conf[:fields][file_index]].last +  end +  all << a +  all_sets << s +} + +joint_set = all_sets.pop +all_sets.each { |set| +  joint_set &= set +} +sample = joint_set.to_a.shuffle.take conf[:n] + +outputs = [] +all.each_with_index { |a,i| +  o = [[],[]] +  a[conf[:fields][i]].each_with_index { |segment,j| +    if sample.include? segment +      o[0] << a[0][j] +      o[1] << a[1][j] +    end +  } +  outputs << o +} + +outputs.each_with_index { |o,i| +  f = WriteFile.new (conf[:tsv][i] + ".joint") +  o[0].each_index { |j| +    f.write o[0][j] + "\t" + o[1][j] + "\n" +  } +} + diff --git a/tsv-uniq b/tsv-uniq new file mode 100755 index 0000000..fde79f2 --- /dev/null +++ b/tsv-uniq @@ -0,0 +1,49 @@ +#!/usr/bin/env ruby + +require 'set' + +strictness = ARGV[0].to_i # 1 one-side +                          # 2 just the pair +                          # 3 the pair and one side + +if strictness == 1 or strictness == 3 +  side = ARGV[1].to_i # 0 or 1 +end + +segments = [[],[]] +while line = STDIN.gets +  src, tgt = line.strip.split "\t" +  segments[0] << src +  segments[1] << tgt +end + +if strictness == 1 +  seen = Set.new +  segments[side].each_with_index { |segment,i| +    if not seen.include? segment +      puts "#{segments[i][0]}\t#{segments[i][1]}" +    end +    seen << segment +  } +elsif strictness == 2 +  seen = Set.new +  segments[0].each_index { |i| +    segment_pair = [segments[i][0], segments[i][1]] +    if not seen.include? segment_pair +      puts "#{segment_pair[0]}\t#{segment_pair[1]}" +    end +    seen << segment_pair +  } +elsif strictness == 3 +  seen = Set.new +  seen_pairs = Set.new +  segments[side].each_with_index { |segment,i| +    segment_pair = [segments[0][i], segments[1][i]] +    if not seen_pairs.include? segment_pair and not seen.include? segment +      puts "#{segment_pair[0]}\t#{segment_pair[1]}" +    end +    seen << segment +    seen_pairs << segment_pair +  } +end + diff --git a/zh-ko-or-ja b/zh-ko-or-ja new file mode 100755 index 0000000..0b42386 --- /dev/null +++ b/zh-ko-or-ja @@ -0,0 +1,18 @@ +#!/usr/bin/env ruby + +require 'zipf' +require 'script_detector' + +$to_code = {} +$to_code["Ambiguous Chinese"] = "??" +$to_code["Simplified Chinese"] = "zh" +$to_code["Traditional Chinese"] = "zt" +$to_code["Korean"] = "ko" +$to_code["Japanese"] = "ja" +$to_code.default = "??" + +while line = STDIN.gets +  code = $to_code[line.identify_script] +  puts code +end + | 
