summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-x NFC 9
-rwxr-xr-x add-index 12
-rwxr-xr-x chars 10
-rwxr-xr-x de-sgm 14
-rwxr-xr-x exclude 17
-rwxr-xr-x joint-set 30
-rwxr-xr-x langid-polyglot 18
-rwxr-xr-x sentencepiece-decode 9
-rwxr-xr-x tsv-joint-set 53
-rwxr-xr-x tsv-uniq 49
-rwxr-xr-x zh-ko-or-ja 18
11 files changed, 235 insertions, 4 deletions
diff --git a/NFC b/NFC
new file mode 100755
index 0000000..aec1c58
--- /dev/null
+++ b/NFC
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+
+import fileinput
+import unicodedata
+import sys
+
+# Write every input line (files named on the command line, else stdin) in NFC form.
+sys.stdout.writelines(unicodedata.normalize('NFC', raw) for raw in fileinput.input())
+
diff --git a/add-index b/add-index
new file mode 100755
index 0000000..77a7e8d
--- /dev/null
+++ b/add-index
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+
+# Number the lines of STDIN: each line is printed as "<index>\t<line>".
+# The index starts at ARGV[0] (parsed with to_i) when an argument is given,
+# and at 0 otherwise.
+start = ARGV.empty? ? 0 : ARGV[0].to_i
+
+STDIN.each_line.with_index(start) do |text, idx|
+  # text still carries its newline, so puts does not append a second one
+  puts "#{idx}\t#{text}"
+end
+
diff --git a/chars b/chars
new file mode 100755
index 0000000..359c2ab
--- /dev/null
+++ b/chars
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Print each character of every stripped STDIN line on its own line.
+STDIN.each_line do |raw|
+  # strip removes the newline and surrounding whitespace before splitting
+  raw.strip.each_char { |ch| puts ch }
+end
+
diff --git a/de-sgm b/de-sgm
index 0b9177d..fd4546e 100755
--- a/de-sgm
+++ b/de-sgm
@@ -1,7 +1,13 @@
#!/bin/sh
-
-egrep -v "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
- | egrep -v "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
- | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||" | egrep -v "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$"
+# Reduce an SGML/XML corpus file on stdin to bare segment text: drop XML declarations, wrapper and
+# metadata tag lines and <p> lines; strip seg/speaker/hl tags (<hl> anywhere in the line); drop blanks.
+grep -E -v -i "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset|translator|reviewer)[^>]*>)[[:space:]]*$" \
+ | grep -E -v -i "^[[:space:]]*<(url|description|keywords|talkid|title|translator|reviewer)[^>]*>.*</(url|description|keywords|talkid|title|translator|reviewer)>[[:space:]]*$" \
+ | sed "s|<seg[^>]*>[[:space:]]*||" \
+ | sed "s|[[:space:]]*</seg>[[:space:]]*$||" \
+ | grep -E -v -i "^[[:space:]]*<p>[[:space:]]*$|^[[:space:]]*</p>[[:space:]]*$" \
+ | sed -e "s|<speaker>[[:space:]]*||" -e "s|[[:space:]]*</speaker>[[:space:]]*$||" \
+ | sed -e "s|[[:space:]]*<hl>[[:space:]]*$||" -e "s|<hl>[[:space:]]*||" -e "s|[[:space:]]*</hl>[[:space:]]*$||" \
+ | grep -E -v "^[[:space:]]*$"
diff --git a/exclude b/exclude
new file mode 100755
index 0000000..b5fe3cb
--- /dev/null
+++ b/exclude
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'set'
+
+# Filter STDIN: drop every line that also occurs in the file named by ARGV[0].
+blocked = Set.new
+reader = ReadFile.new ARGV[0]
+while entry = reader.gets
+  blocked << entry
+end
+
+# Lines are compared verbatim as read (no chomp/strip is applied).
+STDIN.each_line do |candidate|
+  puts candidate unless blocked.include? candidate
+end
+
diff --git a/joint-set b/joint-set
new file mode 100755
index 0000000..b9b9b22
--- /dev/null
+++ b/joint-set
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+
+require 'set'
+require 'zipf'
+
+# Usage: joint-set FILE... N -- prints every line that occurs in all given files.
+# N is not used by this script; it is only removed so it is not read as a file.
+ARGV.pop
+
+# One Set of lines per input file; lines are stored exactly as read.
+all_sets = ARGV.map { |file|
+  fp = ReadFile.new file
+  s = Set.new
+  while line = fp.gets
+    s << line
+  end
+  s
+}
+
+# Intersect starting from the last file's set. The `|| Set.new` fallback means
+# that with no input files nothing is printed instead of failing on nil.
+joint_set = all_sets.pop || Set.new
+all_sets.each { |set|
+  joint_set &= set
+}
+
+joint_set.each { |i|
+  puts i
+}
+
diff --git a/langid-polyglot b/langid-polyglot
new file mode 100755
index 0000000..0b0b20c
--- /dev/null
+++ b/langid-polyglot
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import polyglot
+from polyglot.detect import Detector
+import fileinput
+
+# Per input line: print the detection confidence when it exceeds 80.0, otherwise "??".
+for line in fileinput.input():
+    try:
+        for lang in Detector(line).languages:
+            if lang.confidence > 80.0:
+                print(lang.confidence)  # NOTE(review): prints the confidence, not a language code -- confirm
+            else:
+                print("??")
+            break  # NOTE(review): nesting was lost in this listing; assumes only the first entry is inspected
+    except polyglot.detect.base.UnknownLanguage:
+        print("??")
+
diff --git a/sentencepiece-decode b/sentencepiece-decode
new file mode 100755
index 0000000..5e07ffa
--- /dev/null
+++ b/sentencepiece-decode
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Undo SentencePiece-style tokenization: glue the pieces together, then turn "▁" back into a space.
+STDIN.each_line do |pieces|
+  puts pieces.split.join("").gsub("▁", " ")
+end
+
diff --git a/tsv-joint-set b/tsv-joint-set
new file mode 100755
index 0000000..c0dbdcf
--- /dev/null
+++ b/tsv-joint-set
@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+
+require 'set'
+require 'zipf'
+require 'optimist'
+
+conf = Optimist::options do
+ opt :n, "Desired number segments in test set.", :type => :int, :required => true
+ opt :tsv, ".tsv files", :type => :strings, :required => true
+ opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true
+end
+
+# Finds the segments shared by all given .tsv files (compared on the column chosen
+# per file via conf[:fields]), samples at most conf[:n] of them at random, and
+# writes the matching rows of each input file to "<file>.joint".
+
+# Load every file: both columns per file, plus the set of values in its chosen column.
+all = []
+all_sets = []
+conf[:tsv].each_with_index { |file,file_index|
+  fp = ReadFile.new file
+  a = [[],[]]
+  s = Set.new
+  while line = fp.gets
+    p0, p1 = line.strip.split "\t"
+    a[0] << p0
+    a[1] << p1
+    s << a[conf[:fields][file_index]].last
+  end
+  all << a
+  all_sets << s
+}
+
+# Segments that occur in the chosen column of every file.
+joint_set = all_sets.pop
+all_sets.each { |set|
+  joint_set &= set
+}
+# Kept as a Set so the per-row membership test below is a hash lookup
+# rather than a linear scan of an Array for every row of every file.
+sample = joint_set.to_a.shuffle.take(conf[:n]).to_set
+
+# Every row whose chosen column was sampled is written, in input order.
+all.each_with_index { |a,i|
+  # NOTE(review): f is never closed explicitly -- confirm WriteFile flushes on exit.
+  f = WriteFile.new(conf[:tsv][i] + ".joint")
+  a[conf[:fields][i]].each_with_index { |segment,j|
+    next unless sample.include? segment
+    # Interpolation tolerates a missing column (nil), where String#+ would raise.
+    f.write "#{a[0][j]}\t#{a[1][j]}\n"
+  }
+}
+
diff --git a/tsv-uniq b/tsv-uniq
new file mode 100755
index 0000000..fde79f2
--- /dev/null
+++ b/tsv-uniq
@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+
+require 'set'
+
+# Usage: tsv-uniq STRICTNESS [SIDE] < in.tsv
+# De-duplicates two-column, tab-separated lines from STDIN, printing only the
+# first occurrence. STRICTNESS selects what is compared:
+strictness = ARGV[0].to_i # 1 one-side
+                          # 2 just the pair
+                          # 3 the pair and one side
+
+if strictness == 1 or strictness == 3
+  side = ARGV[1].to_i # 0 or 1
+end
+
+# Column-major storage: segments[column][row], i.e. segments[0] holds the
+# first field of every line and segments[1] the second.
+segments = [[],[]]
+while line = STDIN.gets
+  src, tgt = line.strip.split "\t"
+  segments[0] << src
+  segments[1] << tgt
+end
+
+# Any STRICTNESS other than 1, 2 or 3 falls through and prints nothing.
+if strictness == 1
+  seen = Set.new
+  segments[side].each_with_index { |segment,i|
+    # Set#add? returns nil when the element was already present.
+    puts "#{segments[0][i]}\t#{segments[1][i]}" if seen.add?(segment)
+  }
+elsif strictness == 2
+  seen = Set.new
+  segments[0].each_index { |i|
+    segment_pair = [segments[0][i], segments[1][i]]
+    puts "#{segment_pair[0]}\t#{segment_pair[1]}" if seen.add?(segment_pair)
+  }
+elsif strictness == 3
+  seen = Set.new
+  seen_pairs = Set.new
+  segments[side].each_with_index { |segment,i|
+    segment_pair = [segments[0][i], segments[1][i]]
+    # Update both sets before testing so short-circuiting cannot skip one.
+    new_segment = seen.add?(segment)
+    new_pair = seen_pairs.add?(segment_pair)
+    puts "#{segment_pair[0]}\t#{segment_pair[1]}" if new_segment && new_pair
+  }
+end
+
diff --git a/zh-ko-or-ja b/zh-ko-or-ja
new file mode 100755
index 0000000..0b42386
--- /dev/null
+++ b/zh-ko-or-ja
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'script_detector'
+
+# Label from identify_script (presumably script_detector) => code; other labels give "??".
+$to_code = Hash.new("??").merge!(
+  "Ambiguous Chinese" => "??",
+  "Simplified Chinese" => "zh",
+  "Traditional Chinese" => "zt",
+  "Korean" => "ko",
+  "Japanese" => "ja"
+)
+
+STDIN.each_line do |text|
+  puts $to_code[text.identify_script]
+end
+