summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Patrick Simianer <patrick@lilt.com> 2019-12-24 09:43:56 +0000
committer Patrick Simianer <patrick@lilt.com> 2019-12-24 09:43:56 +0000
commit b76bedcec1750586b88536203090c22d97dd64e7 (patch)
tree 93c27d0dbd0f45f68692869d67d6059b2453cafc
parent a4c4b61203c22c1aea71800466d157d79013070e (diff)
parent d6ad8327a873043ba01aeb226dabd3a8716f82ae (diff)
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
-rwxr-xr-x biuniq 35
-rwxr-xr-x mkidx 10
-rwxr-xr-x tmx-extract.py 90
3 files changed, 90 insertions, 45 deletions
diff --git a/biuniq b/biuniq
new file mode 100755
index 0000000..097c88a
--- /dev/null
+++ b/biuniq
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+# Keeps only those line pairs of ARGV[0]/ARGV[1] whose two sides are both unseen so far; writes <file>.out for each.
+require 'zipf'
+
+f1 = ReadFile.new ARGV[0]
+f2 = ReadFile.new ARGV[1]
+
+d1 = {} # stripped lines already seen in file 1
+d2 = {} # stripped lines already seen in file 2
+a1 = [] # kept lines of file 1, in input order
+a2 = [] # kept lines of file 2, index-aligned with a1
+
+while line1 = f1.gets
+  line1.strip!
+
+  line2 = f2.gets # NOTE(review): presumably nil if file 2 is shorter, which makes the next line raise
+  line2.strip!
+
+  if !d1.include? line1 and !d2.include? line2
+    a1 << line1
+    a2 << line2
+  end
+  # Remember each side in its own table (line2 used to go into d1, leaving d2 empty).
+  d1[line1] = true
+  d2[line2] = true
+end
+
+o1 = WriteFile.new ARGV[0]+".out"
+o2 = WriteFile.new ARGV[1]+".out"
+
+a1.each_with_index { |line1,i|
+  o1.write line1 + "\n"
+  o2.write a2[i] + "\n" # a2[i] is the partner kept together with line1
+}
+
diff --git a/mkidx b/mkidx
new file mode 100755
index 0000000..046e131
--- /dev/null
+++ b/mkidx
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+# Prints one running number per line read from STDIN, starting at ARGV[0].to_i.
+require 'zipf'
+
+idx = ARGV[0].to_i
+STDIN.each_line do
+  puts idx
+  idx += 1
+end
+
diff --git a/tmx-extract.py b/tmx-extract.py
index 20e4bac..90a298a 100755
--- a/tmx-extract.py
+++ b/tmx-extract.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python2
#
# Adapted from Apertium
# http://wiki.apertium.org/wiki/Tools_for_TMX
@@ -11,57 +11,57 @@ import sys
import codecs
class TMXHandler(ContentHandler):
- def __init__ (self, slang, tlang, sfile, tfile):
- self.pair = set([slang, tlang])
- self.files = {}
- self.files[slang] = sfile
- self.files[tlang] = tfile
- self.inTag = ''
- self.note = ''
- self.tuid = ''
- self.type = ''
- self.cur_pair = set()
- self.cur_lang = ''
- self.seg = {}
- self.seg[slang] = ''
- self.seg[tlang] = ''
-
- def startElement(self, name, attrs):
+ def __init__ (self, slang, tlang, sfile, tfile):
+ self.pair = set([slang, tlang])  # the two language codes a <tu> must carry to be written out
+ self.files = {}  # language code -> output file object
+ self.files[slang] = sfile
+ self.files[tlang] = tfile
+ self.inTag = ''  # name of the most recently opened tu/note/tuv/seg element
+ self.note = ''  # text of the current <note>; collected but not read elsewhere in this class
+ self.tuid = ''
+ self.type = ''
+ self.cur_pair = set()  # languages of the <tuv> elements seen in the current <tu>
+ self.cur_lang = ''  # xml:lang of the most recent <tuv>
+ self.seg = {}  # language code -> text gathered from that language's <seg>
+ self.seg[slang] = ''
+ self.seg[tlang] = ''
- if name == 'tu':
- self.cur_pair = set();
- self.inTag = 'tu'
- self.tuid = attrs.get('tuid','')
- self.type = attrs.get('datatype','')
- elif name == 'note':
- self.inTag = 'note'
- self.note = ""
- elif name == 'tuv':
- self.inTag = 'tuv'
- self.cur_lang = attrs.get('xml:lang', '')
- self.cur_pair.add(self.cur_lang)
- elif name == 'seg':
- self.inTag = 'seg'
- if self.cur_lang in self.pair:
- self.seg[self.cur_lang] = ''
+ def startElement(self, name, attrs):
- def characters (self, c):
- if self.inTag == 'note':
- self.note += c
- elif self.inTag == 'seg' and self.cur_lang in self.pair:
- self.seg[self.cur_lang] += c
+ if name == 'tu':
+ self.cur_pair = set()  # new translation unit: forget the previous unit's languages
+ self.inTag = 'tu'
+ self.tuid = attrs.get('tuid','')
+ self.type = attrs.get('datatype','')
+ elif name == 'note':
+ self.inTag = 'note'
+ self.note = ""
+ elif name == 'tuv':
+ self.inTag = 'tuv'
+ self.cur_lang = attrs.get('xml:lang', '')
+ self.cur_pair.add(self.cur_lang)
+ elif name == 'seg':
+ self.inTag = 'seg'
+ if self.cur_lang in self.pair:
+ self.seg[self.cur_lang] = ''  # start a fresh buffer; characters() appends to it
- def endElement(self, name):
- if name == 'tu' and self.pair == self.cur_pair:
- for lang in self.cur_pair:
- self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n")
+ def characters (self, c):
+ if self.inTag == 'note':
+ self.note += c
+ elif self.inTag == 'seg' and self.cur_lang in self.pair:
+ self.seg[self.cur_lang] += c  # text may arrive in several chunks, so accumulate
+
+ def endElement(self, name):
+ if name == 'tu' and self.pair == self.cur_pair:  # only units holding exactly the two requested languages
+ for lang in self.cur_pair:
+ self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n")  # one stripped UTF-8 line per language
parser = make_parser()
if len(sys.argv) < 3:
- print 'Usage: tmx-extract.py <file> <slang> <tlang>'
- print ''
- sys.exit(-1)
+ print 'Usage: tmx-extract.py <file> <slang> <tlang>'
+ print ''
+ sys.exit(-1)
sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+')
tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+')