Merge branch 'master' of https://github.com/pks/nlp_scripts

author: Patrick Simianer <p@simianer.de> 2018-04-11 13:15:30 +0000
committer: Patrick Simianer <p@simianer.de> 2018-04-11 13:15:30 +0000
commit: c4bf03972a71cd3507aa8ef9c3a0ca37a01ace77 (patch)
tree: 0e1e7f53e9383a67e44f62490e648e0d6084d687
parent: 3562d3042ee2b95855e976870221a35a60001834 (diff)
parent: 2920497fb155cd7285a77c1c01f6d424d8fd30e9 (diff)
3 files changed, 108 insertions, 18 deletions
diff --git a/bitext-filter-length b/bitext-filter-length
index d812568..7f82a65 100755
--- a/bitext-filter-length
+++ b/bitext-filter-length
@@ -9,10 +9,9 @@ def main
     opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
     opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
     opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
-    #opt :ratio, "length ratio", :type => :float, :default => 0.0001, :short => "-r"
-    #opt :ratio_min_len, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-R"
-    opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
-    opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
+    opt :ignore_below, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-I"
+    opt :ratio_mean, "length ratio average", :type => :float, :required => true, :short => "-A"
+    opt :ratio_stddev, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
   end
 
   fna,fnb = conf[:inputs].split ','
@@ -21,28 +20,21 @@ def main
   a_out = WriteFile.new fna+conf[:output_suffix]
   b_out = WriteFile.new fnb+conf[:output_suffix]
 
-  ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
-  ratio_upper = conf[:ratio_avg] + conf[:ratio_std]
+  ratio_lower = conf[:ratio_mean] - conf[:ratio_stddev]
+  ratio_upper = conf[:ratio_mean] + conf[:ratio_stddev]
 
   while linea = a.gets
     lineb = b.gets
     sza = linea.strip.split.size
     szb = lineb.strip.split.size
-    #_ = [sza,szb].map{|i|i.to_f}.sort
     ratio = sza.to_f/szb.to_f
-    if sza >= conf[:min_len] and szb >= conf[:min_len] and
-       sza <= conf[:max_len] and szb <= conf[:max_len] and
-       ratio >= ratio_lower and
-       ratio <= ratio_upper
-      #if _[0] >= conf[:ratio_min_len]
-      #  ratio_ok = (_[0] / _[1]) >= conf[:ratio]
-      #else
-      #  ratio_ok = true
-      #end
-      #if ratio_ok
+    if (sza > 0 and sza <= conf[:ignore_below] and szb > 0 and szb <= conf[:ignore_below]) or
+        (sza >= conf[:min_len] and szb >= conf[:min_len] and
+         sza <= conf[:max_len] and szb <= conf[:max_len] and
+         ratio >= ratio_lower and
+       ratio <= ratio_upper)
       a_out.write linea
       b_out.write lineb
-      #end
     end
   end
 
diff --git a/cma b/cma
new file mode 100755
index 0000000..9133a0d
--- /dev/null
+++ b/cma
@@ -0,0 +1,22 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+conf = Trollop::options do
+  banner "cma < <one number per line>"
+  opt :round, "Number of digits after decimal point.", :type => :int, :default => -1
+end
+
+cma = 0.0
+i = 0
+while line = STDIN.gets
+  x = line.to_f
+  cma = cma + ((x - cma)/(i+1))
+  i +=1
+  if conf[:round] >= 0
+    puts cma.round conf[:round]
+  else
+    puts cma
+  end
+end
+
diff --git a/tmx-extract.py b/tmx-extract.py
new file mode 100755
index 0000000..20e4bac
--- /dev/null
+++ b/tmx-extract.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+#
+# Adapted from Apertium
+# http://wiki.apertium.org/wiki/Tools_for_TMX
+#
+
+from xml.sax import make_parser
+from xml.sax.handler import ContentHandler
+
+import sys
+import codecs
+
+class TMXHandler(ContentHandler):
+	def __init__ (self, slang, tlang, sfile, tfile):
+		self.pair = set([slang, tlang])
+                self.files = {}
+                self.files[slang] = sfile
+                self.files[tlang] = tfile
+		self.inTag = ''
+		self.note = ''
+		self.tuid = ''
+		self.type = ''
+		self.cur_pair = set()
+		self.cur_lang = ''
+		self.seg = {}
+		self.seg[slang] = ''
+		self.seg[tlang] = ''
+	
+	def startElement(self, name, attrs):
+
+		if name == 'tu':
+			self.cur_pair = set();	
+			self.inTag = 'tu'
+			self.tuid = attrs.get('tuid','')
+			self.type = attrs.get('datatype','')
+		elif name == 'note':
+			self.inTag = 'note'
+			self.note = ""
+		elif name == 'tuv':
+			self.inTag = 'tuv'
+			self.cur_lang = attrs.get('xml:lang', '')
+			self.cur_pair.add(self.cur_lang)
+		elif name == 'seg':
+			self.inTag = 'seg'
+			if self.cur_lang in self.pair:
+				self.seg[self.cur_lang] = ''
+
+	def characters (self, c):
+		if self.inTag == 'note':
+			self.note += c
+		elif self.inTag == 'seg' and self.cur_lang in self.pair:
+			self.seg[self.cur_lang] += c
+
+	def endElement(self, name):
+		if name == 'tu' and self.pair == self.cur_pair:
+			for lang in self.cur_pair: 		
+				self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n")
+
+parser = make_parser()
+
+if len(sys.argv) < 3:
+	print 'Usage: tmx-extract.py <file> <slang> <tlang>'
+	print ''
+	sys.exit(-1)
+
+sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+')
+tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+')
+curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile)
+
+parser.setContentHandler(curHandler)
+
+parser.parse(open(sys.argv[1]))
+
+sfile.close()
+tfile.close()
+
author	Patrick Simianer <p@simianer.de>	2018-04-11 13:15:30 +0000
committer	Patrick Simianer <p@simianer.de>	2018-04-11 13:15:30 +0000
commit	c4bf03972a71cd3507aa8ef9c3a0ca37a01ace77 (patch)
tree	0e1e7f53e9383a67e44f62490e648e0d6084d687
parent	3562d3042ee2b95855e976870221a35a60001834 (diff)
parent	2920497fb155cd7285a77c1c01f6d424d8fd30e9 (diff)