diff options
-rwxr-xr-x | bitext-filter-length | 28 | ||||
-rwxr-xr-x | cma | 22 | ||||
-rwxr-xr-x | tmx-extract.py | 76 |
3 files changed, 108 insertions, 18 deletions
diff --git a/bitext-filter-length b/bitext-filter-length index d812568..7f82a65 100755 --- a/bitext-filter-length +++ b/bitext-filter-length @@ -9,10 +9,9 @@ def main opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S" opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m" opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M" - #opt :ratio, "length ratio", :type => :float, :default => 0.0001, :short => "-r" - #opt :ratio_min_len, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-R" - opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A" - opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T" + opt :ignore_below, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-I" + opt :ratio_mean, "length ratio average", :type => :float, :required => true, :short => "-A" + opt :ratio_stddev, "length ratio standard deviation", :type => :float, :required => true, :short => "-T" end fna,fnb = conf[:inputs].split ',' @@ -21,28 +20,21 @@ def main a_out = WriteFile.new fna+conf[:output_suffix] b_out = WriteFile.new fnb+conf[:output_suffix] - ratio_lower = conf[:ratio_avg] - conf[:ratio_std] - ratio_upper = conf[:ratio_avg] + conf[:ratio_std] + ratio_lower = conf[:ratio_mean] - conf[:ratio_stddev] + ratio_upper = conf[:ratio_mean] + conf[:ratio_stddev] while linea = a.gets lineb = b.gets sza = linea.strip.split.size szb = lineb.strip.split.size - #_ = [sza,szb].map{|i|i.to_f}.sort ratio = sza.to_f/szb.to_f - if sza >= conf[:min_len] and szb >= conf[:min_len] and - sza <= conf[:max_len] and szb <= conf[:max_len] and - ratio >= ratio_lower and - ratio <= ratio_upper - #if _[0] >= conf[:ratio_min_len] - # ratio_ok = (_[0] / _[1]) >= conf[:ratio] - #else - # ratio_ok = true - #end - #if ratio_ok + if (sza > 0 and sza <= conf[:ignore_below] and szb > 0 and szb <= conf[:ignore_below]) or + (sza >= conf[:min_len] and szb >= conf[:min_len] and + sza <= conf[:max_len] and szb <= conf[:max_len] and + ratio >= ratio_lower and + ratio <= ratio_upper) a_out.write linea b_out.write lineb - #end end end @@ -0,0 +1,22 @@ +#!/usr/bin/env ruby + +require 'trollop' + +conf = Trollop::options do + banner "cma < <one number per line>" + opt :round, "Number of digits after decimal point.", :type => :int, :default => -1 +end + +cma = 0.0 +i = 0 +while line = STDIN.gets + x = line.to_f + cma = cma + ((x - cma)/(i+1)) + i +=1 + if conf[:round] >= 0 + puts cma.round conf[:round] + else + puts cma + end +end + diff --git a/tmx-extract.py b/tmx-extract.py new file mode 100755 index 0000000..20e4bac --- /dev/null +++ b/tmx-extract.py @@ -0,0 +1,76 @@ +#!/usr/bin/python +# +# Adapted from Apertium +# http://wiki.apertium.org/wiki/Tools_for_TMX +# + +from xml.sax import make_parser +from xml.sax.handler import ContentHandler + +import sys +import codecs + +class TMXHandler(ContentHandler): + def __init__ (self, slang, tlang, sfile, tfile): + self.pair = set([slang, tlang]) + self.files = {} + self.files[slang] = sfile + self.files[tlang] = tfile + self.inTag = '' + self.note = '' + self.tuid = '' + self.type = '' + self.cur_pair = set() + self.cur_lang = '' + self.seg = {} + self.seg[slang] = '' + self.seg[tlang] = '' + + def startElement(self, name, attrs): + + if name == 'tu': + self.cur_pair = set(); + self.inTag = 'tu' + self.tuid = attrs.get('tuid','') + self.type = attrs.get('datatype','') + elif name == 'note': + self.inTag = 'note' + self.note = "" + elif name == 'tuv': + self.inTag = 'tuv' + self.cur_lang = attrs.get('xml:lang', '') + self.cur_pair.add(self.cur_lang) + elif name == 'seg': + self.inTag = 'seg' + if self.cur_lang in self.pair: + self.seg[self.cur_lang] = '' + + def characters (self, c): + if self.inTag == 'note': + self.note += c + elif self.inTag == 'seg' and self.cur_lang in self.pair: + self.seg[self.cur_lang] += c + + def endElement(self, name): + if name == 'tu' and self.pair == self.cur_pair: + for lang in self.cur_pair: + self.files[lang].write(self.seg[lang].encode('utf-8').strip()+"\n") + +parser = make_parser() + +if len(sys.argv) < 3: + print 'Usage: tmx-extract.py <file> <slang> <tlang>' + print '' + sys.exit(-1) + +sfile = open(sys.argv[1]+"."+sys.argv[2], 'w+') +tfile = open(sys.argv[1]+"."+sys.argv[3], 'w+') +curHandler = TMXHandler(sys.argv[2], sys.argv[3], sfile, tfile) + +parser.setContentHandler(curHandler) + +parser.parse(open(sys.argv[1])) + +sfile.close() +tfile.close() + |